add a comment
[ghc.git] / compiler / cmm / CmmLayoutStack.hs
1 {-# LANGUAGE CPP, RecordWildCards, GADTs #-}
2 module CmmLayoutStack (
3 cmmLayoutStack, setInfoTableStackMap
4 ) where
5
6 import StgCmmUtils ( callerSaveVolatileRegs ) -- XXX layering violation
7 import StgCmmForeign ( saveThreadState, loadThreadState ) -- XXX layering violation
8
9 import BasicTypes
10 import Cmm
11 import CmmInfo
12 import BlockId
13 import CLabel
14 import CmmUtils
15 import MkGraph
16 import ForeignCall
17 import CmmLive
18 import CmmProcPoint
19 import SMRep
20 import Hoopl
21 import UniqSupply
22 import Maybes
23 import UniqFM
24 import Util
25
26 import DynFlags
27 import FastString
28 import Outputable
29 import qualified Data.Set as Set
30 import Control.Monad.Fix
31 import Data.Array as Array
32 import Data.Bits
33 import Data.List (nub)
34 import Control.Monad (liftM)
35
36 #include "HsVersions.h"
37
38 {- Note [Stack Layout]
39
40 The job of this pass is to
41
42 - replace references to abstract stack Areas with fixed offsets from Sp.
43
44 - replace the CmmHighStackMark constant used in the stack check with
45 the maximum stack usage of the proc.
46
47 - save any variables that are live across a call, and reload them as
48 necessary.
49
50 Before stack allocation, local variables remain live across native
51 calls (CmmCall{ cmm_cont = Just _ }), and after stack allocation local
52 variables are clobbered by native calls.
53
54 We want to do stack allocation so that as far as possible
55 - stack use is minimized, and
56 - unnecessary stack saves and loads are avoided.
57
58 The algorithm we use is a variant of linear-scan register allocation,
59 where the stack is our register file.
60
61 - First, we do a liveness analysis, which annotates every block with
62 the variables live on entry to the block.
63
64 - We traverse blocks in reverse postorder DFS; that is, we visit at
65 least one predecessor of a block before the block itself. The
66 stack layout flowing from the predecessor of the block will
67 determine the stack layout on entry to the block.
68
69 - We maintain a data structure
70
71 Map Label StackMap
72
73 which describes the contents of the stack and the stack pointer on
74 entry to each block that is a successor of a block that we have
75 visited.
76
77 - For each block we visit:
78
79 - Look up the StackMap for this block.
80
81 - If this block is a proc point (or a call continuation, if we
82 aren't splitting proc points), emit instructions to reload all
83 the live variables from the stack, according to the StackMap.
84
85 - Walk forwards through the instructions:
86 - At an assignment x = Sp[loc]
87 - Record the fact that Sp[loc] contains x, so that we won't
88 need to save x if it ever needs to be spilled.
89 - At an assignment x = E
90 - If x was previously on the stack, it isn't any more
91 - At the last node, if it is a call or a jump to a proc point
92 - Lay out the stack frame for the call (see setupStackFrame)
93 - emit instructions to save all the live variables
94 - Remember the StackMaps for all the successors
95 - emit an instruction to adjust Sp
96 - If the last node is a branch, then the current StackMap is the
97 StackMap for the successors.
98
99 - Manifest Sp: replace references to stack areas in this block
100 with real Sp offsets. We cannot do this until we have laid out
101 the stack area for the successors above.
102
103 In this phase we also eliminate redundant stores to the stack;
104 see elimStackStores.
105
106 - There is one important gotcha: sometimes we'll encounter a control
107 transfer to a block that we've already processed (a join point),
108 and in that case we might need to rearrange the stack to match
109 what the block is expecting. (exactly the same as in linear-scan
110 register allocation, except here we have the luxury of an infinite
111 supply of temporary variables).
112
113 - Finally, we update the magic CmmHighStackMark constant with the
114 stack usage of the function, and eliminate the whole stack check
115 if there was no stack use. (in fact this is done as part of the
116 main traversal, by feeding the high-water-mark output back in as
117 an input. I hate cyclic programming, but it's just too convenient
118 sometimes.)
119
120 There are plenty of tricky details: update frames, proc points, return
121 addresses, foreign calls, and some ad-hoc optimisations that are
122 convenient to do here and effective in common cases. Comments in the
123 code below explain these.
124
125 -}
126
127
-- All stack locations are expressed as positive byte offsets from the
-- "base", which is defined to be the address above the return address
-- on the stack on entry to this CmmProc.
--
-- Lower addresses have higher StackLocs.
--
type StackLoc = ByteOff
135
136 {-
137 A StackMap describes the stack at any given point. At a continuation
138 it has a particular layout, like this:
139
140 | | <- base
141 |-------------|
142 | ret0 | <- base + 8
143 |-------------|
144 . upd frame . <- base + sm_ret_off
145 |-------------|
146 | |
147 . vars .
148 . (live/dead) .
149 | | <- base + sm_sp - sm_args
150 |-------------|
151 | ret1 |
152 . ret vals . <- base + sm_sp (<--- Sp points here)
153 |-------------|
154
155 Why do we include the final return address (ret0) in our stack map? I
156 have absolutely no idea, but it seems to be done that way consistently
157 in the rest of the code generator, so I played along here. --SDM
158
159 Note that we will be constructing an info table for the continuation
160 (ret1), which needs to describe the stack down to, but not including,
161 the update frame (or ret0, if there is no update frame).
162 -}
163
-- | A description of the stack contents at a given program point.
-- See the layout diagram in the comment above for how these fields
-- relate to the concrete stack picture.
data StackMap = StackMap
 { sm_sp   :: StackLoc
       -- ^ the offset of Sp relative to the base on entry
       -- to this block.
 , sm_args :: ByteOff
       -- ^ the number of bytes of arguments in the area for this block
       -- Defn: the offset of young(L) relative to the base is given by
       -- (sm_sp - sm_args) of the StackMap for block L.
 , sm_ret_off :: ByteOff
       -- ^ Number of words of stack that we do not describe with an info
       -- table, because it contains an update frame.
 , sm_regs :: UniqFM (LocalReg,StackLoc)
       -- ^ regs on the stack
 }
178
-- Render a StackMap one field per line, for debug output.
instance Outputable StackMap where
  ppr StackMap{..} = vcat
    [ text "Sp = " <> int sm_sp
    , text "sm_args = " <> int sm_args
    , text "sm_ret_off = " <> int sm_ret_off
    , text "sm_regs = " <> ppr (eltsUFM sm_regs)
    ]
185
186
-- | Entry point of the pass: lay out the stack for one procedure.
-- Returns the rewritten graph plus the StackMap for every block, which
-- is later used to fill in stack liveness in the info tables
-- (see 'setInfoTableStackMap').
cmmLayoutStack :: DynFlags -> ProcPointSet -> ByteOff -> CmmGraph
               -> UniqSM (CmmGraph, BlockEnv StackMap)
cmmLayoutStack dflags procpoints entry_args
               graph0@(CmmGraph { g_entry = entry })
  = do
    -- We need liveness info. Dead assignments are removed later
    -- by the sinking pass.
    let (graph, liveness) = (graph0, cmmLocalLiveness dflags graph0)
        blocks = postorderDfs graph

    -- Tie the knot: 'layout' consumes its own final stack maps and the
    -- final Sp high-water mark as (lazily used) inputs.  The lazy
    -- pattern (~) is essential; forcing the tuple here would diverge.
    -- See the end of Note [Stack Layout] above.
    (final_stackmaps, _final_high_sp, new_blocks) <-
          mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
            layout dflags procpoints liveness entry entry_args
                   rec_stackmaps rec_high_sp blocks

    -- Safe foreign calls are expanded into suspend/call/resume sequences
    -- as a separate step, after the stack layout is known.
    new_blocks' <- mapM (lowerSafeForeignCall dflags) new_blocks
    return (ofBlockList entry new_blocks', final_stackmaps)
204
205
-- | The main traversal: process blocks in reverse postorder, threading
-- the accumulated stack maps and high-water mark.  Note that the
-- final_stackmaps / final_sp_high arguments are the *outputs* of this
-- very function, fed back in via mfix by 'cmmLayoutStack'; they must
-- only be used lazily.
layout :: DynFlags
       -> BlockSet                      -- proc points
       -> BlockEnv CmmLocalLive         -- liveness
       -> BlockId                       -- entry
       -> ByteOff                       -- stack args on entry

       -> BlockEnv StackMap             -- [final] stack maps
       -> ByteOff                       -- [final] Sp high water mark

       -> [CmmBlock]                    -- [in] blocks

       -> UniqSM
          ( BlockEnv StackMap           -- [out] stack maps
          , ByteOff                     -- [out] Sp high water mark
          , [CmmBlock]                  -- [out] new blocks
          )

layout dflags procpoints liveness entry entry_args final_stackmaps final_sp_high blocks
  = go blocks init_stackmap entry_args []
  where
    (updfr, cont_info) = collectContInfo blocks

    -- On entry, only the incoming arguments are on the stack.
    init_stackmap = mapSingleton entry StackMap{ sm_sp   = entry_args
                                               , sm_args = entry_args
                                               , sm_ret_off = updfr
                                               , sm_regs = emptyUFM
                                               }

    go [] acc_stackmaps acc_hwm acc_blocks
      = return (acc_stackmaps, acc_hwm, acc_blocks)

    go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
      = do
       let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0

       -- Reverse-postorder traversal guarantees some predecessor has
       -- already deposited a StackMap for this block (or it is the entry).
       let stack0@StackMap { sm_sp = sp0 }
               = mapFindWithDefault
                     (pprPanic "no stack map for" (ppr entry_lbl))
                     entry_lbl acc_stackmaps

       -- (a) Update the stack map to include the effects of
       --     assignments in this block
       let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0

       -- (b) Insert assignments to reload all the live variables if this
       --     block is a proc point
       let middle1 = if entry_lbl `setMember` procpoints
                        then foldr blockCons middle0 (insertReloads stack0)
                        else middle0

       -- (c) Look at the last node and if we are making a call or
       --     jumping to a proc point, we must save the live
       --     variables, adjust Sp, and construct the StackMaps for
       --     each of the successor blocks.  See handleLastNode for
       --     details.
       (middle2, sp_off, last1, fixup_blocks, out)
           <- handleLastNode dflags procpoints liveness cont_info
                             acc_stackmaps stack1 middle0 last0

       -- (d) Manifest Sp: run over the nodes in the block and replace
       --     CmmStackSlot with CmmLoad from Sp with a concrete offset.
       --
       -- our block:
       --    middle1          -- the original middle nodes
       --    middle2          -- live variable saves from handleLastNode
       --    Sp = Sp + sp_off -- Sp adjustment goes here
       --    last1            -- the last node
       --
       let middle_pre = blockToList $ foldl blockSnoc middle1 middle2

           final_blocks = manifestSp dflags final_stackmaps stack0 sp0 final_sp_high entry0
                              middle_pre sp_off last1 fixup_blocks

           acc_stackmaps' = mapUnion acc_stackmaps out

           -- If this block jumps to the GC, then we do not take its
           -- stack usage into account for the high-water mark.
           -- Otherwise, if the only stack usage is in the stack-check
           -- failure block itself, we will do a redundant stack
           -- check.  The stack has a buffer designed to accommodate
           -- the largest amount of stack needed for calling the GC.
           --
           this_sp_hwm | isGcJump last0 = 0
                       | otherwise      = sp0 - sp_off

           hwm' = maximum (acc_hwm : this_sp_hwm : map sm_sp (mapElems out))

       go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)
294
295
296 -- -----------------------------------------------------------------------------
297
298 -- Not foolproof, but GCFun is the culprit we most want to catch
-- Not foolproof, but GCFun is the culprit we most want to catch
isGcJump :: CmmNode O C -> Bool
isGcJump (CmmCall { cml_target = CmmReg (CmmGlobal l) }) = l `elem` [GCFun, GCEnter1]
isGcJump _ = False
303
304 -- -----------------------------------------------------------------------------
305
306 -- This doesn't seem right somehow. We need to find out whether this
307 -- proc will push some update frame material at some point, so that we
308 -- can avoid using that area of the stack for spilling. The
309 -- updfr_space field of the CmmProc *should* tell us, but it doesn't
310 -- (I think maybe it gets filled in later when we do proc-point
311 -- splitting).
312 --
313 -- So we'll just take the max of all the cml_ret_offs. This could be
314 -- unnecessarily pessimistic, but probably not in the code we
315 -- generate.
316
-- | Scan every block's last node for calls, collecting (a) the largest
-- return offset seen (a conservative stand-in for the update-frame
-- boundary; see the comment above) and (b) a map from each continuation
-- label to the byte size of its return-value area.
collectContInfo :: [CmmBlock] -> (ByteOff, BlockEnv ByteOff)
collectContInfo blocks
  = (maximum ret_offs, mapFromList (catMaybes mb_argss))
  where
    (mb_argss, ret_offs) = mapAndUnzip get_cont blocks

    -- For a call-like last node, pair the continuation label with its
    -- return-args size; always report the node's return offset.
    get_cont :: Block CmmNode x C -> (Maybe (Label, ByteOff), ByteOff)
    get_cont blk =
      case lastNode blk of
        CmmCall { cml_cont = Just cont, cml_ret_args = args, cml_ret_off = off }
          -> (Just (cont, args), off)
        CmmForeignCall { succ = cont, ret_args = args, ret_off = off }
          -> (Just (cont, args), off)
        _other
          -> (Nothing, 0)
331
332
333 -- -----------------------------------------------------------------------------
334 -- Updating the StackMap from middle nodes
335
336 -- Look for loads from stack slots, and update the StackMap. This is
337 -- purely for optimisation reasons, so that we can avoid saving a
338 -- variable back to a different stack slot if it is already on the
339 -- stack.
340 --
341 -- This happens a lot: for example when function arguments are passed
342 -- on the stack and need to be immediately saved across a call, we
343 -- want to just leave them where they are on the stack.
344 --
-- | Update the StackMap with the effect of a single node.  A load from a
-- stack slot records that the register is a clean copy of that slot (so a
-- later spill can be skipped); any other assignment to a local register
-- invalidates its slot association.
procMiddle :: BlockEnv StackMap -> CmmNode e x -> StackMap -> StackMap
procMiddle stackmaps (CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)) sm
  = sm { sm_regs = addToUFM (sm_regs sm) r (r, slot) }
  where slot = getStackLoc area off stackmaps
procMiddle _ (CmmAssign (CmmLocal r) _) sm
  = sm { sm_regs = delFromUFM (sm_regs sm) r }
procMiddle _ _ sm
  = sm
355
-- | Translate an (Area, offset) pair into an absolute StackLoc.  The Old
-- area is addressed directly from the base; a Young area is located just
-- below the continuation's Sp (see the sm_args definition on StackMap).
getStackLoc :: Area -> ByteOff -> BlockEnv StackMap -> StackLoc
getStackLoc Old       n _ = n
getStackLoc (Young l) n stackmaps
  | Just sm <- mapLookup l stackmaps = sm_sp sm - sm_args sm + n
  | otherwise                        = pprPanic "getStackLoc" (ppr l)
362
363
364 -- -----------------------------------------------------------------------------
365 -- Handling stack allocation for a last node
366
367 -- We take a single last node and turn it into:
368 --
369 -- C1 (some statements)
370 -- Sp = Sp + N
371 -- C2 (some more statements)
372 -- call f() -- the actual last node
373 --
374 -- plus possibly some more blocks (we may have to add some fixup code
375 -- between the last node and the continuation).
376 --
377 -- C1: is the code for saving the variables across this last node onto
378 -- the stack, if the continuation is a call or jumps to a proc point.
379 --
380 -- C2: if the last node is a safe foreign call, we have to inject some
381 -- extra code that goes *after* the Sp adjustment.
382
-- | Decide how the stack must look at each successor of this block's
-- last node, and produce the save instructions and Sp adjustment needed
-- to get there.  See the block comment above for the overall shape of
-- the generated code (C1 / Sp adjustment / C2 / last node).
handleLastNode
   :: DynFlags -> ProcPointSet -> BlockEnv CmmLocalLive -> BlockEnv ByteOff
   -> BlockEnv StackMap -> StackMap
   -> Block CmmNode O O
   -> CmmNode O C
   -> UniqSM
      ( [CmmNode O O]      -- nodes to go *before* the Sp adjustment
      , ByteOff            -- amount to adjust Sp
      , CmmNode O C        -- new last node
      , [CmmBlock]         -- new blocks
      , BlockEnv StackMap  -- stackmaps for the continuations
      )

handleLastNode dflags procpoints liveness cont_info stackmaps
               stack0@StackMap { sm_sp = sp0 } middle last
 = case last of
    --  At each return / tail call,
    --  adjust Sp to point to the last argument pushed, which
    --  is cml_args, after popping any other junk from the stack.
    CmmCall{ cml_cont = Nothing, .. } -> do
      let sp_off = sp0 - cml_args
      return ([], sp_off, last, [], mapEmpty)

    --  At each CmmCall with a continuation:
    CmmCall{ cml_cont = Just cont_lbl, .. } ->
       return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off

    CmmForeignCall{ succ = cont_lbl, .. } -> do
       return $ lastCall cont_lbl (wORD_SIZE dflags) ret_args ret_off
            -- one word of args: the return address

    CmmBranch{..}     -> handleBranches
    CmmCondBranch{..} -> handleBranches
    CmmSwitch{..}     -> handleBranches

  where
     -- Calls and ForeignCalls are handled the same way:
     lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
              -> ( [CmmNode O O]
                 , ByteOff
                 , CmmNode O C
                 , [CmmBlock]
                 , BlockEnv StackMap
                 )
     lastCall lbl cml_args cml_ret_args cml_ret_off
      =  ( assignments
         , spOffsetForCall sp0 cont_stack cml_args
         , last
         , [] -- no new blocks
         , mapSingleton lbl cont_stack )
      where
         (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off

     -- Compute the continuation's StackMap and the stores needed to
     -- establish it, reusing an existing StackMap when the continuation
     -- has been seen before.
     prepareStack lbl cml_ret_args cml_ret_off
       | Just cont_stack <- mapLookup lbl stackmaps
             -- If we have already seen this continuation before, then
             -- we just have to make the stack look the same:
       = (fixupStack stack0 cont_stack, cont_stack)
             -- Otherwise, we have to allocate the stack frame
       | otherwise
       = (save_assignments, new_cont_stack)
       where
        (new_cont_stack, save_assignments)
            = setupStackFrame dflags lbl liveness cml_ret_off cml_ret_args stack0

     -- For other last nodes (branches), if any of the targets is a
     -- proc point, we have to set up the stack to match what the proc
     -- point is expecting.
     --
     handleBranches :: UniqSM ( [CmmNode O O]
                              , ByteOff
                              , CmmNode O C
                              , [CmmBlock]
                              , BlockEnv StackMap )

     handleBranches
         -- Note [diamond proc point]
       | Just l <- futureContinuation middle
       , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
       = do
         let cont_args = mapFindWithDefault 0 l cont_info
             (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
             out = mapFromList [ (l', cont_stack)
                               | l' <- successors last ]
         return ( assigs
                , spOffsetForCall sp0 cont_stack (wORD_SIZE dflags)
                , last
                , []
                , out)

       | otherwise = do
         -- Each successor may need a fixup block; redirect the last node
         -- at the fresh labels and collect the new blocks and StackMaps.
         pps <- mapM handleBranch (successors last)
         let lbl_map :: LabelMap Label
             lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
             fix_lbl l = mapFindWithDefault l l lbl_map
         return ( []
                , 0
                , mapSuccessors fix_lbl last
                , concat [ blk | (_,_,_,blk) <- pps ]
                , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )

     -- For each successor of this block
     handleBranch :: BlockId -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
     handleBranch l
        --   (a) if the successor already has a stackmap, we need to
        --       shuffle the current stack to make it look the same.
        --       We have to insert a new block to make this happen.
        | Just stack2 <- mapLookup l stackmaps
        = do
             let assigs = fixupStack stack0 stack2
             (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 assigs
             return (l, tmp_lbl, stack2, block)

        --   (b) if the successor is a proc point, save everything
        --       on the stack.
        | l `setMember` procpoints
        = do
             let cont_args = mapFindWithDefault 0 l cont_info
                 (stack2, assigs) =
                      setupStackFrame dflags l liveness (sm_ret_off stack0)
                                                        cont_args stack0
             (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 assigs
             return (l, tmp_lbl, stack2, block)

        --   (c) otherwise, the current StackMap is the StackMap for
        --       the continuation.  But we must remember to remove any
        --       variables from the StackMap that are *not* live at
        --       the destination, because this StackMap might be used
        --       by fixupStack if this is a join point.
        | otherwise = return (l, l, stack1, [])
        where live = mapFindWithDefault (panic "handleBranch") l liveness
              stack1 = stack0 { sm_regs = filterUFM is_live (sm_regs stack0) }
              is_live (r,_) = r `elemRegSet` live
518
519
-- | Build a small block that performs the given fixup assignments,
-- adjusts Sp, and jumps to @l@.  Returns the original label unchanged
-- when no fixup is needed, so no block is inserted at all.
makeFixupBlock :: DynFlags -> ByteOff -> Label -> StackMap -> [CmmNode O O]
               -> UniqSM (Label, [CmmBlock])
makeFixupBlock dflags sp0 l stack assigs
  | null assigs && sp0 == sm_sp stack = return (l, [])
  | otherwise = do
      uniq <- getUniqueM
      let tmp_lbl = mkBlockId uniq
          sp_off  = sp0 - sm_sp stack
          block   = blockJoin (CmmEntry tmp_lbl)
                              (maybeAddSpAdj dflags sp_off (blockFromList assigs))
                              (CmmBranch l)
      return (tmp_lbl, [block])
531
532
-- | How much to subtract from Sp before entering the continuation.
-- Sp currently sits at current_sp; the continuation expects it at
-- (sm_sp cont_stack - sm_args cont_stack + args), i.e. the base of its
-- young area plus the outgoing args, so the adjustment is the difference.
spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
spOffsetForCall current_sp cont_stack args = current_sp - cont_sp
  where
    cont_sp = sm_sp cont_stack - sm_args cont_stack + args
541
542
-- | Create a sequence of assignments to establish the new StackMap,
-- given the old StackMap.
fixupStack :: StackMap -> StackMap -> [CmmNode O O]
fixupStack old_stack new_stack = concatMap move (stackSlotRegs new_stack)
  where
    old_map = sm_regs old_stack

    -- A register already sitting in the target slot needs no store;
    -- anything else is written to its new home in the Old area.
    move (r, n) =
      case lookupUFM old_map r of
        Just (_, m) | m == n -> []
        _                    -> [CmmStore (CmmStackSlot Old n)
                                          (CmmReg (CmmLocal r))]
555
556
557
-- | Lay out a fresh stack frame for continuation @lbl@: spill the
-- variables live in the continuation above the update-frame boundary,
-- and describe the resulting stack with a new StackMap.
setupStackFrame
  :: DynFlags
  -> BlockId                -- label of continuation
  -> BlockEnv CmmLocalLive  -- liveness
  -> ByteOff                -- updfr
  -> ByteOff                -- bytes of return values on stack
  -> StackMap               -- current StackMap
  -> (StackMap, [CmmNode O O])

setupStackFrame dflags lbl liveness updfr_off ret_args stack0
  = (cont_stack, save_assigs)
  where
    -- the set of LocalRegs live in the continuation
    cont_live = mapFindWithDefault Set.empty lbl liveness

    -- The stack from the base to updfr_off is off-limits (it holds the
    -- update frame).  Our new frame contains the saved live variables,
    -- then the return address and call args (replaced by return values
    -- at the return point).  'allocate' finds slots for the live vars
    -- above updfr_off and gives back the stores that fill them.
    (frame, save_assigs) = allocate dflags updfr_off cont_live stack0

    -- Sp at the continuation sits ret_args bytes above the frame: the
    -- young area holding the return address and return values.
    cont_stack = frame { sm_sp      = sm_sp frame + ret_args
                       , sm_args    = ret_args
                       , sm_ret_off = updfr_off
                       }
591
592
593 -- -----------------------------------------------------------------------------
594 -- Note [diamond proc point]
595 --
596 -- This special case looks for the pattern we get from a typical
597 -- tagged case expression:
598 --
599 -- Sp[young(L1)] = L1
600 -- if (R1 & 7) != 0 goto L1 else goto L2
601 -- L2:
602 -- call [R1] returns to L1
603 -- L1: live: {y}
604 -- x = R1
605 --
606 -- If we let the generic case handle this, we get
607 --
608 -- Sp[-16] = L1
609 -- if (R1 & 7) != 0 goto L1a else goto L2
610 -- L2:
611 -- Sp[-8] = y
612 -- Sp = Sp - 16
613 -- call [R1] returns to L1
614 -- L1a:
615 -- Sp[-8] = y
616 -- Sp = Sp - 16
617 -- goto L1
618 -- L1:
619 -- x = R1
620 --
621 -- The code for saving the live vars is duplicated in each branch, and
622 -- furthermore there is an extra jump in the fast path (assuming L1 is
623 -- a proc point, which it probably is if there is a heap check).
624 --
625 -- So to fix this we want to set up the stack frame before the
626 -- conditional jump. How do we know when to do this, and when it is
627 -- safe? The basic idea is, when we see the assignment
628 --
629 -- Sp[young(L)] = L
630 --
631 -- we know that
632 -- * we are definitely heading for L
633 -- * there can be no more reads from another stack area, because young(L)
634 -- overlaps with it.
635 --
636 -- We don't necessarily know that everything live at L is live now
637 -- (some might be assigned between here and the jump to L). So we
638 -- simplify and only do the optimisation when we see
639 --
640 -- (1) a block containing an assignment of a return address L
-- (2) ending in a branch where one (and only one) continuation goes to L,
642 -- and no other continuations go to proc points.
643 --
644 -- then we allocate the stack frame for L at the end of the block,
645 -- before the branch.
646 --
647 -- We could generalise (2), but that would make it a bit more
648 -- complicated to handle, and this currently catches the common case.
649
-- | Look for a store of a return address into a young area: it tells us
-- the continuation this block is definitely heading for.  Used by the
-- diamond-proc-point optimisation (see Note [diamond proc point]).
futureContinuation :: Block CmmNode O O -> Maybe BlockId
futureContinuation middle = foldBlockNodesB spot middle Nothing
  where
    spot :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
    spot (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _ = Just l
    spot _ acc = acc
656
657 -- -----------------------------------------------------------------------------
658 -- Saving live registers
659
660 -- | Given a set of live registers and a StackMap, save all the registers
661 -- on the stack and return the new StackMap and the assignments to do
662 -- the saving.
663 --
allocate :: DynFlags -> ByteOff -> LocalRegSet -> StackMap
         -> (StackMap, [CmmNode O O])
allocate dflags ret_off live stackmap@StackMap{ sm_sp = sp0
                                              , sm_regs = regs0 }
 =
   -- we only have to save regs that are not already in a slot
   let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
       regs1   = filterUFM (\(r,_) -> elemRegSet r live) regs0
   in

   -- make a map of the stack: one StackSlot per word, Occupied where a
   -- live register already sits or where the update frame is, Empty
   -- everywhere else.  The list is reversed so the oldest (deepest)
   -- word comes first, matching the foldr below.
   let stack = reverse $ Array.elems $
               accumArray (\_ x -> x) Empty (1, toWords dflags (max sp0 ret_off)) $
                 ret_words ++ live_words
            where ret_words =
                   [ (x, Occupied)
                   | x <- [ 1 .. toWords dflags ret_off] ]
                  live_words =
                   [ (toWords dflags x, Occupied)
                   | (r,off) <- eltsUFM regs1,
                     let w = localRegBytes dflags r,
                     x <- [ off, off - wORD_SIZE dflags .. off - w + 1] ]
   in

   -- Pass over the stack: find slots to save all the new live variables,
   -- choosing the oldest slots first (hence a foldr).
   let
       -- state: (regs still to save, stack built so far, current byte
       -- offset, save instructions, (reg -> slot) associations)
       save slot ([], stack, n, assigs, regs) -- no more regs to save
          = ([], slot:stack, plusW dflags n 1, assigs, regs)
       save slot (to_save, stack, n, assigs, regs)
          = case slot of
               Occupied ->  (to_save, Occupied:stack, plusW dflags n 1, assigs, regs)
               Empty
                 | Just (stack', r, to_save') <-
                       select_save to_save (slot:stack)
                 -> let assig = CmmStore (CmmStackSlot Old n')
                                         (CmmReg (CmmLocal r))
                        n' = plusW dflags n 1
                    in
                        (to_save', stack', n', assig : assigs, (r,(r,n')):regs)

                 | otherwise
                 -> (to_save, slot:stack, plusW dflags n 1, assigs, regs)

       -- we should do better here: right now we'll fit the smallest first,
       -- but it would make more sense to fit the biggest first.
       select_save :: [LocalReg] -> [StackSlot]
                   -> Maybe ([StackSlot], LocalReg, [LocalReg])
       select_save regs stack = go regs []
         where go []     _no_fit = Nothing
               go (r:rs) no_fit
                 | Just rest <- dropEmpty words stack
                 = Just (replicate words Occupied ++ rest, r, rs++no_fit)
                 | otherwise
                 = go rs (r:no_fit)
                 where words = localRegWords dflags r

       -- fill in empty slots as much as possible
       (still_to_save, save_stack, n, save_assigs, save_regs)
          = foldr save (to_save, [], 0, [], []) stack

       -- push any remaining live vars on the stack
       (push_sp, push_assigs, push_regs)
          = foldr push (n, [], []) still_to_save
          where
              push r (n, assigs, regs)
                = (n', assig : assigs, (r,(r,n')) : regs)
                where
                  n' = n + localRegBytes dflags r
                  assig = CmmStore (CmmStackSlot Old n')
                                   (CmmReg (CmmLocal r))

       -- If nothing was pushed we can trim Sp back over any trailing
       -- Empty slots that were never filled.
       trim_sp
          | not (null push_regs) = push_sp
          | otherwise
          = plusW dflags n (- length (takeWhile isEmpty save_stack))

       final_regs = regs1 `addListToUFM` push_regs
                          `addListToUFM` save_regs

   in
   -- XXX should be an assert
   if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else

   if (trim_sp .&. (wORD_SIZE dflags - 1)) /= 0 then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else

   ( stackmap { sm_regs = final_regs , sm_sp = trim_sp }
   , push_assigs ++ save_assigs )
752
753
754 -- -----------------------------------------------------------------------------
755 -- Manifesting Sp
756
757 -- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp. The
758 -- block looks like this:
759 --
760 -- middle_pre -- the middle nodes
761 -- Sp = Sp + sp_off -- Sp adjustment goes here
762 -- last -- the last node
763 --
764 -- And we have some extra blocks too (that don't contain Sp adjustments)
765 --
766 -- The adjustment for middle_pre will be different from that for
767 -- middle_post, because the Sp adjustment intervenes.
768 --
manifestSp
   :: DynFlags
   -> BlockEnv StackMap  -- StackMaps for other blocks
   -> StackMap           -- StackMap for this block
   -> ByteOff            -- Sp on entry to the block
   -> ByteOff            -- SpHigh
   -> CmmNode C O        -- first node
   -> [CmmNode O O]      -- middle
   -> ByteOff            -- sp_off
   -> CmmNode O C        -- last node
   -> [CmmBlock]         -- new blocks
   -> [CmmBlock]         -- final blocks with Sp manifest

manifestSp dflags stackmaps stack0 sp0 sp_high
           first middle_pre sp_off last fixup_blocks
  = final_block : fixup_blocks'
  where
    area_off = getAreaOff stackmaps

    -- Two translations: nodes before the Sp adjustment see the old Sp
    -- (sp0); the last node and the fixup blocks run after "Sp = Sp +
    -- sp_off", so their offsets are relative to (sp0 - sp_off).
    adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
    adj_pre_sp  = mapExpDeep (areaToSp dflags sp0            sp_high area_off)
    adj_post_sp = mapExpDeep (areaToSp dflags (sp0 - sp_off) sp_high area_off)

    -- elimStackStores must run before the area references are rewritten,
    -- since it matches on CmmStackSlot nodes.
    final_middle = maybeAddSpAdj dflags sp_off $
                   blockFromList $
                   map adj_pre_sp $
                   elimStackStores stack0 stackmaps area_off $
                   middle_pre

    final_last    = optStackCheck (adj_post_sp last)

    final_block   = blockJoin first final_middle final_last

    fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks
803
804
-- | Offset of an Area's base from the stack base.  The Old area starts
-- at the base itself; a Young area starts just below its continuation's
-- Sp (see sm_args on StackMap).
getAreaOff :: BlockEnv StackMap -> (Area -> StackLoc)
getAreaOff _ Old = 0
getAreaOff stackmaps (Young l)
  | Just sm <- mapLookup l stackmaps = sm_sp sm - sm_args sm
  | otherwise                        = pprPanic "getAreaOff" (ppr l)
811
812
-- | Append an "Sp = Sp + sp_off" assignment to the block, unless the
-- offset is zero, in which case the block is returned unchanged.
maybeAddSpAdj :: DynFlags -> ByteOff -> Block CmmNode O O -> Block CmmNode O O
maybeAddSpAdj dflags sp_off block
  | sp_off == 0 = block
  | otherwise   = block `blockSnoc` adj
  where
    adj = CmmAssign spReg (cmmOffset dflags (CmmReg spReg) sp_off)
817
818
819 {-
820 Sp(L) is the Sp offset on entry to block L relative to the base of the
821 OLD area.
822
823 SpArgs(L) is the size of the young area for L, i.e. the number of
824 arguments.
825
826 - in block L, each reference to [old + N] turns into
827 [Sp + Sp(L) - N]
828
829 - in block L, each reference to [young(L') + N] turns into
830 [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]
831
832 - be careful with the last node of each block: Sp has already been adjusted
833 to be Sp + Sp(L) - Sp(L')
834 -}
835
-- | Rewrite one expression during Sp manifestation.  sp_old is the Sp
-- offset in effect at this program point (before or after the Sp
-- adjustment, as chosen by the caller); sp_hwm is the proc's stack
-- high-water mark.
areaToSp :: DynFlags -> ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr

areaToSp dflags sp_old _sp_hwm area_off (CmmStackSlot area n)
  = cmmOffset dflags (CmmReg spReg) (sp_old - area_off area - n)
    -- Replace (CmmStackSlot area n) with an offset from Sp

areaToSp dflags _ sp_hwm _ (CmmLit CmmHighStackMark)
  = mkIntExpr dflags sp_hwm
    -- Replace CmmHighStackMark with the number of bytes of stack used,
    -- the sp_hwm.  See Note [Stack usage] in StgCmmHeap

areaToSp dflags _ _ _ (CmmMachOp (MO_U_Lt _)
                          [CmmMachOp (MO_Sub _)
                                  [ CmmRegOff (CmmGlobal Sp) x_off
                                  , CmmLit (CmmInt y_lit _)],
                           CmmReg (CmmGlobal SpLim)])
  | fromIntegral x_off >= y_lit
  = zeroExpr dflags
    -- Replace a stack-overflow test that cannot fail with a no-op
    -- See Note [Always false stack check]

areaToSp _ _ _ _ other = other
858
859 -- Note [Always false stack check]
860 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
861 -- We can optimise stack checks of the form
862 --
863 -- if ((Sp + x) - y < SpLim) then .. else ..
864 --
-- where x and y are non-negative integer byte offsets.  Since we know that
866 -- SpLim <= Sp (remember the stack grows downwards), this test must
867 -- yield False if (x >= y), so we can rewrite the comparison to False.
868 -- A subsequent sinking pass will later drop the dead code.
869 -- Optimising this away depends on knowing that SpLim <= Sp, so it is
870 -- really the job of the stack layout algorithm, hence we do it now.
871
-- | Turn a conditional branch whose condition was rewritten to the
-- constant 0 (by areaToSp) into a direct branch to the false successor.
-- See Note [Always false stack check].
optStackCheck :: CmmNode O C -> CmmNode O C
optStackCheck (CmmCondBranch (CmmLit (CmmInt 0 _)) _true false) = CmmBranch false
optStackCheck node = node
877
878
879 -- -----------------------------------------------------------------------------
880
881 -- | Eliminate stores of the form
882 --
883 -- Sp[area+n] = r
884 --
885 -- when we know that r is already in the same slot as Sp[area+n]. We
886 -- could do this in a later optimisation pass, but that would involve
887 -- a separate analysis and we already have the information to hand
888 -- here. It helps clean up some extra stack stores in common cases.
889 --
890 -- Note that we may have to modify the StackMap as we walk through the
891 -- code using procMiddle, since an assignment to a variable in the
892 -- StackMap will invalidate its mapping there.
893 --
-- | Drop stores of the form @Sp[area+n] = r@ when the StackMap already
-- records that @r@ lives in exactly that slot, so the store is a no-op.
-- As we walk forward through the nodes we must thread an updated
-- StackMap (via procMiddle), because an assignment to a register can
-- invalidate its recorded stack location.
elimStackStores :: StackMap
                -> BlockEnv StackMap
                -> (Area -> ByteOff)
                -> [CmmNode O O]
                -> [CmmNode O O]
elimStackStores stackmap stackmaps area_off nodes
  = walk stackmap nodes
  where
    walk _sm [] = []
    walk sm (node : rest)
      -- The register is already saved in this very slot: drop the store.
      | CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r)) <- node
      , Just (_, off) <- lookupUFM (sm_regs sm) r
      , area_off area + m == off
      = walk sm rest
      -- Otherwise keep the node, updating the StackMap as we go.
      | otherwise
      = node : walk (procMiddle stackmaps node sm) rest
911
912
913 -- -----------------------------------------------------------------------------
914 -- Update info tables to include stack liveness
915
916
-- | Fill in the stack liveness of every StackRep info table in a proc,
-- computed from the StackMaps produced by stack layout. Non-proc decls
-- and non-StackRep info tables are left untouched.
setInfoTableStackMap :: DynFlags -> BlockEnv StackMap -> CmmDecl -> CmmDecl
setInfoTableStackMap dflags stackmaps (CmmProc top_info l v g)
  = CmmProc top_info{ info_tbls = mapMapWithKey fix_info tbls } l v g
  where
    tbls = info_tbls top_info

    -- Only StackRep info tables get a liveness; everything else passes through.
    fix_info lbl tbl
      | CmmInfoTable{ cit_rep = StackRep _ } <- tbl
      = tbl { cit_rep = StackRep (get_liveness lbl) }
      | otherwise
      = tbl

    get_liveness :: BlockId -> Liveness
    get_liveness lbl =
      case mapLookup lbl stackmaps of
        Just sm -> stackMapToLiveness dflags sm
        -- Every block with a StackRep info table must have a StackMap.
        Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl <+> ppr tbls)

setInfoTableStackMap _ _ d = d
932
933
-- | Convert a StackMap to a word-level liveness bitmap for an info table.
--
-- The array spans the stack words strictly above the return address
-- (@toWords sm_ret_off + 1@) up to the frame top excluding the argument
-- area (@toWords (sm_sp - sm_args)@). Every slot defaults to True; slots
-- holding a register of GC-pointer type are overridden to False.
-- NOTE(review): so in this Liveness encoding True appears to mean
-- "non-pointer/dead word" and False "live GC pointer" — confirm against
-- the Liveness consumers in SMRep.
--
-- The final 'reverse' is because the array is indexed by increasing
-- stack offset, while the Liveness list presumably runs from the top of
-- the frame downwards — TODO confirm ordering convention.
stackMapToLiveness :: DynFlags -> StackMap -> Liveness
stackMapToLiveness dflags StackMap{..} =
   reverse $ Array.elems $
     accumArray (\_ x -> x) True (toWords dflags sm_ret_off + 1,
                                  toWords dflags (sm_sp - sm_args)) live_words
   where
     -- One (index, False) entry per saved register of GC-pointer type.
     live_words = [ (toWords dflags off, False)
                  | (r,off) <- eltsUFM sm_regs, isGcPtrType (localRegType r) ]
942
943
944 -- -----------------------------------------------------------------------------
945 -- Lowering safe foreign calls
946
947 {-
948 Note [Lower safe foreign calls]
949
950 We start with
951
952 Sp[young(L1)] = L1
953 ,-----------------------
954 | r1 = foo(x,y,z) returns to L1
955 '-----------------------
956 L1:
957 R1 = r1 -- copyIn, inserted by mkSafeCall
958 ...
959
960 the stack layout algorithm will arrange to save and reload everything
961 live across the call. Our job now is to expand the call so we get
962
963 Sp[young(L1)] = L1
964 ,-----------------------
965 | SAVE_THREAD_STATE()
966 | token = suspendThread(BaseReg, interruptible)
967 | r = foo(x,y,z)
968 | BaseReg = resumeThread(token)
969 | LOAD_THREAD_STATE()
970 | R1 = r -- copyOut
971 | jump Sp[0]
972 '-----------------------
973 L1:
974 r = R1 -- copyIn, inserted by mkSafeCall
975 ...
976
Note the copyOut, which saves the results in the places that L1 is
expecting them (see Note [safe foreign call convention]). Note also
that the safe foreign call is replaced by an unsafe one in the Cmm graph.
980 -}
981
-- | Expand a block ending in a safe foreign call into the
-- suspendThread / unsafe call / resumeThread sequence.
-- See Note [Lower safe foreign calls]. Blocks ending in any other
-- last node are returned unchanged.
--
-- The names 'tgt', 'res', 'args', 'succ', 'intrbl', 'ret_args' and
-- 'ret_off' below are bound by the @CmmForeignCall { .. }@ record
-- wildcard in the pattern guard.
lowerSafeForeignCall :: DynFlags -> CmmBlock -> UniqSM CmmBlock
lowerSafeForeignCall dflags block
  | (entry, middle, CmmForeignCall { .. }) <- blockSplit block
  = do
    -- Both 'id' and 'new_base' are KindNonPtr because they're
    -- RTS-only objects and are not subject to garbage collection
    id <- newTemp (bWord dflags)
    new_base <- newTemp (cmmRegType dflags (CmmGlobal BaseReg))
    let (caller_save, caller_load) = callerSaveVolatileRegs dflags
    load_tso <- newTemp (gcWord dflags)
    load_stack <- newTemp (gcWord dflags)
    let suspend = saveThreadState dflags <*>
                  caller_save <*>
                  mkMiddle (callSuspendThread dflags id intrbl)
        -- The foreign call itself, now unsafe: the thread is suspended.
        midCall = mkUnsafeCall tgt res args
        resume  = mkMiddle (callResumeThread new_base id) <*>
                  -- Assign the result to BaseReg: we
                  -- might now have a different Capability!
                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*>
                  caller_load <*>
                  loadThreadState dflags load_tso load_stack

        -- Copy the call results out to where the continuation 'succ'
        -- expects them (the copyOut of the Note).
        (_, regs, copyout) =
             copyOutOflow dflags NativeReturn Jump (Young succ)
                            (map (CmmReg . CmmLocal) res)
                            ret_off []

        -- NB. after resumeThread returns, the top-of-stack probably contains
        -- the stack frame for succ, but it might not: if the current thread
        -- received an exception during the call, then the stack might be
        -- different.  Hence we continue by jumping to the top stack frame,
        -- not by jumping to succ.
        jump = CmmCall { cml_target    = entryCode dflags $
                                         CmmLoad (CmmReg spReg) (bWord dflags)
                       , cml_cont      = Just succ
                       , cml_args_regs = regs
                       , cml_args      = widthInBytes (wordWidth dflags)
                       , cml_ret_args  = ret_args
                       , cml_ret_off   = ret_off }

    graph' <- lgraphOfAGraph $ suspend <*>
                               midCall <*>
                               resume  <*>
                               copyout <*>
                               mkLast jump

    -- The generated AGraph must be a single block; splice its middle
    -- nodes and last node back onto the original entry and middle.
    case toBlockList graph' of
      [one] -> let (_, middle', last) = blockSplit one
               in return (blockJoin entry (middle `blockAppend` middle') last)
      _ -> panic "lowerSafeForeignCall0"

  -- Block doesn't end in a safe foreign call:
  | otherwise = return block
1035
1036
-- | Build a CmmExpr referring to a foreign (RTS) function by name.
foreignLbl :: FastString -> CmmExpr
foreignLbl name = CmmLit (CmmLabel lbl)
  where
    lbl = mkForeignLabel name Nothing ForeignLabelInExternalPackage IsFunction
1039
-- | Allocate a fresh local register of the given type.
newTemp :: CmmType -> UniqSM LocalReg
newTemp rep = do
    uniq <- getUniqueM
    return (LocalReg uniq rep)
1042
-- | Unsafe foreign call to the RTS's @suspendThread(BaseReg, interruptible)@,
-- binding the returned token to 'id'.
callSuspendThread :: DynFlags -> LocalReg -> Bool -> CmmNode O O
callSuspendThread dflags id intrbl =
    CmmUnsafeForeignCall target [id] actuals
  where
    target  = ForeignTarget (foreignLbl (fsLit "suspendThread")) conv
    conv    = ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint] CmmMayReturn
    actuals = [CmmReg (CmmGlobal BaseReg), mkIntExpr dflags (fromEnum intrbl)]
1049
-- | Unsafe foreign call to the RTS's @resumeThread(token)@, binding the
-- (possibly different) Capability's BaseReg to 'new_base'.
callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
callResumeThread new_base id =
    CmmUnsafeForeignCall target [new_base] actuals
  where
    target  = ForeignTarget (foreignLbl (fsLit "resumeThread")) conv
    conv    = ForeignConvention CCallConv [AddrHint] [AddrHint] CmmMayReturn
    actuals = [CmmReg (CmmLocal id)]
1056
1057 -- -----------------------------------------------------------------------------
1058
-- | Add a word count to a byte offset, scaling by the platform word size.
plusW :: DynFlags -> ByteOff -> WordOff -> ByteOff
plusW dflags off n = off + n * wORD_SIZE dflags
1061
-- | Abstract contents of one word of stack, used when searching for
-- space in a frame.
data StackSlot = Occupied | Empty
     -- Occupied: a return address or part of an update frame
     -- Empty: not in use — presumably free for allocating a saved
     -- variable; confirm with the allocation code earlier in this file.
1064
instance Outputable StackSlot where
  -- "XXX" marks an occupied word, "---" an empty one.
  ppr slot = case slot of
    Occupied -> ptext (sLit "XXX")
    Empty    -> ptext (sLit "---")
1068
-- | Drop exactly @n@ leading 'Empty' slots; 'Nothing' if any of the
-- first @n@ slots is 'Occupied' (or the list is too short).
dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
dropEmpty n slots = case (n, slots) of
  (0, _)            -> Just slots
  (_, Empty : rest) -> dropEmpty (n - 1) rest
  _                 -> Nothing
1073
-- | True iff the slot is 'Empty'.
isEmpty :: StackSlot -> Bool
isEmpty slot = case slot of
  Empty -> True
  _     -> False
1077
-- | Size of a local register in bytes, rounded up to a whole word.
localRegBytes :: DynFlags -> LocalReg -> ByteOff
localRegBytes dflags reg = roundUpToWords dflags raw_bytes
  where
    raw_bytes = widthInBytes (typeWidth (localRegType reg))
1081
-- | Size of a local register in whole words.
localRegWords :: DynFlags -> LocalReg -> WordOff
localRegWords dflags reg = toWords dflags (localRegBytes dflags reg)
1084
-- | Convert a byte offset to a word offset (truncating division by the
-- platform word size).
toWords :: DynFlags -> ByteOff -> WordOff
toWords dflags nbytes = quot nbytes (wORD_SIZE dflags)
1087
1088
-- | Generate one reload assignment per register saved in the StackMap,
-- loading each register from its recorded slot in the Old area.
insertReloads :: StackMap -> [CmmNode O O]
insertReloads stackmap = map reload (stackSlotRegs stackmap)
  where
    reload (reg, slot) =
      CmmAssign (CmmLocal reg)
                (CmmLoad (CmmStackSlot Old slot) (localRegType reg))
1095
1096
-- | All (register, stack location) pairs recorded in a StackMap.
stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
stackSlotRegs = eltsUFM . sm_regs