{-# LANGUAGE CPP, RecordWildCards, GADTs #-}
module CmmLayoutStack (
       cmmLayoutStack, setInfoTableStackMap
  ) where

import StgCmmUtils      ( callerSaveVolatileRegs ) -- XXX layering violation
import StgCmmForeign    ( saveThreadState, loadThreadState ) -- XXX layering violation

import BasicTypes
import Cmm
import CmmInfo
import BlockId
import CLabel
import CmmUtils
import MkGraph
import ForeignCall
import CmmLive
import CmmProcPoint
import SMRep
import Hoopl
import UniqSupply
import StgCmmUtils      ( newTemp )
import Maybes
import UniqFM
import Util

import DynFlags
import FastString
import Outputable hiding ( isEmpty )
import qualified Data.Set as Set
import Control.Monad.Fix
import Data.Array as Array
import Data.Bits
import Data.List (nub)

import Prelude hiding ((<*>))

#include "HsVersions.h"

{- Note [Stack Layout]

The job of this pass is to

 - replace references to abstract stack Areas with fixed offsets from Sp.

 - replace the CmmHighStackMark constant used in the stack check with
   the maximum stack usage of the proc.

 - save any variables that are live across a call, and reload them as
   necessary.

Before stack allocation, local variables remain live across native
calls (CmmCall{ cml_cont = Just _ }), and after stack allocation local
variables are clobbered by native calls.

We want to do stack allocation so that as far as possible
 - stack use is minimized, and
 - unnecessary stack saves and loads are avoided.

The algorithm we use is a variant of linear-scan register allocation,
where the stack is our register file.

 - First, we do a liveness analysis, which annotates every block with
   the variables live on entry to the block.

 - We traverse blocks in reverse postorder DFS; that is, we visit at
   least one predecessor of a block before the block itself.  The
   stack layout flowing from the predecessor of the block will
   determine the stack layout on entry to the block.

 - We maintain a data structure

     Map Label StackMap

   which describes the contents of the stack and the stack pointer on
   entry to each block that is a successor of a block that we have
   visited.

 - For each block we visit:

    - Look up the StackMap for this block.

    - If this block is a proc point (or a call continuation, if we
      aren't splitting proc points), emit instructions to reload all
      the live variables from the stack, according to the StackMap.

    - Walk forwards through the instructions:
      - At an assignment  x = Sp[loc]
        - Record the fact that Sp[loc] contains x, so that we won't
          need to save x if it ever needs to be spilled.
      - At an assignment  x = E
        - If x was previously on the stack, it isn't any more
      - At the last node, if it is a call or a jump to a proc point
        - Lay out the stack frame for the call (see setupStackFrame)
        - emit instructions to save all the live variables
        - Remember the StackMaps for all the successors
        - emit an instruction to adjust Sp
      - If the last node is a branch, then the current StackMap is the
        StackMap for the successors.

    - Manifest Sp: replace references to stack areas in this block
      with real Sp offsets.  We cannot do this until we have laid out
      the stack area for the successors above.

      In this phase we also eliminate redundant stores to the stack;
      see elimStackStores.

 - There is one important gotcha: sometimes we'll encounter a control
   transfer to a block that we've already processed (a join point),
   and in that case we might need to rearrange the stack to match
   what the block is expecting.  (This is exactly the same as in
   linear-scan register allocation, except here we have the luxury of
   an infinite supply of temporary variables.)

 - Finally, we update the magic CmmHighStackMark constant with the
   stack usage of the function, and eliminate the whole stack check
   if there was no stack use.  (In fact this is done as part of the
   main traversal, by feeding the high-water-mark output back in as
   an input.  I hate cyclic programming, but it's just too convenient
   sometimes.)

There are plenty of tricky details: update frames, proc points, return
addresses, foreign calls, and some ad-hoc optimisations that are
convenient to do here and effective in common cases.  Comments in the
code below explain these.

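For example (a made-up fragment; the exact offsets depend on the word
size and on what is live), this pass turns a block like

     y = R1
     Sp[young(L1)] = L1
     call f() returns to L1
  L1:   -- y is live here
     x = R1

into something like

     y = R1
     Sp[-16] = L1        -- young(L1), manifested as an offset from Sp
     Sp[-8] = y          -- save y, which is live across the call
     Sp = Sp - 16        -- the Sp adjustment for the new frame
     call f() returns to L1
  L1:
     y = Sp[8]           -- reload y (if L1 is a proc point)
     x = R1
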
-}


-- All stack locations are expressed as positive byte offsets from the
-- "base", which is defined to be the address above the return address
-- on the stack on entry to this CmmProc.
--
-- Lower addresses have higher StackLocs.
--
type StackLoc = ByteOff

{-
 A StackMap describes the stack at any given point.  At a continuation
 it has a particular layout, like this:

         |             | <- base
         |-------------|
         |     ret0    | <- base + 8
         |-------------|
         .  upd frame  . <- base + sm_ret_off
         |-------------|
         |             |
         .    vars     .
         . (live/dead) .
         |             | <- base + sm_sp - sm_args
         |-------------|
         |    ret1     |
         .  ret vals   . <- base + sm_sp    (<--- Sp points here)
         |-------------|

Why do we include the final return address (ret0) in our stack map?  I
have absolutely no idea, but it seems to be done that way consistently
in the rest of the code generator, so I played along here. --SDM

Note that we will be constructing an info table for the continuation
(ret1), which needs to describe the stack down to, but not including,
the update frame (or ret0, if there is no update frame).
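
As a concrete (made-up) instance on a 64-bit target: with
sm_ret_off = 8, sm_args = 8 and sm_sp = 32, the update frame sits at
base + 8, the saved variables occupy the space between base + 8 and
base + 24 (= base + sm_sp - sm_args), and ret1 is the one-word young
area from base + 24 to base + 32, which is where Sp points.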
-}

data StackMap = StackMap
 { sm_sp   :: StackLoc
       -- ^ the offset of Sp relative to the base on entry
       -- to this block.
 , sm_args :: ByteOff
       -- ^ the number of bytes of arguments in the area for this block
       -- Defn: the offset of young(L) relative to the base is given by
       -- (sm_sp - sm_args) of the StackMap for block L.
 , sm_ret_off :: ByteOff
       -- ^ Number of bytes of stack that we do not describe with an info
       -- table, because it contains an update frame.
 , sm_regs :: UniqFM (LocalReg,StackLoc)
       -- ^ regs on the stack
 }

instance Outputable StackMap where
  ppr StackMap{..} =
     text "Sp = " <> int sm_sp $$
     text "sm_args = " <> int sm_args $$
     text "sm_ret_off = " <> int sm_ret_off $$
     text "sm_regs = " <> pprUFM sm_regs ppr


cmmLayoutStack :: DynFlags -> ProcPointSet -> ByteOff -> CmmGraph
               -> UniqSM (CmmGraph, LabelMap StackMap)
cmmLayoutStack dflags procpoints entry_args
               graph@(CmmGraph { g_entry = entry })
  = do
    -- We need liveness info.  Dead assignments are removed later
    -- by the sinking pass.
    let liveness = cmmLocalLiveness dflags graph
        blocks = postorderDfs graph

    (final_stackmaps, _final_high_sp, new_blocks) <-
          mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
            layout dflags procpoints liveness entry entry_args
                   rec_stackmaps rec_high_sp blocks

    new_blocks' <- mapM (lowerSafeForeignCall dflags) new_blocks
    return (ofBlockList entry new_blocks', final_stackmaps)


layout :: DynFlags
       -> LabelSet                      -- proc points
       -> LabelMap CmmLocalLive         -- liveness
       -> BlockId                       -- entry
       -> ByteOff                       -- stack args on entry

       -> LabelMap StackMap             -- [final] stack maps
       -> ByteOff                       -- [final] Sp high water mark

       -> [CmmBlock]                    -- [in] blocks

       -> UniqSM
          ( LabelMap StackMap           -- [out] stack maps
          , ByteOff                     -- [out] Sp high water mark
          , [CmmBlock]                  -- [out] new blocks
          )

layout dflags procpoints liveness entry entry_args final_stackmaps final_sp_high blocks
  = go blocks init_stackmap entry_args []
  where
    (updfr, cont_info) = collectContInfo blocks

    init_stackmap = mapSingleton entry StackMap{ sm_sp   = entry_args
                                               , sm_args = entry_args
                                               , sm_ret_off = updfr
                                               , sm_regs = emptyUFM
                                               }

    go [] acc_stackmaps acc_hwm acc_blocks
      = return (acc_stackmaps, acc_hwm, acc_blocks)

    go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
      = do
       let (entry0@(CmmEntry entry_lbl tscope), middle0, last0) = blockSplit b0

       let stack0@StackMap { sm_sp = sp0 }
               = mapFindWithDefault
                     (pprPanic "no stack map for" (ppr entry_lbl))
                     entry_lbl acc_stackmaps

       -- (a) Update the stack map to include the effects of
       --     assignments in this block
       let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0

       -- (b) Insert assignments to reload all the live variables if this
       --     block is a proc point
       let middle1 = if entry_lbl `setMember` procpoints
                        then foldr blockCons middle0 (insertReloads stack0)
                        else middle0

       -- (c) Look at the last node and if we are making a call or
       --     jumping to a proc point, we must save the live
       --     variables, adjust Sp, and construct the StackMaps for
       --     each of the successor blocks.  See handleLastNode for
       --     details.
       (middle2, sp_off, last1, fixup_blocks, out)
           <- handleLastNode dflags procpoints liveness cont_info
                             acc_stackmaps stack1 tscope middle0 last0

       -- (d) Manifest Sp: run over the nodes in the block and replace
       --     CmmStackSlot with CmmLoad from Sp with a concrete offset.
       --
       -- our block:
       --    middle1          -- the original middle nodes
       --    middle2          -- live variable saves from handleLastNode
       --    Sp = Sp + sp_off -- Sp adjustment goes here
       --    last1            -- the last node
       --
       let middle_pre = blockToList $ foldl blockSnoc middle1 middle2

       let final_blocks =
               manifestSp dflags final_stackmaps stack0 sp0 final_sp_high
                          entry0 middle_pre sp_off last1 fixup_blocks

       let acc_stackmaps' = mapUnion acc_stackmaps out

           -- If this block jumps to the GC, then we do not take its
           -- stack usage into account for the high-water mark.
           -- Otherwise, if the only stack usage is in the stack-check
           -- failure block itself, we will do a redundant stack
           -- check.  The stack has a buffer designed to accommodate
           -- the largest amount of stack needed for calling the GC.
           --
           this_sp_hwm | isGcJump last0 = 0
                       | otherwise      = sp0 - sp_off

           hwm' = maximum (acc_hwm : this_sp_hwm : map sm_sp (mapElems out))

       go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)


-- -----------------------------------------------------------------------------

-- Not foolproof, but GCFun is the culprit we most want to catch
isGcJump :: CmmNode O C -> Bool
isGcJump (CmmCall { cml_target = CmmReg (CmmGlobal l) })
  = l == GCFun || l == GCEnter1
isGcJump _something_else = False

-- -----------------------------------------------------------------------------

-- This doesn't seem right somehow.  We need to find out whether this
-- proc will push some update frame material at some point, so that we
-- can avoid using that area of the stack for spilling.  The
-- updfr_space field of the CmmProc *should* tell us, but it doesn't
-- (I think maybe it gets filled in later when we do proc-point
-- splitting).
--
-- So we'll just take the max of all the cml_ret_offs.  This could be
-- unnecessarily pessimistic, but probably not in the code we
-- generate.

collectContInfo :: [CmmBlock] -> (ByteOff, LabelMap ByteOff)
collectContInfo blocks
  = (maximum ret_offs, mapFromList (catMaybes mb_argss))
 where
  (mb_argss, ret_offs) = mapAndUnzip get_cont blocks

  get_cont :: Block CmmNode x C -> (Maybe (Label, ByteOff), ByteOff)
  get_cont b =
     case lastNode b of
        CmmCall { cml_cont = Just l, .. }
           -> (Just (l, cml_ret_args), cml_ret_off)
        CmmForeignCall { .. }
           -> (Just (succ, ret_args), ret_off)
        _other -> (Nothing, 0)


-- -----------------------------------------------------------------------------
-- Updating the StackMap from middle nodes

-- Look for loads from stack slots, and update the StackMap.  This is
-- purely for optimisation reasons, so that we can avoid saving a
-- variable back to a different stack slot if it is already on the
-- stack.
--
-- This happens a lot: for example when function arguments are passed
-- on the stack and need to be immediately saved across a call, we
-- want to just leave them where they are on the stack.
--
procMiddle :: LabelMap StackMap -> CmmNode e x -> StackMap -> StackMap
procMiddle stackmaps node sm
  = case node of
     CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)
       -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) }
        where loc = getStackLoc area off stackmaps
     CmmAssign (CmmLocal r) _other
       -> sm { sm_regs = delFromUFM (sm_regs sm) r }
     _other
       -> sm

getStackLoc :: Area -> ByteOff -> LabelMap StackMap -> StackLoc
getStackLoc Old       n _         = n
getStackLoc (Young l) n stackmaps =
  case mapLookup l stackmaps of
    Nothing -> pprPanic "getStackLoc" (ppr l)
    Just sm -> sm_sp sm - sm_args sm + n

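-- For example (hypothetical numbers): if the continuation L has
-- sm_sp = 32 and sm_args = 8, then getStackLoc (Young L) 0 stackmaps
-- is 24, i.e. the young area for L starts 24 bytes from the base, as
-- in the StackMap diagram above.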

-- -----------------------------------------------------------------------------
-- Handling stack allocation for a last node

-- We take a single last node and turn it into:
--
--    C1 (some statements)
--    Sp = Sp + N
--    C2 (some more statements)
--    call f()          -- the actual last node
--
-- plus possibly some more blocks (we may have to add some fixup code
-- between the last node and the continuation).
--
-- C1: is the code for saving the variables across this last node onto
-- the stack, if the continuation is a call or jumps to a proc point.
--
-- C2: if the last node is a safe foreign call, we have to inject some
-- extra code that goes *after* the Sp adjustment.

handleLastNode
   :: DynFlags -> ProcPointSet -> LabelMap CmmLocalLive -> LabelMap ByteOff
   -> LabelMap StackMap -> StackMap -> CmmTickScope
   -> Block CmmNode O O
   -> CmmNode O C
   -> UniqSM
      ( [CmmNode O O]      -- nodes to go *before* the Sp adjustment
      , ByteOff            -- amount to adjust Sp
      , CmmNode O C        -- new last node
      , [CmmBlock]         -- new blocks
      , LabelMap StackMap  -- stackmaps for the continuations
      )

handleLastNode dflags procpoints liveness cont_info stackmaps
               stack0@StackMap { sm_sp = sp0 } tscp middle last
 = case last of
    --  At each return / tail call,
    --  adjust Sp to point to the last argument pushed, which
    --  is cml_args, after popping any other junk from the stack.
    CmmCall{ cml_cont = Nothing, .. } -> do
      let sp_off = sp0 - cml_args
      return ([], sp_off, last, [], mapEmpty)

    --  At each CmmCall with a continuation:
    CmmCall{ cml_cont = Just cont_lbl, .. } ->
       return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off

    CmmForeignCall{ succ = cont_lbl, .. } -> do
       return $ lastCall cont_lbl (wORD_SIZE dflags) ret_args ret_off
            -- one word of args: the return address

    CmmBranch {}     -> handleBranches
    CmmCondBranch {} -> handleBranches
    CmmSwitch {}     -> handleBranches

  where
     -- Calls and ForeignCalls are handled the same way:
     lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
              -> ( [CmmNode O O]
                 , ByteOff
                 , CmmNode O C
                 , [CmmBlock]
                 , LabelMap StackMap
                 )
     lastCall lbl cml_args cml_ret_args cml_ret_off
      = ( assignments
        , spOffsetForCall sp0 cont_stack cml_args
        , last
        , [] -- no new blocks
        , mapSingleton lbl cont_stack )
      where
         (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off


     prepareStack lbl cml_ret_args cml_ret_off
       | Just cont_stack <- mapLookup lbl stackmaps
             -- If we have already seen this continuation before, then
             -- we just have to make the stack look the same:
       = (fixupStack stack0 cont_stack, cont_stack)
             -- Otherwise, we have to allocate the stack frame
       | otherwise
       = (save_assignments, new_cont_stack)
       where
        (new_cont_stack, save_assignments)
           = setupStackFrame dflags lbl liveness cml_ret_off cml_ret_args stack0


     -- For other last nodes (branches), if any of the targets is a
     -- proc point, we have to set up the stack to match what the proc
     -- point is expecting.
     --
     handleBranches :: UniqSM ( [CmmNode O O]
                              , ByteOff
                              , CmmNode O C
                              , [CmmBlock]
                              , LabelMap StackMap )

     handleBranches
         -- Note [diamond proc point]
       | Just l <- futureContinuation middle
       , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
       = do
         let cont_args = mapFindWithDefault 0 l cont_info
             (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
             out = mapFromList [ (l', cont_stack)
                               | l' <- successors last ]
         return ( assigs
                , spOffsetForCall sp0 cont_stack (wORD_SIZE dflags)
                , last
                , []
                , out)

       | otherwise = do
         pps <- mapM handleBranch (successors last)
         let lbl_map :: LabelMap Label
             lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
             fix_lbl l = mapFindWithDefault l l lbl_map
         return ( []
                , 0
                , mapSuccessors fix_lbl last
                , concat [ blk | (_,_,_,blk) <- pps ]
                , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )

     -- For each successor of this block
     handleBranch :: BlockId -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
     handleBranch l
        --   (a) if the successor already has a stackmap, we need to
        --       shuffle the current stack to make it look the same.
        --       We have to insert a new block to make this happen.
        | Just stack2 <- mapLookup l stackmaps
        = do
             let assigs = fixupStack stack0 stack2
             (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 tscp assigs
             return (l, tmp_lbl, stack2, block)

        --   (b) if the successor is a proc point, save everything
        --       on the stack.
        | l `setMember` procpoints
        = do
             let cont_args = mapFindWithDefault 0 l cont_info
                 (stack2, assigs) =
                      setupStackFrame dflags l liveness (sm_ret_off stack0)
                                      cont_args stack0
             (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 tscp assigs
             return (l, tmp_lbl, stack2, block)

        --   (c) otherwise, the current StackMap is the StackMap for
        --       the continuation.  But we must remember to remove any
        --       variables from the StackMap that are *not* live at
        --       the destination, because this StackMap might be used
        --       by fixupStack if this is a join point.
        | otherwise = return (l, l, stack1, [])
        where live = mapFindWithDefault (panic "handleBranch") l liveness
              stack1 = stack0 { sm_regs = filterUFM is_live (sm_regs stack0) }
              is_live (r,_) = r `elemRegSet` live


makeFixupBlock :: DynFlags -> ByteOff -> Label -> StackMap
               -> CmmTickScope -> [CmmNode O O]
               -> UniqSM (Label, [CmmBlock])
makeFixupBlock dflags sp0 l stack tscope assigs
  | null assigs && sp0 == sm_sp stack = return (l, [])
  | otherwise = do
    tmp_lbl <- newBlockId
    let sp_off = sp0 - sm_sp stack
        maybeAddUnwind block
          | debugLevel dflags > 0
          = block `blockSnoc` CmmUnwind [(Sp, Just unwind_val)]
          | otherwise
          = block
          where unwind_val = cmmOffset dflags (CmmReg spReg) (sm_sp stack)
        block = blockJoin (CmmEntry tmp_lbl tscope)
                          ( maybeAddSpAdj dflags sp_off
                           $ maybeAddUnwind
                           $ blockFromList assigs )
                          (CmmBranch l)
    return (tmp_lbl, [block])


-- Sp is currently pointing to current_sp,
-- we want it to point to
--    (sm_sp cont_stack - sm_args cont_stack + args)
-- so the difference is
--    sp0 - (sm_sp cont_stack - sm_args cont_stack + args)
spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
spOffsetForCall current_sp cont_stack args
  = current_sp - (sm_sp cont_stack - sm_args cont_stack + args)

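-- For example (made-up numbers): on entry sp0 = 0, and the call's
-- continuation has sm_sp = 24 and sm_args = 8; with args = 8 (one
-- word for the return address) the offset is 0 - (24 - 8 + 8) = -24,
-- which manifests as the instruction Sp = Sp - 24.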

-- | create a sequence of assignments to establish the new StackMap,
-- given the old StackMap.
fixupStack :: StackMap -> StackMap -> [CmmNode O O]
fixupStack old_stack new_stack = concatMap move new_locs
 where
     old_map  = sm_regs old_stack
     new_locs = stackSlotRegs new_stack

     move (r,n)
       | Just (_,m) <- lookupUFM old_map r, n == m = []
       | otherwise = [CmmStore (CmmStackSlot Old n)
                               (CmmReg (CmmLocal r))]

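-- For example (a hypothetical join point): if the target StackMap has
-- x at Old+16 and y at Old+24, and the current StackMap already has x
-- at Old+16 but y only in a register, fixupStack emits a single store
-- of y to Old+24; x needs no code because it is already in place.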


setupStackFrame
             :: DynFlags
             -> BlockId                 -- label of continuation
             -> LabelMap CmmLocalLive   -- liveness
             -> ByteOff                 -- updfr
             -> ByteOff                 -- bytes of return values on stack
             -> StackMap                -- current StackMap
             -> (StackMap, [CmmNode O O])

setupStackFrame dflags lbl liveness updfr_off ret_args stack0
  = (cont_stack, assignments)
  where
      -- get the set of LocalRegs live in the continuation
      live = mapFindWithDefault Set.empty lbl liveness

      -- the stack from the base to updfr_off is off-limits.
      -- our new stack frame contains:
      --   * saved live variables
      --   * the return address [young(C) + 8]
      --   * the args for the call,
      --     which are replaced by the return values at the return
      --     point.

      -- everything up to updfr_off is off-limits
      -- stack1 contains updfr_off, plus everything we need to save
      (stack1, assignments) = allocate dflags updfr_off live stack0

      -- And the Sp at the continuation is:
      --   sm_sp stack1 + ret_args
      cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args
                         , sm_args = ret_args
                         , sm_ret_off = updfr_off
                         }

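-- For example (made-up numbers): if allocate returns stack1 with
-- sm_sp = 16 and the continuation expects ret_args = 8 bytes of
-- return values, the continuation's StackMap gets sm_sp = 24 and
-- sm_args = 8, so its young area is the 8 bytes from offset 16 to 24.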

-- -----------------------------------------------------------------------------
-- Note [diamond proc point]
--
-- This special case looks for the pattern we get from a typical
-- tagged case expression:
--
--    Sp[young(L1)] = L1
--    if (R1 & 7) != 0 goto L1 else goto L2
--  L2:
--    call [R1] returns to L1
--  L1: live: {y}
--    x = R1
--
-- If we let the generic case handle this, we get
--
--    Sp[-16] = L1
--    if (R1 & 7) != 0 goto L1a else goto L2
--  L2:
--    Sp[-8] = y
--    Sp = Sp - 16
--    call [R1] returns to L1
--  L1a:
--    Sp[-8] = y
--    Sp = Sp - 16
--    goto L1
--  L1:
--    x = R1
--
-- The code for saving the live vars is duplicated in each branch, and
-- furthermore there is an extra jump in the fast path (assuming L1 is
-- a proc point, which it probably is if there is a heap check).
--
-- So to fix this we want to set up the stack frame before the
-- conditional jump.  How do we know when to do this, and when it is
-- safe?  The basic idea is, when we see the assignment
--
--    Sp[young(L)] = L
--
-- we know that
--   * we are definitely heading for L
--   * there can be no more reads from another stack area, because young(L)
--     overlaps with it.
--
-- We don't necessarily know that everything live at L is live now
-- (some might be assigned between here and the jump to L).  So we
-- simplify and only do the optimisation when we see
--
--   (1) a block containing an assignment of a return address L
--   (2) ending in a branch where one (and only one) continuation goes to L,
--       and no other continuations go to proc points.
--
-- then we allocate the stack frame for L at the end of the block,
-- before the branch.
--
-- We could generalise (2), but that would make it a bit more
-- complicated to handle, and this currently catches the common case.

futureContinuation :: Block CmmNode O O -> Maybe BlockId
futureContinuation middle = foldBlockNodesB f middle Nothing
   where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
         f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _
               = Just l
         f _ r = r

-- -----------------------------------------------------------------------------
-- Saving live registers

-- | Given a set of live registers and a StackMap, save all the registers
-- on the stack and return the new StackMap and the assignments to do
-- the saving.
--
allocate :: DynFlags -> ByteOff -> LocalRegSet -> StackMap
         -> (StackMap, [CmmNode O O])
allocate dflags ret_off live stackmap@StackMap{ sm_sp = sp0
                                              , sm_regs = regs0 }
 =
   -- we only have to save regs that are not already in a slot
   let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
       regs1   = filterUFM (\(r,_) -> elemRegSet r live) regs0
   in

   -- make a map of the stack
   let stack = reverse $ Array.elems $
               accumArray (\_ x -> x) Empty (1, toWords dflags (max sp0 ret_off)) $
                 ret_words ++ live_words
            where ret_words =
                   [ (x, Occupied)
                   | x <- [ 1 .. toWords dflags ret_off] ]
                  live_words =
                   [ (toWords dflags x, Occupied)
                   | (r,off) <- nonDetEltsUFM regs1,
                     -- See Note [Unique Determinism and code generation]
                     let w = localRegBytes dflags r,
                     x <- [ off, off - wORD_SIZE dflags .. off - w + 1] ]
   in

   -- Pass over the stack: find slots to save all the new live variables,
   -- choosing the oldest slots first (hence a foldr).
   let
       save slot ([], stack, n, assigs, regs) -- no more regs to save
          = ([], slot:stack, plusW dflags n 1, assigs, regs)
       save slot (to_save, stack, n, assigs, regs)
          = case slot of
               Occupied -> (to_save, Occupied:stack, plusW dflags n 1, assigs, regs)
               Empty
                 | Just (stack', r, to_save') <-
                       select_save to_save (slot:stack)
                 -> let assig = CmmStore (CmmStackSlot Old n')
                                         (CmmReg (CmmLocal r))
                        n' = plusW dflags n 1
                    in
                        (to_save', stack', n', assig : assigs, (r,(r,n')):regs)

                 | otherwise
                 -> (to_save, slot:stack, plusW dflags n 1, assigs, regs)

       -- we should do better here: right now we'll fit the smallest first,
       -- but it would make more sense to fit the biggest first.
       select_save :: [LocalReg] -> [StackSlot]
                   -> Maybe ([StackSlot], LocalReg, [LocalReg])
       select_save regs stack = go regs []
         where go []     _no_fit = Nothing
               go (r:rs) no_fit
                 | Just rest <- dropEmpty words stack
                 = Just (replicate words Occupied ++ rest, r, rs++no_fit)
                 | otherwise
                 = go rs (r:no_fit)
                 where words = localRegWords dflags r

       -- fill in empty slots as much as possible
       (still_to_save, save_stack, n, save_assigs, save_regs)
          = foldr save (to_save, [], 0, [], []) stack

       -- push any remaining live vars on the stack
       (push_sp, push_assigs, push_regs)
          = foldr push (n, [], []) still_to_save
          where
              push r (n, assigs, regs)
                = (n', assig : assigs, (r,(r,n')) : regs)
                where
                  n' = n + localRegBytes dflags r
                  assig = CmmStore (CmmStackSlot Old n')
                                   (CmmReg (CmmLocal r))

       trim_sp
          | not (null push_regs) = push_sp
          | otherwise
          = plusW dflags n (- length (takeWhile isEmpty save_stack))

       final_regs = regs1 `addListToUFM` push_regs
                          `addListToUFM` save_regs

   in
  -- XXX should be an assert
   if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else

   if (trim_sp .&. (wORD_SIZE dflags - 1)) /= 0 then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else

   ( stackmap { sm_regs = final_regs , sm_sp = trim_sp }
   , push_assigs ++ save_assigs )

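-- For example (a made-up 64-bit scenario): with ret_off = 8, sp0 = 24,
-- no regs already on the stack, and one new live word-sized variable x,
-- the word map (for offsets 8, 16, 24) is [Occupied, Empty, Empty]; x
-- fills the oldest Empty slot, emitting Sp[Old + 16] := x, and the
-- unused top word is trimmed away, so the result has sm_sp = 16.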

-- -----------------------------------------------------------------------------
-- Manifesting Sp

-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp.  The
-- block looks like this:
--
--    middle_pre       -- the middle nodes
--    Sp = Sp + sp_off -- Sp adjustment goes here
--    last             -- the last node
--
-- And we have some extra blocks too (that don't contain Sp adjustments)
--
-- The adjustment for middle_pre will be different from that for
-- middle_post, because the Sp adjustment intervenes.
--
manifestSp
   :: DynFlags
   -> LabelMap StackMap  -- StackMaps for other blocks
   -> StackMap           -- StackMap for this block
   -> ByteOff            -- Sp on entry to the block
   -> ByteOff            -- SpHigh
   -> CmmNode C O        -- first node
   -> [CmmNode O O]      -- middle
   -> ByteOff            -- sp_off
   -> CmmNode O C        -- last node
   -> [CmmBlock]         -- new blocks
   -> [CmmBlock]         -- final blocks with Sp manifest

manifestSp dflags stackmaps stack0 sp0 sp_high
           first middle_pre sp_off last fixup_blocks
  = final_block : fixup_blocks'
  where
    area_off = getAreaOff stackmaps

    adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
    adj_pre_sp  = mapExpDeep (areaToSp dflags sp0            sp_high area_off)
    adj_post_sp = mapExpDeep (areaToSp dflags (sp0 - sp_off) sp_high area_off)

    -- Add unwind pseudo-instruction at the beginning of each block to
    -- document Sp level for debugging
    add_initial_unwind block
      | debugLevel dflags > 0
      = CmmUnwind [(Sp, Just sp_unwind)] `blockCons` block
      | otherwise
      = block
      where sp_unwind = CmmRegOff spReg (sp0 - wORD_SIZE dflags)

    -- Add unwind pseudo-instruction right before the Sp adjustment
    -- if there is one.
    add_adj_unwind block
      | debugLevel dflags > 0
      , sp_off /= 0
      = block `blockSnoc` CmmUnwind [(Sp, Just sp_unwind)]
      | otherwise
      = block
      where sp_unwind = CmmRegOff spReg (sp0 - wORD_SIZE dflags - sp_off)

    final_middle = maybeAddSpAdj dflags sp_off
                 . add_adj_unwind
                 . add_initial_unwind
                 . blockFromList
                 . map adj_pre_sp
                 . elimStackStores stack0 stackmaps area_off
                 $ middle_pre

    final_last    = optStackCheck (adj_post_sp last)

    final_block   = blockJoin first final_middle final_last

    fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks


getAreaOff :: LabelMap StackMap -> (Area -> StackLoc)
getAreaOff _ Old = 0
getAreaOff stackmaps (Young l) =
  case mapLookup l stackmaps of
    Just sm -> sm_sp sm - sm_args sm
    Nothing -> pprPanic "getAreaOff" (ppr l)


maybeAddSpAdj :: DynFlags -> ByteOff -> Block CmmNode O O -> Block CmmNode O O
maybeAddSpAdj _      0      block = block
maybeAddSpAdj dflags sp_off block = block `blockSnoc` adj
  where
    adj = CmmAssign spReg (cmmOffset dflags (CmmReg spReg) sp_off)

{-
Sp(L) is the Sp offset on entry to block L relative to the base of the
OLD area.

SpArgs(L) is the size of the young area for L, i.e. the number of
arguments.

 - in block L, each reference to [old + N] turns into
   [Sp + Sp(L) - N]

 - in block L, each reference to [young(L') + N] turns into
   [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]

 - be careful with the last node of each block: Sp has already been adjusted
   to be Sp + Sp(L) - Sp(L')
-}

areaToSp :: DynFlags -> ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr

areaToSp dflags sp_old _sp_hwm area_off (CmmStackSlot area n)
  = cmmOffset dflags (CmmReg spReg) (sp_old - area_off area - n)
    -- Replace (CmmStackSlot area n) with an offset from Sp

areaToSp dflags _ sp_hwm _ (CmmLit CmmHighStackMark)
  = mkIntExpr dflags sp_hwm
    -- Replace CmmHighStackMark with the number of bytes of stack used,
    -- the sp_hwm.  See Note [Stack usage] in StgCmmHeap

areaToSp dflags _ _ _ (CmmMachOp (MO_U_Lt _) args)
  | falseStackCheck args
  = zeroExpr dflags
areaToSp dflags _ _ _ (CmmMachOp (MO_U_Ge _) args)
  | falseStackCheck args
  = mkIntExpr dflags 1
    -- Replace a stack-overflow test that cannot fail with a no-op
    -- See Note [Always false stack check]

areaToSp _ _ _ _ other = other

-- | Determine whether a stack check cannot fail.
falseStackCheck :: [CmmExpr] -> Bool
falseStackCheck [ CmmMachOp (MO_Sub _)
                      [ CmmRegOff (CmmGlobal Sp) x_off
                      , CmmLit (CmmInt y_lit _)]
                , CmmReg (CmmGlobal SpLim)]
  = fromIntegral x_off >= y_lit
falseStackCheck _ = False

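-- For instance, a check of the shape (Sp + 24) - 16 < SpLim has
-- x_off = 24 and y_lit = 16; since 24 >= 16 it can never be True, so
-- areaToSp rewrites it to 0 (False).  With x_off = 8 and y_lit = 16
-- the check might fail, so it is left alone.
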
-- Note [Always false stack check]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- We can optimise stack checks of the form
--
--   if ((Sp + x) - y < SpLim) then .. else ..
--
-- where x and y are non-negative integer byte offsets.  Since we know that
-- SpLim <= Sp (remember the stack grows downwards), this test must
-- yield False if (x >= y), so we can rewrite the comparison to False.
-- A subsequent sinking pass will later drop the dead code.
-- Optimising this away depends on knowing that SpLim <= Sp, so it is
-- really the job of the stack layout algorithm, hence we do it now.
--
-- The control flow optimiser may negate a conditional to increase
-- the likelihood of a fallthrough if the branch is not taken.  But
-- not every conditional is inverted as the control flow optimiser
-- places some requirements on the predecessors of both branch targets.
-- So we better look for the inverted comparison too.

optStackCheck :: CmmNode O C -> CmmNode O C
optStackCheck n = -- Note [Always false stack check]
 case n of
   CmmCondBranch (CmmLit (CmmInt 0 _)) _true false _ -> CmmBranch false
   CmmCondBranch (CmmLit (CmmInt _ _)) true _false _ -> CmmBranch true
   other -> other


-- -----------------------------------------------------------------------------

-- | Eliminate stores of the form
--
--    Sp[area+n] = r
--
-- when we know that r is already in the same slot as Sp[area+n].  We
-- could do this in a later optimisation pass, but that would involve
-- a separate analysis and we already have the information to hand
-- here.  It helps clean up some extra stack stores in common cases.
--
-- Note that we may have to modify the StackMap as we walk through the
-- code using procMiddle, since an assignment to a variable in the
-- StackMap will invalidate its mapping there.
--
elimStackStores :: StackMap
                -> LabelMap StackMap
                -> (Area -> ByteOff)
                -> [CmmNode O O]
                -> [CmmNode O O]
elimStackStores stackmap stackmaps area_off nodes
  = go stackmap nodes
  where
    go _stackmap [] = []
    go stackmap (n:ns)
     = case n of
         CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r))
            | Just (_,off) <- lookupUFM (sm_regs stackmap) r
            , area_off area + m == off
            -> go stackmap ns
         _otherwise
            -> n : go (procMiddle stackmaps n stackmap) ns


-- -----------------------------------------------------------------------------
-- Update info tables to include stack liveness


setInfoTableStackMap :: DynFlags -> LabelMap StackMap -> CmmDecl -> CmmDecl
setInfoTableStackMap dflags stackmaps (CmmProc top_info@TopInfo{..} l v g)
  = CmmProc top_info{ info_tbls = mapMapWithKey fix_info info_tbls } l v g
  where
    fix_info lbl info_tbl@CmmInfoTable{ cit_rep = StackRep _ } =
       info_tbl { cit_rep = StackRep (get_liveness lbl) }
    fix_info _ other = other

    get_liveness :: BlockId -> Liveness
    get_liveness lbl
      = case mapLookup lbl stackmaps of
          Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl <+> ppr info_tbls)
          Just sm -> stackMapToLiveness dflags sm

setInfoTableStackMap _ _ d = d


stackMapToLiveness :: DynFlags -> StackMap -> Liveness
stackMapToLiveness dflags StackMap{..} =
   reverse $ Array.elems $
        accumArray (\_ x -> x) True (toWords dflags sm_ret_off + 1,
                                     toWords dflags (sm_sp - sm_args)) live_words
   where
     live_words = [ (toWords dflags off, False)
                  | (r,off) <- nonDetEltsUFM sm_regs
                  , isGcPtrType (localRegType r) ]
                  -- See Note [Unique Determinism and code generation]

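-- For example (hypothetical numbers): with sm_ret_off = 8, sm_sp = 32
-- and sm_args = 8 on a 64-bit target, the bitmap covers words 2..3 of
-- the frame; a GC pointer saved at byte offset 24 marks word 3 as a
-- live pointer (False), while word 2 stays True (dead or non-pointer).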

-- -----------------------------------------------------------------------------
-- Lowering safe foreign calls

{-
Note [Lower safe foreign calls]

We start with

   Sp[young(L1)] = L1
 ,-----------------------
 | r1 = foo(x,y,z) returns to L1
 '-----------------------
 L1:
   R1 = r1 -- copyIn, inserted by mkSafeCall
   ...

the stack layout algorithm will arrange to save and reload everything
live across the call.  Our job now is to expand the call so we get

   Sp[young(L1)] = L1
 ,-----------------------
 | SAVE_THREAD_STATE()
 | token = suspendThread(BaseReg, interruptible)
 | r = foo(x,y,z)
 | BaseReg = resumeThread(token)
 | LOAD_THREAD_STATE()
 | R1 = r  -- copyOut
 | jump Sp[0]
 '-----------------------
 L1:
   r = R1 -- copyIn, inserted by mkSafeCall
   ...

Note the copyOut, which saves the results in the places that L1 is
expecting them (see Note [safe foreign call convention]).  Note also
that the safe foreign call is replaced by an unsafe one in the Cmm graph.
-}

lowerSafeForeignCall :: DynFlags -> CmmBlock -> UniqSM CmmBlock
lowerSafeForeignCall dflags block
  | (entry@(CmmEntry _ tscp), middle, CmmForeignCall { .. }) <- blockSplit block
  = do
    -- Both 'id' and 'new_base' are KindNonPtr because they're
    -- RTS-only objects and are not subject to garbage collection
    id <- newTemp (bWord dflags)
    new_base <- newTemp (cmmRegType dflags (CmmGlobal BaseReg))
    let (caller_save, caller_load) = callerSaveVolatileRegs dflags
    save_state_code <- saveThreadState dflags
    load_state_code <- loadThreadState dflags
    let suspend = save_state_code <*>
                  caller_save <*>
                  mkMiddle (callSuspendThread dflags id intrbl)
        midCall = mkUnsafeCall tgt res args
        resume  = mkMiddle (callResumeThread new_base id) <*>
                  -- Assign the result to BaseReg: we
                  -- might now have a different Capability!
                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*>
                  caller_load <*>
                  load_state_code

        (_, regs, copyout) =
             copyOutOflow dflags NativeReturn Jump (Young succ)
                            (map (CmmReg . CmmLocal) res)
                            ret_off []

        -- NB. after resumeThread returns, the top-of-stack probably contains
        -- the stack frame for succ, but it might not: if the current thread
        -- received an exception during the call, then the stack might be
        -- different.  Hence we continue by jumping to the top stack frame,
        -- not by jumping to succ.
        jump = CmmCall { cml_target    = entryCode dflags $
                                         CmmLoad (CmmReg spReg) (bWord dflags)
                       , cml_cont      = Just succ
                       , cml_args_regs = regs
                       , cml_args      = widthInBytes (wordWidth dflags)
                       , cml_ret_args  = ret_args
                       , cml_ret_off   = ret_off }

    graph' <- lgraphOfAGraph ( suspend <*>
                               midCall <*>
                               resume  <*>
                               copyout <*>
                               mkLast jump, tscp)

    case toBlockList graph' of
      [one] -> let (_, middle', last) = blockSplit one
               in return (blockJoin entry (middle `blockAppend` middle') last)
      _ -> panic "lowerSafeForeignCall0"

  -- Block doesn't end in a safe foreign call:
  | otherwise = return block


foreignLbl :: FastString -> CmmExpr
foreignLbl name = CmmLit (CmmLabel (mkForeignLabel name Nothing ForeignLabelInExternalPackage IsFunction))

callSuspendThread :: DynFlags -> LocalReg -> Bool -> CmmNode O O
callSuspendThread dflags id intrbl =
  CmmUnsafeForeignCall
       (ForeignTarget (foreignLbl (fsLit "suspendThread"))
            (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint] CmmMayReturn))
       [id] [CmmReg (CmmGlobal BaseReg), mkIntExpr dflags (fromEnum intrbl)]

callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
callResumeThread new_base id =
  CmmUnsafeForeignCall
       (ForeignTarget (foreignLbl (fsLit "resumeThread"))
            (ForeignConvention CCallConv [AddrHint] [AddrHint] CmmMayReturn))
       [new_base] [CmmReg (CmmLocal id)]

-- -----------------------------------------------------------------------------

plusW :: DynFlags -> ByteOff -> WordOff -> ByteOff
plusW dflags b w = b + w * wORD_SIZE dflags

data StackSlot = Occupied | Empty
     -- Occupied: a return address or part of an update frame

instance Outputable StackSlot where
  ppr Occupied = text "XXX"
  ppr Empty    = text "---"

dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
dropEmpty 0 ss           = Just ss
dropEmpty n (Empty : ss) = dropEmpty (n-1) ss
dropEmpty _ _            = Nothing

isEmpty :: StackSlot -> Bool
isEmpty Empty = True
isEmpty _     = False

localRegBytes :: DynFlags -> LocalReg -> ByteOff
localRegBytes dflags r
    = roundUpToWords dflags (widthInBytes (typeWidth (localRegType r)))

localRegWords :: DynFlags -> LocalReg -> WordOff
localRegWords dflags = toWords dflags . localRegBytes dflags

toWords :: DynFlags -> ByteOff -> WordOff
toWords dflags x = x `quot` wORD_SIZE dflags


insertReloads :: StackMap -> [CmmNode O O]
insertReloads stackmap =
   [ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot Old sp)
                                     (localRegType r))
   | (r,sp) <- stackSlotRegs stackmap
   ]


stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
stackSlotRegs sm = nonDetEltsUFM (sm_regs sm)
  -- See Note [Unique Determinism and code generation]