{-# LANGUAGE RecordWildCards, GADTs #-}
module CmmLayoutStack (
       cmmLayoutStack, setInfoTableStackMap, cmmSink
  ) where

import StgCmmUtils      ( callerSaveVolatileRegs ) -- XXX
import StgCmmForeign    ( saveThreadState, loadThreadState ) -- XXX

import Cmm
import BlockId
import CLabel
import CmmUtils
import MkGraph
import Module
import ForeignCall
import CmmLive
import CmmProcPoint
import SMRep
import Hoopl
import Constants
import UniqSupply
import Maybes
import UniqFM
import Util

import FastString
import Outputable
import Data.Map (Map)
import qualified Data.Map as Map
import qualified Data.Set as Set
import Control.Monad.Fix
import Data.Array as Array
import Data.Bits
import Data.List (nub, partition)
import Control.Monad (liftM)

#include "HsVersions.h"


data StackSlot = Occupied | Empty
     -- Occupied: a return address or part of an update frame

instance Outputable StackSlot where
  ppr Occupied = ptext (sLit "XXX")
  ppr Empty    = ptext (sLit "---")

-- All stack locations are expressed as positive byte offsets from the
-- "base", which is defined to be the address above the return address
-- on the stack on entry to this CmmProc.
--
-- Lower addresses have higher StackLocs.
--
type StackLoc = ByteOff

{-
A StackMap describes the stack at any given point.  At a continuation
it has a particular layout, like this:

         |             | <- base
         |-------------|
         |     ret0    | <- base + 8
         |-------------|
         .  upd frame  . <- base + sm_ret_off
         |-------------|
         |             |
         .    vars     .
         . (live/dead) .
         |             | <- base + sm_sp - sm_args
         |-------------|
         |    ret1     |
         .  ret vals   . <- base + sm_sp    (<--- Sp points here)
         |-------------|

Why do we include the final return address (ret0) in our stack map?  I
have absolutely no idea, but it seems to be done that way consistently
in the rest of the code generator, so I played along here. --SDM

Note that we will be constructing an info table for the continuation
(ret1), which needs to describe the stack down to, but not including,
the update frame (or ret0, if there is no update frame).
-}
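-- A worked instance of the picture above (illustrative numbers only,
-- 64-bit target): a continuation with
--
--   StackMap { sm_sp = 48, sm_args = 16, sm_ret_off = 24, ... }
--
-- has Sp at byte offset 48 from the base; its young area (ret1 plus
-- return values, 16 bytes) spans offsets 32..48, starting at
-- sm_sp - sm_args; saved variables may live between offsets 24 and 32;
-- and offsets 0..24 (ret0 and the update frame) are not described by
-- the continuation's info table.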

data StackMap = StackMap
 { sm_sp   :: StackLoc
       -- ^ the offset of Sp relative to the base on entry
       -- to this block.
 , sm_args :: ByteOff
       -- ^ the number of bytes of arguments in the area for this block
       -- Defn: the offset of young(L) relative to the base is given by
       -- (sm_sp - sm_args) of the StackMap for block L.
 , sm_ret_off :: ByteOff
       -- ^ Number of bytes of stack that we do not describe with an info
       -- table, because it contains an update frame.
 , sm_regs :: UniqFM (LocalReg,StackLoc)
       -- ^ regs on the stack
 }

instance Outputable StackMap where
  ppr StackMap{..} =
     text "Sp = " <> int sm_sp $$
     text "sm_args = " <> int sm_args $$
     text "sm_ret_off = " <> int sm_ret_off $$
     text "sm_regs = " <> ppr (eltsUFM sm_regs)


cmmLayoutStack :: ProcPointSet -> ByteOff -> CmmGraph
               -> UniqSM (CmmGraph, BlockEnv StackMap)
cmmLayoutStack procpoints entry_args
               graph0@(CmmGraph { g_entry = entry })
  = do
    pprTrace "cmmLayoutStack" (ppr entry_args) $ return ()
    (graph, liveness) <- removeDeadAssignments graph0
    pprTrace "liveness" (ppr liveness) $ return ()
    let blocks = postorderDfs graph

    (final_stackmaps, final_high_sp, new_blocks) <-
          mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
            layout procpoints liveness entry entry_args
                   rec_stackmaps rec_high_sp blocks

    new_blocks' <- mapM lowerSafeForeignCall new_blocks

    pprTrace "Sp HWM" (ppr final_high_sp) $
       return (ofBlockList entry new_blocks', final_stackmaps)

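-- The mfix above ties a recursive knot: 'layout' lazily consumes the
-- final stack maps and Sp high-water mark that it is itself computing,
-- so a block can refer to the stack map of a successor that has not
-- been laid out yet.  A minimal sketch of the idiom (not compiler
-- code; 'buildOnePass' is a hypothetical single-pass builder):
--
--   (final_env, _aux) <- mfix $ \ ~(rec_env, _aux) ->
--                          buildOnePass rec_env inputs
--
-- This terminates only because the lazy pattern ~(..) ensures the
-- recursive result is never forced before it is defined.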

layout :: BlockSet                      -- proc points
       -> BlockEnv CmmLive              -- liveness
       -> BlockId                       -- entry
       -> ByteOff                       -- stack args on entry

       -> BlockEnv StackMap             -- [final] stack maps
       -> ByteOff                       -- [final] Sp high water mark

       -> [CmmBlock]                    -- [in] blocks

       -> UniqSM
          ( BlockEnv StackMap           -- [out] stack maps
          , ByteOff                     -- [out] Sp high water mark
          , [CmmBlock]                  -- [out] new blocks
          )

layout procpoints liveness entry entry_args final_stackmaps final_hwm blocks
  = go blocks init_stackmap entry_args []
  where
    (updfr, cont_info) = collectContInfo blocks

    init_stackmap = mapSingleton entry StackMap{ sm_sp      = entry_args
                                               , sm_args    = entry_args
                                               , sm_ret_off = updfr
                                               , sm_regs    = emptyUFM
                                               }

    go [] acc_stackmaps acc_hwm acc_blocks
      = return (acc_stackmaps, acc_hwm, acc_blocks)

    go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
      = do
       let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0

       let stack0@StackMap { sm_sp = sp0 }
               = mapFindWithDefault
                     (pprPanic "no stack map for" (ppr entry_lbl))
                     entry_lbl acc_stackmaps

       pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return ()

       -- (a) Update the stack map to include the effects of
       --     assignments in this block
       let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0

       -- (b) Insert assignments to reload all the live variables if this
       --     block is a proc point
       let middle1 = if entry_lbl `setMember` procpoints
                        then foldr blockCons middle0 (insertReloads stack0)
                        else middle0

       -- (c) Look at the last node and if we are making a call or
       --     jumping to a proc point, we must save the live
       --     variables, adjust Sp, and construct the StackMaps for
       --     each of the successor blocks.  See handleLastNode for
       --     details.
       (middle2, sp_off, last1, fixup_blocks, out)
           <- handleLastNode procpoints liveness cont_info
                             acc_stackmaps stack1 middle0 last0

       pprTrace "layout(out)" (ppr out) $ return ()

       -- (d) Manifest Sp: run over the nodes in the block and replace
       --     CmmStackSlot with CmmLoad from Sp with a concrete offset.
       --
       -- our block:
       --    middle1          -- the original middle nodes
       --    middle2          -- live variable saves from handleLastNode
       --    Sp = Sp + sp_off -- Sp adjustment goes here
       --    last1            -- the last node
       --
       let middle_pre = blockToList $ foldl blockSnoc middle1 middle2

           sp_high = final_hwm - entry_args
              -- The stack check value is adjusted by the Sp offset on
              -- entry to the proc, which is entry_args.  We are
              -- assuming that we only do a stack check at the
              -- beginning of a proc, and we don't modify Sp before the
              -- check.

           final_blocks = manifestSp final_stackmaps stack0 sp0 sp_high entry0
                              middle_pre sp_off last1 fixup_blocks

           acc_stackmaps' = mapUnion acc_stackmaps out

           hwm' = maximum (acc_hwm : (sp0 - sp_off) : map sm_sp (mapElems out))

       go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)

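-- A worked example of the high-water-mark arithmetic (illustrative
-- numbers only): with entry_args = 8, a block entered at sp0 = 8 whose
-- call needs sp_off = -24 leaves Sp at byte offset sp0 - sp_off = 32
-- from the base, so final_hwm >= 32 and CmmHighStackMark in the entry
-- stack check becomes sp_high = final_hwm - entry_args = 24.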
-- -----------------------------------------------------------------------------

-- This doesn't seem right somehow.  We need to find out whether this
-- proc will push some update frame material at some point, so that we
-- can avoid using that area of the stack for spilling.  The
-- updfr_space field of the CmmProc *should* tell us, but it doesn't
-- (I think maybe it gets filled in later when we do proc-point
-- splitting).
--
-- So we'll just take the max of all the cml_ret_offs.  This could be
-- unnecessarily pessimistic, but probably not in the code we
-- generate.

collectContInfo :: [CmmBlock] -> (ByteOff, BlockEnv ByteOff)
collectContInfo blocks
  = (maximum ret_offs, mapFromList (catMaybes mb_argss))
 where
  (mb_argss, ret_offs) = mapAndUnzip get_cont blocks

  get_cont b =
     case lastNode b of
        CmmCall { cml_cont = Just l, .. }
           -> (Just (l, cml_ret_args), cml_ret_off)
        CmmForeignCall { .. }
           -> (Just (succ, 0), updfr) -- ??
        _other -> (Nothing, 0)

-- -----------------------------------------------------------------------------
-- Updating the StackMap from middle nodes

-- Look for loads from stack slots, and update the StackMap.  This is
-- purely for optimisation reasons, so that we can avoid saving a
-- variable back to a different stack slot if it is already on the
-- stack.
--
-- This happens a lot: for example when function arguments are passed
-- on the stack and need to be immediately saved across a call, we
-- want to just leave them where they are on the stack.
--
procMiddle :: BlockEnv StackMap -> CmmNode e x -> StackMap -> StackMap
procMiddle stackmaps node sm
  = case node of
     CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)
       -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) }
        where loc = getStackLoc area off stackmaps
     CmmAssign (CmmLocal r) _other
       -> sm { sm_regs = delFromUFM (sm_regs sm) r }
     _other
       -> sm

getStackLoc :: Area -> ByteOff -> BlockEnv StackMap -> StackLoc
getStackLoc Old       n _         = n
getStackLoc (Young l) n stackmaps =
  case mapLookup l stackmaps of
    Nothing -> pprPanic "getStackLoc" (ppr l)
    Just sm -> sm_sp sm - sm_args sm + n
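-- For example (illustrative numbers only): a slot at offset n = 8 in
-- the young area of a continuation L whose StackMap has sm_sp = 48 and
-- sm_args = 16 lives at StackLoc 48 - 16 + 8 = 40, i.e. 40 bytes
-- from the base.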

-- -----------------------------------------------------------------------------
-- Handling stack allocation for a last node

-- We take a single last node and turn it into:
--
--    C1 (some statements)
--    Sp = Sp + N
--    C2 (some more statements)
--    call f()          -- the actual last node
--
-- plus possibly some more blocks (we may have to add some fixup code
-- between the last node and the continuation).
--
-- C1: is the code for saving the variables across this last node onto
-- the stack, if the continuation is a call or jumps to a proc point.
--
-- C2: if the last node is a safe foreign call, we have to inject some
-- extra code that goes *after* the Sp adjustment.

handleLastNode
   :: ProcPointSet -> BlockEnv CmmLive -> BlockEnv ByteOff
   -> BlockEnv StackMap -> StackMap
   -> Block CmmNode O O
   -> CmmNode O C
   -> UniqSM
      ( [CmmNode O O]      -- nodes to go *before* the Sp adjustment
      , ByteOff            -- amount to adjust Sp
      , CmmNode O C        -- new last node
      , [CmmBlock]         -- new blocks
      , BlockEnv StackMap  -- stackmaps for the continuations
      )

handleLastNode procpoints liveness cont_info stackmaps
               stack0@StackMap { sm_sp = sp0 } middle last
 = case last of
    --  At each return / tail call,
    --  adjust Sp to point to the last argument pushed, which
    --  is cml_args, after popping any other junk from the stack.
    CmmCall{ cml_cont = Nothing, .. } -> do
      let sp_off = sp0 - cml_args
      return ([], sp_off, last, [], mapEmpty)

    --  At each CmmCall with a continuation:
    CmmCall{ cml_cont = Just cont_lbl, .. } ->
       return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off

    CmmForeignCall{ succ = cont_lbl, .. } -> do
       return $ lastCall cont_lbl wORD_SIZE wORD_SIZE (sm_ret_off stack0)
            -- one word each for args and results: the return address

    CmmBranch{..}     -> handleProcPoints
    CmmCondBranch{..} -> handleProcPoints
    CmmSwitch{..}     -> handleProcPoints

  where
     -- Calls and ForeignCalls are handled the same way:
     lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
              -> ( [CmmNode O O]
                 , ByteOff
                 , CmmNode O C
                 , [CmmBlock]
                 , BlockEnv StackMap
                 )
     lastCall lbl cml_args cml_ret_args cml_ret_off
      = ( assignments
        , spOffsetForCall sp0 cont_stack cml_args
        , last
        , [] -- no new blocks
        , mapSingleton lbl cont_stack )
      where
         (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off


     prepareStack lbl cml_ret_args cml_ret_off
       | Just cont_stack <- mapLookup lbl stackmaps
             -- If we have already seen this continuation before, then
             -- we just have to make the stack look the same:
       = (fixupStack stack0 cont_stack, cont_stack)
             -- Otherwise, we have to allocate the stack frame
       | otherwise
       = (save_assignments, new_cont_stack)
       where
        (new_cont_stack, save_assignments)
           = setupStackFrame lbl liveness cml_ret_off cml_ret_args stack0


     -- For other last nodes (branches), if any of the targets is a
     -- proc point, we have to set up the stack to match what the proc
     -- point is expecting.
     --
     handleProcPoints :: UniqSM ( [CmmNode O O]
                                , ByteOff
                                , CmmNode O C
                                , [CmmBlock]
                                , BlockEnv StackMap )

     handleProcPoints
         -- Note [diamond proc point]
       | Just l <- futureContinuation middle
       , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
       = do
         let cont_args = mapFindWithDefault 0 l cont_info
             (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
             out = mapFromList [ (l', cont_stack)
                               | l' <- successors last ]
         return ( assigs
                , spOffsetForCall sp0 cont_stack wORD_SIZE
                , last
                , []
                , out)

       | otherwise = do
         pps <- mapM handleProcPoint (successors last)
         let lbl_map :: LabelMap Label
             lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
             fix_lbl l = mapLookup l lbl_map `orElse` l
         return ( []
                , 0
                , mapSuccessors fix_lbl last
                , concat [ blk | (_,_,_,blk) <- pps ]
                , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )

     -- For each proc point that is a successor of this block
     --   (a) if the proc point already has a stackmap, we need to
     --       shuffle the current stack to make it look the same.
     --       We have to insert a new block to make this happen.
     --   (b) otherwise, call "allocate live stack0" to make the
     --       stack map for the proc point
     handleProcPoint :: BlockId
                     -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
     handleProcPoint l
       | not (l `setMember` procpoints) = return (l, l, stack0, [])
       | otherwise = do
          tmp_lbl <- liftM mkBlockId $ getUniqueM
          let
             (stack2, assigs) =
                  case mapLookup l stackmaps of
                    Just pp_sm -> (pp_sm, fixupStack stack0 pp_sm)
                    Nothing    ->
                      pprTrace "first visit to proc point"
                                   (ppr l <+> ppr stack1) $
                      (stack1, assigs)
                      where
                       cont_args = mapFindWithDefault 0 l cont_info
                       (stack1, assigs) =
                          setupStackFrame l liveness (sm_ret_off stack0)
                                          cont_args stack0

             sp_off = sp0 - sm_sp stack2

             block = blockJoin (CmmEntry tmp_lbl)
                        (maybeAddSpAdj sp_off (blockFromList assigs))
                        (CmmBranch l)
          --
          return (l, tmp_lbl, stack2, [block])


-- Sp is currently pointing to current_sp,
-- we want it to point to
--    (sm_sp cont_stack - sm_args cont_stack + args)
-- so the difference is
--    sp0 - (sm_sp cont_stack - sm_args cont_stack + args)
spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
spOffsetForCall current_sp cont_stack args
  = current_sp - (sm_sp cont_stack - sm_args cont_stack + args)
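-- A worked example (illustrative numbers only): with current_sp = 40,
-- a continuation expecting sm_sp = 56 and sm_args = 16, and args = 16
-- bytes pushed for the call, the offset is
--
--   40 - (56 - 16 + 16) = -16
--
-- so we emit Sp = Sp + (-16), growing the stack by 16 bytes and
-- leaving Sp at byte offset 56 from the base, as the continuation
-- expects.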

-- | create a sequence of assignments to establish the new StackMap,
-- given the old StackMap.
fixupStack :: StackMap -> StackMap -> [CmmNode O O]
fixupStack old_stack new_stack = concatMap move new_locs
 where
     old_map  :: Map LocalReg ByteOff
     old_map  = Map.fromList (stackSlotRegs old_stack)
     new_locs = stackSlotRegs new_stack

     move (r,n)
       | Just m <- Map.lookup r old_map, n == m = []
       | otherwise = [CmmStore (CmmStackSlot Old n)
                               (CmmReg (CmmLocal r))]
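-- For instance (illustrative only): if register x sits at Old slot 16
-- in the old StackMap but the target StackMap wants it at slot 24,
-- 'move' emits
--
--   CmmStore (CmmStackSlot Old 24) (CmmReg (CmmLocal x))
--
-- while a register already in its target slot generates no code.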


setupStackFrame
         :: BlockId                 -- label of continuation
         -> BlockEnv CmmLive        -- liveness
         -> ByteOff                 -- updfr
         -> ByteOff                 -- bytes of return values on stack
         -> StackMap                -- current StackMap
         -> (StackMap, [CmmNode O O])

setupStackFrame lbl liveness updfr_off ret_args stack0
  = (cont_stack, assignments)
  where
      -- get the set of LocalRegs live in the continuation
      live = mapFindWithDefault Set.empty lbl liveness

      -- the stack from the base to updfr_off is off-limits.
      -- our new stack frame contains:
      --   * saved live variables
      --   * the return address [young(C) + 8]
      --   * the args for the call,
      --     which are replaced by the return values at the return
      --     point.

      -- everything up to updfr_off is off-limits
      -- stack1 contains updfr_off, plus everything we need to save
      (stack1, assignments) = allocate updfr_off live stack0

      -- And the Sp at the continuation is:
      --   sm_sp stack1 + ret_args
      cont_stack = stack1{ sm_sp      = sm_sp stack1 + ret_args
                         , sm_args    = ret_args
                         , sm_ret_off = updfr_off
                         }
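-- Concretely (illustrative numbers only): if allocate returns stack1
-- with sm_sp = 32 and the call pushes ret_args = 16 bytes (return
-- address plus one word of return value), the continuation's StackMap
-- gets sm_sp = 48 and sm_args = 16, so its young area covers byte
-- offsets 32..48 from the base.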


-- -----------------------------------------------------------------------------
-- Note [diamond proc point]
--
-- This special case looks for the pattern we get from a typical
-- tagged case expression:
--
--    Sp[young(L1)] = L1
--    if (R1 & 7) != 0 goto L1 else goto L2
--  L2:
--    call [R1] returns to L1
--  L1: live: {y}
--    x = R1
--
-- If we let the generic case handle this, we get
--
--    Sp[-16] = L1
--    if (R1 & 7) != 0 goto L1a else goto L2
--  L2:
--    Sp[-8] = y
--    Sp = Sp - 16
--    call [R1] returns to L1
--  L1a:
--    Sp[-8] = y
--    Sp = Sp - 16
--    goto L1
--  L1:
--    x = R1
--
-- The code for saving the live vars is duplicated in each branch, and
-- furthermore there is an extra jump in the fast path (assuming L1 is
-- a proc point, which it probably is if there is a heap check).
--
-- So to fix this we want to set up the stack frame before the
-- conditional jump.  How do we know when to do this, and when it is
-- safe?  The basic idea is, when we see the assignment
--
--   Sp[young(L)] = L
--
-- we know that
--   * we are definitely heading for L
--   * there can be no more reads from another stack area, because young(L)
--     overlaps with it.
--
-- We don't necessarily know that everything live at L is live now
-- (some might be assigned between here and the jump to L).  So we
-- simplify and only do the optimisation when we see
--
--   (1) a block containing an assignment of a return address L
--   (2) ending in a branch where one (and only one) continuation goes to L,
--       and no other continuations go to proc points.
--
-- then we allocate the stack frame for L at the end of the block,
-- before the branch.
--
-- We could generalise (2), but that would make it a bit more
-- complicated to handle, and this currently catches the common case.

futureContinuation :: Block CmmNode O O -> Maybe BlockId
futureContinuation middle = foldBlockNodesB f middle Nothing
   where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
         f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _
               = Just l
         f _ r = r

-- -----------------------------------------------------------------------------
-- Saving live registers

-- | Given a set of live registers and a StackMap, save all the registers
-- on the stack and return the new StackMap and the assignments to do
-- the saving.
--
allocate :: ByteOff -> RegSet -> StackMap -> (StackMap, [CmmNode O O])
allocate ret_off live stackmap@StackMap{ sm_sp   = sp0
                                       , sm_regs = regs0 }
 =
   pprTrace "allocate" (ppr live $$ ppr stackmap) $

   -- we only have to save regs that are not already in a slot
   let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
       regs1   = filterUFM (\(r,_) -> elemRegSet r live) regs0
   in

   -- make a map of the stack
   let stack = reverse $ Array.elems $
               accumArray (\_ x -> x) Empty (1, toWords (max sp0 ret_off)) $
                 ret_words ++ live_words
            where ret_words =
                   [ (x, Occupied)
                   | x <- [ 1 .. toWords ret_off] ]
                  live_words =
                   [ (toWords x, Occupied)
                   | (r,off) <- eltsUFM regs1,
                     let w = localRegBytes r,
                     x <- [ off, off-wORD_SIZE .. off - w + 1] ]
   in

   -- Pass over the stack: find slots to save all the new live variables,
   -- choosing the oldest slots first (hence a foldr).
   let
       save slot ([], stack, n, assigs, regs) -- no more regs to save
          = ([], slot:stack, n `plusW` 1, assigs, regs)
       save slot (to_save, stack, n, assigs, regs)
          = case slot of
               Occupied -> (to_save, Occupied:stack, n `plusW` 1, assigs, regs)
               Empty
                 | Just (stack', r, to_save') <-
                       select_save to_save (slot:stack)
                 -> let assig = CmmStore (CmmStackSlot Old n')
                                         (CmmReg (CmmLocal r))
                        n' = n `plusW` 1
                    in
                        (to_save', stack', n', assig : assigs, (r,(r,n')):regs)

                 | otherwise
                 -> (to_save, slot:stack, n `plusW` 1, assigs, regs)

       -- we should do better here: right now we'll fit the smallest first,
       -- but it would make more sense to fit the biggest first.
       select_save :: [LocalReg] -> [StackSlot]
                   -> Maybe ([StackSlot], LocalReg, [LocalReg])
       select_save regs stack = go regs []
         where go []     _no_fit = Nothing
               go (r:rs) no_fit
                 | Just rest <- dropEmpty words stack
                 = Just (replicate words Occupied ++ rest, r, rs++no_fit)
                 | otherwise
                 = go rs (r:no_fit)
                 where words = localRegWords r

       -- fill in empty slots as much as possible
       (still_to_save, save_stack, n, save_assigs, save_regs)
          = foldr save (to_save, [], 0, [], []) stack

       -- push any remaining live vars on the stack
       (push_sp, push_assigs, push_regs)
          = foldr push (n, [], []) still_to_save
          where
              push r (n, assigs, regs)
                = (n', assig : assigs, (r,(r,n')) : regs)
                where
                  n' = n + localRegBytes r
                  assig = CmmStore (CmmStackSlot Old n')
                                   (CmmReg (CmmLocal r))

       trim_sp
          | not (null push_regs) = push_sp
          | otherwise
          = n `plusW` (- length (takeWhile isEmpty save_stack))

       final_regs = regs1 `addListToUFM` push_regs
                          `addListToUFM` save_regs

   in
   -- XXX should be an assert
   if ( n /= max sp0 ret_off )
        then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off)
        else

   if (trim_sp .&. (wORD_SIZE - 1)) /= 0
        then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp)
        else

   ( stackmap { sm_regs = final_regs, sm_sp = trim_sp }
   , push_assigs ++ save_assigs )

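-- A worked example (illustrative numbers only, 64-bit target): suppose
-- ret_off = 8 and sp0 = 24, with the slot at byte offset 8 Occupied
-- (return address area), offset 16 Empty, and offset 24 Occupied by an
-- already-saved variable.  Saving one new one-word variable r fills
-- the Empty slot, emitting
--
--   CmmStore (CmmStackSlot Old 16) (CmmReg (CmmLocal r))
--
-- with no stack growth; had no Empty slot fitted, r would instead have
-- been pushed at offset sp0 + wORD_SIZE by the 'push' pass.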

-- -----------------------------------------------------------------------------
-- Manifesting Sp

-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp.  The
-- block looks like this:
--
--    middle_pre       -- the middle nodes
--    Sp = Sp + sp_off -- Sp adjustment goes here
--    last             -- the last node
--
-- And we have some extra blocks too (that don't contain Sp adjustments)
--
-- The adjustment for middle_pre will be different from that for
-- middle_post, because the Sp adjustment intervenes.
--
manifestSp
   :: BlockEnv StackMap  -- StackMaps for other blocks
   -> StackMap           -- StackMap for this block
   -> ByteOff            -- Sp on entry to the block
   -> ByteOff            -- SpHigh
   -> CmmNode C O        -- first node
   -> [CmmNode O O]      -- middle
   -> ByteOff            -- sp_off
   -> CmmNode O C        -- last node
   -> [CmmBlock]         -- new blocks
   -> [CmmBlock]         -- final blocks with Sp manifest

manifestSp stackmaps stack0 sp0 sp_high
           first middle_pre sp_off last fixup_blocks
  = final_block : fixup_blocks'
  where
    area_off = getAreaOff stackmaps

    adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
    adj_pre_sp  = mapExpDeep (areaToSp sp0            sp_high area_off)
    adj_post_sp = mapExpDeep (areaToSp (sp0 - sp_off) sp_high area_off)

    final_middle = maybeAddSpAdj sp_off $
                   blockFromList $
                   map adj_pre_sp $
                   elimStackStores stack0 stackmaps area_off $
                   middle_pre

    final_last    = optStackCheck (adj_post_sp last)

    final_block   = blockJoin first final_middle final_last

    fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks


getAreaOff :: BlockEnv StackMap -> (Area -> StackLoc)
getAreaOff _ Old = 0
getAreaOff stackmaps (Young l) =
  case mapLookup l stackmaps of
    Just sm -> sm_sp sm - sm_args sm
    Nothing -> pprPanic "getAreaOff" (ppr l)


maybeAddSpAdj :: ByteOff -> Block CmmNode O O -> Block CmmNode O O
maybeAddSpAdj 0      block = block
maybeAddSpAdj sp_off block
   = block `blockSnoc` CmmAssign spReg (cmmOffset (CmmReg spReg) sp_off)


{-
Sp(L) is the Sp offset on entry to block L relative to the base of the
OLD area.

SpArgs(L) is the size of the young area for L, i.e. the number of
arguments.

 - in block L, each reference to [old + N] turns into
   [Sp + Sp(L) - N]

 - in block L, each reference to [young(L') + N] turns into
   [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]

 - be careful with the last node of each block: Sp has already been adjusted
   to be Sp + Sp(L) - Sp(L')
-}
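-- For example (illustrative numbers only): in a block with Sp(L) = 40,
-- a reference to [old + 8] becomes [Sp + 40 - 8] = [Sp + 32]; and with
-- Sp(L') = 56 and SpArgs(L') = 16, a reference to [young(L') + 8]
-- becomes [Sp + 40 - 56 + 16 - 8] = [Sp - 8].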

areaToSp :: ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr
areaToSp sp_old _sp_hwm area_off (CmmStackSlot area n) =
  cmmOffset (CmmReg spReg) (sp_old - area_off area - n)
areaToSp _ sp_hwm _ (CmmLit CmmHighStackMark) = CmmLit (mkIntCLit sp_hwm)
areaToSp _ _ _ (CmmMachOp (MO_U_Lt _)          -- Note [null stack check]
                   [CmmMachOp (MO_Sub _)
                       [ CmmReg (CmmGlobal Sp)
                       , CmmLit (CmmInt 0 _)],
                    CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
areaToSp _ _ _ other = other

-- -----------------------------------------------------------------------------
-- Note [null stack check]
--
-- If the high-water Sp is zero, then we end up with
--
--   if (Sp - 0 < SpLim) then .. else ..
--
-- and possibly some dead code for the failure case.  Optimising this
-- away depends on knowing that SpLim <= Sp, so it is really the job
-- of the stack layout algorithm, hence we do it now.  This is also
-- convenient because control-flow optimisation later will drop the
-- dead code.

optStackCheck :: CmmNode O C -> CmmNode O C
optStackCheck n = -- Note [null stack check]
 case n of
   CmmCondBranch (CmmLit (CmmInt 0 _)) _true false -> CmmBranch false
   other -> other


-- -----------------------------------------------------------------------------

-- | Eliminate stores of the form
--
--    Sp[area+n] = r
--
-- when we know that r is already in the same slot as Sp[area+n].  We
-- could do this in a later optimisation pass, but that would involve
-- a separate analysis and we already have the information to hand
-- here.  It helps clean up some extra stack stores in common cases.
--
-- Note that we may have to modify the StackMap as we walk through the
-- code using procMiddle, since an assignment to a variable in the
-- StackMap will invalidate its mapping there.
--
elimStackStores :: StackMap
                -> BlockEnv StackMap
                -> (Area -> ByteOff)
                -> [CmmNode O O]
                -> [CmmNode O O]
elimStackStores stackmap stackmaps area_off nodes
  = go stackmap nodes
  where
    go _stackmap [] = []
    go stackmap (n:ns)
     = case n of
         CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r))
            | Just (_,off) <- lookupUFM (sm_regs stackmap) r
            , area_off area + m == off
            -> pprTrace "eliminated a node!" (ppr r) $ go stackmap ns
         _otherwise
            -> n : go (procMiddle stackmaps n stackmap) ns
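-- For example (illustrative only): if the StackMap already records r
-- at StackLoc 24, a store Sp[old + 24] = r is dropped, because the
-- value of r is known to be in that slot already; any other node is
-- kept, and the StackMap is threaded through procMiddle as we go.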


-- -----------------------------------------------------------------------------
-- Update info tables to include stack liveness


setInfoTableStackMap :: BlockEnv StackMap -> CmmDecl -> CmmDecl
setInfoTableStackMap stackmaps
     (CmmProc top_info@TopInfo{..} l g@CmmGraph{g_entry = eid})
  = CmmProc top_info{ info_tbl = fix_info info_tbl } l g
  where
    fix_info info_tbl@CmmInfoTable{ cit_rep = StackRep _ } =
       info_tbl { cit_rep = StackRep (get_liveness eid) }
    fix_info other = other

    get_liveness :: BlockId -> Liveness
    get_liveness lbl
      = case mapLookup lbl stackmaps of
          Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl)
          Just sm -> stackMapToLiveness sm

setInfoTableStackMap _ d = d


stackMapToLiveness :: StackMap -> Liveness
stackMapToLiveness StackMap{..} =
   reverse $ Array.elems $
        accumArray (\_ x -> x) True (toWords sm_ret_off + 1,
                                     toWords (sm_sp - sm_args)) live_words
   where
     live_words = [ (toWords off, False)
                  | (r,off) <- eltsUFM sm_regs, isGcPtrType (localRegType r) ]
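-- A worked example (illustrative numbers only, 64-bit target): with
-- sm_ret_off = 8, sm_sp = 40 and sm_args = 8 the bitmap covers word
-- offsets 2..4 (bytes 16..32).  Each word defaults to True (dead or
-- non-pointer); a GC-pointer register saved at byte offset 16 flips
-- word 2 to False, so the elems are [False,True,True] and the final
-- reverse yields [True,True,False].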


-- -----------------------------------------------------------------------------
-- Lowering safe foreign calls

{-
Note [lower safe foreign calls]

We start with

   Sp[young(L1)] = L1
 ,-----------------------
 | r1 = foo(x,y,z) returns to L1
 '-----------------------
 L1:
   R1 = r1 -- copyIn, inserted by mkSafeCall
   ...

the stack layout algorithm will arrange to save and reload everything
live across the call.  Our job now is to expand the call so we get

   Sp[young(L1)] = L1
 ,-----------------------
 | SAVE_THREAD_STATE()
 | token = suspendThread(BaseReg, interruptible)
 | r = foo(x,y,z)
 | BaseReg = resumeThread(token)
 | LOAD_THREAD_STATE()
 | R1 = r  -- copyOut
 | jump L1
 '-----------------------
 L1:
   r = R1 -- copyIn, inserted by mkSafeCall
   ...

Note the copyOut, which saves the results in the places that L1 is
expecting them (see Note [safe foreign call convention]).
-}

lowerSafeForeignCall :: CmmBlock -> UniqSM CmmBlock
lowerSafeForeignCall block
  | (entry, middle, CmmForeignCall { .. }) <- blockSplit block
  = do
    -- Both 'id' and 'new_base' are KindNonPtr because they're
    -- RTS-only objects and are not subject to garbage collection
    id <- newTemp bWord
    new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
    let (caller_save, caller_load) = callerSaveVolatileRegs
    load_tso <- newTemp gcWord
    load_stack <- newTemp gcWord
    let suspend = saveThreadState <*>
                  caller_save <*>
                  mkMiddle (callSuspendThread id intrbl)
        midCall = mkUnsafeCall tgt res args
        resume  = mkMiddle (callResumeThread new_base id) <*>
                  -- Assign the result to BaseReg: we
                  -- might now have a different Capability!
                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*>
                  caller_load <*>
                  loadThreadState load_tso load_stack
        -- Note: The successor must be a procpoint, and we have already split,
        -- so we use a jump, not a branch.
        succLbl = CmmLit (CmmLabel (infoTblLbl succ))

        (ret_args, regs, copyout) = copyOutOflow NativeReturn Jump (Young succ)
                                          (map (CmmReg . CmmLocal) res)
                                          updfr (0, [])

        jump = CmmCall { cml_target    = succLbl
                       , cml_cont      = Just succ
                       , cml_args_regs = regs
                       , cml_args      = widthInBytes wordWidth
                       , cml_ret_args  = ret_args
                       , cml_ret_off   = updfr }

    graph' <- lgraphOfAGraph $ suspend <*>
                               midCall <*>
                               resume  <*>
                               copyout <*>
                               mkLast jump

    case toBlockList graph' of
      [one] -> let (_, middle', last) = blockSplit one
               in return (blockJoin entry (middle `blockAppend` middle') last)
      _ -> panic "lowerSafeForeignCall0"

  -- Block doesn't end in a safe foreign call:
  | otherwise = return block


foreignLbl :: FastString -> CmmExpr
foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId name))

newTemp :: CmmType -> UniqSM LocalReg
newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)

callSuspendThread :: LocalReg -> Bool -> CmmNode O O
callSuspendThread id intrbl =
  CmmUnsafeForeignCall
       (ForeignTarget (foreignLbl (fsLit "suspendThread"))
             (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
       [id] [CmmReg (CmmGlobal BaseReg), CmmLit (mkIntCLit (fromEnum intrbl))]

callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
callResumeThread new_base id =
  CmmUnsafeForeignCall
       (ForeignTarget (foreignLbl (fsLit "resumeThread"))
             (ForeignConvention CCallConv [AddrHint] [AddrHint]))
       [new_base] [CmmReg (CmmLocal id)]

-- -----------------------------------------------------------------------------

plusW :: ByteOff -> WordOff -> ByteOff
plusW b w = b + w * wORD_SIZE

dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
dropEmpty 0 ss           = Just ss
dropEmpty n (Empty : ss) = dropEmpty (n-1) ss
dropEmpty _ _            = Nothing

isEmpty :: StackSlot -> Bool
isEmpty Empty = True
isEmpty _     = False

localRegBytes :: LocalReg -> ByteOff
localRegBytes r = roundUpToWords (widthInBytes (typeWidth (localRegType r)))

localRegWords :: LocalReg -> WordOff
localRegWords = toWords . localRegBytes

toWords :: ByteOff -> WordOff
toWords x = x `quot` wORD_SIZE


insertReloads :: StackMap -> [CmmNode O O]
insertReloads stackmap =
   [ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot Old sp)
                                     (localRegType r))
   | (r,sp) <- stackSlotRegs stackmap
   ]


stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
stackSlotRegs sm = eltsUFM (sm_regs sm)

-- -----------------------------------------------------------------------------

-- If we do this *before* stack layout, we might be able to avoid
-- saving some things across calls/procpoints.
--
-- *but*, that will invalidate the liveness analysis, and we'll have
-- to re-do it.

cmmSink :: CmmGraph -> UniqSM CmmGraph
cmmSink graph = do
  let liveness = cmmLiveness graph
  return $ cmmSink' liveness graph

cmmSink' :: BlockEnv CmmLive -> CmmGraph -> CmmGraph
cmmSink' liveness graph
  = ofBlockList (g_entry graph) $ sink mapEmpty $ postorderDfs graph
  where

  sink :: BlockEnv [(LocalReg, CmmExpr)] -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    pprTrace "sink" (ppr l) $
    blockJoin first final_middle last : sink sunk' bs
    where
      l = entryLabel b
      (first, middle, last) = blockSplit b
      (middle', assigs) = walk (blockToList middle) emptyBlock
                               (mapFindWithDefault [] l sunk)

      (dropped_last, assigs') = partition (`conflictsWithLast` last) assigs

      final_middle = foldl blockSnoc middle' (toNodes dropped_last)

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filt assigs' (getLive l))
                             | l <- successors last ]
           where
              getLive l = mapFindWithDefault Set.empty l liveness
              filt as live = [ (r,e) | (r,e) <- as, r `Set.member` live ]


  walk :: [CmmNode O O] -> Block CmmNode O O -> [(LocalReg, CmmExpr)]
       -> (Block CmmNode O O, [(LocalReg, CmmExpr)])

  walk []     acc as = (acc, as)
  walk (n:ns) acc as
    | Just a <- collect_it = walk ns acc (a:as)
    | otherwise            = walk ns (foldr (flip blockSnoc) acc (n:drop_nodes)) as'
    where
      collect_it = case n of
                     CmmAssign (CmmLocal r) e@(CmmReg (CmmGlobal _)) -> Just (r,e)
--                     CmmAssign (CmmLocal r) e@(CmmLoad addr _) |
--                       foldRegsUsed (\b r -> False) True addr -> Just (r,e)
                     _ -> Nothing

      drop_nodes = toNodes dropped
      (dropped, as') = partition should_drop as
         where should_drop a = a `conflicts` n

toNodes :: [(LocalReg,CmmExpr)] -> [CmmNode O O]
toNodes as = [ CmmAssign (CmmLocal r) rhs | (r,rhs) <- as ]

-- We only sink "r = G" assignments right now, so conflicts is very simple:
conflicts :: (LocalReg,CmmExpr) -> CmmNode O O -> Bool
(_, rhs) `conflicts` CmmAssign reg _ | reg `regUsedIn` rhs = True
--(r, CmmLoad _ _)  `conflicts` CmmStore _ _  = True
(r, _)   `conflicts` node
  = foldRegsUsed (\b r' -> r == r' || b) False node
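-- For example (illustrative only): a pending sunk assignment x = R1
-- conflicts with a later R1 = e (the global register in its RHS is
-- redefined) and with any node that reads x, so 'walk' re-emits the
-- assignment just before the conflicting node instead of sinking it
-- further.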

conflictsWithLast :: (LocalReg,CmmExpr) -> CmmNode O C -> Bool
(r, _) `conflictsWithLast` node
  = foldRegsUsed (\b r' -> r == r' || b) False node