{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import GhcPrelude

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl.Block
import Hoopl.Label
import Hoopl.Collections
import Hoopl.Graph
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import Unique
import UniqFM
import PprCmm ()

import qualified Data.IntSet as IntSet
import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- Compact sets for membership tests of local variables.

type LRegSet = IntSet.IntSet

emptyLRegSet :: LRegSet
emptyLRegSet = IntSet.empty

nullLRegSet :: LRegSet -> Bool
nullLRegSet = IntSet.null

insertLRegSet :: LocalReg -> LRegSet -> LRegSet
insertLRegSet l = IntSet.insert (getKey (getUnique l))

elemLRegSet :: LocalReg -> LRegSet -> Bool
elemLRegSet l = IntSet.member (getKey (getUnique l))
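
-- A hedged usage sketch: an LRegSet is keyed by each LocalReg's Unique,
-- so membership tests are cheap Int comparisons rather than Ord-based
-- Set lookups.  For hypothetical registers r1 and r2 with distinct
-- Uniques:
--
--   let s = insertLRegSet r1 emptyLRegSet
--   in (r1 `elemLRegSet` s, r2 `elemLRegSet` s)   -- (True, False)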

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (its RHS is a local register),
--        then inline the assignment and keep it in A in case it is used
--        afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join-point),
--        drop everything live in that successor.  Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point.  Thus analysis of a
--        join-point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.

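-- A hedged illustration of the intended effect on the copyIn sequence
-- above: the assignments are live only along the fallthrough (no stack
-- overflow) path, so after sinking we get roughly
--
--    if (Sp - 32 < SpLim) then L1 else L2
--    ...
--  L2:
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--
-- and the L1 (stack-check failure) path no longer executes the copyIn.
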
-- -----------------------------------------------------------------------------
-- Things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order:
  -- the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = postorderDfs graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
          case filter (Set.member r) live_sets of
            (_one:_two:_) -> True
            _             -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

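-- A hedged sketch of how this pass is driven: cmmSink is a pure
-- CmmGraph -> CmmGraph transformation, invoked from the Cmm pipeline
-- behind the Opt_CmmSink flag (-fcmm-sink), roughly:
--
--   sinkPass :: DynFlags -> CmmGraph -> CmmGraph
--   sinkPass dflags g
--     | gopt Opt_CmmSink dflags = cmmSink dflags g
--     | otherwise               = g
--
-- (sinkPass is illustrative, not a function defined in this module.)
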
{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False

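-- A hedged illustration: under this definition CmmReg (CmmLocal r) and
-- CmmLit (CmmInt 0 W64) are trivial and may be duplicated freely, while
-- a CmmLoad is never trivial, since duplicating it would duplicate the
-- memory access.
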
--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: LabelMap Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs

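-- A hedged example: in the diamond CFG from section (2) above, where
-- blocks A and B both jump to a common block J, the label J occurs twice
-- in all_succs, so succ_counts maps J to 2 and findJoinPoints reports J
-- as a join point (two predecessors).
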
--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

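-- A hedged example: given assigs [ x = y + 1, y = e ] (reverse order)
-- and live = { x }, we keep x because it is live, and then keep y as
-- well: the kept node x = y + 1 mentions y, so 'conflicts' flags it as
-- needed even though y itself is not live in the continuation.
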
-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in a:
--    * list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped

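-- A hedged example of the transitive drop in dropAssignments: if the
-- most recent assignment y = x + 1 is dropped, then x = e must be
-- dropped too, because toNode (y = x + 1) uses x (rule (2) of
-- 'conflicts'), and the definition of x has to stay in front of that
-- use.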

-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline        = dont_inline
   | occurs_none          = discard  -- Note [discard during inlining]
   | occurs_once          = inline_and_discard
   | isTrivial dflags rhs = inline_and_keep
   | otherwise            = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = case mapExpDeep inl_exp node of
                     -- See Note [Improving conditionals]
                     CmmCondBranch (CmmMachOp (MO_Ne w) args)
                                   ti fi l
                       -> CmmCondBranch (cmmMachOpFold dflags (MO_Eq w) args)
                                        fi ti l
                     node' -> node'

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
        -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other

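-- A hedged example of the usage counting above: for a node R1 = x + x,
-- usages maps x to 2, so an assignment x = e fails occurs_once (inlining
-- would duplicate e); it can still be inlined via inline_and_keep when e
-- is trivial.
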
{- Note [Improving conditionals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Given
  CmmCondBranch ((a >## b) != 1) t f
where a and b are Floats, the constant folder /cannot/ turn it into
  CmmCondBranch (a <=## b) t f
because comparisons on floats are not invertible
(see CmmMachOp.maybeInvertComparison).

What we want instead is simply to reverse the true/false branches thus
  CmmCondBranch ((a >## b) != 1) t f
-->
  CmmCondBranch (a >## b) f t

And we do that right here in tryToInline, just as we do cmmMachOpFold.
-}

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

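-- A hedged example: with a pending assignment x = R1 and a node
-- (res) = call "ccall" f(x), inlining would place R1 in the call's
-- argument expressions; since the unsafe foreign call is considered to
-- clobber caller-save global registers, globalRegistersConflict fires
-- and okToInline refuses.
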
-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                    = True
  | localRegistersConflict  dflags rhs node                    = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node     = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node       = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node       = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node       = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem   = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                = True

  -- (7) otherwise, no conflict
  | otherwise = False

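-- A hedged example for rule (3): the assignment (x, I64[Sp + 8], SpMem 8 8)
-- conflicts with the node I64[Sp + 8] = e, because loadAddr maps the
-- store's address to SpMem 8 8 and memConflicts sees the overlap; it can
-- commute past I64[Sp + 16] = e, whose SpMem 16 8 range is disjoint.
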
-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]), which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember that we are not sinking
--      assignments to the stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check any assignment conflicts with CmmCall. Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..
-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--    x = Hp + n
--    [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.

bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _         _       = True

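-- A hedged illustration of the SpMem overlap test: SpMem 8 8 covers the
-- byte range [8,16) and SpMem 16 8 covers [16,24), so
--
--   memConflicts (SpMem 8 8)  (SpMem 16 8)  == False  -- disjoint
--   memConflicts (SpMem 8 16) (SpMem 16 8)  == True   -- [8,24) overlaps
--
-- while heap and stack memory never conflict with each other, and AnyMem
-- conflicts with everything except NoMem.
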
exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
   CmmReg r      -> regAddr dflags r 0 w
   CmmRegOff r i -> regAddr dflags r i w
   _other | regUsedIn dflags (CmmGlobal Sp) e -> StackMem
          | otherwise -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

Because _s3DB is now live across the R1 assignment, we lose the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                        10.0 :: W32));
-}