{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import GhcPrelude

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl.Block
import Hoopl.Label
import Hoopl.Collections
import Hoopl.Graph
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import Unique
import UniqFM
import PprCmm ()

import qualified Data.IntSet as IntSet
import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- Compact sets for membership tests of local variables.

type LRegSet = IntSet.IntSet

emptyLRegSet :: LRegSet
emptyLRegSet = IntSet.empty

nullLRegSet :: LRegSet -> Bool
nullLRegSet = IntSet.null

insertLRegSet :: LocalReg -> LRegSet -> LRegSet
insertLRegSet l = IntSet.insert (getKey (getUnique l))

elemLRegSet :: LocalReg -> LRegSet -> Bool
elemLRegSet l = IntSet.member (getKey (getUnique l))
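-- A quick sanity sketch of how these helpers compose (illustrative only):
--
--   elemLRegSet r (insertLRegSet r emptyLRegSet)  ==  True
--   elemLRegSet r emptyLRegSet                    ==  False
--
-- i.e. an LRegSet is just an IntSet keyed on each LocalReg's Unique.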

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, looking at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If an assignment in A is to a register that (1) occurs exactly
--        once in N, and (2) is not live after N, inline the assignment
--        and remove it from A.
--
--      * If an assignment in A is cheap (its RHS is a local register),
--        inline the assignment and keep it in A in case it is used
--        afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join-point),
--        drop everything live in that successor.  Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point.  Thus analysis of a
--        join-point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.
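--
-- A tiny worked example of the walk (illustrative; register names are
-- made up).  Given this block:
--
--    x = R1       -- pick up:  A = [x = R1]
--    y = x + 1    -- x occurs once and is not live after: inline and
--                 -- discard, then pick up:  A = [y = R1 + 1]
--    Sp[8] = z    -- no conflict: y's RHS reads no memory and R1 is
--                 -- not written here, so A is carried past the store
--    R1 = y       -- y occurs once and is not live after: inline,
--                 -- giving R1 = R1 + 1, with A = []
--
-- the final block is just   Sp[8] = z; R1 = R1 + 1.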

-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _          = False
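-- For example (illustrative):
--
--   isTrivial dflags (CmmReg (CmmLocal r))    == True
--   isTrivial dflags (CmmLit (CmmInt 42 W64)) == True
--   isTrivial dflags (CmmLoad addr ty)        == False  -- memory reads
--                                                       -- are never trivial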

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)
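-- For example (a sketch; gen_kill adds a node's uses to the live set and
-- removes its definitions):
--
--   annotate dflags {z} [ x = y + 1, z = x * 2 ]
--     == [ ({x}, x = y + 1)    -- x is live after the first node
--        , ({z}, z = x * 2) ]  -- only z is live after the second
--
-- Each node is paired with the registers live *after* it, which is what
-- tryToInline needs when deciding whether an assignment is still needed.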

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: LabelMap Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
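-- For example (illustrative): in the diamond from item (2) above, the
-- block containing "use of x" appears as a successor of both A and B,
-- so its count is 2 and mapFilter (>1) keeps it as a join point.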

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in:
--   * the list of nodes in the block
--   * a list of assignments that appeared *before* this block and
--     that are being sunk.
--
-- On output we get:
--   * a new block
--   * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl' blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped
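-- Note the 'any (conflicts dflags assig) dropped' check: dropping one
-- assignment can force dropping its dependencies too.  For example
-- (illustrative), with the reverse-ordered list [ x = y + 1, y = e ]:
-- if x = y + 1 is dropped, then y = e must also be dropped (and placed
-- before it), because the emitted node x = y + 1 reads y.  This is the
-- "do this recursively, dropping dependent assignments" step from the
-- algorithm description at the top of this module.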


-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline        = dont_inline
   | occurs_none          = discard  -- Note [discard during inlining]
   | occurs_once          = inline_and_discard
   | isTrivial dflags rhs = inline_and_keep
   | otherwise            = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the
                -- RHS of a binding that we have already skipped, so we
                -- set the usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
                    -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other
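-- A small worked example (illustrative register names): with live = {}
-- and assigs = [ (x, y + 1, NoMem) ], inlining into the node
--
--    R1 = x
--
-- finds that x occurs exactly once in the node and is not live after
-- it, so the occurs_once case fires and the result is the node
-- R1 = y + 1 with the assignment discarded from the list.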


{- Note [improveConditional]

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaNs).

But we *can* optimise this conditional by swapping the true and false
branches.  Given
  CmmCondBranch ((a >## b) != 1) t f
we can turn it into
  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply the above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps (see swapcond_last in CmmContFlowOpt),
so instead we carefully look for just the cases where we can eliminate
a comparison.
-}
improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _)   = True
    neLike (MO_U_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
    neLike (MO_S_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
    neLike _ = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers, really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node          = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
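-- For example (illustrative): an assignment whose cached AbsMem is
-- SpMem 8 8, say x = I64[Sp + 8], can be commuted past an 8-byte store
-- to Sp + 16 (the byte ranges do not overlap), but not past a store to
-- Sp + 8 (case 3), an assignment to Sp itself (case 4), or any call
-- (cases 5 and 6).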

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]) which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember that we are not sinking
--      assignments to stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check any assignment conflicts with CmmCall. Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--
-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..
--
-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.


bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _ _ = True
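-- A few concrete cases (illustrative; SpMem o w covers the byte range
-- [Sp+o, Sp+o+w)):
--
--   memConflicts (SpMem 8 8)  (SpMem 16 8) == False  -- [Sp+8..16) vs [Sp+16..24)
--   memConflicts (SpMem 8 16) (SpMem 16 8) == True   -- first range reaches Sp+24
--   memConflicts HeapMem      StackMem     == False  -- heap and stack never alias
--   memConflicts AnyMem       HeapMem      == True   -- conservative catch-all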

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
   CmmReg r      -> regAddr dflags r 0 w
   CmmRegOff r i -> regAddr dflags r i w
   _other | regUsedIn dflags spReg e -> StackMem
          | otherwise                -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _ (CmmGlobal Hp) _ _ = HeapMem
regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _ _ _ _ = AnyMem
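-- So, for example (illustrative):
--
--   loadAddr dflags (CmmRegOff spReg 8) W64     == SpMem 8 8
--   loadAddr dflags (CmmReg (CmmGlobal Hp)) W64 == HeapMem
--
-- and a load from an unknown address expression is AnyMem, unless the
-- expression mentions Sp, in which case it is StackMem.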

{-
Note [Inline GlobalRegs?]

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

Because _s3DB is now live across the R1 assignment, we lose the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                     10.0 :: W32));
-}