{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import GhcPrelude

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl.Block
import Hoopl.Label
import Hoopl.Collections
import Hoopl.Graph
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import Unique
import UniqFM
import PprCmm ()

import qualified Data.IntSet as IntSet
import Data.List (foldl', partition)
import qualified Data.Set as Set
import Data.Maybe

-- Compact sets for membership tests of local variables.

type LRegSet = IntSet.IntSet

emptyLRegSet :: LRegSet
emptyLRegSet = IntSet.empty

nullLRegSet :: LRegSet -> Bool
nullLRegSet = IntSet.null

insertLRegSet :: LocalReg -> LRegSet -> LRegSet
insertLRegSet l = IntSet.insert (getKey (getUnique l))

elemLRegSet :: LocalReg -> LRegSet -> Bool
elemLRegSet l = IntSet.member (getKey (getUnique l))

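-- Illustrative only: an LRegSet keys each LocalReg by the Int key of its
-- Unique, so membership tests are plain IntSet lookups rather than
-- Ord-based Set operations on whole LocalRegs.  For hypothetical distinct
-- registers r1 and r2:
--
--   r1 `elemLRegSet` insertLRegSet r1 emptyLRegSet   == True
--   r2 `elemLRegSet` insertLRegSet r1 emptyLRegSet   == False
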
-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join point),
--        drop everything live in that successor.  Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point.  Thus analysis of a
--        join point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.

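-- To illustrate (informally) how A evolves on the copyIn example above:
--
--    x1 = R1                -- picked up; A = [x1=R1]
--    x2 = Sp[8]             -- picked up; A = [x2=Sp[8], x1=R1]
--    x3 = Sp[16]            -- picked up; A = [x3=Sp[16], x2=Sp[8], x1=R1]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- At the exit node, x1..x3 are live only in the L2 branch, so instead of
-- being dropped here all three assignments are propagated into L2
-- (assuming L2 is not a join point).
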
-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--      x = ...
--        |
--       / \
--      A   B
--       \ /
--        |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--
--
-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
        case filter (Set.member r) live_sets of
          (_one:_two:_) -> True
          _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
        where
          should_drop = conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

          live_sets' | should_drop = live_sets
                     | otherwise   = map upd live_sets

          upd set | r `Set.member` set = set `Set.union` live_rhs
                  | otherwise          = set

          live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                            | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False

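-- Illustrative only: a local register such as _x::I64, a literal such as
-- 42 :: W64, and (on platforms where it is pinned to a real register) a
-- global such as R1 are all trivial and may be freely duplicated, whereas
-- a memory load such as I64[Sp + 8] never is.
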
--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)

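-- Illustrative only: for the node list [x = y, z = x] with {z} live at the
-- end of the block, annotate yields
--
--    [ ({x}, x = y), ({z}, z = x) ]
--
-- pairing each node with the registers live just *after* it: x is live
-- after the first node because the second still reads it, and only z
-- survives the second.
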
--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: LabelMap Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs

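-- Illustrative only: in the diamond example (2) near the top of this file,
-- the block containing "use of x" is a successor of both A and B, so
-- findJoinPoints maps its label to 2 and sinking will not propagate
-- assignments into it.
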
--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
          where
            needed = r `Set.member` live
                     || any (conflicts dflags a) (map toNode kept)
                     -- Note that we must keep assignments that are
                     -- referred to by other assignments we have
                     -- already kept.

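-- Illustrative only: if the assignment list (in reverse order) is
-- [ y = x + 1, x = z ] and only y is live in the continuation, then y is
-- kept because it is live, and x is kept as well, despite being dead in
-- the continuation, because the already-kept assignment to y mentions x
-- (the 'conflicts' test above).
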
-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in:
--    * a list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)] -- nodes of the block, annotated with
                                     -- the set of registers live *after*
                                     -- this node.

     -> Assignments                  -- The current list of
                                     -- assignments we are sinking.
                                     -- Earlier assignments may refer
                                     -- to later ones.

     -> ( Block CmmNode O O          -- The new block
        , Assignments                -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
  where
    go []               block as = (block, as)
    go ((live,node):ns) block as
      | shouldDiscard node live           = go ns block as
        -- discard dead assignment
      | Just a <- shouldSink dflags node2 = go ns block (a : as1)
      | otherwise                         = go ns block' as'
      where
        node1 = constantFoldNode dflags node

        (node2, as1) = tryToInline dflags live node1 as

        (dropped, as') = dropAssignmentsSimple dflags
                           (\a -> conflicts dflags a node2) as1

        block' = foldl' blockSnoc block dropped `blockSnoc` node2

--
-- Heuristic to decide whether to pick up and sink an assignment
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
  = case node of
      CmmAssign r (CmmReg r') | r == r' -> True
      CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
      _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped

-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet           -- set of registers live after this
                            -- node.  We cannot inline anything
                            -- that is live after the node, unless
                            -- it is small enough to duplicate.
   -> CmmNode O x           -- The node to inline into
   -> Assignments           -- Assignments to inline
   -> (
        CmmNode O x         -- New node
      , Assignments         -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline        = dont_inline
   | occurs_none          = discard  -- Note [discard during inlining]
   | occurs_once          = inline_and_discard
   | isTrivial dflags rhs = inline_and_keep
   | otherwise            = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
        -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other

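-- Illustrative only: given the pending assignment list [ x = R1 ] and the
-- node
--
--    if (x & 7 >= 2) goto c3EU; else goto c3EV;
--
-- where x occurs once in the node and is not live afterwards, tryToInline
-- rewrites the node to
--
--    if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
--
-- and removes x = R1 from the assignment list (cf. the cgrun020 example
-- in Note [Inline GlobalRegs?] below).
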

{- Note [improveConditional]

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaN).

But we *can* optimise this conditional by swapping the true and false
branches.  Given
  CmmCondBranch ((a >## b) != 1) t f
we can turn it into
  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply the above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps (see swapcond_last in
CmmContFlowOpt), so instead we carefully look for just the cases where
we can eliminate a comparison.
-}
improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _) = True
    neLike (MO_U_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
    neLike (MO_S_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
    neLike _ = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--   [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                = True
  | localRegistersConflict  dflags rhs node                = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node   = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node   = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node   = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem            = True

  -- (7) otherwise, no conflict
  | otherwise = False

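-- Illustrative only: consider the sunk assignment x = I64[Sp + 8], whose
-- cached AbsMem is SpMem 8 8.  The node I64[Sp + 8] = y conflicts with it
-- by rule (3); Sp = Sp + 16 conflicts by rule (4); but I64[Sp + 16] = y
-- touches a disjoint stack slot, so the assignment can safely be commuted
-- past it.
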
-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall).  We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]) which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]).  This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember that we are not sinking assignments
--      to the stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }.  In this way we will
--      never need to check any assignment conflicts with CmmCall.  Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout.  If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls.  Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int   -- n, the offset from Sp
       {-# UNPACK #-} !Int   -- the size of the access, in bytes

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier (e.g. AtomicRMW), and
-- therefore we should never float any memory operations across one of
-- these calls.

bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _ _ = True

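-- Illustrative only (offsets and widths in bytes):
--
--   memConflicts (SpMem 8 8) (SpMem 16 8)  == False  -- disjoint Sp slots
--   memConflicts (SpMem 8 8) (SpMem 12 8)  == True   -- overlapping Sp slots
--   memConflicts HeapMem     StackMem      == False  -- heap and stack never alias
--   memConflicts AnyMem      HeapMem       == True   -- AnyMem overlaps everything
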
exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
    CmmReg r      -> regAddr dflags r 0 w
    CmmRegOff r i -> regAddr dflags r i w
    _other | regUsedIn dflags spReg e -> StackMem
           | otherwise                -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _ (CmmGlobal Hp) _ _ = HeapMem
regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _ _ _ _ = AnyMem

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

 c3EN:
     _s3DB::P64 = R1;
     _c3ES::P64 = _s3DB::P64 & 7;
     if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
 c3EU:
     _s3DD::P64 = P64[_s3DB::P64 + 6];
     _s3DE::P64 = P64[_s3DB::P64 + 14];
     I64[Sp - 8] = c3F0;
     R1 = _s3DE::P64;
     P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

 c3EN:
     if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
 c3EU:
     I64[Sp - 8] = c3F0;
     _s3DD::P64 = P64[R1 + 6];
     R1 = P64[R1 + 14];
     P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

     _s3DB::P64 = R1;
     if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
 c3EU:
     I64[Sp - 8] = c3F0;
     R1 = P64[_s3DB::P64 + 14];
     P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

 _s3L2::F32 = F1;
 _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
 (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

 (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                    10.0 :: W32));
-}