import Hoopl.Collections
import CodeGen.Platform
import Platform (isARM, platformArch)

import qualified Data.IntSet as IntSet
import Data.List (partition)
import qualified Data.Set as Set

-- Compact sets for membership tests of local variables.

type LRegSet = IntSet.IntSet

emptyLRegSet :: LRegSet
emptyLRegSet = IntSet.empty

nullLRegSet :: LRegSet -> Bool
nullLRegSet = IntSet.null

insertLRegSet :: LocalReg -> LRegSet -> LRegSet
insertLRegSet l = IntSet.insert (getKey (getUnique l))

elemLRegSet :: LocalReg -> LRegSet -> Bool
elemLRegSet l = IntSet.member (getKey (getUnique l))
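
-- A minimal usage sketch (illustrative only; r1 and r2 stand for two
-- hypothetical LocalRegs with distinct Uniques):
--
--   let s = insertLRegSet r1 emptyLRegSet
--   in (r1 `elemLRegSet` s, r2 `elemLRegSet` s)   -- ==> (True, False)
--
-- Keying the IntSet by each register's Unique makes membership tests
-- cheap Int operations, compared to a Set LocalReg.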

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--  * If N is an assignment to a local register, pick up the assignment
--    and add it to A.
--
--  * If N is not an assignment to a local register:
--    * remove any assignments from A that conflict with N, and
--      place them before N in the current block.  We call this
--      "dropping" the assignments.
--
--    * An assignment conflicts with N if it:
--      - assigns to a register mentioned in N
--      - mentions a register assigned by N
--      - reads from memory written by N
--    * do this recursively, dropping dependent assignments
--      (a worked example follows this comment block)
--
--  * At an exit node:
--    * drop any assignments that are live on more than one successor
--      and are not trivial
--    * if any successor has more than one predecessor (a join-point),
--      drop everything live in that successor.  Since we only propagate
--      assignments that are not dead at the successor, we will therefore
--      eliminate all assignments dead at this point.  Thus analysis of a
--      join-point will always begin with an empty list of assignments.
--
-- As a result of the above algorithm, sinking deletes some dead
-- assignments (transitively, even).  This isn't as good as
-- removeDeadAssignments, but it's much cheaper.
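--
-- A worked example of "dropping" (illustrative; the register names are
-- made up).  Suppose A = [ x = y + 1, y = R1 ] (newest first) and the
-- next node N is
--
--    I64[Hp] = x
--
-- N mentions x, so x = y + 1 conflicts with N and must be dropped
-- before it.  Dropping it in turn forces y = R1 to be dropped, because
-- the node we just dropped reads y.  The emitted sequence is therefore
--
--    y = R1
--    x = y + 1
--    I64[Hp] = x
--
-- while any non-conflicting assignments in A continue to sink past N.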

-- -----------------------------------------------------------------------------
-- Things that we aren't optimising very well yet.
--
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order.
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1
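
-- A small illustration of the reverse ordering (flushAssignments is a
-- hypothetical helper, not part of the pass): turning an assignment
-- list back into program order means reversing it and converting each
-- element with 'toNode' (defined later in this module).
--
--   flushAssignments :: Assignments -> [CmmNode O O]
--   flushAssignments as = map toNode (reverse as)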

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where

  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live        = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _             -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.

isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _          = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)
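
-- For instance (a sketch with hypothetical nodes n1 and n2): given the
-- block [n1, n2] and the set L live after the block, the foldr proceeds
-- right to left and yields
--
--   [ (gen_kill dflags n2 L, n1), (L, n2) ]
--
-- i.e. each node is paired with the registers live *after* it.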

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
 where
  all_succs = concatMap successors blocks

  succ_counts :: LabelMap Int
  succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
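
-- A quick illustration (hypothetical blocks and labels): if block b1 has
-- successors [L2, L3] and block b2 has successors [L3], then
--
--   succ_counts = { L2 -> 1, L3 -> 2 }
--
-- and findJoinPoints [b1, b2] = { L3 -> 2 }: L3 has two predecessors,
-- so it is a join point.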

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       --  Note that we must keep assignments that are
                       --  referred to by other assignments we have
                       --  already kept.

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
--
-- On input we pass in a:
--    * list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- each node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )
walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl' blockSnoc block dropped `blockSnoc` node2

--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False

toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped

-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   ->
     ( CmmNode O x              -- New node
     , Assignments              -- Remaining assignments
     )
tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline           = dont_inline
   | occurs_none             = discard  -- Note [discard during inlining]
   | occurs_once             = inline_and_discard
   | isTrivial dflags rhs    = inline_and_keep
   | otherwise               = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
        -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other

{- Note [improveConditional]

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaN).

But we *can* optimise this conditional by swapping the true and false
branches.  Given

  CmmCondBranch ((a >## b) != 1) t f

we can turn it into

  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply the above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps (see swapcond_last in CmmContFlowOpt),
so instead we carefully look for just the cases where we can eliminate
a comparison.
-}

improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _)   = True
    neLike (MO_U_Lt _) = True   -- (x<y) < 1  behaves like  (x<y) != 1
    neLike (MO_S_Lt _) = True   -- (x<y) < 1  behaves like  (x<y) != 1
    neLike _           = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ].
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers, really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing].
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node          = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
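
-- For example (register names made up): the assignment (x, R1, NoMem)
-- conflicts with the node "R1 = y" by rule (1), because the node defines
-- the global register R1 read by the assignment's RHS, and it conflicts
-- with "I64[Sp] = x" by rule (2), because that node uses x.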

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall).  We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]), which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]).  This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout.
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      stack slots):
--
--         x = R1
--         ...
--         P64[Sp - 8] = x
--         ...
--         call f() returns L2
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }.  In this way we will
--      never need to check any assignment conflicts with CmmCall.  Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout.  If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls.  Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ...
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--
-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--    x = Hp + n
--    [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.

bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem
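
-- For instance, bothMems HeapMem StackMem = AnyMem (the fall-through
-- case), while bothMems (SpMem 0 8) (SpMem 0 4) = SpMem 0 8: two
-- accesses at the same Sp offset merge, keeping the wider one.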

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _ _ = True
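
-- Two Sp slots conflict exactly when their byte ranges overlap.  For
-- example, memConflicts (SpMem 0 8) (SpMem 4 8) is True (bytes [0,8)
-- and [4,12) overlap), but memConflicts (SpMem 0 4) (SpMem 4 8) is
-- False: the first access ends exactly where the second begins.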

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w))
                                            (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
    CmmReg r      -> regAddr dflags r 0 w
    CmmRegOff r i -> regAddr dflags r i w
    _other | regUsedIn dflags spReg e -> StackMem
           | otherwise                -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _ (CmmGlobal Hp) _ _ = HeapMem
regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _ _ _ _ = AnyMem
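
-- So, for example, a 64-bit load from [Sp + 8] is abstracted as
-- SpMem 8 8 (offset 8, width 8 bytes), a load from [Hp - 16] as
-- HeapMem, and a load through a non-GC-pointer local register as
-- AnyMem.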

{- Note [Inline GlobalRegs?]

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      ...
      P64[Sp] = _s3DD::P64;
      ...

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[R1 + 6];
      ...
      P64[Sp] = _s3DD::P64;
      ...

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      ...
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];
      ...

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

      movq $c3F0_info,-8(%rbp)
      ...

because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inlined F1:

  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,