{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import UniqFM
import PprCmm ()

import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe
-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- We really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, looking at each node N:
--
--    * If N is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If an assignment in A (1) occurs only once in N, and (2) is
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (its RHS is a local register),
--        inline the assignment and keep it in A in case it is used
--        afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.  (A worked example follows this
--        comment.)
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join-point),
--        drop everything live in that successor.  Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point.  Thus analysis of a
--        join-point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.
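--
-- As a worked example of the dropping rule above (schematic; the
-- register names are invented for illustration):
--
--    A = [ x = y + 1 ]          -- pending assignments
--    N:  y = 2                  -- the node we are walking past
--
-- The RHS of { x = y + 1 } mentions y, which N assigns, so the
-- assignment conflicts with N and is dropped just before it, giving
--
--    x = y + 1
--    y = 2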

-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--
-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = postorderDfs graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
          case filter (Set.member r) live_sets of
            (_one:_two:_) -> True
            _             -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                            | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)
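
-- For example (illustrative only): given nodes [n1, n2] and the set
-- live of registers live after n2, the result is
--
--    [ (gen_kill dflags n2 live, n1)   -- registers live after n1
--    , (live, n2) ]                    -- registers live after n2
--
-- i.e. each node is paired with the liveness computed from the nodes
-- that follow it.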

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: LabelMap Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
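
-- For example (hypothetical labels): if the graph contains blocks
-- A -> [C], B -> [C] and C -> [A], then
--
--    succ_counts = { A -> 1, C -> 2 }
--
-- and findJoinPoints returns { C -> 2 }: C has two predecessors, so it
-- is a join point.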

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.
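
-- A sketch of why the conflict check matters (invented names): with
-- live = { y } and assigs = [ (y, x+1, _), (x, e, _) ] (reverse order,
-- so x = e comes first in the program), we keep y = x+1 because y is
-- live, and then we must also keep x = e because the kept node
-- y = x+1 mentions x, i.e. 'conflicts' holds for (x, e, _) against it.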

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in a:
--    * list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl blockSnoc block dropped `blockSnoc` node2

--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped
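
-- A sketch of the recursive dropping (invented names): with
-- assigs = [ (y, x+1, _), (x, e, _) ] (reverse order) and a predicate
-- that drops y, we first emit the node { y = x+1 }; when we then look
-- at (x, e, _), the dropped node { y = x+1 } mentions x, so the
-- 'conflicts' check forces x to be dropped as well, yielding
--
--    x = e
--    y = x + 1
--
-- in program order.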


-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node [] assigs
 where
   usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
   usages = foldLocalRegsUsed dflags addUsage emptyUFM node

   go _usages node _skipped [] = (node, [])

   go usages node skipped (a@(l,rhs,_) : rest)
    | cannot_inline        = dont_inline
    | occurs_none          = discard  -- Note [discard during inlining]
    | occurs_once          = inline_and_discard
    | isTrivial dflags rhs = inline_and_keep
    | otherwise            = dont_inline
    where
      inline_and_discard = go usages' inl_node skipped rest
        where usages' = foldLocalRegsUsed dflags addUsage usages rhs

      discard = go usages node skipped rest

      dont_inline     = keep node      -- don't inline the assignment, keep it
      inline_and_keep = keep inl_node  -- inline the assignment, keep it

      keep node' = (final_node, a : rest')
        where (final_node, rest') = go usages' node' (l:skipped) rest
              usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                          usages rhs
              -- we must not inline anything that is mentioned in the RHS
              -- of a binding that we have already skipped, so we set the
              -- usages of the regs on the RHS to 2.

      cannot_inline = skipped `regsUsedIn` rhs  -- Note [dependent assignments]
                      || l `elem` skipped
                      || not (okToInline dflags rhs node)

      l_usages = lookupUFM usages l
      l_live   = l `elemRegSet` live

      occurs_once = not l_live && l_usages == Just 1
      occurs_none = not l_live && l_usages == Nothing

      inl_node = case mapExpDeep inl_exp node of
                   -- See Note [Improving conditionals]
                   CmmCondBranch (CmmMachOp (MO_Ne w) args) ti fi l
                     -> CmmCondBranch (cmmMachOpFold dflags (MO_Eq w) args) fi ti l
                   node' -> node'

      inl_exp :: CmmExpr -> CmmExpr
      -- inl_exp is where the inlining actually takes place!
      inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
      inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                  = cmmOffset dflags rhs off
                  -- re-constant fold after inlining
      inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
      inl_exp other = other
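
-- A small example of the common case (schematic, with invented names):
-- with live = {} and assigs = [ (x, R1, _) ], inlining into the node
--
--    y = x + 1
--
-- finds that x occurs once and is not live afterwards (occurs_once),
-- so the node becomes { y = R1 + 1 } and the assignment is dropped
-- from the list.  If x were used again later but its RHS were trivial,
-- we would inline it and keep it (inline_and_keep).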

{- Note [Improving conditionals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Given
  CmmCondBranch ((a >## b) != 1) t f
where a, b are Floats, the constant folder /cannot/ turn it into
  CmmCondBranch (a <=## b) t f
because comparisons on floats are not invertible
(see CmmMachOp.maybeInvertComparison).

What we want instead is simply to reverse the true/false branches thus
  CmmCondBranch ((a >## b) != 1) t f
-->
  CmmCondBranch (a >## b) f t

And we do that right here in tryToInline, just as we do cmmMachOpFold.
-}

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
regsUsedIn [] _ = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elem` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elem` ls = True
        f _ z = z

-- We don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing].
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node          = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
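
-- For example (schematic): the assignment  x = I64[Sp + 8]  carries
-- addr = SpMem 8 8, so it conflicts with the node  I64[Sp + 8] = e
-- (case 3: the store overlaps the read) and with  Sp = Sp + 16
-- (its RHS reads Sp, which the node defines, and SpMem is invalidated
-- when Sp changes), but it commutes past  I64[Hp - 8] = e  because
-- stack and heap accesses never alias (see memConflicts below).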

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]), which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check any assignment conflicts with CmmCall. Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--    x = Hp + n
--    [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.

bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem
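
-- For instance (illustrative evaluations of the equations above):
--
--    bothMems (SpMem 8 4) (SpMem 8 8)   ==  SpMem 8 8  -- same slot, widened
--    bothMems (SpMem 8 8) (SpMem 16 8)  ==  StackMem   -- different slots
--    bothMems HeapMem     StackMem      ==  AnyMem     -- falls through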

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _        _        = True
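
-- For instance (illustrative evaluations): two Sp-relative accesses
-- conflict exactly when their byte ranges overlap:
--
--    memConflicts (SpMem 0 8)  (SpMem 8 8)  ==  False  -- 0+8 > 8 fails
--    memConflicts (SpMem 0 16) (SpMem 8 8)  ==  True   -- [0,16) overlaps [8,16)
--    memConflicts HeapMem      StackMem     ==  False  -- heap/stack are disjoint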

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w) = bothMems (loadAddr dflags addr (typeWidth w))
                                           (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
    CmmReg r      -> regAddr dflags r 0 w
    CmmRegOff r i -> regAddr dflags r i w
    _other | regUsedIn dflags (CmmGlobal Sp) e -> StackMem
           | otherwise -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem
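
-- For instance (illustrative evaluations, assuming a 64-bit target):
--
--    loadAddr dflags (CmmRegOff (CmmGlobal Sp) 16) W64    ==  SpMem 16 8
--    loadAddr dflags (CmmRegOff (CmmGlobal Hp) (-8)) W64  ==  HeapMem
--
-- and exprMem of a load such as I64[Sp + 16] combines the memory read
-- by the load with the memory read by the address expression itself:
-- bothMems (SpMem 16 8) NoMem == SpMem 16 8.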

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences, e.g. (cgrun020):

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

Because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                     10.0 :: W32));
-}