{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl.Block
import Hoopl.Label
import Hoopl.Collections
import Hoopl.Graph
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import UniqFM
import PprCmm ()

import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- We really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, looking at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--  * If N is an assignment to a local register, pick up the assignment
--    and add it to A.
--
--  * If N is not an assignment to a local register:
--    * remove any assignments from A that conflict with N, and
--      place them before N in the current block.  We call this
--      "dropping" the assignments.
--
--    * An assignment conflicts with N if it:
--      - assigns to a register mentioned in N
--      - mentions a register assigned by N
--      - reads from memory written by N
--    * do this recursively, dropping dependent assignments
--
--  * At an exit node:
--    * drop any assignments that are live on more than one successor
--      and are not trivial
--    * if any successor has more than one predecessor (a join point),
--      drop everything live in that successor.  Since we only propagate
--      assignments that are not dead at the successor, we will therefore
--      eliminate all assignments dead at this point.  Thus analysis of a
--      join point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.  A small example of (c) and (d) follows.
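--
-- For illustration, a hypothetical fragment (not real compiler output)
-- showing how (c) and (d) interact:
--
--    x = y + 1
--    if (x > 0) then L1 else L2
--
-- x occurs exactly once in the conditional and is not live afterwards,
-- so it is inlined there and the assignment itself is discarded:
--
--    if (y + 1 > 0) then L1 else L2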

-- -----------------------------------------------------------------------------
-- Things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- A nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order.
  -- So the list [ x = e1, y = e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = postorderDfs graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _             -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop = conflicts dflags a final_last
                          || not (isTrivial dflags rhs) && live_in_multi live_sets r
                          || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False
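
-- For illustration (an assumption about the target, not checked here): on
-- a platform where R1 maps to a real machine register, globalRegMaybe
-- returns a Just and the corresponding CmmReg counts as trivial; a global
-- register with no machine-register mapping would be a load from BaseReg,
-- which we must not duplicate.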

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
 where
  all_succs = concatMap successors blocks

  succ_counts :: LabelMap Int
  succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
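
-- For example, in the diamond shape in (2) above, the block containing
-- "use of x" is a successor of both A and B, so its label is counted
-- twice and it is classified as a join point.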

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

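-- For example (a sketch): with assigs = [ x = y, y = e ] (reverse order,
-- i.e. the sequence y = e; x = y) and live = { x }, we keep x = y because
-- x is live, and then also keep y = e, because the already-kept x = y
-- mentions y and therefore conflicts with it.
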
-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in:
--    * the list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)] -- nodes of the block, annotated with
                                     -- the set of registers live *after*
                                     -- this node.

     -> Assignments                  -- The current list of
                                     -- assignments we are sinking.
                                     -- Earlier assignments may refer
                                     -- to later ones.

     -> ( Block CmmNode O O          -- The new block
        , Assignments                -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped
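
-- For example (a sketch): with A = [ x = y + 1, y = e ] (reverse order)
-- and a predicate that drops x = y + 1, we must also drop y = e: it
-- conflicts with the already-dropped node x = y + 1, whose RHS mentions
-- y.  This is the recursive "dropping dependent assignments" step from
-- the algorithm at the top of this module.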


-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node [] assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline        = dont_inline
   | occurs_none          = discard  -- Note [discard during inlining]
   | occurs_once          = inline_and_discard
   | isTrivial dflags rhs = inline_and_keep
   | otherwise            = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (l:skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elem` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = case mapExpDeep inl_exp node of
                     -- See Note [Improving conditionals]
                     CmmCondBranch (CmmMachOp (MO_Ne w) args)
                                   ti fi l
                       -> CmmCondBranch (cmmMachOpFold dflags (MO_Eq w) args)
                                        fi ti l
                     node' -> node'

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
                    -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other

{- Note [Improving conditionals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Given
  CmmCondBranch ((a >## b) != 1) t f
where a, b are Floats, the constant folder /cannot/ turn it into
  CmmCondBranch (a <=## b) t f
because comparisons on floats are not invertible
(see CmmMachOp.maybeInvertComparison).

What we want instead is simply to reverse the true/false branches thus
  CmmCondBranch ((a >## b) != 1) t f
-->
  CmmCondBranch (a >## b) f t

And we do that right here in tryToInline, just as we do cmmMachOpFold.
-}

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
regsUsedIn [] _ = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elem` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elem` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing].
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node          = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
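
-- For example (hypothetical): the assignment { x = I64[Sp] }, whose AbsMem
-- is SpMem 0 8, commutes past { I64[Sp + 8] = y } because the two stack
-- slots do not overlap, but it conflicts with { Sp = Sp + 16 } by rule (4).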

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
  foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
               False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
  foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
               False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall).  We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]), which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]).  This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns to L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect a conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }.  In this way we will
--      never need to check whether any assignment conflicts with CmmCall.
--      Remember that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout.  If we run
-- it before stack layout, we need to check for possible conflicts with all three
-- kinds of calls.  Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier (e.g. AtomicRMW), and
-- therefore we should never float any memory operations across one of
-- these calls.


bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem
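
-- For example: bothMems (SpMem 8 4) (SpMem 8 8) = SpMem 8 8, widening the
-- slot at the same offset, whereas bothMems (SpMem 8 8) (SpMem 16 8) =
-- StackMem, because we only track one Sp-relative slot precisely.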

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _        _        = True
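
-- For example: SpMem 8 8 and SpMem 16 8 do not conflict (8 + 8 > 16 is
-- False: the byte ranges [Sp+8, Sp+16) and [Sp+16, Sp+24) are disjoint),
-- but SpMem 8 16 and SpMem 16 8 do conflict, since 8 + 16 > 16.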

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
    CmmReg r      -> regAddr dflags r 0 w
    CmmRegOff r i -> regAddr dflags r i w
    _other | regUsedIn dflags (CmmGlobal Sp) e -> StackMem
           | otherwise -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem
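
-- For example: exprMem for I64[Sp + 8] is SpMem 8 8 (an 8-byte slot at
-- offset 8 from Sp), exprMem for I64[Hp - 8] is HeapMem, and a load whose
-- address we cannot classify gets AnyMem, which conflicts with everything
-- except NoMem.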

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences, e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

  .Lc3EU:
      movq $c3F0_info,-8(%rbp)
      movq %rbx,%rax
      movq 14(%rbx),%rbx
      movq 6(%rax),%rax
      movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, and so we lose the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163): this
is what happens when we inline F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints: [] result hints: []
                 rintFloat(%MO_F_Mul_W32(_s3L2::F32, 10.0 :: W32));
-}