{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import Cmm
import CmmOpt
import BlockId
import CmmLive
import CmmUtils
import Hoopl
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import UniqFM
import PprCmm ()

import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. assignment to a register that is
--      not used after N, discard it.
--
--    * Try to inline based on current list of assignments
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--  * If N is an assignment to a local register, pick up the assignment
--    and add it to A.
--
--  * If N is not an assignment to a local register:
--    * remove any assignments from A that conflict with N, and
--      place them before N in the current block.  We call this
--      "dropping" the assignments.
--
--    * An assignment conflicts with N if it:
--      - assigns to a register mentioned in N
--      - mentions a register assigned by N
--      - reads from memory written by N
--    * do this recursively, dropping dependent assignments
--
--  * At an exit node:
--    * drop any assignments that are live on more than one successor
--      and are not trivial
--    * if any successor has more than one predecessor (a join-point),
--      drop everything live in that successor.  Since we only propagate
--      assignments that are not dead at the successor, we will therefore
--      eliminate all assignments dead at this point.  Thus analysis of a
--      join-point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.
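--
-- For example (a sketch, tracing the algorithm over the copyIn
-- sequence above): walking forwards, we pick up all three assignments,
-- keeping A with the most recent at the head:
--
--    x1 = R1          -- A = [ x1=R1 ]
--    x2 = Sp[8]       -- A = [ x2=Sp[8], x1=R1 ]
--    x3 = Sp[16]      -- A = [ x3=Sp[16], x2=Sp[8], x1=R1 ]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- Nothing in A is mentioned by the conditional, so nothing is inlined
-- or dropped at the exit node.  Assuming x1..x3 are dead in L1 and L2
-- is not a join point, all three assignments are propagated into L2.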

-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = postorderDfs graph

  join_pts = findJoinPoints blocks

  sink :: BlockEnv Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True --
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _          = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)

--
-- Find the blocks that have more than one predecessor (join points)
--
findJoinPoints :: [CmmBlock] -> BlockEnv Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: BlockEnv Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
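-- For example, if blocks A and B both end in "goto L", then L occurs
-- twice in all_succs, so succ_counts maps L to 2 and L is a join point.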

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in:
--    * a list of the nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped

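-- For example (a sketch): with assigs = [ b = 2, a = b ] (recall the
-- list is reversed, so a = b happened first), a node N that reads b
-- forces us to drop { b = 2 }.  The dropped node defines b, which is
-- read by the RHS of { a = b }, so that assignment conflicts with the
-- dropped one and is (recursively) dropped too; both are then emitted
-- before N in their original order:  a = b;  b = 2.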

-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node [] assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline            = dont_inline
   | occurs_none              = discard  -- Note [discard during inlining]
   | occurs_once              = inline_and_discard
   | isTrivial dflags rhs     = inline_and_keep
   | otherwise                = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (l:skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elem` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = mapExpDeep inline node
                   -- mapExpDeep is where the inlining actually takes place!
           where inline (CmmReg    (CmmLocal l'))     | l == l' = rhs
                 inline (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
                 -- re-constant fold after inlining
                 inline (CmmMachOp op args) = cmmMachOpFold dflags op args
                 inline other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
regsUsedIn [] _ = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elem` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elem` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node          = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
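-- For example (a sketch): the assignment { x = R1 } conflicts with the
-- node { R1 = y } by (1) and with { y = x } by (2); the assignment
-- { x = I64[Sp + 8] }, whose AbsMem is SpMem 8 8, conflicts with the
-- store { I64[Sp + 8] = z } by (3).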

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall).  We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]) which leads to two
-- invariants related to calls:
--
--   a) during stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check any assignment conflicts with CmmCall. Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls.  Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs type classes.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier (e.g. AtomicRMW), and
-- therefore we should never float any memory operations across one of
-- these calls.


bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem
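-- For example: bothMems (SpMem 8 8) (SpMem 8 16) = SpMem 8 16 (same
-- offset, widest width), bothMems (SpMem 8 8) (SpMem 16 8) = StackMem,
-- and bothMems HeapMem StackMem = AnyMem.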

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _ _ = True
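-- For example, with the overlap check above:
--   memConflicts (SpMem 8 8)  (SpMem 16 8) == False  -- [Sp+8..Sp+16) vs [Sp+16..Sp+24)
--   memConflicts (SpMem 8 16) (SpMem 16 8) == True   -- [Sp+8..Sp+24) overlaps [Sp+16..Sp+24)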

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
   CmmReg r      -> regAddr dflags r 0 w
   CmmRegOff r i -> regAddr dflags r i w
   _other | regUsedIn dflags (CmmGlobal Sp) e -> StackMem
          | otherwise -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem
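-- For example: a load from [Sp + 8] at W64 is SpMem 8 8; a load from
-- [R1 + 8] is HeapMem (R1 is a GC pointer); and a load from an
-- arbitrary computed address that doesn't mention Sp is AnyMem.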

{-
Note [Inline GlobalRegs?]

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                        10.0 :: W32));
-}