Small optimisation: always sink/inline reg1 = reg2 assignments
[ghc.git] / compiler / cmm / CmmSink.hs
1 {-# LANGUAGE GADTs #-}
2 module CmmSink (
3 cmmSink
4 ) where
5
6 import CodeGen.Platform (callerSaves)
7
8 import Cmm
9 import CmmOpt
10 import BlockId
11 import CmmLive
12 import CmmUtils
13 import Hoopl
14
15 import DynFlags
16 import UniqFM
17 import PprCmm ()
18
19 import Data.List (partition)
20 import qualified Data.Set as Set
21
22 -- -----------------------------------------------------------------------------
23 -- Sinking and inlining
24
25 -- This is an optimisation pass that
26 -- (a) moves assignments closer to their uses, to reduce register pressure
27 -- (b) pushes assignments into a single branch of a conditional if possible
28 -- (c) inlines assignments to registers that are mentioned only once
29 -- (d) discards dead assignments
30 --
31 -- This tightens up lots of register-heavy code. It is particularly
32 -- helpful in the Cmm generated by the Stg->Cmm code generator, in
33 -- which every function starts with a copyIn sequence like:
34 --
35 -- x1 = R1
36 -- x2 = Sp[8]
37 -- x3 = Sp[16]
38 -- if (Sp - 32 < SpLim) then L1 else L2
39 --
40 -- we really want to push the x1..x3 assignments into the L2 branch.
41 --
42 -- Algorithm:
43 --
44 -- * Start by doing liveness analysis.
45 --
46 -- * Keep a list of assignments A; earlier ones may refer to later ones
47 --
48 -- * Walk forwards through the graph, look at each node N:
49 -- * If any assignments in A (1) occur only once in N, and (2) are
50 -- not live after N, inline the assignment and remove it
51 -- from A.
52 -- * If N is an assignment:
53 -- * If the register is not live after N, discard it
54 -- * otherwise pick up the assignment and add it to A
55 -- * If N is a non-assignment node:
56 -- * remove any assignments from A that conflict with N, and
57 -- place them before N in the current block. (we call this
58 -- "dropping" the assignments).
59 -- * An assignment conflicts with N if it:
60 -- - assigns to a register mentioned in N
61 -- - mentions a register assigned by N
62 -- - reads from memory written by N
63 -- * do this recursively, dropping dependent assignments
64 -- * At a multi-way branch:
65 -- * drop any assignments that are live on more than one branch
66 -- * if any successor has more than one predecessor (a
67 -- join-point), drop everything live in that successor
68 --
69 -- As a side-effect we'll delete some dead assignments (transitively,
70 -- even). This isn't as good as removeDeadAssignments, but it's much
71 -- cheaper.
72
73 -- If we do this *before* stack layout, we might be able to avoid
74 -- saving some things across calls/procpoints.
75 --
76 -- *but*, that will invalidate the liveness analysis, and we'll have
77 -- to re-do it.
78
79 -- -----------------------------------------------------------------------------
80 -- things that we aren't optimising very well yet.
81 --
82 -- -----------
83 -- (1) From GHC's FastString.hashStr:
84 --
85 -- s2ay:
86 -- if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
87 -- c2gn:
88 -- R1 = _s2au::I64;
89 -- call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
90 -- c2gp:
91 -- _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
92 -- 4091);
93 -- _s2an::I64 = _s2an::I64 + 1;
94 -- _s2au::I64 = _s2cO::I64;
95 -- goto s2ay;
96 --
97 -- a nice loop, but we didn't eliminate the silly assignment at the end.
98 -- See Note [dependent assignments], which would probably fix this.
99 --
100 -- -----------
101 -- (2) From stg_atomically_frame in PrimOps.cmm
102 --
103 -- We have a diamond control flow:
104 --
105 -- x = ...
106 -- |
107 -- / \
108 -- A B
109 -- \ /
110 -- |
111 -- use of x
112 --
113 -- Now x won't be sunk down to its use, because we won't push it into
114 -- both branches of the conditional. We certainly do have to check
115 -- that we can sink it past all the code in both A and B, but having
116 -- discovered that, we could sink it to its use.
117 --
118
119 -- -----------------------------------------------------------------------------
120
type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment, so that 'conflicts' does not have to
  -- recompute it at every node we try to commute the assignment past.
124
-- | Sink assignments in a 'CmmGraph': move assignments to local
-- registers closer to their uses (possibly into one branch of a
-- conditional), inline assignments whose register is mentioned only
-- once, and discard dead assignments.  See the algorithm sketch at
-- the top of this module.
cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where

  liveness = cmmLiveness graph
  -- registers live on entry to a block (empty if the label is unknown)
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = postorderDfs graph

  join_pts = findJoinPoints blocks

  -- 'sunk' maps each not-yet-processed block label to the list of
  -- assignments that have been sunk into it from its predecessor.
  sink :: BlockEnv [Assignment] -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node. This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill last live
      ann_middles = annotate live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFold dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
        case filter (Set.member r) live_sets of
          (_one:_two:_) -> True
          _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      -- Decide whether an assignment must be dropped (emitted here)
      -- rather than sunk into a successor; threading the per-branch
      -- live sets as state so that sinking one assignment can affect
      -- the decision for the next.
      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop = conflicts dflags a final_last
                       || not (isTrivial rhs) && live_in_multi live_sets r
                       || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            -- if the assignment is sunk into a branch, the registers
            -- its RHS reads become live in that branch too
            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed extendRegSet emptyRegSet rhs

      final_middle = foldl blockSnoc middle' dropped_last

      -- record, for each successor, the assignments sunk into it
      -- (filtered down to the ones its continuation actually needs)
      sunk' = mapUnion sunk $
                mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                            | l <- succs ]
197
198 {- TODO: enable this later, when we have some good tests in place to
199 measure the effect and tune it.
200
201 -- small: an expression we don't mind duplicating
202 isSmall :: CmmExpr -> Bool
203 isSmall (CmmReg (CmmLocal _)) = True -- not globals, we want to coalesce them instead
204 isSmall (CmmLit _) = True
205 isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
206 isSmall (CmmRegOff (CmmLocal _) _) = True
207 isSmall _ = False
208 -}
209
-- | An expression cheap enough to duplicate freely when sinking:
-- currently only a reference to a local register.  (Literals are
-- deliberately excluded for now; see the commented-out case.)
isTrivial :: CmmExpr -> Bool
isTrivial expr = case expr of
  CmmReg (CmmLocal _) -> True
  -- CmmLit _         -> True
  _                   -> False
214
--
-- Pair every node with the set of registers that are live *after*
-- the node, working backwards from the liveness at the block end.
--
annotate :: RegSet -> [CmmNode O O] -> [(RegSet, CmmNode O O)]
annotate live_at_end nodes = snd (foldr step (live_at_end, []) nodes)
  where
    step node (live_after, annotated) =
      (gen_kill node live_after, (live_after, node) : annotated)
221
--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> BlockEnv Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    -- for each label: how many blocks list it as a successor,
    -- i.e. its number of predecessors
    succ_counts :: BlockEnv Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
232
--
-- Prune the assignment list, keeping only assignments whose register
-- is live in the continuation, plus any assignments that a kept
-- assignment refers to (transitively).
--
filterAssignments :: DynFlags -> RegSet -> [Assignment] -> [Assignment]
filterAssignments dflags live assigs = reverse (go assigs [])
  where
    go [] kept = kept
    go (a@(r, _, _) : rest) kept
      | needed    = go rest (a : kept)
      | otherwise = go rest kept
      where
        -- Keep the assignment if its register is live, or if any
        -- assignment we have already decided to keep mentions it.
        needed = r `Set.member` live
              || any (conflicts dflags a . toNode) kept
248
-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.

walk :: DynFlags
     -> [(RegSet, CmmNode O O)]   -- nodes of the block, annotated with
                                  -- the set of registers live *after*
                                  -- this node.

     -> [Assignment]              -- The current list of
                                  -- assignments we are sinking.
                                  -- Later assignments may refer
                                  -- to earlier ones.

     -> ( Block CmmNode O O       -- The new block
        , [Assignment]            -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
  where
    go [] block as = (block, as)
    go ((live,node):ns) block as
      -- dead assignment or no-op "r = r": drop the node entirely
      | shouldDiscard node live = go ns block as
      -- an assignment to a local register: pick it up and carry it
      -- forwards (note we test node2, i.e. after inlining into it)
      | Just a <- shouldSink dflags node2 = go ns block (a : as1)
      -- any other node: emit it, preceded by every carried assignment
      -- that conflicts with it
      | otherwise = go ns block' as'
      where
        node1 = constantFold dflags node

        (node2, as1) = tryToInline dflags live node1 as

        (dropped, as') = dropAssignmentsSimple dflags
                           (\a -> conflicts dflags a node2) as1

        block' = foldl blockSnoc block dropped `blockSnoc` node2
283
284
-- | Apply simple constant folding to every expression in a node:
-- fold machine operations with 'cmmMachOpFold' and rewrite a
-- zero-offset 'CmmRegOff' to a plain register reference.
constantFold :: DynFlags -> CmmNode e x -> CmmNode e x
constantFold dflags node = mapExpDeep simplify node
  where
    simplify expr = case expr of
      CmmMachOp op args -> cmmMachOpFold dflags op args
      CmmRegOff reg 0   -> CmmReg reg
      _                 -> expr
290
--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up every assignment to a local register; sinking
-- assignments to global regs might be profitable too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e)
  | sinkable = Just (r, e, exprMem dflags e)
  where
    sinkable = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing
301
--
-- Decide whether an assignment can simply be thrown away: either it
-- is a no-op "r = r", or its target register is dead afterwards.
--
-- This doesn't do as good a job as removeDeadAssignments, because it
-- would need multiple passes to get all the dead code, but it catches
-- the common case of superfluous reloads from the stack that the
-- stack allocator leaves behind.
--
-- We catch "r = r" here because the inliner won't: it sees that r is
-- live after the instruction and so declines to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> RegSet -> Bool
shouldDiscard node live = case node of
  CmmAssign r (CmmReg r')
    | r == r'              -> True                       -- no-op "r = r"
  CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)  -- dead afterwards
  _otherwise               -> False
319
320
-- | Turn a picked-up assignment back into a Cmm assignment node.
toNode :: Assignment -> CmmNode O O
toNode (reg, rhs, _mem) = CmmAssign (CmmLocal reg) rhs
323
-- | 'dropAssignments' specialised to a stateless drop predicate.
dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> [Assignment]
                      -> ([CmmNode O O], [Assignment])
dropAssignmentsSimple dflags p = dropAssignments dflags (\a _ -> (p a, ())) ()
327
-- | Split the assignment list into those we must stop sinking here
-- (returned as nodes, ready to emit) and those we can keep sinking.
-- An assignment is dropped when the caller's (stateful) predicate
-- says so, or when it conflicts with an assignment already dropped:
-- dependent assignments have to be dropped together.
dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> [Assignment]
                -> ([CmmNode O O], [Assignment])
dropAssignments dflags should_drop init_state assigs
  = (dropped, reverse kept)
  where
    (dropped, kept) = scan init_state assigs [] []

    scan _ [] dropped_acc kept_acc = (dropped_acc, kept_acc)
    scan st (a : rest) dropped_acc kept_acc
      | must_drop = scan st' rest (toNode a : dropped_acc) kept_acc
      | otherwise = scan st' rest dropped_acc (a : kept_acc)
      where
        (drop_it, st') = should_drop a st
        -- also drop anything conflicting with an already-dropped one
        must_drop = drop_it || any (conflicts dflags a) dropped_acc
342
343
-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.

tryToInline
   :: DynFlags
   -> RegSet                  -- set of registers live after this
                              -- node.  We cannot inline anything
                              -- that is live after the node, unless
                              -- it is small enough to duplicate.
   -> CmmNode O x             -- The node to inline into
   -> [Assignment]            -- Assignments to inline
   -> (
        CmmNode O x           -- New node
      , [Assignment]          -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node [] assigs
  where
    -- how many times each register is mentioned in the target node
    usages :: UniqFM Int
    usages = foldRegsUsed addUsage emptyUFM node

    go _usages node _skipped [] = (node, [])

    -- 'skipped' is the list of registers whose assignments we have
    -- passed over without inlining; anything mentioning one of them
    -- must not be inlined past it (see Note [dependent assignments]).
    go usages node skipped (a@(l,rhs,_) : rest)
      | can_inline    = inline_and_discard
      | isTrivial rhs = inline_and_keep  -- cheap to duplicate: inline
                                         -- but keep sinking the original
      where
        inline_and_discard = go usages' node' skipped rest

        inline_and_keep = (node'', a : rest')
          where (node'',rest') = go usages' node' (l:skipped) rest

        can_inline =
             not (l `elemRegSet` live)
          && not (skipped `regsUsedIn` rhs) -- Note [dependent assignments]
          && okToInline dflags rhs node
          && lookupUFM usages l == Just 1   -- mentioned exactly once

        -- registers read by the inlined rhs are now mentioned too
        usages' = foldRegsUsed addUsage usages rhs

        node' = mapExpDeep inline node
          where inline (CmmReg (CmmLocal l'))       | l == l' = rhs
                inline (CmmRegOff (CmmLocal l') off) | l == l'
                              = cmmOffset dflags rhs off
                -- re-constant fold after inlining
                inline (CmmMachOp op args) = cmmMachOpFold dflags op args
                inline other = other

    -- fall-through: neither guard held, so skip this assignment
    go usages node skipped (assig@(l,rhs,_) : rest)
      = (node', assig : rest')
      where (node', rest') = go usages' node (l:skipped) rest
            usages' = foldRegsUsed (\m r -> addToUFM m r 2) usages rhs
            -- we must not inline anything that is mentioned in the RHS
            -- of a binding that we have already skipped, so we set the
            -- usages of the regs on the RHS to 2.
399
400 -- Note [dependent assignments]
401 --
402 -- If our assignment list looks like
403 --
404 -- [ y = e, x = ... y ... ]
405 --
406 -- We cannot inline x. Remember this list is really in reverse order,
407 -- so it means x = ... y ...; y = e
408 --
409 -- Hence if we inline x, the outer assignment to y will capture the
410 -- reference in x's right hand side.
411 --
412 -- In this case we should rename the y in x's right-hand side,
413 -- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
414 -- Now we can go ahead and inline x.
415 --
416 -- For now we do nothing, because this would require putting
417 -- everything inside UniqSM.
418
-- | Record one more mention of a register in the usage map.
addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage counts reg = addToUFM_C (+) counts reg 1
421
-- | Does the expression mention (read, or offset from) any of the
-- given local registers?
regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
regsUsedIn [] _ = False
regsUsedIn regs expr = wrapRecExpf mentions expr False
  where
    mentions (CmmReg (CmmLocal r)) _
      | r `elem` regs = True
    mentions (CmmRegOff (CmmLocal r) _) _
      | r `elem` regs = True
    mentions _ found = found
428
-- We don't inline into a CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing].
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node = case node of
  CmmUnsafeForeignCall{} -> not (anyCallerSavesRegs dflags expr)
  _                      -> True
438
-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) stmt@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past @stmt@.
--
-- We only sink "r = G" assignments right now, so conflicts is very simple:
--
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) an assignment to a register conflicts with a use of the register:
  --     either the node assigns a register our rhs reads, or the node
  --     reads the register we assign
  | CmmAssign reg _ <- node, reg `regUsedIn` rhs = True
  | foldRegsUsed (\b r' -> r == r' || b) False node = True

  -- (2) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (3) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  --     (an Sp-relative slot is invalidated when Sp itself moves)
  | HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
  | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True

  -- (4) assignments that read caller-saves GlobalRegs conflict with a
  -- foreign call.  See Note [foreign calls clobber GlobalRegs].
  | CmmUnsafeForeignCall{} <- node, anyCallerSavesRegs dflags rhs = True

  -- (5) foreign calls clobber heap: see Note [foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem = True

  -- (7) otherwise, no conflict
  | otherwise = False
474
475
-- | Does the expression read any global register that is caller-saved
-- on the target platform (and hence clobbered by a foreign call)?
anyCallerSavesRegs :: DynFlags -> CmmExpr -> Bool
anyCallerSavesRegs dflags expr = wrapRecExpf check expr False
  where
    check (CmmReg (CmmGlobal r)) _
      | callerSaves (targetPlatform dflags) r = True
    check _ found = found
481
-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]: a known slot relative to Sp
       {-# UNPACK #-} !Int   -- n: byte offset from Sp (see 'regAddr')
       {-# UNPACK #-} !Int   -- size: width of the access in bytes
491
492 -- Having SpMem is important because it lets us float loads from Sp
493 -- past stores to Sp as long as they don't overlap, and this helps to
494 -- unravel some long sequences of
495 -- x1 = [Sp + 8]
496 -- x2 = [Sp + 16]
497 -- ...
498 -- [Sp + 8] = xi
499 -- [Sp + 16] = xj
500 --
501 -- Note that SpMem is invalidated if Sp is changed, but the definition
502 -- of 'conflicts' above handles that.
503
504 -- ToDo: this won't currently fix the following commonly occurring code:
505 -- x1 = [R1 + 8]
506 -- x2 = [R1 + 16]
507 -- ..
508 -- [Hp - 8] = x1
509 -- [Hp - 16] = x2
510 -- ..
511
512 -- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that
513 -- assignments to [Hp + n] do not conflict with any other heap memory,
514 -- but this is tricky to nail down. What if we had
515 --
516 -- x = Hp + n
517 -- [x] = ...
518 --
519 -- the store to [x] should be "new heap", not "old heap".
520 -- Furthermore, you could imagine that if we started inlining
521 -- functions in Cmm then there might well be reads of heap memory
522 -- that was written in the same basic block. To take advantage of
523 -- non-aliasing of heap memory we will have to be more clever.
524
525 -- Note [foreign calls clobber]
526 --
527 -- It is tempting to say that foreign calls clobber only
528 -- non-heap/stack memory, but unfortunately we break this invariant in
529 -- the RTS. For example, in stg_catch_retry_frame we call
530 -- stmCommitNestedTransaction() which modifies the contents of the
531 -- TRec it is passed (this actually caused incorrect code to be
532 -- generated).
533 --
534 -- Since the invariant is true for the majority of foreign calls,
535 -- perhaps we ought to have a special annotation for calls that can
536 -- modify heap/stack memory. For now we just use the conservative
537 -- definition here.
538
539
-- | Combine two memory abstractions: an upper bound covering whatever
-- memory either side might touch.  Two Sp-relative slots at the same
-- offset merge into one slot of the larger width; at different
-- offsets we only know "somewhere on the stack".
bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems m1 m2 = case (m1, m2) of
  (NoMem, m)                 -> m
  (m, NoMem)                 -> m
  (HeapMem, HeapMem)         -> HeapMem
  (StackMem, StackMem)       -> StackMem
  (SpMem o1 w1, SpMem o2 w2)
    | o1 == o2               -> SpMem o1 (max w1 w2)
    | otherwise              -> StackMem
  (SpMem{}, StackMem)        -> StackMem
  (StackMem, SpMem{})        -> StackMem
  _                          -> AnyMem
551
-- | Could these two abstract memory accesses refer to the same
-- location?  Heap and stack never alias; two Sp-relative slots
-- conflict only when their byte ranges overlap.
memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts m1 m2 = case (m1, m2) of
  (NoMem, _)          -> False
  (_, NoMem)          -> False
  (HeapMem, StackMem) -> False
  (StackMem, HeapMem) -> False
  (SpMem{}, HeapMem)  -> False
  (HeapMem, SpMem{})  -> False
  (SpMem o1 w1, SpMem o2 w2)
    | o1 < o2         -> o1 + w1 > o2
    | otherwise       -> o2 + w2 > o1
  _                   -> True
563
-- | Abstract the memory read by an expression.
exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags expr = case expr of
  -- a load reads both the addressed memory and whatever the address
  -- expression itself reads
  CmmLoad addr ty  -> bothMems (loadAddr dflags addr (typeWidth ty))
                               (exprMem dflags addr)
  CmmMachOp _ args -> foldr (bothMems . exprMem dflags) NoMem args
  _                -> NoMem
568
-- | Abstract the memory accessed by a load of the given width at the
-- given address expression.
loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags addr w =
  case addr of
    CmmReg r      -> regAddr dflags r 0 w
    CmmRegOff r i -> regAddr dflags r i w
    -- any other address mentioning Sp is conservatively stack memory;
    -- otherwise we know nothing about where it points
    _other
      | CmmGlobal Sp `regUsedIn` addr -> StackMem
      | otherwise                     -> AnyMem
576
-- | Classify a register-plus-offset address.
regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr dflags reg off w = case reg of
  CmmGlobal Sp         -> SpMem off (widthInBytes w)
  CmmGlobal Hp         -> HeapMem
  CmmGlobal CurrentTSO -> HeapMem   -- important for PrimOps
  _ | isGcPtrType (cmmRegType dflags reg) -> HeapMem -- yay! GCPtr pays for itself
    | otherwise                           -> AnyMem