Produce new-style Cmm from the Cmm parser
[ghc.git] / compiler / cmm / CmmSink.hs
1 {-# LANGUAGE GADTs #-}
2 module CmmSink (
3 cmmSink
4 ) where
5
6 import CodeGen.Platform (callerSaves)
7
8 import Cmm
9 import CmmOpt
10 import BlockId
11 import CmmLive
12 import CmmUtils
13 import Hoopl
14
15 import DynFlags
16 import UniqFM
17 import PprCmm ()
18
19 import Data.List (partition)
20 import qualified Data.Set as Set
21
22 -- -----------------------------------------------------------------------------
23 -- Sinking and inlining
24
25 -- This is an optimisation pass that
26 -- (a) moves assignments closer to their uses, to reduce register pressure
27 -- (b) pushes assignments into a single branch of a conditional if possible
28 -- (c) inlines assignments to registers that are mentioned only once
29 -- (d) discards dead assignments
30 --
31 -- This tightens up lots of register-heavy code. It is particularly
32 -- helpful in the Cmm generated by the Stg->Cmm code generator, in
33 -- which every function starts with a copyIn sequence like:
34 --
35 -- x1 = R1
36 -- x2 = Sp[8]
37 -- x3 = Sp[16]
38 -- if (Sp - 32 < SpLim) then L1 else L2
39 --
40 -- we really want to push the x1..x3 assignments into the L2 branch.
41 --
42 -- Algorithm:
43 --
44 -- * Start by doing liveness analysis.
45 --
46 -- * Keep a list of assignments A; earlier ones may refer to later ones
47 --
48 -- * Walk forwards through the graph, look at each node N:
49 -- * If any assignments in A (1) occur only once in N, and (2) are
50 -- not live after N, inline the assignment and remove it
51 -- from A.
52 -- * If N is an assignment:
53 -- * If the register is not live after N, discard it
54 -- * otherwise pick up the assignment and add it to A
55 -- * If N is a non-assignment node:
56 -- * remove any assignments from A that conflict with N, and
57 -- place them before N in the current block. (we call this
58 -- "dropping" the assignments).
59 -- * An assignment conflicts with N if it:
60 -- - assigns to a register mentioned in N
61 -- - mentions a register assigned by N
62 -- - reads from memory written by N
63 -- * do this recursively, dropping dependent assignments
64 -- * At a multi-way branch:
65 -- * drop any assignments that are live on more than one branch
66 -- * if any successor has more than one predecessor (a
67 -- join-point), drop everything live in that successor
68 --
69 -- As a side-effect we'll delete some dead assignments (transitively,
70 -- even). This isn't as good as removeDeadAssignments, but it's much
71 -- cheaper.
72
73 -- If we do this *before* stack layout, we might be able to avoid
74 -- saving some things across calls/procpoints.
75 --
76 -- *but*, that will invalidate the liveness analysis, and we'll have
77 -- to re-do it.
78
79 -- -----------------------------------------------------------------------------
80 -- things that we aren't optimising very well yet.
81 --
82 -- -----------
83 -- (1) From GHC's FastString.hashStr:
84 --
85 -- s2ay:
86 -- if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
87 -- c2gn:
88 -- R1 = _s2au::I64;
89 -- call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
90 -- c2gp:
91 -- _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
92 -- 4091);
93 -- _s2an::I64 = _s2an::I64 + 1;
94 -- _s2au::I64 = _s2cO::I64;
95 -- goto s2ay;
96 --
97 -- a nice loop, but we didn't eliminate the silly assignment at the end.
98 -- See Note [dependent assignments], which would probably fix this.
99 --
100 -- -----------
101 -- (2) From stg_atomically_frame in PrimOps.cmm
102 --
103 -- We have a diamond control flow:
104 --
105 -- x = ...
106 -- |
107 -- / \
108 -- A B
109 -- \ /
110 -- |
111 -- use of x
112 --
113 -- Now x won't be sunk down to its use, because we won't push it into
114 -- both branches of the conditional. We certainly do have to check
115 -- that we can sink it past all the code in both A and B, but having
116 -- discovered that, we could sink it to its use.
117 --
118
119 -- -----------------------------------------------------------------------------
120
121 type Assignment = (LocalReg, CmmExpr, AbsMem)
122 -- Assignment caches AbsMem, an abstraction of the memory read by
123 -- the RHS of the assignment.
124
125 cmmSink :: DynFlags -> CmmGraph -> CmmGraph
126 cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
127 where
128 liveness = cmmLiveness graph
129 getLive l = mapFindWithDefault Set.empty l liveness
130
131 blocks = postorderDfs graph
132
133 join_pts = findJoinPoints blocks
134
135 sink :: BlockEnv [Assignment] -> [CmmBlock] -> [CmmBlock]
136 sink _ [] = []
137 sink sunk (b:bs) =
138 -- pprTrace "sink" (ppr lbl) $
139 blockJoin first final_middle final_last : sink sunk' bs
140 where
141 lbl = entryLabel b
142 (first, middle, last) = blockSplit b
143
144 succs = successors last
145
146 -- Annotate the middle nodes with the registers live *after*
147 -- the node. This will help us decide whether we can inline
148 -- an assignment in the current node or not.
149 live = Set.unions (map getLive succs)
150 live_middle = gen_kill last live
151 ann_middles = annotate live_middle (blockToList middle)
152
153 -- Now sink and inline in this block
154 (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
155 fold_last = constantFold dflags last
156 (final_last, assigs') = tryToInline dflags live fold_last assigs
157
158 -- We cannot sink into join points (successors with more than
159 -- one predecessor), so identify the join points and the set
160 -- of registers live in them.
161 (joins, nonjoins) = partition (`mapMember` join_pts) succs
162 live_in_joins = Set.unions (map getLive joins)
163
164 -- We do not want to sink an assignment into multiple branches,
165 -- so identify the set of registers live in multiple successors.
166 -- This is made more complicated because when we sink an assignment
167 -- into one branch, this might change the set of registers that are
168 -- now live in multiple branches.
169 init_live_sets = map getLive nonjoins
170 live_in_multi live_sets r =
171 case filter (Set.member r) live_sets of
172 (_one:_two:_) -> True
173 _ -> False
174
175 -- Now, drop any assignments that we will not sink any further.
176 (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'
177
178 drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
179 where
180 should_drop = conflicts dflags a final_last
181 || {- not (isSmall rhs) && -} live_in_multi live_sets r
182 || r `Set.member` live_in_joins
183
184 live_sets' | should_drop = live_sets
185 | otherwise = map upd live_sets
186
187 upd set | r `Set.member` set = set `Set.union` live_rhs
188 | otherwise = set
189
190 live_rhs = foldRegsUsed extendRegSet emptyRegSet rhs
191
192 final_middle = foldl blockSnoc middle' dropped_last
193
194 sunk' = mapUnion sunk $
195 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
196 | l <- succs ]
197
198 {- TODO: enable this later, when we have some good tests in place to
199 measure the effect and tune it.
200
201 -- small: an expression we don't mind duplicating
202 isSmall :: CmmExpr -> Bool
203 isSmall (CmmReg (CmmLocal _)) = True -- not globals, we want to coalesce them instead
204 isSmall (CmmLit _) = True
205 isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
206 isSmall (CmmRegOff (CmmLocal _) _) = True
207 isSmall _ = False
208
209 isTrivial :: CmmExpr -> Bool
210 isTrivial (CmmReg (CmmLocal _)) = True
211 isTrivial (CmmLit _) = True
212 isTrivial _ = False
213 -}
214
215 --
216 -- annotate each node with the set of registers live *after* the node
217 --
218 annotate :: RegSet -> [CmmNode O O] -> [(RegSet, CmmNode O O)]
219 annotate live nodes = snd $ foldr ann (live,[]) nodes
220 where ann n (live,nodes) = (gen_kill n live, (live,n) : nodes)
221
222 --
223 -- Find the blocks that have multiple successors (join points)
224 --
225 findJoinPoints :: [CmmBlock] -> BlockEnv Int
226 findJoinPoints blocks = mapFilter (>1) succ_counts
227 where
228 all_succs = concatMap successors blocks
229
230 succ_counts :: BlockEnv Int
231 succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
232
233 --
234 -- filter the list of assignments to remove any assignments that
235 -- are not live in a continuation.
236 --
237 filterAssignments :: DynFlags -> RegSet -> [Assignment] -> [Assignment]
238 filterAssignments dflags live assigs = reverse (go assigs [])
239 where go [] kept = kept
240 go (a@(r,_,_):as) kept | needed = go as (a:kept)
241 | otherwise = go as kept
242 where
243 needed = r `Set.member` live
244 || any (conflicts dflags a) (map toNode kept)
245 -- Note that we must keep assignments that are
246 -- referred to by other assignments we have
247 -- already kept.
248
249 -- -----------------------------------------------------------------------------
250 -- Walk through the nodes of a block, sinking and inlining assignments
251 -- as we go.
252
253 walk :: DynFlags
254 -> [(RegSet, CmmNode O O)] -- nodes of the block, annotated with
255 -- the set of registers live *after*
256 -- this node.
257
258 -> [Assignment] -- The current list of
259 -- assignments we are sinking.
260 -- Later assignments may refer
261 -- to earlier ones.
262
263 -> ( Block CmmNode O O -- The new block
264 , [Assignment] -- Assignments to sink further
265 )
266
267 walk dflags nodes assigs = go nodes emptyBlock assigs
268 where
269 go [] block as = (block, as)
270 go ((live,node):ns) block as
271 | shouldDiscard node live = go ns block as
272 | Just a <- shouldSink dflags node2 = go ns block (a : as1)
273 | otherwise = go ns block' as'
274 where
275 node1 = constantFold dflags node
276
277 (node2, as1) = tryToInline dflags live node1 as
278
279 (dropped, as') = dropAssignmentsSimple dflags
280 (\a -> conflicts dflags a node2) as1
281
282 block' = foldl blockSnoc block dropped `blockSnoc` node2
283
284
285 constantFold :: DynFlags -> CmmNode e x -> CmmNode e x
286 constantFold dflags node = mapExpDeep f node
287 where f (CmmMachOp op args) = cmmMachOpFold dflags op args
288 f (CmmRegOff r 0) = CmmReg r
289 f e = e
290
291 --
292 -- Heuristic to decide whether to pick up and sink an assignment
293 -- Currently we pick up all assignments to local registers. It might
294 -- be profitable to sink assignments to global regs too, but the
295 -- liveness analysis doesn't track those (yet) so we can't.
296 --
297 shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
298 shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
299 where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
300 shouldSink _ _other = Nothing
301
302 --
303 -- discard dead assignments. This doesn't do as good a job as
304 -- removeDeadAsssignments, because it would need multiple passes
305 -- to get all the dead code, but it catches the common case of
306 -- superfluous reloads from the stack that the stack allocator
307 -- leaves behind.
308 --
309 -- Also we catch "r = r" here. You might think it would fall
310 -- out of inlining, but the inliner will see that r is live
311 -- after the instruction and choose not to inline r in the rhs.
312 --
313 shouldDiscard :: CmmNode e x -> RegSet -> Bool
314 shouldDiscard node live
315 = case node of
316 CmmAssign r (CmmReg r') | r == r' -> True
317 CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
318 _otherwise -> False
319
320
321 toNode :: Assignment -> CmmNode O O
322 toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs
323
324 dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> [Assignment]
325 -> ([CmmNode O O], [Assignment])
326 dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()
327
328 dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> [Assignment]
329 -> ([CmmNode O O], [Assignment])
330 dropAssignments dflags should_drop state assigs
331 = (dropped, reverse kept)
332 where
333 (dropped,kept) = go state assigs [] []
334
335 go _ [] dropped kept = (dropped, kept)
336 go state (assig : rest) dropped kept
337 | conflict = go state' rest (toNode assig : dropped) kept
338 | otherwise = go state' rest dropped (assig:kept)
339 where
340 (dropit, state') = should_drop assig state
341 conflict = dropit || any (conflicts dflags assig) dropped
342
343
344 -- -----------------------------------------------------------------------------
345 -- Try to inline assignments into a node.
346
347 tryToInline
348 :: DynFlags
349 -> RegSet -- set of registers live after this
350 -- node. We cannot inline anything
351 -- that is live after the node, unless
352 -- it is small enough to duplicate.
353 -> CmmNode O x -- The node to inline into
354 -> [Assignment] -- Assignments to inline
355 -> (
356 CmmNode O x -- New node
357 , [Assignment] -- Remaining assignments
358 )
359
360 tryToInline dflags live node assigs = go usages node [] assigs
361 where
362 usages :: UniqFM Int
363 usages = foldRegsUsed addUsage emptyUFM node
364
365 go _usages node _skipped [] = (node, [])
366
367 go usages node skipped (a@(l,rhs,_) : rest)
368 | can_inline = inline_and_discard
369 | False {- isTiny rhs -} = inline_and_keep
370 -- ^^ seems to make things slightly worse
371 where
372 inline_and_discard = go usages' node' skipped rest
373
374 inline_and_keep = (node'', a : rest')
375 where (node'',rest') = go usages' node' (l:skipped) rest
376
377 can_inline =
378 not (l `elemRegSet` live)
379 && not (skipped `regsUsedIn` rhs) -- Note [dependent assignments]
380 && okToInline dflags rhs node
381 && lookupUFM usages l == Just 1
382
383 usages' = foldRegsUsed addUsage usages rhs
384
385 node' = mapExpDeep inline node
386 where inline (CmmReg (CmmLocal l')) | l == l' = rhs
387 inline (CmmRegOff (CmmLocal l') off) | l == l'
388 = cmmOffset dflags rhs off
389 -- re-constant fold after inlining
390 inline (CmmMachOp op args) = cmmMachOpFold dflags op args
391 inline other = other
392
393 go usages node skipped (assig@(l,rhs,_) : rest)
394 = (node', assig : rest')
395 where (node', rest') = go usages' node (l:skipped) rest
396 usages' = foldRegsUsed (\m r -> addToUFM m r 2) usages rhs
397 -- we must not inline anything that is mentioned in the RHS
398 -- of a binding that we have already skipped, so we set the
399 -- usages of the regs on the RHS to 2.
400
401 -- Note [dependent assignments]
402 --
403 -- If our assignment list looks like
404 --
405 -- [ y = e, x = ... y ... ]
406 --
407 -- We cannot inline x. Remember this list is really in reverse order,
408 -- so it means x = ... y ...; y = e
409 --
410 -- Hence if we inline x, the outer assignment to y will capture the
411 -- reference in x's right hand side.
412 --
413 -- In this case we should rename the y in x's right-hand side,
414 -- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
415 -- Now we can go ahead and inline x.
416 --
417 -- For now we do nothing, because this would require putting
418 -- everything inside UniqSM.
419
420 addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
421 addUsage m r = addToUFM_C (+) m r 1
422
423 regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
424 regsUsedIn [] _ = False
425 regsUsedIn ls e = wrapRecExpf f e False
426 where f (CmmReg (CmmLocal l)) _ | l `elem` ls = True
427 f (CmmRegOff (CmmLocal l) _) _ | l `elem` ls = True
428 f _ z = z
429
430 -- we don't inline into CmmUnsafeForeignCall if the expression refers
431 -- to global registers. This is a HACK to avoid global registers
432 -- clashing with C argument-passing registers, really the back-end
433 -- ought to be able to handle it properly, but currently neither PprC
434 -- nor the NCG can do it. See Note [Register parameter passing]
435 -- See also StgCmmForeign:load_args_into_temps.
436 okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
437 okToInline dflags expr CmmUnsafeForeignCall{} = not (anyCallerSavesRegs dflags expr)
438 okToInline _ _ _ = True
439
440 -- -----------------------------------------------------------------------------
441
442 -- | @conflicts (r,e) stmt@ is @False@ if and only if the assignment
443 -- @r = e@ can be safely commuted past @stmt@.
444 --
445 -- We only sink "r = G" assignments right now, so conflicts is very simple:
446 --
447 conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
448 conflicts dflags (r, rhs, addr) node
449
450 -- (1) an assignment to a register conflicts with a use of the register
451 | CmmAssign reg _ <- node, reg `regUsedIn` rhs = True
452 | foldRegsUsed (\b r' -> r == r' || b) False node = True
453
454 -- (2) a store to an address conflicts with a read of the same memory
455 | CmmStore addr' e <- node
456 , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
457
458 -- (3) an assignment to Hp/Sp conflicts with a heap/stack read respectively
459 | HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
460 | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
461 | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
462
463 -- (4) assignments that read caller-saves GlobalRegs conflict with a
464 -- foreign call. See Note [foreign calls clobber GlobalRegs].
465 | CmmUnsafeForeignCall{} <- node, anyCallerSavesRegs dflags rhs = True
466
467 -- (5) foreign calls clobber memory, but not heap/stack memory
468 | CmmUnsafeForeignCall{} <- node, AnyMem <- addr = True
469
470 -- (6) native calls clobber any memory
471 | CmmCall{} <- node, memConflicts addr AnyMem = True
472
473 -- (7) otherwise, no conflict
474 | otherwise = False
475
476
477 anyCallerSavesRegs :: DynFlags -> CmmExpr -> Bool
478 anyCallerSavesRegs dflags e = wrapRecExpf f e False
479 where f (CmmReg (CmmGlobal r)) _
480 | callerSaves (targetPlatform dflags) r = True
481 f _ z = z
482
483 -- An abstraction of memory read or written.
484 data AbsMem
485 = NoMem -- no memory accessed
486 | AnyMem -- arbitrary memory
487 | HeapMem -- definitely heap memory
488 | StackMem -- definitely stack memory
489 | SpMem -- <size>[Sp+n]
490 {-# UNPACK #-} !Int
491 {-# UNPACK #-} !Int
492
493 -- Having SpMem is important because it lets us float loads from Sp
494 -- past stores to Sp as long as they don't overlap, and this helps to
495 -- unravel some long sequences of
496 -- x1 = [Sp + 8]
497 -- x2 = [Sp + 16]
498 -- ...
499 -- [Sp + 8] = xi
500 -- [Sp + 16] = xj
501 --
502 -- Note that SpMem is invalidated if Sp is changed, but the definition
503 -- of 'conflicts' above handles that.
504
505 -- ToDo: this won't currently fix the following commonly occurring code:
506 -- x1 = [R1 + 8]
507 -- x2 = [R1 + 16]
508 -- ..
509 -- [Hp - 8] = x1
510 -- [Hp - 16] = x2
511 -- ..
512
513 -- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that
514 -- assignments to [Hp + n] do not conflict with any other heap memory,
515 -- but this is tricky to nail down. What if we had
516 --
517 -- x = Hp + n
518 -- [x] = ...
519 --
520 -- the store to [x] should be "new heap", not "old heap".
521 -- Furthermore, you could imagine that if we started inlining
522 -- functions in Cmm then there might well be reads of heap memory
523 -- that was written in the same basic block. To take advantage of
524 -- non-aliasing of heap memory we will have to be more clever.
525
526 bothMems :: AbsMem -> AbsMem -> AbsMem
527 bothMems NoMem x = x
528 bothMems x NoMem = x
529 bothMems HeapMem HeapMem = HeapMem
530 bothMems StackMem StackMem = StackMem
531 bothMems (SpMem o1 w1) (SpMem o2 w2)
532 | o1 == o2 = SpMem o1 (max w1 w2)
533 | otherwise = StackMem
534 bothMems SpMem{} StackMem = StackMem
535 bothMems StackMem SpMem{} = StackMem
536 bothMems _ _ = AnyMem
537
538 memConflicts :: AbsMem -> AbsMem -> Bool
539 memConflicts NoMem _ = False
540 memConflicts _ NoMem = False
541 memConflicts HeapMem StackMem = False
542 memConflicts StackMem HeapMem = False
543 memConflicts SpMem{} HeapMem = False
544 memConflicts HeapMem SpMem{} = False
545 memConflicts (SpMem o1 w1) (SpMem o2 w2)
546 | o1 < o2 = o1 + w1 > o2
547 | otherwise = o2 + w2 > o1
548 memConflicts _ _ = True
549
550 exprMem :: DynFlags -> CmmExpr -> AbsMem
551 exprMem dflags (CmmLoad addr w) = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
552 exprMem dflags (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem dflags) es)
553 exprMem _ _ = NoMem
554
555 loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
556 loadAddr dflags e w =
557 case e of
558 CmmReg r -> regAddr dflags r 0 w
559 CmmRegOff r i -> regAddr dflags r i w
560 _other | CmmGlobal Sp `regUsedIn` e -> StackMem
561 | otherwise -> AnyMem
562
563 regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
564 regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
565 regAddr _ (CmmGlobal Hp) _ _ = HeapMem
566 regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
567 regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
568 regAddr _ _ _ _ = AnyMem