
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleContexts #-}

{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}

-- TODO: Get rid of this flag:
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}

-- This module implements generalized code motion for assignments to
-- local registers, inlining and sinking when possible. It also does
-- some amount of rewriting for stores to register slots, which are
-- effectively equivalent to local registers.
module CmmRewriteAssignments
  ( rewriteAssignments
  ) where

import Cmm
import CmmExpr
import CmmOpt
import OptimizationFuel
import StgCmmUtils

import Control.Monad
import UniqFM
import Unique
import BlockId

import Compiler.Hoopl hiding (Unique)
import Prelude hiding (succ, zip)

----------------------------------------------------------------
--- Main function

rewriteAssignments :: CmmGraph -> FuelUniqSM CmmGraph
rewriteAssignments g = do
  -- Because we need to act on forwards and backwards information, we
  -- first perform usage analysis and bake this information into the
  -- graph (backwards transform), and then do a forwards transform
  -- to actually perform inlining and sinking.
  g'  <- annotateUsage g
  g'' <- liftM fst $ dataflowPassFwd g' [(g_entry g, fact_bot assignmentLattice)] $
                     analRewFwd assignmentLattice
                                assignmentTransfer
                                (assignmentRewrite `thenFwdRw` machOpFoldRewrite)
  return (modifyGraph eraseRegUsage g'')

----------------------------------------------------------------
--- Usage information

-- We decorate all register assignments with approximate usage
-- information, that is, the maximum number of times the register is
-- referenced while it is live along all outgoing control paths.
-- This analysis provides a precise upper bound for usage, so if a
-- register is never referenced, we can remove it, as that assignment is
-- dead.
--
-- This analysis is very similar to liveness analysis; we just keep a
-- little extra info. (Maybe we should move it to CmmLive, and subsume
-- the old liveness analysis.)
--
-- There are a few subtleties here:
--
--  - If a register goes dead, and then becomes live again, the usages
--    of the disjoint live range don't count towards the original range.
--
--          a = 1; // used once
--          b = a;
--          a = 2; // used once
--          c = a;
--
--  - A register may be used multiple times, but these uses all reside
--    in different control paths, such that any given execution only
--    uses it once. In that case, the usage count may still be 1.
--
--          a = 1; // used once
--          if (b) {
--              c = a + 3;
--          } else {
--              c = a + 1;
--          }
--
--    This policy corresponds to an inlining strategy that does not
--    duplicate computation but may increase binary size.
--
--  - If we naively implement a usage count, we have a counting-to-
--    infinity problem across joins. Furthermore, knowing that
--    something is used 2 or more times in one runtime execution isn't
--    particularly useful for optimizations (inlining may be beneficial,
--    but there's no way of knowing that without register pressure
--    information.)
--
--          while (...) {
--              // first iteration, b used once
--              // second iteration, b used twice
--              // third iteration ...
--              a = b;
--          }
--          // b used zero times
--
--    There is an orthogonal question, which is that even if the
--    register is used only once in every runtime execution, inlining
--    it into every conditional path might increase the binary size a
--    lot. But tracking this information would be tricky, because it
--    violates the finite lattice restriction Hoopl requires for
--    termination; we'd thus need to supply an alternate proof, which
--    is probably something we should defer until we actually have an
--    optimization that would take advantage of this. (This might also
--    interact strangely with liveness information.)
--
--          a = ...;
--          // a is used one time, but in X different paths
--          case (b) of
--              1 -> ... a ...
--              2 -> ... a ...
--              3 -> ... a ...
--              ...
--
--  - Memory stores to local register slots (CmmStore (CmmStackSlot
--    (LocalReg _) 0) _) have similar behavior to local registers,
--    in that these locations are all disjoint from each other. Thus,
--    we attempt to inline them too. Note that because these are only
--    generated as part of the spilling process, most of the time this
--    will refer to a local register and the assignment will immediately
--    die on the subsequent call. However, if we manage to replace that
--    local register with a memory location, it means that we've managed
--    to preserve a value on the stack without having to move it to
--    another memory location again! We collect usage information just
--    to be safe in case extra computation is involved.
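-- A minimal sketch of the usage domain described above (the original
-- datatype was elided here, so its exact shape is an assumption): two
-- points ordered SingleUse < ManyUse, with absence from the map playing
-- the role of "zero uses".
data Usage = SingleUse -- referenced at most once along any path
           | ManyUse   -- may be referenced more than once
    deriving (Ord, Eq, Show)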

-- Absence in map = ZeroUse
type UsageMap = UniqFM Usage

{-
-- minBound is bottom, maxBound is top, least-upper-bound is max
-- ToDo: Put this in Hoopl. Note that this isn't as useful as I
-- originally hoped, because you usually want to leave out the bottom
-- element when you have things like this put in maps. Maybe f is
-- useful on its own as a combining function.
boundedOrdLattice :: (Bounded a, Ord a) => String -> DataflowLattice a
boundedOrdLattice n = DataflowLattice n minBound f
    where f _ (OldFact x) (NewFact y)
            | x >= y    = (NoChange,   x)
            | otherwise = (SomeChange, y)
-}

-- Custom node type we'll rewrite to. CmmAssign nodes to local
-- registers are replaced with AssignLocal nodes.
data WithRegUsage n e x where
    -- Plain will not contain CmmAssign nodes immediately after
    -- transformation, but as we rewrite assignments, we may have
    -- assignments here: these are assignments that should not be
    -- rewritten!
    Plain       :: n e x -> WithRegUsage n e x
    AssignLocal :: LocalReg -> CmmExpr -> Usage -> WithRegUsage n O O
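-- The class instances for WithRegUsage were elided; the following are
-- assumed reconstructions that simply delegate to the wrapped node,
-- treating AssignLocal as a use of its right-hand side and a
-- definition of its register:
instance UserOfLocalRegs (n e x) => UserOfLocalRegs (WithRegUsage n e x) where
    foldRegsUsed f z (Plain n)           = foldRegsUsed f z n
    foldRegsUsed f z (AssignLocal _ e _) = foldRegsUsed f z e

instance DefinerOfLocalRegs (n e x) => DefinerOfLocalRegs (WithRegUsage n e x) where
    foldRegsDefd f z (Plain n)           = foldRegsDefd f z n
    foldRegsDefd f z (AssignLocal r _ _) = f z r

instance NonLocal n => NonLocal (WithRegUsage n) where
    entryLabel (Plain n) = entryLabel n
    successors (Plain n) = successors n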

liftRegUsage :: Graph n e x -> Graph (WithRegUsage n) e x
liftRegUsage = mapGraph Plain

eraseRegUsage :: Graph (WithRegUsage CmmNode) e x -> Graph CmmNode e x
eraseRegUsage = mapGraph f
    where f :: WithRegUsage CmmNode e x -> CmmNode e x
          f (AssignLocal l e _) = CmmAssign (CmmLocal l) e
          f (Plain n) = n

usageLattice :: DataflowLattice UsageMap
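-- The lattice body was elided here; this is a plausible reconstruction
-- (an assumption), mirroring the commented-out boundedOrdLattice above
-- and lifting its join pointwise over the map via a joinUFM helper
-- assumed to be available from the dataflow utilities:
usageLattice = DataflowLattice "usage counts for registers" emptyUFM (joinUFM f)
    where f _ (OldFact x) (NewFact y)
            | x >= y    = (NoChange,   x)
            | otherwise = (SomeChange, y)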

-- We reuse the names 'gen' and 'kill', although we're doing something
-- slightly different from the Dragon Book
usageTransfer :: BwdTransfer (WithRegUsage CmmNode) UsageMap
usageTransfer = mkBTransfer3 first middle last
    where first _ f = f
          middle :: WithRegUsage CmmNode O O -> UsageMap -> UsageMap
          middle n f = gen_kill n f
          last :: WithRegUsage CmmNode O C -> FactBase UsageMap -> UsageMap
          -- Checking for CmmCall/CmmForeignCall is unnecessary, because
          -- spills/reloads have already occurred by the time we do this
          -- analysis.
          -- XXX Deprecated warning is puzzling: what label are we
          -- supposed to use?
          -- ToDo: With a bit more cleverness here, we can avoid
          -- disappointment and heartbreak associated with the inability
          -- to inline into CmmCall and CmmForeignCall by
          -- over-estimating the usage to be ManyUse.
          last n f = gen_kill n (joinOutFacts usageLattice n f)
          gen_kill :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          gen_kill a = gen a . kill a
          gen  a f = foldRegsUsed increaseUsage f a
          kill a f = foldRegsDefd delFromUFM f a
          increaseUsage f r = addToUFM_C combine f r SingleUse
              where combine _ _ = ManyUse

usageRewrite :: BwdRewrite FuelUniqSM (WithRegUsage CmmNode) UsageMap
usageRewrite = mkBRewrite3 first middle last
    where first  _ _ = return Nothing
          middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O))
          middle (Plain (CmmAssign (CmmLocal l) e)) f
              = return . Just $ case lookupUFM f l of
                    Nothing    -> emptyGraph
                    Just usage -> mkMiddle (AssignLocal l e usage)
          middle _ _ = return Nothing
          last   _ _ = return Nothing

annotateUsage :: CmmGraph -> FuelUniqSM (GenCmmGraph (WithRegUsage CmmNode))
annotateUsage vanilla_g =
    let g = modifyGraph liftRegUsage vanilla_g
    in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $
                   analRewBwd usageLattice usageTransfer usageRewrite

----------------------------------------------------------------
--- Assignment tracking

-- The idea is to maintain a map of local registers to expressions,
-- such that the value of that register is the same as the value of that
-- expression at any given time. We can then do several things,
-- as described by Assignment.

-- Assignment describes the various optimizations that are valid
-- at a given point in the program.
data Assignment =
-- This assignment can always be inlined. It is cheap or single-use.
                  AlwaysInline CmmExpr
-- This assignment should be sunk down to its first use. (This will
-- increase code size if the register is used in multiple control flow
-- paths, but won't increase execution time, and the reduction of
-- register pressure is worth it, I think.)
                | AlwaysSink CmmExpr
-- We cannot safely optimize occurrences of this local register. (This
-- corresponds to top in the lattice structure.)
                | NeverOptimize

-- Extract the expression that is being assigned to
xassign :: Assignment -> Maybe CmmExpr
xassign (AlwaysInline e) = Just e
xassign (AlwaysSink e)   = Just e
xassign NeverOptimize    = Nothing

-- Extracts the expression, but only if they're the same constructor
xassign2 :: (Assignment, Assignment) -> Maybe (CmmExpr, CmmExpr)
xassign2 (AlwaysInline e, AlwaysInline e') = Just (e, e')
xassign2 (AlwaysSink e,   AlwaysSink e')   = Just (e, e')
xassign2 _ = Nothing

-- Note: We'd like to make decisions about "not optimizing" as soon as
-- possible, because this will make running the transfer function more
-- efficient.
type AssignmentMap = UniqFM Assignment

assignmentLattice :: DataflowLattice AssignmentMap
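-- The join body was elided here; the following is a plausible sketch
-- (an assumption, not the original): facts agree only when both sides
-- carry the same expression under the same policy, and any
-- disagreement collapses to NeverOptimize, the top of the lattice.
assignmentLattice = DataflowLattice "assignments for registers" emptyUFM (joinUFM add)
    where add _ (OldFact old) (NewFact new)
            = case (old, new) of
                (NeverOptimize, _) -> (NoChange,   NeverOptimize)
                (_, NeverOptimize) -> (SomeChange, NeverOptimize)
                (xassign2 -> Just (e, e'))
                    | e == e'   -> (NoChange,   old)
                    | otherwise -> (SomeChange, NeverOptimize)
                _ -> (SomeChange, NeverOptimize)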

-- Deletes sinks from assignment map, because /this/ is the place
-- where it will be sunk to.
deleteSinks :: UserOfLocalRegs n => n -> AssignmentMap -> AssignmentMap
deleteSinks n m = foldRegsUsed (adjustUFM f) m n
    where f (AlwaysSink _) = NeverOptimize
          f old = old

-- Invalidates any expressions that use a register.
invalidateUsersOf :: CmmReg -> AssignmentMap -> AssignmentMap
-- foldUFM_Directly :: (Unique -> elt -> a -> a) -> a -> UniqFM elt -> a
invalidateUsersOf reg m = foldUFM_Directly f m m -- [foldUFM performance]
    where f u (xassign -> Just e) m | reg `regUsedIn` e = addToUFM_Directly m u NeverOptimize
          f _ _ m = m
{- This requires the entire spine of the map to be continually rebuilt,
 - which causes crazy memory usage!
invalidateUsersOf reg = mapUFM (invalidateUsers' reg)
    where invalidateUsers' reg (xassign -> Just e) | reg `regUsedIn` e = NeverOptimize
          invalidateUsers' _ old = old
-}

-- Note [foldUFM performance]
-- These calls to foldUFM no longer leak memory, but they do cause
-- pretty killer amounts of allocation. So they'll be something to
-- optimize; we need an algorithmic change to prevent us from having to
-- traverse the /entire/ map continually.

middleAssignment :: WithRegUsage CmmNode O O -> AssignmentMap -> AssignmentMap

-- Algorithm for annotated assignments:
--  1. Delete any sinking assignments that were used by this instruction
--  2. Add the assignment to our list of valid local assignments with
--     the correct optimization policy.
--  3. Look for all assignments that reference that register and
--     invalidate them.
middleAssignment n@(AssignLocal r e usage) assign
    = invalidateUsersOf (CmmLocal r) . add . deleteSinks n $ assign
      where add m = addToUFM m r
                  $ case usage of
                        SingleUse -> AlwaysInline e
                        ManyUse   -> decide e
            decide CmmLit{}       = AlwaysInline e
            decide CmmReg{}       = AlwaysInline e
            decide CmmLoad{}      = AlwaysSink e
            decide CmmStackSlot{} = AlwaysSink e
            decide CmmMachOp{}    = AlwaysSink e
            -- We'll always inline simple operations on the global
            -- registers, to reduce register pressure: Sp - 4 or Hp - 8
            -- EZY: Justify this optimization more carefully.
            decide CmmRegOff{}    = AlwaysInline e

-- Algorithm for unannotated assignments of global registers:
-- 1. Delete any sinking assignments that were used by this instruction
-- 2. Look for all assignments that reference this register and
--    invalidate them.
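-- The equation itself was elided; this is a direct transcription of
-- the two steps above (an assumed reconstruction):
middleAssignment (Plain n@(CmmAssign reg@(CmmGlobal _) _)) assign
    = invalidateUsersOf reg . deleteSinks n $ assign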

-- Algorithm for unannotated assignments of *local* registers: do
-- nothing (it's a reload, so no state should have changed)
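-- Assumed reconstruction of the elided equation:
middleAssignment (Plain (CmmAssign (CmmLocal _) _)) assign = assign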

-- Algorithm for stores:
--  1. Delete any sinking assignments that were used by this instruction
--  2. Look for all assignments that load from memory locations that
--     were clobbered by this store and invalidate them.
middleAssignment (Plain n@(CmmStore lhs rhs)) assign
    = let m = deleteSinks n assign
      in foldUFM_Directly f m m -- [foldUFM performance]
      where f u (xassign -> Just x) m | (lhs, rhs) `clobbers` (u, x) = addToUFM_Directly m u NeverOptimize
            f _ _ m = m
{- Also leaky
    = mapUFM_Directly p . deleteSinks n $ assign
      -- ToDo: There's a missed opportunity here: even if a memory
      -- access we're attempting to sink gets clobbered at some
      -- location, it's still /better/ to sink it to right before the
      -- point where it gets clobbered. How might we do this?
      -- Unfortunately, it's too late to change the assignment...
      where p r (xassign -> Just x) | (lhs, rhs) `clobbers` (r, x) = NeverOptimize
            p _ old = old
-}

-- Assumption: Unsafe foreign calls don't clobber memory
-- Since foreign calls clobber caller-saved registers, we need to
-- invalidate any assignments that reference those global registers.
-- This is kind of expensive. (One way to optimize this might be to
-- store extra information about expressions that allow this and other
-- checks to be done cheaply.)
middleAssignment (Plain n@(CmmUnsafeForeignCall{})) assign
    = deleteCallerSaves (foldRegsDefd (\m r -> addToUFM m r NeverOptimize) (deleteSinks n assign) n)
    where deleteCallerSaves m = foldUFM_Directly f m m
          f u (xassign -> Just x) m | wrapRecExpf g x False = addToUFM_Directly m u NeverOptimize
          f _ _ m = m
          g (CmmReg (CmmGlobal r)) _      | callerSaves r = True
          g (CmmRegOff (CmmGlobal r) _) _ | callerSaves r = True
          g _ b = b

middleAssignment (Plain (CmmComment {})) assign
    = assign

-- Assumptions:
--  * Writes using Hp do not overlap with any other memory locations
--    (An important invariant being relied on here is that we only ever
--    use Hp to allocate values on the heap, which appears to be the
--    case given hpReg usage, and that our heap writing code doesn't
--    do anything stupid like overlapping writes.)
--  * Stack slots do not overlap with any other memory locations
--  * Stack slots for different areas do not overlap
--  * Stack slots within the same area and different offsets may
--    overlap; we need to do a size check (see 'overlaps').
--  * Register slots only overlap with themselves. (But this shouldn't
--    happen in practice, because we'll fail to inline a reload across
--    the next spill.)
--  * Non stack-slot stores always conflict with each other. (This is
--    not always the case; we could probably do something special for Hp)

clobbers :: (CmmExpr, CmmExpr) -- (lhs, rhs) of clobbering CmmStore
         -> (Unique,  CmmExpr) -- (register, expression) that may be clobbered
         -> Bool
clobbers (CmmRegOff (CmmGlobal Hp) _, _) (_, _) = False
clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False
-- ToDo: Also catch MachOp case
clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
    | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
    where f (CmmLoad (CmmStackSlot (CallArea a') o') t)
            = (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
          f (CmmLoad e _)    = containsStackSlot e
          f (CmmMachOp _ es) = or (map f es)
          f _                = False
          -- Maybe there's an invariant broken if this actually ever
          -- returns True
          containsStackSlot (CmmLoad{})      = True -- load of a load, all bets off
          containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
          containsStackSlot (CmmStackSlot{}) = True
          containsStackSlot _                = False
clobbers (CmmStackSlot (RegSlot l) _, _) (_, expr) = f expr
    where f (CmmLoad (CmmStackSlot (RegSlot l') _) _) = l == l'
          f _ = False
clobbers _ (_, e) = f e
    where f (CmmLoad (CmmStackSlot _ _) _) = False
          f (CmmLoad{})      = True -- conservative
          f (CmmMachOp _ es) = or (map f es)
          f _                = False

-- Check for memory overlapping.
-- Diagram:
--      4   8   12
--      s -w-  o
--      [ I32  ]
--      [    F64     ]
--      s'   -w'-    o'
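-- The 'overlaps' check itself was elided; this sketch is an assumed
-- reconstruction of the size check the diagram describes: slots in
-- different areas never overlap, and within an area two slots overlap
-- exactly when their extents intersect (offsets count back from the
-- area end, so a slot at offset o of width w covers [o - w, o)).
type CallSubArea = (AreaId, Int, Int) -- area, offset, width
overlaps :: CallSubArea -> CallSubArea -> Bool
overlaps (a, _, _) (a', _, _) | a /= a' = False
overlaps (_, o, w) (_, o', w') =
    let s  = o  - w
        s' = o' - w'
    in (s' < o) && (s < o') -- strict, since adjacent slots do not overlap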

-- Invalidates any expressions that have volatile contents: essentially,
-- all terminals volatile except for literals and loads of stack slots
-- that do not correspond to the call area for 'k' (the current call
-- area is volatile because overflow return parameters may be written
-- there.)
-- Note: mapUFM could be expensive, but hopefully block boundaries
-- aren't too common. If it is a problem, replace with something more
-- clever.
invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap
invalidateVolatile k m = mapUFM p m
    where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize
            where exp CmmLit{} = True
                  exp (CmmLoad (CmmStackSlot (CallArea (Young k')) _) _)
                    | k' == k = False
                  exp (CmmLoad (CmmStackSlot _ _) _) = True
                  exp (CmmMachOp _ es) = and (map exp es)
                  exp _ = False
          p _ = NeverOptimize -- probably shouldn't happen with AlwaysSink
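-- 'lastAssignment', used by assignmentTransfer below, was elided; this
-- is a plausible reconstruction (an assumption): at a call's
-- continuation we must treat volatile contents as clobbered, while on
-- ordinary outgoing edges we only delete the sinks this node consumed.
lastAssignment :: WithRegUsage CmmNode O C -> AssignmentMap -> [(Label, AssignmentMap)]
lastAssignment (Plain (CmmCall _ (Just k) _ _ _)) assign = [(k, invalidateVolatile k assign)]
lastAssignment (Plain (CmmForeignCall {succ=k}))  assign = [(k, invalidateVolatile k assign)]
lastAssignment l assign = map (\id -> (id, deleteSinks l assign)) $ successors l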

assignmentTransfer :: FwdTransfer (WithRegUsage CmmNode) AssignmentMap
assignmentTransfer = mkFTransfer3 (flip const) middleAssignment ((mkFactBase assignmentLattice .) . lastAssignment)

-- Note [Soundness of inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- In the Hoopl paper, the soundness condition on rewrite functions is
-- described as follows:
--
--      "If it replaces a node n by a replacement graph g, then g must
--      be observationally equivalent to n under the assumptions
--      expressed by the incoming dataflow fact f. Moreover, analysis of
--      g must produce output fact(s) that are at least as informative
--      as the fact(s) produced by applying the transfer function to n."
--
-- We consider the second condition in more detail here. It says that
-- given the rewrite R(n, f) = g, for any incoming fact f' consistent
-- with f (f' >= f), running the transfer function gives
-- T(f', n) <= T(f', g). For inlining this is not necessarily the case:
--
--      n = "x = a + 2"
--      f = f' = {a = y}
--      g = "x = y + 2"
--      T(f', n) = {x = a + 2, a = y}
--      T(f', g) = {x = y + 2, a = y}
--
-- y + 2 and a + 2 are not obviously comparable, and a naive
-- implementation of the lattice would say they are incomparable.
-- At best, this means we may be over-conservative; at worst, it means
-- we may not terminate.
--
-- However, in the original Lerner-Grove-Chambers paper, soundness and
-- termination are separated, and only equivalence of facts is required
-- for soundness. Monotonicity of the transfer function is not required
-- for termination (as the calculation of the least upper bound prevents
-- this from being a problem), but it means we won't necessarily find
-- the least fixed point.

-- Note [Coherency of annotations]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Is it possible for our usage annotations to become invalid after we
-- start performing transformations? As the usage info only provides
-- an upper bound, we only need to consider cases where the usages of
-- a register may increase due to transformations, e.g. any reference
-- to a local register in an AlwaysInline or AlwaysSink instruction whose
-- originating assignment was single use (we don't care about the
-- many use case, because it is the top of the lattice). But such a
-- case is not possible, because we always inline any single-use
-- register. QED.
--
-- TODO: A useful lint option would be to check this invariant that
-- there is never a local register in the assignment map that is
-- single-use.

-- Note [Soundness of store rewriting]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- The soundness of store rewriting depends on the invariant that no
-- assignment is made to the local register before its store is
-- accessed. This is clearly true with unoptimized spill-reload code,
-- and as the store will always be rewritten first (if possible), there
-- is no chance of it being propagated down before getting written
-- (possibly with incorrect values from the assignment map, due to
-- reassignment of the local register.) This is probably not locally
-- sound.

assignmentRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) AssignmentMap
assignmentRewrite = mkFRewrite3 first middle last
    where
        first _ _ = return Nothing
        middle :: WithRegUsage CmmNode O O -> AssignmentMap -> GenCmmReplGraph (WithRegUsage CmmNode) O O
        middle (Plain m) assign = return $ rewrite assign (precompute assign m) mkMiddle m
        middle (AssignLocal l e u) assign = return $ rewriteLocal assign (precompute assign (CmmAssign (CmmLocal l) e)) l e u
        last (Plain l) assign = return $ rewrite assign (precompute assign l) mkLast l

        -- Tuple is (inline?, reloads for sinks)
        precompute :: AssignmentMap -> CmmNode O x -> (Bool, [WithRegUsage CmmNode O O])
        precompute assign n = foldRegsUsed f (False, []) n -- duplicates are harmless
            where f (i, l) r = case lookupUFM assign r of
                                 Just (AlwaysSink e)   -> (i, Plain (CmmAssign (CmmLocal r) e) : l)
                                 Just (AlwaysInline _) -> (True, l)
                                 Just NeverOptimize    -> (i, l)
                                 -- This case can show up when we have
                                 -- limited optimization fuel.
                                 Nothing -> (i, l)

        rewrite :: AssignmentMap
                -> (Bool, [WithRegUsage CmmNode O O])
                -> (WithRegUsage CmmNode O x -> Graph (WithRegUsage CmmNode) O x)
                -> CmmNode O x
                -> Maybe (Graph (WithRegUsage CmmNode) O x)
        rewrite _ (False, []) _ _ = Nothing

        -- Note [CmmCall Inline Hack]
        -- Conservative hack: don't do any inlining on what will
        -- be translated into OldCmm CmmCalls, since the code
        -- produced here tends to be unproblematic and I need to write
        -- lint passes to ensure that we don't put anything in the
        -- arguments that could be construed as a global register by
        -- some later translation pass. (For example, slots will turn
        -- into dereferences of Sp). See [Register parameter passing].
        -- ToDo: Fix this up to only bug out if all inlines were for
        -- CmmExprs with global registers (we can't use the
        -- straightforward mapExpDeep call, in this case.) ToDo: We miss
        -- an opportunity here, where all possible inlinings should
        -- instead be sunk.
        rewrite _ (True, []) _ n | not (inlinable n) = Nothing -- see [CmmCall Inline Hack]
        rewrite assign (i, xs) mk n = Just $ mkMiddles xs <*> mk (Plain (inline i assign n))

        rewriteLocal :: AssignmentMap
                     -> (Bool, [WithRegUsage CmmNode O O])
                     -> LocalReg -> CmmExpr -> Usage
                     -> Maybe (Graph (WithRegUsage CmmNode) O O)
        rewriteLocal _ (False, []) _ _ _ = Nothing
        rewriteLocal assign (i, xs) l e u = Just $ mkMiddles xs <*> mkMiddle n'
            where n' = AssignLocal l e' u
                  e' = if i then wrapRecExp (inlineExp assign) e else e
            -- inlinable check omitted, since we can always inline into
            -- assignments.

        inlineExp assign old@(CmmReg (CmmLocal r))
          = case lookupUFM assign r of
              Just (AlwaysInline x) -> x
              _ -> old
        inlineExp assign old@(CmmRegOff (CmmLocal r) i)
          = case lookupUFM assign r of
              Just (AlwaysInline x) ->
                case x of
                  CmmRegOff r' i' -> CmmRegOff r' (i + i')
                  _ -> CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
                        where rep = typeWidth (localRegType r)
              _ -> old
        -- See Note [Soundness of store rewriting]
        inlineExp assign old@(CmmLoad (CmmStackSlot (RegSlot r) _) _)
          = case lookupUFM assign r of
              Just (AlwaysInline x) -> x
              _ -> old
        inlineExp _ old = old
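        -- The 'inline' driver and 'inlinable' predicate were elided;
        -- they are sketched here under the same names (an assumed
        -- reconstruction). 'inlinable' implements the conservative
        -- check from Note [CmmCall Inline Hack].
        inline :: Bool -> AssignmentMap -> CmmNode e x -> CmmNode e x
        inline False _      n = n
        inline True  _      n | not (inlinable n) = n -- see [CmmCall Inline Hack]
        inline True  assign n = mapExpDeep (wrapRecExp (inlineExp assign)) n

        inlinable :: CmmNode e x -> Bool
        inlinable (CmmCall{})              = False
        inlinable (CmmForeignCall{})       = False
        inlinable (CmmUnsafeForeignCall{}) = False
        inlinable _                        = True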

-- Need to interleave this with inlining, because machop folding results
-- in literals, which we can inline more aggressively, and inlining
-- gives us opportunities for more folding. However, we don't need any
-- facts to do MachOp folding.
machOpFoldRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) a
machOpFoldRewrite = mkFRewrite3 first middle last
  where first _ _ = return Nothing
        middle :: WithRegUsage CmmNode O O -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O O
        middle (Plain m) _ = return (fmap (mkMiddle . Plain) (foldNode m))
        middle (AssignLocal l e r) _ = return (fmap f (wrapRecExpM foldExp e))
            where f e' = mkMiddle (AssignLocal l e' r)
        last   (Plain l) _ = return (fmap (mkLast . Plain) (foldNode l))
        foldNode :: CmmNode e x -> Maybe (CmmNode e x)
        foldNode n = mapExpDeepM foldExp n
        foldExp (CmmMachOp op args) = cmmMachOpFoldM op args
        foldExp _ = Nothing

-- ToDo: Outputable instance for UsageMap and AssignmentMap