Be more selective in which conditionals we invert
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 {-# LANGUAGE BangPatterns #-}
2
3 module CmmPipeline (
4 -- | Converts C-- with an implicit stack and native C-- calls into
5 -- optimized, CPS converted and native-call-less C--. The latter
6 -- C-- can be used to generate assembly.
7 cmmPipeline
8 ) where
9
10 import GhcPrelude
11
12 import Cmm
13 import CmmLint
14 import CmmBuildInfoTables
15 import CmmCommonBlockElim
16 import CmmImplementSwitchPlans
17 import CmmProcPoint
18 import CmmContFlowOpt
19 import CmmLayoutStack
20 import CmmSink
21 import Hoopl.Collections
22
23 import UniqSupply
24 import DynFlags
25 import ErrUtils
26 import HscTypes
27 import Control.Monad
28 import Outputable
29 import Platform
30
31 -----------------------------------------------------------------------------
32 -- | Top level driver for C-- pipeline
33 -----------------------------------------------------------------------------
34
35 cmmPipeline :: HscEnv -- Compilation env including
36 -- dynamic flags: -dcmm-lint -ddump-cmm-cps
37 -> TopSRT -- SRT table and accumulating list of compiled procs
38 -> CmmGroup -- Input C-- with Procedures
39 -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
40
41 cmmPipeline hsc_env topSRT prog =
42 do let dflags = hsc_dflags hsc_env
43
44 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
45
46 (topSRT, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
47 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" (ppr cmms)
48
49 return (topSRT, cmms)
50
51
52 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
53 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
54 cpsTop hsc_env proc =
55 do
56 ----------- Control-flow optimisations ----------------------------------
57
58 -- The first round of control-flow optimisation speeds up the
59 -- later passes by removing lots of empty blocks, so we do it
60 -- even when optimisation isn't turned on.
61 --
62 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
63 return $ cmmCfgOptsProc splitting_proc_points proc
64 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
65
66 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
67 , do_layout = do_layout }} = h
68
69 ----------- Eliminate common blocks -------------------------------------
70 g <- {-# SCC "elimCommonBlocks" #-}
71 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
72 Opt_D_dump_cmm_cbe "Post common block elimination"
73
74 -- Any work storing block Labels must be performed _after_
75 -- elimCommonBlocks
76
77 g <- {-# SCC "createSwitchPlans" #-}
78 runUniqSM $ cmmImplementSwitchPlans dflags g
79 dump Opt_D_dump_cmm_switch "Post switch plan" g
80
81 ----------- Proc points -------------------------------------------------
82 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
83 proc_points <-
84 if splitting_proc_points
85 then do
86 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
87 minimalProcPointSet (targetPlatform dflags) call_pps g
88 dumpWith dflags Opt_D_dump_cmm_proc "Proc points"
89 (ppr l $$ ppr pp $$ ppr g)
90 return pp
91 else
92 return call_pps
93
94 ----------- Layout the stack and manifest Sp ----------------------------
95 (g, stackmaps) <-
96 {-# SCC "layoutStack" #-}
97 if do_layout
98 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
99 else return (g, mapEmpty)
100 dump Opt_D_dump_cmm_sp "Layout Stack" g
101
102 ----------- Sink and inline assignments --------------------------------
103 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
104 condPass Opt_CmmSink (cmmSink dflags) g
105 Opt_D_dump_cmm_sink "Sink assignments"
106
107 ------------- CAF analysis ----------------------------------------------
108 let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
109 dumpWith dflags Opt_D_dump_cmm_caf "CAFEnv" (ppr cafEnv)
110
111 g <- if splitting_proc_points
112 then do
113 ------------- Split into separate procedures -----------------------
114 let pp_map = {-# SCC "procPointAnalysis" #-}
115 procPointAnalysis proc_points g
116 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" $
117 ppr pp_map
118 g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
119 splitAtProcPoints dflags l call_pps proc_points pp_map
120 (CmmProc h l v g)
121 dumps Opt_D_dump_cmm_split "Post splitting" g
122 return g
123 else do
124 -- attach info tables to return points
125 return $ [attachContInfoTables call_pps (CmmProc h l v g)]
126
127 ------------- Populate info tables with stack info -----------------
128 g <- {-# SCC "setInfoTableStackMap" #-}
129 return $ map (setInfoTableStackMap dflags stackmaps) g
130 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
131
132 ----------- Control-flow optimisations -----------------------------
133 g <- {-# SCC "cmmCfgOpts(2)" #-}
134 return $ if optLevel dflags >= 1
135 then map (cmmCfgOptsProc splitting_proc_points) g
136 else g
137 g <- return (map removeUnreachableBlocksProc g)
138 -- See Note [unreachable blocks]
139 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
140
141 return (cafEnv, g)
142
143 where dflags = hsc_dflags hsc_env
144 platform = targetPlatform dflags
145 dump = dumpGraph dflags
146
147 dumps flag name
148 = mapM_ (dumpWith dflags flag name . ppr)
149
150 condPass flag pass g dumpflag dumpname =
151 if gopt flag dflags
152 then do
153 g <- return $ pass g
154 dump dumpflag dumpname g
155 return g
156 else return g
157
158
159 -- we don't need to split proc points for the NCG, unless
160 -- tablesNextToCode is off. The latter is because we have no
161 -- label to put on info tables for basic blocks that are not
162 -- the entry point.
163 splitting_proc_points = hscTarget dflags /= HscAsm
164 || not (tablesNextToCode dflags)
165 || -- Note [inconsistent-pic-reg]
166 usingInconsistentPicReg
167 usingInconsistentPicReg
168 = case (platformArch platform, platformOS platform, positionIndependent dflags)
169 of (ArchX86, OSDarwin, pic) -> pic
170 (ArchPPC, OSDarwin, pic) -> pic
171 _ -> False
172
173 -- Note [Sinking after stack layout]
174 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
175 --
176 -- In the past we considered running sinking pass also before stack
177 -- layout, but after making some measurements we realized that:
178 --
179 -- a) running sinking only before stack layout produces slower
180 -- code than running sinking only before stack layout
181 --
182 -- b) running sinking both before and after stack layout produces
183 -- code that has the same performance as when running sinking
184 -- only after stack layout.
185 --
186 -- In other words sinking before stack layout doesn't buy as anything.
187 --
188 -- An interesting question is "why is it better to run sinking after
189 -- stack layout"? It seems that the major reason are stores and loads
190 -- generated by stack layout. Consider this code before stack layout:
191 --
192 -- c1E:
193 -- _c1C::P64 = R3;
194 -- _c1B::P64 = R2;
195 -- _c1A::P64 = R1;
196 -- I64[(young<c1D> + 8)] = c1D;
197 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
198 -- c1D:
199 -- R3 = _c1C::P64;
200 -- R2 = _c1B::P64;
201 -- R1 = _c1A::P64;
202 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
203 --
204 -- Stack layout pass will save all local variables live across a call
205 -- (_c1C, _c1B and _c1A in this example) on the stack just before
206 -- making a call and reload them from the stack after returning from a
207 -- call:
208 --
209 -- c1E:
210 -- _c1C::P64 = R3;
211 -- _c1B::P64 = R2;
212 -- _c1A::P64 = R1;
213 -- I64[Sp - 32] = c1D;
214 -- P64[Sp - 24] = _c1A::P64;
215 -- P64[Sp - 16] = _c1B::P64;
216 -- P64[Sp - 8] = _c1C::P64;
217 -- Sp = Sp - 32;
218 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
219 -- c1D:
220 -- _c1A::P64 = P64[Sp + 8];
221 -- _c1B::P64 = P64[Sp + 16];
222 -- _c1C::P64 = P64[Sp + 24];
223 -- R3 = _c1C::P64;
224 -- R2 = _c1B::P64;
225 -- R1 = _c1A::P64;
226 -- Sp = Sp + 32;
227 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
228 --
229 -- If we don't run sinking pass after stack layout we are basically
230 -- left with such code. However, running sinking on this code can lead
231 -- to significant improvements:
232 --
233 -- c1E:
234 -- I64[Sp - 32] = c1D;
235 -- P64[Sp - 24] = R1;
236 -- P64[Sp - 16] = R2;
237 -- P64[Sp - 8] = R3;
238 -- Sp = Sp - 32;
239 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
240 -- c1D:
241 -- R3 = P64[Sp + 24];
242 -- R2 = P64[Sp + 16];
243 -- R1 = P64[Sp + 8];
244 -- Sp = Sp + 32;
245 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
246 --
247 -- Now we only have 9 assignments instead of 15.
248 --
249 -- There is one case when running sinking before stack layout could
250 -- be beneficial. Consider this:
251 --
252 -- L1:
253 -- x = y
254 -- call f() returns L2
255 -- L2: ...x...y...
256 --
257 -- Since both x and y are live across a call to f, they will be stored
258 -- on the stack during stack layout and restored after the call:
259 --
260 -- L1:
261 -- x = y
262 -- P64[Sp - 24] = L2
263 -- P64[Sp - 16] = x
264 -- P64[Sp - 8] = y
265 -- Sp = Sp - 24
266 -- call f() returns L2
267 -- L2:
268 -- y = P64[Sp + 16]
269 -- x = P64[Sp + 8]
270 -- Sp = Sp + 24
271 -- ...x...y...
272 --
273 -- However, if we run sinking before stack layout we would propagate x
274 -- to its usage place (both x and y must be local register for this to
275 -- be possible - global registers cannot be floated past a call):
276 --
277 -- L1:
278 -- x = y
279 -- call f() returns L2
280 -- L2: ...y...y...
281 --
282 -- Thus making x dead at the call to f(). If we ran stack layout now
283 -- we would generate less stores and loads:
284 --
285 -- L1:
286 -- x = y
287 -- P64[Sp - 16] = L2
288 -- P64[Sp - 8] = y
289 -- Sp = Sp - 16
290 -- call f() returns L2
291 -- L2:
292 -- y = P64[Sp + 8]
293 -- Sp = Sp + 16
294 -- ...y...y...
295 --
296 -- But since we don't see any benefits from running sinking befroe stack
297 -- layout, this situation probably doesn't arise too often in practice.
298 --
299
300 {- Note [inconsistent-pic-reg]
301
302 On x86/Darwin, PIC is implemented by inserting a sequence like
303
304 call 1f
305 1: popl %reg
306
307 at the proc entry point, and then referring to labels as offsets from
308 %reg. If we don't split proc points, then we could have many entry
309 points in a proc that would need this sequence, and each entry point
310 would then get a different value for %reg. If there are any join
311 points, then at the join point we don't have a consistent value for
312 %reg, so we don't know how to refer to labels.
313
314 Hence, on x86/Darwin, we have to split proc points, and then each proc
315 point will get its own PIC initialisation sequence.
316
317 The situation is the same for ppc/Darwin. We use essentially the same
318 sequence to load the program counter onto reg:
319
320 bcl 20,31,1f
321 1: mflr reg
322
323 This isn't an issue on x86/ELF, where the sequence is
324
325 call 1f
326 1: popl %reg
327 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
328
329 so %reg always has a consistent value: the address of
330 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
331
332 -}
333
334 {- Note [unreachable blocks]
335
336 The control-flow optimiser sometimes leaves unreachable blocks behind
337 containing junk code. These aren't necessarily a problem, but
338 removing them is good because it might save time in the native code
339 generator later.
340
341 -}
342
343 runUniqSM :: UniqSM a -> IO a
344 runUniqSM m = do
345 us <- mkSplitUniqSupply 'u'
346 return (initUs_ us m)
347
348
349 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
350 dumpGraph dflags flag name g = do
351 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
352 dumpWith dflags flag name (ppr g)
353 where
354 do_lint g = case cmmLintGraph dflags g of
355 Just err -> do { fatalErrorMsg dflags err
356 ; ghcExit dflags 1
357 }
358 Nothing -> return ()
359
360 dumpWith :: DynFlags -> DumpFlag -> String -> SDoc -> IO ()
361 dumpWith dflags flag txt sdoc = do
362 -- ToDo: No easy way of say "dump all the cmm, *and* split
363 -- them into files." Also, -ddump-cmm-verbose doesn't play
364 -- nicely with -ddump-to-file, since the headers get omitted.
365 dumpIfSet_dyn dflags flag txt sdoc
366 when (not (dopt flag dflags)) $
367 dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose txt sdoc