Fold testsuite.git into ghc.git (re #8545)
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 {-# LANGUAGE NoMonoLocalBinds #-}
2 -- Norman likes local bindings
3 -- If this module lives on I'd like to get rid of this extension in due course
4
5 module CmmPipeline (
6 -- | Converts C-- with an implicit stack and native C-- calls into
7 -- optimized, CPS converted and native-call-less C--. The latter
8 -- C-- can be used to generate assembly.
9 cmmPipeline
10 ) where
11
12 import Cmm
13 import CmmLint
14 import CmmBuildInfoTables
15 import CmmCommonBlockElim
16 import CmmProcPoint
17 import CmmContFlowOpt
18 import CmmLayoutStack
19 import CmmSink
20 import Hoopl
21
22 import UniqSupply
23 import DynFlags
24 import ErrUtils
25 import HscTypes
26 import Control.Monad
27 import Outputable
28 import Platform
29
30 -----------------------------------------------------------------------------
31 -- | Top level driver for C-- pipeline
32 -----------------------------------------------------------------------------
33
34 cmmPipeline :: HscEnv -- Compilation env including
35 -- dynamic flags: -dcmm-lint -ddump-cps-cmm
36 -> TopSRT -- SRT table and accumulating list of compiled procs
37 -> CmmGroup -- Input C-- with Procedures
38 -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
39
40 cmmPipeline hsc_env topSRT prog =
41 do let dflags = hsc_dflags hsc_env
42
43 showPass dflags "CPSZ"
44
45 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
46
47 (topSRT, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
48 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" cmms
49
50 return (topSRT, cmms)
51
52
53 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
54 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
55 cpsTop hsc_env proc =
56 do
57 ----------- Control-flow optimisations ----------------------------------
58
59 -- The first round of control-flow optimisation speeds up the
60 -- later passes by removing lots of empty blocks, so we do it
61 -- even when optimisation isn't turned on.
62 --
63 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
64 return $ cmmCfgOptsProc splitting_proc_points proc
65 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
66
67 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
68 , do_layout = do_layout }} = h
69
70 ----------- Eliminate common blocks -------------------------------------
71 g <- {-# SCC "elimCommonBlocks" #-}
72 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
73 Opt_D_dump_cmm_cbe "Post common block elimination"
74
75 -- Any work storing block Labels must be performed _after_
76 -- elimCommonBlocks
77
78 ----------- Proc points -------------------------------------------------
79 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
80 proc_points <-
81 if splitting_proc_points
82 then do
83 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
84 minimalProcPointSet (targetPlatform dflags) call_pps g
85 dumpIfSet_dyn dflags Opt_D_dump_cmm "Proc points"
86 (ppr l $$ ppr pp $$ ppr g)
87 return pp
88 else
89 return call_pps
90
91 let noncall_pps = proc_points `setDifference` call_pps
92 when (not (setNull noncall_pps) && dopt Opt_D_dump_cmm dflags) $
93 pprTrace "Non-call proc points: " (ppr noncall_pps) $ return ()
94
95 ----------- Layout the stack and manifest Sp ----------------------------
96 (g, stackmaps) <-
97 {-# SCC "layoutStack" #-}
98 if do_layout
99 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
100 else return (g, mapEmpty)
101 dump Opt_D_dump_cmm_sp "Layout Stack" g
102
103 ----------- Sink and inline assignments --------------------------------
104 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
105 condPass Opt_CmmSink (cmmSink dflags) g
106 Opt_D_dump_cmm_sink "Sink assignments"
107
108 ------------- CAF analysis ----------------------------------------------
109 let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
110 dumpIfSet_dyn dflags Opt_D_dump_cmm "CAFEnv" (ppr cafEnv)
111
112 if splitting_proc_points
113 then do
114 ------------- Split into separate procedures -----------------------
115 pp_map <- {-# SCC "procPointAnalysis" #-} runUniqSM $
116 procPointAnalysis proc_points g
117 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" pp_map
118 gs <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
119 splitAtProcPoints dflags l call_pps proc_points pp_map
120 (CmmProc h l v g)
121 dumps Opt_D_dump_cmm_split "Post splitting" gs
122
123 ------------- Populate info tables with stack info -----------------
124 gs <- {-# SCC "setInfoTableStackMap" #-}
125 return $ map (setInfoTableStackMap dflags stackmaps) gs
126 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" gs
127
128 ----------- Control-flow optimisations -----------------------------
129 gs <- {-# SCC "cmmCfgOpts(2)" #-}
130 return $ if optLevel dflags >= 1
131 then map (cmmCfgOptsProc splitting_proc_points) gs
132 else gs
133 gs <- return (map removeUnreachableBlocksProc gs)
134 -- Note [unreachable blocks]
135 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" gs
136
137 return (cafEnv, gs)
138
139 else do
140 -- attach info tables to return points
141 g <- return $ attachContInfoTables call_pps (CmmProc h l v g)
142
143 ------------- Populate info tables with stack info -----------------
144 g <- {-# SCC "setInfoTableStackMap" #-}
145 return $ setInfoTableStackMap dflags stackmaps g
146 dump' Opt_D_dump_cmm_info "after setInfoTableStackMap" g
147
148 ----------- Control-flow optimisations -----------------------------
149 g <- {-# SCC "cmmCfgOpts(2)" #-}
150 return $ if optLevel dflags >= 1
151 then cmmCfgOptsProc splitting_proc_points g
152 else g
153 g <- return (removeUnreachableBlocksProc g)
154 -- Note [unreachable blocks]
155 dump' Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
156
157 return (cafEnv, [g])
158
159 where dflags = hsc_dflags hsc_env
160 platform = targetPlatform dflags
161 dump = dumpGraph dflags
162 dump' = dumpWith dflags
163
164 dumps flag name
165 = mapM_ (dumpWith dflags flag name)
166
167 condPass flag pass g dumpflag dumpname =
168 if gopt flag dflags
169 then do
170 g <- return $ pass g
171 dump dumpflag dumpname g
172 return g
173 else return g
174
175
176 -- we don't need to split proc points for the NCG, unless
177 -- tablesNextToCode is off. The latter is because we have no
178 -- label to put on info tables for basic blocks that are not
179 -- the entry point.
180 splitting_proc_points = hscTarget dflags /= HscAsm
181 || not (tablesNextToCode dflags)
182 || -- Note [inconsistent-pic-reg]
183 usingInconsistentPicReg
184 usingInconsistentPicReg
185 = case (platformArch platform, platformOS platform, gopt Opt_PIC dflags)
186 of (ArchX86, OSDarwin, pic) -> pic
187 (ArchPPC, OSDarwin, pic) -> pic
188 _ -> False
189
190 -- Note [Sinking after stack layout]
191 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
192 --
193 -- In the past we considered running sinking pass also before stack
194 -- layout, but after making some measurements we realized that:
195 --
196 -- a) running sinking only before stack layout produces slower
197 -- code than running sinking only before stack layout
198 --
199 -- b) running sinking both before and after stack layout produces
200 -- code that has the same performance as when running sinking
201 -- only after stack layout.
202 --
203 -- In other words sinking before stack layout doesn't buy as anything.
204 --
205 -- An interesting question is "why is it better to run sinking after
206 -- stack layout"? It seems that the major reason are stores and loads
207 -- generated by stack layout. Consider this code before stack layout:
208 --
209 -- c1E:
210 -- _c1C::P64 = R3;
211 -- _c1B::P64 = R2;
212 -- _c1A::P64 = R1;
213 -- I64[(young<c1D> + 8)] = c1D;
214 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
215 -- c1D:
216 -- R3 = _c1C::P64;
217 -- R2 = _c1B::P64;
218 -- R1 = _c1A::P64;
219 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
220 --
221 -- Stack layout pass will save all local variables live across a call
222 -- (_c1C, _c1B and _c1A in this example) on the stack just before
223 -- making a call and reload them from the stack after returning from a
224 -- call:
225 --
226 -- c1E:
227 -- _c1C::P64 = R3;
228 -- _c1B::P64 = R2;
229 -- _c1A::P64 = R1;
230 -- I64[Sp - 32] = c1D;
231 -- P64[Sp - 24] = _c1A::P64;
232 -- P64[Sp - 16] = _c1B::P64;
233 -- P64[Sp - 8] = _c1C::P64;
234 -- Sp = Sp - 32;
235 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
236 -- c1D:
237 -- _c1A::P64 = P64[Sp + 8];
238 -- _c1B::P64 = P64[Sp + 16];
239 -- _c1C::P64 = P64[Sp + 24];
240 -- R3 = _c1C::P64;
241 -- R2 = _c1B::P64;
242 -- R1 = _c1A::P64;
243 -- Sp = Sp + 32;
244 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
245 --
246 -- If we don't run sinking pass after stack layout we are basically
247 -- left with such code. However, running sinking on this code can lead
248 -- to significant improvements:
249 --
250 -- c1E:
251 -- I64[Sp - 32] = c1D;
252 -- P64[Sp - 24] = R1;
253 -- P64[Sp - 16] = R2;
254 -- P64[Sp - 8] = R3;
255 -- Sp = Sp - 32;
256 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
257 -- c1D:
258 -- R3 = P64[Sp + 24];
259 -- R2 = P64[Sp + 16];
260 -- R1 = P64[Sp + 8];
261 -- Sp = Sp + 32;
262 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
263 --
264 -- Now we only have 9 assignments instead of 15.
265 --
266 -- There is one case when running sinking before stack layout could
267 -- be beneficial. Consider this:
268 --
269 -- L1:
270 -- x = y
271 -- call f() returns L2
272 -- L2: ...x...y...
273 --
274 -- Since both x and y are live across a call to f, they will be stored
275 -- on the stack during stack layout and restored after the call:
276 --
277 -- L1:
278 -- x = y
279 -- P64[Sp - 24] = L2
280 -- P64[Sp - 16] = x
281 -- P64[Sp - 8] = y
282 -- Sp = Sp - 24
283 -- call f() returns L2
284 -- L2:
285 -- y = P64[Sp + 16]
286 -- x = P64[Sp + 8]
287 -- Sp = Sp + 24
288 -- ...x...y...
289 --
290 -- However, if we run sinking before stack layout we would propagate x
291 -- to its usage place (both x and y must be local register for this to
292 -- be possible - global registers cannot be floated past a call):
293 --
294 -- L1:
295 -- x = y
296 -- call f() returns L2
297 -- L2: ...y...y...
298 --
299 -- Thus making x dead at the call to f(). If we ran stack layout now
300 -- we would generate less stores and loads:
301 --
302 -- L1:
303 -- x = y
304 -- P64[Sp - 16] = L2
305 -- P64[Sp - 8] = y
306 -- Sp = Sp - 16
307 -- call f() returns L2
308 -- L2:
309 -- y = P64[Sp + 8]
310 -- Sp = Sp + 16
311 -- ...y...y...
312 --
313 -- But since we don't see any benefits from running sinking befroe stack
314 -- layout, this situation probably doesn't arise too often in practice.
315 --
316
317 {- Note [inconsistent-pic-reg]
318
319 On x86/Darwin, PIC is implemented by inserting a sequence like
320
321 call 1f
322 1: popl %reg
323
324 at the proc entry point, and then referring to labels as offsets from
325 %reg. If we don't split proc points, then we could have many entry
326 points in a proc that would need this sequence, and each entry point
327 would then get a different value for %reg. If there are any join
328 points, then at the join point we don't have a consistent value for
329 %reg, so we don't know how to refer to labels.
330
331 Hence, on x86/Darwin, we have to split proc points, and then each proc
332 point will get its own PIC initialisation sequence.
333
334 The situation is the same for ppc/Darwin. We use essentially the same
335 sequence to load the program counter onto reg:
336
337 bcl 20,31,1f
338 1: mflr reg
339
340 This isn't an issue on x86/ELF, where the sequence is
341
342 call 1f
343 1: popl %reg
344 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
345
346 so %reg always has a consistent value: the address of
347 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
348
349 -}
350
351 {- Note [unreachable blocks]
352
353 The control-flow optimiser sometimes leaves unreachable blocks behind
354 containing junk code. If these blocks make it into the native code
355 generator then they trigger a register allocator panic because they
356 refer to undefined LocalRegs, so we must eliminate any unreachable
357 blocks before passing the code onwards.
358
359 -}
360
361 runUniqSM :: UniqSM a -> IO a
362 runUniqSM m = do
363 us <- mkSplitUniqSupply 'u'
364 return (initUs_ us m)
365
366
367 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
368 dumpGraph dflags flag name g = do
369 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
370 dumpWith dflags flag name g
371 where
372 do_lint g = case cmmLintGraph dflags g of
373 Just err -> do { fatalErrorMsg dflags err
374 ; ghcExit dflags 1
375 }
376 Nothing -> return ()
377
378 dumpWith :: Outputable a => DynFlags -> DumpFlag -> String -> a -> IO ()
379 dumpWith dflags flag txt g = do
380 -- ToDo: No easy way of say "dump all the cmm, *and* split
381 -- them into files." Also, -ddump-cmm doesn't play nicely
382 -- with -ddump-to-file, since the headers get omitted.
383 dumpIfSet_dyn dflags flag txt (ppr g)
384 when (not (dopt flag dflags)) $
385 dumpIfSet_dyn dflags Opt_D_dump_cmm txt (ppr g)
386