Add debug dump of the list of Cmm proc points
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 {-# LANGUAGE NoMonoLocalBinds #-}
2 -- Norman likes local bindings
3 -- If this module lives on I'd like to get rid of this extension in due course
4
5 module CmmPipeline (
6 -- | Converts C-- with an implicit stack and native C-- calls into
7 -- optimized, CPS converted and native-call-less C--. The latter
8 -- C-- can be used to generate assembly.
9 cmmPipeline
10 ) where
11
12 import Cmm
13 import CmmLint
14 import CmmBuildInfoTables
15 import CmmCommonBlockElim
16 import CmmProcPoint
17 import CmmContFlowOpt
18 import CmmLayoutStack
19 import CmmSink
20 import Hoopl
21
22 import UniqSupply
23 import DynFlags
24 import ErrUtils
25 import HscTypes
26 import Control.Monad
27 import Outputable
28 import Platform
29
30 -----------------------------------------------------------------------------
31 -- | Top level driver for C-- pipeline
32 -----------------------------------------------------------------------------
33
34 cmmPipeline :: HscEnv -- Compilation env including
35 -- dynamic flags: -dcmm-lint -ddump-cps-cmm
36 -> TopSRT -- SRT table and accumulating list of compiled procs
37 -> CmmGroup -- Input C-- with Procedures
38 -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
39
40 cmmPipeline hsc_env topSRT prog =
41 do let dflags = hsc_dflags hsc_env
42
43 showPass dflags "CPSZ"
44
45 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
46
47 (topSRT, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
48 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" cmms
49
50 return (topSRT, cmms)
51
52
53 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
54 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
55 cpsTop hsc_env proc =
56 do
57 ----------- Control-flow optimisations ----------------------------------
58
59 -- The first round of control-flow optimisation speeds up the
60 -- later passes by removing lots of empty blocks, so we do it
61 -- even when optimisation isn't turned on.
62 --
63 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
64 return $ cmmCfgOptsProc splitting_proc_points proc
65 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
66
67 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
68 , do_layout = do_layout }} = h
69
70 ----------- Eliminate common blocks -------------------------------------
71 g <- {-# SCC "elimCommonBlocks" #-}
72 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
73 Opt_D_dump_cmm_cbe "Post common block elimination"
74
75 -- Any work storing block Labels must be performed _after_
76 -- elimCommonBlocks
77
78 ----------- Proc points -------------------------------------------------
79 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
80 proc_points <-
81 if splitting_proc_points
82 then {-# SCC "minimalProcPointSet" #-} runUniqSM $
83 minimalProcPointSet (targetPlatform dflags) call_pps g
84 else
85 return call_pps
86 dumpIfSet_dyn dflags Opt_D_dump_cmm "Proc points" (ppr l $$ ppr proc_points $$ ppr g)
87
88 let noncall_pps = proc_points `setDifference` call_pps
89 when (not (setNull noncall_pps) && dopt Opt_D_dump_cmm dflags) $
90 pprTrace "Non-call proc points: " (ppr noncall_pps) $ return ()
91
92 ----------- Layout the stack and manifest Sp ----------------------------
93 (g, stackmaps) <-
94 {-# SCC "layoutStack" #-}
95 if do_layout
96 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
97 else return (g, mapEmpty)
98 dump Opt_D_dump_cmm_sp "Layout Stack" g
99
100 ----------- Sink and inline assignments --------------------------------
101 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
102 condPass Opt_CmmSink (cmmSink dflags) g
103 Opt_D_dump_cmm_sink "Sink assignments"
104
105 ------------- CAF analysis ----------------------------------------------
106 let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
107 dumpIfSet_dyn dflags Opt_D_dump_cmm "CAFEnv" (ppr cafEnv)
108
109 if splitting_proc_points
110 then do
111 ------------- Split into separate procedures -----------------------
112 pp_map <- {-# SCC "procPointAnalysis" #-} runUniqSM $
113 procPointAnalysis proc_points g
114 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" pp_map
115 gs <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
116 splitAtProcPoints dflags l call_pps proc_points pp_map
117 (CmmProc h l v g)
118 dumps Opt_D_dump_cmm_split "Post splitting" gs
119
120 ------------- Populate info tables with stack info -----------------
121 gs <- {-# SCC "setInfoTableStackMap" #-}
122 return $ map (setInfoTableStackMap dflags stackmaps) gs
123 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" gs
124
125 ----------- Control-flow optimisations -----------------------------
126 gs <- {-# SCC "cmmCfgOpts(2)" #-}
127 return $ if optLevel dflags >= 1
128 then map (cmmCfgOptsProc splitting_proc_points) gs
129 else gs
130 gs <- return (map removeUnreachableBlocksProc gs)
131 -- Note [unreachable blocks]
132 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" gs
133
134 return (cafEnv, gs)
135
136 else do
137 -- attach info tables to return points
138 g <- return $ attachContInfoTables call_pps (CmmProc h l v g)
139
140 ------------- Populate info tables with stack info -----------------
141 g <- {-# SCC "setInfoTableStackMap" #-}
142 return $ setInfoTableStackMap dflags stackmaps g
143 dump' Opt_D_dump_cmm_info "after setInfoTableStackMap" g
144
145 ----------- Control-flow optimisations -----------------------------
146 g <- {-# SCC "cmmCfgOpts(2)" #-}
147 return $ if optLevel dflags >= 1
148 then cmmCfgOptsProc splitting_proc_points g
149 else g
150 g <- return (removeUnreachableBlocksProc g)
151 -- Note [unreachable blocks]
152 dump' Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
153
154 return (cafEnv, [g])
155
156 where dflags = hsc_dflags hsc_env
157 platform = targetPlatform dflags
158 dump = dumpGraph dflags
159 dump' = dumpWith dflags
160
161 dumps flag name
162 = mapM_ (dumpWith dflags flag name)
163
164 condPass flag pass g dumpflag dumpname =
165 if gopt flag dflags
166 then do
167 g <- return $ pass g
168 dump dumpflag dumpname g
169 return g
170 else return g
171
172
173 -- we don't need to split proc points for the NCG, unless
174 -- tablesNextToCode is off. The latter is because we have no
175 -- label to put on info tables for basic blocks that are not
176 -- the entry point.
177 splitting_proc_points = hscTarget dflags /= HscAsm
178 || not (tablesNextToCode dflags)
179 || -- Note [inconsistent-pic-reg]
180 usingInconsistentPicReg
181 usingInconsistentPicReg
182 = case (platformArch platform, platformOS platform, gopt Opt_PIC dflags)
183 of (ArchX86, OSDarwin, pic) -> pic
184 (ArchPPC, OSDarwin, pic) -> pic
185 _ -> False
186
187 -- Note [Sinking after stack layout]
188 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
189 --
190 -- In the past we considered running sinking pass also before stack
191 -- layout, but after making some measurements we realized that:
192 --
193 -- a) running sinking only before stack layout produces slower
194 -- code than running sinking only before stack layout
195 --
196 -- b) running sinking both before and after stack layout produces
197 -- code that has the same performance as when running sinking
198 -- only after stack layout.
199 --
200 -- In other words sinking before stack layout doesn't buy as anything.
201 --
202 -- An interesting question is "why is it better to run sinking after
203 -- stack layout"? It seems that the major reason are stores and loads
204 -- generated by stack layout. Consider this code before stack layout:
205 --
206 -- c1E:
207 -- _c1C::P64 = R3;
208 -- _c1B::P64 = R2;
209 -- _c1A::P64 = R1;
210 -- I64[(young<c1D> + 8)] = c1D;
211 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
212 -- c1D:
213 -- R3 = _c1C::P64;
214 -- R2 = _c1B::P64;
215 -- R1 = _c1A::P64;
216 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
217 --
218 -- Stack layout pass will save all local variables live across a call
219 -- (_c1C, _c1B and _c1A in this example) on the stack just before
220 -- making a call and reload them from the stack after returning from a
221 -- call:
222 --
223 -- c1E:
224 -- _c1C::P64 = R3;
225 -- _c1B::P64 = R2;
226 -- _c1A::P64 = R1;
227 -- I64[Sp - 32] = c1D;
228 -- P64[Sp - 24] = _c1A::P64;
229 -- P64[Sp - 16] = _c1B::P64;
230 -- P64[Sp - 8] = _c1C::P64;
231 -- Sp = Sp - 32;
232 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
233 -- c1D:
234 -- _c1A::P64 = P64[Sp + 8];
235 -- _c1B::P64 = P64[Sp + 16];
236 -- _c1C::P64 = P64[Sp + 24];
237 -- R3 = _c1C::P64;
238 -- R2 = _c1B::P64;
239 -- R1 = _c1A::P64;
240 -- Sp = Sp + 32;
241 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
242 --
243 -- If we don't run sinking pass after stack layout we are basically
244 -- left with such code. However, running sinking on this code can lead
245 -- to significant improvements:
246 --
247 -- c1E:
248 -- I64[Sp - 32] = c1D;
249 -- P64[Sp - 24] = R1;
250 -- P64[Sp - 16] = R2;
251 -- P64[Sp - 8] = R3;
252 -- Sp = Sp - 32;
253 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
254 -- c1D:
255 -- R3 = P64[Sp + 24];
256 -- R2 = P64[Sp + 16];
257 -- R1 = P64[Sp + 8];
258 -- Sp = Sp + 32;
259 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
260 --
261 -- Now we only have 9 assignments instead of 15.
262 --
263 -- There is one case when running sinking before stack layout could
264 -- be beneficial. Consider this:
265 --
266 -- L1:
267 -- x = y
268 -- call f() returns L2
269 -- L2: ...x...y...
270 --
271 -- Since both x and y are live across a call to f, they will be stored
272 -- on the stack during stack layout and restored after the call:
273 --
274 -- L1:
275 -- x = y
276 -- P64[Sp - 24] = L2
277 -- P64[Sp - 16] = x
278 -- P64[Sp - 8] = y
279 -- Sp = Sp - 24
280 -- call f() returns L2
281 -- L2:
282 -- y = P64[Sp + 16]
283 -- x = P64[Sp + 8]
284 -- Sp = Sp + 24
285 -- ...x...y...
286 --
287 -- However, if we run sinking before stack layout we would propagate x
288 -- to its usage place (both x and y must be local register for this to
289 -- be possible - global registers cannot be floated past a call):
290 --
291 -- L1:
292 -- x = y
293 -- call f() returns L2
294 -- L2: ...y...y...
295 --
296 -- Thus making x dead at the call to f(). If we ran stack layout now
297 -- we would generate less stores and loads:
298 --
299 -- L1:
300 -- x = y
301 -- P64[Sp - 16] = L2
302 -- P64[Sp - 8] = y
303 -- Sp = Sp - 16
304 -- call f() returns L2
305 -- L2:
306 -- y = P64[Sp + 8]
307 -- Sp = Sp + 16
308 -- ...y...y...
309 --
310 -- But since we don't see any benefits from running sinking befroe stack
311 -- layout, this situation probably doesn't arise too often in practice.
312 --
313
314 {- Note [inconsistent-pic-reg]
315
316 On x86/Darwin, PIC is implemented by inserting a sequence like
317
318 call 1f
319 1: popl %reg
320
321 at the proc entry point, and then referring to labels as offsets from
322 %reg. If we don't split proc points, then we could have many entry
323 points in a proc that would need this sequence, and each entry point
324 would then get a different value for %reg. If there are any join
325 points, then at the join point we don't have a consistent value for
326 %reg, so we don't know how to refer to labels.
327
328 Hence, on x86/Darwin, we have to split proc points, and then each proc
329 point will get its own PIC initialisation sequence.
330
331 The situation is the same for ppc/Darwin. We use essentially the same
332 sequence to load the program counter onto reg:
333
334 bcl 20,31,1f
335 1: mflr reg
336
337 This isn't an issue on x86/ELF, where the sequence is
338
339 call 1f
340 1: popl %reg
341 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
342
343 so %reg always has a consistent value: the address of
344 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
345
346 -}
347
348 {- Note [unreachable blocks]
349
350 The control-flow optimiser sometimes leaves unreachable blocks behind
351 containing junk code. If these blocks make it into the native code
352 generator then they trigger a register allocator panic because they
353 refer to undefined LocalRegs, so we must eliminate any unreachable
354 blocks before passing the code onwards.
355
356 -}
357
358 runUniqSM :: UniqSM a -> IO a
359 runUniqSM m = do
360 us <- mkSplitUniqSupply 'u'
361 return (initUs_ us m)
362
363
364 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
365 dumpGraph dflags flag name g = do
366 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
367 dumpWith dflags flag name g
368 where
369 do_lint g = case cmmLintGraph dflags g of
370 Just err -> do { fatalErrorMsg dflags err
371 ; ghcExit dflags 1
372 }
373 Nothing -> return ()
374
375 dumpWith :: Outputable a => DynFlags -> DumpFlag -> String -> a -> IO ()
376 dumpWith dflags flag txt g = do
377 -- ToDo: No easy way of say "dump all the cmm, *and* split
378 -- them into files." Also, -ddump-cmm doesn't play nicely
379 -- with -ddump-to-file, since the headers get omitted.
380 dumpIfSet_dyn dflags flag txt (ppr g)
381 when (not (dopt flag dflags)) $
382 dumpIfSet_dyn dflags Opt_D_dump_cmm txt (ppr g)
383