Eliminate duplicate code in Cmm pipeline
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 module CmmPipeline (
2 -- | Converts C-- with an implicit stack and native C-- calls into
3 -- optimized, CPS converted and native-call-less C--. The latter
4 -- C-- can be used to generate assembly.
5 cmmPipeline
6 ) where
7
8 import Cmm
9 import CmmLint
10 import CmmBuildInfoTables
11 import CmmCommonBlockElim
12 import CmmProcPoint
13 import CmmContFlowOpt
14 import CmmLayoutStack
15 import CmmSink
16 import Hoopl
17
18 import UniqSupply
19 import DynFlags
20 import ErrUtils
21 import HscTypes
22 import Control.Monad
23 import Outputable
24 import Platform
25
26 -----------------------------------------------------------------------------
27 -- | Top level driver for C-- pipeline
28 -----------------------------------------------------------------------------
29
30 cmmPipeline :: HscEnv -- Compilation env including
31 -- dynamic flags: -dcmm-lint -ddump-cps-cmm
32 -> TopSRT -- SRT table and accumulating list of compiled procs
33 -> CmmGroup -- Input C-- with Procedures
34 -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
35
36 cmmPipeline hsc_env topSRT prog =
37 do let dflags = hsc_dflags hsc_env
38
39 showPass dflags "CPSZ"
40
41 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
42
43 (topSRT, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
44 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" cmms
45
46 return (topSRT, cmms)
47
48
49 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
50 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
51 cpsTop hsc_env proc =
52 do
53 ----------- Control-flow optimisations ----------------------------------
54
55 -- The first round of control-flow optimisation speeds up the
56 -- later passes by removing lots of empty blocks, so we do it
57 -- even when optimisation isn't turned on.
58 --
59 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
60 return $ cmmCfgOptsProc splitting_proc_points proc
61 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
62
63 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
64 , do_layout = do_layout }} = h
65
66 ----------- Eliminate common blocks -------------------------------------
67 g <- {-# SCC "elimCommonBlocks" #-}
68 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
69 Opt_D_dump_cmm_cbe "Post common block elimination"
70
71 -- Any work storing block Labels must be performed _after_
72 -- elimCommonBlocks
73
74 ----------- Proc points -------------------------------------------------
75 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
76 proc_points <-
77 if splitting_proc_points
78 then do
79 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
80 minimalProcPointSet (targetPlatform dflags) call_pps g
81 dumpIfSet_dyn dflags Opt_D_dump_cmm "Proc points"
82 (ppr l $$ ppr pp $$ ppr g)
83 return pp
84 else
85 return call_pps
86
87 ----------- Layout the stack and manifest Sp ----------------------------
88 (g, stackmaps) <-
89 {-# SCC "layoutStack" #-}
90 if do_layout
91 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
92 else return (g, mapEmpty)
93 dump Opt_D_dump_cmm_sp "Layout Stack" g
94
95 ----------- Sink and inline assignments --------------------------------
96 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
97 condPass Opt_CmmSink (cmmSink dflags) g
98 Opt_D_dump_cmm_sink "Sink assignments"
99
100 ------------- CAF analysis ----------------------------------------------
101 let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
102 dumpIfSet_dyn dflags Opt_D_dump_cmm "CAFEnv" (ppr cafEnv)
103
104 g <- if splitting_proc_points
105 then do
106 ------------- Split into separate procedures -----------------------
107 pp_map <- {-# SCC "procPointAnalysis" #-} runUniqSM $
108 procPointAnalysis proc_points g
109 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" pp_map
110 g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
111 splitAtProcPoints dflags l call_pps proc_points pp_map
112 (CmmProc h l v g)
113 dumps Opt_D_dump_cmm_split "Post splitting" g
114 return g
115 else do
116 -- attach info tables to return points
117 return $ [attachContInfoTables call_pps (CmmProc h l v g)]
118
119 ------------- Populate info tables with stack info -----------------
120 g <- {-# SCC "setInfoTableStackMap" #-}
121 return $ map (setInfoTableStackMap dflags stackmaps) g
122 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
123
124 ----------- Control-flow optimisations -----------------------------
125 g <- {-# SCC "cmmCfgOpts(2)" #-}
126 return $ if optLevel dflags >= 1
127 then map (cmmCfgOptsProc splitting_proc_points) g
128 else g
129 g <- return (map removeUnreachableBlocksProc g)
130 -- See Note [unreachable blocks]
131 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
132
133 return (cafEnv, g)
134
135 where dflags = hsc_dflags hsc_env
136 platform = targetPlatform dflags
137 dump = dumpGraph dflags
138
139 dumps flag name
140 = mapM_ (dumpWith dflags flag name)
141
142 condPass flag pass g dumpflag dumpname =
143 if gopt flag dflags
144 then do
145 g <- return $ pass g
146 dump dumpflag dumpname g
147 return g
148 else return g
149
150
151 -- we don't need to split proc points for the NCG, unless
152 -- tablesNextToCode is off. The latter is because we have no
153 -- label to put on info tables for basic blocks that are not
154 -- the entry point.
155 splitting_proc_points = hscTarget dflags /= HscAsm
156 || not (tablesNextToCode dflags)
157 || -- Note [inconsistent-pic-reg]
158 usingInconsistentPicReg
159 usingInconsistentPicReg
160 = case (platformArch platform, platformOS platform, gopt Opt_PIC dflags)
161 of (ArchX86, OSDarwin, pic) -> pic
162 (ArchPPC, OSDarwin, pic) -> pic
163 _ -> False
164
165 -- Note [Sinking after stack layout]
166 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
167 --
168 -- In the past we considered running sinking pass also before stack
169 -- layout, but after making some measurements we realized that:
170 --
171 -- a) running sinking only before stack layout produces slower
172 -- code than running sinking only before stack layout
173 --
174 -- b) running sinking both before and after stack layout produces
175 -- code that has the same performance as when running sinking
176 -- only after stack layout.
177 --
178 -- In other words sinking before stack layout doesn't buy as anything.
179 --
180 -- An interesting question is "why is it better to run sinking after
181 -- stack layout"? It seems that the major reason are stores and loads
182 -- generated by stack layout. Consider this code before stack layout:
183 --
184 -- c1E:
185 -- _c1C::P64 = R3;
186 -- _c1B::P64 = R2;
187 -- _c1A::P64 = R1;
188 -- I64[(young<c1D> + 8)] = c1D;
189 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
190 -- c1D:
191 -- R3 = _c1C::P64;
192 -- R2 = _c1B::P64;
193 -- R1 = _c1A::P64;
194 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
195 --
196 -- Stack layout pass will save all local variables live across a call
197 -- (_c1C, _c1B and _c1A in this example) on the stack just before
198 -- making a call and reload them from the stack after returning from a
199 -- call:
200 --
201 -- c1E:
202 -- _c1C::P64 = R3;
203 -- _c1B::P64 = R2;
204 -- _c1A::P64 = R1;
205 -- I64[Sp - 32] = c1D;
206 -- P64[Sp - 24] = _c1A::P64;
207 -- P64[Sp - 16] = _c1B::P64;
208 -- P64[Sp - 8] = _c1C::P64;
209 -- Sp = Sp - 32;
210 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
211 -- c1D:
212 -- _c1A::P64 = P64[Sp + 8];
213 -- _c1B::P64 = P64[Sp + 16];
214 -- _c1C::P64 = P64[Sp + 24];
215 -- R3 = _c1C::P64;
216 -- R2 = _c1B::P64;
217 -- R1 = _c1A::P64;
218 -- Sp = Sp + 32;
219 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
220 --
221 -- If we don't run sinking pass after stack layout we are basically
222 -- left with such code. However, running sinking on this code can lead
223 -- to significant improvements:
224 --
225 -- c1E:
226 -- I64[Sp - 32] = c1D;
227 -- P64[Sp - 24] = R1;
228 -- P64[Sp - 16] = R2;
229 -- P64[Sp - 8] = R3;
230 -- Sp = Sp - 32;
231 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
232 -- c1D:
233 -- R3 = P64[Sp + 24];
234 -- R2 = P64[Sp + 16];
235 -- R1 = P64[Sp + 8];
236 -- Sp = Sp + 32;
237 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
238 --
239 -- Now we only have 9 assignments instead of 15.
240 --
241 -- There is one case when running sinking before stack layout could
242 -- be beneficial. Consider this:
243 --
244 -- L1:
245 -- x = y
246 -- call f() returns L2
247 -- L2: ...x...y...
248 --
249 -- Since both x and y are live across a call to f, they will be stored
250 -- on the stack during stack layout and restored after the call:
251 --
252 -- L1:
253 -- x = y
254 -- P64[Sp - 24] = L2
255 -- P64[Sp - 16] = x
256 -- P64[Sp - 8] = y
257 -- Sp = Sp - 24
258 -- call f() returns L2
259 -- L2:
260 -- y = P64[Sp + 16]
261 -- x = P64[Sp + 8]
262 -- Sp = Sp + 24
263 -- ...x...y...
264 --
265 -- However, if we run sinking before stack layout we would propagate x
266 -- to its usage place (both x and y must be local register for this to
267 -- be possible - global registers cannot be floated past a call):
268 --
269 -- L1:
270 -- x = y
271 -- call f() returns L2
272 -- L2: ...y...y...
273 --
274 -- Thus making x dead at the call to f(). If we ran stack layout now
275 -- we would generate less stores and loads:
276 --
277 -- L1:
278 -- x = y
279 -- P64[Sp - 16] = L2
280 -- P64[Sp - 8] = y
281 -- Sp = Sp - 16
282 -- call f() returns L2
283 -- L2:
284 -- y = P64[Sp + 8]
285 -- Sp = Sp + 16
286 -- ...y...y...
287 --
288 -- But since we don't see any benefits from running sinking befroe stack
289 -- layout, this situation probably doesn't arise too often in practice.
290 --
291
292 {- Note [inconsistent-pic-reg]
293
294 On x86/Darwin, PIC is implemented by inserting a sequence like
295
296 call 1f
297 1: popl %reg
298
299 at the proc entry point, and then referring to labels as offsets from
300 %reg. If we don't split proc points, then we could have many entry
301 points in a proc that would need this sequence, and each entry point
302 would then get a different value for %reg. If there are any join
303 points, then at the join point we don't have a consistent value for
304 %reg, so we don't know how to refer to labels.
305
306 Hence, on x86/Darwin, we have to split proc points, and then each proc
307 point will get its own PIC initialisation sequence.
308
309 The situation is the same for ppc/Darwin. We use essentially the same
310 sequence to load the program counter onto reg:
311
312 bcl 20,31,1f
313 1: mflr reg
314
315 This isn't an issue on x86/ELF, where the sequence is
316
317 call 1f
318 1: popl %reg
319 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
320
321 so %reg always has a consistent value: the address of
322 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
323
324 -}
325
326 {- Note [unreachable blocks]
327
328 The control-flow optimiser sometimes leaves unreachable blocks behind
329 containing junk code. If these blocks make it into the native code
330 generator then they trigger a register allocator panic because they
331 refer to undefined LocalRegs, so we must eliminate any unreachable
332 blocks before passing the code onwards.
333
334 -}
335
336 runUniqSM :: UniqSM a -> IO a
337 runUniqSM m = do
338 us <- mkSplitUniqSupply 'u'
339 return (initUs_ us m)
340
341
342 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
343 dumpGraph dflags flag name g = do
344 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
345 dumpWith dflags flag name g
346 where
347 do_lint g = case cmmLintGraph dflags g of
348 Just err -> do { fatalErrorMsg dflags err
349 ; ghcExit dflags 1
350 }
351 Nothing -> return ()
352
353 dumpWith :: Outputable a => DynFlags -> DumpFlag -> String -> a -> IO ()
354 dumpWith dflags flag txt g = do
355 -- ToDo: No easy way of say "dump all the cmm, *and* split
356 -- them into files." Also, -ddump-cmm doesn't play nicely
357 -- with -ddump-to-file, since the headers get omitted.
358 dumpIfSet_dyn dflags flag txt (ppr g)
359 when (not (dopt flag dflags)) $
360 dumpIfSet_dyn dflags Opt_D_dump_cmm txt (ppr g)