Remove unnecessary LANGUAGE pragma
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 module CmmPipeline (
2 -- | Converts C-- with an implicit stack and native C-- calls into
3 -- optimized, CPS converted and native-call-less C--. The latter
4 -- C-- can be used to generate assembly.
5 cmmPipeline
6 ) where
7
8 import Cmm
9 import CmmLint
10 import CmmBuildInfoTables
11 import CmmCommonBlockElim
12 import CmmProcPoint
13 import CmmContFlowOpt
14 import CmmLayoutStack
15 import CmmSink
16 import Hoopl
17
18 import UniqSupply
19 import DynFlags
20 import ErrUtils
21 import HscTypes
22 import Control.Monad
23 import Outputable
24 import Platform
25
26 -----------------------------------------------------------------------------
27 -- | Top level driver for C-- pipeline
28 -----------------------------------------------------------------------------
29
30 cmmPipeline :: HscEnv -- Compilation env including
31 -- dynamic flags: -dcmm-lint -ddump-cps-cmm
32 -> TopSRT -- SRT table and accumulating list of compiled procs
33 -> CmmGroup -- Input C-- with Procedures
34 -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
35
36 cmmPipeline hsc_env topSRT prog =
37 do let dflags = hsc_dflags hsc_env
38
39 showPass dflags "CPSZ"
40
41 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
42
43 (topSRT, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
44 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" cmms
45
46 return (topSRT, cmms)
47
48
49 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
50 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
51 cpsTop hsc_env proc =
52 do
53 ----------- Control-flow optimisations ----------------------------------
54
55 -- The first round of control-flow optimisation speeds up the
56 -- later passes by removing lots of empty blocks, so we do it
57 -- even when optimisation isn't turned on.
58 --
59 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
60 return $ cmmCfgOptsProc splitting_proc_points proc
61 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
62
63 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
64 , do_layout = do_layout }} = h
65
66 ----------- Eliminate common blocks -------------------------------------
67 g <- {-# SCC "elimCommonBlocks" #-}
68 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
69 Opt_D_dump_cmm_cbe "Post common block elimination"
70
71 -- Any work storing block Labels must be performed _after_
72 -- elimCommonBlocks
73
74 ----------- Proc points -------------------------------------------------
75 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
76 proc_points <-
77 if splitting_proc_points
78 then do
79 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
80 minimalProcPointSet (targetPlatform dflags) call_pps g
81 dumpIfSet_dyn dflags Opt_D_dump_cmm "Proc points"
82 (ppr l $$ ppr pp $$ ppr g)
83 return pp
84 else
85 return call_pps
86
87 let noncall_pps = proc_points `setDifference` call_pps
88 when (not (setNull noncall_pps) && dopt Opt_D_dump_cmm dflags) $
89 pprTrace "Non-call proc points: " (ppr noncall_pps) $ return ()
90
91 ----------- Layout the stack and manifest Sp ----------------------------
92 (g, stackmaps) <-
93 {-# SCC "layoutStack" #-}
94 if do_layout
95 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
96 else return (g, mapEmpty)
97 dump Opt_D_dump_cmm_sp "Layout Stack" g
98
99 ----------- Sink and inline assignments --------------------------------
100 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
101 condPass Opt_CmmSink (cmmSink dflags) g
102 Opt_D_dump_cmm_sink "Sink assignments"
103
104 ------------- CAF analysis ----------------------------------------------
105 let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
106 dumpIfSet_dyn dflags Opt_D_dump_cmm "CAFEnv" (ppr cafEnv)
107
108 if splitting_proc_points
109 then do
110 ------------- Split into separate procedures -----------------------
111 pp_map <- {-# SCC "procPointAnalysis" #-} runUniqSM $
112 procPointAnalysis proc_points g
113 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" pp_map
114 gs <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
115 splitAtProcPoints dflags l call_pps proc_points pp_map
116 (CmmProc h l v g)
117 dumps Opt_D_dump_cmm_split "Post splitting" gs
118
119 ------------- Populate info tables with stack info -----------------
120 gs <- {-# SCC "setInfoTableStackMap" #-}
121 return $ map (setInfoTableStackMap dflags stackmaps) gs
122 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" gs
123
124 ----------- Control-flow optimisations -----------------------------
125 gs <- {-# SCC "cmmCfgOpts(2)" #-}
126 return $ if optLevel dflags >= 1
127 then map (cmmCfgOptsProc splitting_proc_points) gs
128 else gs
129 gs <- return (map removeUnreachableBlocksProc gs)
130 -- Note [unreachable blocks]
131 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" gs
132
133 return (cafEnv, gs)
134
135 else do
136 -- attach info tables to return points
137 g <- return $ attachContInfoTables call_pps (CmmProc h l v g)
138
139 ------------- Populate info tables with stack info -----------------
140 g <- {-# SCC "setInfoTableStackMap" #-}
141 return $ setInfoTableStackMap dflags stackmaps g
142 dump' Opt_D_dump_cmm_info "after setInfoTableStackMap" g
143
144 ----------- Control-flow optimisations -----------------------------
145 g <- {-# SCC "cmmCfgOpts(2)" #-}
146 return $ if optLevel dflags >= 1
147 then cmmCfgOptsProc splitting_proc_points g
148 else g
149 g <- return (removeUnreachableBlocksProc g)
150 -- Note [unreachable blocks]
151 dump' Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
152
153 return (cafEnv, [g])
154
155 where dflags = hsc_dflags hsc_env
156 platform = targetPlatform dflags
157 dump = dumpGraph dflags
158 dump' = dumpWith dflags
159
160 dumps flag name
161 = mapM_ (dumpWith dflags flag name)
162
163 condPass flag pass g dumpflag dumpname =
164 if gopt flag dflags
165 then do
166 g <- return $ pass g
167 dump dumpflag dumpname g
168 return g
169 else return g
170
171
172 -- we don't need to split proc points for the NCG, unless
173 -- tablesNextToCode is off. The latter is because we have no
174 -- label to put on info tables for basic blocks that are not
175 -- the entry point.
176 splitting_proc_points = hscTarget dflags /= HscAsm
177 || not (tablesNextToCode dflags)
178 || -- Note [inconsistent-pic-reg]
179 usingInconsistentPicReg
180 usingInconsistentPicReg
181 = case (platformArch platform, platformOS platform, gopt Opt_PIC dflags)
182 of (ArchX86, OSDarwin, pic) -> pic
183 (ArchPPC, OSDarwin, pic) -> pic
184 _ -> False
185
186 -- Note [Sinking after stack layout]
187 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
188 --
189 -- In the past we considered running sinking pass also before stack
190 -- layout, but after making some measurements we realized that:
191 --
192 -- a) running sinking only before stack layout produces slower
193 -- code than running sinking only before stack layout
194 --
195 -- b) running sinking both before and after stack layout produces
196 -- code that has the same performance as when running sinking
197 -- only after stack layout.
198 --
199 -- In other words sinking before stack layout doesn't buy as anything.
200 --
201 -- An interesting question is "why is it better to run sinking after
202 -- stack layout"? It seems that the major reason are stores and loads
203 -- generated by stack layout. Consider this code before stack layout:
204 --
205 -- c1E:
206 -- _c1C::P64 = R3;
207 -- _c1B::P64 = R2;
208 -- _c1A::P64 = R1;
209 -- I64[(young<c1D> + 8)] = c1D;
210 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
211 -- c1D:
212 -- R3 = _c1C::P64;
213 -- R2 = _c1B::P64;
214 -- R1 = _c1A::P64;
215 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
216 --
217 -- Stack layout pass will save all local variables live across a call
218 -- (_c1C, _c1B and _c1A in this example) on the stack just before
219 -- making a call and reload them from the stack after returning from a
220 -- call:
221 --
222 -- c1E:
223 -- _c1C::P64 = R3;
224 -- _c1B::P64 = R2;
225 -- _c1A::P64 = R1;
226 -- I64[Sp - 32] = c1D;
227 -- P64[Sp - 24] = _c1A::P64;
228 -- P64[Sp - 16] = _c1B::P64;
229 -- P64[Sp - 8] = _c1C::P64;
230 -- Sp = Sp - 32;
231 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
232 -- c1D:
233 -- _c1A::P64 = P64[Sp + 8];
234 -- _c1B::P64 = P64[Sp + 16];
235 -- _c1C::P64 = P64[Sp + 24];
236 -- R3 = _c1C::P64;
237 -- R2 = _c1B::P64;
238 -- R1 = _c1A::P64;
239 -- Sp = Sp + 32;
240 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
241 --
242 -- If we don't run sinking pass after stack layout we are basically
243 -- left with such code. However, running sinking on this code can lead
244 -- to significant improvements:
245 --
246 -- c1E:
247 -- I64[Sp - 32] = c1D;
248 -- P64[Sp - 24] = R1;
249 -- P64[Sp - 16] = R2;
250 -- P64[Sp - 8] = R3;
251 -- Sp = Sp - 32;
252 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
253 -- c1D:
254 -- R3 = P64[Sp + 24];
255 -- R2 = P64[Sp + 16];
256 -- R1 = P64[Sp + 8];
257 -- Sp = Sp + 32;
258 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
259 --
260 -- Now we only have 9 assignments instead of 15.
261 --
262 -- There is one case when running sinking before stack layout could
263 -- be beneficial. Consider this:
264 --
265 -- L1:
266 -- x = y
267 -- call f() returns L2
268 -- L2: ...x...y...
269 --
270 -- Since both x and y are live across a call to f, they will be stored
271 -- on the stack during stack layout and restored after the call:
272 --
273 -- L1:
274 -- x = y
275 -- P64[Sp - 24] = L2
276 -- P64[Sp - 16] = x
277 -- P64[Sp - 8] = y
278 -- Sp = Sp - 24
279 -- call f() returns L2
280 -- L2:
281 -- y = P64[Sp + 16]
282 -- x = P64[Sp + 8]
283 -- Sp = Sp + 24
284 -- ...x...y...
285 --
286 -- However, if we run sinking before stack layout we would propagate x
287 -- to its usage place (both x and y must be local register for this to
288 -- be possible - global registers cannot be floated past a call):
289 --
290 -- L1:
291 -- x = y
292 -- call f() returns L2
293 -- L2: ...y...y...
294 --
295 -- Thus making x dead at the call to f(). If we ran stack layout now
296 -- we would generate less stores and loads:
297 --
298 -- L1:
299 -- x = y
300 -- P64[Sp - 16] = L2
301 -- P64[Sp - 8] = y
302 -- Sp = Sp - 16
303 -- call f() returns L2
304 -- L2:
305 -- y = P64[Sp + 8]
306 -- Sp = Sp + 16
307 -- ...y...y...
308 --
309 -- But since we don't see any benefits from running sinking befroe stack
310 -- layout, this situation probably doesn't arise too often in practice.
311 --
312
313 {- Note [inconsistent-pic-reg]
314
315 On x86/Darwin, PIC is implemented by inserting a sequence like
316
317 call 1f
318 1: popl %reg
319
320 at the proc entry point, and then referring to labels as offsets from
321 %reg. If we don't split proc points, then we could have many entry
322 points in a proc that would need this sequence, and each entry point
323 would then get a different value for %reg. If there are any join
324 points, then at the join point we don't have a consistent value for
325 %reg, so we don't know how to refer to labels.
326
327 Hence, on x86/Darwin, we have to split proc points, and then each proc
328 point will get its own PIC initialisation sequence.
329
330 The situation is the same for ppc/Darwin. We use essentially the same
331 sequence to load the program counter onto reg:
332
333 bcl 20,31,1f
334 1: mflr reg
335
336 This isn't an issue on x86/ELF, where the sequence is
337
338 call 1f
339 1: popl %reg
340 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
341
342 so %reg always has a consistent value: the address of
343 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
344
345 -}
346
347 {- Note [unreachable blocks]
348
349 The control-flow optimiser sometimes leaves unreachable blocks behind
350 containing junk code. If these blocks make it into the native code
351 generator then they trigger a register allocator panic because they
352 refer to undefined LocalRegs, so we must eliminate any unreachable
353 blocks before passing the code onwards.
354
355 -}
356
357 runUniqSM :: UniqSM a -> IO a
358 runUniqSM m = do
359 us <- mkSplitUniqSupply 'u'
360 return (initUs_ us m)
361
362
363 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
364 dumpGraph dflags flag name g = do
365 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
366 dumpWith dflags flag name g
367 where
368 do_lint g = case cmmLintGraph dflags g of
369 Just err -> do { fatalErrorMsg dflags err
370 ; ghcExit dflags 1
371 }
372 Nothing -> return ()
373
374 dumpWith :: Outputable a => DynFlags -> DumpFlag -> String -> a -> IO ()
375 dumpWith dflags flag txt g = do
376 -- ToDo: No easy way of say "dump all the cmm, *and* split
377 -- them into files." Also, -ddump-cmm doesn't play nicely
378 -- with -ddump-to-file, since the headers get omitted.
379 dumpIfSet_dyn dflags flag txt (ppr g)
380 when (not (dopt flag dflags)) $
381 dumpIfSet_dyn dflags Opt_D_dump_cmm txt (ppr g)