Remove trailing whitespace
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 {-# LANGUAGE BangPatterns #-}
2
3 module CmmPipeline (
4 -- | Converts C-- with an implicit stack and native C-- calls into
5 -- optimized, CPS converted and native-call-less C--. The latter
6 -- C-- can be used to generate assembly.
7 cmmPipeline
8 ) where
9
10 import GhcPrelude
11
12 import Cmm
13 import CmmLint
14 import CmmBuildInfoTables
15 import CmmCommonBlockElim
16 import CmmImplementSwitchPlans
17 import CmmProcPoint
18 import CmmContFlowOpt
19 import CmmLayoutStack
20 import CmmSink
21 import Hoopl.Collections
22
23 import UniqSupply
24 import DynFlags
25 import ErrUtils
26 import HscTypes
27 import Control.Monad
28 import Outputable
29 import Platform
30
31 -----------------------------------------------------------------------------
32 -- | Top level driver for C-- pipeline
33 -----------------------------------------------------------------------------
34
35 cmmPipeline
36 :: HscEnv -- Compilation env including
37 -- dynamic flags: -dcmm-lint -ddump-cmm-cps
38 -> ModuleSRTInfo -- Info about SRTs generated so far
39 -> CmmGroup -- Input C-- with Procedures
40 -> IO (ModuleSRTInfo, CmmGroup) -- Output CPS transformed C--
41
42 cmmPipeline hsc_env srtInfo prog =
43 do let dflags = hsc_dflags hsc_env
44
45 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
46
47 (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo tops
48 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" (ppr cmms)
49
50 return (srtInfo, cmms)
51
52
53 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
54 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
55 cpsTop hsc_env proc =
56 do
57 ----------- Control-flow optimisations ----------------------------------
58
59 -- The first round of control-flow optimisation speeds up the
60 -- later passes by removing lots of empty blocks, so we do it
61 -- even when optimisation isn't turned on.
62 --
63 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
64 return $ cmmCfgOptsProc splitting_proc_points proc
65 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
66
67 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
68 , do_layout = do_layout }} = h
69
70 ----------- Eliminate common blocks -------------------------------------
71 g <- {-# SCC "elimCommonBlocks" #-}
72 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
73 Opt_D_dump_cmm_cbe "Post common block elimination"
74
75 -- Any work storing block Labels must be performed _after_
76 -- elimCommonBlocks
77
78 g <- {-# SCC "createSwitchPlans" #-}
79 runUniqSM $ cmmImplementSwitchPlans dflags g
80 dump Opt_D_dump_cmm_switch "Post switch plan" g
81
82 ----------- Proc points -------------------------------------------------
83 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
84 proc_points <-
85 if splitting_proc_points
86 then do
87 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
88 minimalProcPointSet (targetPlatform dflags) call_pps g
89 dumpWith dflags Opt_D_dump_cmm_proc "Proc points"
90 (ppr l $$ ppr pp $$ ppr g)
91 return pp
92 else
93 return call_pps
94
95 ----------- Layout the stack and manifest Sp ----------------------------
96 (g, stackmaps) <-
97 {-# SCC "layoutStack" #-}
98 if do_layout
99 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
100 else return (g, mapEmpty)
101 dump Opt_D_dump_cmm_sp "Layout Stack" g
102
103 ----------- Sink and inline assignments --------------------------------
104 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
105 condPass Opt_CmmSink (cmmSink dflags) g
106 Opt_D_dump_cmm_sink "Sink assignments"
107
108 ------------- CAF analysis ----------------------------------------------
109 let cafEnv = {-# SCC "cafAnal" #-} cafAnal call_pps l g
110 dumpWith dflags Opt_D_dump_cmm_caf "CAFEnv" (ppr cafEnv)
111
112 g <- if splitting_proc_points
113 then do
114 ------------- Split into separate procedures -----------------------
115 let pp_map = {-# SCC "procPointAnalysis" #-}
116 procPointAnalysis proc_points g
117 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" $
118 ppr pp_map
119 g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
120 splitAtProcPoints dflags l call_pps proc_points pp_map
121 (CmmProc h l v g)
122 dumps Opt_D_dump_cmm_split "Post splitting" g
123 return g
124 else do
125 -- attach info tables to return points
126 return $ [attachContInfoTables call_pps (CmmProc h l v g)]
127
128 ------------- Populate info tables with stack info -----------------
129 g <- {-# SCC "setInfoTableStackMap" #-}
130 return $ map (setInfoTableStackMap dflags stackmaps) g
131 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
132
133 ----------- Control-flow optimisations -----------------------------
134 g <- {-# SCC "cmmCfgOpts(2)" #-}
135 return $ if optLevel dflags >= 1
136 then map (cmmCfgOptsProc splitting_proc_points) g
137 else g
138 g <- return (map removeUnreachableBlocksProc g)
139 -- See Note [unreachable blocks]
140 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
141
142 return (cafEnv, g)
143
144 where dflags = hsc_dflags hsc_env
145 platform = targetPlatform dflags
146 dump = dumpGraph dflags
147
148 dumps flag name
149 = mapM_ (dumpWith dflags flag name . ppr)
150
151 condPass flag pass g dumpflag dumpname =
152 if gopt flag dflags
153 then do
154 g <- return $ pass g
155 dump dumpflag dumpname g
156 return g
157 else return g
158
159 -- we don't need to split proc points for the NCG, unless
160 -- tablesNextToCode is off. The latter is because we have no
161 -- label to put on info tables for basic blocks that are not
162 -- the entry point.
163 splitting_proc_points = hscTarget dflags /= HscAsm
164 || not (tablesNextToCode dflags)
165 || -- Note [inconsistent-pic-reg]
166 usingInconsistentPicReg
167 usingInconsistentPicReg
168 = case (platformArch platform, platformOS platform, positionIndependent dflags)
169 of (ArchX86, OSDarwin, pic) -> pic
170 _ -> False
171
172 -- Note [Sinking after stack layout]
173 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
174 --
175 -- In the past we considered running sinking pass also before stack
176 -- layout, but after making some measurements we realized that:
177 --
178 -- a) running sinking only before stack layout produces slower
179 -- code than running sinking only before stack layout
180 --
181 -- b) running sinking both before and after stack layout produces
182 -- code that has the same performance as when running sinking
183 -- only after stack layout.
184 --
185 -- In other words sinking before stack layout doesn't buy as anything.
186 --
187 -- An interesting question is "why is it better to run sinking after
188 -- stack layout"? It seems that the major reason are stores and loads
189 -- generated by stack layout. Consider this code before stack layout:
190 --
191 -- c1E:
192 -- _c1C::P64 = R3;
193 -- _c1B::P64 = R2;
194 -- _c1A::P64 = R1;
195 -- I64[(young<c1D> + 8)] = c1D;
196 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
197 -- c1D:
198 -- R3 = _c1C::P64;
199 -- R2 = _c1B::P64;
200 -- R1 = _c1A::P64;
201 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
202 --
203 -- Stack layout pass will save all local variables live across a call
204 -- (_c1C, _c1B and _c1A in this example) on the stack just before
205 -- making a call and reload them from the stack after returning from a
206 -- call:
207 --
208 -- c1E:
209 -- _c1C::P64 = R3;
210 -- _c1B::P64 = R2;
211 -- _c1A::P64 = R1;
212 -- I64[Sp - 32] = c1D;
213 -- P64[Sp - 24] = _c1A::P64;
214 -- P64[Sp - 16] = _c1B::P64;
215 -- P64[Sp - 8] = _c1C::P64;
216 -- Sp = Sp - 32;
217 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
218 -- c1D:
219 -- _c1A::P64 = P64[Sp + 8];
220 -- _c1B::P64 = P64[Sp + 16];
221 -- _c1C::P64 = P64[Sp + 24];
222 -- R3 = _c1C::P64;
223 -- R2 = _c1B::P64;
224 -- R1 = _c1A::P64;
225 -- Sp = Sp + 32;
226 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
227 --
228 -- If we don't run sinking pass after stack layout we are basically
229 -- left with such code. However, running sinking on this code can lead
230 -- to significant improvements:
231 --
232 -- c1E:
233 -- I64[Sp - 32] = c1D;
234 -- P64[Sp - 24] = R1;
235 -- P64[Sp - 16] = R2;
236 -- P64[Sp - 8] = R3;
237 -- Sp = Sp - 32;
238 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
239 -- c1D:
240 -- R3 = P64[Sp + 24];
241 -- R2 = P64[Sp + 16];
242 -- R1 = P64[Sp + 8];
243 -- Sp = Sp + 32;
244 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
245 --
246 -- Now we only have 9 assignments instead of 15.
247 --
248 -- There is one case when running sinking before stack layout could
249 -- be beneficial. Consider this:
250 --
251 -- L1:
252 -- x = y
253 -- call f() returns L2
254 -- L2: ...x...y...
255 --
256 -- Since both x and y are live across a call to f, they will be stored
257 -- on the stack during stack layout and restored after the call:
258 --
259 -- L1:
260 -- x = y
261 -- P64[Sp - 24] = L2
262 -- P64[Sp - 16] = x
263 -- P64[Sp - 8] = y
264 -- Sp = Sp - 24
265 -- call f() returns L2
266 -- L2:
267 -- y = P64[Sp + 16]
268 -- x = P64[Sp + 8]
269 -- Sp = Sp + 24
270 -- ...x...y...
271 --
272 -- However, if we run sinking before stack layout we would propagate x
273 -- to its usage place (both x and y must be local register for this to
274 -- be possible - global registers cannot be floated past a call):
275 --
276 -- L1:
277 -- x = y
278 -- call f() returns L2
279 -- L2: ...y...y...
280 --
281 -- Thus making x dead at the call to f(). If we ran stack layout now
282 -- we would generate less stores and loads:
283 --
284 -- L1:
285 -- x = y
286 -- P64[Sp - 16] = L2
287 -- P64[Sp - 8] = y
288 -- Sp = Sp - 16
289 -- call f() returns L2
290 -- L2:
291 -- y = P64[Sp + 8]
292 -- Sp = Sp + 16
293 -- ...y...y...
294 --
295 -- But since we don't see any benefits from running sinking befroe stack
296 -- layout, this situation probably doesn't arise too often in practice.
297 --
298
299 {- Note [inconsistent-pic-reg]
300
301 On x86/Darwin, PIC is implemented by inserting a sequence like
302
303 call 1f
304 1: popl %reg
305
306 at the proc entry point, and then referring to labels as offsets from
307 %reg. If we don't split proc points, then we could have many entry
308 points in a proc that would need this sequence, and each entry point
309 would then get a different value for %reg. If there are any join
310 points, then at the join point we don't have a consistent value for
311 %reg, so we don't know how to refer to labels.
312
313 Hence, on x86/Darwin, we have to split proc points, and then each proc
314 point will get its own PIC initialisation sequence.
315
316 This isn't an issue on x86/ELF, where the sequence is
317
318 call 1f
319 1: popl %reg
320 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
321
322 so %reg always has a consistent value: the address of
323 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
324
325 -}
326
327 {- Note [unreachable blocks]
328
329 The control-flow optimiser sometimes leaves unreachable blocks behind
330 containing junk code. These aren't necessarily a problem, but
331 removing them is good because it might save time in the native code
332 generator later.
333
334 -}
335
336 runUniqSM :: UniqSM a -> IO a
337 runUniqSM m = do
338 us <- mkSplitUniqSupply 'u'
339 return (initUs_ us m)
340
341
342 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
343 dumpGraph dflags flag name g = do
344 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
345 dumpWith dflags flag name (ppr g)
346 where
347 do_lint g = case cmmLintGraph dflags g of
348 Just err -> do { fatalErrorMsg dflags err
349 ; ghcExit dflags 1
350 }
351 Nothing -> return ()
352
353 dumpWith :: DynFlags -> DumpFlag -> String -> SDoc -> IO ()
354 dumpWith dflags flag txt sdoc = do
355 -- ToDo: No easy way of say "dump all the cmm, *and* split
356 -- them into files." Also, -ddump-cmm-verbose doesn't play
357 -- nicely with -ddump-to-file, since the headers get omitted.
358 dumpIfSet_dyn dflags flag txt sdoc
359 when (not (dopt flag dflags)) $
360 dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose txt sdoc