Fix Ticky histogram on Windows
[ghc.git] / compiler / cmm / CmmPipeline.hs
1 {-# LANGUAGE BangPatterns #-}
2
3 module CmmPipeline (
4 -- | Converts C-- with an implicit stack and native C-- calls into
5 -- optimized, CPS converted and native-call-less C--. The latter
6 -- C-- can be used to generate assembly.
7 cmmPipeline
8 ) where
9
10 import Cmm
11 import CmmLint
12 import CmmBuildInfoTables
13 import CmmCommonBlockElim
14 import CmmImplementSwitchPlans
15 import CmmProcPoint
16 import CmmContFlowOpt
17 import CmmLayoutStack
18 import CmmSink
19 import Hoopl
20
21 import UniqSupply
22 import DynFlags
23 import ErrUtils
24 import HscTypes
25 import Control.Monad
26 import Outputable
27 import Platform
28
29 -----------------------------------------------------------------------------
30 -- | Top level driver for C-- pipeline
31 -----------------------------------------------------------------------------
32
33 cmmPipeline :: HscEnv -- Compilation env including
34 -- dynamic flags: -dcmm-lint -ddump-cps-cmm
35 -> TopSRT -- SRT table and accumulating list of compiled procs
36 -> CmmGroup -- Input C-- with Procedures
37 -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
38
39 cmmPipeline hsc_env topSRT prog =
40 do let dflags = hsc_dflags hsc_env
41
42 tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
43
44 (topSRT, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
45 dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" cmms
46
47 return (topSRT, cmms)
48
49
50 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
51 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
52 cpsTop hsc_env proc =
53 do
54 ----------- Control-flow optimisations ----------------------------------
55
56 -- The first round of control-flow optimisation speeds up the
57 -- later passes by removing lots of empty blocks, so we do it
58 -- even when optimisation isn't turned on.
59 --
60 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
61 return $ cmmCfgOptsProc splitting_proc_points proc
62 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
63
64 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
65 , do_layout = do_layout }} = h
66
67 ----------- Eliminate common blocks -------------------------------------
68 g <- {-# SCC "elimCommonBlocks" #-}
69 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
70 Opt_D_dump_cmm_cbe "Post common block elimination"
71
72 -- Any work storing block Labels must be performed _after_
73 -- elimCommonBlocks
74
75 g <- {-# SCC "createSwitchPlans" #-}
76 runUniqSM $ cmmImplementSwitchPlans dflags g
77 dump Opt_D_dump_cmm_switch "Post switch plan" g
78
79 ----------- Proc points -------------------------------------------------
80 let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
81 proc_points <-
82 if splitting_proc_points
83 then do
84 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
85 minimalProcPointSet (targetPlatform dflags) call_pps g
86 dumpIfSet_dyn dflags Opt_D_dump_cmm "Proc points"
87 (ppr l $$ ppr pp $$ ppr g)
88 return pp
89 else
90 return call_pps
91
92 ----------- Layout the stack and manifest Sp ----------------------------
93 (g, stackmaps) <-
94 {-# SCC "layoutStack" #-}
95 if do_layout
96 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
97 else return (g, mapEmpty)
98 dump Opt_D_dump_cmm_sp "Layout Stack" g
99
100 ----------- Sink and inline assignments --------------------------------
101 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
102 condPass Opt_CmmSink (cmmSink dflags) g
103 Opt_D_dump_cmm_sink "Sink assignments"
104
105 ------------- CAF analysis ----------------------------------------------
106 let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
107 dumpIfSet_dyn dflags Opt_D_dump_cmm "CAFEnv" (ppr cafEnv)
108
109 g <- if splitting_proc_points
110 then do
111 ------------- Split into separate procedures -----------------------
112 pp_map <- {-# SCC "procPointAnalysis" #-} runUniqSM $
113 procPointAnalysis proc_points g
114 dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" pp_map
115 g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
116 splitAtProcPoints dflags l call_pps proc_points pp_map
117 (CmmProc h l v g)
118 dumps Opt_D_dump_cmm_split "Post splitting" g
119 return g
120 else do
121 -- attach info tables to return points
122 return $ [attachContInfoTables call_pps (CmmProc h l v g)]
123
124 ------------- Populate info tables with stack info -----------------
125 g <- {-# SCC "setInfoTableStackMap" #-}
126 return $ map (setInfoTableStackMap dflags stackmaps) g
127 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
128
129 ----------- Control-flow optimisations -----------------------------
130 g <- {-# SCC "cmmCfgOpts(2)" #-}
131 return $ if optLevel dflags >= 1
132 then map (cmmCfgOptsProc splitting_proc_points) g
133 else g
134 g <- return (map removeUnreachableBlocksProc g)
135 -- See Note [unreachable blocks]
136 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
137
138 return (cafEnv, g)
139
140 where dflags = hsc_dflags hsc_env
141 platform = targetPlatform dflags
142 dump = dumpGraph dflags
143
144 dumps flag name
145 = mapM_ (dumpWith dflags flag name)
146
147 condPass flag pass g dumpflag dumpname =
148 if gopt flag dflags
149 then do
150 g <- return $ pass g
151 dump dumpflag dumpname g
152 return g
153 else return g
154
155
156 -- we don't need to split proc points for the NCG, unless
157 -- tablesNextToCode is off. The latter is because we have no
158 -- label to put on info tables for basic blocks that are not
159 -- the entry point.
160 splitting_proc_points = hscTarget dflags /= HscAsm
161 || not (tablesNextToCode dflags)
162 || -- Note [inconsistent-pic-reg]
163 usingInconsistentPicReg
164 usingInconsistentPicReg
165 = case (platformArch platform, platformOS platform, gopt Opt_PIC dflags)
166 of (ArchX86, OSDarwin, pic) -> pic
167 (ArchPPC, OSDarwin, pic) -> pic
168 _ -> False
169
170 -- Note [Sinking after stack layout]
171 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
172 --
173 -- In the past we considered running sinking pass also before stack
174 -- layout, but after making some measurements we realized that:
175 --
176 -- a) running sinking only before stack layout produces slower
177 -- code than running sinking only before stack layout
178 --
179 -- b) running sinking both before and after stack layout produces
180 -- code that has the same performance as when running sinking
181 -- only after stack layout.
182 --
183 -- In other words sinking before stack layout doesn't buy as anything.
184 --
185 -- An interesting question is "why is it better to run sinking after
186 -- stack layout"? It seems that the major reason are stores and loads
187 -- generated by stack layout. Consider this code before stack layout:
188 --
189 -- c1E:
190 -- _c1C::P64 = R3;
191 -- _c1B::P64 = R2;
192 -- _c1A::P64 = R1;
193 -- I64[(young<c1D> + 8)] = c1D;
194 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
195 -- c1D:
196 -- R3 = _c1C::P64;
197 -- R2 = _c1B::P64;
198 -- R1 = _c1A::P64;
199 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
200 --
201 -- Stack layout pass will save all local variables live across a call
202 -- (_c1C, _c1B and _c1A in this example) on the stack just before
203 -- making a call and reload them from the stack after returning from a
204 -- call:
205 --
206 -- c1E:
207 -- _c1C::P64 = R3;
208 -- _c1B::P64 = R2;
209 -- _c1A::P64 = R1;
210 -- I64[Sp - 32] = c1D;
211 -- P64[Sp - 24] = _c1A::P64;
212 -- P64[Sp - 16] = _c1B::P64;
213 -- P64[Sp - 8] = _c1C::P64;
214 -- Sp = Sp - 32;
215 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
216 -- c1D:
217 -- _c1A::P64 = P64[Sp + 8];
218 -- _c1B::P64 = P64[Sp + 16];
219 -- _c1C::P64 = P64[Sp + 24];
220 -- R3 = _c1C::P64;
221 -- R2 = _c1B::P64;
222 -- R1 = _c1A::P64;
223 -- Sp = Sp + 32;
224 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
225 --
226 -- If we don't run sinking pass after stack layout we are basically
227 -- left with such code. However, running sinking on this code can lead
228 -- to significant improvements:
229 --
230 -- c1E:
231 -- I64[Sp - 32] = c1D;
232 -- P64[Sp - 24] = R1;
233 -- P64[Sp - 16] = R2;
234 -- P64[Sp - 8] = R3;
235 -- Sp = Sp - 32;
236 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
237 -- c1D:
238 -- R3 = P64[Sp + 24];
239 -- R2 = P64[Sp + 16];
240 -- R1 = P64[Sp + 8];
241 -- Sp = Sp + 32;
242 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
243 --
244 -- Now we only have 9 assignments instead of 15.
245 --
246 -- There is one case when running sinking before stack layout could
247 -- be beneficial. Consider this:
248 --
249 -- L1:
250 -- x = y
251 -- call f() returns L2
252 -- L2: ...x...y...
253 --
254 -- Since both x and y are live across a call to f, they will be stored
255 -- on the stack during stack layout and restored after the call:
256 --
257 -- L1:
258 -- x = y
259 -- P64[Sp - 24] = L2
260 -- P64[Sp - 16] = x
261 -- P64[Sp - 8] = y
262 -- Sp = Sp - 24
263 -- call f() returns L2
264 -- L2:
265 -- y = P64[Sp + 16]
266 -- x = P64[Sp + 8]
267 -- Sp = Sp + 24
268 -- ...x...y...
269 --
270 -- However, if we run sinking before stack layout we would propagate x
271 -- to its usage place (both x and y must be local register for this to
272 -- be possible - global registers cannot be floated past a call):
273 --
274 -- L1:
275 -- x = y
276 -- call f() returns L2
277 -- L2: ...y...y...
278 --
279 -- Thus making x dead at the call to f(). If we ran stack layout now
280 -- we would generate less stores and loads:
281 --
282 -- L1:
283 -- x = y
284 -- P64[Sp - 16] = L2
285 -- P64[Sp - 8] = y
286 -- Sp = Sp - 16
287 -- call f() returns L2
288 -- L2:
289 -- y = P64[Sp + 8]
290 -- Sp = Sp + 16
291 -- ...y...y...
292 --
293 -- But since we don't see any benefits from running sinking befroe stack
294 -- layout, this situation probably doesn't arise too often in practice.
295 --
296
297 {- Note [inconsistent-pic-reg]
298
299 On x86/Darwin, PIC is implemented by inserting a sequence like
300
301 call 1f
302 1: popl %reg
303
304 at the proc entry point, and then referring to labels as offsets from
305 %reg. If we don't split proc points, then we could have many entry
306 points in a proc that would need this sequence, and each entry point
307 would then get a different value for %reg. If there are any join
308 points, then at the join point we don't have a consistent value for
309 %reg, so we don't know how to refer to labels.
310
311 Hence, on x86/Darwin, we have to split proc points, and then each proc
312 point will get its own PIC initialisation sequence.
313
314 The situation is the same for ppc/Darwin. We use essentially the same
315 sequence to load the program counter onto reg:
316
317 bcl 20,31,1f
318 1: mflr reg
319
320 This isn't an issue on x86/ELF, where the sequence is
321
322 call 1f
323 1: popl %reg
324 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
325
326 so %reg always has a consistent value: the address of
327 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
328
329 -}
330
331 {- Note [unreachable blocks]
332
333 The control-flow optimiser sometimes leaves unreachable blocks behind
334 containing junk code. These aren't necessarily a problem, but
335 removing them is good because it might save time in the native code
336 generator later.
337
338 -}
339
340 runUniqSM :: UniqSM a -> IO a
341 runUniqSM m = do
342 us <- mkSplitUniqSupply 'u'
343 return (initUs_ us m)
344
345
346 dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
347 dumpGraph dflags flag name g = do
348 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
349 dumpWith dflags flag name g
350 where
351 do_lint g = case cmmLintGraph dflags g of
352 Just err -> do { fatalErrorMsg dflags err
353 ; ghcExit dflags 1
354 }
355 Nothing -> return ()
356
357 dumpWith :: Outputable a => DynFlags -> DumpFlag -> String -> a -> IO ()
358 dumpWith dflags flag txt g = do
359 -- ToDo: No easy way of say "dump all the cmm, *and* split
360 -- them into files." Also, -ddump-cmm doesn't play nicely
361 -- with -ddump-to-file, since the headers get omitted.
362 dumpIfSet_dyn dflags flag txt (ppr g)
363 when (not (dopt flag dflags)) $
364 dumpIfSet_dyn dflags Opt_D_dump_cmm txt (ppr g)