1 {-# LANGUAGE BangPatterns #-}
4 -- | Converts C-- with an implicit stack and native C-- calls into
5 -- optimized, CPS converted and native-call-less C--. The latter
6 -- C-- can be used to generate assembly.
14 import CmmBuildInfoTables
15 import CmmCommonBlockElim
16 import CmmImplementSwitchPlans
21 import Hoopl
.Collections
31 -----------------------------------------------------------------------------
32 -- | Top level driver for C-- pipeline
33 -----------------------------------------------------------------------------
35 cmmPipeline
:: HscEnv
-- Compilation env including
36 -- dynamic flags: -dcmm-lint -ddump-cmm-cps
37 -> TopSRT
-- SRT table and accumulating list of compiled procs
38 -> CmmGroup
-- Input C-- with Procedures
39 -> IO (TopSRT
, CmmGroup
) -- Output CPS transformed C--
41 cmmPipeline hsc_env topSRT prog
=
42 do let dflags
= hsc_dflags hsc_env
44 tops
<- {-# SCC "tops" #-} mapM (cpsTop hsc_env
) prog
46 (topSRT
, cmms
) <- {-# SCC "doSRTs" #-} doSRTs dflags topSRT tops
47 dumpWith dflags Opt_D_dump_cmm_cps
"Post CPS Cmm" (ppr cmms
)
52 cpsTop
:: HscEnv
-> CmmDecl
-> IO (CAFEnv
, [CmmDecl
])
53 cpsTop _ p
@(CmmData
{}) = return (mapEmpty
, [p
])
56 ----------- Control-flow optimisations ----------------------------------
58 -- The first round of control-flow optimisation speeds up the
59 -- later passes by removing lots of empty blocks, so we do it
60 -- even when optimisation isn't turned on.
62 CmmProc h l v g
<- {-# SCC "cmmCfgOpts(1)" #-}
63 return $ cmmCfgOptsProc splitting_proc_points proc
64 dump Opt_D_dump_cmm_cfg
"Post control-flow optimisations" g
66 let !TopInfo
{stack_info
=StackInfo
{ arg_space
= entry_off
67 , do_layout
= do_layout
}} = h
69 ----------- Eliminate common blocks -------------------------------------
70 g
<- {-# SCC "elimCommonBlocks" #-}
71 condPass Opt_CmmElimCommonBlocks
(elimCommonBlocks dflags
) g
72 Opt_D_dump_cmm_cbe
"Post common block elimination"
74 -- Any work storing block Labels must be performed _after_
77 g
<- {-# SCC "createSwitchPlans" #-}
78 runUniqSM
$ cmmImplementSwitchPlans dflags g
79 dump Opt_D_dump_cmm_switch
"Post switch plan" g
81 ----------- Proc points -------------------------------------------------
82 let call_pps
= {-# SCC "callProcPoints" #-} callProcPoints g
84 if splitting_proc_points
86 pp
<- {-# SCC "minimalProcPointSet" #-} runUniqSM
$
87 minimalProcPointSet
(targetPlatform dflags
) call_pps g
88 dumpWith dflags Opt_D_dump_cmm_proc
"Proc points"
89 (ppr l
$$ ppr pp
$$ ppr g
)
94 ----------- Layout the stack and manifest Sp ----------------------------
96 {-# SCC "layoutStack" #-}
98 then runUniqSM
$ cmmLayoutStack dflags proc_points entry_off g
99 else return (g
, mapEmpty
)
100 dump Opt_D_dump_cmm_sp
"Layout Stack" g
102 ----------- Sink and inline assignments --------------------------------
103 g
<- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
104 condPass Opt_CmmSink
(cmmSink dflags
) g
105 Opt_D_dump_cmm_sink
"Sink assignments"
107 ------------- CAF analysis ----------------------------------------------
108 let cafEnv
= {-# SCC "cafAnal" #-} cafAnal g
109 dumpWith dflags Opt_D_dump_cmm_caf
"CAFEnv" (ppr cafEnv
)
111 g
<- if splitting_proc_points
113 ------------- Split into separate procedures -----------------------
114 let pp_map
= {-# SCC "procPointAnalysis" #-}
115 procPointAnalysis proc_points g
116 dumpWith dflags Opt_D_dump_cmm_procmap
"procpoint map" $
118 g
<- {-# SCC "splitAtProcPoints" #-} runUniqSM
$
119 splitAtProcPoints dflags l call_pps proc_points pp_map
121 dumps Opt_D_dump_cmm_split
"Post splitting" g
124 -- attach info tables to return points
125 return $ [attachContInfoTables call_pps
(CmmProc h l v g
)]
127 ------------- Populate info tables with stack info -----------------
128 g
<- {-# SCC "setInfoTableStackMap" #-}
129 return $ map (setInfoTableStackMap dflags stackmaps
) g
130 dumps Opt_D_dump_cmm_info
"after setInfoTableStackMap" g
132 ----------- Control-flow optimisations -----------------------------
133 g
<- {-# SCC "cmmCfgOpts(2)" #-}
134 return $ if optLevel dflags
>= 1
135 then map (cmmCfgOptsProc splitting_proc_points
) g
137 g
<- return (map removeUnreachableBlocksProc g
)
138 -- See Note [unreachable blocks]
139 dumps Opt_D_dump_cmm_cfg
"Post control-flow optimisations" g
143 where dflags
= hsc_dflags hsc_env
144 platform
= targetPlatform dflags
145 dump
= dumpGraph dflags
148 = mapM_ (dumpWith dflags flag name
. ppr
)
150 condPass flag pass g dumpflag dumpname
=
154 dump dumpflag dumpname g
159 -- we don't need to split proc points for the NCG, unless
160 -- tablesNextToCode is off. The latter is because we have no
161 -- label to put on info tables for basic blocks that are not
163 splitting_proc_points
= hscTarget dflags
/= HscAsm
164 ||
not (tablesNextToCode dflags
)
165 ||
-- Note [inconsistent-pic-reg]
166 usingInconsistentPicReg
167 usingInconsistentPicReg
168 = case (platformArch platform
, platformOS platform
, positionIndependent dflags
)
169 of (ArchX86
, OSDarwin
, pic
) -> pic
170 (ArchPPC
, OSDarwin
, pic
) -> pic
173 -- Note [Sinking after stack layout]
174 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
176 -- In the past we considered running sinking pass also before stack
177 -- layout, but after making some measurements we realized that:
179 -- a) running sinking only before stack layout produces slower
180 -- code than running sinking only before stack layout
182 -- b) running sinking both before and after stack layout produces
183 -- code that has the same performance as when running sinking
184 -- only after stack layout.
186 -- In other words sinking before stack layout doesn't buy as anything.
188 -- An interesting question is "why is it better to run sinking after
189 -- stack layout"? It seems that the major reason are stores and loads
190 -- generated by stack layout. Consider this code before stack layout:
196 -- I64[(young<c1D> + 8)] = c1D;
197 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
202 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
204 -- Stack layout pass will save all local variables live across a call
205 -- (_c1C, _c1B and _c1A in this example) on the stack just before
206 -- making a call and reload them from the stack after returning from a
213 -- I64[Sp - 32] = c1D;
214 -- P64[Sp - 24] = _c1A::P64;
215 -- P64[Sp - 16] = _c1B::P64;
216 -- P64[Sp - 8] = _c1C::P64;
218 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
220 -- _c1A::P64 = P64[Sp + 8];
221 -- _c1B::P64 = P64[Sp + 16];
222 -- _c1C::P64 = P64[Sp + 24];
227 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
229 -- If we don't run sinking pass after stack layout we are basically
230 -- left with such code. However, running sinking on this code can lead
231 -- to significant improvements:
234 -- I64[Sp - 32] = c1D;
235 -- P64[Sp - 24] = R1;
236 -- P64[Sp - 16] = R2;
239 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
241 -- R3 = P64[Sp + 24];
242 -- R2 = P64[Sp + 16];
245 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
247 -- Now we only have 9 assignments instead of 15.
249 -- There is one case when running sinking before stack layout could
250 -- be beneficial. Consider this:
254 -- call f() returns L2
257 -- Since both x and y are live across a call to f, they will be stored
258 -- on the stack during stack layout and restored after the call:
266 -- call f() returns L2
273 -- However, if we run sinking before stack layout we would propagate x
274 -- to its usage place (both x and y must be local register for this to
275 -- be possible - global registers cannot be floated past a call):
279 -- call f() returns L2
282 -- Thus making x dead at the call to f(). If we ran stack layout now
283 -- we would generate less stores and loads:
290 -- call f() returns L2
296 -- But since we don't see any benefits from running sinking befroe stack
297 -- layout, this situation probably doesn't arise too often in practice.
300 {- Note [inconsistent-pic-reg]
302 On x86/Darwin, PIC is implemented by inserting a sequence like
307 at the proc entry point, and then referring to labels as offsets from
308 %reg. If we don't split proc points, then we could have many entry
309 points in a proc that would need this sequence, and each entry point
310 would then get a different value for %reg. If there are any join
311 points, then at the join point we don't have a consistent value for
312 %reg, so we don't know how to refer to labels.
314 Hence, on x86/Darwin, we have to split proc points, and then each proc
315 point will get its own PIC initialisation sequence.
317 The situation is the same for ppc/Darwin. We use essentially the same
318 sequence to load the program counter onto reg:
323 This isn't an issue on x86/ELF, where the sequence is
327 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
329 so %reg always has a consistent value: the address of
330 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
334 {- Note [unreachable blocks]
336 The control-flow optimiser sometimes leaves unreachable blocks behind
337 containing junk code. These aren't necessarily a problem, but
338 removing them is good because it might save time in the native code
343 runUniqSM
:: UniqSM a
-> IO a
345 us
<- mkSplitUniqSupply
'u
'
346 return (initUs_ us m
)
349 dumpGraph
:: DynFlags
-> DumpFlag
-> String -> CmmGraph
-> IO ()
350 dumpGraph dflags flag name g
= do
351 when (gopt Opt_DoCmmLinting dflags
) $ do_lint g
352 dumpWith dflags flag name
(ppr g
)
354 do_lint g
= case cmmLintGraph dflags g
of
355 Just err
-> do { fatalErrorMsg dflags err
360 dumpWith
:: DynFlags
-> DumpFlag
-> String -> SDoc
-> IO ()
361 dumpWith dflags flag txt sdoc
= do
362 -- ToDo: No easy way of say "dump all the cmm, *and* split
363 -- them into files." Also, -ddump-cmm-verbose doesn't play
364 -- nicely with -ddump-to-file, since the headers get omitted.
365 dumpIfSet_dyn dflags flag txt sdoc
366 when (not (dopt flag dflags
)) $
367 dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose txt sdoc