{-# LANGUAGE BangPatterns #-}
-- | Converts C-- with an implicit stack and native C-- calls into
-- optimized, CPS converted and native-call-less C--.  The latter
-- C-- can be used to generate assembly.
import CmmBuildInfoTables
import CmmCommonBlockElim
import CmmImplementSwitchPlans
import Hoopl.Collections
-----------------------------------------------------------------------------
-- | Top level driver for C-- pipeline
-----------------------------------------------------------------------------
cmmPipeline
 :: HscEnv               -- Compilation env including
                         -- dynamic flags: -dcmm-lint -ddump-cmm-cps
 -> ModuleSRTInfo        -- Info about SRTs generated so far
 -> CmmGroup             -- Input C-- with Procedures
 -> IO (ModuleSRTInfo, CmmGroup) -- Output CPS transformed C--
cmmPipeline hsc_env srtInfo prog =
  do let dflags = hsc_dflags hsc_env

     tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog

     (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo tops
     dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" (ppr cmms)

     return (srtInfo, cmms)
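-- A minimal sketch of how a caller might thread the 'ModuleSRTInfo' through
-- repeated calls to 'cmmPipeline'.  'pipelineAll' and its arguments are
-- hypothetical, illustration-only names, not part of this module; the point
-- is just that the SRT info returned by one call is fed into the next, so
-- SRT state accumulates across the whole module:
--
--   pipelineAll :: HscEnv -> ModuleSRTInfo -> [CmmGroup] -> IO [CmmGroup]
--   pipelineAll _       _       []           = return []
--   pipelineAll hsc_env srtInfo (grp : grps) = do
--     (srtInfo', grp') <- cmmPipeline hsc_env srtInfo grp
--     grps'            <- pipelineAll hsc_env srtInfo' grps
--     return (grp' : grps')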
cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
cpsTop hsc_env proc =
    do
       ----------- Control-flow optimisations ----------------------------------

       -- The first round of control-flow optimisation speeds up the
       -- later passes by removing lots of empty blocks, so we do it
       -- even when optimisation isn't turned on.
       CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
            return $ cmmCfgOptsProc splitting_proc_points proc
       dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
       let !TopInfo {stack_info=StackInfo { arg_space = entry_off
                                          , do_layout = do_layout }} = h
       ----------- Eliminate common blocks -------------------------------------
       g <- {-# SCC "elimCommonBlocks" #-}
            condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
                     Opt_D_dump_cmm_cbe "Post common block elimination"
       -- Any work storing block Labels must be performed _after_
       -- elimCommonBlocks.
       g <- {-# SCC "createSwitchPlans" #-}
            runUniqSM $ cmmImplementSwitchPlans dflags g
       dump Opt_D_dump_cmm_switch "Post switch plan" g
       ----------- Proc points -------------------------------------------------
       let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
       proc_points <-
          if splitting_proc_points
             then do
               pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
                  minimalProcPointSet (targetPlatform dflags) call_pps g
               dumpWith dflags Opt_D_dump_cmm_proc "Proc points"
                     (ppr l $$ ppr pp $$ ppr g)
               return pp
             else
               return call_pps
       ----------- Layout the stack and manifest Sp ----------------------------
       (g, stackmaps) <-
            {-# SCC "layoutStack" #-}
            if do_layout
               then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
               else return (g, mapEmpty)
       dump Opt_D_dump_cmm_sp "Layout Stack" g
       ----------- Sink and inline assignments ---------------------------------
       g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
            condPass Opt_CmmSink (cmmSink dflags) g
                     Opt_D_dump_cmm_sink "Sink assignments"
       ------------- CAF analysis ----------------------------------------------
       let cafEnv = {-# SCC "cafAnal" #-} cafAnal call_pps l g
       dumpWith dflags Opt_D_dump_cmm_caf "CAFEnv" (ppr cafEnv)
       g <- if splitting_proc_points
            then do
              ------------- Split into separate procedures -----------------------
              let pp_map = {-# SCC "procPointAnalysis" #-}
                           procPointAnalysis proc_points g
              dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" $
                   ppr pp_map
              g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
                   splitAtProcPoints dflags l call_pps proc_points pp_map
                                     (CmmProc h l v g)
              dumps Opt_D_dump_cmm_split "Post splitting" g
              return g
            else do
              -- attach info tables to return points
              return $ [attachContInfoTables call_pps (CmmProc h l v g)]
       ------------- Populate info tables with stack info -----------------
       g <- {-# SCC "setInfoTableStackMap" #-}
            return $ map (setInfoTableStackMap dflags stackmaps) g
       dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
       ----------- Control-flow optimisations -----------------------------
       g <- {-# SCC "cmmCfgOpts(2)" #-}
            return $ if optLevel dflags >= 1
                     then map (cmmCfgOptsProc splitting_proc_points) g
                     else g
       g <- return (map removeUnreachableBlocksProc g)
            -- See Note [unreachable blocks]
       dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g

       return (cafEnv, g)
  where dflags = hsc_dflags hsc_env
        platform = targetPlatform dflags
        dump = dumpGraph dflags

        dumps flag name
           = mapM_ (dumpWith dflags flag name . ppr)
        condPass flag pass g dumpflag dumpname =
            if gopt flag dflags
               then do
                    g <- return $ pass g
                    dump dumpflag dumpname g
                    return g
               else return g
        -- we don't need to split proc points for the NCG, unless
        -- tablesNextToCode is off.  The latter is because we have no
        -- label to put on info tables for basic blocks that are not
        -- the entry point.
        splitting_proc_points = hscTarget dflags /= HscAsm
                             || not (tablesNextToCode dflags)
                             || -- Note [inconsistent-pic-reg]
                                usingInconsistentPicReg
        usingInconsistentPicReg
           = case (platformArch platform, platformOS platform, positionIndependent dflags)
             of   (ArchX86, OSDarwin, pic) -> pic
                  (ArchPPC, OSDarwin, pic) -> pic
                  _                        -> False
-- Note [Sinking after stack layout]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- In the past we considered running the sinking pass also before stack
-- layout, but after making some measurements we realized that:
--
--   a) running sinking only before stack layout produces slower
--      code than running sinking only after stack layout
--
--   b) running sinking both before and after stack layout produces
--      code that has the same performance as when running sinking
--      only after stack layout.
--
-- In other words, sinking before stack layout doesn't buy us anything.
--
-- An interesting question is "why is it better to run sinking after
-- stack layout?"  It seems that the major reason is the stores and loads
-- generated by stack layout.  Consider this code before stack layout:
--
--   I64[(young<c1D> + 8)] = c1D;
--   call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--   call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- The stack layout pass will save all local variables live across a call
-- (_c1C, _c1B and _c1A in this example) on the stack just before
-- making a call and reload them from the stack after returning from a
-- call:
--
--   I64[Sp - 32] = c1D;
--   P64[Sp - 24] = _c1A::P64;
--   P64[Sp - 16] = _c1B::P64;
--   P64[Sp - 8]  = _c1C::P64;
--   call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--   _c1A::P64 = P64[Sp + 8];
--   _c1B::P64 = P64[Sp + 16];
--   _c1C::P64 = P64[Sp + 24];
--   call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- If we don't run the sinking pass after stack layout we are basically
-- left with such code.  However, running sinking on this code can lead
-- to significant improvements:
--
--   I64[Sp - 32] = c1D;
--   P64[Sp - 24] = R1;
--   P64[Sp - 16] = R2;
--   call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--   R3 = P64[Sp + 24];
--   R2 = P64[Sp + 16];
--   call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- Now we only have 9 assignments instead of 15.
--
-- There is one case when running sinking before stack layout could
-- be beneficial: a block that copies one local into another, say x = y,
-- and then makes a call ("call f() returns L2"), with both x and y used
-- again after the call.  Since both x and y are live across the call to
-- f, they will be stored on the stack during stack layout and restored
-- after the call.  However, if we run sinking before stack layout we
-- would propagate x to its usage place (both x and y must be local
-- registers for this to be possible - global registers cannot be
-- floated past a call), thus making x dead at the call to f().  If we
-- ran stack layout now we would generate fewer stores and loads, since
-- only y would need to be saved and restored.
--
-- But since we don't see any benefits from running sinking before stack
-- layout, this situation probably doesn't arise too often in practice.
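{- A toy, self-contained sketch (NOT GHC's actual sinking pass, which lives
in CmmSink) of the kind of rewrite Note [Sinking after stack layout]
describes: in a straight-line block, an assignment to a local that is used
exactly once, immediately afterwards, can be inlined into its use site and
dropped.  All types and names below are invented for this illustration.

    data Expr = Var String | Load Expr | Lit Int | Add Expr Expr
      deriving (Eq, Show)

    data Stmt = Assign String Expr
      deriving (Eq, Show)

    -- Variables read by an expression.
    varsOf :: Expr -> [String]
    varsOf (Var v)   = [v]
    varsOf (Load e)  = varsOf e
    varsOf (Lit _)   = []
    varsOf (Add a b) = varsOf a ++ varsOf b

    -- Substitute an expression for a variable.
    subst :: String -> Expr -> Expr -> Expr
    subst v e (Var v') | v == v' = e
    subst v e (Load e')          = Load (subst v e e')
    subst v e (Add a b)          = Add (subst v e a) (subst v e b)
    subst _ _ other              = other

    -- Occurrences of a variable in the right-hand sides of a block.
    usesIn :: String -> [Stmt] -> Int
    usesIn v = length . filter (== v) . concatMap (\(Assign _ e) -> varsOf e)

    -- Sink the first assignment into the statement that immediately
    -- follows it, provided that is its only remaining use.
    sinkOnce :: [Stmt] -> [Stmt]
    sinkOnce (Assign v e : Assign w rhs : rest)
      | v `elem` varsOf rhs              -- used by the next statement ...
      , usesIn v rest == 0               -- ... and nowhere after that
      = Assign w (subst v e rhs) : rest
    sinkOnce stmts = stmts

For example, the reload-then-use pattern from the note,

    sinkOnce [ Assign "_c1A" (Load (Var "Sp"))
             , Assign "R1" (Var "_c1A") ]

rewrites to [ Assign "R1" (Load (Var "Sp")) ], which is the shape of the
improvement shown above: one fewer assignment and no intermediate local.
-}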
{- Note [inconsistent-pic-reg]

On x86/Darwin, PIC is implemented by inserting a sequence like

    call 1f
 1: popl %reg

at the proc entry point, and then referring to labels as offsets from
%reg.  If we don't split proc points, then we could have many entry
points in a proc that would need this sequence, and each entry point
would then get a different value for %reg.  If there are any join
points, then at the join point we don't have a consistent value for
%reg, so we don't know how to refer to labels.

Hence, on x86/Darwin, we have to split proc points, and then each proc
point will get its own PIC initialisation sequence.

The situation is the same for ppc/Darwin.  We use essentially the same
sequence to load the program counter into reg:

    bcl  20,31,1f
 1: mflr reg

This isn't an issue on x86/ELF, where the sequence is

    call 1f
 1: popl %reg
    addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg

so %reg always has a consistent value: the address of
_GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.

-}
{- Note [unreachable blocks]

The control-flow optimiser sometimes leaves unreachable blocks behind
containing junk code.  These aren't necessarily a problem, but
removing them is good because it might save time in the native code
generator later on.

-}
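{- An illustrative, self-contained sketch (not GHC's actual
'removeUnreachableBlocksProc') of what removing unreachable blocks means:
keep only the blocks reachable from the entry label by following successor
edges.  The 'Label', 'Block', and function names below are invented for
this example only.

    import qualified Data.Map.Strict as M
    import qualified Data.Set as S

    type Label = Int
    data Block = Block { blockLabel :: Label, blockSuccs :: [Label] }

    -- Labels reachable from the entry by a simple worklist traversal.
    reachableFrom :: Label -> M.Map Label Block -> S.Set Label
    reachableFrom entry blocks = go S.empty [entry]
      where
        go seen []       = seen
        go seen (l : ls)
          | l `S.member` seen = go seen ls
          | otherwise         = go (S.insert l seen)
                                   (maybe [] blockSuccs (M.lookup l blocks) ++ ls)

    -- Drop every block whose label is not reachable from the entry.
    removeUnreachable :: Label -> M.Map Label Block -> M.Map Label Block
    removeUnreachable entry blocks =
      M.restrictKeys blocks (reachableFrom entry blocks)
-}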
runUniqSM :: UniqSM a -> IO a
runUniqSM m = do
  us <- mkSplitUniqSupply 'u'
  return (initUs_ us m)
dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
dumpGraph dflags flag name g = do
  when (gopt Opt_DoCmmLinting dflags) $ do_lint g
  dumpWith dflags flag name (ppr g)
 where
  do_lint g = case cmmLintGraph dflags g of
                 Just err -> do { fatalErrorMsg dflags err
                                ; ghcExit dflags 1
                                }
                 Nothing  -> return ()
dumpWith :: DynFlags -> DumpFlag -> String -> SDoc -> IO ()
dumpWith dflags flag txt sdoc = do
  -- ToDo: No easy way of saying "dump all the cmm, *and* split
  -- them into files."  Also, -ddump-cmm-verbose doesn't play
  -- nicely with -ddump-to-file, since the headers get omitted.
  dumpIfSet_dyn dflags flag txt sdoc
  when (not (dopt flag dflags)) $
    dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose txt sdoc