1 -- -----------------------------------------------------------------------------
3 -- (c) The University of Glasgow 1993-2004
5 -- This is the top-level module in the native code generator.
7 -- -----------------------------------------------------------------------------
10 {-# OPTIONS -fno-warn-tabs #-}
11 -- The above warning suppression flag is a temporary kludge.
12 -- While working on this module you are encouraged to remove it and
13 -- detab the module (please do the detabbing in a separate patch). See
14 -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
17 module AsmCodeGen ( nativeCodeGen ) where
19 #include "HsVersions.h"
20 #include "nativeGen/NCG.h"
23 import qualified X86.CodeGen
24 import qualified X86.Regs
25 import qualified X86.Instr
26 import qualified X86.Ppr
28 import qualified SPARC.CodeGen
29 import qualified SPARC.Regs
30 import qualified SPARC.Instr
31 import qualified SPARC.Ppr
32 import qualified SPARC.ShortcutJump
33 import qualified SPARC.CodeGen.Expand
35 import qualified PPC.CodeGen
36 import qualified PPC.Cond
37 import qualified PPC.Regs
38 import qualified PPC.RegInfo
39 import qualified PPC.Instr
40 import qualified PPC.Ppr
42 import RegAlloc.Liveness
43 import qualified RegAlloc.Linear.Main as Linear
45 import qualified GraphColor as Color
46 import qualified RegAlloc.Graph.Main as Color
47 import qualified RegAlloc.Graph.Stats as Color
48 import qualified RegAlloc.Graph.TrivColorable as Color
59 import CgUtils ( fixStgRegisters )
61 import CmmOpt ( cmmEliminateDeadBlocks, cmmMiniInline, cmmMachOpFold )
66 import Unique ( Unique, getUnique )
72 import BasicTypes ( Alignment )
75 import qualified Pretty
92 The native-code generator has machine-independent and
93 machine-dependent modules.
95 This module ("AsmCodeGen") is the top-level machine-independent
96 module. Before entering machine-dependent land, we do some
97 machine-independent optimisations (defined below) on the
100 We convert to the machine-specific 'Instr' datatype with
101 'cmmCodeGen', assuming an infinite supply of registers. We then use
102 a machine-independent register allocator ('regAlloc') to rejoin
103 reality. Obviously, 'regAlloc' has machine-specific helper
104 functions (see about "RegAllocInfo" below).
106 Finally, we order the basic blocks of the function so as to minimise
107 the number of jumps between blocks, by utilising fallthrough wherever
110 The machine-dependent bits break down as follows:
112 * ["MachRegs"] Everything about the target platform's machine
113 registers (and immediate operands, and addresses, which tend to
114 intermingle/interact with registers).
116 * ["MachInstrs"] Includes the 'Instr' datatype (possibly should
117 have a module of its own), plus a miscellany of other things
118 (e.g., 'targetDoubleSize', 'smStablePtrTable', ...)
120 * ["MachCodeGen"] is where 'Cmm' stuff turns into
121 machine instructions.
123 * ["PprMach"] 'pprInstr' turns an 'Instr' into text (well, really
126 * ["RegAllocInfo"] In the register allocator, we manipulate
127 'MRegsState's, which are 'BitSet's, one bit per machine register.
128 When we want to say something about a specific machine register
129 (e.g., ``it gets clobbered by this instruction''), we set/unset
130 its bit. Obviously, we do this 'BitSet' thing for efficiency
133 The 'RegAllocInfo' module collects together the machine-specific
134 info needed to do register allocation.
136 * ["RegisterAlloc"] The (machine-independent) register allocator.
139 -- -----------------------------------------------------------------------------
140 -- Top-level of the native codegen
-- | Per-architecture hooks consumed by the machine-independent driver
-- below.  Each target (X86, PPC, SPARC) builds one of these records in
-- 'nativeCodeGen', keeping the pipeline generic over the target's
-- @statics@, @instr@ and @jumpDest@ types.
-- NOTE(review): the extract is missing lines here (closing brace of the
-- record is not visible) — confirm against the full file.
142 data NcgImpl statics instr jumpDest = NcgImpl {
    -- code generation and jump-table synthesis for one top-level decl
143 cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
144 generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr),
    -- branch-shortcutting support (see 'shortcutBranches' below)
145 getJumpDestBlockId :: jumpDest -> Maybe BlockId,
146 canShortcut :: instr -> Maybe jumpDest,
147 shortcutStatics :: (BlockId -> Maybe jumpDest) -> statics -> statics,
148 shortcutJump :: (BlockId -> Maybe jumpDest) -> instr -> instr,
    -- pretty-printing of the finished native code
149 pprNatCmmDecl :: Platform -> NatCmmDecl statics instr -> Doc,
    -- register-allocator parameters for this target
150 maxSpillSlots :: Int,
151 allocatableRegs :: [RealReg],
    -- target-specific fixup passes; 'id' on targets that don't need them
152 ncg_x86fp_kludge :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
153 ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
154 ncgMakeFarBranches :: [NatBasicBlock instr] -> [NatBasicBlock instr]
-- | Entry point of the native code generator: pick the 'NcgImpl' for
-- the target architecture in 'DynFlags' and run the shared driver
-- 'nativeCodeGen'' with it.  Unsupported architectures panic.
-- NOTE(review): this extract omits several lines of the original
-- (e.g. parts of the record braces and some case alternatives' heads).
158 nativeCodeGen :: DynFlags -> Handle -> UniqSupply -> [RawCmmGroup] -> IO ()
159 nativeCodeGen dflags h us cmms
160 = let platform = targetPlatform dflags
161 nCG' :: (PlatformOutputable statics, PlatformOutputable instr, Instruction instr) => NcgImpl statics instr jumpDest -> IO ()
162 nCG' ncgImpl = nativeCodeGen' dflags ncgImpl h us cmms
       -- shared by the 32- and 64-bit x86 cases below
163 x86NcgImpl = NcgImpl {
164 cmmTopCodeGen = X86.CodeGen.cmmTopCodeGen
165 ,generateJumpTableForInstr = X86.CodeGen.generateJumpTableForInstr
166 ,getJumpDestBlockId = X86.Instr.getJumpDestBlockId
167 ,canShortcut = X86.Instr.canShortcut
168 ,shortcutStatics = X86.Instr.shortcutStatics
169 ,shortcutJump = X86.Instr.shortcutJump
170 ,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl
171 ,maxSpillSlots = X86.Instr.maxSpillSlots (target32Bit platform)
172 ,allocatableRegs = X86.Regs.allocatableRegs
173 ,ncg_x86fp_kludge = id
175 ,ncgMakeFarBranches = id
177 in case platformArch platform of
       -- only 32-bit x86 needs the x87 FPU-stack fixup pass
178 ArchX86 -> nCG' (x86NcgImpl { ncg_x86fp_kludge = map x86fp_kludge })
179 ArchX86_64 -> nCG' x86NcgImpl
       -- PPC implementation record (head of this alternative not visible)
182 cmmTopCodeGen = PPC.CodeGen.cmmTopCodeGen
183 ,generateJumpTableForInstr = PPC.CodeGen.generateJumpTableForInstr
184 ,getJumpDestBlockId = PPC.RegInfo.getJumpDestBlockId
185 ,canShortcut = PPC.RegInfo.canShortcut
186 ,shortcutStatics = PPC.RegInfo.shortcutStatics
187 ,shortcutJump = PPC.RegInfo.shortcutJump
188 ,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl
189 ,maxSpillSlots = PPC.Instr.maxSpillSlots
190 ,allocatableRegs = PPC.Regs.allocatableRegs
191 ,ncg_x86fp_kludge = id
       -- PPC conditional branches have limited reach; see 'makeFarBranches'
193 ,ncgMakeFarBranches = makeFarBranches
       -- SPARC implementation record (head of this alternative not visible)
197 cmmTopCodeGen = SPARC.CodeGen.cmmTopCodeGen
198 ,generateJumpTableForInstr = SPARC.CodeGen.generateJumpTableForInstr
199 ,getJumpDestBlockId = SPARC.ShortcutJump.getJumpDestBlockId
200 ,canShortcut = SPARC.ShortcutJump.canShortcut
201 ,shortcutStatics = SPARC.ShortcutJump.shortcutStatics
202 ,shortcutJump = SPARC.ShortcutJump.shortcutJump
203 ,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl
204 ,maxSpillSlots = SPARC.Instr.maxSpillSlots
205 ,allocatableRegs = SPARC.Regs.allocatableRegs
206 ,ncg_x86fp_kludge = id
       -- SPARC synthetic instructions are expanded after allocation
207 ,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop
208 ,ncgMakeFarBranches = id
       -- no NCG exists for the remaining architectures
211 panic "nativeCodeGen: No NCG for ARM"
213 panic "nativeCodeGen: No NCG for PPC 64"
215 panic "nativeCodeGen: No NCG for Alpha"
217 panic "nativeCodeGen: No NCG for mipseb"
219 panic "nativeCodeGen: No NCG for mipsel"
221 panic "nativeCodeGen: No NCG for unknown arch"
-- | Architecture-generic driver: run 'cmmNativeGens' over every Cmm
-- group, dump the requested debug output and allocator statistics, and
-- finally emit the import table via 'makeImportsDoc'.
-- NOTE(review): several lines are missing from this extract (e.g. the
-- 'do' keyword and parts of the stats-dumping code) — the visible lines
-- are annotated as-is.
223 nativeCodeGen' :: (PlatformOutputable statics, PlatformOutputable instr, Instruction instr)
225 -> NcgImpl statics instr jumpDest
226 -> Handle -> UniqSupply -> [RawCmmGroup] -> IO ()
227 nativeCodeGen' dflags ncgImpl h us cmms
229 let platform = targetPlatform dflags
       -- with -split-objs, a split marker is prepended to each group
230 split_cmms = concat $ map add_split cmms
231 -- BufHandle is a performance hack. We could hide it inside
232 -- Pretty if it weren't for the fact that we do lots of little
233 -- printDocs here (in order to do codegen in constant space).
234 bufh <- newBufHandle h
235 (imports, prof) <- cmmNativeGens dflags ncgImpl bufh us split_cmms [] [] 0
238 let (native, colorStats, linearStats)
       -- dump of the generated assembly, when -ddump-asm is on
243 Opt_D_dump_asm "Asm code"
244 (vcat $ map (docToSDoc . pprNatCmmDecl ncgImpl platform) $ concat native)
246 -- dump global NCG stats for graph coloring allocator
247 (case concat $ catMaybes colorStats of
250 -- build the global register conflict graph
252 = foldl Color.union Color.initGraph
253 $ [ Color.raGraph stat
254 | stat@Color.RegAllocStatsStart{} <- stats]
256 dumpSDoc dflags Opt_D_dump_asm_stats "NCG stats"
257 $ Color.pprStats stats graphGlobal
260 Opt_D_dump_asm_conflicts "Register conflict graph"
262 (targetRegDotColor platform)
263 (Color.trivColorable platform
264 (targetVirtualRegSqueeze platform)
265 (targetRealRegSqueeze platform))
269 -- dump global NCG stats for linear allocator
270 (case concat $ catMaybes linearStats of
272 stats -> dumpSDoc dflags Opt_D_dump_asm_stats "NCG stats"
273 $ Linear.pprStats (concat native) stats)
275 -- write out the imports
276 Pretty.printDoc Pretty.LeftMode h
277 $ makeImportsDoc dflags (concat imports)
       -- local helpers (heads not visible in this extract):
282 | dopt Opt_SplitObjs dflags = split_marker : tops
       -- empty proc used purely as an object-splitting marker
285 split_marker = CmmProc Nothing mkSplitMarkerLabel (ListGraph [])
288 -- | Do native code generation on all these cmms.
-- Processes the decls one at a time, printing each result to the
-- 'BufHandle' immediately and strictly accumulating imports and
-- per-decl profiling info; the accumulators are reversed at the end.
-- NOTE(review): parts of the signature and body are missing from this
-- extract.
290 cmmNativeGens :: (PlatformOutputable statics, PlatformOutputable instr, Instruction instr)
292 -> NcgImpl statics instr jumpDest
297 -> [ ([NatCmmDecl statics instr],
298 Maybe [Color.RegAllocStats statics instr],
299 Maybe [Linear.RegAllocStats]) ]
302 [([NatCmmDecl statics instr],
303 Maybe [Color.RegAllocStats statics instr],
304 Maybe [Linear.RegAllocStats])] )
-- base case: accumulators were built in reverse, so flip them back
306 cmmNativeGens _ _ _ _ [] impAcc profAcc _
307 = return (reverse impAcc, reverse profAcc)
309 cmmNativeGens dflags ncgImpl h us (cmm : cmms) impAcc profAcc count
311 let platform = targetPlatform dflags
313 (us', native, imports, colorStats, linearStats)
314 <- cmmNativeGen dflags ncgImpl us cmm count
       -- emit this decl's assembly straight away (constant-space codegen)
316 Pretty.bufLeftRender h
317 $ {-# SCC "pprNativeCode" #-} Pretty.vcat $ map (pprNatCmmDecl ncgImpl platform) native
319 -- carefully evaluate this strictly. Binding it with 'let'
320 -- and then using 'seq' doesn't work, because the let
321 -- apparently gets inlined first.
322 lsPprNative <- return $!
323 if dopt Opt_D_dump_asm dflags
324 || dopt Opt_D_dump_asm_stats dflags
328 count' <- return $! count + 1;
330 -- force evaluation of all this stuff to avoid space leaks
331 seqString (showSDoc $ vcat $ map (pprPlatform platform) imports) `seq` return ()
       -- recurse over the remaining decls with the new unique supply
333 cmmNativeGens dflags ncgImpl
336 ((lsPprNative, colorStats, linearStats) : profAcc)
-- fully force a String, element by element
339 where seqString [] = ()
340 seqString (x:xs) = x `seq` seqString xs `seq` ()
343 -- | Complete native code generation phase for a single top-level chunk of Cmm.
344 -- Dumping the output of each stage along the way.
345 -- Global conflict graph and NCG stats
-- | Full NCG pipeline for a single top-level Cmm decl:
-- fixStgRegisters -> cmmToCmm -> instruction selection -> liveness ->
-- register allocation (graph-colouring or linear) -> x86 FPU kludge ->
-- jump tables -> branch shortcutting -> block sequencing -> SPARC
-- expansion, dumping intermediate results as requested by -ddump flags.
-- NOTE(review): this extract is missing many lines (function name line
-- of the signature, several 'dumpIfSet_dyn' heads, parts of the
-- allocator calls and the final return) — visible lines annotated as-is.
347 :: (PlatformOutputable statics, PlatformOutputable instr, Instruction instr)
349 -> NcgImpl statics instr jumpDest
351 -> RawCmmDecl -- ^ the cmm to generate code for
352 -> Int -- ^ sequence number of this top thing
354 , [NatCmmDecl statics instr] -- native code
355 , [CLabel] -- things imported by this cmm
356 , Maybe [Color.RegAllocStats statics instr] -- stats for the coloring register allocator
357 , Maybe [Linear.RegAllocStats]) -- stats for the linear register allocators
359 cmmNativeGen dflags ncgImpl us cmm count
361 let platform = targetPlatform dflags
363 -- rewrite assignments to global regs
365 {-# SCC "fixStgRegisters" #-}
368 -- cmm to cmm optimisations
369 let (opt_cmm, imports) =
370 {-# SCC "cmmToCmm" #-}
371 cmmToCmm dflags fixed_cmm
374 Opt_D_dump_opt_cmm "Optimised Cmm"
375 (pprCmmGroup platform [opt_cmm])
377 -- generate native code from cmm
378 let ((native, lastMinuteImports), usGen) =
379 {-# SCC "genMachCode" #-}
380 initUs us $ genMachCode dflags (cmmTopCodeGen ncgImpl) opt_cmm
383 Opt_D_dump_asm_native "Native code"
384 (vcat $ map (docToSDoc . pprNatCmmDecl ncgImpl platform) native)
386 -- tag instructions with register liveness information
387 let (withLiveness, usLive) =
388 {-# SCC "regLiveness" #-}
390 $ mapUs (regLiveness platform)
391 $ map natCmmTopToLive native
394 Opt_D_dump_asm_liveness "Liveness annotations added"
395 (vcat $ map (pprPlatform platform) withLiveness)
397 -- allocate registers
398 (alloced, usAlloc, ppr_raStatsColor, ppr_raStatsLinear) <-
       -- graph-colouring allocator branch (-fregs-graph / -fregs-iterative)
399 if ( dopt Opt_RegsGraph dflags
400 || dopt Opt_RegsIterative dflags)
402 -- the regs usable for allocation
403 let (alloc_regs :: UniqFM (UniqSet RealReg))
404 = foldr (\r -> plusUFM_C unionUniqSets
405 $ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r))
407 $ allocatableRegs ncgImpl
409 -- do the graph coloring register allocation
410 let ((alloced, regAllocStats), usAlloc)
411 = {-# SCC "RegAlloc" #-}
416 (mkUniqSet [0 .. maxSpillSlots ncgImpl])
419 -- dump out what happened during register allocation
421 Opt_D_dump_asm_regalloc "Registers allocated"
422 (vcat $ map (docToSDoc . pprNatCmmDecl ncgImpl platform) alloced)
425 Opt_D_dump_asm_regalloc_stages "Build/spill stages"
426 (vcat $ map (\(stage, stats)
427 -> text "# --------------------------"
428 $$ text "# cmm " <> int count <> text " Stage " <> int stage
429 $$ pprPlatform platform stats)
430 $ zip [0..] regAllocStats)
       -- only retain stats if they will actually be reported
433 if dopt Opt_D_dump_asm_stats dflags
434 then Just regAllocStats else Nothing
436 -- force evaluation of the Maybe to avoid space leak
437 mPprStats `seq` return ()
439 return ( alloced, usAlloc
       -- otherwise: the (default) linear allocator branch
444 -- do linear register allocation
445 let ((alloced, regAllocStats), usAlloc)
446 = {-# SCC "RegAlloc" #-}
449 $ mapUs (Linear.regAlloc dflags) withLiveness
452 Opt_D_dump_asm_regalloc "Registers allocated"
453 (vcat $ map (docToSDoc . pprNatCmmDecl ncgImpl platform) alloced)
456 if dopt Opt_D_dump_asm_stats dflags
457 then Just (catMaybes regAllocStats) else Nothing
459 -- force evaluation of the Maybe to avoid space leak
460 mPprStats `seq` return ()
462 return ( alloced, usAlloc
466 ---- x86fp_kludge. This pass inserts ffree instructions to clear
467 ---- the FPU stack on x86. The x86 ABI requires that the FPU stack
468 ---- is clear, and library functions can return odd results if it
471 ---- NB. must happen before shortcutBranches, because that
472 ---- generates JXX_GBLs which we can't fix up in x86fp_kludge.
473 let kludged = {-# SCC "x86fp_kludge" #-} ncg_x86fp_kludge ncgImpl alloced
475 ---- generate jump tables
477 {-# SCC "generateJumpTables" #-}
478 generateJumpTables ncgImpl kludged
480 ---- shortcut branches
482 {-# SCC "shortcutBranches" #-}
483 shortcutBranches dflags ncgImpl tabled
       -- block sequencing, to maximise fallthroughs
487 {-# SCC "sequenceBlocks" #-}
488 map (sequenceTop ncgImpl) shorted
490 ---- expansion of SPARC synthetic instrs
492 {-# SCC "sparc_expand" #-}
493 ncgExpandTop ncgImpl sequenced
496 Opt_D_dump_asm_expanded "Synthetic instructions expanded"
497 (vcat $ map (docToSDoc . pprNatCmmDecl ncgImpl platform) expanded)
501 , lastMinuteImports ++ imports
-- | 32-bit x86 only: insert @ffree@ instructions so the x87 FPU stack
-- is left clear where the ABI requires it (via
-- 'X86.Instr.i386_insert_ffrees').  Data sections pass through
-- unchanged.
x86fp_kludge :: NatCmmDecl (Alignment, CmmStatics) X86.Instr.Instr -> NatCmmDecl (Alignment, CmmStatics) X86.Instr.Instr
x86fp_kludge top
 = case top of
        CmmData _ _
          -> top
        CmmProc info lbl (ListGraph code)
          -> CmmProc info lbl (ListGraph (X86.Instr.i386_insert_ffrees code))
512 -- | Build a doc for all the imports.
-- Emits any needed dynamic-linking stubs plus platform-specific
-- trailer directives (Darwin subsections-via-symbols, GNU non-exec
-- stack note, and a .ident tag).
-- NOTE(review): several lines are missing from this extract (e.g. the
-- 'else' branches of the conditionals and the head of 'dyld_stubs').
514 makeImportsDoc :: DynFlags -> [CLabel] -> Pretty.Doc
515 makeImportsDoc dflags imports
518 -- On recent versions of Darwin, the linker supports
519 -- dead-stripping of code and data on a per-symbol basis.
520 -- There's a hack to make this work in PprMach.pprNatCmmDecl.
521 (if platformHasSubsectionsViaSymbols (targetPlatform dflags)
522 then Pretty.text ".subsections_via_symbols"
525 -- On recent GNU ELF systems one can mark an object file
526 -- as not requiring an executable stack. If all objects
527 -- linked into a program have this note then the program
528 -- will not use an executable stack, which is good for
529 -- security. GHC generated code does not need an executable
530 -- stack so add the note in:
531 (if platformHasGnuNonexecStack (targetPlatform dflags)
532 then Pretty.text ".section .note.GNU-stack,\"\",@progbits"
534 -- And just because every other compiler does, let's stick in
535 -- an identifier directive: .ident "GHC x.y.z"
536 Pretty.$$ let compilerIdent = Pretty.text "GHC" Pretty.<+>
537 Pretty.text cProjectVersion
538 in Pretty.text ".ident" Pretty.<+>
539 Pretty.doubleQuotes compilerIdent
542 -- Generate "symbol stubs" for all external symbols that might
543 -- come from a dynamic library.
544 dyld_stubs :: [CLabel] -> Pretty.Doc
545 {- dyld_stubs imps = Pretty.vcat $ map pprDyldSymbolStub $
546 map head $ group $ sort imps-}
548 platform = targetPlatform dflags
549 arch = platformArch platform
550 os = platformOS platform
552 -- (Hack) sometimes two Labels pretty-print the same, but have
553 -- different uniques; so we compare their text versions...
       -- dedupe imports by their rendered text before printing them
555 | needImportedSymbols arch os
557 (pprGotDeclaration arch os :) $
558 map ( pprImportedSymbol platform . fst . head) $
559 groupBy (\(_,a) (_,b) -> a == b) $
560 sortBy (\(_,a) (_,b) -> compare a b) $
       -- pair each label with its rendered text for the comparison above
566 doPpr lbl = (lbl, renderWithStyle (pprCLabel platform lbl) astyle)
567 astyle = mkCodeStyle AsmStyle
570 -- -----------------------------------------------------------------------------
571 -- Sequencing the basic blocks
573 -- Cmm BasicBlocks are self-contained entities: they always end in a
574 -- jump, either non-local or to another basic block in the same proc.
575 -- In this phase, we attempt to place the basic blocks in a sequence
576 -- such that as many of the local jumps as possible turn into
-- | Reorder the basic blocks of one proc to maximise fallthroughs,
-- then apply the target's far-branch rewriting hook.  Data sections
-- are unchanged.  (Head of the signature is missing from this extract.)
581 => NcgImpl statics instr jumpDest -> NatCmmDecl statics instr -> NatCmmDecl statics instr
583 sequenceTop _ top@(CmmData _ _) = top
584 sequenceTop ncgImpl (CmmProc info lbl (ListGraph blocks)) =
585 CmmProc info lbl (ListGraph $ ncgMakeFarBranches ncgImpl $ sequenceBlocks blocks)
587 -- The algorithm is very simple (and stupid): we make a graph out of
588 -- the blocks where there is an edge from one block to another iff the
589 -- first block ends by jumping to the second. Then we topologically
590 -- sort this graph. Then traverse the list: for each block, we first
591 -- output the block, then if it has an out edge, we move the
592 -- destination of the out edge to the front of the list, and continue.
594 -- FYI, the classic layout for basic blocks uses postorder DFS; this
595 -- algorithm is implemented in Hoopl.
-- | Topologically sort the non-entry blocks by their jump edges and
-- then greedily chain them with 'seqBlocks'.  (Head of the signature
-- is missing from this extract.)
599 => [NatBasicBlock instr]
600 -> [NatBasicBlock instr]
602 sequenceBlocks [] = []
603 sequenceBlocks (entry:blocks) =
604 seqBlocks (mkNode entry : reverse (flattenSCCs (sccBlocks blocks)))
605 -- the first block is the entry point ==> it must remain at the start.
-- | Build the jump graph over the blocks and return its strongly
-- connected components (reverse topological order, per
-- 'stronglyConnCompFromEdgedVerticesR').  Signature head and part of
-- the result type are missing from this extract.
610 => [NatBasicBlock instr]
611 -> [SCC ( NatBasicBlock instr
615 sccBlocks blocks = stronglyConnCompFromEdgedVerticesR (map mkNode blocks)
617 -- we're only interested in the last instruction of
618 -- the block, and only if it has a single destination.
-- (Signature head, equation head and the catch-all alternative are
-- missing from this extract.  NB: 'last instrs' is partial — it
-- presumably relies on blocks being non-empty; confirm in full file.)
621 => [instr] -> [Unique]
624 = case jumpDestsOfInstr (last instrs) of
625 [one] -> [getUnique one]
-- | Turn a block into a graph vertex: (block, its id's unique, the
-- uniques of its single-destination out edge, if any).  One line of
-- the signature is missing from this extract.
628 mkNode :: (Instruction t)
630 -> (GenBasicBlock t, Unique, [Unique])
631 mkNode block@(BasicBlock id instrs) = (block, getUnique id, getOutEdges instrs)
-- | Greedy chaining pass: emit each block, and when its single out
-- edge's target can be pulled to the front of the remaining list,
-- drop the final jump ('init instrs') so control falls through.
-- NOTE(review): the 'where' keyword line appears to be missing from
-- this extract, between the guards and the 'reorder' binding.
633 seqBlocks :: (Eq t) => [(GenBasicBlock t1, t, [t])] -> [GenBasicBlock t1]
635 seqBlocks ((block,_,[]) : rest)
636 = block : seqBlocks rest
637 seqBlocks ((block@(BasicBlock id instrs),_,[next]) : rest)
638 | can_fallthrough = BasicBlock id (init instrs) : seqBlocks rest'
639 | otherwise = block : seqBlocks rest'
641 (can_fallthrough, rest') = reorder next [] rest
642 -- TODO: we should do a better job for cycles; try to maximise the
643 -- fallthroughs within a loop.
644 seqBlocks _ = panic "AsmCodegen:seqBlocks"
-- | Search the list for the entry whose key equals @want@.  On a hit,
-- move that entry to the front of the result and report 'True' (a
-- fallthrough to it is now possible); otherwise report 'False' and
-- leave the order untouched.
reorder :: (Eq a) => a -> [(t, a, t1)] -> [(t, a, t1)] -> (Bool, [(t, a, t1)])
reorder want = go
  where
    go seen [] = (False, reverse seen)
    go seen (entry@(blk, key, outs) : remaining)
        | key == want = (True, (blk, want, outs) : reverse seen ++ remaining)
        | otherwise   = go (entry : seen) remaining
653 -- -----------------------------------------------------------------------------
654 -- Making far branches
656 -- Conditional branches on PowerPC are limited to +-32KB; if our Procs get too
657 -- big, we have to work around this limitation.
-- | PPC only: rewrite conditional branches whose target lies further
-- than 'nearLimit' instructions away into BCCFAR, since plain BCC has
-- limited reach.  Distances are measured in instruction counts via a
-- prefix-sum of block lengths.  (Name line of the signature and a few
-- other lines are missing from this extract.)
660 :: [NatBasicBlock PPC.Instr.Instr]
661 -> [NatBasicBlock PPC.Instr.Instr]
662 makeFarBranches blocks
       -- whole proc fits within near range: nothing to do
663 | last blockAddresses < nearLimit = blocks
664 | otherwise = zipWith handleBlock blockAddresses blocks
       -- starting instruction-address of each block
666 blockAddresses = scanl (+) 0 $ map blockLen blocks
667 blockLen (BasicBlock _ instrs) = length instrs
669 handleBlock addr (BasicBlock id instrs)
670 = BasicBlock id (zipWith makeFar [addr..] instrs)
       -- unconditional BCC ALWAYS is left alone
672 makeFar _ (PPC.Instr.BCC PPC.Cond.ALWAYS tgt) = PPC.Instr.BCC PPC.Cond.ALWAYS tgt
673 makeFar addr (PPC.Instr.BCC cond tgt)
674 | abs (addr - targetAddr) >= nearLimit
675 = PPC.Instr.BCCFAR cond tgt
677 = PPC.Instr.BCC cond tgt
       -- partial pattern: assumes every branch target is a local block
678 where Just targetAddr = lookupUFM blockAddressMap tgt
679 makeFar _ other = other
681 nearLimit = 7000 -- 8192 instructions are allowed; let's keep some
682 -- distance, as we have a few pseudo-insns that are
683 -- pretty-printed as multiple instructions,
684 -- and it's just not worth the effort to calculate
687 blockAddressMap = listToUFM $ zip (map blockId blocks) blockAddresses
689 -- -----------------------------------------------------------------------------
690 -- Generate jump tables
692 -- Analyzes all native code and generates data sections for all jump
693 -- table instructions.
-- | Scan every instruction and splice in the data sections for any
-- jump tables the target's 'generateJumpTableForInstr' hook produces.
-- (Name line of the signature and the CmmData equation of 'f' are
-- missing from this extract.)
695 :: NcgImpl statics instr jumpDest
696 -> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
697 generateJumpTables ncgImpl xs = concatMap f xs
698 where f p@(CmmProc _ _ (ListGraph xs)) = p : concatMap g xs
700 g (BasicBlock _ xs) = catMaybes (map (generateJumpTableForInstr ncgImpl) xs)
702 -- -----------------------------------------------------------------------------
-- | Collapse jumps-to-jumps: build a BlockId -> jumpDest map from all
-- procs, then rewrite every branch and static through it.  Only runs
-- at -O1 and above.  (First line of the signature is missing from this
-- extract.)
707 -> NcgImpl statics instr jumpDest
708 -> [NatCmmDecl statics instr]
709 -> [NatCmmDecl statics instr]
711 shortcutBranches dflags ncgImpl tops
712 | optLevel dflags < 1 = tops -- only with -O or higher
713 | otherwise = map (apply_mapping ncgImpl mapping) tops'
715 (tops', mappings) = mapAndUnzip (build_mapping ncgImpl) tops
       -- merge the per-proc maps into one
716 mapping = foldr plusUFM emptyUFM mappings
-- | For one proc: find blocks that consist of a single shortcutable
-- jump, remove them from the proc (never the entry block) and return
-- the BlockId -> jumpDest substitution they induce.  Blocks whose
-- shortcut would form a loop are kept to avoid dangling jumps.
718 build_mapping :: NcgImpl statics instr jumpDest
719 -> GenCmmDecl d t (ListGraph instr)
720 -> (GenCmmDecl d t (ListGraph instr), UniqFM jumpDest)
721 build_mapping _ top@(CmmData _ _) = (top, emptyUFM)
722 build_mapping _ (CmmProc info lbl (ListGraph []))
723 = (CmmProc info lbl (ListGraph []), emptyUFM)
724 build_mapping ncgImpl (CmmProc info lbl (ListGraph (head:blocks)))
725 = (CmmProc info lbl (ListGraph (head:others)), mapping)
726 -- drop the shorted blocks, but don't ever drop the first one,
727 -- because it is pointed to by a global label.
729 -- find all the blocks that just consist of a jump that can be
731 -- Don't completely eliminate loops here -- that can leave a dangling jump!
732 (_, shortcut_blocks, others) = foldl split (emptyBlockSet, [], []) blocks
       -- first clause: the jump's destination is already shorted (or is
       -- this very block) — keep the block to avoid creating a loop
733 split (s, shortcut_blocks, others) b@(BasicBlock id [insn])
734 | Just jd <- canShortcut ncgImpl insn,
735 Just dest <- getJumpDestBlockId ncgImpl jd,
736 (setMember dest s) || dest == id -- loop checks
737 = (s, shortcut_blocks, b : others)
       -- second clause: safe to shortcut — record the substitution and
       -- mark this block as eliminated
738 split (s, shortcut_blocks, others) (BasicBlock id [insn])
739 | Just dest <- canShortcut ncgImpl insn
740 = (setInsert id s, (id,dest) : shortcut_blocks, others)
741 split (s, shortcut_blocks, others) other = (s, shortcut_blocks, other : others)
744 -- build a mapping from BlockId to JumpDest for shorting branches
745 mapping = foldl add emptyUFM shortcut_blocks
746 add ufm (id,dest) = addToUFM ufm id dest
-- | Apply a shortcut mapping to one decl: rewrite statics via the
-- target's 'shortcutStatics' hook and every instruction via
-- 'shortcutJump'.  (One line of the signature and the 'where' keyword
-- are missing from this extract.)
748 apply_mapping :: NcgImpl statics instr jumpDest
750 -> GenCmmDecl statics h (ListGraph instr)
751 -> GenCmmDecl statics h (ListGraph instr)
752 apply_mapping ncgImpl ufm (CmmData sec statics)
753 = CmmData sec (shortcutStatics ncgImpl (lookupUFM ufm) statics)
754 apply_mapping ncgImpl ufm (CmmProc info lbl (ListGraph blocks))
755 = CmmProc info lbl (ListGraph $ map short_bb blocks)
       -- $! forces each rewritten instruction list eagerly
757 short_bb (BasicBlock id insns) = BasicBlock id $! map short_insn insns
758 short_insn i = shortcutJump ncgImpl (lookupUFM ufm) i
759 -- shortcutJump should apply the mapping repeatedly,
760 -- just in case we can short multiple branches.
762 -- -----------------------------------------------------------------------------
763 -- Instruction selection
765 -- Native code instruction selection for a chunk of stix code. For
766 -- this part of the computation, we switch from the UniqSM monad to
767 -- the NatM monad. The latter carries not only a Unique, but also an
768 -- Int denoting the current C stack pointer offset in the generated
769 -- code; this is needed for creating correct spill offsets on
770 -- architectures which don't offer, or for which it would be
771 -- prohibitively expensive to employ, a frame pointer register. Viz,
774 -- The offset is measured in bytes, and indicates the difference
775 -- between the current (simulated) C stack-ptr and the value it was at
776 -- the beginning of the block. For stacks which grow down, this value
777 -- should be either zero or negative.
779 -- Switching between the two monads whilst carrying along the same
780 -- Unique supply breaks abstraction. Is that bad?
-- | Run instruction selection for one decl inside the NatM monad,
-- returning the generated code plus any imports recorded along the
-- way.  Panics if the simulated C stack-pointer delta is non-zero at
-- the end, which would indicate unbalanced stack adjustments.
-- (First lines of the signature are missing from this extract.)
784 -> (RawCmmDecl -> NatM [NatCmmDecl statics instr])
787 ( [NatCmmDecl statics instr]
790 genMachCode dflags cmmTopCodeGen cmm_top
791 = do { initial_us <- getUs
792 ; let initial_st = mkNatM_State initial_us 0 dflags
793 (new_tops, final_st) = initNat initial_st (cmmTopCodeGen cmm_top)
794 final_delta = natm_delta final_st
795 final_imports = natm_imports final_st
796 ; if final_delta == 0
797 then return (new_tops, final_imports)
798 else pprPanic "genMachCode: nonzero final delta" (int final_delta)
801 -- -----------------------------------------------------------------------------
802 -- Generic Cmm optimiser
808 (b) Simple inlining: a temporary which is assigned to and then
809 used, once, can be shorted.
810 (c) Position independent code and dynamic linking
811 (i) introduce the appropriate indirections
812 and position independent refs
813 (ii) compile a list of imported symbols
814 (d) Some arch-specific optimizations
816 (a) and (b) will be moving to the new Hoopl pipeline, however, (c) and
817 (d) are only needed by the native backend and will continue to live
820 Ideas for other things we could do (put these in Hoopl please!):
822 - shortcut jumps-to-jumps
823 - simple CSE: if an expr is assigned to a temp, then replace later occs of
824 that expr with the temp, until the expr is no longer valid (can push through
825 temp assignments, and certain assigns to mem...)
-- | Machine-independent Cmm-to-Cmm pass: eliminate dead blocks, run
-- the simple inliner, then constant-fold every block, collecting any
-- imported labels the folding introduces.  Data sections pass through
-- untouched with no imports.
cmmToCmm :: DynFlags -> RawCmmDecl -> (RawCmmDecl, [CLabel])
cmmToCmm _ top@(CmmData _ _) = (top, [])
cmmToCmm dflags (CmmProc info lbl (ListGraph blocks)) =
    runCmmOpt dflags $
        mapM cmmBlockConFold simplified >>= \blocks' ->
        return (CmmProc info lbl (ListGraph blocks'))
  where
    -- dead-block elimination first, then mini-inlining over the result
    simplified = cmmMiniInline (targetPlatform dflags)
                               (cmmEliminateDeadBlocks blocks)
-- | State-ish optimiser monad: reader access to DynFlags, writer-style
-- accumulation of imported CLabels, using unboxed result tuples.
-- (The '>>=' equation's head line is missing from this extract.)
835 newtype CmmOptM a = CmmOptM (([CLabel], DynFlags) -> (# a, [CLabel] #))
837 instance Monad CmmOptM where
838 return x = CmmOptM $ \(imports, _) -> (# x,imports #)
840 CmmOptM $ \(imports, dflags) ->
841 case f (imports, dflags) of
       -- thread the accumulated imports into the continuation
844 CmmOptM g' -> g' (imports', dflags)
-- | Record that @lbl@ must appear in this module's import list.
addImportCmmOpt :: CLabel -> CmmOptM ()
addImportCmmOpt lbl =
    CmmOptM (\(imps, _dflags) -> (# (), lbl : imps #))
-- | Fetch the 'DynFlags' threaded through the optimiser monad.
getDynFlagsCmmOpt :: CmmOptM DynFlags
getDynFlagsCmmOpt =
    CmmOptM (\(imps, dflags) -> (# dflags, imps #))
-- | Run a 'CmmOptM' computation with an initially empty import list,
-- returning its result together with every import it recorded.
runCmmOpt :: DynFlags -> CmmOptM a -> (a, [CLabel])
runCmmOpt dflags (CmmOptM body) =
    case body ([], dflags) of
        (# answer, imps #) -> (answer, imps)
-- | Constant-fold every statement of one basic block, keeping its id.
cmmBlockConFold :: CmmBasicBlock -> CmmOptM CmmBasicBlock
cmmBlockConFold (BasicBlock blockId stmts) =
    mapM cmmStmtConFold stmts >>= \stmts' ->
        return (BasicBlock blockId stmts')
861 -- This does three optimizations, but they're very quick to check, so we don't
862 -- bother turning them off even when the Hoopl code is active. Since
863 -- this is on the old Cmm representation, we can't reuse the code either:
864 -- * reg = reg --> nop
865 -- * if 0 then jump --> nop
866 -- * if 1 then jump --> jump
867 -- We might be tempted to skip this step entirely if not opt_PIC, but
868 -- there is some PowerPC code for the non-PIC case, which would also
869 -- have to be separated.
-- | Constant-fold the expressions inside one Cmm statement and apply
-- the three local simplifications described above (self-assignment to
-- nop, constant conditional branch to nop/jump).
-- NOTE(review): several case-alternative head lines (e.g. the CmmAssign,
-- CmmStore, CmmJump and CmmSwitch patterns) are missing from this
-- extract; the visible bodies are annotated as-is.
870 cmmStmtConFold :: CmmStmt -> CmmOptM CmmStmt
       -- assignment: fold the source; 'reg = reg' becomes a nop
874 -> do src' <- cmmExprConFold DataReference src
875 return $ case src' of
876 CmmReg reg' | reg == reg' -> CmmNop
877 new_src -> CmmAssign reg new_src
       -- store: fold both address and value
880 -> do addr' <- cmmExprConFold DataReference addr
881 src' <- cmmExprConFold DataReference src
882 return $ CmmStore addr' src'
       -- jump: fold the target with JumpReference semantics
885 -> do addr' <- cmmExprConFold JumpReference addr
886 return $ CmmJump addr' regs
888 CmmCall target regs args srt returns
889 -> do target' <- case target of
890 CmmCallee e conv -> do
891 e' <- cmmExprConFold CallReference e
892 return $ CmmCallee e' conv
893 other -> return other
894 args' <- mapM (\(CmmHinted arg hint) -> do
895 arg' <- cmmExprConFold DataReference arg
896 return (CmmHinted arg' hint)) args
897 return $ CmmCall target' regs args' srt returns
899 CmmCondBranch test dest
900 -> do test' <- cmmExprConFold DataReference test
901 dflags <- getDynFlagsCmmOpt
902 let platform = targetPlatform dflags
903 return $ case test' of
              -- condition folded to 0: branch never taken, keep a
              -- comment recording what was deleted
904 CmmLit (CmmInt 0 _) ->
905 CmmComment (mkFastString ("deleted: " ++
906 showSDoc (pprStmt platform stmt)))
              -- any other constant: branch always taken
908 CmmLit (CmmInt _ _) -> CmmBranch dest
909 _other -> CmmCondBranch test' dest
       -- switch: fold the scrutinee only
912 -> do expr' <- cmmExprConFold DataReference expr
913 return $ CmmSwitch expr' ids
-- | Fold an expression: run local constant folding ('cmmExprCon')
-- unless the new (Hoopl) code generator is active — it does that pass
-- itself — then always apply the PIC/arch rewrites in 'cmmExprNative'.
-- (The then/else lines of the conditional are missing from this
-- extract.)
918 cmmExprConFold :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
919 cmmExprConFold referenceKind expr = do
920 dflags <- getDynFlagsCmmOpt
921 -- Skip constant folding if new code generator is running
922 -- (this optimization is done in Hoopl)
923 let expr' = if dopt Opt_TryNewCodeGen dflags
926 cmmExprNative referenceKind expr'
-- | Bottom-up constant folding over an expression tree: fold the
-- children first, then let 'cmmMachOpFold' simplify each MachOp node.
-- All other leaves are returned untouched.
cmmExprCon :: CmmExpr -> CmmExpr
cmmExprCon expr
 = case expr of
        CmmLoad addr rep   -> CmmLoad (cmmExprCon addr) rep
        CmmMachOp mop args -> cmmMachOpFold mop (map cmmExprCon args)
        _                  -> expr
933 -- handles both PIC and non-PIC cases... a very strange mixture
-- | Arch/PIC-aware rewriting of an expression: create dynamic
-- references for labels (recording imports via 'addImportCmmOpt') and,
-- on non-PIC PowerPC, replace a few global registers with direct RTS
-- labels.
-- NOTE(review): this extract is missing lines here (the 'case' head,
-- several pattern lines, and apparently the final catch-all
-- alternative, which the visible text runs past) — confirm against the
-- full file.
935 cmmExprNative :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
936 cmmExprNative referenceKind expr = do
937 dflags <- getDynFlagsCmmOpt
938 let arch = platformArch (targetPlatform dflags)
       -- loads and MachOps: recurse into children
941 -> do addr' <- cmmExprNative DataReference addr
942 return $ CmmLoad addr' rep
945 -> do args' <- mapM (cmmExprNative DataReference) args
946 return $ CmmMachOp mop args'
948 CmmLit (CmmLabel lbl)
950 cmmMakeDynamicReference dflags addImportCmmOpt referenceKind lbl
951 CmmLit (CmmLabelOff lbl off)
953 dynRef <- cmmMakeDynamicReference dflags addImportCmmOpt referenceKind lbl
954 -- need to optimize here, since it's late
955 return $ cmmMachOpFold (MO_Add wordWidth) [
957 (CmmLit $ CmmInt (fromIntegral off) wordWidth)
960 -- On powerpc (non-PIC), it's easier to jump directly to a label than
961 -- to use the register table, so we replace these registers
962 -- with the corresponding labels:
963 CmmReg (CmmGlobal EagerBlackholeInfo)
964 | arch == ArchPPC && not opt_PIC
965 -> cmmExprNative referenceKind $
966 CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit "__stg_EAGER_BLACKHOLE_info")))
967 CmmReg (CmmGlobal GCEnter1)
968 | arch == ArchPPC && not opt_PIC
969 -> cmmExprNative referenceKind $
970 CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit "__stg_gc_enter_1")))
971 CmmReg (CmmGlobal GCFun)
972 | arch == ArchPPC && not opt_PIC
973 -> cmmExprNative referenceKind $
974 CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit "__stg_gc_fun")))