Revert "Batch merge"
[ghc.git] / compiler / nativeGen / X86 / CodeGen.hs
1 {-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-}
2
3 -- The default iteration limit is a bit too low for the definitions
4 -- in this module.
5 {-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}
6
7 -----------------------------------------------------------------------------
8 --
9 -- Generating machine code (instruction selection)
10 --
11 -- (c) The University of Glasgow 1996-2004
12 --
13 -----------------------------------------------------------------------------
14
15 -- This is a big module, but, if you pay attention to
16 -- (a) the sectioning, and (b) the type signatures, the
17 -- structure should not be too overwhelming.
18
19 module X86.CodeGen (
20 cmmTopCodeGen,
21 generateJumpTableForInstr,
22 extractUnwindPoints,
23 invertCondBranches,
24 InstrBlock
25 )
26
27 where
28
29 #include "HsVersions.h"
30 #include "nativeGen/NCG.h"
31 #include "../includes/MachDeps.h"
32
33 -- NCG stuff:
34 import GhcPrelude
35
36 import X86.Instr
37 import X86.Cond
38 import X86.Regs
39 import X86.RegInfo
40
41 --TODO: Remove - Just for development/debugging
42 import X86.Ppr()
43
44 import CodeGen.Platform
45 import CPrim
46 import Debug ( DebugBlock(..), UnwindPoint(..), UnwindTable
47 , UnwindExpr(UwReg), toUnwindExpr )
48 import Instruction
49 import PIC
50 import NCGMonad ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
51 , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat
52 , getPicBaseMaybeNat, getDebugBlock, getFileId
53 , addImmediateSuccessorNat, updateCfgNat)
54 import CFG
55 import Format
56 import Reg
57 import Platform
58
59 -- Our intermediate code:
60 import BasicTypes
61 import BlockId
62 import Module ( primUnitId )
63 import PprCmm ()
64 import CmmUtils
65 import CmmSwitch
66 import Cmm
67 import Hoopl.Block
68 import Hoopl.Collections
69 import Hoopl.Graph
70 import Hoopl.Label
71 import CLabel
72 import CoreSyn ( Tickish(..) )
73 import SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
74
75 -- The rest:
76 import ForeignCall ( CCallConv(..) )
77 import OrdList
78 import Outputable
79 import FastString
80 import DynFlags
81 import Util
82 import UniqSupply ( getUniqueM )
83
84 import Control.Monad
85 import Data.Bits
86 import Data.Foldable (fold)
87 import Data.Int
88 import Data.Maybe
89 import Data.Word
90
91 import qualified Data.Map as M
92
93 is32BitPlatform :: NatM Bool
94 is32BitPlatform = do
95 dflags <- getDynFlags
96 return $ target32Bit (targetPlatform dflags)
97
98 sse2Enabled :: NatM Bool
99 sse2Enabled = do
100 dflags <- getDynFlags
101 return (isSse2Enabled dflags)
102
103 sse4_2Enabled :: NatM Bool
104 sse4_2Enabled = do
105 dflags <- getDynFlags
106 return (isSse4_2Enabled dflags)
107
108 if_sse2 :: NatM a -> NatM a -> NatM a
109 if_sse2 sse2 x87 = do
110 b <- sse2Enabled
111 if b then sse2 else x87
112
113 cmmTopCodeGen
114 :: RawCmmDecl
115 -> NatM [NatCmmDecl (Alignment, CmmStatics) Instr]
116
117 cmmTopCodeGen (CmmProc info lab live graph) = do
118 let blocks = toBlockListEntryFirst graph
119 (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
120 picBaseMb <- getPicBaseMaybeNat
121 dflags <- getDynFlags
122 let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
123 tops = proc : concat statics
124 os = platformOS $ targetPlatform dflags
125
126 case picBaseMb of
127 Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
128 Nothing -> return tops
129
130 cmmTopCodeGen (CmmData sec dat) = do
131 return [CmmData sec (1, dat)] -- no translation, we just use CmmStatic
132
133
134 basicBlockCodeGen
135 :: CmmBlock
136 -> NatM ( [NatBasicBlock Instr]
137 , [NatCmmDecl (Alignment, CmmStatics) Instr])
138
139 basicBlockCodeGen block = do
140 let (_, nodes, tail) = blockSplit block
141 id = entryLabel block
142 stmts = blockToList nodes
143 -- Generate location directive
144 dbg <- getDebugBlock (entryLabel block)
145 loc_instrs <- case dblSourceTick =<< dbg of
146 Just (SourceNote span name)
147 -> do fileId <- getFileId (srcSpanFile span)
148 let line = srcSpanStartLine span; col = srcSpanStartCol span
149 return $ unitOL $ LOCATION fileId line col name
150 _ -> return nilOL
151 mid_instrs <- stmtsToInstrs id stmts
152 tail_instrs <- stmtToInstrs id tail
153 let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
154 instrs' <- fold <$> traverse addSpUnwindings instrs
155 -- code generation may introduce new basic block boundaries, which
156 -- are indicated by the NEWBLOCK instruction. We must split up the
157 -- instruction stream into basic blocks again. We also extract
158 -- LDATAs here.
159 let
160 (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'
161
162 mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
163 = ([], BasicBlock id instrs : blocks, statics)
164 mkBlocks (LDATA sec dat) (instrs,blocks,statics)
165 = (instrs, blocks, CmmData sec dat:statics)
166 mkBlocks instr (instrs,blocks,statics)
167 = (instr:instrs, blocks, statics)
168 return (BasicBlock id top : other_blocks, statics)
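-- A worked example of the fold above (illustrative instruction names, not real
-- output): given a selected stream
--
--      [i1, i2, NEWBLOCK l1, i3, LDATA sec d, i4]
--
-- foldrOL mkBlocks yields top = [i1, i2], other_blocks = [BasicBlock l1 [i3, i4]]
-- and statics = [CmmData sec d], so the result is the entry block followed by
-- the split-off block, with the data declaration floated out.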
169
170 -- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
171 -- in the @sp@ register. See Note [What is this unwinding business?] in Debug
172 -- for details.
173 addSpUnwindings :: Instr -> NatM (OrdList Instr)
174 addSpUnwindings instr@(DELTA d) = do
175 dflags <- getDynFlags
176 if debugLevel dflags >= 1
177 then do lbl <- mkAsmTempLabel <$> getUniqueM
178 let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
179 return $ toOL [ instr, UNWIND lbl unwind ]
180 else return (unitOL instr)
181 addSpUnwindings instr = return $ unitOL instr
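-- A sketch of the effect (not verbatim compiler output): with -g (debugLevel >= 1)
-- a stack adjustment such as
--
--      DELTA (-8)
--
-- is followed by roughly
--
--      UNWIND lbl (M.singleton MachSp (Just (UwReg MachSp 8)))
--
-- i.e. the unwound Sp is the post-adjustment Sp plus 8.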
182
183 stmtsToInstrs :: BlockId -> [CmmNode e x] -> NatM InstrBlock
184 stmtsToInstrs bid stmts
185 = do instrss <- mapM (stmtToInstrs bid) stmts
186 return (concatOL instrss)
187
188 -- | `bid` refers to the current block and is used to update the CFG
189 -- if new blocks are inserted in the control flow.
190 stmtToInstrs :: BlockId -> CmmNode e x -> NatM InstrBlock
191 stmtToInstrs bid stmt = do
192 dflags <- getDynFlags
193 is32Bit <- is32BitPlatform
194 case stmt of
195 CmmComment s -> return (unitOL (COMMENT s))
196 CmmTick {} -> return nilOL
197
198 CmmUnwind regs -> do
199 let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
200 to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
201 case foldMap to_unwind_entry regs of
202 tbl | M.null tbl -> return nilOL
203 | otherwise -> do
204 lbl <- mkAsmTempLabel <$> getUniqueM
205 return $ unitOL $ UNWIND lbl tbl
206
207 CmmAssign reg src
208 | isFloatType ty -> assignReg_FltCode format reg src
209 | is32Bit && isWord64 ty -> assignReg_I64Code reg src
210 | otherwise -> assignReg_IntCode format reg src
211 where ty = cmmRegType dflags reg
212 format = cmmTypeFormat ty
213
214 CmmStore addr src
215 | isFloatType ty -> assignMem_FltCode format addr src
216 | is32Bit && isWord64 ty -> assignMem_I64Code addr src
217 | otherwise -> assignMem_IntCode format addr src
218 where ty = cmmExprType dflags src
219 format = cmmTypeFormat ty
220
221 CmmUnsafeForeignCall target result_regs args
222 -> genCCall dflags is32Bit target result_regs args bid
223
224 CmmBranch id -> return $ genBranch id
225
226 -- We try to arrange blocks such that the likely branch is the fallthrough
227 -- in CmmContFlowOpt. So we can assume the condition is likely false here.
228 CmmCondBranch arg true false _ -> genCondBranch bid true false arg
229 CmmSwitch arg ids -> do dflags <- getDynFlags
230 genSwitch dflags arg ids
231 CmmCall { cml_target = arg
232 , cml_args_regs = gregs } -> do
233 dflags <- getDynFlags
234 genJump arg (jumpRegs dflags gregs)
235 _ ->
236 panic "stmtToInstrs: statement should have been cps'd away"
237
238
239 jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
240 jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
241 where platform = targetPlatform dflags
242
243 --------------------------------------------------------------------------------
244 -- | 'InstrBlock's are the insn sequences generated by the insn selectors.
245 -- They are really trees of insns to facilitate fast appending, where a
246 -- left-to-right traversal yields the insns in the correct order.
247 --
248 type InstrBlock
249 = OrdList Instr
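-- A minimal sketch of how these are built (using the OrdList combinators
-- imported above):
--
--      example :: Reg -> InstrBlock
--      example dst = unitOL (XOR II32 (OpReg dst) (OpReg dst))
--                    `snocOL` ADD II32 (OpImm (ImmInt 1)) (OpReg dst)
--
-- Appending with appOL/snocOL is cheap; the sequence is only linearised when
-- the final instruction stream is emitted.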
250
251
252 -- | Condition codes passed up the tree.
253 --
254 data CondCode
255 = CondCode Bool Cond InstrBlock
256
257
258 -- | a.k.a "Register64"
259 -- Reg is the lower 32-bit temporary which contains the result.
260 -- Use getHiVRegFromLo to find the other VRegUnique.
261 --
262 -- The rule of this simplified instruction-selection game is therefore
263 -- that the returned Reg may be modified
264 --
265 data ChildCode64
266 = ChildCode64
267 InstrBlock
268 Reg
269
270
271 -- | Registers passed up the tree. If the stix code forces the register
272 -- to live in a pre-decided machine register, it comes out as @Fixed@;
273 -- otherwise, it comes out as @Any@, and the parent can decide which
274 -- register to put it in.
275 --
276 data Register
277 = Fixed Format Reg InstrBlock
278 | Any Format (Reg -> InstrBlock)
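-- For example (a sketch): a small literal can go into any register, so it comes
-- back as
--
--      Any II64 (\dst -> unitOL (MOV II64 (OpImm (ImmInt 42)) (OpReg dst)))
--
-- whereas a result pinned by the instruction set (say a division result in
-- %eax) comes back as @Fixed II32 eax code@, and the parent must copy it out
-- if it needs the value elsewhere.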
279
280
281 swizzleRegisterRep :: Register -> Format -> Register
282 swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
283 swizzleRegisterRep (Any _ codefn) format = Any format codefn
284
285
286 -- | Grab the Reg for a CmmReg
287 getRegisterReg :: Platform -> Bool -> CmmReg -> Reg
288
289 getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk))
290 = let fmt = cmmTypeFormat pk in
291 if isFloatFormat fmt && not use_sse2
292 then RegVirtual (mkVirtualReg u FF80)
293 else RegVirtual (mkVirtualReg u fmt)
294
295 getRegisterReg platform _ (CmmGlobal mid)
296 = case globalRegMaybe platform mid of
297 Just reg -> RegReal $ reg
298 Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
299 -- By this stage, the only MagicIds remaining should be the
300 -- ones which map to a real machine register on this
301 -- platform. Hence ...
302
303
304 -- | Memory addressing modes passed up the tree.
305 data Amode
306 = Amode AddrMode InstrBlock
307
308 {-
309 Now, given a tree (the argument to a CmmLoad) that references memory,
310 produce a suitable addressing mode.
311
312 A Rule of the Game (tm) for Amodes: use of the addr bit must
313 immediately follow use of the code part, since the code part puts
314 values in registers which the addr then refers to. So you can't put
315 anything in between, lest it overwrite some of those registers. If
316 you need to do some other computation between the code part and use of
317 the addr bit, first store the effective address from the amode in a
318 temporary, then do the other computation, and then use the temporary:
319
320 code
321 LEA amode, tmp
322 ... other computation ...
323 ... (tmp) ...
324 -}
325
326
327 -- | Check whether an integer will fit in 32 bits.
328 -- A CmmInt is intended to be truncated to the appropriate
329 -- number of bits, so here we truncate it to Int64. This is
330 -- important because e.g. -1 as a CmmInt might be either
331 -- -1 or 18446744073709551615.
332 --
333 is32BitInteger :: Integer -> Bool
334 is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
335 where i64 = fromIntegral i :: Int64
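-- For example, is32BitInteger 18446744073709551615 is True: the value
-- truncates to (-1) :: Int64, which is what we want when the CmmInt was
-- really a W64 representation of -1.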
336
337
338 -- | Convert a BlockId to some CmmStatic data
339 jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
340 jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
341 jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
342 where blockLabel = blockLbl blockid
343
344
345 -- -----------------------------------------------------------------------------
346 -- General things for putting together code sequences
347
348 -- Expand CmmRegOff. ToDo: should we do it this way around, or convert
349 -- CmmExprs into CmmRegOff?
350 mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr
351 mangleIndexTree dflags reg off
352 = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
353 where width = typeWidth (cmmRegType dflags reg)
354
355 -- | The dual to getAnyReg: compute an expression into a register, but
356 -- we don't mind which one it is.
357 getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
358 getSomeReg expr = do
359 r <- getRegister expr
360 case r of
361 Any rep code -> do
362 tmp <- getNewRegNat rep
363 return (tmp, code tmp)
364 Fixed _ reg code ->
365 return (reg, code)
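-- Typical use (a sketch): the returned code must be emitted before any
-- instruction that reads the register, e.g.
--
--      (r, code) <- getSomeReg e
--      return (code `snocOL` TEST II64 (OpReg r) (OpReg r))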
366
367
368 assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
369 assignMem_I64Code addrTree valueTree = do
370 Amode addr addr_code <- getAmode addrTree
371 ChildCode64 vcode rlo <- iselExpr64 valueTree
372 let
373 rhi = getHiVRegFromLo rlo
374
375 -- Little-endian store
376 mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
377 mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
378 return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
379
380
381 assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
382 assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
383 ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
384 let
385 r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
386 r_dst_hi = getHiVRegFromLo r_dst_lo
387 r_src_hi = getHiVRegFromLo r_src_lo
388 mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
389 mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
390 return (
391 vcode `snocOL` mov_lo `snocOL` mov_hi
392 )
393
394 assignReg_I64Code _ _
395 = panic "assignReg_I64Code(i386): invalid lvalue"
396
397
398 iselExpr64 :: CmmExpr -> NatM ChildCode64
399 iselExpr64 (CmmLit (CmmInt i _)) = do
400 (rlo,rhi) <- getNewRegPairNat II32
401 let
402 r = fromIntegral (fromIntegral i :: Word32)
403 q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
404 code = toOL [
405 MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
406 MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
407 ]
408 return (ChildCode64 code rlo)
409
410 iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
411 Amode addr addr_code <- getAmode addrTree
412 (rlo,rhi) <- getNewRegPairNat II32
413 let
414 mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
415 mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
416 return (
417 ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
418 rlo
419 )
420
421 iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
422 = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
423
424 -- we handle addition, but rather badly
425 iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
426 ChildCode64 code1 r1lo <- iselExpr64 e1
427 (rlo,rhi) <- getNewRegPairNat II32
428 let
429 r = fromIntegral (fromIntegral i :: Word32)
430 q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
431 r1hi = getHiVRegFromLo r1lo
432 code = code1 `appOL`
433 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
434 ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
435 MOV II32 (OpReg r1hi) (OpReg rhi),
436 ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
437 return (ChildCode64 code rlo)
438
439 iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
440 ChildCode64 code1 r1lo <- iselExpr64 e1
441 ChildCode64 code2 r2lo <- iselExpr64 e2
442 (rlo,rhi) <- getNewRegPairNat II32
443 let
444 r1hi = getHiVRegFromLo r1lo
445 r2hi = getHiVRegFromLo r2lo
446 code = code1 `appOL`
447 code2 `appOL`
448 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
449 ADD II32 (OpReg r2lo) (OpReg rlo),
450 MOV II32 (OpReg r1hi) (OpReg rhi),
451 ADC II32 (OpReg r2hi) (OpReg rhi) ]
452 return (ChildCode64 code rlo)
453
454 iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
455 ChildCode64 code1 r1lo <- iselExpr64 e1
456 ChildCode64 code2 r2lo <- iselExpr64 e2
457 (rlo,rhi) <- getNewRegPairNat II32
458 let
459 r1hi = getHiVRegFromLo r1lo
460 r2hi = getHiVRegFromLo r2lo
461 code = code1 `appOL`
462 code2 `appOL`
463 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
464 SUB II32 (OpReg r2lo) (OpReg rlo),
465 MOV II32 (OpReg r1hi) (OpReg rhi),
466 SBB II32 (OpReg r2hi) (OpReg rhi) ]
467 return (ChildCode64 code rlo)
468
469 iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
470 fn <- getAnyReg expr
471 r_dst_lo <- getNewRegNat II32
472 let r_dst_hi = getHiVRegFromLo r_dst_lo
473 code = fn r_dst_lo
474 return (
475 ChildCode64 (code `snocOL`
476 MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
477 r_dst_lo
478 )
479
480 iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
481 fn <- getAnyReg expr
482 r_dst_lo <- getNewRegNat II32
483 let r_dst_hi = getHiVRegFromLo r_dst_lo
484 code = fn r_dst_lo
485 return (
486 ChildCode64 (code `snocOL`
487 MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
488 CLTD II32 `snocOL`
489 MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
490 MOV II32 (OpReg edx) (OpReg r_dst_hi))
491 r_dst_lo
492 )
493
494 iselExpr64 expr
495 = pprPanic "iselExpr64(i386)" (ppr expr)
496
497
498 --------------------------------------------------------------------------------
499 getRegister :: CmmExpr -> NatM Register
500 getRegister e = do dflags <- getDynFlags
501 is32Bit <- is32BitPlatform
502 getRegister' dflags is32Bit e
503
504 getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register
505
506 getRegister' dflags is32Bit (CmmReg reg)
507 = case reg of
508 CmmGlobal PicBaseReg
509 | is32Bit ->
510 -- on x86_64, we have %rip for PicBaseReg, but it's not
511 -- a full-featured register; it can only be used for
512 -- rip-relative addressing.
513 do reg' <- getPicBaseNat (archWordFormat is32Bit)
514 return (Fixed (archWordFormat is32Bit) reg' nilOL)
515 _ ->
516 do use_sse2 <- sse2Enabled
517 let
518 fmt = cmmTypeFormat (cmmRegType dflags reg)
519 format | not use_sse2 && isFloatFormat fmt = FF80
520 | otherwise = fmt
521 --
522 let platform = targetPlatform dflags
523 return (Fixed format
524 (getRegisterReg platform use_sse2 reg)
525 nilOL)
526
527
528 getRegister' dflags is32Bit (CmmRegOff r n)
529 = getRegister' dflags is32Bit $ mangleIndexTree dflags r n
530
531 getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
532 = addAlignmentCheck align <$> getRegister' dflags is32Bit e
533
534 -- for 32-bit architectures, support some 64 -> 32 bit conversions:
535 -- TO_W_(x), TO_W_(x >> 32)
536
537 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
538 [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
539 | is32Bit = do
540 ChildCode64 code rlo <- iselExpr64 x
541 return $ Fixed II32 (getHiVRegFromLo rlo) code
542
543 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
544 [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
545 | is32Bit = do
546 ChildCode64 code rlo <- iselExpr64 x
547 return $ Fixed II32 (getHiVRegFromLo rlo) code
548
549 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
550 | is32Bit = do
551 ChildCode64 code rlo <- iselExpr64 x
552 return $ Fixed II32 rlo code
553
554 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
555 | is32Bit = do
556 ChildCode64 code rlo <- iselExpr64 x
557 return $ Fixed II32 rlo code
558
559 getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
560 if_sse2 float_const_sse2 float_const_x87
561 where
562 float_const_sse2
563 | f == 0.0 = do
564 let
565 format = floatFormat w
566 code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
567 -- I don't know why there are xorpd, xorps, and pxor instructions.
568 -- They all appear to do the same thing --SDM
569 return (Any format code)
570
571 | otherwise = do
572 Amode addr code <- memConstant (widthInBytes w) lit
573 loadFloatAmode True w addr code
574
575 float_const_x87 = case w of
576 W64
577 | f == 0.0 ->
578 let code dst = unitOL (GLDZ dst)
579 in return (Any FF80 code)
580
581 | f == 1.0 ->
582 let code dst = unitOL (GLD1 dst)
583 in return (Any FF80 code)
584
585 _otherwise -> do
586 Amode addr code <- memConstant (widthInBytes w) lit
587 loadFloatAmode False w addr code
588
589 -- catch simple cases of zero- or sign-extended load
590 getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
591 code <- intLoadCode (MOVZxL II8) addr
592 return (Any II32 code)
593
594 getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
595 code <- intLoadCode (MOVSxL II8) addr
596 return (Any II32 code)
597
598 getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
599 code <- intLoadCode (MOVZxL II16) addr
600 return (Any II32 code)
601
602 getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
603 code <- intLoadCode (MOVSxL II16) addr
604 return (Any II32 code)
605
606 -- catch simple cases of zero- or sign-extended load
607 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
608 | not is32Bit = do
609 code <- intLoadCode (MOVZxL II8) addr
610 return (Any II64 code)
611
612 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
613 | not is32Bit = do
614 code <- intLoadCode (MOVSxL II8) addr
615 return (Any II64 code)
616
617 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
618 | not is32Bit = do
619 code <- intLoadCode (MOVZxL II16) addr
620 return (Any II64 code)
621
622 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
623 | not is32Bit = do
624 code <- intLoadCode (MOVSxL II16) addr
625 return (Any II64 code)
626
627 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
628 | not is32Bit = do
629 code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
630 return (Any II64 code)
631
632 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
633 | not is32Bit = do
634 code <- intLoadCode (MOVSxL II32) addr
635 return (Any II64 code)
636
637 getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
638 CmmLit displacement])
639 | not is32Bit = do
640 return $ Any II64 (\dst -> unitOL $
641 LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
642
643 getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
644 sse2 <- sse2Enabled
645 case mop of
646 MO_F_Neg w
647 | sse2 -> sse2NegCode w x
648 | otherwise -> trivialUFCode FF80 (GNEG FF80) x
649
650 MO_S_Neg w -> triv_ucode NEGI (intFormat w)
651 MO_Not w -> triv_ucode NOT (intFormat w)
652
653 -- Nop conversions
654 MO_UU_Conv W32 W8 -> toI8Reg W32 x
655 MO_SS_Conv W32 W8 -> toI8Reg W32 x
656 MO_XX_Conv W32 W8 -> toI8Reg W32 x
657 MO_UU_Conv W16 W8 -> toI8Reg W16 x
658 MO_SS_Conv W16 W8 -> toI8Reg W16 x
659 MO_XX_Conv W16 W8 -> toI8Reg W16 x
660 MO_UU_Conv W32 W16 -> toI16Reg W32 x
661 MO_SS_Conv W32 W16 -> toI16Reg W32 x
662 MO_XX_Conv W32 W16 -> toI16Reg W32 x
663
664 MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
665 MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
666 MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
667 MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
668 MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
669 MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
670 MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
671 MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
672 MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
673
674 MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
675 MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
676 MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
677
678 -- widenings
679 MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x
680 MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
681 MO_UU_Conv W8 W16 -> integerExtend W8 W16 MOVZxL x
682
683 MO_SS_Conv W8 W32 -> integerExtend W8 W32 MOVSxL x
684 MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
685 MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x
686
687 -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit
688 -- only a few registers have an 8-bit variant (as opposed to x86-64, where every register
689 -- has one). So for 32-bit code, we'll just zero-extend.
690 MO_XX_Conv W8 W32
691 | is32Bit -> integerExtend W8 W32 MOVZxL x
692 | otherwise -> integerExtend W8 W32 MOV x
693 MO_XX_Conv W8 W16
694 | is32Bit -> integerExtend W8 W16 MOVZxL x
695 | otherwise -> integerExtend W8 W16 MOV x
696 MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
697
698 MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x
699 MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
700 MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
701 MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x
702 MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
703 MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
704 -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
705 -- However, we don't want the register allocator to throw it
706 -- away as an unnecessary reg-to-reg move, so we keep it in
707 -- the form of a movzl and print it as a movl later.
708 -- This doesn't apply to MO_XX_Conv since in this case we don't care about
709 -- the upper bits. So we can just use MOV.
710 MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x
711 MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
712 MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
713
714 MO_FF_Conv W32 W64
715 | sse2 -> coerceFP2FP W64 x
716 | otherwise -> conversionNop FF80 x
717
718 MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
719
720 MO_FS_Conv from to -> coerceFP2Int from to x
721 MO_SF_Conv from to -> coerceInt2FP from to x
722
723 MO_V_Insert {} -> needLlvm
724 MO_V_Extract {} -> needLlvm
725 MO_V_Add {} -> needLlvm
726 MO_V_Sub {} -> needLlvm
727 MO_V_Mul {} -> needLlvm
728 MO_VS_Quot {} -> needLlvm
729 MO_VS_Rem {} -> needLlvm
730 MO_VS_Neg {} -> needLlvm
731 MO_VU_Quot {} -> needLlvm
732 MO_VU_Rem {} -> needLlvm
733 MO_VF_Insert {} -> needLlvm
734 MO_VF_Extract {} -> needLlvm
735 MO_VF_Add {} -> needLlvm
736 MO_VF_Sub {} -> needLlvm
737 MO_VF_Mul {} -> needLlvm
738 MO_VF_Quot {} -> needLlvm
739 MO_VF_Neg {} -> needLlvm
740
741 _other -> pprPanic "getRegister" (pprMachOp mop)
742 where
743 triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
744 triv_ucode instr format = trivialUCode format (instr format) x
745
746 -- signed or unsigned extension.
747 integerExtend :: Width -> Width
748 -> (Format -> Operand -> Operand -> Instr)
749 -> CmmExpr -> NatM Register
750 integerExtend from to instr expr = do
751 (reg,e_code) <- if from == W8 then getByteReg expr
752 else getSomeReg expr
753 let
754 code dst =
755 e_code `snocOL`
756 instr (intFormat from) (OpReg reg) (OpReg dst)
757 return (Any (intFormat to) code)
758
759 toI8Reg :: Width -> CmmExpr -> NatM Register
760 toI8Reg new_rep expr
761 = do codefn <- getAnyReg expr
762 return (Any (intFormat new_rep) codefn)
763 -- HACK: use getAnyReg to get a byte-addressable register.
764 -- If the source was a Fixed register, this will add the
765 -- mov instruction to put it into the desired destination.
766 -- We're assuming that the destination won't be a fixed
767 -- non-byte-addressable register; it won't be, because all
768 -- fixed registers are word-sized.
769
770 toI16Reg = toI8Reg -- for now
771
772 conversionNop :: Format -> CmmExpr -> NatM Register
773 conversionNop new_format expr
774 = do e_code <- getRegister' dflags is32Bit expr
775 return (swizzleRegisterRep e_code new_format)
776
777
778 getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
779 sse2 <- sse2Enabled
780 case mop of
781 MO_F_Eq _ -> condFltReg is32Bit EQQ x y
782 MO_F_Ne _ -> condFltReg is32Bit NE x y
783 MO_F_Gt _ -> condFltReg is32Bit GTT x y
784 MO_F_Ge _ -> condFltReg is32Bit GE x y
785 -- Invert comparison condition and swap operands
786 -- See Note [SSE Parity Checks]
787 MO_F_Lt _ -> condFltReg is32Bit GTT y x
788 MO_F_Le _ -> condFltReg is32Bit GE y x
789
790 MO_Eq _ -> condIntReg EQQ x y
791 MO_Ne _ -> condIntReg NE x y
792
793 MO_S_Gt _ -> condIntReg GTT x y
794 MO_S_Ge _ -> condIntReg GE x y
795 MO_S_Lt _ -> condIntReg LTT x y
796 MO_S_Le _ -> condIntReg LE x y
797
798 MO_U_Gt _ -> condIntReg GU x y
799 MO_U_Ge _ -> condIntReg GEU x y
800 MO_U_Lt _ -> condIntReg LU x y
801 MO_U_Le _ -> condIntReg LEU x y
802
803 MO_F_Add w | sse2 -> trivialFCode_sse2 w ADD x y
804 | otherwise -> trivialFCode_x87 GADD x y
805 MO_F_Sub w | sse2 -> trivialFCode_sse2 w SUB x y
806 | otherwise -> trivialFCode_x87 GSUB x y
807 MO_F_Quot w | sse2 -> trivialFCode_sse2 w FDIV x y
808 | otherwise -> trivialFCode_x87 GDIV x y
809 MO_F_Mul w | sse2 -> trivialFCode_sse2 w MUL x y
810 | otherwise -> trivialFCode_x87 GMUL x y
811
812 MO_Add rep -> add_code rep x y
813 MO_Sub rep -> sub_code rep x y
814
815 MO_S_Quot rep -> div_code rep True True x y
816 MO_S_Rem rep -> div_code rep True False x y
817 MO_U_Quot rep -> div_code rep False True x y
818 MO_U_Rem rep -> div_code rep False False x y
819
820 MO_S_MulMayOflo rep -> imulMayOflo rep x y
821
822 MO_Mul W8 -> imulW8 x y
823 MO_Mul rep -> triv_op rep IMUL
824 MO_And rep -> triv_op rep AND
825 MO_Or rep -> triv_op rep OR
826 MO_Xor rep -> triv_op rep XOR
827
828 {- Shift ops on x86 have constraints on their source: it
829 must be an immediate, %cl, or the constant 1
830 => trivialCode is not restrictive enough (sigh.)
831 -}
832 MO_Shl rep -> shift_code rep SHL x y {-False-}
833 MO_U_Shr rep -> shift_code rep SHR x y {-False-}
834 MO_S_Shr rep -> shift_code rep SAR x y {-False-}
835
836 MO_V_Insert {} -> needLlvm
837 MO_V_Extract {} -> needLlvm
838 MO_V_Add {} -> needLlvm
839 MO_V_Sub {} -> needLlvm
840 MO_V_Mul {} -> needLlvm
841 MO_VS_Quot {} -> needLlvm
842 MO_VS_Rem {} -> needLlvm
843 MO_VS_Neg {} -> needLlvm
844 MO_VF_Insert {} -> needLlvm
845 MO_VF_Extract {} -> needLlvm
846 MO_VF_Add {} -> needLlvm
847 MO_VF_Sub {} -> needLlvm
848 MO_VF_Mul {} -> needLlvm
849 MO_VF_Quot {} -> needLlvm
850 MO_VF_Neg {} -> needLlvm
851
852 _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
853 where
854 --------------------
855 triv_op width instr = trivialCode width op (Just op) x y
856 where op = instr (intFormat width)
857
858 -- Special case for IMUL on bytes: the result of IMULB ends up in %ax, since
859 -- the split into %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
860 -- values.
861 imulW8 :: CmmExpr -> CmmExpr -> NatM Register
862 imulW8 arg_a arg_b = do
863 (a_reg, a_code) <- getNonClobberedReg arg_a
864 b_code <- getAnyReg arg_b
865
866 let code = a_code `appOL` b_code eax `appOL`
867 toOL [ IMUL2 format (OpReg a_reg) ]
868 format = intFormat W8
869
870 return (Fixed format eax code)
871
872
873 imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
874 imulMayOflo rep a b = do
875 (a_reg, a_code) <- getNonClobberedReg a
876 b_code <- getAnyReg b
877 let
878 shift_amt = case rep of
879 W32 -> 31
880 W64 -> 63
881 _ -> panic "shift_amt"
882
883 format = intFormat rep
884 code = a_code `appOL` b_code eax `appOL`
885 toOL [
886 IMUL2 format (OpReg a_reg), -- result in %edx:%eax
887 SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
888 -- sign extend lower part
889 SUB format (OpReg edx) (OpReg eax)
890 -- compare against upper
891 -- eax==0 if high part == sign extended low part
892 ]
893 return (Fixed format eax code)
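-- Worked example for W32 (illustrative): 0x10000 * 0x10000 leaves
-- %edx:%eax = 0x00000001:0x00000000. Sign-extending the low word
-- (sar $31, %eax) gives 0, and subtracting the high word leaves %eax = -1,
-- i.e. non-zero, correctly signalling that the multiply may have overflowed.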
894
895 --------------------
896 shift_code :: Width
897 -> (Format -> Operand -> Operand -> Instr)
898 -> CmmExpr
899 -> CmmExpr
900 -> NatM Register
901
902 {- Case1: shift length as immediate -}
903 shift_code width instr x (CmmLit lit) = do
904 x_code <- getAnyReg x
905 let
906 format = intFormat width
907 code dst
908 = x_code dst `snocOL`
909 instr format (OpImm (litToImm lit)) (OpReg dst)
910 return (Any format code)
911
912 {- Case2: shift length is complex (non-immediate)
913 * y must go in %ecx.
914 * we cannot do y first *and* put its result in %ecx, because
915 %ecx might be clobbered by x.
916 * if we do y second, then x cannot be
917 in a clobbered reg. Also, we cannot clobber x's reg
918 with the instruction itself.
919 * so we can either:
920 - do y first, put its result in a fresh tmp, then copy it to %ecx later
921 - do y second and put its result into %ecx. x gets placed in a fresh
922 tmp. This is likely to be better, because the reg alloc can
923 eliminate this reg->reg move here (it won't eliminate the other one,
924 because the move is into the fixed %ecx).
925 -}
926 shift_code width instr x y{-amount-} = do
927 x_code <- getAnyReg x
928 let format = intFormat width
929 tmp <- getNewRegNat format
930 y_code <- getAnyReg y
931 let
932 code = x_code tmp `appOL`
933 y_code ecx `snocOL`
934 instr format (OpReg ecx) (OpReg tmp)
935 return (Fixed format tmp code)
936
937 --------------------
938 add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
939 add_code rep x (CmmLit (CmmInt y _))
940 | is32BitInteger y = add_int rep x y
941 add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
942 where format = intFormat rep
943 -- TODO: There are other interesting patterns we want to replace
944 -- with a LEA, e.g. `(x + offset) + (y << shift)`.
945
946 --------------------
947 sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
948 sub_code rep x (CmmLit (CmmInt y _))
949 | is32BitInteger (-y) = add_int rep x (-y)
950 sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y
951
952 -- our three-operand add instruction:
953 add_int width x y = do
954 (x_reg, x_code) <- getSomeReg x
955 let
956 format = intFormat width
957 imm = ImmInt (fromInteger y)
958 code dst
959 = x_code `snocOL`
960 LEA format
961 (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
962 (OpReg dst)
963 --
964 return (Any format code)
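-- e.g. (a sketch) for @x + 4@ with x already in some register r this emits
-- roughly @lea 4(r), dst@, leaving r itself untouched.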
965
966 ----------------------
967
968 -- See Note [DIV/IDIV for bytes]
969 div_code W8 signed quotient x y = do
970 let widen | signed = MO_SS_Conv W8 W16
971 | otherwise = MO_UU_Conv W8 W16
972 div_code
973 W16
974 signed
975 quotient
976 (CmmMachOp widen [x])
977 (CmmMachOp widen [y])
978
979 div_code width signed quotient x y = do
980 (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
981 x_code <- getAnyReg x
982 let
983 format = intFormat width
984 widen | signed = CLTD format
985 | otherwise = XOR format (OpReg edx) (OpReg edx)
986
987 instr | signed = IDIV
988 | otherwise = DIV
989
990 code = y_code `appOL`
991 x_code eax `appOL`
992 toOL [widen, instr format y_op]
993
994 result | quotient = eax
995 | otherwise = edx
996
997 return (Fixed format result code)
998
999
1000 getRegister' _ _ (CmmLoad mem pk)
1001 | isFloatType pk
1002 = do
1003 Amode addr mem_code <- getAmode mem
1004 use_sse2 <- sse2Enabled
1005 loadFloatAmode use_sse2 (typeWidth pk) addr mem_code
1006
1007 getRegister' _ is32Bit (CmmLoad mem pk)
1008 | is32Bit && not (isWord64 pk)
1009 = do
1010 code <- intLoadCode instr mem
1011 return (Any format code)
1012 where
1013 width = typeWidth pk
1014 format = intFormat width
1015 instr = case width of
1016 W8 -> MOVZxL II8
1017 _other -> MOV format
1018 -- We always zero-extend 8-bit loads, if we
1019 -- can't think of anything better. This is because
1020 -- we can't guarantee access to an 8-bit variant of every register
1021 -- (esi and edi don't have 8-bit variants), so to make things
1022 -- simpler we do our 8-bit arithmetic with full 32-bit registers.
1023
1024 -- Simpler memory load code on x86_64
1025 getRegister' _ is32Bit (CmmLoad mem pk)
1026 | not is32Bit
1027 = do
1028 code <- intLoadCode (MOV format) mem
1029 return (Any format code)
1030 where format = intFormat $ typeWidth pk
1031
1032 getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
1033 = let
1034 format = intFormat width
1035
1036 -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
1037 format1 = if is32Bit then format
1038 else case format of
1039 II64 -> II32
1040 _ -> format
1041 code dst
1042 = unitOL (XOR format1 (OpReg dst) (OpReg dst))
1043 in
1044 return (Any format code)
1045
1046 -- optimisation for loading small literals on x86_64: take advantage
1047 -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
1048 -- instruction forms are shorter.
1049 getRegister' dflags is32Bit (CmmLit lit)
1050 | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit)
1051 = let
1052 imm = litToImm lit
1053 code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
1054 in
1055 return (Any II64 code)
1056 where
1057 isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
1058 isBigLit _ = False
1059 -- note1: not the same as (not.is32BitLit), because that checks for
1060 -- signed literals that fit in 32 bits, but we want unsigned
1061 -- literals here.
1062 -- note2: all labels are small, because we're assuming the
1063 -- small memory model (see gcc docs, -mcmodel=small).
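-- e.g. (illustrative) loading the W64 literal 5 emits roughly @movl $5, %eax@
-- instead of @movq $5, %rax@: writing the 32-bit register implicitly zeroes
-- the upper half, and the 32-bit encoding is shorter.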
1064
1065 getRegister' dflags _ (CmmLit lit)
1066 = do let format = cmmTypeFormat (cmmLitType dflags lit)
1067 imm = litToImm lit
1068 code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
1069 return (Any format code)
1070
1071 getRegister' _ _ other
1072 | isVecExpr other = needLlvm
1073 | otherwise = pprPanic "getRegister(x86)" (ppr other)
1074
1075
1076 intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
1077 -> NatM (Reg -> InstrBlock)
1078 intLoadCode instr mem = do
1079 Amode src mem_code <- getAmode mem
1080 return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))
1081
1082 -- Compute an expression into *any* register, adding the appropriate
1083 -- move instruction if necessary.
1084 getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
1085 getAnyReg expr = do
1086 r <- getRegister expr
1087 anyReg r
1088
1089 anyReg :: Register -> NatM (Reg -> InstrBlock)
1090 anyReg (Any _ code) = return code
1091 anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
1092
1093 -- A bit like getSomeReg, but we want a reg that can be byte-addressed.
1094 -- Fixed registers might not be byte-addressable, so we make sure we've
1095 -- got a temporary, inserting an extra reg copy if necessary.
1096 getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
1097 getByteReg expr = do
1098 is32Bit <- is32BitPlatform
1099 if is32Bit
1100 then do r <- getRegister expr
1101 case r of
1102 Any rep code -> do
1103 tmp <- getNewRegNat rep
1104 return (tmp, code tmp)
1105 Fixed rep reg code
1106 | isVirtualReg reg -> return (reg,code)
1107 | otherwise -> do
1108 tmp <- getNewRegNat rep
1109 return (tmp, code `snocOL` reg2reg rep reg tmp)
1110 -- ToDo: could optimise slightly by checking for
1111 -- byte-addressable real registers, but that will
1112 -- happen very rarely if at all.
1113 else getSomeReg expr -- all regs are byte-addressable on x86_64
1114
1115 -- Another variant: this time we want the result in a register that cannot
1116 -- be modified by code to evaluate an arbitrary expression.
1117 getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
1118 getNonClobberedReg expr = do
1119 dflags <- getDynFlags
1120 r <- getRegister expr
1121 case r of
1122 Any rep code -> do
1123 tmp <- getNewRegNat rep
1124 return (tmp, code tmp)
1125 Fixed rep reg code
1126 -- only certain regs can be clobbered
1127 | reg `elem` instrClobberedRegs (targetPlatform dflags)
1128 -> do
1129 tmp <- getNewRegNat rep
1130 return (tmp, code `snocOL` reg2reg rep reg tmp)
1131 | otherwise ->
1132 return (reg, code)
1133
1134 reg2reg :: Format -> Reg -> Reg -> Instr
1135 reg2reg format src dst
1136 | format == FF80 = GMOV src dst
1137 | otherwise = MOV format (OpReg src) (OpReg dst)
1138
1139
1140 --------------------------------------------------------------------------------
1141 getAmode :: CmmExpr -> NatM Amode
1142 getAmode e = do is32Bit <- is32BitPlatform
1143 getAmode' is32Bit e
1144
1145 getAmode' :: Bool -> CmmExpr -> NatM Amode
1146 getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags
1147 getAmode $ mangleIndexTree dflags r n
1148
1149 getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
1150 CmmLit displacement])
1151 | not is32Bit
1152 = return $ Amode (ripRel (litToImm displacement)) nilOL
1153
1154
1155 -- This is all just ridiculous, since it carefully undoes
1156 -- what mangleIndexTree has just done.
1157 getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)])
1158 | is32BitLit is32Bit lit
1159 -- ASSERT(rep == II32)???
1160 = do (x_reg, x_code) <- getSomeReg x
1161 let off = ImmInt (-(fromInteger i))
1162 return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
1163
1164 getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit])
1165 | is32BitLit is32Bit lit
1166 -- ASSERT(rep == II32)???
1167 = do (x_reg, x_code) <- getSomeReg x
1168 let off = litToImm lit
1169 return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
1170
1171 -- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
1172 -- recognised by the next rule.
1173 getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
1174 b@(CmmLit _)])
1175 = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a])
1176
1177 -- Matches: (x + offset) + (y << shift)
1178 getAmode' _ (CmmMachOp (MO_Add _) [CmmRegOff x offset,
1179 CmmMachOp (MO_Shl _)
1180 [y, CmmLit (CmmInt shift _)]])
1181 | shift == 0 || shift == 1 || shift == 2 || shift == 3
1182 = x86_complex_amode (CmmReg x) y shift (fromIntegral offset)
1183
1184 getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _)
1185 [y, CmmLit (CmmInt shift _)]])
1186 | shift == 0 || shift == 1 || shift == 2 || shift == 3
1187 = x86_complex_amode x y shift 0
1188
1189 getAmode' _ (CmmMachOp (MO_Add _)
1190 [x, CmmMachOp (MO_Add _)
1191 [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
1192 CmmLit (CmmInt offset _)]])
1193 | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
1194   && is32BitInteger offset
1195 = x86_complex_amode x y shift offset
1196
1197 getAmode' _ (CmmMachOp (MO_Add _) [x,y])
1198 = x86_complex_amode x y 0 0
1199
1200 getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
1201 = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
1202
1203 getAmode' _ expr = do
1204 (reg,code) <- getSomeReg expr
1205 return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
1206
1207 -- | Like 'getAmode', but on 32-bit use simple register addressing
1208 -- (i.e. no index register). This stops us from running out of
1209 -- registers on x86 when using instructions such as cmpxchg, which can
1210 -- use up to three virtual registers and one fixed register.
1211 getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode
1212 getSimpleAmode dflags is32Bit addr
1213 | is32Bit = do
1214 addr_code <- getAnyReg addr
1215 addr_r <- getNewRegNat (intFormat (wordWidth dflags))
1216 let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
1217 return $! Amode amode (addr_code addr_r)
1218 | otherwise = getAmode addr
1219
1220 x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
1221 x86_complex_amode base index shift offset
1222 = do (x_reg, x_code) <- getNonClobberedReg base
1223 -- x must be in a temp, because it has to stay live over y_code
1224 -- we could compare x_reg and y_reg and do something better here...
1225 (y_reg, y_code) <- getSomeReg index
1226 let
1227 code = x_code `appOL` y_code
1228 base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
1229 n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
1230 return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
1231 code)
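-- For example (a sketch): an index expression like @x + (y << 3) + 16@ is
-- matched by the rules in getAmode' above and becomes the single addressing
-- mode @AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg 8) (ImmInt 16)@,
-- i.e. base + index*8 + 16, with no separate shift or add instructions.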
1232
1233
1234
1235
1236 -- -----------------------------------------------------------------------------
1237 -- getOperand: sometimes any operand will do.
1238
1239 -- getNonClobberedOperand: the value of the operand will remain valid across
1240 -- the computation of an arbitrary expression, unless the expression
1241 -- is computed directly into a register which the operand refers to
1242 -- (see trivialCode where this function is used for an example).
1243
1244 getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
1245 getNonClobberedOperand (CmmLit lit) = do
1246 use_sse2 <- sse2Enabled
1247 if use_sse2 && isSuitableFloatingPointLit lit
1248 then do
1249 let CmmFloat _ w = lit
1250 Amode addr code <- memConstant (widthInBytes w) lit
1251 return (OpAddr addr, code)
1252 else do
1253
1254 is32Bit <- is32BitPlatform
1255 dflags <- getDynFlags
1256 if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
1257 then return (OpImm (litToImm lit), nilOL)
1258 else getNonClobberedOperand_generic (CmmLit lit)
1259
1260 getNonClobberedOperand (CmmLoad mem pk) = do
1261 is32Bit <- is32BitPlatform
1262 use_sse2 <- sse2Enabled
1263 if (not (isFloatType pk) || use_sse2)
1264 && (if is32Bit then not (isWord64 pk) else True)
1265 then do
1266 dflags <- getDynFlags
1267 let platform = targetPlatform dflags
1268 Amode src mem_code <- getAmode mem
1269 (src',save_code) <-
1270 if (amodeCouldBeClobbered platform src)
1271 then do
1272 tmp <- getNewRegNat (archWordFormat is32Bit)
1273 return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
1274 unitOL (LEA (archWordFormat is32Bit)
1275 (OpAddr src)
1276 (OpReg tmp)))
1277 else
1278 return (src, nilOL)
1279 return (OpAddr src', mem_code `appOL` save_code)
1280 else do
1281 getNonClobberedOperand_generic (CmmLoad mem pk)
1282
1283 getNonClobberedOperand e = getNonClobberedOperand_generic e
1284
1285 getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
1286 getNonClobberedOperand_generic e = do
1287 (reg, code) <- getNonClobberedReg e
1288 return (OpReg reg, code)
1289
1290 amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
1291 amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)
1292
1293 regClobbered :: Platform -> Reg -> Bool
1294 regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
1295 regClobbered _ _ = False
1296
1297 -- getOperand: the operand is not required to remain valid across the
1298 -- computation of an arbitrary expression.
1299 getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
1300
1301 getOperand (CmmLit lit) = do
1302 use_sse2 <- sse2Enabled
1303 if (use_sse2 && isSuitableFloatingPointLit lit)
1304 then do
1305 let CmmFloat _ w = lit
1306 Amode addr code <- memConstant (widthInBytes w) lit
1307 return (OpAddr addr, code)
1308 else do
1309
1310 is32Bit <- is32BitPlatform
1311 dflags <- getDynFlags
1312 if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
1313 then return (OpImm (litToImm lit), nilOL)
1314 else getOperand_generic (CmmLit lit)
1315
1316 getOperand (CmmLoad mem pk) = do
1317 is32Bit <- is32BitPlatform
1318 use_sse2 <- sse2Enabled
1319 if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
1320 then do
1321 Amode src mem_code <- getAmode mem
1322 return (OpAddr src, mem_code)
1323 else
1324 getOperand_generic (CmmLoad mem pk)
1325
1326 getOperand e = getOperand_generic e
1327
1328 getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
1329 getOperand_generic e = do
1330 (reg, code) <- getSomeReg e
1331 return (OpReg reg, code)
1332
1333 isOperand :: Bool -> CmmExpr -> Bool
1334 isOperand _ (CmmLoad _ _) = True
1335 isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
1336 || isSuitableFloatingPointLit lit
1337 isOperand _ _ = False
1338
1339 -- | Given a 'Register', produce a new 'Register' with an instruction block
1340 -- which will check the value for alignment. Used for @-falignment-sanitisation@.
1341 addAlignmentCheck :: Int -> Register -> Register
1342 addAlignmentCheck align reg =
1343 case reg of
1344 Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
1345 Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt reg)
1346 where
1347 check :: Format -> Reg -> InstrBlock
1348 check fmt reg =
1349 ASSERT(not $ isFloatFormat fmt)
1350 toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
1351 , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
1352 ]
1353
1354 memConstant :: Int -> CmmLit -> NatM Amode
1355 memConstant align lit = do
1356 lbl <- getNewLabelNat
1357 let rosection = Section ReadOnlyData lbl
1358 dflags <- getDynFlags
1359 (addr, addr_code) <- if target32Bit (targetPlatform dflags)
1360 then do dynRef <- cmmMakeDynamicReference
1361 dflags
1362 DataReference
1363 lbl
1364 Amode addr addr_code <- getAmode dynRef
1365 return (addr, addr_code)
1366 else return (ripRel (ImmCLbl lbl), nilOL)
1367 let code =
1368 LDATA rosection (align, Statics lbl [CmmStaticLit lit])
1369 `consOL` addr_code
1370 return (Amode addr code)
1371
1372
1373 loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register
1374 loadFloatAmode use_sse2 w addr addr_code = do
1375 let format = floatFormat w
1376 code dst = addr_code `snocOL`
1377 if use_sse2
1378 then MOV format (OpAddr addr) (OpReg dst)
1379 else GLD format addr dst
1380 return (Any (if use_sse2 then format else FF80) code)
1381
1382
1383 -- if we want a floating-point literal as an operand, we can
1384 -- use it directly from memory. However, if the literal is
1385 -- zero, we're better off generating it into a register using
1386 -- xor.
1387 isSuitableFloatingPointLit :: CmmLit -> Bool
1388 isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
1389 isSuitableFloatingPointLit _ = False
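-- e.g. (a sketch, SSE2) the literal 2.5 :: Double is referenced directly from
-- a read-only section via memConstant, roughly @movsd lbl(%rip), %xmm_n@,
-- while 0.0 is handled by the xor case in getRegister' above.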
1390
1391 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
1392 getRegOrMem e@(CmmLoad mem pk) = do
1393 is32Bit <- is32BitPlatform
1394 use_sse2 <- sse2Enabled
1395 if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
1396 then do
1397 Amode src mem_code <- getAmode mem
1398 return (OpAddr src, mem_code)
1399 else do
1400 (reg, code) <- getNonClobberedReg e
1401 return (OpReg reg, code)
1402 getRegOrMem e = do
1403 (reg, code) <- getNonClobberedReg e
1404 return (OpReg reg, code)
1405
1406 is32BitLit :: Bool -> CmmLit -> Bool
1407 is32BitLit is32Bit (CmmInt i W64)
1408 | not is32Bit
1409 = -- assume that labels are in the range 0-2^31-1: this assumes the
1410 -- small memory model (see gcc docs, -mcmodel=small).
1411 is32BitInteger i
1412 is32BitLit _ _ = True
1413
1414
1415
1416
1417 -- Set up a condition code for a conditional branch.
1418
1419 getCondCode :: CmmExpr -> NatM CondCode
1420
1421 -- yes, they really do seem to want exactly the same!
1422
1423 getCondCode (CmmMachOp mop [x, y])
1424 =
1425 case mop of
1426 MO_F_Eq W32 -> condFltCode EQQ x y
1427 MO_F_Ne W32 -> condFltCode NE x y
1428 MO_F_Gt W32 -> condFltCode GTT x y
1429 MO_F_Ge W32 -> condFltCode GE x y
1430 -- Invert comparison condition and swap operands
1431 -- See Note [SSE Parity Checks]
1432 MO_F_Lt W32 -> condFltCode GTT y x
1433 MO_F_Le W32 -> condFltCode GE y x
1434
1435 MO_F_Eq W64 -> condFltCode EQQ x y
1436 MO_F_Ne W64 -> condFltCode NE x y
1437 MO_F_Gt W64 -> condFltCode GTT x y
1438 MO_F_Ge W64 -> condFltCode GE x y
1439 MO_F_Lt W64 -> condFltCode GTT y x
1440 MO_F_Le W64 -> condFltCode GE y x
1441
1442 _ -> condIntCode (machOpToCond mop) x y
1443
1444 getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other)
1445
1446 machOpToCond :: MachOp -> Cond
1447 machOpToCond mo = case mo of
1448 MO_Eq _ -> EQQ
1449 MO_Ne _ -> NE
1450 MO_S_Gt _ -> GTT
1451 MO_S_Ge _ -> GE
1452 MO_S_Lt _ -> LTT
1453 MO_S_Le _ -> LE
1454 MO_U_Gt _ -> GU
1455 MO_U_Ge _ -> GEU
1456 MO_U_Lt _ -> LU
1457 MO_U_Le _ -> LEU
1458 _other -> pprPanic "machOpToCond" (pprMachOp mo)
1459
1460
1461 -- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
1462 -- passed back up the tree.
1463
1464 condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1465 condIntCode cond x y = do is32Bit <- is32BitPlatform
1466 condIntCode' is32Bit cond x y
1467
1468 condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1469
1470 -- memory vs immediate
1471 condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
1472 | is32BitLit is32Bit lit = do
1473 Amode x_addr x_code <- getAmode x
1474 let
1475 imm = litToImm lit
1476 code = x_code `snocOL`
1477 CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
1478 --
1479 return (CondCode False cond code)
1480
1481 -- anything vs zero, using a mask
1482 -- TODO: Add some sanity checking!!!!
1483 condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
1484 | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
1485 = do
1486 (x_reg, x_code) <- getSomeReg x
1487 let
1488 code = x_code `snocOL`
1489 TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
1490 --
1491 return (CondCode False cond code)
1492
1493 -- anything vs zero
1494 condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
1495 (x_reg, x_code) <- getSomeReg x
1496 let
1497 code = x_code `snocOL`
1498 TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
1499 --
1500 return (CondCode False cond code)
1501
1502 -- anything vs operand
1503 condIntCode' is32Bit cond x y
1504 | isOperand is32Bit y = do
1505 dflags <- getDynFlags
1506 (x_reg, x_code) <- getNonClobberedReg x
1507 (y_op, y_code) <- getOperand y
1508 let
1509 code = x_code `appOL` y_code `snocOL`
1510 CMP (cmmTypeFormat (cmmExprType dflags x)) y_op (OpReg x_reg)
1511 return (CondCode False cond code)
1512 -- operand vs. anything: invert the comparison so that we can use a
1513 -- single comparison instruction.
1514 | isOperand is32Bit x
1515 , Just revcond <- maybeFlipCond cond = do
1516 dflags <- getDynFlags
1517 (y_reg, y_code) <- getNonClobberedReg y
1518 (x_op, x_code) <- getOperand x
1519 let
1520 code = y_code `appOL` x_code `snocOL`
1521 CMP (cmmTypeFormat (cmmExprType dflags x)) x_op (OpReg y_reg)
1522 return (CondCode False revcond code)
1523
1524 -- anything vs anything
1525 condIntCode' _ cond x y = do
1526 dflags <- getDynFlags
1527 (y_reg, y_code) <- getNonClobberedReg y
1528 (x_op, x_code) <- getRegOrMem x
1529 let
1530 code = y_code `appOL`
1531 x_code `snocOL`
1532 CMP (cmmTypeFormat (cmmExprType dflags x)) (OpReg y_reg) x_op
1533 return (CondCode False cond code)
1534
1535
1536
1537 --------------------------------------------------------------------------------
1538 condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1539
1540 condFltCode cond x y
1541 = if_sse2 condFltCode_sse2 condFltCode_x87
1542 where
1543
1544 condFltCode_x87
1545 = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
1546 (x_reg, x_code) <- getNonClobberedReg x
1547 (y_reg, y_code) <- getSomeReg y
1548 let
1549 code = x_code `appOL` y_code `snocOL`
1550 GCMP cond x_reg y_reg
1551 -- The GCMP insn does the test and sets the zero flag if comparable
1552 -- and true. Hence we always supply EQQ as the condition to test.
1553 return (CondCode True EQQ code)
1554
1555 -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
1556 -- an operand, but the right must be a reg. We can probably do better
1557 -- than this general case...
1558 condFltCode_sse2 = do
1559 dflags <- getDynFlags
1560 (x_reg, x_code) <- getNonClobberedReg x
1561 (y_op, y_code) <- getOperand y
1562 let
1563 code = x_code `appOL`
1564 y_code `snocOL`
1565 CMP (floatFormat $ cmmExprWidth dflags x) y_op (OpReg x_reg)
1566 -- NB(1): we need to use the unsigned comparison operators on the
1567 -- result of this comparison.
1568 return (CondCode True (condToUnsigned cond) code)
1569
1570 -- -----------------------------------------------------------------------------
1571 -- Generating assignments
1572
1573 -- Assignments are really at the heart of the whole code generation
1574 -- business. Almost all top-level nodes of any real importance are
1575 -- assignments, which correspond to loads, stores, or register
1576 -- transfers. If we're really lucky, some of the register transfers
1577 -- will go away, because we can use the destination register to
1578 -- complete the code generation for the right hand side. This only
1579 -- fails when the right hand side is forced into a fixed register
1580 -- (e.g. the result of a call).
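-- For example (a sketch): for @r := x + 1@ the right-hand side typically comes
-- back as @Any fmt code@, and assignReg_IntCode just applies @code@ to r's
-- register, so no extra move is needed; only when the RHS is Fixed (say a call
-- result pinned to %rax) does getAnyReg insert an explicit reg-to-reg move.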
1581
1582 assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
1583 assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
1584
1585 assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
1586 assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
1587
1588
1589 -- integer assignment to memory
1590
1591 -- specific case of adding/subtracting an integer to a particular address.
1592 -- ToDo: catch other cases where we can use an operation directly on a memory
1593 -- address.
1594 assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
1595 CmmLit (CmmInt i _)])
1596 | addr == addr2, pk /= II64 || is32BitInteger i,
1597 Just instr <- check op
1598 = do Amode amode code_addr <- getAmode addr
1599 let code = code_addr `snocOL`
1600 instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
1601 return code
1602 where
1603 check (MO_Add _) = Just ADD
1604 check (MO_Sub _) = Just SUB
1605 check _ = Nothing
1606 -- ToDo: more?
1607
1608 -- general case
1609 assignMem_IntCode pk addr src = do
1610 is32Bit <- is32BitPlatform
1611 Amode addr code_addr <- getAmode addr
1612 (code_src, op_src) <- get_op_RI is32Bit src
1613 let
1614 code = code_src `appOL`
1615 code_addr `snocOL`
1616 MOV pk op_src (OpAddr addr)
1617 -- NOTE: op_src is stable, so it will still be valid
1618 -- after code_addr. This may involve the introduction
1619 -- of an extra MOV to a temporary register, but we hope
1620 -- the register allocator will get rid of it.
1621 --
1622 return code
1623 where
1624 get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand) -- code, operand
1625 get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
1626 = return (nilOL, OpImm (litToImm lit))
1627 get_op_RI _ op
1628 = do (reg,code) <- getNonClobberedReg op
1629 return (code, OpReg reg)
1630
1631
1632 -- Assign; dst is a reg, rhs is mem
1633 assignReg_IntCode pk reg (CmmLoad src _) = do
1634 load_code <- intLoadCode (MOV pk) src
1635 dflags <- getDynFlags
1636 let platform = targetPlatform dflags
1637 return (load_code (getRegisterReg platform False{-no sse2-} reg))
1638
1639 -- dst is a reg, but src could be anything
1640 assignReg_IntCode _ reg src = do
1641 dflags <- getDynFlags
1642 let platform = targetPlatform dflags
1643 code <- getAnyReg src
1644 return (code (getRegisterReg platform False{-no sse2-} reg))
1645
1646
1647 -- Floating point assignment to memory
1648 assignMem_FltCode pk addr src = do
1649 (src_reg, src_code) <- getNonClobberedReg src
1650 Amode addr addr_code <- getAmode addr
1651 use_sse2 <- sse2Enabled
1652 let
1653 code = src_code `appOL`
1654 addr_code `snocOL`
1655 if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr)
1656 else GST pk src_reg addr
1657 return code
1658
1659 -- Floating point assignment to a register/temporary
1660 assignReg_FltCode _ reg src = do
1661 use_sse2 <- sse2Enabled
1662 src_code <- getAnyReg src
1663 dflags <- getDynFlags
1664 let platform = targetPlatform dflags
1665 return (src_code (getRegisterReg platform use_sse2 reg))
1666
1667
1668 genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
1669
1670 genJump (CmmLoad mem _) regs = do
1671 Amode target code <- getAmode mem
1672 return (code `snocOL` JMP (OpAddr target) regs)
1673
1674 genJump (CmmLit lit) regs = do
1675 return (unitOL (JMP (OpImm (litToImm lit)) regs))
1676
1677 genJump expr regs = do
1678 (reg,code) <- getSomeReg expr
1679 return (code `snocOL` JMP (OpReg reg) regs)
1680
1681
1682 -- -----------------------------------------------------------------------------
1683 -- Unconditional branches
1684
1685 genBranch :: BlockId -> InstrBlock
1686 genBranch = toOL . mkJumpInstr
1687
1688
1689
1690 -- -----------------------------------------------------------------------------
1691 -- Conditional jumps/branches
1692
1693 {-
1694 Conditional jumps are always to local labels, so we can use branch
1695 instructions. We peek at the arguments to decide what kind of
1696 comparison to do.
1697
1698 I386: First, we have to ensure that the condition
1699 codes are set according to the supplied comparison operation.
1700 -}
1701
1702
1703 genCondBranch
1704 :: BlockId -- the source of the jump
1705 -> BlockId -- the true branch target
1706 -> BlockId -- the false branch target
1707 -> CmmExpr -- the condition on which to branch
1708 -> NatM InstrBlock -- Instructions
1709
1710 genCondBranch bid id false expr = do
1711 is32Bit <- is32BitPlatform
1712 genCondBranch' is32Bit bid id false expr
1713
1714 -- | We return the instructions generated.
1715 genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
1716 -> NatM InstrBlock
1717
1718 -- 64-bit integer comparisons on 32-bit
1719 genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
1720 | is32Bit, Just W64 <- maybeIntComparison mop = do
1721 ChildCode64 code1 r1_lo <- iselExpr64 e1
1722 ChildCode64 code2 r2_lo <- iselExpr64 e2
1723 let r1_hi = getHiVRegFromLo r1_lo
1724 r2_hi = getHiVRegFromLo r2_lo
1725 cond = machOpToCond mop
1726 Just cond' = maybeFlipCond cond
1727 --TODO: Update CFG for x86
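-- Compare the high words first: if they already decide the ordering we
-- branch straight away; only when they are equal (neither jump taken)
-- do we fall through and compare the low words.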
1728 let code = code1 `appOL` code2 `appOL` toOL [
1729 CMP II32 (OpReg r2_hi) (OpReg r1_hi),
1730 JXX cond true,
1731 JXX cond' false,
1732 CMP II32 (OpReg r2_lo) (OpReg r1_lo),
1733 JXX cond true] `appOL` genBranch false
1734 return code
1735
1736 genCondBranch' _ bid id false bool = do
1737 CondCode is_float cond cond_code <- getCondCode bool
1738 use_sse2 <- sse2Enabled
1739 if not is_float || not use_sse2
1740 then
1741 return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
1742 else do
1743 -- See Note [SSE Parity Checks]
1744 let jmpFalse = genBranch false
1745 code
1746 = case cond of
1747 NE -> or_unordered
1748 GU -> plain_test
1749 GEU -> plain_test
1750 -- Use ASSERT so we don't break releases if
1751 -- LTT/LE creep in somehow.
1752 LTT ->
1753 ASSERT2(False, ppr "Should have been turned into >")
1754 and_ordered
1755 LE ->
1756 ASSERT2(False, ppr "Should have been turned into >=")
1757 and_ordered
1758 _ -> and_ordered
1759
1760 plain_test = unitOL (
1761 JXX cond id
1762 ) `appOL` jmpFalse
1763 or_unordered = toOL [
1764 JXX cond id,
1765 JXX PARITY id
1766 ] `appOL` jmpFalse
1767 and_ordered = toOL [
1768 JXX PARITY false,
1769 JXX cond id,
1770 JXX ALWAYS false
1771 ]
1772 updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
1773 return (cond_code `appOL` code)
1774
1775 -- -----------------------------------------------------------------------------
1776 -- Generating C calls
1777
1778 -- Now the biggest nightmare---calls. Most of the nastiness is buried in
1779 -- @get_arg@, which moves the arguments to the correct registers/stack
1780 -- locations. Apart from that, the code is easy.
1781 --
1782 -- (If applicable) Do not fill the delay slots here; you will confuse the
1783 -- register allocator.
1784
1785 genCCall
1786 :: DynFlags
1787 -> Bool -- 32 bit platform?
1788 -> ForeignTarget -- function to call
1789 -> [CmmFormal] -- where to put the result
1790 -> [CmmActual] -- arguments (of mixed type)
1791 -> BlockId -- The block we are in
1792 -> NatM InstrBlock
1793
1794 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1795
1796 -- Unroll memcpy calls if the source and destination pointers are at
1797 -- least DWORD aligned and the number of bytes to copy isn't too
1798 -- large. Otherwise, call C's memcpy.
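-- As an illustrative sketch, a 5-byte copy with 4-byte moves unrolls to
-- roughly
--     MOV II32    (src) tmp ; MOV II32 tmp  (dst)      -- bytes 0..3
--     MOVZxL II8 4(src) tmp ; MOV II8  tmp 4(dst)      -- byte  4
-- rather than a call to memcpy.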
1799 genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
1800 [dst, src, CmmLit (CmmInt n _)] _
1801 | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do
1802 code_dst <- getAnyReg dst
1803 dst_r <- getNewRegNat format
1804 code_src <- getAnyReg src
1805 src_r <- getNewRegNat format
1806 tmp_r <- getNewRegNat format
1807 return $ code_dst dst_r `appOL` code_src src_r `appOL`
1808 go dst_r src_r tmp_r (fromInteger n)
1809 where
1810 -- The number of instructions we will generate (approx). We need 2
1811 -- instructions per move.
1812 insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
1813
1814 format = if align .&. 4 /= 0 then II32 else (archWordFormat is32Bit)
1815
1816 -- The size of each move, in bytes.
1817 sizeBytes :: Integer
1818 sizeBytes = fromIntegral (formatInBytes format)
1819
1820 go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
1821 go dst src tmp i
1822 | i >= sizeBytes =
1823 unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
1824 unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
1825 go dst src tmp (i - sizeBytes)
1826 -- Deal with remaining bytes.
1827 | i >= 4 = -- Will never happen on 32-bit
1828 unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
1829 unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
1830 go dst src tmp (i - 4)
1831 | i >= 2 =
1832 unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
1833 unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
1834 go dst src tmp (i - 2)
1835 | i >= 1 =
1836 unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
1837 unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
1838 go dst src tmp (i - 1)
1839 | otherwise = nilOL
1840 where
1841 src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
1842 (ImmInteger (n - i))
1843 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
1844 (ImmInteger (n - i))
1845
1846 genCCall dflags _ (PrimTarget (MO_Memset align)) _
1847 [dst,
1848 CmmLit (CmmInt c _),
1849 CmmLit (CmmInt n _)]
1850 _
1851 | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 3 == 0 = do
1852 code_dst <- getAnyReg dst
1853 dst_r <- getNewRegNat format
1854 return $ code_dst dst_r `appOL` go dst_r (fromInteger n)
1855 where
1856 (format, val) = case align .&. 3 of
1857 2 -> (II16, c2)
1858 0 -> (II32, c4)
1859 _ -> (II8, c)
1860 c2 = c `shiftL` 8 .|. c
1861 c4 = c2 `shiftL` 16 .|. c2
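-- e.g. for a byte value c = 0xAB this gives c2 = 0xABAB and
-- c4 = 0xABABABAB, i.e. the fill byte replicated to the store width.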
1862
1863 -- The number of instructions we will generate (approx). We need 1
1864 -- instruction per move.
1865 insns = (n + sizeBytes - 1) `div` sizeBytes
1866
1867 -- The size of each move, in bytes.
1868 sizeBytes :: Integer
1869 sizeBytes = fromIntegral (formatInBytes format)
1870
1871 go :: Reg -> Integer -> OrdList Instr
1872 go dst i
1873 -- TODO: Add movabs instruction and support 64-bit sets.
1874 | i >= sizeBytes = -- This might be smaller than the below sizes
1875 unitOL (MOV format (OpImm (ImmInteger val)) (OpAddr dst_addr)) `appOL`
1876 go dst (i - sizeBytes)
1877 | i >= 4 = -- Will never happen on 32-bit
1878 unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr dst_addr)) `appOL`
1879 go dst (i - 4)
1880 | i >= 2 =
1881 unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr dst_addr)) `appOL`
1882 go dst (i - 2)
1883 | i >= 1 =
1884 unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr dst_addr)) `appOL`
1885 go dst (i - 1)
1886 | otherwise = nilOL
1887 where
1888 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
1889 (ImmInteger (n - i))
1890
1891 genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
1892 -- write barrier compiles to no code on x86/x86-64;
1893 -- we keep it this long in order to prevent earlier optimisations.
1894
1895 genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
1896
1897 genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
1898 case n of
1899 0 -> genPrefetch src $ PREFETCH NTA format
1900 1 -> genPrefetch src $ PREFETCH Lvl2 format
1901 2 -> genPrefetch src $ PREFETCH Lvl1 format
1902 3 -> genPrefetch src $ PREFETCH Lvl0 format
1903 l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
1904 -- the C / LLVM prefetch convention is 0, 1, 2, and 3;
1905 -- the corresponding x86 hints are NTA, Lvl2, Lvl1, and Lvl0
1906 where
1907 format = archWordFormat is32bit
1908 -- need to know the register width for pointers!
1909 genPrefetch inRegSrc prefetchCTor =
1910 do
1911 code_src <- getAnyReg inRegSrc
1912 src_r <- getNewRegNat format
1913 return $ code_src src_r `appOL`
1914 (unitOL (prefetchCTor (OpAddr
1915 ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
1916 -- prefetch always takes an address
1917
1918 genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
1919 let platform = targetPlatform dflags
1920 let dst_r = getRegisterReg platform False (CmmLocal dst)
1921 case width of
1922 W64 | is32Bit -> do
1923 ChildCode64 vcode rlo <- iselExpr64 src
1924 let dst_rhi = getHiVRegFromLo dst_r
1925 rhi = getHiVRegFromLo rlo
1926 return $ vcode `appOL`
1927 toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
1928 MOV II32 (OpReg rhi) (OpReg dst_r),
1929 BSWAP II32 dst_rhi,
1930 BSWAP II32 dst_r ]
1931 W16 -> do code_src <- getAnyReg src
1932 return $ code_src dst_r `appOL`
1933 unitOL (BSWAP II32 dst_r) `appOL`
1934 unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
1935 _ -> do code_src <- getAnyReg src
1936 return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
1937 where
1938 format = intFormat width
1939
1940 genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
1941 args@[src] bid = do
1942 sse4_2 <- sse4_2Enabled
1943 let platform = targetPlatform dflags
1944 if sse4_2
1945 then do code_src <- getAnyReg src
1946 src_r <- getNewRegNat format
1947 let dst_r = getRegisterReg platform False (CmmLocal dst)
1948 return $ code_src src_r `appOL`
1949 (if width == W8 then
1950 -- The POPCNT instruction doesn't take a r/m8
1951 unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
1952 unitOL (POPCNT II16 (OpReg src_r) dst_r)
1953 else
1954 unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
1955 (if width == W8 || width == W16 then
1956 -- We used a 16-bit destination register above,
1957 -- so zero-extend
1958 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
1959 else nilOL)
1960 else do
1961 targetExpr <- cmmMakeDynamicReference dflags
1962 CallReference lbl
1963 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
1964 [NoHint] [NoHint]
1965 CmmMayReturn)
1966 genCCall dflags is32Bit target dest_regs args bid
1967 where
1968 format = intFormat width
1969 lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
1970
1971 genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
1972 args@[src, mask] bid = do
1973 let platform = targetPlatform dflags
1974 if isBmi2Enabled dflags
1975 then do code_src <- getAnyReg src
1976 code_mask <- getAnyReg mask
1977 src_r <- getNewRegNat format
1978 mask_r <- getNewRegNat format
1979 let dst_r = getRegisterReg platform False (CmmLocal dst)
1980 return $ code_src src_r `appOL` code_mask mask_r `appOL`
1981 (if width == W8 then
1982 -- The PDEP instruction doesn't take a r/m8
1983 unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
1984 unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
1985 unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
1986 else
1987 unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
1988 (if width == W8 || width == W16 then
1989 -- We used a 16-bit destination register above,
1990 -- so zero-extend
1991 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
1992 else nilOL)
1993 else do
1994 targetExpr <- cmmMakeDynamicReference dflags
1995 CallReference lbl
1996 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
1997 [NoHint] [NoHint]
1998 CmmMayReturn)
1999 genCCall dflags is32Bit target dest_regs args bid
2000 where
2001 format = intFormat width
2002 lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
2003
2004 genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
2005 args@[src, mask] bid = do
2006 let platform = targetPlatform dflags
2007 if isBmi2Enabled dflags
2008 then do code_src <- getAnyReg src
2009 code_mask <- getAnyReg mask
2010 src_r <- getNewRegNat format
2011 mask_r <- getNewRegNat format
2012 let dst_r = getRegisterReg platform False (CmmLocal dst)
2013 return $ code_src src_r `appOL` code_mask mask_r `appOL`
2014 (if width == W8 then
2015 -- The PEXT instruction doesn't take a r/m8
2016 unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
2017 unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
2018 unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
2019 else
2020 unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
2021 (if width == W8 || width == W16 then
2022 -- We used a 16-bit destination register above,
2023 -- so zero-extend
2024 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
2025 else nilOL)
2026 else do
2027 targetExpr <- cmmMakeDynamicReference dflags
2028 CallReference lbl
2029 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2030 [NoHint] [NoHint]
2031 CmmMayReturn)
2032 genCCall dflags is32Bit target dest_regs args bid
2033 where
2034 format = intFormat width
2035 lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
2036
2037 genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
2038 | is32Bit && width == W64 = do
2039 -- Fallback to `hs_clz64` on i386
2040 targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
2041 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2042 [NoHint] [NoHint]
2043 CmmMayReturn)
2044 genCCall dflags is32Bit target dest_regs args bid
2045
2046 | otherwise = do
2047 code_src <- getAnyReg src
2048 src_r <- getNewRegNat format
2049 tmp_r <- getNewRegNat format
2050 let dst_r = getRegisterReg platform False (CmmLocal dst)
2051
2052 -- The following insn sequence makes sure 'clz 0' has a defined value.
2053 -- Starting with Haswell, one could use the LZCNT insn instead.
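-- How it works (sketch): for src /= 0, BSR yields the index of the
-- highest set bit, and XOR-ing that with (bw-1) gives bw-1-BSR(src),
-- i.e. the leading-zero count.  For src == 0 the CMOV leaves dst at
-- 2*bw-1, which the final XOR turns into bw, so 'clz 0' = bw.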
2054 return $ code_src src_r `appOL` toOL
2055 ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
2056 [ BSR format (OpReg src_r) tmp_r
2057 , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
2058 , CMOV NE format (OpReg tmp_r) dst_r
2059 , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
2060 ]) -- NB: We don't need to zero-extend the result for the
2061 -- W8/W16 cases because the 'MOV' insn already
2062 -- took care of implicitly clearing the upper bits
2063 where
2064 bw = widthInBits width
2065 platform = targetPlatform dflags
2066 format = if width == W8 then II16 else intFormat width
2067 lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
2068
2069 genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
2070 | is32Bit, width == W64 = do
2071 ChildCode64 vcode rlo <- iselExpr64 src
2072 let rhi = getHiVRegFromLo rlo
2073 dst_r = getRegisterReg platform False (CmmLocal dst)
2074 lbl1 <- getBlockIdNat
2075 lbl2 <- getBlockIdNat
2076 tmp_r <- getNewRegNat format
2077
2078 -- New CFG Edges:
2079 -- bid -> lbl2
2080 -- bid -> lbl1 -> lbl2
2081 -- We also change edges originating at bid to start at lbl2 instead.
2082 updateCfgNat (addWeightEdge bid lbl1 110 .
2083 addWeightEdge lbl1 lbl2 110 .
2084 addImmediateSuccessor bid lbl2)
2085
2086 -- The following instruction sequence corresponds to the pseudo-code
2087 --
2088 -- if (src) {
2089 -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
2090 -- } else {
2091 -- dst = 64;
2092 -- }
2093 return $ vcode `appOL` toOL
2094 ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
2095 , OR II32 (OpReg rlo) (OpReg tmp_r)
2096 , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
2097 , JXX EQQ lbl2
2098 , JXX ALWAYS lbl1
2099
2100 , NEWBLOCK lbl1
2101 , BSF II32 (OpReg rhi) dst_r
2102 , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
2103 , BSF II32 (OpReg rlo) tmp_r
2104 , CMOV NE II32 (OpReg tmp_r) dst_r
2105 , JXX ALWAYS lbl2
2106
2107 , NEWBLOCK lbl2
2108 ])
2109
2110 | otherwise = do
2111 code_src <- getAnyReg src
2112 src_r <- getNewRegNat format
2113 tmp_r <- getNewRegNat format
2114 let dst_r = getRegisterReg platform False (CmmLocal dst)
2115
2116 -- The following insn sequence makes sure 'ctz 0' has a defined value.
2117 -- Starting with Haswell, one could use the TZCNT insn instead.
2118 return $ code_src src_r `appOL` toOL
2119 ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
2120 [ BSF format (OpReg src_r) tmp_r
2121 , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
2122 , CMOV NE format (OpReg tmp_r) dst_r
2123 ]) -- NB: We don't need to zero-extend the result for the
2124 -- W8/W16 cases because the 'MOV' insn already
2125 -- took care of implicitly clearing the upper bits
2126 where
2127 bw = widthInBits width
2128 platform = targetPlatform dflags
2129 format = if width == W8 then II16 else intFormat width
2130
2131 genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
2132 targetExpr <- cmmMakeDynamicReference dflags
2133 CallReference lbl
2134 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2135 [NoHint] [NoHint]
2136 CmmMayReturn)
2137 genCCall dflags is32Bit target dest_regs args bid
2138 where
2139 lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width))
2140
2141 genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
2142 [dst] [addr, n] bid = do
2143 Amode amode addr_code <-
2144 if amop `elem` [AMO_Add, AMO_Sub]
2145 then getAmode addr
2146 else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
2147 arg <- getNewRegNat format
2148 arg_code <- getAnyReg n
2149 use_sse2 <- sse2Enabled
2150 let platform = targetPlatform dflags
2151 dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
2152 code <- op_code dst_r arg amode
2153 return $ addr_code `appOL` arg_code arg `appOL` code
2154 where
2155 -- Code for the operation
2156 op_code :: Reg -- Destination reg
2157 -> Reg -- Register containing argument
2158 -> AddrMode -- Address of location to mutate
2159 -> NatM (OrdList Instr)
2160 op_code dst_r arg amode = case amop of
2161 -- In the common case where dst_r is a virtual register the
2162 -- final move should go away, because it's the last use of arg
2163 -- and the first use of dst_r.
2164 AMO_Add -> return $ toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
2165 , MOV format (OpReg arg) (OpReg dst_r)
2166 ]
2167 AMO_Sub -> return $ toOL [ NEGI format (OpReg arg)
2168 , LOCK (XADD format (OpReg arg) (OpAddr amode))
2169 , MOV format (OpReg arg) (OpReg dst_r)
2170 ]
2171 AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
2172 AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
2173 , NOT format dst
2174 ])
2175 AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
2176 AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
2177 where
2178 -- Simulate operation that lacks a dedicated instruction using
2179 -- cmpxchg.
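-- The generated code is, roughly, the following retry loop (CMPXCHG
-- compares %eax against the memory cell and, on failure, reloads %eax
-- with the value it found there):
--
--        %eax = *addr
--   retry:
--        dst  = %eax              -- old value, which we return
--        tmp  = %eax `op` arg
--        LOCK CMPXCHG tmp, *addr
--        jne  retry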
2180 cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
2181 -> NatM (OrdList Instr)
2182 cmpxchg_code instrs = do
2183 lbl <- getBlockIdNat
2184 tmp <- getNewRegNat format
2185
2186 -- Record inserted blocks
2187 addImmediateSuccessorNat bid lbl
2188 updateCfgNat (addWeightEdge lbl lbl 0)
2189
2190 return $ toOL
2191 [ MOV format (OpAddr amode) (OpReg eax)
2192 , JXX ALWAYS lbl
2193 , NEWBLOCK lbl
2194 -- Keep old value so we can return it:
2195 , MOV format (OpReg eax) (OpReg dst_r)
2196 , MOV format (OpReg eax) (OpReg tmp)
2197 ]
2198 `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
2199 [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
2200 , JXX NE lbl
2201 ]
2202
2203 format = intFormat width
2204
2205 genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
2206 load_code <- intLoadCode (MOV (intFormat width)) addr
2207 let platform = targetPlatform dflags
2208 use_sse2 <- sse2Enabled
2209 return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
2210
2211 genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
2212 code <- assignMem_IntCode (intFormat width) addr val
2213 return $ code `snocOL` MFENCE
2214
2215 genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
2216 -- On x86 we don't have enough registers to use cmpxchg with a
2217 -- complicated addressing mode, so on that architecture we
2218 -- pre-compute the address first.
2219 Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
2220 newval <- getNewRegNat format
2221 newval_code <- getAnyReg new
2222 oldval <- getNewRegNat format
2223 oldval_code <- getAnyReg old
2224 use_sse2 <- sse2Enabled
2225 let platform = targetPlatform dflags
2226 dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
2227 code = toOL
2228 [ MOV format (OpReg oldval) (OpReg eax)
2229 , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
2230 , MOV format (OpReg eax) (OpReg dst_r)
2231 ]
2232 return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
2233 `appOL` code
2234 where
2235 format = intFormat width
2236
2237 genCCall _ is32Bit target dest_regs args bid = do
2238 dflags <- getDynFlags
2239 let platform = targetPlatform dflags
2240 sse2 = isSse2Enabled dflags
2241 case (target, dest_regs) of
2242 -- void return type prim op
2243 (PrimTarget op, []) ->
2244 outOfLineCmmOp bid op Nothing args
2245 -- we only cope with a single result for foreign calls
2246 (PrimTarget op, [r])
2247 | sse2 -> case op of
2248 MO_F32_Fabs -> case args of
2249 [x] -> sse2FabsCode W32 x
2250 _ -> panic "genCCall: Wrong number of arguments for fabs"
2251 MO_F64_Fabs -> case args of
2252 [x] -> sse2FabsCode W64 x
2253 _ -> panic "genCCall: Wrong number of arguments for fabs"
2254
2255 MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
2256 MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
2257 _other_op -> outOfLineCmmOp bid op (Just r) args
2258 | otherwise -> do
2259 l1 <- getNewLabelNat
2260 l2 <- getNewLabelNat
2261 if sse2
2262 then outOfLineCmmOp bid op (Just r) args
2263 else case op of
2264 MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
2265 MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args
2266
2267 MO_F32_Sin -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF32 args
2268 MO_F64_Sin -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF64 args
2269
2270 MO_F32_Cos -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF32 args
2271 MO_F64_Cos -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF64 args
2272
2273 MO_F32_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args
2274 MO_F64_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args
2275
2276 _other_op -> outOfLineCmmOp bid op (Just r) args
2277
2278 where
2279 actuallyInlineFloatOp = actuallyInlineFloatOp' False
2280 actuallyInlineSSE2Op = actuallyInlineFloatOp' True
2281
2282 actuallyInlineFloatOp' usesSSE instr format [x]
2283 = do res <- trivialUFCode format (instr format) x
2284 any <- anyReg res
2285 return (any (getRegisterReg platform usesSSE (CmmLocal r)))
2286
2287 actuallyInlineFloatOp' _ _ _ args
2288 = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
2289 ++ show (length args) ++ ")"
2290
2291 sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
2292 sse2FabsCode w x = do
2293 let fmt = floatFormat w
2294 x_code <- getAnyReg x
2295 let
2296 const | FF32 <- fmt = CmmInt 0x7fffffff W32
2297 | otherwise = CmmInt 0x7fffffffffffffff W64
2298 Amode amode amode_code <- memConstant (widthInBytes w) const
2299 tmp <- getNewRegNat fmt
2300 let
2301 code dst = x_code dst `appOL` amode_code `appOL` toOL [
2302 MOV fmt (OpAddr amode) (OpReg tmp),
2303 AND fmt (OpReg tmp) (OpReg dst)
2304 ]
2305
2306 return $ code (getRegisterReg platform True (CmmLocal r))
2307
2308 (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
2309 (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
2310 (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
2311 (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
2312 case args of
2313 [arg_x, arg_y] ->
2314 do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
2315 let format = intFormat width
2316 lCode <- anyReg =<< trivialCode width (ADD_CC format)
2317 (Just (ADD_CC format)) arg_x arg_y
2318 let reg_l = getRegisterReg platform True (CmmLocal res_l)
2319 reg_h = getRegisterReg platform True (CmmLocal res_h)
2320 code = hCode reg_h `appOL`
2321 lCode reg_l `snocOL`
2322 ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
2323 return code
2324 _ -> panic "genCCall: Wrong number of arguments/results for add2"
2325 (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
2326 addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
2327 (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
2328 addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
2329 (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
2330 addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
2331 (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
2332 addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
2333 (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
2334 case args of
2335 [arg_x, arg_y] ->
2336 do (y_reg, y_code) <- getRegOrMem arg_y
2337 x_code <- getAnyReg arg_x
2338 let format = intFormat width
2339 reg_h = getRegisterReg platform True (CmmLocal res_h)
2340 reg_l = getRegisterReg platform True (CmmLocal res_l)
2341 code = y_code `appOL`
2342 x_code rax `appOL`
2343 toOL [MUL2 format y_reg,
2344 MOV format (OpReg rdx) (OpReg reg_h),
2345 MOV format (OpReg rax) (OpReg reg_l)]
2346 return code
2347 _ -> panic "genCCall: Wrong number of arguments/results for mul2"
2348
2349 _ -> if is32Bit
2350 then genCCall32' dflags target dest_regs args
2351 else genCCall64' dflags target dest_regs args
2352
2353 where divOp1 platform signed width results [arg_x, arg_y]
2354 = divOp platform signed width results Nothing arg_x arg_y
2355 divOp1 _ _ _ _ _
2356 = panic "genCCall: Wrong number of arguments for divOp1"
2357 divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
2358 = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
2359 divOp2 _ _ _ _ _
2360 = panic "genCCall: Wrong number of arguments for divOp2"
2361
2362 -- See Note [DIV/IDIV for bytes]
2363 divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
2364 let widen | signed = MO_SS_Conv W8 W16
2365 | otherwise = MO_UU_Conv W8 W16
2366 arg_x_low_16 = CmmMachOp widen [arg_x_low]
2367 arg_y_16 = CmmMachOp widen [arg_y]
2368 m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
2369 in divOp
2370 platform signed W16 [res_q, res_r]
2371 m_arg_x_high_16 arg_x_low_16 arg_y_16
2372
2373 divOp platform signed width [res_q, res_r]
2374 m_arg_x_high arg_x_low arg_y
2375 = do let format = intFormat width
2376 reg_q = getRegisterReg platform True (CmmLocal res_q)
2377 reg_r = getRegisterReg platform True (CmmLocal res_r)
2378 widen | signed = CLTD format
2379 | otherwise = XOR format (OpReg rdx) (OpReg rdx)
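-- (CLTD sign-extends %eax/%rax into %edx/%rdx as IDIV expects;
-- for unsigned DIV the high half is simply zeroed.)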
2380 instr | signed = IDIV
2381 | otherwise = DIV
2382 (y_reg, y_code) <- getRegOrMem arg_y
2383 x_low_code <- getAnyReg arg_x_low
2384 x_high_code <- case m_arg_x_high of
2385 Just arg_x_high ->
2386 getAnyReg arg_x_high
2387 Nothing ->
2388 return $ const $ unitOL widen
2389 return $ y_code `appOL`
2390 x_low_code rax `appOL`
2391 x_high_code rdx `appOL`
2392 toOL [instr format y_reg,
2393 MOV format (OpReg rax) (OpReg reg_q),
2394 MOV format (OpReg rdx) (OpReg reg_r)]
2395 divOp _ _ _ _ _ _ _
2396 = panic "genCCall: Wrong number of results for divOp"
2397
2398 addSubIntC platform instr mrevinstr cond width
2399 res_r res_c [arg_x, arg_y]
2400 = do let format = intFormat width
2401 rCode <- anyReg =<< trivialCode width (instr format)
2402 (mrevinstr format) arg_x arg_y
2403 reg_tmp <- getNewRegNat II8
2404 let reg_c = getRegisterReg platform True (CmmLocal res_c)
2405 reg_r = getRegisterReg platform True (CmmLocal res_r)
2406 code = rCode reg_r `snocOL`
2407 SETCC cond (OpReg reg_tmp) `snocOL`
2408 MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
2409
2410 return code
2411 addSubIntC _ _ _ _ _ _ _ _
2412 = panic "genCCall: Wrong number of arguments/results for addSubIntC"
2413
2414 -- Note [DIV/IDIV for bytes]
2415 --
2416 -- IDIV reminder:
2417 -- Size Dividend Divisor Quotient Remainder
2418 -- byte %ax r/m8 %al %ah
2419 -- word %dx:%ax r/m16 %ax %dx
2420 -- dword %edx:%eax r/m32 %eax %edx
2421 -- qword %rdx:%rax r/m64 %rax %rdx
2422 --
2423 -- We do a special case for the byte division because the current
2424 -- codegen doesn't deal well with accessing the %ah register (also,
2425 -- accessing %ah in 64-bit mode is complicated because it cannot be an
2426 -- operand of many instructions). So we just widen operands to 16 bits
2427 -- and get the results from %al, %dl. This is not optimal, but a few
2428 -- register moves are probably not a huge deal when doing division.
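-- For example (illustrative), an unsigned W8 quotRem is compiled as a
-- W16 division of zero-extended operands; the quotient then lands in
-- %ax and the remainder in %dx, whose low bytes %al and %dl carry the
-- W8 results.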
2429
2430 genCCall32' :: DynFlags
2431 -> ForeignTarget -- function to call
2432 -> [CmmFormal] -- where to put the result
2433 -> [CmmActual] -- arguments (of mixed type)
2434 -> NatM InstrBlock
2435 genCCall32' dflags target dest_regs args = do
2436 let
2437 prom_args = map (maybePromoteCArg dflags W32) args
2438
2439 -- Align stack to 16n for calls, assuming a starting stack
2440 -- alignment of 16n - word_size on procedure entry. Which we
2441 -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
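-- (Illustrative: pushing two 4-byte arguments gives
-- raw_arg_size = 8 + 4 = 12, which rounds up to 16, so
-- arg_pad_size = 4 bytes of padding are allocated first.)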
2442 sizes = map (arg_size_bytes . cmmExprType dflags) (reverse args)
2443 raw_arg_size = sum sizes + wORD_SIZE dflags
2444 arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
2445 tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags
2446 delta0 <- getDeltaNat
2447 setDeltaNat (delta0 - arg_pad_size)
2448
2449 use_sse2 <- sse2Enabled
2450 push_codes <- mapM (push_arg use_sse2) (reverse prom_args)
2451 delta <- getDeltaNat
2452 MASSERT(delta == delta0 - tot_arg_size)
2453
2454 -- deal with static vs dynamic call targets
2455 (callinsns,cconv) <-
2456 case target of
2457 ForeignTarget (CmmLit (CmmLabel lbl)) conv
2458 -> -- ToDo: stdcall arg sizes
2459 return (unitOL (CALL (Left fn_imm) []), conv)
2460 where fn_imm = ImmCLbl lbl
2461 ForeignTarget expr conv
2462 -> do { (dyn_r, dyn_c) <- getSomeReg expr
2463 ; ASSERT( isWord32 (cmmExprType dflags expr) )
2464 return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
2465 PrimTarget _
2466 -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
2467 ++ "probably because too many return values."
2468
2469 let push_code
2470 | arg_pad_size /= 0
2471 = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
2472 DELTA (delta0 - arg_pad_size)]
2473 `appOL` concatOL push_codes
2474 | otherwise
2475 = concatOL push_codes
2476
2477 -- Deallocate parameters after call for ccall;
2478 -- but not for stdcall (callee does it)
2479 --
2480 -- We have to pop any stack padding we added
2481 -- even if we are doing stdcall, though (#5052)
2482 pop_size
2483 | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
2484 | otherwise = tot_arg_size
2485
2486 call = callinsns `appOL`
2487 toOL (
2488 (if pop_size==0 then [] else
2489 [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
2490 ++
2491 [DELTA delta0]
2492 )
2493 setDeltaNat delta0
2494
2495 dflags <- getDynFlags
2496 let platform = targetPlatform dflags
2497
2498 let
2499 -- assign the results, if necessary
2500 assign_code [] = nilOL
2501 assign_code [dest]
2502 | isFloatType ty =
2503 if use_sse2
2504 then let tmp_amode = AddrBaseIndex (EABaseReg esp)
2505 EAIndexNone
2506 (ImmInt 0)
2507 fmt = floatFormat w
2508 in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
2509 DELTA (delta0 - b),
2510 GST fmt fake0 tmp_amode,
2511 MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
2512 ADD II32 (OpImm (ImmInt b)) (OpReg esp),
2513 DELTA delta0]
2514 else unitOL (GMOV fake0 r_dest)
2515 | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
2516 MOV II32 (OpReg edx) (OpReg r_dest_hi)]
2517 | otherwise = unitOL (MOV (intFormat w)
2518 (OpReg eax)
2519 (OpReg r_dest))
2520 where
2521 ty = localRegType dest
2522 w = typeWidth ty
2523 b = widthInBytes w
2524 r_dest_hi = getHiVRegFromLo r_dest
2525 r_dest = getRegisterReg platform use_sse2 (CmmLocal dest)
2526 assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
2527
2528 return (push_code `appOL`
2529 call `appOL`
2530 assign_code dest_regs)
2531
2532 where
2533 -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
2534 arg_size_bytes :: CmmType -> Int
2535 arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags))
2536
2537 roundTo a x | x `mod` a == 0 = x
2538 | otherwise = x + a - (x `mod` a)
2539
2540 push_arg :: Bool -> CmmActual {-current argument-}
2541 -> NatM InstrBlock -- code
2542
2543 push_arg use_sse2 arg -- we don't need the hints on x86
2544 | isWord64 arg_ty = do
2545 ChildCode64 code r_lo <- iselExpr64 arg
2546 delta <- getDeltaNat
2547 setDeltaNat (delta - 8)
2548 let r_hi = getHiVRegFromLo r_lo
2549 return ( code `appOL`
2550 toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
2551 PUSH II32 (OpReg r_lo), DELTA (delta - 8),
2552 DELTA (delta-8)]
2553 )
2554
2555 | isFloatType arg_ty = do
2556 (reg, code) <- getSomeReg arg
2557 delta <- getDeltaNat
2558 setDeltaNat (delta-size)
2559 return (code `appOL`
2560 toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
2561 DELTA (delta-size),
2562 let addr = AddrBaseIndex (EABaseReg esp)
2563 EAIndexNone
2564 (ImmInt 0)
2565 format = floatFormat (typeWidth arg_ty)
2566 in
2567 if use_sse2
2568 then MOV format (OpReg reg) (OpAddr addr)
2569 else GST format reg addr
2570 ]
2571 )
2572
2573 | otherwise = do
2574 -- Arguments can be smaller than 32-bit, but we still use @PUSH
2575 -- II32@ - the usual calling conventions expect integers to be
2576 -- 4-byte aligned.
2577 ASSERT((typeWidth arg_ty) <= W32) return ()
2578 (operand, code) <- getOperand arg
2579 delta <- getDeltaNat
2580 setDeltaNat (delta-size)
2581 return (code `snocOL`
2582 PUSH II32 operand `snocOL`
2583 DELTA (delta-size))
2584
2585 where
2586 arg_ty = cmmExprType dflags arg
2587 size = arg_size_bytes arg_ty -- Byte size
2588
2589 genCCall64' :: DynFlags
2590 -> ForeignTarget -- function to call
2591 -> [CmmFormal] -- where to put the result
2592 -> [CmmActual] -- arguments (of mixed type)
2593 -> NatM InstrBlock
2594 genCCall64' dflags target dest_regs args = do
2595 -- load up the register arguments
2596 let prom_args = map (maybePromoteCArg dflags W32) args
2597
2598 (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
2599 <-
2600 if platformOS platform == OSMinGW32
2601 then load_args_win prom_args [] [] (allArgRegs platform) nilOL
2602 else do
2603 (stack_args, aregs, fregs, load_args_code, assign_args_code)
2604 <- load_args prom_args (allIntArgRegs platform)
2605 (allFPArgRegs platform)
2606 nilOL nilOL
2607 let used_regs rs as = reverse (drop (length rs) (reverse as))
2608 fregs_used = used_regs fregs (allFPArgRegs platform)
2609 aregs_used = used_regs aregs (allIntArgRegs platform)
2610 return (stack_args, aregs_used, fregs_used, load_args_code
2611 , assign_args_code)
2612
2613 let
2614 arg_regs_used = int_regs_used ++ fp_regs_used
2615 arg_regs = [eax] ++ arg_regs_used
2616 -- for annotating the call instruction
2617 sse_regs = length fp_regs_used
2618 arg_stack_slots = if platformOS platform == OSMinGW32
2619 then length stack_args + length (allArgRegs platform)
2620 else length stack_args
2621 tot_arg_size = arg_size * arg_stack_slots
2622
2623
2624 -- Align stack to 16n for calls, assuming a starting stack
2625 -- alignment of 16n - word_size on procedure entry. Which we
2626 -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
2627 (real_size, adjust_rsp) <-
2628 if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0
2629 then return (tot_arg_size, nilOL)
2630 else do -- we need to adjust...
2631 delta <- getDeltaNat
2632 setDeltaNat (delta - wORD_SIZE dflags)
2633 return (tot_arg_size + wORD_SIZE dflags, toOL [
2634 SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp),
2635 DELTA (delta - wORD_SIZE dflags) ])
2636
2637 -- push the stack args, right to left
2638 push_code <- push_args (reverse stack_args) nilOL
2639 -- On Win64, we also have to leave stack space for the arguments
2640 -- that we are passing in registers
2641 lss_code <- if platformOS platform == OSMinGW32
2642 then leaveStackSpace (length (allArgRegs platform))
2643 else return nilOL
2644 delta <- getDeltaNat
2645
2646 -- deal with static vs dynamic call targets
2647 (callinsns,_cconv) <-
2648 case target of
2649 ForeignTarget (CmmLit (CmmLabel lbl)) conv
2650 -> -- ToDo: stdcall arg sizes
2651 return (unitOL (CALL (Left fn_imm) arg_regs), conv)
2652 where fn_imm = ImmCLbl lbl
2653 ForeignTarget expr conv
2654 -> do (dyn_r, dyn_c) <- getSomeReg expr
2655 return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
2656 PrimTarget _
2657 -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
2658 ++ "probably because too many return values."
2659
2660 let
2661 -- The x86_64 ABI requires us to set %al to the number of SSE2
2662 -- registers that contain arguments, if the called routine
2663 -- is a varargs function. We don't know whether it's a
2664 -- varargs function or not, so we have to assume it is.
2665 --
2666 -- It's not safe to omit this assignment, even if the number
2667 -- of SSE2 regs in use is zero. If %al is larger than 8
2668 -- on entry to a varargs function, seg faults ensue.
2669 assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
2670
2671 let call = callinsns `appOL`
2672 toOL (
2673 -- Deallocate parameters after call for ccall;
2674 -- stdcall has callee do it, but is not supported on
2675 -- x86_64 target (see #3336)
2676 (if real_size==0 then [] else
2677 [ADD (intFormat (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)])
2678 ++
2679 [DELTA (delta + real_size)]
2680 )
2681 setDeltaNat (delta + real_size)
2682
2683 let
2684 -- assign the results, if necessary
2685 assign_code [] = nilOL
2686 assign_code [dest] =
2687 case typeWidth rep of
2688 W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
2689 (OpReg xmm0)
2690 (OpReg r_dest))
2691 W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
2692 (OpReg xmm0)
2693 (OpReg r_dest))
2694 _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
2695 where
2696 rep = localRegType dest
2697 r_dest = getRegisterReg platform True (CmmLocal dest)
2698 assign_code _many = panic "genCCall.assign_code many"
2699
2700 return (adjust_rsp `appOL`
2701 push_code `appOL`
2702 load_args_code `appOL`
2703 assign_args_code `appOL`
2704 lss_code `appOL`
2705 assign_eax sse_regs `appOL`
2706 call `appOL`
2707 assign_code dest_regs)
2708
2709 where platform = targetPlatform dflags
2710 arg_size = 8 -- always, at the mo
2711
2712
2713 load_args :: [CmmExpr]
2714 -> [Reg] -- int regs avail for args
2715 -> [Reg] -- FP regs avail for args
2716 -> InstrBlock -- code computing args
2717 -> InstrBlock -- code assigning args to ABI regs
2718 -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
2719 -- no more regs to use
2720 load_args args [] [] code acode =
2721 return (args, [], [], code, acode)
2722
2723 -- no more args to push
2724 load_args [] aregs fregs code acode =
2725 return ([], aregs, fregs, code, acode)
2726
2727 load_args (arg : rest) aregs fregs code acode
2728 | isFloatType arg_rep = case fregs of
2729 [] -> push_this_arg
2730 (r:rs) -> do
2731 (code',acode') <- reg_this_arg r
2732 load_args rest aregs rs code' acode'
2733 | otherwise = case aregs of
2734 [] -> push_this_arg
2735 (r:rs) -> do
2736 (code',acode') <- reg_this_arg r
2737 load_args rest rs fregs code' acode'
2738 where
2739
2740 -- put arg into the list of stack pushed args
2741 push_this_arg = do
2742 (args',ars,frs,code',acode')
2743 <- load_args rest aregs fregs code acode
2744 return (arg:args', ars, frs, code', acode')
2745
2746 -- pass the arg into the given register
2747 reg_this_arg r
2748 -- "operand" args can be directly assigned into r
2749 | isOperand False arg = do
2750 arg_code <- getAnyReg arg
2751 return (code, (acode `appOL` arg_code r))
2752 -- The last non-operand arg can be directly assigned after its
2753 -- computation without going into a temporary register
2754 | all (isOperand False) rest = do
2755 arg_code <- getAnyReg arg
2756 return (code `appOL` arg_code r,acode)
2757
2758 -- other args need to be computed beforehand to avoid clobbering
2759 -- previously assigned registers used to pass parameters (see
2760 -- #11792, #12614). They are assigned into temporary registers
2761 -- and get assigned to proper call ABI registers after they all
2762 -- have been computed.
2763 | otherwise = do
2764 arg_code <- getAnyReg arg
2765 tmp <- getNewRegNat arg_fmt
2766 let
2767 code' = code `appOL` arg_code tmp
2768 acode' = acode `snocOL` reg2reg arg_fmt tmp r
2769 return (code',acode')
2770
2771 arg_rep = cmmExprType dflags arg
2772 arg_fmt = cmmTypeFormat arg_rep
2773
2774 load_args_win :: [CmmExpr]
2775 -> [Reg] -- used int regs
2776 -> [Reg] -- used FP regs
2777 -> [(Reg, Reg)] -- (int, FP) regs avail for args
2778 -> InstrBlock
2779 -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
2780 load_args_win args usedInt usedFP [] code
2781 = return (args, usedInt, usedFP, code, nilOL)
2782 -- no more regs to use
2783 load_args_win [] usedInt usedFP _ code
2784 = return ([], usedInt, usedFP, code, nilOL)
2785 -- no more args to push
2786 load_args_win (arg : rest) usedInt usedFP
2787 ((ireg, freg) : regs) code
2788 | isFloatType arg_rep = do
2789 arg_code <- getAnyReg arg
2790 load_args_win rest (ireg : usedInt) (freg : usedFP) regs
2791 (code `appOL`
2792 arg_code freg `snocOL`
2793 -- If we are calling a varargs function
2794 -- then we need to define ireg as well
2795 -- as freg
2796 MOV II64 (OpReg freg) (OpReg ireg))
2797 | otherwise = do
2798 arg_code <- getAnyReg arg
2799 load_args_win rest (ireg : usedInt) usedFP regs
2800 (code `appOL` arg_code ireg)
2801 where
2802 arg_rep = cmmExprType dflags arg
2803
2804 push_args [] code = return code
2805 push_args (arg:rest) code
2806 | isFloatType arg_rep = do
2807 (arg_reg, arg_code) <- getSomeReg arg
2808 delta <- getDeltaNat
2809 setDeltaNat (delta-arg_size)
2810 let code' = code `appOL` arg_code `appOL` toOL [
2811 SUB (intFormat (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp),
2812 DELTA (delta-arg_size),
2813 MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel dflags 0))]
2814 push_args rest code'
2815
2816 | otherwise = do
2817 -- Arguments can be smaller than 64-bit, but we still use @PUSH
2818 -- II64@ - the usual calling conventions expect integers to be
2819 -- 8-byte aligned.
2820 ASSERT(width <= W64) return ()
2821 (arg_op, arg_code) <- getOperand arg
2822 delta <- getDeltaNat
2823 setDeltaNat (delta-arg_size)
2824 let code' = code `appOL` arg_code `appOL` toOL [
2825 PUSH II64 arg_op,
2826 DELTA (delta-arg_size)]
2827 push_args rest code'
2828 where
2829 arg_rep = cmmExprType dflags arg
2830 width = typeWidth arg_rep
2831
2832 leaveStackSpace n = do
2833 delta <- getDeltaNat
2834 setDeltaNat (delta - n * arg_size)
2835 return $ toOL [
2836 SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp),
2837 DELTA (delta - n * arg_size)]
2838
2839 maybePromoteCArg :: DynFlags -> Width -> CmmExpr -> CmmExpr
2840 maybePromoteCArg dflags wto arg
2841 | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
2842 | otherwise = arg
2843 where
2844 wfrom = cmmExprWidth dflags arg
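-- e.g. a W8 or W16 argument is widened here with an unsigned
-- MO_UU_Conv to the W32 requested by the call sites above.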
2845
2846 outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
2847 -> NatM InstrBlock
2848 outOfLineCmmOp bid mop res args
2849 = do
2850 dflags <- getDynFlags
2851 targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
2852 let target = ForeignTarget targetExpr
2853 (ForeignConvention CCallConv [] [] CmmMayReturn)
2854
2855 stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
2856 where
2857 -- Assume we can call these functions directly, and that they're not in a dynamic library.
2858 -- TODO: Why is this ok? Under linux this code will be in libm.so
2859 -- Is it because they're really implemented as a primitive instruction by the assembler?? -- BL 2009/12/31
2860 lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
2861
2862 fn = case mop of
2863 MO_F32_Sqrt -> fsLit "sqrtf"
2864 MO_F32_Fabs -> fsLit "fabsf"
2865 MO_F32_Sin -> fsLit "sinf"
2866 MO_F32_Cos -> fsLit "cosf"
2867 MO_F32_Tan -> fsLit "tanf"
2868 MO_F32_Exp -> fsLit "expf"
2869 MO_F32_Log -> fsLit "logf"
2870
2871 MO_F32_Asin -> fsLit "asinf"
2872 MO_F32_Acos -> fsLit "acosf"
2873 MO_F32_Atan -> fsLit "atanf"
2874
2875 MO_F32_Sinh -> fsLit "sinhf"
2876 MO_F32_Cosh -> fsLit "coshf"
2877 MO_F32_Tanh -> fsLit "tanhf"
2878 MO_F32_Pwr -> fsLit "powf"
2879
2880 MO_F32_Asinh -> fsLit "asinhf"
2881 MO_F32_Acosh -> fsLit "acoshf"
2882 MO_F32_Atanh -> fsLit "atanhf"
2883
2884 MO_F64_Sqrt -> fsLit "sqrt"
2885 MO_F64_Fabs -> fsLit "fabs"
2886 MO_F64_Sin -> fsLit "sin"
2887 MO_F64_Cos -> fsLit "cos"
2888 MO_F64_Tan -> fsLit "tan"
2889 MO_F64_Exp -> fsLit "exp"
2890 MO_F64_Log -> fsLit "log"
2891
2892 MO_F64_Asin -> fsLit "asin"
2893 MO_F64_Acos -> fsLit "acos"
2894 MO_F64_Atan -> fsLit "atan"
2895
2896 MO_F64_Sinh -> fsLit "sinh"
2897 MO_F64_Cosh -> fsLit "cosh"
2898 MO_F64_Tanh -> fsLit "tanh"
2899 MO_F64_Pwr -> fsLit "pow"
2900
2901 MO_F64_Asinh -> fsLit "asinh"
2902 MO_F64_Acosh -> fsLit "acosh"
2903 MO_F64_Atanh -> fsLit "atanh"
2904
2905 MO_Memcpy _ -> fsLit "memcpy"
2906 MO_Memset _ -> fsLit "memset"
2907 MO_Memmove _ -> fsLit "memmove"
2908 MO_Memcmp _ -> fsLit "memcmp"
2909
2910 MO_PopCnt _ -> fsLit "popcnt"
2911 MO_BSwap _ -> fsLit "bswap"
2912 MO_Clz w -> fsLit $ clzLabel w
2913 MO_Ctz _ -> unsupported
2914
2915 MO_Pdep w -> fsLit $ pdepLabel w
2916 MO_Pext w -> fsLit $ pextLabel w
2917
2918 MO_AtomicRMW _ _ -> fsLit "atomicrmw"
2919 MO_AtomicRead _ -> fsLit "atomicread"
2920 MO_AtomicWrite _ -> fsLit "atomicwrite"
2921 MO_Cmpxchg _ -> fsLit "cmpxchg"
2922
2923 MO_UF_Conv _ -> unsupported
2924
2925 MO_S_QuotRem {} -> unsupported
2926 MO_U_QuotRem {} -> unsupported
2927 MO_U_QuotRem2 {} -> unsupported
2928 MO_Add2 {} -> unsupported
2929 MO_AddIntC {} -> unsupported
2930 MO_SubIntC {} -> unsupported
2931 MO_AddWordC {} -> unsupported
2932 MO_SubWordC {} -> unsupported
2933 MO_U_Mul2 {} -> unsupported
2934 MO_WriteBarrier -> unsupported
2935 MO_Touch -> unsupported
2936 (MO_Prefetch_Data _ ) -> unsupported
2937 unsupported = panic ("outOfLineCmmOp: " ++ show mop
2938 ++ " not supported here")
2939
2940 -- -----------------------------------------------------------------------------
2941 -- Generating a table-branch
2942
2943 genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
2944
2945 genSwitch dflags expr targets
2946 | positionIndependent dflags
2947 = do
2948 (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
2949 -- getNonClobberedReg because it needs to survive across t_code
2950 lbl <- getNewLabelNat
2951 dflags <- getDynFlags
2952 let is32bit = target32Bit (targetPlatform dflags)
2953 os = platformOS (targetPlatform dflags)
2954 -- Might want to use .rodata.<function we're in> instead, but as
2955 -- long as it's something unique it'll work out since the
2956 -- references to the jump table are in the appropriate section.
2957 rosection = case os of
2958 -- on Mac OS X/x86_64, put the jump table in the text section to
2959 -- work around a limitation of the linker.
2960 -- ld64 is unable to handle the relocations for
2961 -- .quad L1 - L0
2962 -- if L0 is not preceded by a non-anonymous label in its section.
2963 OSDarwin | not is32bit -> Section Text lbl
2964 _ -> Section ReadOnlyData lbl
2965 dynRef <- cmmMakeDynamicReference dflags DataReference lbl
2966 (tableReg,t_code) <- getSomeReg $ dynRef
2967 let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
2968 (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0))
2969
2970 offsetReg <- getNewRegNat (intFormat (wordWidth dflags))
2971 return $ if is32bit || os == OSDarwin
2972 then e_code `appOL` t_code `appOL` toOL [
2973 ADD (intFormat (wordWidth dflags)) op (OpReg tableReg),
2974 JMP_TBL (OpReg tableReg) ids rosection lbl
2975 ]
2976 else -- HACK: On x86_64 binutils<2.17 is only able to generate
2977 -- PC32 relocations, hence we only get 32-bit offsets in
2978 -- the jump table. As these offsets are always negative
2979 -- we need to properly sign extend them to 64-bit. This
2980 -- hack should be removed in conjunction with the hack in
2981 -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
2982 e_code `appOL` t_code `appOL` toOL [
2983 MOVSxL II32 op (OpReg offsetReg),
2984 ADD (intFormat (wordWidth dflags))
2985 (OpReg offsetReg)
2986 (OpReg tableReg),
2987 JMP_TBL (OpReg tableReg) ids rosection lbl
2988 ]
2989 | otherwise
2990 = do
2991 (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
2992 lbl <- getNewLabelNat
2993 let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl))
2994 code = e_code `appOL` toOL [
2995 JMP_TBL op ids (Section ReadOnlyData lbl) lbl
2996 ]
2997 return code
2998 where
2999 (offset, blockIds) = switchTargetsToTable targets
3000 ids = map (fmap DestBlockId) blockIds
3001
3002 generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, CmmStatics) Instr)
3003 generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
3004 = let getBlockId (DestBlockId id) = id
3005 getBlockId _ = panic "Non-Label target in Jump Table"
3006 blockIds = map (fmap getBlockId) ids
3007 in Just (createJumpTable dflags blockIds section lbl)
3008 generateJumpTableForInstr _ _ = Nothing
3009
3010 createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
3011 -> GenCmmDecl (Alignment, CmmStatics) h g
3012 createJumpTable dflags ids section lbl
3013 = let jumpTable
3014 | positionIndependent dflags =
3015 let ww = wordWidth dflags
3016 jumpTableEntryRel Nothing
3017 = CmmStaticLit (CmmInt 0 ww)
3018 jumpTableEntryRel (Just blockid)
3019 = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
3020 where blockLabel = blockLbl blockid
3021 in map jumpTableEntryRel ids
3022 | otherwise = map (jumpTableEntry dflags) ids
3023 in CmmData section (1, Statics lbl jumpTable)
3024
3025 extractUnwindPoints :: [Instr] -> [UnwindPoint]
3026 extractUnwindPoints instrs =
3027 [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]
3028
3029 -- -----------------------------------------------------------------------------
3030 -- 'condIntReg' and 'condFltReg': condition codes into registers
3031
3032 -- Turn those condition codes into integers now (when they appear on
3033 -- the right hand side of an assignment).
3034 --
3035 -- (If applicable) Do not fill the delay slots here; you will confuse the
3036 -- register allocator.
3037
3038 condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
3039
3040 condIntReg cond x y = do
3041 CondCode _ cond cond_code <- condIntCode cond x y
3042 tmp <- getNewRegNat II8
3043 let
3044 code dst = cond_code `appOL` toOL [
3045 SETCC cond (OpReg tmp),
3046 MOVZxL II8 (OpReg tmp) (OpReg dst)
3047 ]
3048 return (Any II32 code)
3049
3050
3051 -----------------------------------------------------------
3052 --- Note [SSE Parity Checks] ---
3053 -----------------------------------------------------------
3054
3055 -- We have to worry about unordered operands (eg. comparisons
3056 -- against NaN). If the operands are unordered, the comparison
3057 -- sets the parity flag, carry flag and zero flag.
3058 -- All comparisons are supposed to return false for unordered
3059 -- operands except for !=, which returns true.
3060 --
3061 -- Optimisation: we don't have to test the parity flag if we
3062 -- know the test has already excluded the unordered case: eg >
3063 -- and >= test for a zero carry flag, which can only occur for
3064 -- ordered operands.
3065 --
3066 -- By reversing comparisons we can avoid testing the parity
3067 -- for < and <= as well. If either of the arguments is a NaN we
3068 -- return false either way. If both arguments are valid then
3069 -- x <= y <-> y >= x holds. So it's safe to swap these.
3070 --
3071 -- We invert the condition inside getRegister' and getCondCode,
3072 -- which should cover all invertible cases.
3073 -- All other functions translating FP comparisons to assembly
3074 -- use these two to generate the comparison code.
3075 --
3076 -- As an example consider a simple check:
3077 --
3078 -- func :: Float -> Float -> Int
3079 -- func x y = if x < y then 1 else 0
3080 --
3081 -- Which in Cmm gives the floating point comparison:
3082 --
3083 -- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
3084 --
3085 -- We used to compile this to an assembly code block like this:
3086 -- _c2gh:
3087 -- ucomiss %xmm2,%xmm1
3088 -- jp _c2gf
3089 -- jb _c2gg
3090 -- jmp _c2gf
3091 --
3092 -- where we had to introduce an explicit
3093 -- check for unordered results (using the jp parity jump).
3094 --
3095 -- We can avoid this by exchanging the arguments and inverting the direction
3096 -- of the comparison. This results in the sequence of:
3097 --
3098 -- ucomiss %xmm1,%xmm2
3099 -- ja _c2g2
3100 -- jmp _c2g1
3101 --
3102 -- Removing the jump reduces the pressure on the branch prediction system
3103 -- and plays better with the uOP cache.
3104
3105 condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
3106 condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
3107 where
3108 condFltReg_x87 = do
3109 CondCode _ cond cond_code <- condFltCode cond x y
3110 tmp <- getNewRegNat II8
3111 let
3112 code dst = cond_code `appOL` toOL [
3113 SETCC cond (OpReg tmp),
3114 MOVZxL II8 (OpReg tmp) (OpReg dst)
3115 ]
3116 return (Any II32 code)
3117
3118 condFltReg_sse2 = do
3119 CondCode _ cond cond_code <- condFltCode cond x y
3120 tmp1 <- getNewRegNat (archWordFormat is32Bit)
3121 tmp2 <- getNewRegNat (archWordFormat is32Bit)
3122 let -- See Note [SSE Parity Checks]
3123 code dst =
3124 cond_code `appOL`
3125 (case cond of
3126 NE -> or_unordered dst
3127 GU -> plain_test dst
3128 GEU -> plain_test dst
3129 -- Use ASSERT so we don't break releases if these creep in.
3130 LTT -> ASSERT2(False, ppr "Should have been turned into >")
3131 and_ordered dst
3132 LE -> ASSERT2(False, ppr "Should have been turned into >=")
3133 and_ordered dst
3134 _ -> and_ordered dst)
3135
3136 plain_test dst = toOL [
3137 SETCC cond (OpReg tmp1),
3138 MOVZxL II8 (OpReg tmp1) (OpReg dst)
3139 ]
3140 or_unordered dst = toOL [
3141 SETCC cond (OpReg tmp1),
3142 SETCC PARITY (OpReg tmp2),
3143 OR II8 (OpReg tmp1) (OpReg tmp2),
3144 MOVZxL II8 (OpReg tmp2) (OpReg dst)
3145 ]
3146 and_ordered dst = toOL [
3147 SETCC cond (OpReg tmp1),
3148 SETCC NOTPARITY (OpReg tmp2),
3149 AND II8 (OpReg tmp1) (OpReg tmp2),
3150 MOVZxL II8 (OpReg tmp2) (OpReg dst)
3151 ]
3152 return (Any II32 code)
3153
3154
3155 -- -----------------------------------------------------------------------------
3156 -- 'trivial*Code': deal with trivial instructions
3157
3158 -- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
3159 -- unary: 'trivialUCode', unary fl-pt: 'trivialUFCode') instructions.
3160 -- Only look for constants on the right hand side, because that's
3161 -- where the generic optimizer will have put them.
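-- (When a small literal ends up on the left and a reversed form of the
-- instruction is available -- that is what the Maybe argument of
-- trivialCode below is for -- we can still generate code directly:
-- compute the other operand into dst and apply the reversed instruction
-- with the literal as an immediate; see the first equation of
-- trivialCode'.)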
3162
3163 -- Similarly, for unary instructions, we don't have to worry about
3164 -- matching an StInt as the argument, because genericOpt will already
3165 -- have handled the constant-folding.
3166
3167
3168 {-
3169 The Rules of the Game are:
3170
3171 * You cannot assume anything about the destination register dst;
3172 it may be anything, including a fixed reg.
3173
3174 * You may compute an operand into a fixed reg, but you may not
3175 subsequently change the contents of that fixed reg. If you
3176 want to do so, first copy the value either to a temporary
3177 or into dst. You are free to modify dst even if it happens
3178 to be a fixed reg -- that's not your problem.
3179
3180 * You cannot assume that a fixed reg will stay live over an
3181 arbitrary computation. The same applies to the dst reg.
3182
3183 * Temporary regs obtained from getNewRegNat are distinct from
3184 each other and from all other regs, and stay live over
3185 arbitrary computations.
3186
3187 --------------------
3188
3189 SDM's version of The Rules:
3190
3191 * If getRegister returns Any, that means it can generate correct
3192 code which places the result in any register, period. Even if that
3193 register happens to be read during the computation.
3194
3195 Corollary #1: this means that if you are generating code for an
3196 operation with two arbitrary operands, you cannot assign the result
3197 of the first operand into the destination register before computing
3198 the second operand. The second operand might require the old value
3199 of the destination register.
3200
3201 Corollary #2: A function might be able to generate more efficient
3202 code if it knows the destination register is a new temporary (and
3203 therefore not read by any of the sub-computations).
3204
3205 * If getRegister returns Any, then the code it generates may modify only:
3206 (a) fresh temporaries
3207 (b) the destination register
3208 (c) known registers (eg. %ecx is used by shifts)
3209 In particular, it may *not* modify global registers, unless the global
3210 register happens to be the destination register.
3211 -}
3212
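-- For example (an illustration only), for a two-operand instruction the
-- naive
--
--      code dst = a_code dst `appOL` b_code `snocOL` instr b_op (OpReg dst)
--
-- would be wrong whenever computing b still needs the old value of dst
-- (e.g. dst is a global register that b reads).  genTrivialCode below
-- therefore evaluates b into a non-clobbered operand first, and only
-- then computes a straight into dst.
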
3213 trivialCode :: Width -> (Operand -> Operand -> Instr)
3214 -> Maybe (Operand -> Operand -> Instr)
3215 -> CmmExpr -> CmmExpr -> NatM Register
3216 trivialCode width instr m a b
3217 = do is32Bit <- is32BitPlatform
3218 trivialCode' is32Bit width instr m a b
3219
3220 trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
3221 -> Maybe (Operand -> Operand -> Instr)
3222 -> CmmExpr -> CmmExpr -> NatM Register
3223 trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
3224 | is32BitLit is32Bit lit_a = do
3225 b_code <- getAnyReg b
3226 let
3227 code dst
3228 = b_code dst `snocOL`
3229 revinstr (OpImm (litToImm lit_a)) (OpReg dst)
3230 return (Any (intFormat width) code)
3231
3232 trivialCode' _ width instr _ a b
3233 = genTrivialCode (intFormat width) instr a b
3234
3235 -- This is re-used for floating pt instructions too.
3236 genTrivialCode :: Format -> (Operand -> Operand -> Instr)
3237 -> CmmExpr -> CmmExpr -> NatM Register
3238 genTrivialCode rep instr a b = do
3239 (b_op, b_code) <- getNonClobberedOperand b
3240 a_code <- getAnyReg a
3241 tmp <- getNewRegNat rep
3242 let
3243 -- We want the value of b to stay alive across the computation of a.
3244 -- But, we want to calculate a straight into the destination register,
3245 -- because the instruction only has two operands (dst := dst `op` src).
3246 -- The troublesome case is when the result of b is in the same register
3247 -- as the destination reg. In this case, we have to save b in a
3248 -- new temporary across the computation of a.
3249 code dst
3250 | dst `regClashesWithOp` b_op =
3251 b_code `appOL`
3252 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
3253 a_code dst `snocOL`
3254 instr (OpReg tmp) (OpReg dst)
3255 | otherwise =
3256 b_code `appOL`
3257 a_code dst `snocOL`
3258 instr b_op (OpReg dst)
3259 return (Any rep code)
3260
3261 regClashesWithOp :: Reg -> Operand -> Bool
3262 reg `regClashesWithOp` OpReg reg2 = reg == reg2
3263 reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
3264 _ `regClashesWithOp` _ = False
3265
3266 -----------
3267
3268 trivialUCode :: Format -> (Operand -> Instr)
3269 -> CmmExpr -> NatM Register
3270 trivialUCode rep instr x = do
3271 x_code <- getAnyReg x
3272 let
3273 code dst =
3274 x_code dst `snocOL`
3275 instr (OpReg dst)
3276 return (Any rep code)
3277
3278 -----------
3279
3280 trivialFCode_x87 :: (Format -> Reg -> Reg -> Reg -> Instr)
3281 -> CmmExpr -> CmmExpr -> NatM Register
3282 trivialFCode_x87 instr x y = do
3283 (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
3284 (y_reg, y_code) <- getSomeReg y
3285 let
3286 format = FF80 -- always, on x87
3287 code dst =
3288 x_code `appOL`
3289 y_code `snocOL`
3290 instr format x_reg y_reg dst
3291 return (Any format code)
3292
3293 trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
3294 -> CmmExpr -> CmmExpr -> NatM Register
3295 trivialFCode_sse2 pk instr x y
3296 = genTrivialCode format (instr format) x y
3297 where format = floatFormat pk
3298
3299
3300 trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
3301 trivialUFCode format instr x = do
3302 (x_reg, x_code) <- getSomeReg x
3303 let
3304 code dst =
3305 x_code `snocOL`
3306 instr x_reg dst
3307 return (Any format code)
3308
3309
3310 --------------------------------------------------------------------------------
3311 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
3312 coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
3313 where
3314 coerce_x87 = do
3315 (x_reg, x_code) <- getSomeReg x
3316 let
3317 opc = case to of W32 -> GITOF; W64 -> GITOD;
3318 n -> panic $ "coerceInt2FP.x87: unhandled width ("
3319 ++ show n ++ ")"
3320 code dst = x_code `snocOL` opc x_reg dst
3321 -- ToDo: works for non-II32 reps?
3322 return (Any FF80 code)
3323
3324 coerce_sse2 = do
3325 (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
3326 let
3327 opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
3328 n -> panic $ "coerceInt2FP.sse: unhandled width ("
3329 ++ show n ++ ")"
3330 code dst = x_code `snocOL` opc (intFormat from) x_op dst
3331 return (Any (floatFormat to) code)
3332 -- works even if the destination rep is <II32
3333
3334 --------------------------------------------------------------------------------
3335 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
3336 coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
3337 where
3338 coerceFP2Int_x87 = do
3339 (x_reg, x_code) <- getSomeReg x
3340 let
3341 opc = case from of W32 -> GFTOI; W64 -> GDTOI
3342 n -> panic $ "coerceFP2Int.x87: unhandled width ("
3343 ++ show n ++ ")"
3344 code dst = x_code `snocOL` opc x_reg dst
3345 -- ToDo: works for non-II32 reps?
3346 return (Any (intFormat to) code)
3347
3348 coerceFP2Int_sse2 = do
3349 (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
3350 let
3351 opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
3352 n -> panic $ "coerceFP2Int.sse: unhandled width ("
3353 ++ show n ++ ")"
3354 code dst = x_code `snocOL` opc (intFormat to) x_op dst
3355 return (Any (intFormat to) code)
3356 -- works even if the destination rep is <II32
3357
3358
3359 --------------------------------------------------------------------------------
3360 coerceFP2FP :: Width -> CmmExpr -> NatM Register
3361 coerceFP2FP to x = do
3362 use_sse2 <- sse2Enabled
3363 (x_reg, x_code) <- getSomeReg x
3364 let
3365 opc | use_sse2 = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
3366 n -> panic $ "coerceFP2FP: unhandled width ("
3367 ++ show n ++ ")"
3368 | otherwise = GDTOF
3369 code dst = x_code `snocOL` opc x_reg dst
3370 return (Any (if use_sse2 then floatFormat to else FF80) code)
3371
3372 --------------------------------------------------------------------------------
3373
3374 sse2NegCode :: Width -> CmmExpr -> NatM Register
3375 sse2NegCode w x = do
3376 let fmt = floatFormat w
3377 x_code <- getAnyReg x
3378 -- This is how gcc does it, so it can't be that bad:
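-- (A sketch of what the code below does: an IEEE float is negated by
-- flipping just its sign bit, so we XOR the value with a mask that has
-- only the sign bit set -- 0x80000000 for W32, 0x8000000000000000 for
-- W64 -- loaded from a memory constant.)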
3379 let
3380 const = case fmt of
3381 FF32 -> CmmInt 0x80000000 W32
3382 FF64 -> CmmInt 0x8000000000000000 W64
3383 x@II8 -> wrongFmt x
3384 x@II16 -> wrongFmt x
3385 x@II32 -> wrongFmt x
3386 x@II64 -> wrongFmt x
3387 x@FF80 -> wrongFmt x
3388 where
3389 wrongFmt x = panic $ "sse2NegCode: " ++ show x
3390 Amode amode amode_code <- memConstant (widthInBytes w) const
3391 tmp <- getNewRegNat fmt
3392 let
3393 code dst = x_code dst `appOL` amode_code `appOL` toOL [
3394 MOV fmt (OpAddr amode) (OpReg tmp),
3395 XOR fmt (OpReg tmp) (OpReg dst)
3396 ]
3397 --
3398 return (Any fmt code)
3399
3400 isVecExpr :: CmmExpr -> Bool
3401 isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
3402 isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
3403 isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
3404 isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
3405 isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
3406 isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
3407 isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
3408 isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
3409 isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
3410 isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
3411 isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
3412 isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
3413 isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
3414 isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
3415 isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
3416 isVecExpr (CmmMachOp _ [e]) = isVecExpr e
3417 isVecExpr _ = False
3418
3419 needLlvm :: NatM a
3420 needLlvm =
3421 sorry $ unlines ["The native code generator does not support vector"
3422 ,"instructions. Please use -fllvm."]
3423
3424 -- | This works on the invariant that all jumps in the given blocks are required.
3425 -- Starting from there we try to make a few more jumps redundant by reordering
3426 -- them.
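-- Concretely (an illustration with made-up labels): a block ending in
--
--      jne lbl2    -- conditional jump to the block that follows anyway
--      jmp lbl3
--
-- can, when lbl2's label does not have to be kept, be rewritten to end
-- in the single inverted jump "je lbl3", falling through to lbl2.  If
-- the label has to be kept but the jump to lbl3 is the more likely one,
-- we emit "je lbl3; jmp lbl2" instead.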
3427 invertCondBranches :: CFG -> LabelMap a -> [NatBasicBlock Instr]
3428 -> [NatBasicBlock Instr]
3429 invertCondBranches cfg keep bs =
3430 --trace "Foo" $
3431 invert bs
3432 where
3433 invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
3434 invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
3435 | --pprTrace "Block" (ppr lbl1) True,
3436 (jmp1,jmp2) <- last2 ins
3437 , JXX cond1 target1 <- jmp1
3438 , target1 == lbl2
3439 --, pprTrace "CutChance" (ppr b1) True
3440 , JXX ALWAYS target2 <- jmp2
3441 -- We have enough information to check if we can perform the inversion
3442 -- TODO: We could also check the last asm instruction which sets the
3443 -- status flags instead, which I suspect is worse in terms of compiler
3444 -- performance but might be applicable to more cases.
3445 , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
3446 , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
3447 -- Both jumps come from the same cmm statement
3448 , transitionSource edgeInfo1 == transitionSource edgeInfo2
3449 , (CmmSource cmmCondBranch) <- transitionSource edgeInfo1
3450
3451 -- Int comparisons are invertible
3452 , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
3453 , Just _ <- maybeIntComparison op
3454 , Just invCond <- maybeInvertCond cond1
3455
3456 -- Swap the last two jumps and invert the conditional jump's condition.
3457 = let jumps =
3458 case () of
3459 -- We are free to eliminate the jmp. So we do so.
3460 _ | not (mapMember target1 keep)
3461 -> [JXX invCond target2]
3462 -- If the conditional target is unlikely we put the other
3463 -- target at the front.
3464 | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
3465 -> [JXX invCond target2, JXX ALWAYS target1]
3466 -- Keep things as-is otherwise
3467 | otherwise
3468 -> [jmp1, jmp2]
3469 in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
3470 (BasicBlock lbl1
3471 (dropTail 2 ins ++ jumps))
3472 : invert (b2:bs)
3473 invert (b:bs) = b : invert bs
3474 invert [] = []