{-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-}

-- The default iteration limit is a bit too low for the definitions
-- in this module.
{-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}

-----------------------------------------------------------------------------
--
-- Generating machine code (instruction selection)
--
-- (c) The University of Glasgow 1996-2004
--
-----------------------------------------------------------------------------

-- This is a big module, but if you pay attention to
-- (a) the sectioning and (b) the type signatures, the
-- structure should not be too overwhelming.

module X86.CodeGen (
        cmmTopCodeGen,
        generateJumpTableForInstr,
        extractUnwindPoints,
        invertCondBranches,
        InstrBlock
)

where

#include "HsVersions.h"
#include "nativeGen/NCG.h"
#include "../includes/MachDeps.h"

-- NCG stuff:
import GhcPrelude

import X86.Instr
import X86.Cond
import X86.Regs
import X86.RegInfo

--TODO: Remove - Just for development/debugging
import X86.Ppr()

import CodeGen.Platform
import CPrim
import Debug            ( DebugBlock(..), UnwindPoint(..), UnwindTable
                        , UnwindExpr(UwReg), toUnwindExpr )
import Instruction
import PIC
import NCGMonad         ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
                        , getDeltaNat, getBlockIdNat, getPicBaseNat
                        , getNewRegPairNat, getPicBaseMaybeNat, getDebugBlock
                        , getFileId, addImmediateSuccessorNat, updateCfgNat)
import CFG
import Format
import Reg
import Platform

-- Our intermediate code:
import BasicTypes
import BlockId
import Module           ( primUnitId )
import PprCmm           ()
import CmmUtils
import CmmSwitch
import Cmm
import Hoopl.Block
import Hoopl.Collections
import Hoopl.Graph
import Hoopl.Label
import CLabel
import CoreSyn          ( Tickish(..) )
import SrcLoc           ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )

-- The rest:
import ForeignCall      ( CCallConv(..) )
import OrdList
import Outputable
import FastString
import DynFlags
import Util
import UniqSupply       ( getUniqueM )

import Control.Monad
import Data.Bits
import Data.Foldable    ( fold )
import Data.Int
import Data.Maybe
import Data.Word

import qualified Data.Map as M

is32BitPlatform :: NatM Bool
is32BitPlatform = do
    dflags <- getDynFlags
    return $ target32Bit (targetPlatform dflags)

sse2Enabled :: NatM Bool
sse2Enabled = do
    dflags <- getDynFlags
    return (isSse2Enabled dflags)

sse4_2Enabled :: NatM Bool
sse4_2Enabled = do
    dflags <- getDynFlags
    return (isSse4_2Enabled dflags)

if_sse2 :: NatM a -> NatM a -> NatM a
if_sse2 sse2 x87 = do
    b <- sse2Enabled
    if b then sse2 else x87

cmmTopCodeGen
        :: RawCmmDecl
        -> NatM [NatCmmDecl (Alignment, CmmStatics) Instr]

cmmTopCodeGen (CmmProc info lab live graph) = do
  let blocks = toBlockListEntryFirst graph
  (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
  picBaseMb <- getPicBaseMaybeNat
  dflags <- getDynFlags
  let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
      tops = proc : concat statics
      os   = platformOS $ targetPlatform dflags

  case picBaseMb of
      Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
      Nothing -> return tops

cmmTopCodeGen (CmmData sec dat) = do
  return [CmmData sec (1, dat)]  -- no translation, we just use CmmStatic


basicBlockCodeGen
        :: CmmBlock
        -> NatM ( [NatBasicBlock Instr]
                , [NatCmmDecl (Alignment, CmmStatics) Instr])

basicBlockCodeGen block = do
  let (_, nodes, tail)  = blockSplit block
      id = entryLabel block
      stmts = blockToList nodes
  -- Generate location directive
  dbg <- getDebugBlock (entryLabel block)
  loc_instrs <- case dblSourceTick =<< dbg of
    Just (SourceNote span name)
      -> do
           fileId <- getFileId (srcSpanFile span)
           let line = srcSpanStartLine span; col = srcSpanStartCol span
           return $ unitOL $ LOCATION fileId line col name
    _ -> return nilOL
  mid_instrs <- stmtsToInstrs id stmts
  tail_instrs <- stmtToInstrs id tail
  let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
  instrs' <- fold <$> traverse addSpUnwindings instrs
  -- code generation may introduce new basic block boundaries, which
  -- are indicated by the NEWBLOCK instruction. We must split up the
  -- instruction stream into basic blocks again. Also, we extract
  -- LDATAs here too.
  let
        (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'

        mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
          = ([], BasicBlock id instrs : blocks, statics)
        mkBlocks (LDATA sec dat) (instrs,blocks,statics)
          = (instrs, blocks, CmmData sec dat:statics)
        mkBlocks instr (instrs,blocks,statics)
          = (instr:instrs, blocks, statics)
  return (BasicBlock id top : other_blocks, statics)
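-- For example (illustrative), an instruction stream
--     [i1, i2, NEWBLOCK l1, i3, NEWBLOCK l2, i4, i5]
-- is split by mkBlocks into the entry block [i1, i2] followed by
-- 'BasicBlock l1 [i3]' and 'BasicBlock l2 [i4, i5]'; any LDATAs in
-- the stream are collected separately as CmmData tops.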

-- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
-- in the @sp@ register. See Note [What is this unwinding business?] in Debug
-- for details.
addSpUnwindings :: Instr -> NatM (OrdList Instr)
addSpUnwindings instr@(DELTA d) = do
    dflags <- getDynFlags
    if debugLevel dflags >= 1
        then do
            lbl <- mkAsmTempLabel <$> getUniqueM
            let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
            return $ toOL [ instr, UNWIND lbl unwind ]
        else return (unitOL instr)
addSpUnwindings instr = return $ unitOL instr
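-- For example (illustrative), with @-g@ a stack adjustment @DELTA (-8)@
-- comes out as
--     [ DELTA (-8), UNWIND lbl (MachSp -> Just (UwReg MachSp 8)) ]
-- recording that the old Sp is 8 bytes above the current one.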

stmtsToInstrs :: BlockId -> [CmmNode e x] -> NatM InstrBlock
stmtsToInstrs bid stmts = do
    instrss <- mapM (stmtToInstrs bid) stmts
    return (concatOL instrss)

-- | @bid@ refers to the current block and is used to update the CFG
-- if new blocks are inserted in the control flow.
stmtToInstrs :: BlockId -> CmmNode e x -> NatM InstrBlock
stmtToInstrs bid stmt = do
  dflags <- getDynFlags
  is32Bit <- is32BitPlatform
  case stmt of
    CmmComment s   -> return (unitOL (COMMENT s))
    CmmTick {}     -> return nilOL

    CmmUnwind regs -> do
      let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
          to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
      case foldMap to_unwind_entry regs of
        tbl | M.null tbl -> return nilOL
            | otherwise  -> do
                lbl <- mkAsmTempLabel <$> getUniqueM
                return $ unitOL $ UNWIND lbl tbl

    CmmAssign reg src
      | isFloatType ty         -> assignReg_FltCode format reg src
      | is32Bit && isWord64 ty -> assignReg_I64Code        reg src
      | otherwise              -> assignReg_IntCode format reg src
        where ty = cmmRegType dflags reg
              format = cmmTypeFormat ty

    CmmStore addr src
      | isFloatType ty         -> assignMem_FltCode format addr src
      | is32Bit && isWord64 ty -> assignMem_I64Code        addr src
      | otherwise              -> assignMem_IntCode format addr src
        where ty = cmmExprType dflags src
              format = cmmTypeFormat ty

    CmmUnsafeForeignCall target result_regs args
       -> genCCall dflags is32Bit target result_regs args bid

    CmmBranch id -> return $ genBranch id

    -- We try to arrange blocks such that the likely branch is the fallthrough
    -- in CmmContFlowOpt. So we can assume the condition is likely false here.
    CmmCondBranch arg true false _ -> genCondBranch bid true false arg

    CmmSwitch arg ids -> do
      dflags <- getDynFlags
      genSwitch dflags arg ids

    CmmCall { cml_target = arg
            , cml_args_regs = gregs } -> do
      dflags <- getDynFlags
      genJump arg (jumpRegs dflags gregs)

    _ ->
      panic "stmtToInstrs: statement should have been cps'd away"


jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
    where platform = targetPlatform dflags

--------------------------------------------------------------------------------
-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
-- They are really trees of insns to facilitate fast appending, where a
-- left-to-right traversal yields the insns in the correct order.
--
type InstrBlock
        = OrdList Instr


-- | Condition codes passed up the tree.
--
data CondCode
        = CondCode Bool Cond InstrBlock


-- | a.k.a. "Register64"
-- Reg is the lower 32-bit temporary which contains the result.
-- Use getHiVRegFromLo to find the other VRegUnique.
--
-- Rules of this simplified insn selection game are therefore that
-- the returned Reg may be modified
--
data ChildCode64
   = ChildCode64
        InstrBlock
        Reg


-- | Registers passed up the tree. If the stix code forces the register
-- to live in a pre-decided machine register, it comes out as @Fixed@;
-- otherwise, it comes out as @Any@, and the parent can decide which
-- register to put it in.
--
data Register
        = Fixed Format Reg InstrBlock
        | Any   Format (Reg -> InstrBlock)


swizzleRegisterRep :: Register -> Format -> Register
swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
swizzleRegisterRep (Any _ codefn)     format = Any   format codefn


-- | Grab the Reg for a CmmReg
getRegisterReg :: Platform -> Bool -> CmmReg -> Reg

getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk))
  = let fmt = cmmTypeFormat pk in
    if isFloatFormat fmt && not use_sse2
       then RegVirtual (mkVirtualReg u FF80)
       else RegVirtual (mkVirtualReg u fmt)

getRegisterReg platform _ (CmmGlobal mid)
  = case globalRegMaybe platform mid of
        Just reg -> RegReal $ reg
        Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
        -- By this stage, the only MagicIds remaining should be the
        -- ones which map to a real machine register on this
        -- platform.  Hence ...


-- | Memory addressing modes passed up the tree.
data Amode
        = Amode AddrMode InstrBlock

{-
Now, given a tree (the argument to a CmmLoad) that references memory,
produce a suitable addressing mode.

A Rule of the Game (tm) for Amodes: use of the addr bit must
immediately follow use of the code part, since the code part puts
values in registers which the addr then refers to.  So you can't put
anything in between, lest it overwrite some of those registers.  If
you need to do some other computation between the code part and use of
the addr bit, first store the effective address from the amode in a
temporary, then do the other computation, and then use the temporary:

    code
    LEA amode, tmp
    ... other computation ...
    ... (tmp) ...
-}


-- | Check whether an integer will fit in 32 bits.
-- A CmmInt is intended to be truncated to the appropriate
-- number of bits, so here we truncate it to Int64.  This is
-- important because e.g. -1 as a CmmInt might be either
-- -1 or 18446744073709551615.
--
is32BitInteger :: Integer -> Bool
is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
  where i64 = fromIntegral i :: Int64
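-- For example, is32BitInteger 0x7fffffff and is32BitInteger (-1) are
-- True, is32BitInteger 0x80000000 is False, and is32BitInteger
-- 18446744073709551615 is True, because that value truncates to
-- (-1) :: Int64.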


-- | Convert a BlockId to some CmmStatic data
jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
    where blockLabel = blockLbl blockid


-- -----------------------------------------------------------------------------
-- General things for putting together code sequences

-- Expand CmmRegOff.  ToDo: should we do it this way around, or convert
-- CmmExprs into CmmRegOff?
mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr
mangleIndexTree dflags reg off
  = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
  where width = typeWidth (cmmRegType dflags reg)
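-- For example, mangleIndexTree turns the register-plus-offset form
-- @CmmRegOff r 4@ into the equivalent expression
-- @CmmMachOp (MO_Add w) [CmmReg r, CmmLit (CmmInt 4 w)]@,
-- where w is the width of r.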

-- | The dual to getAnyReg: compute an expression into a register, but
--   we don't mind which one it is.
getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
getSomeReg expr = do
  r <- getRegister expr
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, code tmp)
    Fixed _ reg code ->
        return (reg, code)


assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
assignMem_I64Code addrTree valueTree = do
  Amode addr addr_code <- getAmode addrTree
  ChildCode64 vcode rlo <- iselExpr64 valueTree
  let
      rhi = getHiVRegFromLo rlo

      -- Little-endian store
      mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
      mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
  return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)


assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
   ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
   let
       r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
       r_dst_hi = getHiVRegFromLo r_dst_lo
       r_src_hi = getHiVRegFromLo r_src_lo
       mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
       mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
   return (
        vcode `snocOL` mov_lo `snocOL` mov_hi
     )

assignReg_I64Code _ _
   = panic "assignReg_I64Code(i386): invalid lvalue"


iselExpr64 :: CmmExpr -> NatM ChildCode64
iselExpr64 (CmmLit (CmmInt i _)) = do
  (rlo,rhi) <- getNewRegPairNat II32
  let
      r = fromIntegral (fromIntegral i :: Word32)
      q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
      code = toOL [
              MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
              MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
              ]
  return (ChildCode64 code rlo)

iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
   Amode addr addr_code <- getAmode addrTree
   (rlo,rhi) <- getNewRegPairNat II32
   let
       mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
       mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
   return (
            ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
                        rlo
     )

iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
   = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))

-- we handle addition, but rather badly
iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   (rlo,rhi) <- getNewRegPairNat II32
   let
       r = fromIntegral (fromIntegral i :: Word32)
       q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
       r1hi = getHiVRegFromLo r1lo
       code = code1 `appOL`
              toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                     ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
                     MOV II32 (OpReg r1hi) (OpReg rhi),
                     ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
   return (ChildCode64 code rlo)

iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   ChildCode64 code2 r2lo <- iselExpr64 e2
   (rlo,rhi) <- getNewRegPairNat II32
   let
       r1hi = getHiVRegFromLo r1lo
       r2hi = getHiVRegFromLo r2lo
       code = code1 `appOL`
              code2 `appOL`
              toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                     ADD II32 (OpReg r2lo) (OpReg rlo),
                     MOV II32 (OpReg r1hi) (OpReg rhi),
                     ADC II32 (OpReg r2hi) (OpReg rhi) ]
   return (ChildCode64 code rlo)

iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   ChildCode64 code2 r2lo <- iselExpr64 e2
   (rlo,rhi) <- getNewRegPairNat II32
   let
       r1hi = getHiVRegFromLo r1lo
       r2hi = getHiVRegFromLo r2lo
       code = code1 `appOL`
              code2 `appOL`
              toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                     SUB II32 (OpReg r2lo) (OpReg rlo),
                     MOV II32 (OpReg r1hi) (OpReg rhi),
                     SBB II32 (OpReg r2hi) (OpReg rhi) ]
   return (ChildCode64 code rlo)

iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
     fn <- getAnyReg expr
     r_dst_lo <- getNewRegNat II32
     let r_dst_hi = getHiVRegFromLo r_dst_lo
         code = fn r_dst_lo
     return (
             ChildCode64 (code `snocOL`
                          MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
                          r_dst_lo
            )

iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
     fn <- getAnyReg expr
     r_dst_lo <- getNewRegNat II32
     let r_dst_hi = getHiVRegFromLo r_dst_lo
         code = fn r_dst_lo
     return (
             ChildCode64 (code `snocOL`
                          MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
                          CLTD II32 `snocOL`
                          MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
                          MOV II32 (OpReg edx) (OpReg r_dst_hi))
                          r_dst_lo
            )
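-- The MO_SS_Conv W32 W64 case above uses CLTD (x86 @cltd@/@cdq@), which
-- sign-extends %eax into %edx, so the emitted sequence is roughly
--     movl <src>, %eax
--     cltd                  -- %edx := sign bits of %eax
--     movl %eax, <dst_lo>
--     movl %edx, <dst_hi>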

iselExpr64 expr
   = pprPanic "iselExpr64(i386)" (ppr expr)


--------------------------------------------------------------------------------
getRegister :: CmmExpr -> NatM Register
getRegister e = do
  dflags <- getDynFlags
  is32Bit <- is32BitPlatform
  getRegister' dflags is32Bit e

getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register

getRegister' dflags is32Bit (CmmReg reg)
  = case reg of
        CmmGlobal PicBaseReg
         | is32Bit -> do
            -- on x86_64, we have %rip for PicBaseReg, but it's not
            -- a full-featured register, it can only be used for
            -- rip-relative addressing.
            reg' <- getPicBaseNat (archWordFormat is32Bit)
            return (Fixed (archWordFormat is32Bit) reg' nilOL)
        _ -> do
            use_sse2 <- sse2Enabled
            let
              fmt = cmmTypeFormat (cmmRegType dflags reg)
              format | not use_sse2 && isFloatFormat fmt = FF80
                     | otherwise                         = fmt
            --
            let platform = targetPlatform dflags
            return (Fixed format
                          (getRegisterReg platform use_sse2 reg)
                          nilOL)


getRegister' dflags is32Bit (CmmRegOff r n)
  = getRegister' dflags is32Bit $ mangleIndexTree dflags r n

getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
  = addAlignmentCheck align <$> getRegister' dflags is32Bit e

-- for 32-bit architectures, support some 64 -> 32 bit conversions:
-- TO_W_(x), TO_W_(x >> 32)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
                          [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
  | is32Bit = do
      ChildCode64 code rlo <- iselExpr64 x
      return $ Fixed II32 (getHiVRegFromLo rlo) code

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
                          [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
  | is32Bit = do
      ChildCode64 code rlo <- iselExpr64 x
      return $ Fixed II32 (getHiVRegFromLo rlo) code

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
  | is32Bit = do
      ChildCode64 code rlo <- iselExpr64 x
      return $ Fixed II32 rlo code

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
  | is32Bit = do
      ChildCode64 code rlo <- iselExpr64 x
      return $ Fixed II32 rlo code

getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
  if_sse2 float_const_sse2 float_const_x87
 where
  float_const_sse2
    | f == 0.0 = do
        let
          format = floatFormat w
          code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
          -- I don't know why there are xorpd, xorps, and pxor instructions.
          -- They all appear to do the same thing --SDM
        return (Any format code)

    | otherwise = do
        Amode addr code <- memConstant (widthInBytes w) lit
        loadFloatAmode True w addr code

  float_const_x87 = case w of
    W64
      | f == 0.0 ->
          let code dst = unitOL (GLDZ dst)
          in  return (Any FF80 code)

      | f == 1.0 ->
          let code dst = unitOL (GLD1 dst)
          in  return (Any FF80 code)

    _otherwise -> do
        Amode addr code <- memConstant (widthInBytes w) lit
        loadFloatAmode False w addr code

-- catch simple cases of zero- or sign-extended load
getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVZxL II8) addr
  return (Any II32 code)

getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVSxL II8) addr
  return (Any II32 code)

getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVZxL II16) addr
  return (Any II32 code)

getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVSxL II16) addr
  return (Any II32 code)

-- catch simple cases of zero- or sign-extended load
getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
  | not is32Bit = do
      code <- intLoadCode (MOVZxL II8) addr
      return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
  | not is32Bit = do
      code <- intLoadCode (MOVSxL II8) addr
      return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
  | not is32Bit = do
      code <- intLoadCode (MOVZxL II16) addr
      return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
  | not is32Bit = do
      code <- intLoadCode (MOVSxL II16) addr
      return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
  | not is32Bit = do
      code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
      return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
  | not is32Bit = do
      code <- intLoadCode (MOVSxL II32) addr
      return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
                                                CmmLit displacement])
  | not is32Bit = do
      return $ Any II64 (\dst -> unitOL $
        LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))

getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
    sse2 <- sse2Enabled
    case mop of
      MO_F_Neg w
         | sse2      -> sse2NegCode w x
         | otherwise -> trivialUFCode FF80 (GNEG FF80) x

      MO_S_Neg w -> triv_ucode NEGI (intFormat w)
      MO_Not w   -> triv_ucode NOT  (intFormat w)

      -- Nop conversions
      MO_UU_Conv W32 W8  -> toI8Reg  W32 x
      MO_SS_Conv W32 W8  -> toI8Reg  W32 x
      MO_XX_Conv W32 W8  -> toI8Reg  W32 x
      MO_UU_Conv W16 W8  -> toI8Reg  W16 x
      MO_SS_Conv W16 W8  -> toI8Reg  W16 x
      MO_XX_Conv W16 W8  -> toI8Reg  W16 x
      MO_UU_Conv W32 W16 -> toI16Reg W32 x
      MO_SS_Conv W32 W16 -> toI16Reg W32 x
      MO_XX_Conv W32 W16 -> toI16Reg W32 x

      MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
      MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
      MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
      MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
      MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
      MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
      MO_UU_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
      MO_SS_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
      MO_XX_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x

      MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
      MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
      MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x

      -- widenings
      MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
      MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
      MO_UU_Conv W8  W16 -> integerExtend W8  W16 MOVZxL x

      MO_SS_Conv W8  W32 -> integerExtend W8  W32 MOVSxL x
      MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
      MO_SS_Conv W8  W16 -> integerExtend W8  W16 MOVSxL x

      -- We don't care about the upper bits for MO_XX_Conv, so MOV is
      -- enough.  However, on 32-bit only a few registers have 8-bit
      -- versions (as opposed to x86-64, where every register has one),
      -- so for 32-bit code we just zero-extend.
      MO_XX_Conv W8  W32
          | is32Bit   -> integerExtend W8 W32 MOVZxL x
          | otherwise -> integerExtend W8 W32 MOV x
      MO_XX_Conv W8  W16
          | is32Bit   -> integerExtend W8 W16 MOVZxL x
          | otherwise -> integerExtend W8 W16 MOV x
      MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x

      MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
      MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
      MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
      MO_SS_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVSxL x
      MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
      MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
      -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
      -- However, we don't want the register allocator to throw it
      -- away as an unnecessary reg-to-reg move, so we keep it in
      -- the form of a movzl and print it as a movl later.
      -- This doesn't apply to MO_XX_Conv, since there we don't care about
      -- the upper bits, so we can just use MOV.
      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOV x
      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x

      MO_FF_Conv W32 W64
        | sse2      -> coerceFP2FP W64 x
        | otherwise -> conversionNop FF80 x

      MO_FF_Conv W64 W32 -> coerceFP2FP W32 x

      MO_FS_Conv from to -> coerceFP2Int from to x
      MO_SF_Conv from to -> coerceInt2FP from to x

      MO_V_Insert {}   -> needLlvm
      MO_V_Extract {}  -> needLlvm
      MO_V_Add {}      -> needLlvm
      MO_V_Sub {}      -> needLlvm
      MO_V_Mul {}      -> needLlvm
      MO_VS_Quot {}    -> needLlvm
      MO_VS_Rem {}     -> needLlvm
      MO_VS_Neg {}     -> needLlvm
      MO_VU_Quot {}    -> needLlvm
      MO_VU_Rem {}     -> needLlvm
      MO_VF_Insert {}  -> needLlvm
      MO_VF_Extract {} -> needLlvm
      MO_VF_Add {}     -> needLlvm
      MO_VF_Sub {}     -> needLlvm
      MO_VF_Mul {}     -> needLlvm
      MO_VF_Quot {}    -> needLlvm
      MO_VF_Neg {}     -> needLlvm

      _other -> pprPanic "getRegister" (pprMachOp mop)
  where
    triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
    triv_ucode instr format = trivialUCode format (instr format) x

    -- signed or unsigned extension.
    integerExtend :: Width -> Width
                  -> (Format -> Operand -> Operand -> Instr)
                  -> CmmExpr -> NatM Register
    integerExtend from to instr expr = do
        (reg,e_code) <- if from == W8 then getByteReg expr
                                      else getSomeReg expr
        let
            code dst =
              e_code `snocOL`
              instr (intFormat from) (OpReg reg) (OpReg dst)
        return (Any (intFormat to) code)
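    -- e.g. 'integerExtend W8 W32 MOVZxL x' emits a @movzbl@-style
    -- zero-extending move from the byte register holding x into the
    -- destination (illustrative; registers are chosen by the allocator).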

    toI8Reg :: Width -> CmmExpr -> NatM Register
    toI8Reg new_rep expr = do
        codefn <- getAnyReg expr
        return (Any (intFormat new_rep) codefn)
        -- HACK: use getAnyReg to get a byte-addressable register.
        -- If the source was a Fixed register, this will add the
        -- mov instruction to put it into the desired destination.
        -- We're assuming that the destination won't be a fixed
        -- non-byte-addressable register; it won't be, because all
        -- fixed registers are word-sized.

    toI16Reg = toI8Reg -- for now

    conversionNop :: Format -> CmmExpr -> NatM Register
    conversionNop new_format expr = do
        e_code <- getRegister' dflags is32Bit expr
        return (swizzleRegisterRep e_code new_format)


getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
    sse2 <- sse2Enabled
    case mop of
      MO_F_Eq _ -> condFltReg is32Bit EQQ x y
      MO_F_Ne _ -> condFltReg is32Bit NE  x y
      MO_F_Gt _ -> condFltReg is32Bit GTT x y
      MO_F_Ge _ -> condFltReg is32Bit GE  x y
      -- Invert comparison condition and swap operands
      -- See Note [SSE Parity Checks]
      MO_F_Lt _ -> condFltReg is32Bit GTT y x
      MO_F_Le _ -> condFltReg is32Bit GE  y x

      MO_Eq _   -> condIntReg EQQ x y
      MO_Ne _   -> condIntReg NE  x y

      MO_S_Gt _ -> condIntReg GTT x y
      MO_S_Ge _ -> condIntReg GE  x y
      MO_S_Lt _ -> condIntReg LTT x y
      MO_S_Le _ -> condIntReg LE  x y

      MO_U_Gt _ -> condIntReg GU  x y
      MO_U_Ge _ -> condIntReg GEU x y
      MO_U_Lt _ -> condIntReg LU  x y
      MO_U_Le _ -> condIntReg LEU x y

      MO_F_Add w  | sse2      -> trivialFCode_sse2 w ADD  x y
                  | otherwise -> trivialFCode_x87    GADD x y
      MO_F_Sub w  | sse2      -> trivialFCode_sse2 w SUB  x y
                  | otherwise -> trivialFCode_x87    GSUB x y
      MO_F_Quot w | sse2      -> trivialFCode_sse2 w FDIV x y
                  | otherwise -> trivialFCode_x87    GDIV x y
      MO_F_Mul w  | sse2      -> trivialFCode_sse2 w MUL  x y
                  | otherwise -> trivialFCode_x87    GMUL x y

      MO_Add rep -> add_code rep x y
      MO_Sub rep -> sub_code rep x y

      MO_S_Quot rep -> div_code rep True  True  x y
      MO_S_Rem  rep -> div_code rep True  False x y
      MO_U_Quot rep -> div_code rep False True  x y
      MO_U_Rem  rep -> div_code rep False False x y

      MO_S_MulMayOflo rep -> imulMayOflo rep x y

      MO_Mul W8  -> imulW8 x y
      MO_Mul rep -> triv_op rep IMUL
      MO_And rep -> triv_op rep AND
      MO_Or  rep -> triv_op rep OR
      MO_Xor rep -> triv_op rep XOR

      {- Shift ops on x86s have constraints on their source, it
         either has to be Imm, CL or 1
           => trivialCode is not restrictive enough (sigh.)
      -}
      MO_Shl rep   -> shift_code rep SHL x y {-False-}
      MO_U_Shr rep -> shift_code rep SHR x y {-False-}
      MO_S_Shr rep -> shift_code rep SAR x y {-False-}

      MO_V_Insert {}   -> needLlvm
      MO_V_Extract {}  -> needLlvm
      MO_V_Add {}      -> needLlvm
      MO_V_Sub {}      -> needLlvm
      MO_V_Mul {}      -> needLlvm
      MO_VS_Quot {}    -> needLlvm
      MO_VS_Rem {}     -> needLlvm
      MO_VS_Neg {}     -> needLlvm
      MO_VF_Insert {}  -> needLlvm
      MO_VF_Extract {} -> needLlvm
      MO_VF_Add {}     -> needLlvm
      MO_VF_Sub {}     -> needLlvm
      MO_VF_Mul {}     -> needLlvm
      MO_VF_Quot {}    -> needLlvm
      MO_VF_Neg {}     -> needLlvm

      _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
  where
    --------------------
    triv_op width instr = trivialCode width op (Just op) x y
      where op = instr (intFormat width)

    -- Special case for IMUL for bytes, since the result of IMULB will be in
    -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for
    -- wider values.
    imulW8 :: CmmExpr -> CmmExpr -> NatM Register
    imulW8 arg_a arg_b = do
        (a_reg, a_code) <- getNonClobberedReg arg_a
        b_code <- getAnyReg arg_b

        let code = a_code `appOL` b_code eax `appOL`
                   toOL [ IMUL2 format (OpReg a_reg) ]
            format = intFormat W8

        return (Fixed format eax code)


    imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
    imulMayOflo rep a b = do
        (a_reg, a_code) <- getNonClobberedReg a
        b_code <- getAnyReg b
        let
            shift_amt = case rep of
                          W32 -> 31
                          W64 -> 63
                          _   -> panic "shift_amt"

            format = intFormat rep
            code = a_code `appOL` b_code eax `appOL`
                   toOL [
                      IMUL2 format (OpReg a_reg),  -- result in %edx:%eax
                      SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
                          -- sign extend lower part
                      SUB format (OpReg edx) (OpReg eax)
                          -- compare against upper
                          -- eax==0 if high part == sign extended low part
                   ]
        return (Fixed format eax code)

    --------------------
    shift_code :: Width
               -> (Format -> Operand -> Operand -> Instr)
               -> CmmExpr
               -> CmmExpr
               -> NatM Register

    {- Case1: shift length as immediate -}
    shift_code width instr x (CmmLit lit) = do
        x_code <- getAnyReg x
        let
            format = intFormat width
            code dst
               = x_code dst `snocOL`
                 instr format (OpImm (litToImm lit)) (OpReg dst)
        return (Any format code)

    {- Case2: shift length is complex (non-immediate)
      * y must go in %ecx.
      * we cannot do y first *and* put its result in %ecx, because
        %ecx might be clobbered by x.
      * if we do y second, then x cannot be
        in a clobbered reg.  Also, we cannot clobber x's reg
        with the instruction itself.
      * so we can either:
        - do y first, put its result in a fresh tmp, then copy it to %ecx later
        - do y second and put its result into %ecx.  x gets placed in a fresh
          tmp.  This is likely to be better, because the reg alloc can
          eliminate this reg->reg move here (it won't eliminate the other one,
          because the move is into the fixed %ecx).
    -}
    shift_code width instr x y{-amount-} = do
        x_code <- getAnyReg x
        let format = intFormat width
        tmp <- getNewRegNat format
        y_code <- getAnyReg y
        let
            code = x_code tmp `appOL`
                   y_code ecx `snocOL`
                   instr format (OpReg ecx) (OpReg tmp)
        return (Fixed format tmp code)

    --------------------
    add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
    add_code rep x (CmmLit (CmmInt y _))
        | is32BitInteger y = add_int rep x y
    add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
      where format = intFormat rep
    -- TODO: There are other interesting patterns we want to replace
    --       with a LEA, e.g. `(x + offset) + (y << shift)`.

    --------------------
    sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
    sub_code rep x (CmmLit (CmmInt y _))
        | is32BitInteger (-y) = add_int rep x (-y)
    sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y

    -- our three-operand add instruction:
    add_int width x y = do
        (x_reg, x_code) <- getSomeReg x
        let
            format = intFormat width
            imm = ImmInt (fromInteger y)
            code dst
               = x_code `snocOL`
                 LEA format
                     (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
                     (OpReg dst)
        --
        return (Any format code)
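        -- e.g. for @x + 12@, add_int emits (roughly) @lea 12(x_reg), dst@:
        -- a single three-operand add that leaves x_reg unchanged.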

    ----------------------

    -- See Note [DIV/IDIV for bytes]
    div_code W8 signed quotient x y = do
        let widen | signed    = MO_SS_Conv W8 W16
                  | otherwise = MO_UU_Conv W8 W16
        div_code
            W16
            signed
            quotient
            (CmmMachOp widen [x])
            (CmmMachOp widen [y])

    div_code width signed quotient x y = do
        (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
        x_code <- getAnyReg x
        let
            format = intFormat width
            widen | signed    = CLTD format
                  | otherwise = XOR format (OpReg edx) (OpReg edx)

            instr | signed    = IDIV
                  | otherwise = DIV

            code = y_code `appOL`
                   x_code eax `appOL`
                   toOL [widen, instr format y_op]

            result | quotient  = eax
                   | otherwise = edx

        return (Fixed format result code)
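        -- e.g. a signed 32-bit quotient comes out roughly as
        --     movl <x>, %eax
        --     cltd               -- sign-extend %eax into %edx
        --     idivl <y>          -- quotient in %eax, remainder in %edx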


getRegister' _ _ (CmmLoad mem pk)
  | isFloatType pk
  = do
      Amode addr mem_code <- getAmode mem
      use_sse2 <- sse2Enabled
      loadFloatAmode use_sse2 (typeWidth pk) addr mem_code

getRegister' _ is32Bit (CmmLoad mem pk)
  | is32Bit && not (isWord64 pk)
  = do
      code <- intLoadCode instr mem
      return (Any format code)
  where
    width = typeWidth pk
    format = intFormat width
    instr = case width of
              W8     -> MOVZxL II8
              _other -> MOV format
    -- We always zero-extend 8-bit loads, if we
    -- can't think of anything better.  This is because
    -- we can't guarantee access to an 8-bit variant of every register
    -- (esi and edi don't have 8-bit variants), so to make things
    -- simpler we do our 8-bit arithmetic with full 32-bit registers.

-- Simpler memory load code on x86_64
getRegister' _ is32Bit (CmmLoad mem pk)
  | not is32Bit
  = do
      code <- intLoadCode (MOV format) mem
      return (Any format code)
  where format = intFormat $ typeWidth pk

getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
  = let
        format = intFormat width

        -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
        format1 = if is32Bit then format
                  else case format of
                         II64 -> II32
                         _    -> format
        code dst
           = unitOL (XOR format1 (OpReg dst) (OpReg dst))
    in
        return (Any format code)

-- optimisation for loading small literals on x86_64: take advantage
-- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
-- instruction forms are shorter.
getRegister' dflags is32Bit (CmmLit lit)
  | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit)
  = let
        imm = litToImm lit
        code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
    in
        return (Any II64 code)
  where
    isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
    isBigLit _ = False
        -- note1: not the same as (not.is32BitLit), because that checks for
        -- signed literals that fit in 32 bits, but we want unsigned
        -- literals here.
        -- note2: all labels are small, because we're assuming the
        -- small memory model (see gcc docs, -mcmodel=small).

getRegister' dflags _ (CmmLit lit)
  = do
      let format = cmmTypeFormat (cmmLitType dflags lit)
          imm = litToImm lit
          code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
      return (Any format code)

getRegister' _ _ other
  | isVecExpr other = needLlvm
  | otherwise       = pprPanic "getRegister(x86)" (ppr other)


intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
            -> NatM (Reg -> InstrBlock)
intLoadCode instr mem = do
  Amode src mem_code <- getAmode mem
  return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))

-- Compute an expression into *any* register, adding the appropriate
-- move instruction if necessary.
getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
getAnyReg expr = do
  r <- getRegister expr
  anyReg r

anyReg :: Register -> NatM (Reg -> InstrBlock)
anyReg (Any _ code)          = return code
anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)

-- A bit like getSomeReg, but we want a reg that can be byte-addressed.
-- Fixed registers might not be byte-addressable, so we make sure we've
-- got a temporary, inserting an extra reg copy if necessary.
getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
getByteReg expr = do
  is32Bit <- is32BitPlatform
  if is32Bit
      then do
          r <- getRegister expr
          case r of
            Any rep code -> do
                tmp <- getNewRegNat rep
                return (tmp, code tmp)
            Fixed rep reg code
                | isVirtualReg reg -> return (reg,code)
                | otherwise -> do
                    tmp <- getNewRegNat rep
                    return (tmp, code `snocOL` reg2reg rep reg tmp)
                -- ToDo: could optimise slightly by checking for
                -- byte-addressable real registers, but that will
                -- happen very rarely if at all.
      else getSomeReg expr -- all regs are byte-addressable on x86_64

-- Another variant: this time we want the result in a register that cannot
-- be modified by code to evaluate an arbitrary expression.
getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
getNonClobberedReg expr = do
  dflags <- getDynFlags
  r <- getRegister expr
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, code tmp)
    Fixed rep reg code
        -- only certain regs can be clobbered
        | reg `elem` instrClobberedRegs (targetPlatform dflags)
        -> do
            tmp <- getNewRegNat rep
            return (tmp, code `snocOL` reg2reg rep reg tmp)
        | otherwise ->
            return (reg, code)

reg2reg :: Format -> Reg -> Reg -> Instr
reg2reg format src dst
  | format == FF80 = GMOV src dst
  | otherwise      = MOV format (OpReg src) (OpReg dst)


--------------------------------------------------------------------------------
getAmode :: CmmExpr -> NatM Amode
getAmode e = do
  is32Bit <- is32BitPlatform
  getAmode' is32Bit e

getAmode' :: Bool -> CmmExpr -> NatM Amode
getAmode' _ (CmmRegOff r n) = do
  dflags <- getDynFlags
  getAmode $ mangleIndexTree dflags r n

getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
                                           CmmLit displacement])
  | not is32Bit
  = return $ Amode (ripRel (litToImm displacement)) nilOL


-- This is all just ridiculous, since it carefully undoes
-- what mangleIndexTree has just done.
getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)])
  | is32BitLit is32Bit lit
  -- ASSERT(rep == II32)???
  = do
      (x_reg, x_code) <- getSomeReg x
      let off = ImmInt (-(fromInteger i))
      return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)

getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit])
  | is32BitLit is32Bit lit
  -- ASSERT(rep == II32)???
  = do
      (x_reg, x_code) <- getSomeReg x
      let off = litToImm lit
      return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)

-- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
-- recognised by the next rule.
getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
                                           b@(CmmLit _)])
  = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a])

-- Matches: (x + offset) + (y << shift)
getAmode' _ (CmmMachOp (MO_Add _) [CmmRegOff x offset,
                                   CmmMachOp (MO_Shl _)
                                        [y, CmmLit (CmmInt shift _)]])
  | shift == 0 || shift == 1 || shift == 2 || shift == 3
  = x86_complex_amode (CmmReg x) y shift (fromIntegral offset)

getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _)
                                        [y, CmmLit (CmmInt shift _)]])
  | shift == 0 || shift == 1 || shift == 2 || shift == 3
  = x86_complex_amode x y shift 0

getAmode' _ (CmmMachOp (MO_Add _)
                [x, CmmMachOp (MO_Add _)
                        [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
                         CmmLit (CmmInt offset _)]])
  | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
        && is32BitInteger offset
  -- NB: the parentheses matter here; && binds tighter than ||, so
  -- without them the offset check would apply only to shift == 3.
  = x86_complex_amode x y shift offset

getAmode' _ (CmmMachOp (MO_Add _) [x,y])
  = x86_complex_amode x y 0 0

getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
  = return (Amode (ImmAddr (litToImm lit) 0) nilOL)

getAmode' _ expr = do
  (reg,code) <- getSomeReg expr
  return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)

-- | Like 'getAmode', but on 32-bit use simple register addressing
-- (i.e. no index register). This stops us from running out of
-- registers on x86 when using instructions such as cmpxchg, which can
-- use up to three virtual registers and one fixed register.
getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode
getSimpleAmode dflags is32Bit addr
  | is32Bit = do
      addr_code <- getAnyReg addr
      addr_r <- getNewRegNat (intFormat (wordWidth dflags))
      let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
      return $! Amode amode (addr_code addr_r)
  | otherwise = getAmode addr

x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
x86_complex_amode base index shift offset
  = do
      (x_reg, x_code) <- getNonClobberedReg base
      -- x must be in a temp, because it has to stay live over y_code
      -- we could compare x_reg and y_reg and do something better here...
      (y_reg, y_code) <- getSomeReg index
      let
          code = x_code `appOL` y_code
          scale = case shift of
                    0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
                    n -> panic $ "x86_complex_amode: unhandled shift! ("
                                 ++ show n ++ ")"
      return (Amode (AddrBaseIndex (EABaseReg x_reg)
                                   (EAIndex y_reg scale)
                                   (ImmInt (fromIntegral offset)))
                    code)
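-- For example, x86_complex_amode x y 2 8 addresses @x + y*4 + 8@,
-- i.e. the operand @8(%x_reg,%y_reg,4)@ in AT&T syntax.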




-- -----------------------------------------------------------------------------
-- getOperand: sometimes any operand will do.

-- getNonClobberedOperand: the value of the operand will remain valid across
-- the computation of an arbitrary expression, unless the expression
-- is computed directly into a register which the operand refers to
-- (see trivialCode where this function is used for an example).

getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand (CmmLit lit) = do
  use_sse2 <- sse2Enabled
  if use_sse2 && isSuitableFloatingPointLit lit
    then do
      let CmmFloat _ w = lit
      Amode addr code <- memConstant (widthInBytes w) lit
      return (OpAddr addr, code)
    else do
      is32Bit <- is32BitPlatform
      dflags <- getDynFlags
      if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
        then return (OpImm (litToImm lit), nilOL)
        else getNonClobberedOperand_generic (CmmLit lit)

getNonClobberedOperand (CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2)
      && (if is32Bit then not (isWord64 pk) else True)
    then do
      dflags <- getDynFlags
      let platform = targetPlatform dflags
      Amode src mem_code <- getAmode mem
      (src',save_code) <-
        if amodeCouldBeClobbered platform src
          then do
            tmp <- getNewRegNat (archWordFormat is32Bit)
            return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
                    unitOL (LEA (archWordFormat is32Bit)
                                (OpAddr src)
                                (OpReg tmp)))
          else
            return (src, nilOL)
      return (OpAddr src', mem_code `appOL` save_code)
    else
      getNonClobberedOperand_generic (CmmLoad mem pk)

getNonClobberedOperand e = getNonClobberedOperand_generic e

getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand_generic e = do
    (reg, code) <- getNonClobberedReg e
    return (OpReg reg, code)

amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)

regClobbered :: Platform -> Reg -> Bool
regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
regClobbered _ _ = False

-- getOperand: the operand is not required to remain valid across the
-- computation of an arbitrary expression.
getOperand :: CmmExpr -> NatM (Operand, InstrBlock)

getOperand (CmmLit lit) = do
  use_sse2 <- sse2Enabled
  if use_sse2 && isSuitableFloatingPointLit lit
    then do
      let CmmFloat _ w = lit
      Amode addr code <- memConstant (widthInBytes w) lit
      return (OpAddr addr, code)
    else do
      is32Bit <- is32BitPlatform
      dflags <- getDynFlags
      if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
        then return (OpImm (litToImm lit), nilOL)
        else getOperand_generic (CmmLit lit)

getOperand (CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2)
      && (if is32Bit then not (isWord64 pk) else True)
    then do
      Amode src mem_code <- getAmode mem
      return (OpAddr src, mem_code)
    else
      getOperand_generic (CmmLoad mem pk)

getOperand e = getOperand_generic e

getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
getOperand_generic e = do
    (reg, code) <- getSomeReg e
    return (OpReg reg, code)

isOperand :: Bool -> CmmExpr -> Bool
isOperand _ (CmmLoad _ _) = True
isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
                              || isSuitableFloatingPointLit lit
isOperand _ _ = False

-- | Given a 'Register', produce a new 'Register' with an instruction block
-- which will check the value for alignment. Used for @-falignment-sanitisation@.
addAlignmentCheck :: Int -> Register -> Register
addAlignmentCheck align reg =
    case reg of
      Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
      Any fmt f          -> Any fmt (\reg -> f reg `appOL` check fmt reg)
  where
    check :: Format -> Reg -> InstrBlock
    check fmt reg =
        ASSERT(not $ isFloatFormat fmt)
        toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
             , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
             ]
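    -- For example, with @align@ = 8 the check is roughly
    --     test $7, <reg>
    --     jne  <bad-alignment label>
    -- appended after the code that computes the value.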

memConstant :: Int -> CmmLit -> NatM Amode
memConstant align lit = do
  lbl <- getNewLabelNat
  let rosection = Section ReadOnlyData lbl
  dflags <- getDynFlags
  (addr, addr_code) <-
      if target32Bit (targetPlatform dflags)
        then do
          dynRef <- cmmMakeDynamicReference dflags DataReference lbl
          Amode addr addr_code <- getAmode dynRef
          return (addr, addr_code)
        else
          return (ripRel (ImmCLbl lbl), nilOL)
  let code =
        LDATA rosection (align, Statics lbl [CmmStaticLit lit])
        `consOL` addr_code
  return (Amode addr code)


loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register
loadFloatAmode use_sse2 w addr addr_code = do
  let format = floatFormat w
      code dst = addr_code `snocOL`
                 if use_sse2
                    then MOV format (OpAddr addr) (OpReg dst)
                    else GLD format addr dst
  return (Any (if use_sse2 then format else FF80) code)


-- if we want a floating-point literal as an operand, we can
-- use it directly from memory.  However, if the literal is
-- zero, we're better off generating it into a register using
-- xor.
isSuitableFloatingPointLit :: CmmLit -> Bool
isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
isSuitableFloatingPointLit _ = False

getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
getRegOrMem e@(CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2)
      && (if is32Bit then not (isWord64 pk) else True)
    then do
      Amode src mem_code <- getAmode mem
      return (OpAddr src, mem_code)
    else do
      (reg, code) <- getNonClobberedReg e
      return (OpReg reg, code)
getRegOrMem e = do
    (reg, code) <- getNonClobberedReg e
    return (OpReg reg, code)

is32BitLit :: Bool -> CmmLit -> Bool
is32BitLit is32Bit (CmmInt i W64)
  | not is32Bit
  = -- assume that labels are in the range 0-2^31-1: this assumes the
    -- small memory model (see gcc docs, -mcmodel=small).
    is32BitInteger i
is32BitLit _ _ = True



-- Set up a condition code for a conditional branch.

getCondCode :: CmmExpr -> NatM CondCode

-- yes, they really do seem to want exactly the same!

getCondCode (CmmMachOp mop [x, y])
  =
    case mop of
      MO_F_Eq W32 -> condFltCode EQQ x y
      MO_F_Ne W32 -> condFltCode NE  x y
      MO_F_Gt W32 -> condFltCode GTT x y
      MO_F_Ge W32 -> condFltCode GE  x y
      -- Invert comparison condition and swap operands
      -- See Note [SSE Parity Checks]
      MO_F_Lt W32 -> condFltCode GTT y x
      MO_F_Le W32 -> condFltCode GE  y x

      MO_F_Eq W64 -> condFltCode EQQ x y
      MO_F_Ne W64 -> condFltCode NE  x y
      MO_F_Gt W64 -> condFltCode GTT x y
      MO_F_Ge W64 -> condFltCode GE  x y
      MO_F_Lt W64 -> condFltCode GTT y x
      MO_F_Le W64 -> condFltCode GE  y x

      _ -> condIntCode (machOpToCond mop) x y

getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other)

machOpToCond :: MachOp -> Cond
machOpToCond mo = case mo of
  MO_Eq _   -> EQQ
  MO_Ne _   -> NE
  MO_S_Gt _ -> GTT
  MO_S_Ge _ -> GE
  MO_S_Lt _ -> LTT
  MO_S_Le _ -> LE
  MO_U_Gt _ -> GU
  MO_U_Ge _ -> GEU
  MO_U_Lt _ -> LU
  MO_U_Le _ -> LEU
  _other -> pprPanic "machOpToCond" (pprMachOp mo)


-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
-- passed back up the tree.

condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
condIntCode cond x y = do
  is32Bit <- is32BitPlatform
  condIntCode' is32Bit cond x y

condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode

-- memory vs immediate
condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
  | is32BitLit is32Bit lit = do
      Amode x_addr x_code <- getAmode x
      let
          imm  = litToImm lit
          code = x_code `snocOL`
                 CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
      --
      return (CondCode False cond code)

-- anything vs zero, using a mask
-- TODO: Add some sanity checking!!!!
condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
  | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
  = do
      (x_reg, x_code) <- getSomeReg x
      let
          code = x_code `snocOL`
                 TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
      --
      return (CondCode False cond code)

-- anything vs zero
condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
    (x_reg, x_code) <- getSomeReg x
    let
        code = x_code `snocOL`
               TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
    --
    return (CondCode False cond code)
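-- e.g. comparing x against 0 emits @test x_reg, x_reg@ rather than
-- @cmp $0, x_reg@; TEST is shorter and sets the same flags here.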

-- anything vs operand
condIntCode' is32Bit cond x y
  | isOperand is32Bit y = do
      dflags <- getDynFlags
      (x_reg, x_code) <- getNonClobberedReg x
      (y_op,  y_code) <- getOperand y
      let
          code = x_code `appOL` y_code `snocOL`
                 CMP (cmmTypeFormat (cmmExprType dflags x)) y_op (OpReg x_reg)
      return (CondCode False cond code)
  -- operand vs. anything: invert the comparison so that we can use a
  -- single comparison instruction.
  | isOperand is32Bit x
  , Just revcond <- maybeFlipCond cond = do
      dflags <- getDynFlags
      (y_reg, y_code) <- getNonClobberedReg y
      (x_op,  x_code) <- getOperand x
      let
          code = y_code `appOL` x_code `snocOL`
                 CMP (cmmTypeFormat (cmmExprType dflags x)) x_op (OpReg y_reg)
      return (CondCode False revcond code)

-- anything vs anything
condIntCode' _ cond x y = do
  dflags <- getDynFlags
  (y_reg, y_code) <- getNonClobberedReg y
  (x_op,  x_code) <- getRegOrMem x
  let
      code = y_code `appOL`
             x_code `snocOL`
             CMP (cmmTypeFormat (cmmExprType dflags x)) (OpReg y_reg) x_op
  return (CondCode False cond code)



--------------------------------------------------------------------------------
condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode

condFltCode cond x y
  = if_sse2 condFltCode_sse2 condFltCode_x87
  where

  condFltCode_x87
    = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
      (x_reg, x_code) <- getNonClobberedReg x
      (y_reg, y_code) <- getSomeReg y
      let
          code = x_code `appOL` y_code `snocOL`
                 GCMP cond x_reg y_reg
      -- The GCMP insn does the test and sets the zero flag if comparable
      -- and true.  Hence we always supply EQQ as the condition to test.
      return (CondCode True EQQ code)

  -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
  -- an operand, but the right must be a reg.  We can probably do better
  -- than this general case...
  condFltCode_sse2 = do
      dflags <- getDynFlags
      (x_reg, x_code) <- getNonClobberedReg x
      (y_op,  y_code) <- getOperand y
      let
          code = x_code `appOL`
                 y_code `snocOL`
                 CMP (floatFormat $ cmmExprWidth dflags x) y_op (OpReg x_reg)
          -- NB(1): we need to use the unsigned comparison operators on the
          -- result of this comparison.
      return (CondCode True (condToUnsigned cond) code)

-- -----------------------------------------------------------------------------
-- Generating assignments

-- Assignments are really at the heart of the whole code generation
-- business.  Almost all top-level nodes of any real importance are
-- assignments, which correspond to loads, stores, or register
-- transfers.  If we're really lucky, some of the register transfers
-- will go away, because we can use the destination register to
-- complete the code generation for the right hand side.  This only
-- fails when the right hand side is forced into a fixed register
-- (e.g. the result of a call).

assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_IntCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock

assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_FltCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock


-- integer assignment to memory

-- specific case of adding/subtracting an integer to a particular address.
-- ToDo: catch other cases where we can use an operation directly on a memory
-- address.
assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
                                         CmmLit (CmmInt i _)])
  | addr == addr2, pk /= II64 || is32BitInteger i,
    Just instr <- check op
  = do
      Amode amode code_addr <- getAmode addr
      let code = code_addr `snocOL`
                 instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
      return code
  where
    check (MO_Add _) = Just ADD
    check (MO_Sub _) = Just SUB
    check _ = Nothing
    -- ToDo: more?
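    -- e.g. an assignment like @m[p] := m[p] + 4@ becomes a single
    -- read-modify-write @add $4, (p)@ instead of a separate load,
    -- add, and store.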
1607
1608 -- general case
1609 assignMem_IntCode pk addr src = do
1610 is32Bit <- is32BitPlatform
1611 Amode addr code_addr <- getAmode addr
1612 (code_src, op_src) <- get_op_RI is32Bit src
1613 let
1614 code = code_src `appOL`
1615 code_addr `snocOL`
1616 MOV pk op_src (OpAddr addr)
1617 -- NOTE: op_src is stable, so it will still be valid
1618 -- after code_addr. This may involve the introduction
1619 -- of an extra MOV to a temporary register, but we hope
1620 -- the register allocator will get rid of it.
1621 --
1622 return code
1623 where
1624 get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand) -- code, operand
1625 get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
1626 = return (nilOL, OpImm (litToImm lit))
1627 get_op_RI _ op
1628 = do (reg,code) <- getNonClobberedReg op
1629 return (code, OpReg reg)
1630
1631
1632 -- Assign; dst is a reg, rhs is mem
1633 assignReg_IntCode pk reg (CmmLoad src _) = do
1634 load_code <- intLoadCode (MOV pk) src
1635 dflags <- getDynFlags
1636 let platform = targetPlatform dflags
1637 return (load_code (getRegisterReg platform False{-no sse2-} reg))
1638
1639 -- dst is a reg, but src could be anything
1640 assignReg_IntCode _ reg src = do
1641 dflags <- getDynFlags
1642 let platform = targetPlatform dflags
1643 code <- getAnyReg src
1644 return (code (getRegisterReg platform False{-no sse2-} reg))
1645
1646
1647 -- Floating point assignment to memory
1648 assignMem_FltCode pk addr src = do
1649 (src_reg, src_code) <- getNonClobberedReg src
1650 Amode addr addr_code <- getAmode addr
1651 use_sse2 <- sse2Enabled
1652 let
1653 code = src_code `appOL`
1654 addr_code `snocOL`
1655 if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr)
1656 else GST pk src_reg addr
1657 return code
1658
1659 -- Floating point assignment to a register/temporary
1660 assignReg_FltCode _ reg src = do
1661 use_sse2 <- sse2Enabled
1662 src_code <- getAnyReg src
1663 dflags <- getDynFlags
1664 let platform = targetPlatform dflags
1665 return (src_code (getRegisterReg platform use_sse2 reg))
1666
1667
1668 genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
1669
1670 genJump (CmmLoad mem _) regs = do
1671 Amode target code <- getAmode mem
1672 return (code `snocOL` JMP (OpAddr target) regs)
1673
1674 genJump (CmmLit lit) regs = do
1675 return (unitOL (JMP (OpImm (litToImm lit)) regs))
1676
1677 genJump expr regs = do
1678 (reg,code) <- getSomeReg expr
1679 return (code `snocOL` JMP (OpReg reg) regs)
1680
1681
1682 -- -----------------------------------------------------------------------------
1683 -- Unconditional branches
1684
1685 genBranch :: BlockId -> InstrBlock
1686 genBranch = toOL . mkJumpInstr
1687
1688
1689
1690 -- -----------------------------------------------------------------------------
1691 -- Conditional jumps/branches
1692
1693 {-
1694 Conditional jumps are always to local labels, so we can use branch
1695 instructions. We peek at the arguments to decide what kind of
1696 comparison to do.
1697
1698 I386: First, we have to ensure that the condition
1699 codes are set according to the supplied comparison operation.
1700 -}
1701
1702
1703 genCondBranch
1704 :: BlockId -- the source of the jump
1705 -> BlockId -- the true branch target
1706 -> BlockId -- the false branch target
1707 -> CmmExpr -- the condition on which to branch
1708 -> NatM InstrBlock -- Instructions
1709
1710 genCondBranch bid id false expr = do
1711 is32Bit <- is32BitPlatform
1712 genCondBranch' is32Bit bid id false expr
1713
1714 -- | We return the instructions generated.
1715 genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
1716 -> NatM InstrBlock
1717
1718 -- 64-bit integer comparisons on 32-bit
1719 genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
1720 | is32Bit, Just W64 <- maybeIntComparison mop = do
1721 ChildCode64 code1 r1_lo <- iselExpr64 e1
1722 ChildCode64 code2 r2_lo <- iselExpr64 e2
1723 let r1_hi = getHiVRegFromLo r1_lo
1724 r2_hi = getHiVRegFromLo r2_lo
1725 cond = machOpToCond mop
1726 Just cond' = maybeFlipCond cond
1727 -- TODO: Update CFG for x86
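-- The high words are compared first: a strict result there decides
-- the comparison immediately, and only if the high words are equal
-- do we fall through to compare the low words.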
1728 let code = code1 `appOL` code2 `appOL` toOL [
1729 CMP II32 (OpReg r2_hi) (OpReg r1_hi),
1730 JXX cond true,
1731 JXX cond' false,
1732 CMP II32 (OpReg r2_lo) (OpReg r1_lo),
1733 JXX cond true] `appOL` genBranch false
1734 return code
1735
1736 genCondBranch' _ bid id false bool = do
1737 CondCode is_float cond cond_code <- getCondCode bool
1738 use_sse2 <- sse2Enabled
1739 if not is_float || not use_sse2
1740 then
1741 return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
1742 else do
1743 -- See Note [SSE Parity Checks]
1744 let jmpFalse = genBranch false
1745 code
1746 = case cond of
1747 NE -> or_unordered
1748 GU -> plain_test
1749 GEU -> plain_test
1750 -- Use ASSERT so we don't break releases if
1751 -- LTT/LE creep in somehow.
1752 LTT ->
1753 ASSERT2(False, ppr "Should have been turned into >")
1754 and_ordered
1755 LE ->
1756 ASSERT2(False, ppr "Should have been turned into >=")
1757 and_ordered
1758 _ -> and_ordered
1759
1760 plain_test = unitOL (
1761 JXX cond id
1762 ) `appOL` jmpFalse
1763 or_unordered = toOL [
1764 JXX cond id,
1765 JXX PARITY id
1766 ] `appOL` jmpFalse
1767 and_ordered = toOL [
1768 JXX PARITY false,
1769 JXX cond id,
1770 JXX ALWAYS false
1771 ]
1772 updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
1773 return (cond_code `appOL` code)
1774
1775 -- -----------------------------------------------------------------------------
1776 -- Generating C calls
1777
1778 -- Now the biggest nightmare---calls. Most of the nastiness is buried in
1779 -- @get_arg@, which moves the arguments to the correct registers/stack
1780 -- locations. Apart from that, the code is easy.
1781 --
1782 -- (If applicable) Do not fill the delay slots here; you will confuse the
1783 -- register allocator.
1784
1785 genCCall
1786 :: DynFlags
1787 -> Bool -- 32 bit platform?
1788 -> ForeignTarget -- function to call
1789 -> [CmmFormal] -- where to put the result
1790 -> [CmmActual] -- arguments (of mixed type)
1791 -> BlockId -- The block we are in
1792 -> NatM InstrBlock
1793
1794 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1795
1796 -- Unroll memcpy calls if the source and destination pointers are at
1797 -- least DWORD aligned and the number of bytes to copy isn't too
1798 -- large. Otherwise, call C's memcpy.
1799 genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
1800 [dst, src, CmmLit (CmmInt n _)] _
1801 | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do
1802 code_dst <- getAnyReg dst
1803 dst_r <- getNewRegNat format
1804 code_src <- getAnyReg src
1805 src_r <- getNewRegNat format
1806 tmp_r <- getNewRegNat format
1807 return $ code_dst dst_r `appOL` code_src src_r `appOL`
1808 go dst_r src_r tmp_r (fromInteger n)
1809 where
1810 -- The number of instructions we will generate (approx). We need 2
1811 -- instructions per move.
1812 insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
1813
1814 format = if align .&. 4 /= 0 then II32 else (archWordFormat is32Bit)
1815
1816 -- The size of each move, in bytes.
1817 sizeBytes :: Integer
1818 sizeBytes = fromIntegral (formatInBytes format)
1819
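-- For example (a sketch of the generated code, not verbatim
-- output): on x86-64 with 8-byte-aligned pointers, copying n = 10
-- bytes unrolls into two load/store pairs through the temporary:
--   mov    0(src), tmp  ;  mov  tmp, 0(dst)    -- bytes 0..7
--   movzwl 8(src), tmp  ;  movw tmp, 8(dst)    -- bytes 8..9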
1820 go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
1821 go dst src tmp i
1822 | i >= sizeBytes =
1823 unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
1824 unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
1825 go dst src tmp (i - sizeBytes)
1826 -- Deal with remaining bytes.
1827 | i >= 4 = -- Will never happen on 32-bit
1828 unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
1829 unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
1830 go dst src tmp (i - 4)
1831 | i >= 2 =
1832 unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
1833 unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
1834 go dst src tmp (i - 2)
1835 | i >= 1 =
1836 unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
1837 unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
1838 go dst src tmp (i - 1)
1839 | otherwise = nilOL
1840 where
1841 src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
1842 (ImmInteger (n - i))
1843 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
1844 (ImmInteger (n - i))
1845
1846 genCCall dflags _ (PrimTarget (MO_Memset align)) _
1847 [dst,
1848 CmmLit (CmmInt c _),
1849 CmmLit (CmmInt n _)]
1850 _
1851 | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 3 == 0 = do
1852 code_dst <- getAnyReg dst
1853 dst_r <- getNewRegNat format
1854 return $ code_dst dst_r `appOL` go dst_r (fromInteger n)
1855 where
1856 (format, val) = case align .&. 3 of
1857 2 -> (II16, c2)
1858 0 -> (II32, c4)
1859 _ -> (II8, c)
1860 c2 = c `shiftL` 8 .|. c
1861 c4 = c2 `shiftL` 16 .|. c2
1862
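-- For example, with align = 4 and c = 0xAB the fill byte is
-- replicated to c2 = 0xABAB and c4 = 0xABABABAB, so the bulk of
-- the range is written with 4-byte MOVs and any 2-byte/1-byte
-- remainder falls back to c2 and c respectively.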
1863 -- The number of instructions we will generate (approx). We need 1
1864 -- instruction per move.
1865 insns = (n + sizeBytes - 1) `div` sizeBytes
1866
1867 -- The size of each move, in bytes.
1868 sizeBytes :: Integer
1869 sizeBytes = fromIntegral (formatInBytes format)
1870
1871 go :: Reg -> Integer -> OrdList Instr
1872 go dst i
1873 -- TODO: Add movabs instruction and support 64-bit sets.
1874 | i >= sizeBytes = -- This might be smaller than the below sizes
1875 unitOL (MOV format (OpImm (ImmInteger val)) (OpAddr dst_addr)) `appOL`
1876 go dst (i - sizeBytes)
1877 | i >= 4 = -- Will never happen on 32-bit
1878 unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr dst_addr)) `appOL`
1879 go dst (i - 4)
1880 | i >= 2 =
1881 unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr dst_addr)) `appOL`
1882 go dst (i - 2)
1883 | i >= 1 =
1884 unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr dst_addr)) `appOL`
1885 go dst (i - 1)
1886 | otherwise = nilOL
1887 where
1888 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
1889 (ImmInteger (n - i))
1890
1891 genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
1892 -- write barrier compiles to no code on x86/x86-64;
1893 -- we keep it this long in order to prevent earlier optimisations.
1894
1895 genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
1896
1897 genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
1898 case n of
1899 0 -> genPrefetch src $ PREFETCH NTA format
1900 1 -> genPrefetch src $ PREFETCH Lvl2 format
1901 2 -> genPrefetch src $ PREFETCH Lvl1 format
1902 3 -> genPrefetch src $ PREFETCH Lvl0 format
1903 l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
1904 -- the C / LLVM prefetch convention is 0, 1, 2, and 3;
1905 -- the corresponding x86 prefetch hints are: NTA, T2, T1, and T0
1906 where
1907 format = archWordFormat is32bit
1908 -- we need to know the register width for pointers!
1909 genPrefetch inRegSrc prefetchCTor =
1910 do
1911 code_src <- getAnyReg inRegSrc
1912 src_r <- getNewRegNat format
1913 return $ code_src src_r `appOL`
1914 (unitOL (prefetchCTor (OpAddr
1915 ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
1916 -- prefetch always takes an address
1917
1918 genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
1919 let platform = targetPlatform dflags
1920 let dst_r = getRegisterReg platform False (CmmLocal dst)
1921 case width of
1922 W64 | is32Bit -> do
1923 ChildCode64 vcode rlo <- iselExpr64 src
1924 let dst_rhi = getHiVRegFromLo dst_r
1925 rhi = getHiVRegFromLo rlo
1926 return $ vcode `appOL`
1927 toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
1928 MOV II32 (OpReg rhi) (OpReg dst_r),
1929 BSWAP II32 dst_rhi,
1930 BSWAP II32 dst_r ]
1931 W16 -> do code_src <- getAnyReg src
1932 return $ code_src dst_r `appOL`
1933 unitOL (BSWAP II32 dst_r) `appOL`
1934 unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
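-- (For W16 we byte-swap the whole 32-bit register and then
-- shift the two interesting bytes back down, e.g.
-- 0x00001234 -> 0x34120000 -> 0x00003412.)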
1935 _ -> do code_src <- getAnyReg src
1936 return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
1937 where
1938 format = intFormat width
1939
1940 genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
1941 args@[src] bid = do
1942 sse4_2 <- sse4_2Enabled
1943 let platform = targetPlatform dflags
1944 if sse4_2
1945 then do code_src <- getAnyReg src
1946 src_r <- getNewRegNat format
1947 let dst_r = getRegisterReg platform False (CmmLocal dst)
1948 return $ code_src src_r `appOL`
1949 (if width == W8 then
1950 -- The POPCNT instruction doesn't take a r/m8
1951 unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
1952 unitOL (POPCNT II16 (OpReg src_r) dst_r)
1953 else
1954 unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
1955 (if width == W8 || width == W16 then
1956 -- We used a 16-bit destination register above,
1957 -- so zero-extend
1958 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
1959 else nilOL)
1960 else do
1961 targetExpr <- cmmMakeDynamicReference dflags
1962 CallReference lbl
1963 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
1964 [NoHint] [NoHint]
1965 CmmMayReturn)
1966 genCCall dflags is32Bit target dest_regs args bid
1967 where
1968 format = intFormat width
1969 lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
1970
1971 genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
1972 args@[src, mask] bid = do
1973 let platform = targetPlatform dflags
1974 if isBmi2Enabled dflags
1975 then do code_src <- getAnyReg src
1976 code_mask <- getAnyReg mask
1977 src_r <- getNewRegNat format
1978 mask_r <- getNewRegNat format
1979 let dst_r = getRegisterReg platform False (CmmLocal dst)
1980 return $ code_src src_r `appOL` code_mask mask_r `appOL`
1981 (if width == W8 then
1982 -- The PDEP instruction doesn't take a r/m8
1983 unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
1984 unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
1985 unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
1986 else
1987 unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
1988 (if width == W8 || width == W16 then
1989 -- We used a 16-bit destination register above,
1990 -- so zero-extend
1991 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
1992 else nilOL)
1993 else do
1994 targetExpr <- cmmMakeDynamicReference dflags
1995 CallReference lbl
1996 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
1997 [NoHint] [NoHint]
1998 CmmMayReturn)
1999 genCCall dflags is32Bit target dest_regs args bid
2000 where
2001 format = intFormat width
2002 lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
2003
2004 genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
2005 args@[src, mask] bid = do
2006 let platform = targetPlatform dflags
2007 if isBmi2Enabled dflags
2008 then do code_src <- getAnyReg src
2009 code_mask <- getAnyReg mask
2010 src_r <- getNewRegNat format
2011 mask_r <- getNewRegNat format
2012 let dst_r = getRegisterReg platform False (CmmLocal dst)
2013 return $ code_src src_r `appOL` code_mask mask_r `appOL`
2014 (if width == W8 then
2015 -- The PEXT instruction doesn't take a r/m8
2016 unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
2017 unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
2018 unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
2019 else
2020 unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
2021 (if width == W8 || width == W16 then
2022 -- We used a 16-bit destination register above,
2023 -- so zero-extend
2024 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
2025 else nilOL)
2026 else do
2027 targetExpr <- cmmMakeDynamicReference dflags
2028 CallReference lbl
2029 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2030 [NoHint] [NoHint]
2031 CmmMayReturn)
2032 genCCall dflags is32Bit target dest_regs args bid
2033 where
2034 format = intFormat width
2035 lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
2036
2037 genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
2038 | is32Bit && width == W64 = do
2039 -- Fallback to `hs_clz64` on i386
2040 targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
2041 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2042 [NoHint] [NoHint]
2043 CmmMayReturn)
2044 genCCall dflags is32Bit target dest_regs args bid
2045
2046 | otherwise = do
2047 code_src <- getAnyReg src
2048 let dst_r = getRegisterReg platform False (CmmLocal dst)
2049 if isBmi2Enabled dflags
2050 then do
2051 src_r <- getNewRegNat (intFormat width)
2052 return $ appOL (code_src src_r) $ case width of
2053 W8 -> toOL
2054 [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit
2055 , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros
2056 , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
2057 ]
2058 W16 -> toOL
2059 [ LZCNT II16 (OpReg src_r) dst_r
2060 , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
2061 ]
2062 _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
2063 else do
2064 let format = if width == W8 then II16 else intFormat width
2065 src_r <- getNewRegNat format
2066 tmp_r <- getNewRegNat format
2067 return $ code_src src_r `appOL` toOL
2068 ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
2069 [ BSR format (OpReg src_r) tmp_r
2070 , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
2071 , CMOV NE format (OpReg tmp_r) dst_r
2072 , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
2073 ]) -- NB: We don't need to zero-extend the result for the
2074 -- W8/W16 cases because the 'MOV' insn already
2075 -- took care of implicitly clearing the upper bits
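-- Worked example for W32 (a sketch): if src == 0 the CMOV is not
-- taken and dst = 63 `xor` 31 = 32; otherwise
-- dst = BSR(src) `xor` 31 = 31 - BSR(src), i.e. exactly the
-- leading-zero count.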
2076 where
2077 bw = widthInBits width
2078 platform = targetPlatform dflags
2079 lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
2080
2081 genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
2082 | is32Bit, width == W64 = do
2083 ChildCode64 vcode rlo <- iselExpr64 src
2084 let rhi = getHiVRegFromLo rlo
2085 dst_r = getRegisterReg platform False (CmmLocal dst)
2086 lbl1 <- getBlockIdNat
2087 lbl2 <- getBlockIdNat
2088 let format = if width == W8 then II16 else intFormat width
2089 tmp_r <- getNewRegNat format
2090
2091 -- New CFG Edges:
2092 -- bid -> lbl2
2093 -- bid -> lbl1 -> lbl2
2094 -- We also change edges originating at bid to start at lbl2 instead.
2095 updateCfgNat (addWeightEdge bid lbl1 110 .
2096 addWeightEdge lbl1 lbl2 110 .
2097 addImmediateSuccessor bid lbl2)
2098
2099 -- The following instruction sequence corresponds to the pseudo-code
2100 --
2101 -- if (src) {
2102 -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
2103 -- } else {
2104 -- dst = 64;
2105 -- }
2106 return $ vcode `appOL` toOL
2107 ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
2108 , OR II32 (OpReg rlo) (OpReg tmp_r)
2109 , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
2110 , JXX EQQ lbl2
2111 , JXX ALWAYS lbl1
2112
2113 , NEWBLOCK lbl1
2114 , BSF II32 (OpReg rhi) dst_r
2115 , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
2116 , BSF II32 (OpReg rlo) tmp_r
2117 , CMOV NE II32 (OpReg tmp_r) dst_r
2118 , JXX ALWAYS lbl2
2119
2120 , NEWBLOCK lbl2
2121 ])
2122
2123 | otherwise = do
2124 code_src <- getAnyReg src
2125 let dst_r = getRegisterReg platform False (CmmLocal dst)
2126
2127 if isBmi2Enabled dflags
2128 then do
2129 src_r <- getNewRegNat (intFormat width)
2130 return $ appOL (code_src src_r) $ case width of
2131 W8 -> toOL
2132 [ OR II32 (OpImm (ImmInt 0xFFFFFF00)) (OpReg src_r)
2133 , TZCNT II32 (OpReg src_r) dst_r
2134 ]
2135 W16 -> toOL
2136 [ TZCNT II16 (OpReg src_r) dst_r
2137 , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
2138 ]
2139 _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
2140 else do
2141 -- The following insn sequence makes sure 'ctz 0' has a defined value.
2142 -- Starting with Haswell, one could use the TZCNT insn instead.
2143 let format = if width == W8 then II16 else intFormat width
2144 src_r <- getNewRegNat format
2145 tmp_r <- getNewRegNat format
2146 return $ code_src src_r `appOL` toOL
2147 ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
2148 [ BSF format (OpReg src_r) tmp_r
2149 , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
2150 , CMOV NE format (OpReg tmp_r) dst_r
2151 ]) -- NB: We don't need to zero-extend the result for the
2152 -- W8/W16 cases because the 'MOV' insn already
2153 -- took care of implicitly clearing the upper bits
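-- Worked example for W32 (a sketch): dst is preloaded with bw = 32;
-- if src /= 0 the CMOV overwrites it with BSF(src), so ctz 0 = 32
-- as required.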
2154 where
2155 bw = widthInBits width
2156 platform = targetPlatform dflags
2157
2158 genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
2159 targetExpr <- cmmMakeDynamicReference dflags
2160 CallReference lbl
2161 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2162 [NoHint] [NoHint]
2163 CmmMayReturn)
2164 genCCall dflags is32Bit target dest_regs args bid
2165 where
2166 lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width))
2167
2168 genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
2169 [dst] [addr, n] bid = do
2170 Amode amode addr_code <-
2171 if amop `elem` [AMO_Add, AMO_Sub]
2172 then getAmode addr
2173 else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
2174 arg <- getNewRegNat format
2175 arg_code <- getAnyReg n
2176 use_sse2 <- sse2Enabled
2177 let platform = targetPlatform dflags
2178 dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
2179 code <- op_code dst_r arg amode
2180 return $ addr_code `appOL` arg_code arg `appOL` code
2181 where
2182 -- Code for the operation
2183 op_code :: Reg -- Destination reg
2184 -> Reg -- Register containing argument
2185 -> AddrMode -- Address of location to mutate
2186 -> NatM (OrdList Instr)
2187 op_code dst_r arg amode = case amop of
2188 -- In the common case where dst_r is a virtual register the
2189 -- final move should go away, because it's the last use of arg
2190 -- and the first use of dst_r.
2191 AMO_Add -> return $ toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
2192 , MOV format (OpReg arg) (OpReg dst_r)
2193 ]
2194 AMO_Sub -> return $ toOL [ NEGI format (OpReg arg)
2195 , LOCK (XADD format (OpReg arg) (OpAddr amode))
2196 , MOV format (OpReg arg) (OpReg dst_r)
2197 ]
2198 AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
2199 AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
2200 , NOT format dst
2201 ])
2202 AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
2203 AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
2204 where
2205 -- Simulate operation that lacks a dedicated instruction using
2206 -- cmpxchg.
2207 cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
2208 -> NatM (OrdList Instr)
2209 cmpxchg_code instrs = do
2210 lbl <- getBlockIdNat
2211 tmp <- getNewRegNat format
2212
2213 -- Record inserted blocks
2214 addImmediateSuccessorNat bid lbl
2215 updateCfgNat (addWeightEdge lbl lbl 0)
2216
2217 return $ toOL
2218 [ MOV format (OpAddr amode) (OpReg eax)
2219 , JXX ALWAYS lbl
2220 , NEWBLOCK lbl
2221 -- Keep old value so we can return it:
2222 , MOV format (OpReg eax) (OpReg dst_r)
2223 , MOV format (OpReg eax) (OpReg tmp)
2224 ]
2225 `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
2226 [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
2227 , JXX NE lbl
2228 ]
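-- If the CMPXCHG fails it reloads %eax with the value currently
-- in memory, so each retry of the loop works on the freshly
-- observed old value.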
2229
2230 format = intFormat width
2231
2232 genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
2233 load_code <- intLoadCode (MOV (intFormat width)) addr
2234 let platform = targetPlatform dflags
2235 use_sse2 <- sse2Enabled
2236 return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
2237
2238 genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
2239 code <- assignMem_IntCode (intFormat width) addr val
2240 return $ code `snocOL` MFENCE
2241
2242 genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
2243 -- On x86 we don't have enough registers to use cmpxchg with a
2244 -- complicated addressing mode, so on that architecture we
2245 -- pre-compute the address first.
2246 Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
2247 newval <- getNewRegNat format
2248 newval_code <- getAnyReg new
2249 oldval <- getNewRegNat format
2250 oldval_code <- getAnyReg old
2251 use_sse2 <- sse2Enabled
2252 let platform = targetPlatform dflags
2253 dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
2254 code = toOL
2255 [ MOV format (OpReg oldval) (OpReg eax)
2256 , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
2257 , MOV format (OpReg eax) (OpReg dst_r)
2258 ]
2259 return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
2260 `appOL` code
2261 where
2262 format = intFormat width
2263
2264 genCCall _ is32Bit target dest_regs args bid = do
2265 dflags <- getDynFlags
2266 let platform = targetPlatform dflags
2267 sse2 = isSse2Enabled dflags
2268 case (target, dest_regs) of
2269 -- void return type prim op
2270 (PrimTarget op, []) ->
2271 outOfLineCmmOp bid op Nothing args
2272 -- we only cope with a single result for foreign calls
2273 (PrimTarget op, [r])
2274 | sse2 -> case op of
2275 MO_F32_Fabs -> case args of
2276 [x] -> sse2FabsCode W32 x
2277 _ -> panic "genCCall: Wrong number of arguments for fabs"
2278 MO_F64_Fabs -> case args of
2279 [x] -> sse2FabsCode W64 x
2280 _ -> panic "genCCall: Wrong number of arguments for fabs"
2281
2282 MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
2283 MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
2284 _other_op -> outOfLineCmmOp bid op (Just r) args
2285 | otherwise -> do
2286 l1 <- getNewLabelNat
2287 l2 <- getNewLabelNat
2288 if sse2
2289 then outOfLineCmmOp bid op (Just r) args
2290 else case op of
2291 MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
2292 MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args
2293
2294 MO_F32_Sin -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF32 args
2295 MO_F64_Sin -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF64 args
2296
2297 MO_F32_Cos -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF32 args
2298 MO_F64_Cos -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF64 args
2299
2300 MO_F32_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args
2301 MO_F64_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args
2302
2303 _other_op -> outOfLineCmmOp bid op (Just r) args
2304
2305 where
2306 actuallyInlineFloatOp = actuallyInlineFloatOp' False
2307 actuallyInlineSSE2Op = actuallyInlineFloatOp' True
2308
2309 actuallyInlineFloatOp' usesSSE instr format [x]
2310 = do res <- trivialUFCode format (instr format) x
2311 any <- anyReg res
2312 return (any (getRegisterReg platform usesSSE (CmmLocal r)))
2313
2314 actuallyInlineFloatOp' _ _ _ args
2315 = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
2316 ++ show (length args) ++ ")"
2317
2318 sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
2319 sse2FabsCode w x = do
2320 let fmt = floatFormat w
2321 x_code <- getAnyReg x
2322 let
2323 const | FF32 <- fmt = CmmInt 0x7fffffff W32
2324 | otherwise = CmmInt 0x7fffffffffffffff W64
2325 Amode amode amode_code <- memConstant (widthInBytes w) const
2326 tmp <- getNewRegNat fmt
2327 let
2328 code dst = x_code dst `appOL` amode_code `appOL` toOL [
2329 MOV fmt (OpAddr amode) (OpReg tmp),
2330 AND fmt (OpReg tmp) (OpReg dst)
2331 ]
2332
2333 return $ code (getRegisterReg platform True (CmmLocal r))
2334
2335 (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
2336 (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
2337 (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
2338 (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
2339 case args of
2340 [arg_x, arg_y] ->
2341 do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
2342 let format = intFormat width
2343 lCode <- anyReg =<< trivialCode width (ADD_CC format)
2344 (Just (ADD_CC format)) arg_x arg_y
2345 let reg_l = getRegisterReg platform True (CmmLocal res_l)
2346 reg_h = getRegisterReg platform True (CmmLocal res_h)
2347 code = hCode reg_h `appOL`
2348 lCode reg_l `snocOL`
2349 ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
2350 return code
2351 _ -> panic "genCCall: Wrong number of arguments/results for add2"
2352 (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
2353 addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
2354 (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
2355 addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
2356 (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
2357 addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
2358 (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
2359 addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
2360 (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
2361 case args of
2362 [arg_x, arg_y] ->
2363 do (y_reg, y_code) <- getRegOrMem arg_y
2364 x_code <- getAnyReg arg_x
2365 let format = intFormat width
2366 reg_h = getRegisterReg platform True (CmmLocal res_h)
2367 reg_l = getRegisterReg platform True (CmmLocal res_l)
2368 code = y_code `appOL`
2369 x_code rax `appOL`
2370 toOL [MUL2 format y_reg,
2371 MOV format (OpReg rdx) (OpReg reg_h),
2372 MOV format (OpReg rax) (OpReg reg_l)]
2373 return code
2374 _ -> panic "genCCall: Wrong number of arguments/results for mul2"
2375
2376 _ -> if is32Bit
2377 then genCCall32' dflags target dest_regs args
2378 else genCCall64' dflags target dest_regs args
2379
2380 where divOp1 platform signed width results [arg_x, arg_y]
2381 = divOp platform signed width results Nothing arg_x arg_y
2382 divOp1 _ _ _ _ _
2383 = panic "genCCall: Wrong number of arguments for divOp1"
2384 divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
2385 = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
2386 divOp2 _ _ _ _ _
2387 = panic "genCCall: Wrong number of arguments for divOp2"
2388
2389 -- See Note [DIV/IDIV for bytes]
2390 divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
2391 let widen | signed = MO_SS_Conv W8 W16
2392 | otherwise = MO_UU_Conv W8 W16
2393 arg_x_low_16 = CmmMachOp widen [arg_x_low]
2394 arg_y_16 = CmmMachOp widen [arg_y]
2395 m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
2396 in divOp
2397 platform signed W16 [res_q, res_r]
2398 m_arg_x_high_16 arg_x_low_16 arg_y_16
2399
2400 divOp platform signed width [res_q, res_r]
2401 m_arg_x_high arg_x_low arg_y
2402 = do let format = intFormat width
2403 reg_q = getRegisterReg platform True (CmmLocal res_q)
2404 reg_r = getRegisterReg platform True (CmmLocal res_r)
2405 widen | signed = CLTD format
2406 | otherwise = XOR format (OpReg rdx) (OpReg rdx)
2407 instr | signed = IDIV
2408 | otherwise = DIV
2409 (y_reg, y_code) <- getRegOrMem arg_y
2410 x_low_code <- getAnyReg arg_x_low
2411 x_high_code <- case m_arg_x_high of
2412 Just arg_x_high ->
2413 getAnyReg arg_x_high
2414 Nothing ->
2415 return $ const $ unitOL widen
2416 return $ y_code `appOL`
2417 x_low_code rax `appOL`
2418 x_high_code rdx `appOL`
2419 toOL [instr format y_reg,
2420 MOV format (OpReg rax) (OpReg reg_q),
2421 MOV format (OpReg rdx) (OpReg reg_r)]
2422 divOp _ _ _ _ _ _ _
2423 = panic "genCCall: Wrong number of results for divOp"
2424
2425 addSubIntC platform instr mrevinstr cond width
2426 res_r res_c [arg_x, arg_y]
2427 = do let format = intFormat width
2428 rCode <- anyReg =<< trivialCode width (instr format)
2429 (mrevinstr format) arg_x arg_y
2430 reg_tmp <- getNewRegNat II8
2431 let reg_c = getRegisterReg platform True (CmmLocal res_c)
2432 reg_r = getRegisterReg platform True (CmmLocal res_r)
2433 code = rCode reg_r `snocOL`
2434 SETCC cond (OpReg reg_tmp) `snocOL`
2435 MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
2436
2437 return code
2438 addSubIntC _ _ _ _ _ _ _ _
2439 = panic "genCCall: Wrong number of arguments/results for addSubIntC"
2440
2441 -- Note [DIV/IDIV for bytes]
2442 --
2443 -- IDIV reminder:
2444 -- Size Dividend Divisor Quotient Remainder
2445 -- byte %ax r/m8 %al %ah
2446 -- word %dx:%ax r/m16 %ax %dx
2447 -- dword %edx:%eax r/m32 %eax %edx
2448 -- qword %rdx:%rax r/m64 %rax %rdx
2449 --
2450 -- We do a special case for the byte division because the current
2451 -- codegen doesn't deal well with accessing the %ah register (also,
2452 -- accessing %ah in 64-bit mode is complicated because it cannot be an
2453 -- operand of many instructions). So we just widen operands to 16 bits
2454 -- and get the results from %al, %dl. This is not optimal, but a few
2455 -- register moves are probably not a huge deal when doing division.
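--
-- For example, a W8 quotRem is compiled as though the source had
-- widened both operands with MO_SS_Conv W8 W16 (or MO_UU_Conv in
-- the unsigned case); the quotient and remainder are then taken
-- from the low bytes of %ax and %dx.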
2456
2457 genCCall32' :: DynFlags
2458 -> ForeignTarget -- function to call
2459 -> [CmmFormal] -- where to put the result
2460 -> [CmmActual] -- arguments (of mixed type)
2461 -> NatM InstrBlock
2462 genCCall32' dflags target dest_regs args = do
2463 let
2464 prom_args = map (maybePromoteCArg dflags W32) args
2465
2466 -- Align stack to 16n for calls, assuming a starting stack
2467 -- alignment of 16n - word_size on procedure entry, which we
2468 -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
2469 sizes = map (arg_size_bytes . cmmExprType dflags) (reverse args)
2470 raw_arg_size = sum sizes + wORD_SIZE dflags
2471 arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
2472 tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags
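-- For example, with two 32-bit arguments: sizes = [4,4], so
-- raw_arg_size = 8 + 4 = 12, arg_pad_size = 4 and
-- tot_arg_size = 12; pushing the 4 padding bytes plus 8 argument
-- bytes brings %esp from 16n-4 back to a 16-byte boundary at the
-- CALL.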
2473 delta0 <- getDeltaNat
2474 setDeltaNat (delta0 - arg_pad_size)
2475
2476 use_sse2 <- sse2Enabled
2477 push_codes <- mapM (push_arg use_sse2) (reverse prom_args)
2478 delta <- getDeltaNat
2479 MASSERT(delta == delta0 - tot_arg_size)
2480
2481 -- deal with static vs dynamic call targets
2482 (callinsns,cconv) <-
2483 case target of
2484 ForeignTarget (CmmLit (CmmLabel lbl)) conv
2485 -> -- ToDo: stdcall arg sizes
2486 return (unitOL (CALL (Left fn_imm) []), conv)
2487 where fn_imm = ImmCLbl lbl
2488 ForeignTarget expr conv
2489 -> do { (dyn_r, dyn_c) <- getSomeReg expr
2490 ; ASSERT( isWord32 (cmmExprType dflags expr) )
2491 return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
2492 PrimTarget _
2493 -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
2494 ++ "probably because too many return values."
2495
2496 let push_code
2497 | arg_pad_size /= 0
2498 = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
2499 DELTA (delta0 - arg_pad_size)]
2500 `appOL` concatOL push_codes
2501 | otherwise
2502 = concatOL push_codes
2503
2504 -- Deallocate parameters after call for ccall;
2505 -- but not for stdcall (callee does it)
2506 --
2507 -- We have to pop any stack padding we added
2508 -- even if we are doing stdcall, though (#5052)
2509 pop_size
2510 | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
2511 | otherwise = tot_arg_size
2512
2513 call = callinsns `appOL`
2514 toOL (
2515 (if pop_size==0 then [] else
2516 [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
2517 ++
2518 [DELTA delta0]
2519 )
2520 setDeltaNat delta0
2521
2522 dflags <- getDynFlags
2523 let platform = targetPlatform dflags
2524
2525 let
2526 -- assign the results, if necessary
2527 assign_code [] = nilOL
2528 assign_code [dest]
2529 | isFloatType ty =
2530 if use_sse2
2531 then let tmp_amode = AddrBaseIndex (EABaseReg esp)
2532 EAIndexNone
2533 (ImmInt 0)
2534 fmt = floatFormat w
2535 in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
2536 DELTA (delta0 - b),
2537 GST fmt fake0 tmp_amode,
2538 MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
2539 ADD II32 (OpImm (ImmInt b)) (OpReg esp),
2540 DELTA delta0]
2541 else unitOL (GMOV fake0 r_dest)
2542 | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
2543 MOV II32 (OpReg edx) (OpReg r_dest_hi)]
2544 | otherwise = unitOL (MOV (intFormat w)
2545 (OpReg eax)
2546 (OpReg r_dest))
2547 where
2548 ty = localRegType dest
2549 w = typeWidth ty
2550 b = widthInBytes w
2551 r_dest_hi = getHiVRegFromLo r_dest
2552 r_dest = getRegisterReg platform use_sse2 (CmmLocal dest)
2553 assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
2554
2555 return (push_code `appOL`
2556 call `appOL`
2557 assign_code dest_regs)
2558
2559 where
2560 -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
2561 arg_size_bytes :: CmmType -> Int
2562 arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags))
2563
2564 roundTo a x | x `mod` a == 0 = x
2565 | otherwise = x + a - (x `mod` a)
2566
2567 push_arg :: Bool -> CmmActual {-current argument-}
2568 -> NatM InstrBlock -- code
2569
2570 push_arg use_sse2 arg -- we don't need the hints on x86
2571 | isWord64 arg_ty = do
2572 ChildCode64 code r_lo <- iselExpr64 arg
2573 delta <- getDeltaNat
2574 setDeltaNat (delta - 8)
2575 let r_hi = getHiVRegFromLo r_lo
2576 return ( code `appOL`
2577 toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
2578 PUSH II32 (OpReg r_lo), DELTA (delta - 8),
2579 DELTA (delta-8)]
2580 )
2581
2582 | isFloatType arg_ty = do
2583 (reg, code) <- getSomeReg arg
2584 delta <- getDeltaNat
2585 setDeltaNat (delta-size)
2586 return (code `appOL`
2587 toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
2588 DELTA (delta-size),
2589 let addr = AddrBaseIndex (EABaseReg esp)
2590 EAIndexNone
2591 (ImmInt 0)
2592 format = floatFormat (typeWidth arg_ty)
2593 in
2594 if use_sse2
2595 then MOV format (OpReg reg) (OpAddr addr)
2596 else GST format reg addr
2597 ]
2598 )
2599
2600 | otherwise = do
2601 -- Arguments can be smaller than 32-bit, but we still use @PUSH
2602 -- II32@ - the usual calling conventions expect integers to be
2603 -- 4-byte aligned.
2604 ASSERT((typeWidth arg_ty) <= W32) return ()
2605 (operand, code) <- getOperand arg
2606 delta <- getDeltaNat
2607 setDeltaNat (delta-size)
2608 return (code `snocOL`
2609 PUSH II32 operand `snocOL`
2610 DELTA (delta-size))
2611
2612 where
2613 arg_ty = cmmExprType dflags arg
2614 size = arg_size_bytes arg_ty -- Byte size
2615
2616 genCCall64' :: DynFlags
2617 -> ForeignTarget -- function to call
2618 -> [CmmFormal] -- where to put the result
2619 -> [CmmActual] -- arguments (of mixed type)
2620 -> NatM InstrBlock
2621 genCCall64' dflags target dest_regs args = do
2622 -- load up the register arguments
2623 let prom_args = map (maybePromoteCArg dflags W32) args
2624
2625 (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
2626 <-
2627 if platformOS platform == OSMinGW32
2628 then load_args_win prom_args [] [] (allArgRegs platform) nilOL
2629 else do
2630 (stack_args, aregs, fregs, load_args_code, assign_args_code)
2631 <- load_args prom_args (allIntArgRegs platform)
2632 (allFPArgRegs platform)
2633 nilOL nilOL
2634 let used_regs rs as = reverse (drop (length rs) (reverse as))
2635 fregs_used = used_regs fregs (allFPArgRegs platform)
2636 aregs_used = used_regs aregs (allIntArgRegs platform)
2637 return (stack_args, aregs_used, fregs_used, load_args_code
2638 , assign_args_code)
2639
2640 let
2641 arg_regs_used = int_regs_used ++ fp_regs_used
2642 arg_regs = [eax] ++ arg_regs_used
2643 -- the registers to annotate the call instruction with
2644 sse_regs = length fp_regs_used
2645 arg_stack_slots = if platformOS platform == OSMinGW32
2646 then length stack_args + length (allArgRegs platform)
2647 else length stack_args
2648 tot_arg_size = arg_size * arg_stack_slots
2649
2650
2651 -- Align stack to 16n for calls, assuming a starting stack
2652 -- alignment of 16n - word_size on procedure entry. Which we
2653 -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
2654 (real_size, adjust_rsp) <-
2655 if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0
2656 then return (tot_arg_size, nilOL)
2657 else do -- we need to adjust...
2658 delta <- getDeltaNat
2659 setDeltaNat (delta - wORD_SIZE dflags)
2660 return (tot_arg_size + wORD_SIZE dflags, toOL [
2661 SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp),
2662 DELTA (delta - wORD_SIZE dflags) ])
2663
2664 -- push the stack args, right to left
2665 push_code <- push_args (reverse stack_args) nilOL
2666 -- On Win64, we also have to leave stack space for the arguments
2667 -- that we are passing in registers
2668 lss_code <- if platformOS platform == OSMinGW32
2669 then leaveStackSpace (length (allArgRegs platform))
2670 else return nilOL
2671 delta <- getDeltaNat
2672
2673 -- deal with static vs dynamic call targets
2674 (callinsns,_cconv) <-
2675 case target of
2676 ForeignTarget (CmmLit (CmmLabel lbl)) conv
2677 -> -- ToDo: stdcall arg sizes
2678 return (unitOL (CALL (Left fn_imm) arg_regs), conv)
2679 where fn_imm = ImmCLbl lbl
2680 ForeignTarget expr conv
2681 -> do (dyn_r, dyn_c) <- getSomeReg expr
2682 return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
2683 PrimTarget _
2684 -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
2685 ++ "probably because too many return values."
2686
2687 let
2688 -- The x86_64 ABI requires us to set %al to the number of SSE2
2689 -- registers that contain arguments, if the called routine
2690 -- is a varargs function. We don't know whether it's a
2691 -- varargs function or not, so we have to assume it is.
2692 --
2693 -- It's not safe to omit this assignment, even if the number
2694 -- of SSE2 regs in use is zero. If %al is larger than 8
2695 -- on entry to a varargs function, seg faults ensue.
2696 assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
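-- For example, a call passing a single double in %xmm0 is
-- preceded by movl $1, %eax, so that a varargs callee such as
-- printf knows one SSE register carries an argument.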
2697
2698 let call = callinsns `appOL`
2699 toOL (
2700 -- Deallocate parameters after call for ccall;
2701 -- stdcall has callee do it, but is not supported on
2702 -- x86_64 target (see #3336)
2703 (if real_size==0 then [] else
2704 [ADD (intFormat (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)])
2705 ++
2706 [DELTA (delta + real_size)]
2707 )
2708 setDeltaNat (delta + real_size)
2709
2710 let
2711 -- assign the results, if necessary
2712 assign_code [] = nilOL
2713 assign_code [dest] =
2714 case typeWidth rep of
2715 W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
2716 (OpReg xmm0)
2717 (OpReg r_dest))
2718 W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
2719 (OpReg xmm0)
2720 (OpReg r_dest))
2721 _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
2722 where
2723 rep = localRegType dest
2724 r_dest = getRegisterReg platform True (CmmLocal dest)
2725 assign_code _many = panic "genCCall.assign_code many"
2726
2727 return (adjust_rsp `appOL`
2728 push_code `appOL`
2729 load_args_code `appOL`
2730 assign_args_code `appOL`
2731 lss_code `appOL`
2732 assign_eax sse_regs `appOL`
2733 call `appOL`
2734 assign_code dest_regs)
2735
2736 where platform = targetPlatform dflags
2737 arg_size = 8 -- always, at the moment
2738
2739
2740 load_args :: [CmmExpr]
2741 -> [Reg] -- int regs avail for args
2742 -> [Reg] -- FP regs avail for args
2743 -> InstrBlock -- code computing args
2744 -> InstrBlock -- code assigning args to ABI regs
2745 -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
2746 -- no more regs to use
2747 load_args args [] [] code acode =
2748 return (args, [], [], code, acode)
2749
2750 -- no more args to push
2751 load_args [] aregs fregs code acode =
2752 return ([], aregs, fregs, code, acode)
2753
2754 load_args (arg : rest) aregs fregs code acode
2755 | isFloatType arg_rep = case fregs of
2756 [] -> push_this_arg
2757 (r:rs) -> do
2758 (code',acode') <- reg_this_arg r
2759 load_args rest aregs rs code' acode'
2760 | otherwise = case aregs of
2761 [] -> push_this_arg
2762 (r:rs) -> do
2763 (code',acode') <- reg_this_arg r
2764 load_args rest rs fregs code' acode'
2765 where
2766
2767 -- put arg into the list of stack pushed args
2768 push_this_arg = do
2769 (args',ars,frs,code',acode')
2770 <- load_args rest aregs fregs code acode
2771 return (arg:args', ars, frs, code', acode')
2772
2773 -- pass the arg into the given register
2774 reg_this_arg r
2775 -- "operand" args can be directly assigned into r
2776 | isOperand False arg = do
2777 arg_code <- getAnyReg arg
2778 return (code, (acode `appOL` arg_code r))
2779 -- The last non-operand arg can be directly assigned after its
2780 -- computation without going into a temporary register
2781 | all (isOperand False) rest = do
2782 arg_code <- getAnyReg arg
2783 return (code `appOL` arg_code r,acode)
2784
2785 -- other args need to be computed beforehand to avoid clobbering
2786 -- previously assigned registers used to pass parameters (see
2787 -- #11792, #12614). They are assigned into temporary registers
2788 -- and get assigned to proper call ABI registers after they all
2789 -- have been computed.
2790 | otherwise = do
2791 arg_code <- getAnyReg arg
2792 tmp <- getNewRegNat arg_fmt
2793 let
2794 code' = code `appOL` arg_code tmp
2795 acode' = acode `snocOL` reg2reg arg_fmt tmp r
2796 return (code',acode')
2797
2798 arg_rep = cmmExprType dflags arg
2799 arg_fmt = cmmTypeFormat arg_rep
2800
2801 load_args_win :: [CmmExpr]
2802 -> [Reg] -- used int regs
2803 -> [Reg] -- used FP regs
2804 -> [(Reg, Reg)] -- (int, FP) regs avail for args
2805 -> InstrBlock
2806 -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
2807 load_args_win args usedInt usedFP [] code
2808 = return (args, usedInt, usedFP, code, nilOL)
2809 -- no more regs to use
2810 load_args_win [] usedInt usedFP _ code
2811 = return ([], usedInt, usedFP, code, nilOL)
2812 -- no more args to push
2813 load_args_win (arg : rest) usedInt usedFP
2814 ((ireg, freg) : regs) code
2815 | isFloatType arg_rep = do
2816 arg_code <- getAnyReg arg
2817 load_args_win rest (ireg : usedInt) (freg : usedFP) regs
2818 (code `appOL`
2819 arg_code freg `snocOL`
2820 -- If we are calling a varargs function
2821 -- then we need to define ireg as well
2822 -- as freg
2823 MOV II64 (OpReg freg) (OpReg ireg))
2824 | otherwise = do
2825 arg_code <- getAnyReg arg
2826 load_args_win rest (ireg : usedInt) usedFP regs
2827 (code `appOL` arg_code ireg)
2828 where
2829 arg_rep = cmmExprType dflags arg
2830
2831 push_args [] code = return code
2832 push_args (arg:rest) code
2833 | isFloatType arg_rep = do
2834 (arg_reg, arg_code) <- getSomeReg arg
2835 delta <- getDeltaNat
2836 setDeltaNat (delta-arg_size)
2837 let code' = code `appOL` arg_code `appOL` toOL [
2838 SUB (intFormat (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp),
2839 DELTA (delta-arg_size),
2840 MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel dflags 0))]
2841 push_args rest code'
2842
2843 | otherwise = do
2844 -- Arguments can be smaller than 64-bit, but we still use @PUSH
2845 -- II64@ - the usual calling conventions expect integers to be
2846 -- 8-byte aligned.
2847 ASSERT(width <= W64) return ()
2848 (arg_op, arg_code) <- getOperand arg
2849 delta <- getDeltaNat
2850 setDeltaNat (delta-arg_size)
2851 let code' = code `appOL` arg_code `appOL` toOL [
2852 PUSH II64 arg_op,
2853 DELTA (delta-arg_size)]
2854 push_args rest code'
2855 where
2856 arg_rep = cmmExprType dflags arg
2857 width = typeWidth arg_rep
2858
2859 leaveStackSpace n = do
2860 delta <- getDeltaNat
2861 setDeltaNat (delta - n * arg_size)
2862 return $ toOL [
2863 SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp),
2864 DELTA (delta - n * arg_size)]
2865
2866 maybePromoteCArg :: DynFlags -> Width -> CmmExpr -> CmmExpr
2867 maybePromoteCArg dflags wto arg
2868 | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
2869 | otherwise = arg
2870 where
2871 wfrom = cmmExprWidth dflags arg
2872
2873 outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
2874 -> NatM InstrBlock
2875 outOfLineCmmOp bid mop res args
2876 = do
2877 dflags <- getDynFlags
2878 targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
2879 let target = ForeignTarget targetExpr
2880 (ForeignConvention CCallConv [] [] CmmMayReturn)
2881
2882 stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
2883 where
2884 -- Assume we can call these functions directly, and that they're not in a dynamic library.
2885 -- TODO: Why is this ok? Under linux this code will be in libm.so
2886 -- Is it because they're really implemented as a primitive instruction by the assembler?? -- BL 2009/12/31
2887 lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
2888
2889 fn = case mop of
2890 MO_F32_Sqrt -> fsLit "sqrtf"
2891 MO_F32_Fabs -> fsLit "fabsf"
2892 MO_F32_Sin -> fsLit "sinf"
2893 MO_F32_Cos -> fsLit "cosf"
2894 MO_F32_Tan -> fsLit "tanf"
2895 MO_F32_Exp -> fsLit "expf"
2896 MO_F32_Log -> fsLit "logf"
2897
2898 MO_F32_Asin -> fsLit "asinf"
2899 MO_F32_Acos -> fsLit "acosf"
2900 MO_F32_Atan -> fsLit "atanf"
2901
2902 MO_F32_Sinh -> fsLit "sinhf"
2903 MO_F32_Cosh -> fsLit "coshf"
2904 MO_F32_Tanh -> fsLit "tanhf"
2905 MO_F32_Pwr -> fsLit "powf"
2906
2907 MO_F32_Asinh -> fsLit "asinhf"
2908 MO_F32_Acosh -> fsLit "acoshf"
2909 MO_F32_Atanh -> fsLit "atanhf"
2910
2911 MO_F64_Sqrt -> fsLit "sqrt"
2912 MO_F64_Fabs -> fsLit "fabs"
2913 MO_F64_Sin -> fsLit "sin"
2914 MO_F64_Cos -> fsLit "cos"
2915 MO_F64_Tan -> fsLit "tan"
2916 MO_F64_Exp -> fsLit "exp"
2917 MO_F64_Log -> fsLit "log"
2918
2919 MO_F64_Asin -> fsLit "asin"
2920 MO_F64_Acos -> fsLit "acos"
2921 MO_F64_Atan -> fsLit "atan"
2922
2923 MO_F64_Sinh -> fsLit "sinh"
2924 MO_F64_Cosh -> fsLit "cosh"
2925 MO_F64_Tanh -> fsLit "tanh"
2926 MO_F64_Pwr -> fsLit "pow"
2927
2928 MO_F64_Asinh -> fsLit "asinh"
2929 MO_F64_Acosh -> fsLit "acosh"
2930 MO_F64_Atanh -> fsLit "atanh"
2931
2932 MO_Memcpy _ -> fsLit "memcpy"
2933 MO_Memset _ -> fsLit "memset"
2934 MO_Memmove _ -> fsLit "memmove"
2935 MO_Memcmp _ -> fsLit "memcmp"
2936
2937 MO_PopCnt _ -> fsLit "popcnt"
2938 MO_BSwap _ -> fsLit "bswap"
2939 MO_Clz w -> fsLit $ clzLabel w
2940 MO_Ctz _ -> unsupported
2941
2942 MO_Pdep w -> fsLit $ pdepLabel w
2943 MO_Pext w -> fsLit $ pextLabel w
2944
2945 MO_AtomicRMW _ _ -> fsLit "atomicrmw"
2946 MO_AtomicRead _ -> fsLit "atomicread"
2947 MO_AtomicWrite _ -> fsLit "atomicwrite"
2948 MO_Cmpxchg _ -> fsLit "cmpxchg"
2949
2950 MO_UF_Conv _ -> unsupported
2951
2952 MO_S_QuotRem {} -> unsupported
2953 MO_U_QuotRem {} -> unsupported
2954 MO_U_QuotRem2 {} -> unsupported
2955 MO_Add2 {} -> unsupported
2956 MO_AddIntC {} -> unsupported
2957 MO_SubIntC {} -> unsupported
2958 MO_AddWordC {} -> unsupported
2959 MO_SubWordC {} -> unsupported
2960 MO_U_Mul2 {} -> unsupported
2961 MO_WriteBarrier -> unsupported
2962 MO_Touch -> unsupported
2963 (MO_Prefetch_Data _ ) -> unsupported
2964 unsupported = panic ("outOfLineCmmOp: " ++ show mop
2965 ++ " not supported here")
2966
2967 -- -----------------------------------------------------------------------------
2968 -- Generating a table-branch
2969
2970 genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
2971
2972 genSwitch dflags expr targets
2973 | positionIndependent dflags
2974 = do
2975 (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
2976 -- getNonClobberedReg because it needs to survive across t_code
2977 lbl <- getNewLabelNat
2978 dflags <- getDynFlags
2979 let is32bit = target32Bit (targetPlatform dflags)
2980 os = platformOS (targetPlatform dflags)
2981 -- Might want to use .rodata.<function we're in> instead, but as
2982 -- long as it's something unique it'll work out since the
2983 -- references to the jump table are in the appropriate section.
2984 rosection = case os of
2985 -- on Mac OS X/x86_64, put the jump table in the text section to
2986 -- work around a limitation of the linker.
2987 -- ld64 is unable to handle the relocations for
2988 -- .quad L1 - L0
2989 -- if L0 is not preceded by a non-anonymous label in its section.
2990 OSDarwin | not is32bit -> Section Text lbl
2991 _ -> Section ReadOnlyData lbl
2992 dynRef <- cmmMakeDynamicReference dflags DataReference lbl
2993 (tableReg,t_code) <- getSomeReg $ dynRef
2994 let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
2995 (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0))
2996
2997 offsetReg <- getNewRegNat (intFormat (wordWidth dflags))
2998 return $ if is32bit || os == OSDarwin
2999 then e_code `appOL` t_code `appOL` toOL [
3000 ADD (intFormat (wordWidth dflags)) op (OpReg tableReg),
3001 JMP_TBL (OpReg tableReg) ids rosection lbl
3002 ]
3003 else -- HACK: On x86_64 binutils<2.17 is only able to generate
3004 -- PC32 relocations, hence we only get 32-bit offsets in
3005 -- the jump table. As these offsets are always negative
3006 -- we need to properly sign extend them to 64-bit. This
3007 -- hack should be removed in conjunction with the hack in
3008 -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
3009 e_code `appOL` t_code `appOL` toOL [
3010 MOVSxL II32 op (OpReg offsetReg),
3011 ADD (intFormat (wordWidth dflags))
3012 (OpReg offsetReg)
3013 (OpReg tableReg),
3014 JMP_TBL (OpReg tableReg) ids rosection lbl
3015 ]
3016 | otherwise
3017 = do
3018 (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
3019 lbl <- getNewLabelNat
3020 let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl))
3021 code = e_code `appOL` toOL [
3022 JMP_TBL op ids (Section ReadOnlyData lbl) lbl
3023 ]
3024 return code
3025 where
3026 (offset, blockIds) = switchTargetsToTable targets
3027 ids = map (fmap DestBlockId) blockIds
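-- In the position-dependent case the switch becomes a single
-- indexed jump through a table of absolute block addresses,
-- roughly jmp *lbl(,%reg,8) on x86-64; the PIC case instead loads
-- the table address into a register and adds a (possibly
-- sign-extended) 32-bit offset read from the table.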
3028
3029 generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, CmmStatics) Instr)
3030 generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
3031 = let getBlockId (DestBlockId id) = id
3032 getBlockId _ = panic "Non-Label target in Jump Table"
3033 blockIds = map (fmap getBlockId) ids
3034 in Just (createJumpTable dflags blockIds section lbl)
3035 generateJumpTableForInstr _ _ = Nothing
3036
3037 createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
3038 -> GenCmmDecl (Alignment, CmmStatics) h g
3039 createJumpTable dflags ids section lbl
3040 = let jumpTable
3041 | positionIndependent dflags =
3042 let ww = wordWidth dflags
3043 jumpTableEntryRel Nothing
3044 = CmmStaticLit (CmmInt 0 ww)
3045 jumpTableEntryRel (Just blockid)
3046 = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
3047 where blockLabel = blockLbl blockid
3048 in map jumpTableEntryRel ids
3049 | otherwise = map (jumpTableEntry dflags) ids
3050 in CmmData section (1, Statics lbl jumpTable)
3051
3052 extractUnwindPoints :: [Instr] -> [UnwindPoint]
3053 extractUnwindPoints instrs =
3054 [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]
3055
3056 -- -----------------------------------------------------------------------------
3057 -- 'condIntReg' and 'condFltReg': condition codes into registers
3058
3059 -- Turn those condition codes into integers now (when they appear on
3060 -- the right hand side of an assignment).
3061 --
3062 -- (If applicable) Do not fill the delay slots here; you will confuse the
3063 -- register allocator.
3064
3065 condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
3066
3067 condIntReg cond x y = do
3068 CondCode _ cond cond_code <- condIntCode cond x y
3069 tmp <- getNewRegNat II8
3070 let
3071 code dst = cond_code `appOL` toOL [
3072 SETCC cond (OpReg tmp),
3073 MOVZxL II8 (OpReg tmp) (OpReg dst)
3074 ]
3075 return (Any II32 code)
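-- Schematically, materialising a comparison such as (x < y) as a
-- value produces:
--   cmp ... ; setl tmp8 ; movzbl tmp8, dst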
3076
3077
3078 -----------------------------------------------------------
3079 --- Note [SSE Parity Checks] ---
3080 -----------------------------------------------------------
3081
3082 -- We have to worry about unordered operands (eg. comparisons
3083 -- against NaN). If the operands are unordered, the comparison
3084 -- sets the parity flag, carry flag and zero flag.
3085 -- All comparisons are supposed to return false for unordered
3086 -- operands except for !=, which returns true.
3087 --
3088 -- Optimisation: we don't have to test the parity flag if we
3089 -- know the test has already excluded the unordered case: eg >
3090 -- and >= test for a zero carry flag, which can only occur for
3091 -- ordered operands.
3092 --
3093 -- By reversing comparisons we can avoid testing the parity
3094 -- for < and <= as well. If either argument is a NaN we
3095 -- return false either way. If both arguments are valid then
3096 -- x <= y <-> y >= x holds, so it's safe to swap these.
3097 --
3098 -- We invert the condition inside getRegister' and getCondCode,
3099 -- which should cover all invertible cases.
3100 -- All other functions translating FP comparisons to assembly
3101 -- use these two to generate the comparison code.
3102 --
3103 -- As an example consider a simple check:
3104 --
3105 -- func :: Float -> Float -> Int
3106 -- func x y = if x < y then 1 else 0
3107 --
3108 -- In Cmm this gives the floating-point comparison:
3109 --
3110 -- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
3111 --
3112 -- We used to compile this to an assembly code block like this:
3113 -- _c2gh:
3114 -- ucomiss %xmm2,%xmm1
3115 -- jp _c2gf
3116 -- jb _c2gg
3117 -- jmp _c2gf
3118 --
3119 -- where we had to introduce an explicit check for unordered
3120 -- results using the parity flag (the jp instruction).
3121 --
3122 -- We can avoid this by exchanging the arguments and inverting the direction
3123 -- of the comparison. This results in the sequence of:
3124 --
3125 -- ucomiss %xmm1,%xmm2
3126 -- ja _c2g2
3127 -- jmp _c2g1
3128 --
3129 -- Removing the jump reduces the pressure on the branch prediction system
3130 -- and plays better with the uOP cache.
3131
3132 condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
3133 condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
3134 where
3135 condFltReg_x87 = do
3136 CondCode _ cond cond_code <- condFltCode cond x y
3137 tmp <- getNewRegNat II8
3138 let
3139 code dst = cond_code `appOL` toOL [
3140 SETCC cond (OpReg tmp),
3141 MOVZxL II8 (OpReg tmp) (OpReg dst)
3142 ]
3143 return (Any II32 code)
3144
3145 condFltReg_sse2 = do
3146 CondCode _ cond cond_code <- condFltCode cond x y
3147 tmp1 <- getNewRegNat (archWordFormat is32Bit)
3148 tmp2 <- getNewRegNat (archWordFormat is32Bit)
3149 let -- See Note [SSE Parity Checks]
3150 code dst =
3151 cond_code `appOL`
3152 (case cond of
3153 NE -> or_unordered dst
3154 GU -> plain_test dst
3155 GEU -> plain_test dst
3156 -- Use ASSERT so we don't break releases if these creep in.
3157 LTT -> ASSERT2(False, ppr "Should have been turned into >")
3158 and_ordered dst
3159 LE -> ASSERT2(False, ppr "Should have been turned into >=")
3160 and_ordered dst
3161 _ -> and_ordered dst)
3162
3163 plain_test dst = toOL [
3164 SETCC cond (OpReg tmp1),
3165 MOVZxL II8 (OpReg tmp1) (OpReg dst)
3166 ]
3167 or_unordered dst = toOL [
3168 SETCC cond (OpReg tmp1),
3169 SETCC PARITY (OpReg tmp2),
3170 OR II8 (OpReg tmp1) (OpReg tmp2),
3171 MOVZxL II8 (OpReg tmp2) (OpReg dst)
3172 ]
3173 and_ordered dst = toOL [
3174 SETCC cond (OpReg tmp1),
3175 SETCC NOTPARITY (OpReg tmp2),
3176 AND II8 (OpReg tmp1) (OpReg tmp2),
3177 MOVZxL II8 (OpReg tmp2) (OpReg dst)
3178 ]
3179 return (Any II32 code)
3180
3181
3182 -- -----------------------------------------------------------------------------
3183 -- 'trivial*Code': deal with trivial instructions
3184
3185 -- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
3186 -- unary: 'trivialUCode', unary fl-pt: 'trivialUFCode') instructions.
3187 -- Only look for constants on the right hand side, because that's
3188 -- where the generic optimizer will have put them.
3189
3190 -- Similarly, for unary instructions, we don't have to worry about
3191 -- matching an StInt as the argument, because genericOpt will already
3192 -- have handled the constant-folding.
3193
3194
3195 {-
3196 The Rules of the Game are:
3197
3198 * You cannot assume anything about the destination register dst;
3199 it may be anything, including a fixed reg.
3200
3201 * You may compute an operand into a fixed reg, but you may not
3202 subsequently change the contents of that fixed reg. If you
3203 want to do so, first copy the value either to a temporary
3204 or into dst. You are free to modify dst even if it happens
3205 to be a fixed reg -- that's not your problem.
3206
3207 * You cannot assume that a fixed reg will stay live over an
3208 arbitrary computation. The same applies to the dst reg.
3209
3210 * Temporary regs obtained from getNewRegNat are distinct from
3211 each other and from all other regs, and stay live over
3212 arbitrary computations.
3213
3214 --------------------
3215
3216 SDM's version of The Rules:
3217
3218 * If getRegister returns Any, that means it can generate correct
3219 code which places the result in any register, period. Even if that
3220 register happens to be read during the computation.
3221
3222 Corollary #1: this means that if you are generating code for an
3223 operation with two arbitrary operands, you cannot assign the result
3224 of the first operand into the destination register before computing
3225 the second operand. The second operand might require the old value
3226 of the destination register.
3227
3228 Corollary #2: A function might be able to generate more efficient
3229 code if it knows the destination register is a new temporary (and
3230 therefore not read by any of the sub-computations).
3231
3232 * If getRegister returns Any, then the code it generates may modify only:
3233 (a) fresh temporaries
3234 (b) the destination register
3235 (c) known registers (eg. %ecx is used by shifts)
3236 In particular, it may *not* modify global registers, unless the global
3237 register happens to be the destination register.
3238 -}
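-- A contrived illustration of Corollary #1 above: for dst := x + y, the
-- naive sequence (pseudo-assembly, registers illustrative)
--
--   <compute x into dst>
--   <compute y into some reg r>   -- may read dst!
--   add r, dst
--
-- is wrong whenever the code for y reads dst; x must instead be kept in
-- a fresh temporary until y has been computed. genTrivialCode below
-- deals with exactly this kind of clash.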
3239
3240 trivialCode :: Width -> (Operand -> Operand -> Instr)
3241 -> Maybe (Operand -> Operand -> Instr)
3242 -> CmmExpr -> CmmExpr -> NatM Register
3243 trivialCode width instr m a b
3244 = do is32Bit <- is32BitPlatform
3245 trivialCode' is32Bit width instr m a b
3246
3247 trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
3248 -> Maybe (Operand -> Operand -> Instr)
3249 -> CmmExpr -> CmmExpr -> NatM Register
3250 trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
3251 | is32BitLit is32Bit lit_a = do
3252 b_code <- getAnyReg b
3253 let
3254 code dst
3255 = b_code dst `snocOL`
3256 revinstr (OpImm (litToImm lit_a)) (OpReg dst)
3257 return (Any (intFormat width) code)
3258
3259 trivialCode' _ width instr _ a b
3260 = genTrivialCode (intFormat width) instr a b
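-- As a sketch of the reversed-literal case above: for a commutative
-- operation such as addition, an expression like (CmmLit 1) + x can be
-- compiled as
--
--   <compute x into dst>
--   add $1, dst
--
-- instead of first materialising the literal in a register of its own.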
3261
3262 -- This is re-used for floating pt instructions too.
3263 genTrivialCode :: Format -> (Operand -> Operand -> Instr)
3264 -> CmmExpr -> CmmExpr -> NatM Register
3265 genTrivialCode rep instr a b = do
3266 (b_op, b_code) <- getNonClobberedOperand b
3267 a_code <- getAnyReg a
3268 tmp <- getNewRegNat rep
3269 let
3270 -- We want the value of b to stay alive across the computation of a.
3271 -- But, we want to calculate a straight into the destination register,
3272 -- because the instruction only has two operands (dst := dst `op` src).
3273 -- The troublesome case is when the result of b is in the same register
3274 -- as the destination reg. In this case, we have to save b in a
3275 -- new temporary across the computation of a.
3276 code dst
3277 | dst `regClashesWithOp` b_op =
3278 b_code `appOL`
3279 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
3280 a_code dst `snocOL`
3281 instr (OpReg tmp) (OpReg dst)
3282 | otherwise =
3283 b_code `appOL`
3284 a_code dst `snocOL`
3285 instr b_op (OpReg dst)
3286 return (Any rep code)
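-- A sketch of the clashing case above, where b was computed into the
-- eventual destination register:
--
--   <b_code>            -- result of b ends up in dst
--   mov dst, tmp        -- save b into the fresh temporary
--   <a_code into dst>   -- free to overwrite dst
--   op  tmp, dst        -- dst := dst `op` b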
3287
3288 regClashesWithOp :: Reg -> Operand -> Bool
3289 reg `regClashesWithOp` OpReg reg2 = reg == reg2
3290 reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
3291 _ `regClashesWithOp` _ = False
3292
3293 -----------
3294
3295 trivialUCode :: Format -> (Operand -> Instr)
3296 -> CmmExpr -> NatM Register
3297 trivialUCode rep instr x = do
3298 x_code <- getAnyReg x
3299 let
3300 code dst =
3301 x_code dst `snocOL`
3302 instr (OpReg dst)
3303 return (Any rep code)
3304
3305 -----------
3306
3307 trivialFCode_x87 :: (Format -> Reg -> Reg -> Reg -> Instr)
3308 -> CmmExpr -> CmmExpr -> NatM Register
3309 trivialFCode_x87 instr x y = do
3310 (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
3311 (y_reg, y_code) <- getSomeReg y
3312 let
3313 format = FF80 -- always, on x87
3314 code dst =
3315 x_code `appOL`
3316 y_code `snocOL`
3317 instr format x_reg y_reg dst
3318 return (Any format code)
3319
3320 trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
3321 -> CmmExpr -> CmmExpr -> NatM Register
3322 trivialFCode_sse2 pk instr x y
3323 = genTrivialCode format (instr format) x y
3324 where format = floatFormat pk
3325
3326
3327 trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
3328 trivialUFCode format instr x = do
3329 (x_reg, x_code) <- getSomeReg x
3330 let
3331 code dst =
3332 x_code `snocOL`
3333 instr x_reg dst
3334 return (Any format code)
3335
3336
3337 --------------------------------------------------------------------------------
3338 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
3339 coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
3340 where
3341 coerce_x87 = do
3342 (x_reg, x_code) <- getSomeReg x
3343 let
3344 opc = case to of W32 -> GITOF; W64 -> GITOD;
3345 n -> panic $ "coerceInt2FP.x87: unhandled width ("
3346 ++ show n ++ ")"
3347 code dst = x_code `snocOL` opc x_reg dst
3348 -- ToDo: works for non-II32 reps?
3349 return (Any FF80 code)
3350
3351 coerce_sse2 = do
3352 (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
3353 let
3354 opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
3355 n -> panic $ "coerceInt2FP.sse: unhandled width ("
3356 ++ show n ++ ")"
3357 code dst = x_code `snocOL` opc (intFormat from) x_op dst
3358 return (Any (floatFormat to) code)
3359 -- works even if the destination rep is <II32
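--
-- E.g. (illustrative) a W32 -> W64 conversion becomes a single
--
--   cvtsi2sd %eax, %xmm0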
3360
3361 --------------------------------------------------------------------------------
3362 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
3363 coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
3364 where
3365 coerceFP2Int_x87 = do
3366 (x_reg, x_code) <- getSomeReg x
3367 let
3368 opc = case from of W32 -> GFTOI; W64 -> GDTOI
3369 n -> panic $ "coerceFP2Int.x87: unhandled width ("
3370 ++ show n ++ ")"
3371 code dst = x_code `snocOL` opc x_reg dst
3372 -- ToDo: works for non-II32 reps?
3373 return (Any (intFormat to) code)
3374
3375 coerceFP2Int_sse2 = do
3376 (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
3377 let
3378 opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
3379 n -> panic $ "coerceFP2Int.sse: unhandled width ("
3380 ++ show n ++ ")"
3381 code dst = x_code `snocOL` opc (intFormat to) x_op dst
3382 return (Any (intFormat to) code)
3383 -- works even if the destination rep is <II32
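--
-- E.g. (illustrative) a W64 -> W32 conversion becomes a single
--
--   cvttsd2si %xmm0, %eax
--
-- where the extra 't' (truncate) makes the conversion round towards
-- zero instead of using the current rounding mode.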
3384
3385
3386 --------------------------------------------------------------------------------
3387 coerceFP2FP :: Width -> CmmExpr -> NatM Register
3388 coerceFP2FP to x = do
3389 use_sse2 <- sse2Enabled
3390 (x_reg, x_code) <- getSomeReg x
3391 let
3392 opc | use_sse2 = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
3393 n -> panic $ "coerceFP2FP: unhandled width ("
3394 ++ show n ++ ")"
3395 | otherwise = GDTOF
3396 code dst = x_code `snocOL` opc x_reg dst
3397 return (Any (if use_sse2 then floatFormat to else FF80) code)
3398
3399 --------------------------------------------------------------------------------
3400
3401 sse2NegCode :: Width -> CmmExpr -> NatM Register
3402 sse2NegCode w x = do
3403 let fmt = floatFormat w
3404 x_code <- getAnyReg x
3405 -- This is how gcc does it, so it can't be that bad:
3406 let
3407 const = case fmt of
3408 FF32 -> CmmInt 0x80000000 W32
3409 FF64 -> CmmInt 0x8000000000000000 W64
3410 x@II8 -> wrongFmt x
3411 x@II16 -> wrongFmt x
3412 x@II32 -> wrongFmt x
3413 x@II64 -> wrongFmt x
3414 x@FF80 -> wrongFmt x
3415 where
3416 wrongFmt x = panic $ "sse2NegCode: " ++ show x
3417 Amode amode amode_code <- memConstant (widthInBytes w) const
3418 tmp <- getNewRegNat fmt
3419 let
3420 code dst = x_code dst `appOL` amode_code `appOL` toOL [
3421 MOV fmt (OpAddr amode) (OpReg tmp),
3422 XOR fmt (OpReg tmp) (OpReg dst)
3423 ]
3424 --
3425 return (Any fmt code)
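-- A sketch of the resulting code for a W32 negation (register names and
-- the mask's label are illustrative):
--
--   <compute x into %xmm0>
--   movss .Lsignmask(%rip), %xmm1   -- load the 0x80000000 constant
--   xorps %xmm1, %xmm0              -- flip the sign bit
--
-- Unlike (0.0 - x), XORing the sign bit also negates zeros correctly
-- and leaves NaN payloads untouched.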
3426
3427 isVecExpr :: CmmExpr -> Bool
3428 isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
3429 isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
3430 isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
3431 isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
3432 isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
3433 isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
3434 isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
3435 isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
3436 isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
3437 isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
3438 isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
3439 isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
3440 isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
3441 isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
3442 isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
3443 isVecExpr (CmmMachOp _ [e]) = isVecExpr e
3444 isVecExpr _ = False
3445
3446 needLlvm :: NatM a
3447 needLlvm =
3448 sorry $ unlines ["The native code generator does not support vector"
3449 ,"instructions. Please use -fllvm."]
3450
3451 -- | This relies on the invariant that all jumps in the given blocks are required.
3452 -- Starting from there we try to make a few more jumps redundant by reordering
3453 -- them.
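--
-- For example (labels illustrative), a block ending in
--
--   je  .L2
--   jmp .L3
--
-- where .L2 is the very next block can drop the unconditional jump and
-- become just
--
--   jne .L3
--
-- falling through into .L2 instead.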
3454 invertCondBranches :: CFG -> LabelMap a -> [NatBasicBlock Instr]
3455 -> [NatBasicBlock Instr]
3456 invertCondBranches cfg keep bs =
3457 --trace "Foo" $
3458 invert bs
3459 where
3460 invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
3461 invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
3462 | --pprTrace "Block" (ppr lbl1) True,
3463 (jmp1,jmp2) <- last2 ins
3464 , JXX cond1 target1 <- jmp1
3465 , target1 == lbl2
3466 --, pprTrace "CutChance" (ppr b1) True
3467 , JXX ALWAYS target2 <- jmp2
3468 -- We have enough information to check if we can perform the inversion.
3469 -- TODO: We could also check the last asm instruction which sets the
3470 -- status flags instead, which I suspect is worse for compiler
3471 -- performance but might be applicable to more cases.
3472 , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
3473 , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
3474 -- Both jumps come from the same cmm statement
3475 , transitionSource edgeInfo1 == transitionSource edgeInfo2
3476 , (CmmSource cmmCondBranch) <- transitionSource edgeInfo1
3477
3478 -- Int comparisons are invertible
3479 , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
3480 , Just _ <- maybeIntComparison op
3481 , Just invCond <- maybeInvertCond cond1
3482
3483 -- Swap the last two jumps and invert the conditional jump's condition.
3484 = let jumps =
3485 case () of
3486 -- We are free to eliminate the jmp, so we do so.
3487 _ | not (mapMember target1 keep)
3488 -> [JXX invCond target2]
3489 -- If the conditional target is unlikely we put the other
3490 -- target at the front.
3491 | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
3492 -> [JXX invCond target2, JXX ALWAYS target1]
3493 -- Keep things as-is otherwise
3494 | otherwise
3495 -> [jmp1, jmp2]
3496 in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
3497 (BasicBlock lbl1
3498 (dropTail 2 ins ++ jumps))
3499 : invert (b2:bs)
3500 invert (b:bs) = b : invert bs
3501 invert [] = []