Add support for passing SSE vectors in registers.
[ghc.git] / compiler / nativeGen / X86 / CodeGen.hs
-----------------------------------------------------------------------------
--
-- Generating machine code (instruction selection)
--
-- (c) The University of Glasgow 1996-2004
--
-----------------------------------------------------------------------------

-- This is a big module, but, if you pay attention to
-- (a) the sectioning, and (b) the type signatures, the
-- structure should not be too overwhelming.

{-# LANGUAGE GADTs #-}
module X86.CodeGen (
        cmmTopCodeGen,
        generateJumpTableForInstr,
        InstrBlock
)

where

#include "HsVersions.h"
#include "nativeGen/NCG.h"
#include "../includes/MachDeps.h"

-- NCG stuff:
import X86.Instr
import X86.Cond
import X86.Regs
import X86.RegInfo
import CodeGen.Platform
import CPrim
import Instruction
import PIC
import NCGMonad
import Size
import Reg
import Platform

-- Our intermediate code:
import BasicTypes
import BlockId
import Module           ( primPackageId )
import PprCmm           ()
import CmmUtils
import Cmm
import Hoopl
import CLabel

-- The rest:
import ForeignCall      ( CCallConv(..) )
import OrdList
import Outputable
import Unique
import FastString
import FastBool         ( isFastTrue )
import DynFlags
import Util

import Control.Monad
import Data.Bits
import Data.Int
import Data.Maybe
import Data.Word

is32BitPlatform :: NatM Bool
is32BitPlatform = do
    dflags <- getDynFlags
    return $ target32Bit (targetPlatform dflags)

sse2Enabled :: NatM Bool
sse2Enabled = do
    dflags <- getDynFlags
    return (isSse2Enabled dflags)

sse4_2Enabled :: NatM Bool
sse4_2Enabled = do
    dflags <- getDynFlags
    return (isSse4_2Enabled dflags)

if_sse2 :: NatM a -> NatM a -> NatM a
if_sse2 sse2 x87 = do
    b <- sse2Enabled
    if b then sse2 else x87

cmmTopCodeGen
        :: RawCmmDecl
        -> NatM [NatCmmDecl (Alignment, CmmStatics) Instr]

cmmTopCodeGen (CmmProc info lab live graph) = do
    let blocks = toBlockListEntryFirst graph
    (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
    picBaseMb <- getPicBaseMaybeNat
    dflags <- getDynFlags
    let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
        tops = proc : concat statics
        os   = platformOS $ targetPlatform dflags

    case picBaseMb of
        Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
        Nothing      -> return tops

cmmTopCodeGen (CmmData sec dat) = do
    return [CmmData sec (1, dat)]  -- no translation, we just use CmmStatic


basicBlockCodeGen
        :: CmmBlock
        -> NatM ( [NatBasicBlock Instr]
                , [NatCmmDecl (Alignment, CmmStatics) Instr])

basicBlockCodeGen block = do
  let (CmmEntry id, nodes, tail) = blockSplit block
      stmts = blockToList nodes
  mid_instrs <- stmtsToInstrs stmts
  tail_instrs <- stmtToInstrs tail
  let instrs = mid_instrs `appOL` tail_instrs
  -- code generation may introduce new basic block boundaries, which
  -- are indicated by the NEWBLOCK instruction.  We must split up the
  -- instruction stream into basic blocks again.  Also, we extract
  -- LDATAs here too.
  let
        (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs

        mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
          = ([], BasicBlock id instrs : blocks, statics)
        mkBlocks (LDATA sec dat) (instrs,blocks,statics)
          = (instrs, blocks, CmmData sec dat:statics)
        mkBlocks instr (instrs,blocks,statics)
          = (instr:instrs, blocks, statics)
  return (BasicBlock id top : other_blocks, statics)


stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock
stmtsToInstrs stmts
   = do instrss <- mapM stmtToInstrs stmts
        return (concatOL instrss)


stmtToInstrs :: CmmNode e x -> NatM InstrBlock
stmtToInstrs stmt = do
  dflags <- getDynFlags
  is32Bit <- is32BitPlatform
  case stmt of
    CmmComment s -> return (unitOL (COMMENT s))

    CmmAssign reg src
      | isFloatType ty         -> assignReg_FltCode size reg src
      | is32Bit && isWord64 ty -> assignReg_I64Code      reg src
      | otherwise              -> assignReg_IntCode size reg src
        where ty = cmmRegType dflags reg
              size = cmmTypeSize ty

    CmmStore addr src
      | isFloatType ty         -> assignMem_FltCode size addr src
      | is32Bit && isWord64 ty -> assignMem_I64Code      addr src
      | otherwise              -> assignMem_IntCode size addr src
        where ty = cmmExprType dflags src
              size = cmmTypeSize ty

    CmmUnsafeForeignCall target result_regs args
       -> genCCall is32Bit target result_regs args

    CmmBranch id                 -> genBranch id
    CmmCondBranch arg true false -> do b1 <- genCondJump true arg
                                       b2 <- genBranch false
                                       return (b1 `appOL` b2)
    CmmSwitch arg ids            -> do dflags <- getDynFlags
                                       genSwitch dflags arg ids
    CmmCall { cml_target = arg
            , cml_args_regs = gregs } -> do
                                       dflags <- getDynFlags
                                       genJump arg (jumpRegs dflags gregs)
    _ ->
      panic "stmtToInstrs: statement should have been cps'd away"


jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
    where platform = targetPlatform dflags

--------------------------------------------------------------------------------
-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
--   They are really trees of insns to facilitate fast appending, where a
--   left-to-right traversal yields the insns in the correct order.
--
type InstrBlock
        = OrdList Instr

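-- A minimal sketch (illustrative only, not used elsewhere in this module) of
-- why an OrdList is preferable to a plain list here: unitOL, snocOL and
-- appOL are all O(1), and only the final left-to-right traversal linearises
-- the tree.  COMMENT is used purely as a placeholder instruction.
exampleInstrBlock :: InstrBlock
exampleInstrBlock =
    unitOL (COMMENT (fsLit "first"))
        `appOL` (unitOL (COMMENT (fsLit "second"))
                    `snocOL` COMMENT (fsLit "third"))
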

-- | Condition codes passed up the tree.
--
data CondCode
        = CondCode Bool Cond InstrBlock


-- | a.k.a "Register64"
--      Reg is the lower 32-bit temporary which contains the result.
--      Use getHiVRegFromLo to find the other VRegUnique.
--
--      The rule of this simplified insn selection game is therefore that
--      the returned Reg may be modified.
--
data ChildCode64
   = ChildCode64
        InstrBlock
        Reg


-- | Registers passed up the tree.  If the stix code forces the register
--      to live in a pre-decided machine register, it comes out as @Fixed@;
--      otherwise, it comes out as @Any@, and the parent can decide which
--      register to put it in.
--
data Register
        = Fixed Size Reg InstrBlock
        | Any   Size (Reg -> InstrBlock)


swizzleRegisterRep :: Register -> Size -> Register
swizzleRegisterRep (Fixed _ reg code) size = Fixed size reg code
swizzleRegisterRep (Any _ codefn)     size = Any   size codefn
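
-- An illustrative sketch (not part of the real selectors): a small literal
-- can live in any register, so a selector hands back a continuation from
-- the eventual destination register to the code, whereas a value pinned to
-- a particular machine register comes back as Fixed and the parent must
-- copy it out itself if it wants it somewhere else.
exampleAnyRegister :: Register
exampleAnyRegister
    = Any II32 (\dst -> unitOL (MOV II32 (OpImm (ImmInt 42)) (OpReg dst)))
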


-- | Grab the Reg for a CmmReg
getRegisterReg :: Platform -> Bool -> CmmReg -> Reg

getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk))
  = let sz = cmmTypeSize pk in
    if isFloatSize sz && not use_sse2
       then RegVirtual (mkVirtualReg u FF80)
       else RegVirtual (mkVirtualReg u sz)

getRegisterReg platform _ (CmmGlobal mid)
  = case globalRegMaybe platform mid of
        Just reg -> RegReal $ reg
        Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
        -- By this stage, the only MagicIds remaining should be the
        -- ones which map to a real machine register on this
        -- platform.  Hence ...


-- | Memory addressing modes passed up the tree.
data Amode
        = Amode AddrMode InstrBlock

{-
Now, given a tree (the argument to a CmmLoad) that references memory,
produce a suitable addressing mode.

A Rule of the Game (tm) for Amodes: use of the addr bit must
immediately follow use of the code part, since the code part puts
values in registers which the addr then refers to.  So you can't put
anything in between, lest it overwrite some of those registers.  If
you need to do some other computation between the code part and use of
the addr bit, first store the effective address from the amode in a
temporary, then do the other computation, and then use the temporary:

    code
    LEA amode, tmp
    ... other computation ...
    ... (tmp) ...
-}
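
{-
A sketch of that escape hatch, assuming a hypothetical clobbering
computation in the middle (getNonClobberedOperand below uses exactly this
LEA-into-temporary trick):

    Amode addr code <- getAmode mem
    tmp <- getNewRegNat (archWordSize is32Bit)
    -- materialise the effective address before anything can clobber it
    let save = code `snocOL` LEA size (OpAddr addr) (OpReg tmp)
    ... other computation ...
    -- now address memory via the temporary instead of the original amode
    ... OpAddr (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0)) ...
-}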


-- | Check whether an integer will fit in 32 bits.
--      A CmmInt is intended to be truncated to the appropriate
--      number of bits, so here we truncate it to Int64.  This is
--      important because e.g. -1 as a CmmInt might be either
--      -1 or 18446744073709551615.
--
is32BitInteger :: Integer -> Bool
is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
  where i64 = fromIntegral i :: Int64
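
-- For example, the W64 literal 0xffffffffffffffff truncates to the Int64
-- value -1, which fits, whereas 0x80000000 (2147483648) exceeds 0x7fffffff
-- and is rejected.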


-- | Convert a BlockId to some CmmStatic data
jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
jumpTableEntry dflags Nothing   = CmmStaticLit (CmmInt 0 (wordWidth dflags))
jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
    where blockLabel = mkAsmTempLabel (getUnique blockid)


-- -----------------------------------------------------------------------------
-- General things for putting together code sequences

-- Expand CmmRegOff.  ToDo: should we do it this way around, or convert
-- CmmExprs into CmmRegOff?
mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr
mangleIndexTree dflags reg off
  = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
  where width = typeWidth (cmmRegType dflags reg)

-- | The dual to getAnyReg: compute an expression into a register, but
--   we don't mind which one it is.
getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
getSomeReg expr = do
  r <- getRegister expr
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, code tmp)
    Fixed _ reg code ->
        return (reg, code)


assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
assignMem_I64Code addrTree valueTree = do
  Amode addr addr_code <- getAmode addrTree
  ChildCode64 vcode rlo <- iselExpr64 valueTree
  let
        rhi = getHiVRegFromLo rlo

        -- Little-endian store
        mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
        mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
  return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)


assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
   ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
   let
        r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
        r_dst_hi = getHiVRegFromLo r_dst_lo
        r_src_hi = getHiVRegFromLo r_src_lo
        mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
        mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
   return (
        vcode `snocOL` mov_lo `snocOL` mov_hi
     )

assignReg_I64Code _ _
   = panic "assignReg_I64Code(i386): invalid lvalue"


iselExpr64 :: CmmExpr -> NatM ChildCode64
iselExpr64 (CmmLit (CmmInt i _)) = do
  (rlo,rhi) <- getNewRegPairNat II32
  let
        r = fromIntegral (fromIntegral i :: Word32)
        q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
        code = toOL [
                MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
                MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
                ]
  return (ChildCode64 code rlo)

iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
   Amode addr addr_code <- getAmode addrTree
   (rlo,rhi) <- getNewRegPairNat II32
   let
        mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
        mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
   return (
            ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
                        rlo
          )

iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
   = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))

-- we handle addition, but rather badly
iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   (rlo,rhi) <- getNewRegPairNat II32
   let
        r = fromIntegral (fromIntegral i :: Word32)
        q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
        r1hi = getHiVRegFromLo r1lo
        code = code1 `appOL`
               toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                      ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
                      MOV II32 (OpReg r1hi) (OpReg rhi),
                      ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
   return (ChildCode64 code rlo)

iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   ChildCode64 code2 r2lo <- iselExpr64 e2
   (rlo,rhi) <- getNewRegPairNat II32
   let
        r1hi = getHiVRegFromLo r1lo
        r2hi = getHiVRegFromLo r2lo
        code = code1 `appOL`
               code2 `appOL`
               toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                      ADD II32 (OpReg r2lo) (OpReg rlo),
                      MOV II32 (OpReg r1hi) (OpReg rhi),
                      ADC II32 (OpReg r2hi) (OpReg rhi) ]
   return (ChildCode64 code rlo)
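
-- So for a 64-bit (x + y) on i386 the sequence above amounts to:
--
--      movl x_lo, lo ; addl y_lo, lo     -- low word first, setting carry
--      movl x_hi, hi ; adcl y_hi, hi     -- high word consumes the carry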

iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
     fn <- getAnyReg expr
     r_dst_lo <- getNewRegNat II32
     let r_dst_hi = getHiVRegFromLo r_dst_lo
         code = fn r_dst_lo
     return (
             ChildCode64 (code `snocOL`
                          MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
                         r_dst_lo
            )

iselExpr64 expr
   = pprPanic "iselExpr64(i386)" (ppr expr)


--------------------------------------------------------------------------------
getRegister :: CmmExpr -> NatM Register
getRegister e = do dflags <- getDynFlags
                   is32Bit <- is32BitPlatform
                   getRegister' dflags is32Bit e

getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register

getRegister' dflags is32Bit (CmmReg reg)
  = case reg of
        CmmGlobal PicBaseReg
         | is32Bit ->
            -- on x86_64, we have %rip for PicBaseReg, but it's not
            -- a full-featured register, it can only be used for
            -- rip-relative addressing.
            do reg' <- getPicBaseNat (archWordSize is32Bit)
               return (Fixed (archWordSize is32Bit) reg' nilOL)
        _ ->
            do use_sse2 <- sse2Enabled
               let
                 sz = cmmTypeSize (cmmRegType dflags reg)
                 size | not use_sse2 && isFloatSize sz = FF80
                      | otherwise                      = sz
               --
               let platform = targetPlatform dflags
               return (Fixed size (getRegisterReg platform use_sse2 reg) nilOL)


getRegister' dflags is32Bit (CmmRegOff r n)
  = getRegister' dflags is32Bit $ mangleIndexTree dflags r n

-- for 32-bit architectures, support some 64 -> 32 bit conversions:
-- TO_W_(x), TO_W_(x >> 32)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
                        [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
 | is32Bit = do
   ChildCode64 code rlo <- iselExpr64 x
   return $ Fixed II32 (getHiVRegFromLo rlo) code

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
                        [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
 | is32Bit = do
   ChildCode64 code rlo <- iselExpr64 x
   return $ Fixed II32 (getHiVRegFromLo rlo) code

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
 | is32Bit = do
   ChildCode64 code rlo <- iselExpr64 x
   return $ Fixed II32 rlo code

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
 | is32Bit = do
   ChildCode64 code rlo <- iselExpr64 x
   return $ Fixed II32 rlo code

getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
  if_sse2 float_const_sse2 float_const_x87
 where
  float_const_sse2
    | f == 0.0 = do
      let
        size = floatSize w
        code dst = unitOL (XOR size (OpReg dst) (OpReg dst))
        -- I don't know why there are xorpd, xorps, and pxor instructions.
        -- They all appear to do the same thing --SDM
      return (Any size code)

    | otherwise = do
      Amode addr code <- memConstant (widthInBytes w) lit
      loadFloatAmode True w addr code

  float_const_x87 = case w of
    W64
      | f == 0.0 ->
        let code dst = unitOL (GLDZ dst)
        in  return (Any FF80 code)

      | f == 1.0 ->
        let code dst = unitOL (GLD1 dst)
        in  return (Any FF80 code)

    _otherwise -> do
      Amode addr code <- memConstant (widthInBytes w) lit
      loadFloatAmode False w addr code

-- catch simple cases of zero- or sign-extended load
getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVZxL II8) addr
  return (Any II32 code)

getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVSxL II8) addr
  return (Any II32 code)

getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVZxL II16) addr
  return (Any II32 code)

getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
  code <- intLoadCode (MOVSxL II16) addr
  return (Any II32 code)

-- catch simple cases of zero- or sign-extended load
getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
 | not is32Bit = do
  code <- intLoadCode (MOVZxL II8) addr
  return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
 | not is32Bit = do
  code <- intLoadCode (MOVSxL II8) addr
  return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
 | not is32Bit = do
  code <- intLoadCode (MOVZxL II16) addr
  return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
 | not is32Bit = do
  code <- intLoadCode (MOVSxL II16) addr
  return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
 | not is32Bit = do
  code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
  return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
 | not is32Bit = do
  code <- intLoadCode (MOVSxL II32) addr
  return (Any II64 code)

getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
                                                CmmLit displacement])
 | not is32Bit = do
      return $ Any II64 (\dst -> unitOL $
        LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))

getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
    sse2 <- sse2Enabled
    case mop of
      MO_F_Neg w
         | sse2      -> sse2NegCode w x
         | otherwise -> trivialUFCode FF80 (GNEG FF80) x

      MO_S_Neg w -> triv_ucode NEGI (intSize w)
      MO_Not w   -> triv_ucode NOT  (intSize w)

      -- Nop conversions
      MO_UU_Conv W32 W8  -> toI8Reg  W32 x
      MO_SS_Conv W32 W8  -> toI8Reg  W32 x
      MO_UU_Conv W16 W8  -> toI8Reg  W16 x
      MO_SS_Conv W16 W8  -> toI8Reg  W16 x
      MO_UU_Conv W32 W16 -> toI16Reg W32 x
      MO_SS_Conv W32 W16 -> toI16Reg W32 x

      MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
      MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
      MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
      MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
      MO_UU_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
      MO_SS_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x

      MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intSize rep1) x
      MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intSize rep1) x

      -- widenings
      MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
      MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
      MO_UU_Conv W8  W16 -> integerExtend W8  W16 MOVZxL x

      MO_SS_Conv W8  W32 -> integerExtend W8  W32 MOVSxL x
      MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
      MO_SS_Conv W8  W16 -> integerExtend W8  W16 MOVSxL x

      MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
      MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
      MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
      MO_SS_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVSxL x
      MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
      MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
        -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl.
        -- However, we don't want the register allocator to throw it
        -- away as an unnecessary reg-to-reg move, so we keep it in
        -- the form of a movzl and print it as a movl later.

      MO_FF_Conv W32 W64
        | sse2      -> coerceFP2FP W64 x
        | otherwise -> conversionNop FF80 x

      MO_FF_Conv W64 W32 -> coerceFP2FP W32 x

      MO_FS_Conv from to -> coerceFP2Int from to x
      MO_SF_Conv from to -> coerceInt2FP from to x

      MO_V_Insert {}   -> needLlvm
      MO_V_Extract {}  -> needLlvm
      MO_V_Add {}      -> needLlvm
      MO_V_Sub {}      -> needLlvm
      MO_V_Mul {}      -> needLlvm
      MO_VS_Quot {}    -> needLlvm
      MO_VS_Rem {}     -> needLlvm
      MO_VS_Neg {}     -> needLlvm
      MO_VF_Insert {}  -> needLlvm
      MO_VF_Extract {} -> needLlvm
      MO_VF_Add {}     -> needLlvm
      MO_VF_Sub {}     -> needLlvm
      MO_VF_Mul {}     -> needLlvm
      MO_VF_Quot {}    -> needLlvm
      MO_VF_Neg {}     -> needLlvm

      _other -> pprPanic "getRegister" (pprMachOp mop)
   where
        triv_ucode :: (Size -> Operand -> Instr) -> Size -> NatM Register
        triv_ucode instr size = trivialUCode size (instr size) x

        -- signed or unsigned extension.
        integerExtend :: Width -> Width
                      -> (Size -> Operand -> Operand -> Instr)
                      -> CmmExpr -> NatM Register
        integerExtend from to instr expr = do
            (reg,e_code) <- if from == W8 then getByteReg expr
                                          else getSomeReg expr
            let
                code dst =
                  e_code `snocOL`
                  instr (intSize from) (OpReg reg) (OpReg dst)
            return (Any (intSize to) code)

        toI8Reg :: Width -> CmmExpr -> NatM Register
        toI8Reg new_rep expr
            = do codefn <- getAnyReg expr
                 return (Any (intSize new_rep) codefn)
                -- HACK: use getAnyReg to get a byte-addressable register.
                -- If the source was a Fixed register, this will add the
                -- mov instruction to put it into the desired destination.
                -- We're assuming that the destination won't be a fixed
                -- non-byte-addressable register; it won't be, because all
                -- fixed registers are word-sized.

        toI16Reg = toI8Reg -- for now

        conversionNop :: Size -> CmmExpr -> NatM Register
        conversionNop new_size expr
            = do e_code <- getRegister' dflags is32Bit expr
                 return (swizzleRegisterRep e_code new_size)

getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
  sse2 <- sse2Enabled
  case mop of
      MO_F_Eq _ -> condFltReg is32Bit EQQ x y
      MO_F_Ne _ -> condFltReg is32Bit NE  x y
      MO_F_Gt _ -> condFltReg is32Bit GTT x y
      MO_F_Ge _ -> condFltReg is32Bit GE  x y
      MO_F_Lt _ -> condFltReg is32Bit LTT x y
      MO_F_Le _ -> condFltReg is32Bit LE  x y

      MO_Eq _   -> condIntReg EQQ x y
      MO_Ne _   -> condIntReg NE  x y

      MO_S_Gt _ -> condIntReg GTT x y
      MO_S_Ge _ -> condIntReg GE  x y
      MO_S_Lt _ -> condIntReg LTT x y
      MO_S_Le _ -> condIntReg LE  x y

      MO_U_Gt _ -> condIntReg GU  x y
      MO_U_Ge _ -> condIntReg GEU x y
      MO_U_Lt _ -> condIntReg LU  x y
      MO_U_Le _ -> condIntReg LEU x y

      MO_F_Add w  | sse2      -> trivialFCode_sse2 w ADD  x y
                  | otherwise -> trivialFCode_x87    GADD x y
      MO_F_Sub w  | sse2      -> trivialFCode_sse2 w SUB  x y
                  | otherwise -> trivialFCode_x87    GSUB x y
      MO_F_Quot w | sse2      -> trivialFCode_sse2 w FDIV x y
                  | otherwise -> trivialFCode_x87    GDIV x y
      MO_F_Mul w  | sse2      -> trivialFCode_sse2 w MUL  x y
                  | otherwise -> trivialFCode_x87    GMUL x y

      MO_Add rep -> add_code rep x y
      MO_Sub rep -> sub_code rep x y

      MO_S_Quot rep -> div_code rep True  True  x y
      MO_S_Rem  rep -> div_code rep True  False x y
      MO_U_Quot rep -> div_code rep False True  x y
      MO_U_Rem  rep -> div_code rep False False x y

      MO_S_MulMayOflo rep -> imulMayOflo rep x y

      MO_Mul rep -> triv_op rep IMUL
      MO_And rep -> triv_op rep AND
      MO_Or  rep -> triv_op rep OR
      MO_Xor rep -> triv_op rep XOR

      {- Shift ops on x86 constrain the shift amount: it must be an
         immediate, %cl, or the constant 1
         => trivialCode is not restrictive enough (sigh.)
      -}
      MO_Shl rep   -> shift_code rep SHL x y {-False-}
      MO_U_Shr rep -> shift_code rep SHR x y {-False-}
      MO_S_Shr rep -> shift_code rep SAR x y {-False-}

      MO_V_Insert {}   -> needLlvm
      MO_V_Extract {}  -> needLlvm
      MO_V_Add {}      -> needLlvm
      MO_V_Sub {}      -> needLlvm
      MO_V_Mul {}      -> needLlvm
      MO_VS_Quot {}    -> needLlvm
      MO_VS_Rem {}     -> needLlvm
      MO_VS_Neg {}     -> needLlvm
      MO_VF_Insert {}  -> needLlvm
      MO_VF_Extract {} -> needLlvm
      MO_VF_Add {}     -> needLlvm
      MO_VF_Sub {}     -> needLlvm
      MO_VF_Mul {}     -> needLlvm
      MO_VF_Quot {}    -> needLlvm
      MO_VF_Neg {}     -> needLlvm

      _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
  where
    --------------------
    triv_op width instr = trivialCode width op (Just op) x y
                        where op = instr (intSize width)

    imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
    imulMayOflo rep a b = do
         (a_reg, a_code) <- getNonClobberedReg a
         b_code <- getAnyReg b
         let
             shift_amt = case rep of
                          W32 -> 31
                          W64 -> 63
                          _   -> panic "shift_amt"

             size = intSize rep
             code = a_code `appOL` b_code eax `appOL`
                        toOL [
                           IMUL2 size (OpReg a_reg),   -- result in %edx:%eax
                           SAR size (OpImm (ImmInt shift_amt)) (OpReg eax),
                                -- sign extend lower part
                           SUB size (OpReg edx) (OpReg eax)
                                -- compare against upper
                                -- eax==0 if high part == sign extended low part
                        ]
         return (Fixed size eax code)

    --------------------
    shift_code :: Width
               -> (Size -> Operand -> Operand -> Instr)
               -> CmmExpr
               -> CmmExpr
               -> NatM Register

    {- Case1: shift length as immediate -}
    shift_code width instr x (CmmLit lit) = do
          x_code <- getAnyReg x
          let
               size = intSize width
               code dst
                  = x_code dst `snocOL`
                    instr size (OpImm (litToImm lit)) (OpReg dst)
          return (Any size code)

    {- Case2: shift length is complex (non-immediate)
       * y must go in %ecx.
       * we cannot do y first *and* put its result in %ecx, because
         %ecx might be clobbered by x.
       * if we do y second, then x cannot be
         in a clobbered reg.  Also, we cannot clobber x's reg
         with the instruction itself.
       * so we can either:
         - do y first, put its result in a fresh tmp, then copy it to %ecx later
         - do y second and put its result into %ecx.  x gets placed in a fresh
           tmp.  This is likely to be better, because the reg alloc can
           eliminate this reg->reg move here (it won't eliminate the other one,
           because the move is into the fixed %ecx).
    -}
    shift_code width instr x y{-amount-} = do
        x_code <- getAnyReg x
        let size = intSize width
        tmp <- getNewRegNat size
        y_code <- getAnyReg y
        let
           code = x_code tmp `appOL`
                  y_code ecx `snocOL`
                  instr size (OpReg ecx) (OpReg tmp)
        return (Fixed size tmp code)

    --------------------
    add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
    add_code rep x (CmmLit (CmmInt y _))
        | is32BitInteger y = add_int rep x y
    add_code rep x y = trivialCode rep (ADD size) (Just (ADD size)) x y
      where size = intSize rep

    --------------------
    sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
    sub_code rep x (CmmLit (CmmInt y _))
        | is32BitInteger (-y) = add_int rep x (-y)
    sub_code rep x y = trivialCode rep (SUB (intSize rep)) Nothing x y

    -- our three-operand add instruction:
    add_int width x y = do
        (x_reg, x_code) <- getSomeReg x
        let
            size = intSize width
            imm = ImmInt (fromInteger y)
            code dst
               = x_code `snocOL`
                 LEA size
                     (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
                     (OpReg dst)
        --
        return (Any size code)

    ----------------------
    div_code width signed quotient x y = do
        (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
        x_code <- getAnyReg x
        let
            size = intSize width
            widen | signed    = CLTD size
                  | otherwise = XOR size (OpReg edx) (OpReg edx)

            instr | signed    = IDIV
                  | otherwise = DIV

            code = y_code `appOL`
                   x_code eax `appOL`
                   toOL [widen, instr size y_op]

            result | quotient  = eax
                   | otherwise = edx

        return (Fixed size result code)
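
    -- For a signed quotient at W32, div_code therefore emits roughly
    --
    --      <code for y> ; <code for x, into %eax>
    --      cltd                    -- sign-extend %eax into %edx
    --      idivl <y operand>       -- quotient in %eax, remainder in %edx
    --
    -- and the result comes back Fixed in %eax (or %edx for the remainder).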


getRegister' _ _ (CmmLoad mem pk)
  | isFloatType pk
  = do
    Amode addr mem_code <- getAmode mem
    use_sse2 <- sse2Enabled
    loadFloatAmode use_sse2 (typeWidth pk) addr mem_code

getRegister' _ is32Bit (CmmLoad mem pk)
  | is32Bit && not (isWord64 pk)
  = do
    code <- intLoadCode instr mem
    return (Any size code)
  where
    width = typeWidth pk
    size = intSize width
    instr = case width of
                W8     -> MOVZxL II8
                _other -> MOV size
        -- We always zero-extend 8-bit loads, if we
        -- can't think of anything better.  This is because
        -- we can't guarantee access to an 8-bit variant of every register
        -- (esi and edi don't have 8-bit variants), so to make things
        -- simpler we do our 8-bit arithmetic with full 32-bit registers.

-- Simpler memory load code on x86_64
getRegister' _ is32Bit (CmmLoad mem pk)
  | not is32Bit
  = do
    code <- intLoadCode (MOV size) mem
    return (Any size code)
  where size = intSize $ typeWidth pk

getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
  = let
        size = intSize width

        -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
        size1 = if is32Bit then size
                           else case size of
                                   II64 -> II32
                                   _    -> size
        code dst
           = unitOL (XOR size1 (OpReg dst) (OpReg dst))
    in
        return (Any size code)

-- optimisation for loading small literals on x86_64: take advantage
-- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
-- instruction forms are shorter.
getRegister' dflags is32Bit (CmmLit lit)
  | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit)
  = let
        imm = litToImm lit
        code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
    in
        return (Any II64 code)
  where
    isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
    isBigLit _ = False
        -- note1: not the same as (not.is32BitLit), because that checks for
        -- signed literals that fit in 32 bits, but we want unsigned
        -- literals here.
        -- note2: all labels are small, because we're assuming the
        -- small memory model (see gcc docs, -mcmodel=small).

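-- For example, a W64 load of 42 becomes "movl $42, %eax" rather than
-- "movq $42, %rax": the 32-bit form is shorter, and the hardware
-- zero-extends the result into the full 64-bit register anyway.
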
getRegister' dflags _ (CmmLit lit)
  = do let size = cmmTypeSize (cmmLitType dflags lit)
           imm = litToImm lit
           code dst = unitOL (MOV size (OpImm imm) (OpReg dst))
       return (Any size code)

getRegister' _ _ other
  | isVecExpr other = needLlvm
  | otherwise       = pprPanic "getRegister(x86)" (ppr other)


intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
            -> NatM (Reg -> InstrBlock)
intLoadCode instr mem = do
  Amode src mem_code <- getAmode mem
  return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))

-- Compute an expression into *any* register, adding the appropriate
-- move instruction if necessary.
getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
getAnyReg expr = do
  r <- getRegister expr
  anyReg r

anyReg :: Register -> NatM (Reg -> InstrBlock)
anyReg (Any _ code)          = return code
anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)

-- A bit like getSomeReg, but we want a reg that can be byte-addressed.
-- Fixed registers might not be byte-addressable, so we make sure we've
-- got a temporary, inserting an extra reg copy if necessary.
getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
getByteReg expr = do
  is32Bit <- is32BitPlatform
  if is32Bit
      then do r <- getRegister expr
              case r of
                Any rep code -> do
                    tmp <- getNewRegNat rep
                    return (tmp, code tmp)
                Fixed rep reg code
                    | isVirtualReg reg -> return (reg,code)
                    | otherwise -> do
                        tmp <- getNewRegNat rep
                        return (tmp, code `snocOL` reg2reg rep reg tmp)
                    -- ToDo: could optimise slightly by checking for
                    -- byte-addressable real registers, but that will
                    -- happen very rarely if at all.
      else getSomeReg expr -- all regs are byte-addressable on x86_64

-- Another variant: this time we want the result in a register that cannot
-- be modified by code to evaluate an arbitrary expression.
getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
getNonClobberedReg expr = do
  dflags <- getDynFlags
  r <- getRegister expr
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, code tmp)
    Fixed rep reg code
        -- only certain regs can be clobbered
        | reg `elem` instrClobberedRegs (targetPlatform dflags)
        -> do
              tmp <- getNewRegNat rep
              return (tmp, code `snocOL` reg2reg rep reg tmp)
        | otherwise ->
              return (reg, code)

reg2reg :: Size -> Reg -> Reg -> Instr
reg2reg size src dst
  | size == FF80 = GMOV src dst
  | otherwise    = MOV size (OpReg src) (OpReg dst)


--------------------------------------------------------------------------------
getAmode :: CmmExpr -> NatM Amode
getAmode e = do is32Bit <- is32BitPlatform
                getAmode' is32Bit e

getAmode' :: Bool -> CmmExpr -> NatM Amode
getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags
                                 getAmode $ mangleIndexTree dflags r n

getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
                                           CmmLit displacement])
 | not is32Bit
    = return $ Amode (ripRel (litToImm displacement)) nilOL


-- This is all just ridiculous, since it carefully undoes
-- what mangleIndexTree has just done.
getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)])
  | is32BitLit is32Bit lit
  -- ASSERT(rep == II32)???
  = do (x_reg, x_code) <- getSomeReg x
       let off = ImmInt (-(fromInteger i))
       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)

getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit])
  | is32BitLit is32Bit lit
  -- ASSERT(rep == II32)???
  = do (x_reg, x_code) <- getSomeReg x
       let off = litToImm lit
       return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)

-- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
-- recognised by the next rule.
getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
                                           b@(CmmLit _)])
  = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a])

getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _)
                                      [y, CmmLit (CmmInt shift _)]])
  | shift == 0 || shift == 1 || shift == 2 || shift == 3
  = x86_complex_amode x y shift 0

getAmode' _ (CmmMachOp (MO_Add _)
                [x, CmmMachOp (MO_Add _)
                        [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
                         CmmLit (CmmInt offset _)]])
  | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
    && is32BitInteger offset
  = x86_complex_amode x y shift offset

getAmode' _ (CmmMachOp (MO_Add _) [x,y])
  = x86_complex_amode x y 0 0

getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
  = return (Amode (ImmAddr (litToImm lit) 0) nilOL)

getAmode' _ expr = do
  (reg,code) <- getSomeReg expr
  return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)


x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
x86_complex_amode base index shift offset
  = do (x_reg, x_code) <- getNonClobberedReg base
        -- x must be in a temp, because it has to stay live over y_code
        -- we could compare x_reg and y_reg and do something better here...
       (y_reg, y_code) <- getSomeReg index
       let
           code = x_code `appOL` y_code
           scale = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
                                 n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg scale) (ImmInt (fromIntegral offset)))
               code)
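
-- The operand built here has the general x86 form base + index*scale + offset;
-- e.g. with shift == 2 and offset == 0 it addresses (%x_reg,%y_reg,4).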


-- -----------------------------------------------------------------------------
-- getOperand: sometimes any operand will do.

-- getNonClobberedOperand: the value of the operand will remain valid across
-- the computation of an arbitrary expression, unless the expression
-- is computed directly into a register which the operand refers to
-- (see trivialCode where this function is used for an example).

getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand (CmmLit lit) = do
  use_sse2 <- sse2Enabled
  if use_sse2 && isSuitableFloatingPointLit lit
    then do
      let CmmFloat _ w = lit
      Amode addr code <- memConstant (widthInBytes w) lit
      return (OpAddr addr, code)
    else do
      is32Bit <- is32BitPlatform
      dflags <- getDynFlags
      if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
        then return (OpImm (litToImm lit), nilOL)
        else getNonClobberedOperand_generic (CmmLit lit)

getNonClobberedOperand (CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2)
      && (if is32Bit then not (isWord64 pk) else True)
    then do
      dflags <- getDynFlags
      let platform = targetPlatform dflags
      Amode src mem_code <- getAmode mem
      (src',save_code) <-
        if (amodeCouldBeClobbered platform src)
           then do
              tmp <- getNewRegNat (archWordSize is32Bit)
              return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
                      unitOL (LEA (archWordSize is32Bit) (OpAddr src) (OpReg tmp)))
           else
              return (src, nilOL)
      return (OpAddr src', mem_code `appOL` save_code)
    else do
      getNonClobberedOperand_generic (CmmLoad mem pk)

getNonClobberedOperand e = getNonClobberedOperand_generic e

getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand_generic e = do
    (reg, code) <- getNonClobberedReg e
    return (OpReg reg, code)

amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)

regClobbered :: Platform -> Reg -> Bool
regClobbered platform (RegReal (RealRegSingle rr)) = isFastTrue (freeReg platform rr)
regClobbered _ _ = False

-- getOperand: the operand is not required to remain valid across the
-- computation of an arbitrary expression.
getOperand :: CmmExpr -> NatM (Operand, InstrBlock)

getOperand (CmmLit lit) = do
  use_sse2 <- sse2Enabled
  if (use_sse2 && isSuitableFloatingPointLit lit)
    then do
      let CmmFloat _ w = lit
      Amode addr code <- memConstant (widthInBytes w) lit
      return (OpAddr addr, code)
    else do
      is32Bit <- is32BitPlatform
      dflags <- getDynFlags
      if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
        then return (OpImm (litToImm lit), nilOL)
        else getOperand_generic (CmmLit lit)

getOperand (CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
     then do
       Amode src mem_code <- getAmode mem
       return (OpAddr src, mem_code)
     else
       getOperand_generic (CmmLoad mem pk)

getOperand e = getOperand_generic e

getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
getOperand_generic e = do
    (reg, code) <- getSomeReg e
    return (OpReg reg, code)

isOperand :: Bool -> CmmExpr -> Bool
isOperand _ (CmmLoad _ _)      = True
isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
                                  || isSuitableFloatingPointLit lit
isOperand _ _                  = False

memConstant :: Int -> CmmLit -> NatM Amode
memConstant align lit = do
  lbl <- getNewLabelNat
  dflags <- getDynFlags
  (addr, addr_code) <- if target32Bit (targetPlatform dflags)
                       then do dynRef <- cmmMakeDynamicReference
                                             dflags
                                             addImportNat
                                             DataReference
                                             lbl
                               Amode addr addr_code <- getAmode dynRef
                               return (addr, addr_code)
                       else return (ripRel (ImmCLbl lbl), nilOL)
  let code =
        LDATA ReadOnlyData (align, Statics lbl [CmmStaticLit lit])
        `consOL` addr_code
  return (Amode addr code)


loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register
loadFloatAmode use_sse2 w addr addr_code = do
  let size = floatSize w
      code dst = addr_code `snocOL`
                 if use_sse2
                    then MOV size (OpAddr addr) (OpReg dst)
                    else GLD size addr dst
  return (Any (if use_sse2 then size else FF80) code)


-- if we want a floating-point literal as an operand, we can
-- use it directly from memory.  However, if the literal is
-- zero, we're better off generating it into a register using
-- xor.
isSuitableFloatingPointLit :: CmmLit -> Bool
isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
isSuitableFloatingPointLit _ = False

getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
getRegOrMem e@(CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
     then do
       Amode src mem_code <- getAmode mem
       return (OpAddr src, mem_code)
     else do
       (reg, code) <- getNonClobberedReg e
       return (OpReg reg, code)
getRegOrMem e = do
    (reg, code) <- getNonClobberedReg e
    return (OpReg reg, code)

is32BitLit :: Bool -> CmmLit -> Bool
is32BitLit is32Bit (CmmInt i W64)
 | not is32Bit
    = -- assume that labels are in the range 0-2^31-1: this assumes the
      -- small memory model (see gcc docs, -mcmodel=small).
      is32BitInteger i
is32BitLit _ _ = True




-- Set up a condition code for a conditional branch.

getCondCode :: CmmExpr -> NatM CondCode

-- yes, they really do seem to want exactly the same!

getCondCode (CmmMachOp mop [x, y])
  =
    case mop of
      MO_F_Eq W32 -> condFltCode EQQ x y
      MO_F_Ne W32 -> condFltCode NE  x y
      MO_F_Gt W32 -> condFltCode GTT x y
      MO_F_Ge W32 -> condFltCode GE  x y
      MO_F_Lt W32 -> condFltCode LTT x y
      MO_F_Le W32 -> condFltCode LE  x y

      MO_F_Eq W64 -> condFltCode EQQ x y
      MO_F_Ne W64 -> condFltCode NE  x y
      MO_F_Gt W64 -> condFltCode GTT x y
      MO_F_Ge W64 -> condFltCode GE  x y
      MO_F_Lt W64 -> condFltCode LTT x y
      MO_F_Le W64 -> condFltCode LE  x y

      MO_Eq _ -> condIntCode EQQ x y
      MO_Ne _ -> condIntCode NE  x y

      MO_S_Gt _ -> condIntCode GTT x y
      MO_S_Ge _ -> condIntCode GE  x y
      MO_S_Lt _ -> condIntCode LTT x y
      MO_S_Le _ -> condIntCode LE  x y

      MO_U_Gt _ -> condIntCode GU  x y
      MO_U_Ge _ -> condIntCode GEU x y
      MO_U_Lt _ -> condIntCode LU  x y
      MO_U_Le _ -> condIntCode LEU x y

      _other -> pprPanic "getCondCode(x86,x86_64)" (ppr (CmmMachOp mop [x,y]))

getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other)




-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
-- passed back up the tree.

condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
condIntCode cond x y = do is32Bit <- is32BitPlatform
                          condIntCode' is32Bit cond x y

condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode

-- memory vs immediate
condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
 | is32BitLit is32Bit lit = do
    Amode x_addr x_code <- getAmode x
    let
        imm  = litToImm lit
        code = x_code `snocOL`
               CMP (cmmTypeSize pk) (OpImm imm) (OpAddr x_addr)
    --
    return (CondCode False cond code)

-- anything vs zero, using a mask
-- TODO: Add some sanity checking!!!!
condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
 | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
    = do
      (x_reg, x_code) <- getSomeReg x
      let
         code = x_code `snocOL`
                TEST (intSize pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
      --
      return (CondCode False cond code)

-- anything vs zero
condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
    (x_reg, x_code) <- getSomeReg x
    let
        code = x_code `snocOL`
               TEST (intSize pk) (OpReg x_reg) (OpReg x_reg)
    --
    return (CondCode False cond code)

-- anything vs operand
condIntCode' is32Bit cond x y
 | isOperand is32Bit y = do
    dflags <- getDynFlags
    (x_reg, x_code) <- getNonClobberedReg x
    (y_op,  y_code) <- getOperand y
    let
        code = x_code `appOL` y_code `snocOL`
               CMP (cmmTypeSize (cmmExprType dflags x)) y_op (OpReg x_reg)
    return (CondCode False cond code)
 -- operand vs. anything: invert the comparison so that we can use a
 -- single comparison instruction.
 | isOperand is32Bit x
 , Just revcond <- maybeFlipCond cond = do
    dflags <- getDynFlags
    (y_reg, y_code) <- getNonClobberedReg y
    (x_op,  x_code) <- getOperand x
    let
        code = y_code `appOL` x_code `snocOL`
               CMP (cmmTypeSize (cmmExprType dflags x)) x_op (OpReg y_reg)
    return (CondCode False revcond code)

-- anything vs anything
condIntCode' _ cond x y = do
  dflags <- getDynFlags
  (y_reg, y_code) <- getNonClobberedReg y
  (x_op,  x_code) <- getRegOrMem x
  let
        code = y_code `appOL`
               x_code `snocOL`
               CMP (cmmTypeSize (cmmExprType dflags x)) (OpReg y_reg) x_op
  return (CondCode False cond code)



--------------------------------------------------------------------------------
condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode

condFltCode cond x y
  = if_sse2 condFltCode_sse2 condFltCode_x87
  where

  condFltCode_x87
    = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
        (x_reg, x_code) <- getNonClobberedReg x
        (y_reg, y_code) <- getSomeReg y
        let
            code = x_code `appOL` y_code `snocOL`
                   GCMP cond x_reg y_reg
        -- The GCMP insn does the test and sets the zero flag if comparable
        -- and true.  Hence we always supply EQQ as the condition to test.
        return (CondCode True EQQ code)

  -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
  -- an operand, but the right must be a reg.  We can probably do better
  -- than this general case...
  condFltCode_sse2 = do
    dflags <- getDynFlags
    (x_reg, x_code) <- getNonClobberedReg x
    (y_op,  y_code) <- getOperand y
    let
        code = x_code `appOL`
               y_code `snocOL`
               CMP (floatSize $ cmmExprWidth dflags x) y_op (OpReg x_reg)
        -- NB(1): we need to use the unsigned comparison operators on the
        -- result of this comparison.
    return (CondCode True (condToUnsigned cond) code)
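        -- NB(2): why unsigned?  ucomiss/ucomisd set only ZF, PF and CF
        -- (OF and SF are cleared), so the unsigned condition codes, which
        -- test CF and ZF, are the ones that line up with the comparison;
        -- the signed ones would read flags that carry no information here.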

-- -----------------------------------------------------------------------------
-- Generating assignments

-- Assignments are really at the heart of the whole code generation
-- business.  Almost all top-level nodes of any real importance are
-- assignments, which correspond to loads, stores, or register
-- transfers.  If we're really lucky, some of the register transfers
-- will go away, because we can use the destination register to
-- complete the code generation for the right hand side.  This only
-- fails when the right hand side is forced into a fixed register
-- (e.g. the result of a call).

assignMem_IntCode :: Size -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_IntCode :: Size -> CmmReg  -> CmmExpr -> NatM InstrBlock

assignMem_FltCode :: Size -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_FltCode :: Size -> CmmReg  -> CmmExpr -> NatM InstrBlock


-- integer assignment to memory

-- specific case of adding/subtracting an integer to a particular address.
-- ToDo: catch other cases where we can use an operation directly on a memory
-- address.
assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
                                         CmmLit (CmmInt i _)])
   | addr == addr2, pk /= II64 || is32BitInteger i,
     Just instr <- check op
   = do Amode amode code_addr <- getAmode addr
        let code = code_addr `snocOL`
                   instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
        return code
   where
        check (MO_Add _) = Just ADD
        check (MO_Sub _) = Just SUB
        check _          = Nothing
        -- ToDo: more?

-- general case
assignMem_IntCode pk addr src = do
    is32Bit <- is32BitPlatform
    Amode addr code_addr <- getAmode addr
    (code_src, op_src)   <- get_op_RI is32Bit src
    let
        code = code_src `appOL`
               code_addr `snocOL`
                  MOV pk op_src (OpAddr addr)
        -- NOTE: op_src is stable, so it will still be valid
        -- after code_addr.  This may involve the introduction
        -- of an extra MOV to a temporary register, but we hope
        -- the register allocator will get rid of it.
    --
    return code
  where
    get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand)   -- code, operator
    get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
      = return (nilOL, OpImm (litToImm lit))
    get_op_RI _ op
      = do (reg,code) <- getNonClobberedReg op
           return (code, OpReg reg)


-- Assign; dst is a reg, rhs is mem
assignReg_IntCode pk reg (CmmLoad src _) = do
  load_code <- intLoadCode (MOV pk) src
  dflags <- getDynFlags
  let platform = targetPlatform dflags
  return (load_code (getRegisterReg platform False{-no sse2-} reg))

-- dst is a reg, but src could be anything
assignReg_IntCode _ reg src = do
  dflags <- getDynFlags
  let platform = targetPlatform dflags
  code <- getAnyReg src
  return (code (getRegisterReg platform False{-no sse2-} reg))


-- Floating point assignment to memory
assignMem_FltCode pk addr src = do
  (src_reg, src_code) <- getNonClobberedReg src
  Amode addr addr_code <- getAmode addr
  use_sse2 <- sse2Enabled
  let
        code = src_code `appOL`
               addr_code `snocOL`
               if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr)
                           else GST pk src_reg addr
  return code

-- Floating point assignment to a register/temporary
assignReg_FltCode _ reg src = do
  use_sse2 <- sse2Enabled
  src_code <- getAnyReg src
  dflags <- getDynFlags
  let platform = targetPlatform dflags
  return (src_code (getRegisterReg platform use_sse2 reg))


genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock

genJump (CmmLoad mem _) regs = do
  Amode target code <- getAmode mem
  return (code `snocOL` JMP (OpAddr target) regs)

genJump (CmmLit lit) regs = do
  return (unitOL (JMP (OpImm (litToImm lit)) regs))

genJump expr regs = do
  (reg,code) <- getSomeReg expr
  return (code `snocOL` JMP (OpReg reg) regs)


-- -----------------------------------------------------------------------------
--  Unconditional branches

genBranch :: BlockId -> NatM InstrBlock
genBranch = return . toOL . mkJumpInstr



-- -----------------------------------------------------------------------------
--  Conditional jumps

{-
Conditional jumps are always to local labels, so we can use branch
instructions.  We peek at the arguments to decide what kind of
comparison to do.

I386: First, we have to ensure that the condition
codes are set according to the supplied comparison operation.
-}

genCondJump
    :: BlockId      -- the branch target
    -> CmmExpr      -- the condition on which to branch
    -> NatM InstrBlock

genCondJump id bool = do
  CondCode is_float cond cond_code <- getCondCode bool
  use_sse2 <- sse2Enabled
  if not is_float || not use_sse2
    then
        return (cond_code `snocOL` JXX cond id)
    else do
        lbl <- getBlockIdNat

        -- see comment with condFltReg
        let code = case cond of
                        NE  -> or_unordered
                        GU  -> plain_test
                        GEU -> plain_test
                        _   -> and_ordered

            plain_test = unitOL (
                  JXX cond id
                )
            or_unordered = toOL [
                  JXX cond id,
                  JXX PARITY id
                ]
            and_ordered = toOL [
                  JXX PARITY lbl,
                  JXX cond id,
                  JXX ALWAYS lbl,
                  NEWBLOCK lbl
                ]
        return (cond_code `appOL` code)
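
{-
Concretely, for a floating-point "x /= y" the or_unordered case gives

        jne  target
        jp   target        -- unordered (NaN) also counts as "not equal"

while the and_ordered case (equality and the ordered comparisons) gives

        jp   lbl           -- unordered: the comparison must fail
        je   target
        jmp  lbl
lbl:    ...
-}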


-- -----------------------------------------------------------------------------
--  Generating C calls

-- Now the biggest nightmare---calls.  Most of the nastiness is buried in
-- @get_arg@, which moves the arguments to the correct registers/stack
-- locations.  Apart from that, the code is easy.
--
-- (If applicable) Do not fill the delay slots here; you will confuse the
-- register allocator.

genCCall
    :: Bool                     -- 32 bit platform?
    -> ForeignTarget            -- function to call
    -> [CmmFormal]              -- where to put the result
    -> [CmmActual]              -- arguments (of mixed type)
    -> NatM InstrBlock

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

-- Unroll memcpy calls if the source and destination pointers are at
-- least DWORD aligned and the number of bytes to copy isn't too
-- large.  Otherwise, call C's memcpy.
genCCall is32Bit (PrimTarget MO_Memcpy) _
         [dst, src,
          (CmmLit (CmmInt n _)),
          (CmmLit (CmmInt align _))]
    | n <= maxInlineSizeThreshold && align .&. 3 == 0 = do
        code_dst <- getAnyReg dst
        dst_r <- getNewRegNat size
        code_src <- getAnyReg src
        src_r <- getNewRegNat size
        tmp_r <- getNewRegNat size
        return $ code_dst dst_r `appOL` code_src src_r `appOL`
                 go dst_r src_r tmp_r n
  where
    size = if align .&. 4 /= 0 then II32 else (archWordSize is32Bit)

    sizeBytes = fromIntegral (sizeInBytes size)

    go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
    go dst src tmp i
        | i >= sizeBytes =
            unitOL (MOV size (OpAddr src_addr) (OpReg tmp)) `appOL`
            unitOL (MOV size (OpReg tmp) (OpAddr dst_addr)) `appOL`
            go dst src tmp (i - sizeBytes)
        -- Deal with remaining bytes.
        | i >= 4 =  -- Will never happen on 32-bit
            unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
            unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
            go dst src tmp (i - 4)
        | i >= 2 =
            unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
            unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
            go dst src tmp (i - 2)
        | i >= 1 =
            unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
            unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
            go dst src tmp (i - 1)
        | otherwise = nilOL
      where
        src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
                   (ImmInteger (n - i))
        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
                   (ImmInteger (n - i))
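
-- For example, a 10-byte copy with 4-byte-aligned pointers unrolls to two
-- 32-bit load/store pairs followed by a single 16-bit pair: no call, no loop.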

genCCall _ (PrimTarget MO_Memset) _
         [dst,
          CmmLit (CmmInt c _),
          CmmLit (CmmInt n _),
          CmmLit (CmmInt align _)]
    | n <= maxInlineSizeThreshold && align .&. 3 == 0 = do
        code_dst <- getAnyReg dst
        dst_r <- getNewRegNat size
        return $ code_dst dst_r `appOL` go dst_r n
  where
    (size, val) = case align .&. 3 of
        2 -> (II16, c2)
        0 -> (II32, c4)
        _ -> (II8, c)
    c2 = c `shiftL` 8 .|. c
    c4 = c2 `shiftL` 16 .|. c2

    sizeBytes = fromIntegral (sizeInBytes size)

    go :: Reg -> Integer -> OrdList Instr
    go dst i
        -- TODO: Add movabs instruction and support 64-bit sets.
        | i >= sizeBytes =  -- This might be smaller than the below sizes
            unitOL (MOV size (OpImm (ImmInteger val)) (OpAddr dst_addr)) `appOL`
            go dst (i - sizeBytes)
        | i >= 4 =  -- Will never happen on 32-bit
            unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr dst_addr)) `appOL`
            go dst (i - 4)
        | i >= 2 =
            unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr dst_addr)) `appOL`
            go dst (i - 2)
        | i >= 1 =
            unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr dst_addr)) `appOL`
            go dst (i - 1)
        | otherwise = nilOL
      where
        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
                   (ImmInteger (n - i))

genCCall _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL
        -- write barrier compiles to no code on x86/x86-64;
        -- we keep it this long in order to prevent earlier optimisations.

genCCall _ (PrimTarget MO_Touch) _ _ = return nilOL

genCCall is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
         args@[src] = do
    sse4_2 <- sse4_2Enabled
    dflags <- getDynFlags
    let platform = targetPlatform dflags
    if sse4_2
        then do code_src <- getAnyReg src
                src_r <- getNewRegNat size
                return $ code_src src_r `appOL`
                    (if width == W8 then
                         -- The POPCNT instruction doesn't take a r/m8
                         unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
                         unitOL (POPCNT II16 (OpReg src_r)
                                 (getRegisterReg platform False (CmmLocal dst)))
                     else
                         unitOL (POPCNT size (OpReg src_r)
                                 (getRegisterReg platform False (CmmLocal dst))))
        else do
            targetExpr <- cmmMakeDynamicReference dflags addImportNat
                          CallReference lbl
            let target = ForeignTarget targetExpr (ForeignConvention CCallConv
                                                   [NoHint] [NoHint]
                                                   CmmMayReturn)
            genCCall is32Bit target dest_regs args
  where
    size = intSize width
    lbl = mkCmmCodeLabel primPackageId (fsLit (popCntLabel width))

genCCall is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
    dflags <- getDynFlags
    targetExpr <- cmmMakeDynamicReference dflags addImportNat
                  CallReference lbl
    let target = ForeignTarget targetExpr (ForeignConvention CCallConv
                                           [NoHint] [NoHint]
                                           CmmMayReturn)
    genCCall is32Bit target dest_regs args
  where
    lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width))

genCCall is32Bit target dest_regs args
    | is32Bit   = genCCall32 target dest_regs args
    | otherwise = genCCall64 target dest_regs args

genCCall32 :: ForeignTarget      -- function to call
           -> [CmmFormal]        -- where to put the result
           -> [CmmActual]        -- arguments (of mixed type)
           -> NatM InstrBlock
genCCall32 target dest_regs args = do
    dflags <- getDynFlags
    let platform = targetPlatform dflags
    case (target, dest_regs) of
        -- void return type prim op
        (PrimTarget op, []) ->
            outOfLineCmmOp op Nothing args
        -- we only cope with a single result for foreign calls
        (PrimTarget op, [r]) -> do
            l1 <- getNewLabelNat
            l2 <- getNewLabelNat
            sse2 <- sse2Enabled
            if sse2
              then
                outOfLineCmmOp op (Just r) args
              else case op of
                MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
                MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args

                MO_F32_Sin  -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF32 args
                MO_F64_Sin  -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF64 args

                MO_F32_Cos  -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF32 args
                MO_F64_Cos  -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF64 args

                MO_F32_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args
                MO_F64_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args

                _other_op   -> outOfLineCmmOp op (Just r) args

          where
           actuallyInlineFloatOp instr size [x]
               = do res <- trivialUFCode size (instr size) x
                    any <- anyReg res
                    return (any (getRegisterReg platform False (CmmLocal r)))

           actuallyInlineFloatOp _ _ args
               = panic $ "genCCall32.actuallyInlineFloatOp: bad number of arguments! ("
                         ++ show (length args) ++ ")"

        (PrimTarget (MO_S_QuotRem  width), _) -> divOp1 platform True  width dest_regs args
        (PrimTarget (MO_U_QuotRem  width), _) -> divOp1 platform False width dest_regs args
        (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
        (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
            case args of
              [arg_x, arg_y] ->
                  do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
                     lCode <- getAnyReg (CmmMachOp (MO_Add width) [arg_x, arg_y])
                     let size = intSize width
                         reg_l = getRegisterReg platform True (CmmLocal res_l)
                         reg_h = getRegisterReg platform True (CmmLocal res_h)
                         code = hCode reg_h `appOL`
                                lCode reg_l `snocOL`
                                ADC size (OpImm (ImmInteger 0)) (OpReg reg_h)
                     return code
              _ -> panic "genCCall32: Wrong number of arguments/results for add2"
        (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
            case args of
              [arg_x, arg_y] ->
                  do (y_reg, y_code) <- getRegOrMem arg_y
                     x_code <- getAnyReg arg_x
                     let size = intSize width
                         reg_h = getRegisterReg platform True (CmmLocal res_h)
                         reg_l = getRegisterReg platform True (CmmLocal res_l)
                         code = y_code `appOL`
                                x_code rax `appOL`
                                toOL [MUL2 size y_reg,
                                      MOV size (OpReg rdx) (OpReg reg_h),
                                      MOV size (OpReg rax) (OpReg reg_l)]
                     return code
              _ -> panic "genCCall32: Wrong number of arguments/results for mul2"

        _ -> genCCall32' dflags target dest_regs args

  where divOp1 platform signed width results [arg_x, arg_y]
            = divOp platform signed width results Nothing arg_x arg_y
        divOp1 _ _ _ _ _
            = panic "genCCall32: Wrong number of arguments for divOp1"
        divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
            = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
        divOp2 _ _ _ _ _
            = panic "genCCall32: Wrong number of arguments for divOp2"
        divOp platform signed width [res_q, res_r]
              m_arg_x_high arg_x_low arg_y
            = do let size = intSize width
                     reg_q = getRegisterReg platform True (CmmLocal res_q)
                     reg_r = getRegisterReg platform True (CmmLocal res_r)
                     widen | signed    = CLTD size
                           | otherwise = XOR size (OpReg rdx) (OpReg rdx)
                     instr | signed    = IDIV
                           | otherwise = DIV
                 (y_reg, y_code) <- getRegOrMem arg_y
                 x_low_code <- getAnyReg arg_x_low
                 x_high_code <- case m_arg_x_high of
                                  Just arg_x_high ->
                                      getAnyReg arg_x_high
                                  Nothing ->
                                      return $ const $ unitOL widen
                 return $ y_code `appOL`
                          x_low_code rax `appOL`
                          x_high_code rdx `appOL`
                          toOL [instr size y_reg,
                                MOV size (OpReg rax) (OpReg reg_q),
                                MOV size (OpReg rdx) (OpReg reg_r)]
        divOp _ _ _ _ _ _ _
            = panic "genCCall32: Wrong number of results for divOp"

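-- A sketch of what divOp emits for an unsigned W32 quotRem (illustrative;
-- the virtual registers are chosen by the allocator):
--
--     <load dividend into %eax>          -- x_low_code rax
--     XOR II32 (OpReg rdx) (OpReg rdx)   -- widen: zero the high half
--     DIV II32 y_reg                     -- %eax := quotient, %edx := remainder
--     MOV II32 (OpReg rax) (OpReg reg_q)
--     MOV II32 (OpReg rdx) (OpReg reg_r)
--
-- In the signed case `widen` is CLTD instead, which sign-extends %eax
-- into %edx before the IDIV.
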
genCCall32' :: DynFlags
            -> ForeignTarget      -- function to call
            -> [CmmFormal]        -- where to put the result
            -> [CmmActual]        -- arguments (of mixed type)
            -> NatM InstrBlock
genCCall32' dflags target dest_regs args = do
        let
            -- Align stack to 16n for calls, assuming a starting stack
            -- alignment of 16n - word_size on procedure entry. Which we
            -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
            sizes        = map (arg_size . cmmExprType dflags) (reverse args)
            raw_arg_size = sum sizes + wORD_SIZE dflags
            arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
            tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags
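            -- Worked example of the padding arithmetic (illustrative): with
            -- a 4-byte word, pushing a single Int32 gives sum sizes = 4,
            -- raw_arg_size = 8, arg_pad_size = 16 - 8 = 8, tot_arg_size = 12,
            -- so %esp is 16-byte aligned at the CALL; pushing an Int32 and
            -- a Double gives raw_arg_size = 16, no padding, and again
            -- tot_arg_size = 12.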
        delta0 <- getDeltaNat
        setDeltaNat (delta0 - arg_pad_size)

        use_sse2 <- sse2Enabled
        push_codes <- mapM (push_arg use_sse2) (reverse args)
        delta <- getDeltaNat
        MASSERT (delta == delta0 - tot_arg_size)

        -- deal with static vs dynamic call targets
        (callinsns,cconv) <-
          case target of
            ForeignTarget (CmmLit (CmmLabel lbl)) conv
               -> -- ToDo: stdcall arg sizes
                  return (unitOL (CALL (Left fn_imm) []), conv)
               where fn_imm = ImmCLbl lbl
            ForeignTarget expr conv
               -> do { (dyn_r, dyn_c) <- getSomeReg expr
                     ; ASSERT( isWord32 (cmmExprType dflags expr) )
                       return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
            PrimTarget _
                -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
                           ++ "probably because too many return values."

        let push_code
                | arg_pad_size /= 0
                = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
                        DELTA (delta0 - arg_pad_size)]
                  `appOL` concatOL push_codes
                | otherwise
                = concatOL push_codes

            -- Deallocate parameters after call for ccall;
            -- but not for stdcall (callee does it)
            --
            -- We have to pop any stack padding we added
            -- even if we are doing stdcall, though (#5052)
            pop_size
                | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
                | otherwise = tot_arg_size

            call = callinsns `appOL`
                   toOL (
                      (if pop_size==0 then [] else
                       [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
                      ++
                      [DELTA delta0]
                   )
        setDeltaNat delta0

        dflags <- getDynFlags
        let platform = targetPlatform dflags

        let
            -- assign the results, if necessary
            assign_code [] = nilOL
            assign_code [dest]
                | isFloatType ty =
                    if use_sse2
                       then let tmp_amode = AddrBaseIndex (EABaseReg esp)
                                                          EAIndexNone
                                                          (ImmInt 0)
                                sz = floatSize w
                            in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
                                      DELTA (delta0 - b),
                                      GST sz fake0 tmp_amode,
                                      MOV sz (OpAddr tmp_amode) (OpReg r_dest),
                                      ADD II32 (OpImm (ImmInt b)) (OpReg esp),
                                      DELTA delta0]
                       else unitOL (GMOV fake0 r_dest)
                | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
                                      MOV II32 (OpReg edx) (OpReg r_dest_hi)]
                | otherwise   = unitOL (MOV (intSize w) (OpReg eax) (OpReg r_dest))
                where
                      ty = localRegType dest
                      w  = typeWidth ty
                      b  = widthInBytes w
                      r_dest_hi = getHiVRegFromLo r_dest
                      r_dest    = getRegisterReg platform use_sse2 (CmmLocal dest)
            assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)

        return (push_code `appOL`
                call `appOL`
                assign_code dest_regs)

      where
        arg_size :: CmmType -> Int  -- Width in bytes
        arg_size ty = widthInBytes (typeWidth ty)

        roundTo a x | x `mod` a == 0 = x
                    | otherwise = x + a - (x `mod` a)

        push_arg :: Bool -> CmmActual {-current argument-}
                 -> NatM InstrBlock  -- code

        push_arg use_sse2 arg -- we don't need the hints on x86
            | isWord64 arg_ty = do
                ChildCode64 code r_lo <- iselExpr64 arg
                delta <- getDeltaNat
                setDeltaNat (delta - 8)
                let
                    r_hi = getHiVRegFromLo r_lo
                return (code `appOL`
                        toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
                              PUSH II32 (OpReg r_lo), DELTA (delta - 8),
                              DELTA (delta-8)]
                       )

            | isFloatType arg_ty = do
                (reg, code) <- getSomeReg arg
                delta <- getDeltaNat
                setDeltaNat (delta-size)
                return (code `appOL`
                        toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
                              DELTA (delta-size),
                              let addr = AddrBaseIndex (EABaseReg esp)
                                                       EAIndexNone
                                                       (ImmInt 0)
                                  size = floatSize (typeWidth arg_ty)
                              in
                              if use_sse2
                                 then MOV size (OpReg reg) (OpAddr addr)
                                 else GST size reg addr
                             ]
                       )

            | otherwise = do
                (operand, code) <- getOperand arg
                delta <- getDeltaNat
                setDeltaNat (delta-size)
                return (code `snocOL`
                        PUSH II32 operand `snocOL`
                        DELTA (delta-size))

            where
                arg_ty = cmmExprType dflags arg
                size = arg_size arg_ty -- Byte size

genCCall64 :: ForeignTarget      -- function to call
           -> [CmmFormal]        -- where to put the result
           -> [CmmActual]        -- arguments (of mixed type)
           -> NatM InstrBlock
genCCall64 target dest_regs args = do
    dflags <- getDynFlags
    let platform = targetPlatform dflags
    case (target, dest_regs) of

        (PrimTarget op, []) ->
            -- void return type prim op
            outOfLineCmmOp op Nothing args

        (PrimTarget op, [res]) ->
            -- we only cope with a single result for foreign calls
            outOfLineCmmOp op (Just res) args

        (PrimTarget (MO_S_QuotRem  width), _) -> divOp1 platform True  width dest_regs args
        (PrimTarget (MO_U_QuotRem  width), _) -> divOp1 platform False width dest_regs args
        (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
        (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
            case args of
              [arg_x, arg_y] ->
                  do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
                     lCode <- getAnyReg (CmmMachOp (MO_Add width) [arg_x, arg_y])
                     let size = intSize width
                         reg_l = getRegisterReg platform True (CmmLocal res_l)
                         reg_h = getRegisterReg platform True (CmmLocal res_h)
                         code = hCode reg_h `appOL`
                                lCode reg_l `snocOL`
                                ADC size (OpImm (ImmInteger 0)) (OpReg reg_h)
                     return code
              _ -> panic "genCCall64: Wrong number of arguments/results for add2"
        (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
            case args of
              [arg_x, arg_y] ->
                  do (y_reg, y_code) <- getRegOrMem arg_y
                     x_code <- getAnyReg arg_x
                     let size = intSize width
                         reg_h = getRegisterReg platform True (CmmLocal res_h)
                         reg_l = getRegisterReg platform True (CmmLocal res_l)
                         code = y_code `appOL`
                                x_code rax `appOL`
                                toOL [MUL2 size y_reg,
                                      MOV size (OpReg rdx) (OpReg reg_h),
                                      MOV size (OpReg rax) (OpReg reg_l)]
                     return code
              _ -> panic "genCCall64: Wrong number of arguments/results for mul2"

        _ ->
            do dflags <- getDynFlags
               genCCall64' dflags target dest_regs args

  where divOp1 platform signed width results [arg_x, arg_y]
            = divOp platform signed width results Nothing arg_x arg_y
        divOp1 _ _ _ _ _
            = panic "genCCall64: Wrong number of arguments for divOp1"
        divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
            = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
        divOp2 _ _ _ _ _
            = panic "genCCall64: Wrong number of arguments for divOp2"
        divOp platform signed width [res_q, res_r]
              m_arg_x_high arg_x_low arg_y
            = do let size = intSize width
                     reg_q = getRegisterReg platform True (CmmLocal res_q)
                     reg_r = getRegisterReg platform True (CmmLocal res_r)
                     widen | signed    = CLTD size
                           | otherwise = XOR size (OpReg rdx) (OpReg rdx)
                     instr | signed    = IDIV
                           | otherwise = DIV
                 (y_reg, y_code) <- getRegOrMem arg_y
                 x_low_code <- getAnyReg arg_x_low
                 x_high_code <- case m_arg_x_high of
                                  Just arg_x_high -> getAnyReg arg_x_high
                                  Nothing         -> return $ const $ unitOL widen
                 return $ y_code `appOL`
                          x_low_code rax `appOL`
                          x_high_code rdx `appOL`
                          toOL [instr size y_reg,
                                MOV size (OpReg rax) (OpReg reg_q),
                                MOV size (OpReg rdx) (OpReg reg_r)]
        divOp _ _ _ _ _ _ _
            = panic "genCCall64: Wrong number of results for divOp"

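-- Note on the MO_Add2 cases above: the generated code relies on flag
-- state.  res_l is computed with an ADD that sets the carry flag, and the
-- ADC $0 then folds that carry into res_h, which was preloaded with 0.
-- For example (W64, illustrative): (2^64 - 1) + 1 sets the carry, giving
-- res_l = 0 and res_h = 1.
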
genCCall64' :: DynFlags
            -> ForeignTarget      -- function to call
            -> [CmmFormal]        -- where to put the result
            -> [CmmActual]        -- arguments (of mixed type)
            -> NatM InstrBlock
genCCall64' dflags target dest_regs args = do
    -- load up the register arguments
    (stack_args, int_regs_used, fp_regs_used, load_args_code)
         <- if platformOS platform == OSMinGW32
            then load_args_win args [] [] (allArgRegs platform) nilOL
            else do (stack_args, aregs, fregs, load_args_code)
                        <- load_args args (allIntArgRegs platform) (allFPArgRegs platform) nilOL
                    let fp_regs_used  = reverse (drop (length fregs) (reverse (allFPArgRegs platform)))
                        int_regs_used = reverse (drop (length aregs) (reverse (allIntArgRegs platform)))
                    return (stack_args, int_regs_used, fp_regs_used, load_args_code)

    let
        arg_regs_used = int_regs_used ++ fp_regs_used
        arg_regs = [eax] ++ arg_regs_used
                -- for annotating the call instruction
        sse_regs = length fp_regs_used
        arg_stack_slots = if platformOS platform == OSMinGW32
                          then length stack_args + length (allArgRegs platform)
                          else length stack_args
        tot_arg_size = arg_size * arg_stack_slots

    -- Align stack to 16n for calls, assuming a starting stack
    -- alignment of 16n - word_size on procedure entry. Which we
    -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
    (real_size, adjust_rsp) <-
        if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0
            then return (tot_arg_size, nilOL)
            else do -- we need to adjust...
                delta <- getDeltaNat
                setDeltaNat (delta - wORD_SIZE dflags)
                return (tot_arg_size + wORD_SIZE dflags, toOL [
                          SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp),
                          DELTA (delta - wORD_SIZE dflags) ])

    -- push the stack args, right to left
    push_code <- push_args (reverse stack_args) nilOL
    -- On Win64, we also have to leave stack space for the arguments
    -- that we are passing in registers
    lss_code <- if platformOS platform == OSMinGW32
                then leaveStackSpace (length (allArgRegs platform))
                else return nilOL
    delta <- getDeltaNat

    -- deal with static vs dynamic call targets
    (callinsns,_cconv) <-
      case target of
        ForeignTarget (CmmLit (CmmLabel lbl)) conv
           -> -- ToDo: stdcall arg sizes
              return (unitOL (CALL (Left fn_imm) arg_regs), conv)
           where fn_imm = ImmCLbl lbl
        ForeignTarget expr conv
           -> do (dyn_r, dyn_c) <- getSomeReg expr
                 return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
        PrimTarget _
            -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
                       ++ "probably because too many return values."

    let
        -- The x86_64 ABI requires us to set %al to the number of SSE2
        -- registers that contain arguments, if the called routine
        -- is a varargs function.  We don't know whether it's a
        -- varargs function or not, so we have to assume it is.
        --
        -- It's not safe to omit this assignment, even if the number
        -- of SSE2 regs in use is zero.  If %al is larger than 8
        -- on entry to a varargs function, seg faults ensue.
        assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
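
        -- For example (illustrative), a call corresponding to
        -- printf("%f\n", x), with one double passed in %xmm0, would be
        -- preceded by assign_eax 1, i.e.
        --
        --     MOV II32 (OpImm (ImmInt 1)) (OpReg eax)    -- movl $1, %eax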

    let call = callinsns `appOL`
               toOL (
                  -- Deallocate parameters after call for ccall;
                  -- stdcall has callee do it, but is not supported on
                  -- x86_64 target (see #3336)
                  (if real_size==0 then [] else
                   [ADD (intSize (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)])
                  ++
                  [DELTA (delta + real_size)]
               )
    setDeltaNat (delta + real_size)

    let
        -- assign the results, if necessary
        assign_code [] = nilOL
        assign_code [dest] =
          case typeWidth rep of
            W32 | isFloatType rep -> unitOL (MOV (floatSize W32) (OpReg xmm0) (OpReg r_dest))
            W64 | isFloatType rep -> unitOL (MOV (floatSize W64) (OpReg xmm0) (OpReg r_dest))
            _ -> unitOL (MOV (cmmTypeSize rep) (OpReg rax) (OpReg r_dest))
          where
            rep = localRegType dest
            r_dest = getRegisterReg platform True (CmmLocal dest)
        assign_code _many = panic "genCCall.assign_code many"

    return (load_args_code      `appOL`
            adjust_rsp          `appOL`
            push_code           `appOL`
            lss_code            `appOL`
            assign_eax sse_regs `appOL`
            call                `appOL`
            assign_code dest_regs)

  where platform = targetPlatform dflags
        arg_size = 8 -- always, at the mo

        load_args :: [CmmExpr]
                  -> [Reg]          -- int regs avail for args
                  -> [Reg]          -- FP regs avail for args
                  -> InstrBlock
                  -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock)
        load_args args [] [] code     = return (args, [], [], code)
            -- no more regs to use
        load_args [] aregs fregs code = return ([], aregs, fregs, code)
            -- no more args to push
        load_args (arg : rest) aregs fregs code
            | isFloatType arg_rep =
                case fregs of
                  []     -> push_this_arg
                  (r:rs) -> do
                     arg_code <- getAnyReg arg
                     load_args rest aregs rs (code `appOL` arg_code r)
            | otherwise =
                case aregs of
                  []     -> push_this_arg
                  (r:rs) -> do
                     arg_code <- getAnyReg arg
                     load_args rest rs fregs (code `appOL` arg_code r)
            where
              arg_rep = cmmExprType dflags arg

              push_this_arg = do
                (args',ars,frs,code') <- load_args rest aregs fregs code
                return (arg:args', ars, frs, code')

        load_args_win :: [CmmExpr]
                      -> [Reg]        -- used int regs
                      -> [Reg]        -- used FP regs
                      -> [(Reg, Reg)] -- (int, FP) regs avail for args
                      -> InstrBlock
                      -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock)
        load_args_win args usedInt usedFP [] code
            = return (args, usedInt, usedFP, code)
            -- no more regs to use
        load_args_win [] usedInt usedFP _ code
            = return ([], usedInt, usedFP, code)
            -- no more args to push
        load_args_win (arg : rest) usedInt usedFP
                      ((ireg, freg) : regs) code
            | isFloatType arg_rep = do
                arg_code <- getAnyReg arg
                load_args_win rest (ireg : usedInt) (freg : usedFP) regs
                              (code `appOL`
                               arg_code freg `snocOL`
                               -- If we are calling a varargs function
                               -- then we need to define ireg as well
                               -- as freg
                               MOV II64 (OpReg freg) (OpReg ireg))
            | otherwise = do
                arg_code <- getAnyReg arg
                load_args_win rest (ireg : usedInt) usedFP regs
                              (code `appOL` arg_code ireg)
            where
              arg_rep = cmmExprType dflags arg

        push_args [] code = return code
        push_args (arg:rest) code
            | isFloatType arg_rep = do
                (arg_reg, arg_code) <- getSomeReg arg
                delta <- getDeltaNat
                setDeltaNat (delta-arg_size)
                let code' = code `appOL` arg_code `appOL` toOL [
                              SUB (intSize (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp),
                              DELTA (delta-arg_size),
                              MOV (floatSize width) (OpReg arg_reg) (OpAddr (spRel dflags 0))]
                push_args rest code'

            | otherwise = do
                -- we only ever generate word-sized function arguments.  Promotion
                -- has already happened: our Int8# type is kept sign-extended
                -- in an Int#, for example.
                ASSERT(width == W64) return ()
                (arg_op, arg_code) <- getOperand arg
                delta <- getDeltaNat
                setDeltaNat (delta-arg_size)
                let code' = code `appOL` arg_code `appOL` toOL [
                              PUSH II64 arg_op,
                              DELTA (delta-arg_size)]
                push_args rest code'
            where
              arg_rep = cmmExprType dflags arg
              width = typeWidth arg_rep

        leaveStackSpace n = do
            delta <- getDeltaNat
            setDeltaNat (delta - n * arg_size)
            return $ toOL [
                       SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp),
                       DELTA (delta - n * arg_size)]
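
        -- On Win64 the register-passed arguments also get "shadow space"
        -- on the stack: allArgRegs there is four (int, FP) register pairs
        -- (rcx/xmm0, rdx/xmm1, r8/xmm2, r9/xmm3 -- see X86.Regs, an
        -- assumption for this sketch), so leaveStackSpace 4 reserves
        -- 4 * 8 = 32 bytes before the call, as the Win64 ABI requires.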

-- | We're willing to inline and unroll memcpy/memset calls that touch
-- at most this many bytes.  This threshold is the same as the one
-- used by GCC and LLVM.
maxInlineSizeThreshold :: Integer
maxInlineSizeThreshold = 128

outOfLineCmmOp :: CallishMachOp -> Maybe CmmFormal -> [CmmActual] -> NatM InstrBlock
outOfLineCmmOp mop res args
  = do
      dflags <- getDynFlags
      targetExpr <- cmmMakeDynamicReference dflags addImportNat CallReference lbl
      let target = ForeignTarget targetExpr
                       (ForeignConvention CCallConv [] [] CmmMayReturn)

      stmtToInstrs (CmmUnsafeForeignCall target (catMaybes [res]) args')
  where
      -- Assume we can call these functions directly, and that they're not in a dynamic library.
      -- TODO: Why is this ok?  Under linux this code will be in libm.so
      --       Is it because they're really implemented as a primitive instruction by the assembler??  -- BL 2009/12/31
      lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction

      args' = case mop of
                MO_Memcpy  -> init args
                MO_Memset  -> init args
                MO_Memmove -> init args
                _          -> args
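          -- (the mem ops drop their trailing alignment argument here,
          -- since C's memcpy/memset/memmove do not take one)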

      fn = case mop of
             MO_F32_Sqrt  -> fsLit "sqrtf"
             MO_F32_Sin   -> fsLit "sinf"
             MO_F32_Cos   -> fsLit "cosf"
             MO_F32_Tan   -> fsLit "tanf"
             MO_F32_Exp   -> fsLit "expf"
             MO_F32_Log   -> fsLit "logf"

             MO_F32_Asin  -> fsLit "asinf"
             MO_F32_Acos  -> fsLit "acosf"
             MO_F32_Atan  -> fsLit "atanf"

             MO_F32_Sinh  -> fsLit "sinhf"
             MO_F32_Cosh  -> fsLit "coshf"
             MO_F32_Tanh  -> fsLit "tanhf"
             MO_F32_Pwr   -> fsLit "powf"

             MO_F64_Sqrt  -> fsLit "sqrt"
             MO_F64_Sin   -> fsLit "sin"
             MO_F64_Cos   -> fsLit "cos"
             MO_F64_Tan   -> fsLit "tan"
             MO_F64_Exp   -> fsLit "exp"
             MO_F64_Log   -> fsLit "log"

             MO_F64_Asin  -> fsLit "asin"
             MO_F64_Acos  -> fsLit "acos"
             MO_F64_Atan  -> fsLit "atan"

             MO_F64_Sinh  -> fsLit "sinh"
             MO_F64_Cosh  -> fsLit "cosh"
             MO_F64_Tanh  -> fsLit "tanh"
             MO_F64_Pwr   -> fsLit "pow"

             MO_Memcpy    -> fsLit "memcpy"
             MO_Memset    -> fsLit "memset"
             MO_Memmove   -> fsLit "memmove"

             MO_PopCnt _  -> fsLit "popcnt"

             MO_UF_Conv _ -> unsupported

             MO_S_QuotRem {}  -> unsupported
             MO_U_QuotRem {}  -> unsupported
             MO_U_QuotRem2 {} -> unsupported
             MO_Add2 {}       -> unsupported
             MO_U_Mul2 {}     -> unsupported
             MO_WriteBarrier  -> unsupported
             MO_Touch         -> unsupported

      unsupported = panic ("outOfLineCmmOp: " ++ show mop
                           ++ " not supported here")

-- -----------------------------------------------------------------------------
-- Generating a table-branch

genSwitch :: DynFlags -> CmmExpr -> [Maybe BlockId] -> NatM InstrBlock

genSwitch dflags expr ids
  | gopt Opt_PIC dflags
  = do
        (reg,e_code) <- getSomeReg expr
        lbl <- getNewLabelNat
        dflags <- getDynFlags
        dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
        (tableReg,t_code) <- getSomeReg $ dynRef
        let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
                                       (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0))

        return $ if target32Bit (targetPlatform dflags)
                 then e_code `appOL` t_code `appOL` toOL [
                          ADD (intSize (wordWidth dflags)) op (OpReg tableReg),
                          JMP_TBL (OpReg tableReg) ids ReadOnlyData lbl
                      ]
                 else case platformOS (targetPlatform dflags) of
                      OSDarwin ->
                          -- on Mac OS X/x86_64, put the jump table
                          -- in the text section to work around a
                          -- limitation of the linker.
                          -- ld64 is unable to handle the relocations for
                          --     .quad L1 - L0
                          -- if L0 is not preceded by a non-anonymous
                          -- label in its section.
                          e_code `appOL` t_code `appOL` toOL [
                              ADD (intSize (wordWidth dflags)) op (OpReg tableReg),
                              JMP_TBL (OpReg tableReg) ids Text lbl
                          ]
                      _ ->
                          -- HACK: On x86_64 binutils<2.17 is only able
                          -- to generate PC32 relocations, hence we only
                          -- get 32-bit offsets in the jump table. As
                          -- these offsets are always negative we need
                          -- to properly sign extend them to 64-bit.
                          -- This hack should be removed in conjunction
                          -- with the hack in PprMach.hs/pprDataItem
                          -- once binutils 2.17 is standard.
                          e_code `appOL` t_code `appOL` toOL [
                              MOVSxL II32 op (OpReg reg),
                              ADD (intSize (wordWidth dflags)) (OpReg reg) (OpReg tableReg),
                              JMP_TBL (OpReg tableReg) ids ReadOnlyData lbl
                          ]
  | otherwise
  = do
        (reg,e_code) <- getSomeReg expr
        lbl <- getNewLabelNat
        let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl))
            code = e_code `appOL` toOL [
                       JMP_TBL op ids ReadOnlyData lbl
                   ]
        return code

generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, CmmStatics) Instr)
generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
    = Just (createJumpTable dflags ids section lbl)
generateJumpTableForInstr _ _ = Nothing

createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
                -> GenCmmDecl (Alignment, CmmStatics) h g
createJumpTable dflags ids section lbl
    = let jumpTable
            | gopt Opt_PIC dflags =
                  let jumpTableEntryRel Nothing
                          = CmmStaticLit (CmmInt 0 (wordWidth dflags))
                      jumpTableEntryRel (Just blockid)
                          = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0)
                          where blockLabel = mkAsmTempLabel (getUnique blockid)
                  in map jumpTableEntryRel ids
            | otherwise = map (jumpTableEntry dflags) ids
      in CmmData section (1, Statics lbl jumpTable)
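
-- For example (illustrative), a two-way switch with targets L1 and L2
-- produces, without -fPIC, a table of absolute addresses,
--
--     .quad L1
--     .quad L2
--
-- whereas with -fPIC each entry is the table-relative offset
-- CmmLabelDiffOff blockLabel lbl 0, roughly ".quad L1 - <table>", which
-- needs no load-time relocation.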

-- -----------------------------------------------------------------------------
-- 'condIntReg' and 'condFltReg': condition codes into registers

-- Turn those condition codes into integers now (when they appear on
-- the right hand side of an assignment).
--
-- (If applicable) Do not fill the delay slots here; you will confuse the
-- register allocator.

condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register

condIntReg cond x y = do
  CondCode _ cond cond_code <- condIntCode cond x y
  tmp <- getNewRegNat II8
  let
        code dst = cond_code `appOL` toOL [
                       SETCC cond (OpReg tmp),
                       MOVZxL II8 (OpReg tmp) (OpReg dst)
                   ]
  return (Any II32 code)


condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
 where
  condFltReg_x87 = do
    CondCode _ cond cond_code <- condFltCode cond x y
    tmp <- getNewRegNat II8
    let
        code dst = cond_code `appOL` toOL [
                       SETCC cond (OpReg tmp),
                       MOVZxL II8 (OpReg tmp) (OpReg dst)
                   ]
    return (Any II32 code)

  condFltReg_sse2 = do
    CondCode _ cond cond_code <- condFltCode cond x y
    tmp1 <- getNewRegNat (archWordSize is32Bit)
    tmp2 <- getNewRegNat (archWordSize is32Bit)
    let
        -- We have to worry about unordered operands (eg. comparisons
        -- against NaN).  If the operands are unordered, the comparison
        -- sets the parity flag, carry flag and zero flag.
        -- All comparisons are supposed to return false for unordered
        -- operands except for !=, which returns true.
        --
        -- Optimisation: we don't have to test the parity flag if we
        -- know the test has already excluded the unordered case: eg >
        -- and >= test for a zero carry flag, which can only occur for
        -- ordered operands.
        --
        -- ToDo: by reversing comparisons we could avoid testing the
        -- parity flag in more cases.
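        -- Example (illustrative): for cond = NE on operands involving a
        -- NaN, the unordered comparison sets ZF and PF, so SETNE alone
        -- would give False; or_unordered below ORs in the SETP result,
        -- making NaN /= x come out True, while and_ordered masks the
        -- unordered case off for the ordered comparisons.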

        code dst =
            cond_code `appOL`
              (case cond of
                 NE  -> or_unordered dst
                 GU  -> plain_test dst
                 GEU -> plain_test dst
                 _   -> and_ordered dst)

        plain_test dst = toOL [
                    SETCC cond (OpReg tmp1),
                    MOVZxL II8 (OpReg tmp1) (OpReg dst)
                 ]
        or_unordered dst = toOL [
                    SETCC cond (OpReg tmp1),
                    SETCC PARITY (OpReg tmp2),
                    OR II8 (OpReg tmp1) (OpReg tmp2),
                    MOVZxL II8 (OpReg tmp2) (OpReg dst)
                 ]
        and_ordered dst = toOL [
                    SETCC cond (OpReg tmp1),
                    SETCC NOTPARITY (OpReg tmp2),
                    AND II8 (OpReg tmp1) (OpReg tmp2),
                    MOVZxL II8 (OpReg tmp2) (OpReg dst)
                 ]
    return (Any II32 code)

-- -----------------------------------------------------------------------------
-- 'trivial*Code': deal with trivial instructions

-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
-- unary: 'trivialUCode', unary fl-pt: 'trivialUFCode') instructions.
-- Only look for constants on the right hand side, because that's
-- where the generic optimizer will have put them.

-- Similarly, for unary instructions, we don't have to worry about
-- matching an StInt as the argument, because genericOpt will already
-- have handled the constant-folding.


{-
The Rules of the Game are:

* You cannot assume anything about the destination register dst;
  it may be anything, including a fixed reg.

* You may compute an operand into a fixed reg, but you may not
  subsequently change the contents of that fixed reg.  If you
  want to do so, first copy the value either to a temporary
  or into dst.  You are free to modify dst even if it happens
  to be a fixed reg -- that's not your problem.

* You cannot assume that a fixed reg will stay live over an
  arbitrary computation.  The same applies to the dst reg.

* Temporary regs obtained from getNewRegNat are distinct from
  each other and from all other regs, and stay live over
  arbitrary computations.

--------------------

SDM's version of The Rules:

* If getRegister returns Any, that means it can generate correct
  code which places the result in any register, period.  Even if that
  register happens to be read during the computation.

  Corollary #1: this means that if you are generating code for an
  operation with two arbitrary operands, you cannot assign the result
  of the first operand into the destination register before computing
  the second operand.  The second operand might require the old value
  of the destination register.

  Corollary #2: A function might be able to generate more efficient
  code if it knows the destination register is a new temporary (and
  therefore not read by any of the sub-computations).

* If getRegister returns Any, then the code it generates may modify only:
        (a) fresh temporaries
        (b) the destination register
        (c) known registers (eg. %ecx is used by shifts)
  In particular, it may *not* modify global registers, unless the global
  register happens to be the destination register.
-}

trivialCode :: Width -> (Operand -> Operand -> Instr)
            -> Maybe (Operand -> Operand -> Instr)
            -> CmmExpr -> CmmExpr -> NatM Register
trivialCode width instr m a b
    = do is32Bit <- is32BitPlatform
         trivialCode' is32Bit width instr m a b

trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
             -> Maybe (Operand -> Operand -> Instr)
             -> CmmExpr -> CmmExpr -> NatM Register
trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
  | is32BitLit is32Bit lit_a = do
    b_code <- getAnyReg b
    let
        code dst
            = b_code dst `snocOL`
              revinstr (OpImm (litToImm lit_a)) (OpReg dst)
    return (Any (intSize width) code)

trivialCode' _ width instr _ a b
    = genTrivialCode (intSize width) instr a b

-- This is re-used for floating pt instructions too.
genTrivialCode :: Size -> (Operand -> Operand -> Instr)
               -> CmmExpr -> CmmExpr -> NatM Register
genTrivialCode rep instr a b = do
    (b_op, b_code) <- getNonClobberedOperand b
    a_code <- getAnyReg a
    tmp <- getNewRegNat rep
    let
        -- We want the value of b to stay alive across the computation of a.
        -- But, we want to calculate a straight into the destination register,
        -- because the instruction only has two operands (dst := dst `op` src).
        -- The troublesome case is when the result of b is in the same register
        -- as the destination reg.  In this case, we have to save b in a
        -- new temporary across the computation of a.
        code dst
            | dst `regClashesWithOp` b_op =
                b_code `appOL`
                unitOL (MOV rep b_op (OpReg tmp)) `appOL`
                a_code dst `snocOL`
                instr (OpReg tmp) (OpReg dst)
            | otherwise =
                b_code `appOL`
                a_code dst `snocOL`
                instr b_op (OpReg dst)
    return (Any rep code)

regClashesWithOp :: Reg -> Operand -> Bool
reg `regClashesWithOp` OpReg reg2   = reg == reg2
reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
_   `regClashesWithOp` _            = False
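
-- Example of the clash genTrivialCode guards against (illustrative):
-- if the operand for b is OpReg r and the register allocator also hands
-- us r as the destination, then computing a straight into dst would
-- overwrite b's value; hence b is first saved into the fresh tmp register.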

-----------

trivialUCode :: Size -> (Operand -> Instr)
             -> CmmExpr -> NatM Register
trivialUCode rep instr x = do
    x_code <- getAnyReg x
    let
        code dst =
            x_code dst `snocOL`
            instr (OpReg dst)
    return (Any rep code)

-----------

trivialFCode_x87 :: (Size -> Reg -> Reg -> Reg -> Instr)
                 -> CmmExpr -> CmmExpr -> NatM Register
trivialFCode_x87 instr x y = do
    (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
    (y_reg, y_code) <- getSomeReg y
    let
        size = FF80 -- always, on x87
        code dst =
            x_code `appOL`
            y_code `snocOL`
            instr size x_reg y_reg dst
    return (Any size code)

trivialFCode_sse2 :: Width -> (Size -> Operand -> Operand -> Instr)
                  -> CmmExpr -> CmmExpr -> NatM Register
trivialFCode_sse2 pk instr x y
    = genTrivialCode size (instr size) x y
  where size = floatSize pk


trivialUFCode :: Size -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
trivialUFCode size instr x = do
    (x_reg, x_code) <- getSomeReg x
    let
        code dst =
            x_code `snocOL`
            instr x_reg dst
    return (Any size code)

--------------------------------------------------------------------------------
coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
  where
    coerce_x87 = do
        (x_reg, x_code) <- getSomeReg x
        let
            opc = case to of W32 -> GITOF; W64 -> GITOD;
                             n -> panic $ "coerceInt2FP.x87: unhandled width ("
                                          ++ show n ++ ")"
            code dst = x_code `snocOL` opc x_reg dst
                -- ToDo: works for non-II32 reps?
        return (Any FF80 code)

    coerce_sse2 = do
        (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
        let
            opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
                             n -> panic $ "coerceInt2FP.sse: unhandled width ("
                                          ++ show n ++ ")"
            code dst = x_code `snocOL` opc (intSize from) x_op dst
        return (Any (floatSize to) code)
            -- works even if the destination rep is <II32

--------------------------------------------------------------------------------
coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
  where
    coerceFP2Int_x87 = do
        (x_reg, x_code) <- getSomeReg x
        let
            opc = case from of W32 -> GFTOI; W64 -> GDTOI
                               n -> panic $ "coerceFP2Int.x87: unhandled width ("
                                            ++ show n ++ ")"
            code dst = x_code `snocOL` opc x_reg dst
                -- ToDo: works for non-II32 reps?
        return (Any (intSize to) code)

    coerceFP2Int_sse2 = do
        (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
        let
            opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
                               n -> panic $ "coerceFP2Int.sse: unhandled width ("
                                            ++ show n ++ ")"
            code dst = x_code `snocOL` opc (intSize to) x_op dst
        return (Any (intSize to) code)
            -- works even if the destination rep is <II32


--------------------------------------------------------------------------------
coerceFP2FP :: Width -> CmmExpr -> NatM Register
coerceFP2FP to x = do
    use_sse2 <- sse2Enabled
    (x_reg, x_code) <- getSomeReg x
    let
        opc | use_sse2  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
                                     n -> panic $ "coerceFP2FP: unhandled width ("
                                                  ++ show n ++ ")"
            | otherwise = GDTOF
        code dst = x_code `snocOL` opc x_reg dst
    return (Any (if use_sse2 then floatSize to else FF80) code)

--------------------------------------------------------------------------------

sse2NegCode :: Width -> CmmExpr -> NatM Register
sse2NegCode w x = do
    let sz = floatSize w
    x_code <- getAnyReg x
    -- This is how gcc does it, so it can't be that bad:
    let
        const | FF32 <- sz = CmmInt 0x80000000 W32
              | otherwise  = CmmInt 0x8000000000000000 W64
    Amode amode amode_code <- memConstant (widthInBytes w) const
    tmp <- getNewRegNat sz
    let
        code dst = x_code dst `appOL` amode_code `appOL` toOL [
                       MOV sz (OpAddr amode) (OpReg tmp),
                       XOR sz (OpReg tmp) (OpReg dst)
                   ]
    return (Any sz code)

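-- A sketch of the sequence this produces for negating a double
-- (illustrative; the registers are virtual):
--
--     MOV FF64 (OpAddr <mask>) (OpReg tmp)   -- tmp := 0x8000000000000000
--     XOR FF64 (OpReg tmp) (OpReg dst)       -- flip dst's sign bit
--
-- i.e. negation is an XOR against a sign-bit mask kept in a memory
-- constant, matching what gcc emits (xorps/xorpd with a literal-pool
-- operand).
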
isVecExpr :: CmmExpr -> Bool
isVecExpr (CmmMachOp (MO_V_Insert {}) _)   = True
isVecExpr (CmmMachOp (MO_V_Extract {}) _)  = True
isVecExpr (CmmMachOp (MO_V_Add {}) _)      = True
isVecExpr (CmmMachOp (MO_V_Sub {}) _)      = True
isVecExpr (CmmMachOp (MO_V_Mul {}) _)      = True
isVecExpr (CmmMachOp (MO_VS_Quot {}) _)    = True
isVecExpr (CmmMachOp (MO_VS_Rem {}) _)     = True
isVecExpr (CmmMachOp (MO_VS_Neg {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Insert {}) _)  = True
isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
isVecExpr (CmmMachOp (MO_VF_Add {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Sub {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Mul {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Quot {}) _)    = True
isVecExpr (CmmMachOp (MO_VF_Neg {}) _)     = True
isVecExpr (CmmMachOp _ [e])                = isVecExpr e
isVecExpr _                                = False

needLlvm :: NatM a
needLlvm =
    sorry $ unlines ["The native code generator does not support vector"
                    ,"instructions. Please use -fllvm."]