gitlab-ci: Fix URL of Windows cabal-install tarball
[ghc.git] / compiler / nativeGen / X86 / Instr.hs
1 {-# LANGUAGE CPP, TypeFamilies #-}
2
3 -----------------------------------------------------------------------------
4 --
5 -- Machine-dependent assembly language
6 --
7 -- (c) The University of Glasgow 1993-2004
8 --
9 -----------------------------------------------------------------------------
10
11 module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
12 getJumpDestBlockId, canShortcut, shortcutStatics,
13 shortcutJump, allocMoreStack,
14 maxSpillSlots, archWordFormat )
15 where
16
17 #include "HsVersions.h"
18
19 import GhcPrelude
20
21 import X86.Cond
22 import X86.Regs
23 import Instruction
24 import Format
25 import RegClass
26 import Reg
27 import TargetReg
28
29 import BlockId
30 import Hoopl.Collections
31 import Hoopl.Label
32 import GHC.Platform.Regs
33 import Cmm
34 import FastString
35 import Outputable
36 import GHC.Platform
37
38 import BasicTypes (Alignment)
39 import CLabel
40 import DynFlags
41 import UniqSet
42 import Unique
43 import UniqSupply
44 import Debug (UnwindTable)
45
46 import Control.Monad
47 import Data.Maybe (fromMaybe)
48
-- | The format of a machine word (and hence of a memory address) on the
-- target: 'II32' when compiling for 32-bit x86, 'II64' for x86_64.
archWordFormat :: Bool -> Format
archWordFormat is32Bit = if is32Bit then II32 else II64
55
-- | Instruction instance for the x86 instruction set.
-- Wires the x86-specific implementations below into the
-- architecture-independent 'Instruction' interface used by the
-- register allocator and other NCG passes.
instance Instruction Instr where
        regUsageOfInstr         = x86_regUsageOfInstr
        patchRegsOfInstr        = x86_patchRegsOfInstr
        isJumpishInstr          = x86_isJumpishInstr
        jumpDestsOfInstr        = x86_jumpDestsOfInstr
        patchJumpInstr          = x86_patchJumpInstr
        mkSpillInstr            = x86_mkSpillInstr
        mkLoadInstr             = x86_mkLoadInstr
        takeDeltaInstr          = x86_takeDeltaInstr
        isMetaInstr             = x86_isMetaInstr
        mkRegRegMoveInstr       = x86_mkRegRegMoveInstr
        takeRegRegMoveInstr     = x86_takeRegRegMoveInstr
        mkJumpInstr             = x86_mkJumpInstr
        mkStackAllocInstr       = x86_mkStackAllocInstr
        mkStackDeallocInstr     = x86_mkStackDeallocInstr
72
73
74 -- -----------------------------------------------------------------------------
75 -- Intel x86 instructions
76
77 {-
78 Intel, in their infinite wisdom, selected a stack model for floating
79 point registers on x86. That might have made sense back in 1979 --
80 nowadays we can see it for the nonsense it really is. A stack model
81 fits poorly with the existing nativeGen infrastructure, which assumes
82 flat integer and FP register sets. Prior to this commit, nativeGen
83 could not generate correct x86 FP code -- to do so would have meant
84 somehow working the register-stack paradigm into the register
85 allocator and spiller, which sounds very difficult.
86
87 We have decided to cheat, and go for a simple fix which requires no
88 infrastructure modifications, at the expense of generating ropey but
89 correct FP code. All notions of the x86 FP stack and its insns have
90 been removed. Instead, we pretend (to the instruction selector and
91 register allocator) that x86 has six floating point registers, %fake0
92 .. %fake5, which can be used in the usual flat manner. We further
93 claim that x86 has floating point instructions very similar to SPARC
94 and Alpha, that is, a simple 3-operand register-register arrangement.
95 Code generation and register allocation proceed on this basis.
96
97 When we come to print out the final assembly, our convenient fiction
98 is converted to dismal reality. Each fake instruction is
99 independently converted to a series of real x86 instructions.
100 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
101 arithmetic operations, the two operands are pushed onto the top of the
102 FP stack, the operation done, and the result copied back into the
103 relevant register. There are only six %fake registers because 2 are
104 needed for the translation, and x86 has 8 in total.
105
106 The translation is inefficient but is simple and it works. A cleverer
107 translation would handle a sequence of insns, simulating the FP stack
108 contents, would not impose a fixed mapping from %fake to %st regs, and
109 hopefully could avoid most of the redundant reg-reg moves of the
110 current translation.
111
112 We might as well make use of whatever unique FP facilities Intel have
113 chosen to bless us with (let's not be churlish, after all).
114 Hence GLDZ and GLD1. Bwahahahahahahaha!
115 -}
116
117 {-
118 Note [x86 Floating point precision]
119
120 Intel's internal floating point registers are by default 80 bit
121 extended precision. This means that all operations done on values in
122 registers are done at 80 bits, and unless the intermediate values are
123 truncated to the appropriate size (32 or 64 bits) by storing in
124 memory, calculations in registers will give different results from
125 calculations which pass intermediate values in memory (eg. via
126 function calls).
127
128 One solution is to set the FPU into 64 bit precision mode. Some OSs
129 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
130 that this will only affect 64-bit precision arithmetic; 32-bit
131 calculations will still be done at 64-bit precision in registers. So
132 it doesn't solve the whole problem.
133
134 There's also the issue of what the C library is expecting in terms of
135 precision. It seems to be the case that glibc on Linux expects the
136 FPU to be set to 80 bit precision, so setting it to 64 bit could have
137 unexpected effects. Changing the default could have undesirable
138 effects on other 3rd-party library code too, so the right thing would
139 be to save/restore the FPU control word across Haskell code if we were
140 to do this.
141
142 gcc's -ffloat-store gives consistent results by always storing the
143 results of floating-point calculations in memory, which works for both
144 32 and 64-bit precision. However, it only affects the values of
145 user-declared floating point variables in C, not intermediate results.
146 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
147 flag).
148
149 Another problem is how to spill floating point registers in the
150 register allocator. Should we spill the whole 80 bits, or just 64?
151 On an OS which is set to 64 bit precision, spilling 64 is fine. On
152 Linux, spilling 64 bits will round the results of some operations.
153 This is what gcc does. Spilling at 80 bits requires taking up a full
154 128 bit slot (so we get alignment). We spill at 80-bits and ignore
155 the alignment problems.
156
157 In the future [edit: now available in GHC 7.0.1, with the -msse2
158 flag], we'll use the SSE registers for floating point. This requires
159 a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
160 float ops), which means P4 or Xeon and above. Using SSE will solve
161 all these problems, because the SSE registers use fixed 32 bit or 64
162 bit precision.
163
164 --SDM 1/2003
165 -}
166
-- | One x86 / x86_64 instruction as used by the native code generator.
-- Besides real machine instructions this includes several
-- pseudo-instructions (COMMENT, LOCATION, LDATA, NEWBLOCK, UNWIND,
-- DELTA) which are consumed by later compiler passes rather than
-- emitted as assembly.
data Instr
        -- comment pseudo-op
        = COMMENT FastString

        -- location pseudo-op (file, line, col, name)
        | LOCATION Int Int Int String

        -- some static data spat out during code
        -- generation.  Will be extracted before
        -- pretty-printing.
        | LDATA   Section (Alignment, CmmStatics)

        -- start a new basic block.  Useful during
        -- codegen, removed later.  Preceding
        -- instruction should be a jump, as per the
        -- invariants for a BasicBlock (see Cmm).
        | NEWBLOCK BlockId

        -- unwinding information
        -- See Note [Unwinding information in the NCG].
        | UNWIND CLabel UnwindTable

        -- specify current stack offset for benefit of subsequent passes.
        -- (Carries only the byte delta; unwinding information that needs
        -- a label is attached via UNWIND above.)
        | DELTA  Int

        -- Moves.
        | MOV         Format Operand Operand
        | CMOV   Cond Format Operand Reg
        | MOVZxL      Format Operand Operand -- format is the size of operand 1
        | MOVSxL      Format Operand Operand -- format is the size of operand 1
        -- x86_64 note: plain mov into a 32-bit register always zero-extends
        -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
        -- don't affect the high bits of the register.

        -- Load effective address (also a very useful three-operand add instruction :-)
        | LEA         Format Operand Operand

        -- Int Arithmetic.
        | ADD         Format Operand Operand
        | ADC         Format Operand Operand
        | SUB         Format Operand Operand
        | SBB         Format Operand Operand

        | MUL         Format Operand Operand
        | MUL2        Format Operand         -- %edx:%eax = operand * %rax
        | IMUL        Format Operand Operand -- signed int mul
        | IMUL2       Format Operand         -- %edx:%eax = operand * %eax

        | DIV         Format Operand         -- eax := eax:edx/op, edx := eax:edx%op
        | IDIV        Format Operand         -- ditto, but signed

        -- Int Arithmetic, where the effects on the condition register
        -- are important.  Used in specialized sequences such as MO_Add2.
        -- Do not rewrite these instructions to "equivalent" ones that
        -- have different effect on the condition register! (See #9013.)
        | ADD_CC      Format Operand Operand
        | SUB_CC      Format Operand Operand

        -- Simple bit-twiddling.
        | AND         Format Operand Operand
        | OR          Format Operand Operand
        | XOR         Format Operand Operand
        | NOT         Format Operand
        | NEGI        Format Operand         -- NEG instruction (name clash with Cond)
        | BSWAP       Format Reg

        -- Shifts (amount may be immediate or %cl only)
        | SHL         Format Operand{-amount-} Operand
        | SAR         Format Operand{-amount-} Operand
        | SHR         Format Operand{-amount-} Operand

        | BT          Format Imm Operand
        | NOP

        -- We need to support the FSTP (x87 store and pop) instruction
        -- so that we can correctly read off the return value of an
        -- x86 CDECL C function call when it's floating point.
        -- So we don't include a register argument, and just use st(0);
        -- this instruction is used ONLY for return values of C ffi calls
        -- in the x86_32 abi.
        | X87Store    Format AddrMode        -- st(0), dst

        -- SSE2 floating point: we use a restricted set of the available SSE2
        -- instructions for floating-point.
        -- use MOV for moving (either movss or movsd (movlpd better?))
        | CVTSS2SD    Reg Reg                -- F32 to F64
        | CVTSD2SS    Reg Reg                -- F64 to F32
        | CVTTSS2SIQ  Format Operand Reg     -- F32 to I32/I64 (with truncation)
        | CVTTSD2SIQ  Format Operand Reg     -- F64 to I32/I64 (with truncation)
        | CVTSI2SS    Format Operand Reg     -- I32/I64 to F32
        | CVTSI2SD    Format Operand Reg     -- I32/I64 to F64

        -- use ADD, SUB, and SQRT for arithmetic.  In both cases, operands
        -- are Operand Reg.

        -- SSE2 floating-point division:
        | FDIV        Format Operand Operand -- divisor, dividend(dst)

        -- use CMP for comparisons.  ucomiss and ucomisd instructions
        -- compare single/double prec floating point respectively.

        | SQRT        Format Operand Reg     -- src, dst

        -- Comparison
        | TEST        Format Operand Operand
        | CMP         Format Operand Operand
        | SETCC       Cond Operand

        -- Stack Operations.
        | PUSH        Format Operand
        | POP         Format Operand
        -- both unused (SDM):
        --  | PUSHA
        --  | POPA

        -- Jumping around.
        | JMP         Operand [Reg]          -- including live Regs at the call
        | JXX         Cond BlockId           -- includes unconditional branches
        | JXX_GBL     Cond Imm               -- non-local version of JXX
        -- Table jump
        | JMP_TBL     Operand                -- Address to jump to
                      [Maybe JumpDest]       -- Targets of the jump table
                      Section                -- Data section jump table should be put in
                      CLabel                 -- Label of jump table
        | CALL        (Either Imm Reg) [Reg] -- target, live Regs at the call

        -- Other things.
        | CLTD Format                        -- sign extend %eax into %edx:%eax

        | FETCHGOT    Reg                    -- pseudo-insn for ELF position-independent code
                                             -- pretty-prints as
                                             --       call 1f
                                             -- 1:    popl %reg
                                             --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
        | FETCHPC     Reg                    -- pseudo-insn for Darwin position-independent code
                                             -- pretty-prints as
                                             --       call 1f
                                             -- 1:    popl %reg

        -- bit counting instructions
        | POPCNT      Format Operand Reg     -- [SSE4.2] count number of bits set to 1
        | LZCNT       Format Operand Reg     -- [BMI2] count number of leading zeros
        | TZCNT       Format Operand Reg     -- [BMI2] count number of trailing zeros
        | BSF         Format Operand Reg     -- bit scan forward
        | BSR         Format Operand Reg     -- bit scan reverse

        -- bit manipulation instructions
        | PDEP        Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
        | PEXT        Format Operand Operand Reg -- [BMI2] extract bits from the specified mask

        -- prefetch
        | PREFETCH  PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
                                             -- variant can be NTA, Lvl0, Lvl1, or Lvl2

        | LOCK      Instr                    -- lock prefix
        | XADD      Format Operand Operand   -- src (r), dst (r/m)
        | CMPXCHG   Format Operand Operand   -- src (r), dst (r/m), eax implicit
        | MFENCE
330 data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
331
332
-- | An instruction operand: a register, an immediate, or a memory
-- reference expressed as an addressing mode.
data Operand
        = OpReg  Reg            -- register
        | OpImm  Imm            -- immediate value
        | OpAddr AddrMode       -- memory reference
337
338
339
-- | Returns which registers are read and written as a (read, written)
-- pair.  Only registers "interesting" to the allocator (virtual regs
-- and allocatable real regs) are reported; see 'interesting' below.
x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
x86_regUsageOfInstr platform instr
 = case instr of
    MOV    _ src dst    -> usageRW src dst
    CMOV _ _ src dst    -> mkRU (use_R src [dst]) [dst]
    MOVZxL _ src dst    -> usageRW src dst
    MOVSxL _ src dst    -> usageRW src dst
    LEA    _ src dst    -> usageRW src dst
    ADD    _ src dst    -> usageRM src dst
    ADC    _ src dst    -> usageRM src dst
    SUB    _ src dst    -> usageRM src dst
    SBB    _ src dst    -> usageRM src dst
    IMUL   _ src dst    -> usageRM src dst

    -- Result of IMULB will be just in %ax
    IMUL2  II8 src      -> mkRU (eax:use_R src []) [eax]
    -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and
    -- %ax/%eax/%rax.
    IMUL2  _ src        -> mkRU (eax:use_R src []) [eax,edx]

    MUL    _ src dst    -> usageRM src dst
    MUL2   _ src        -> mkRU (eax:use_R src []) [eax,edx]
    DIV    _ op         -> mkRU (eax:edx:use_R op []) [eax,edx]
    IDIV   _ op         -> mkRU (eax:edx:use_R op []) [eax,edx]
    ADD_CC _ src dst    -> usageRM src dst
    SUB_CC _ src dst    -> usageRM src dst
    AND    _ src dst    -> usageRM src dst
    OR     _ src dst    -> usageRM src dst

    -- xor reg,reg is the common zeroing idiom: reg is written, not read
    XOR    _ (OpReg src) (OpReg dst)
        | src == dst    -> mkRU [] [dst]

    XOR    _ src dst    -> usageRM src dst
    NOT    _ op         -> usageM op
    BSWAP  _ reg        -> mkRU [reg] [reg]
    NEGI   _ op         -> usageM op
    SHL    _ imm dst    -> usageRM imm dst
    SAR    _ imm dst    -> usageRM imm dst
    SHR    _ imm dst    -> usageRM imm dst
    BT     _ _   src    -> mkRUR (use_R src [])

    PUSH   _ op         -> mkRUR (use_R op [])
    POP    _ op         -> mkRU [] (def_W op)
    TEST   _ src dst    -> mkRUR (use_R src $! use_R dst [])
    CMP    _ src dst    -> mkRUR (use_R src $! use_R dst [])
    SETCC  _ op         -> mkRU [] (def_W op)
    JXX    _ _          -> mkRU [] []
    JXX_GBL _ _         -> mkRU [] []
    JMP     op regs     -> mkRUR (use_R op regs)
    JMP_TBL op _ _ _    -> mkRUR (use_R op [])
    CALL (Left _)  params   -> mkRU params (callClobberedRegs platform)
    CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
    CLTD   _            -> mkRU [eax] [edx]
    NOP                 -> mkRU [] []

    X87Store _ dst      -> mkRUR (use_EA dst [])

    CVTSS2SD   src dst  -> mkRU [src] [dst]
    CVTSD2SS   src dst  -> mkRU [src] [dst]
    CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
    CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
    CVTSI2SS   _ src dst -> mkRU (use_R src []) [dst]
    CVTSI2SD   _ src dst -> mkRU (use_R src []) [dst]
    FDIV _     src dst  -> usageRM src dst
    SQRT _     src dst  -> mkRU (use_R src []) [dst]

    FETCHGOT reg        -> mkRU [] [reg]
    FETCHPC  reg        -> mkRU [] [reg]

    COMMENT _           -> noUsage
    LOCATION{}          -> noUsage
    UNWIND{}            -> noUsage
    DELTA   _           -> noUsage

    POPCNT _ src dst    -> mkRU (use_R src []) [dst]
    LZCNT  _ src dst    -> mkRU (use_R src []) [dst]
    TZCNT  _ src dst    -> mkRU (use_R src []) [dst]
    BSF    _ src dst    -> mkRU (use_R src []) [dst]
    BSR    _ src dst    -> mkRU (use_R src []) [dst]

    PDEP   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
    PEXT   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]

    -- note: might be a better way to do this
    PREFETCH _ _ src    -> mkRU (use_R src []) []
    LOCK i              -> x86_regUsageOfInstr platform i
    XADD _ src dst      -> usageMM src dst
    CMPXCHG _ src dst   -> usageRMM src dst (OpReg eax)
    MFENCE              -> noUsage

    _other              -> panic "regUsage: unrecognised instr"
 where
    -- # Definitions
    --
    -- Written: If the operand is a register, it's written.  If it's an
    -- address, registers mentioned in the address are read.
    --
    -- Modified: If the operand is a register, it's both read and
    -- written.  If it's an address, registers mentioned in the address
    -- are read.

    -- 2 operand form; first operand Read; second Written
    usageRW :: Operand -> Operand -> RegUsage
    usageRW op (OpReg reg)      = mkRU (use_R op []) [reg]
    usageRW op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
    usageRW _ _                 = panic "X86.RegInfo.usageRW: no match"

    -- 2 operand form; first operand Read; second Modified
    usageRM :: Operand -> Operand -> RegUsage
    usageRM op (OpReg reg)      = mkRU (use_R op [reg]) [reg]
    usageRM op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
    usageRM _ _                 = panic "X86.RegInfo.usageRM: no match"

    -- 2 operand form; first operand Modified; second Modified
    usageMM :: Operand -> Operand -> RegUsage
    usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
    usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
    usageMM _ _                     = panic "X86.RegInfo.usageMM: no match"

    -- 3 operand form; first operand Read; second Modified; third Modified
    usageRMM :: Operand -> Operand -> Operand -> RegUsage
    usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
    usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
    usageRMM _ _ _                               = panic "X86.RegInfo.usageRMM: no match"

    -- 1 operand form; operand Modified
    usageM :: Operand -> RegUsage
    usageM (OpReg reg)          = mkRU [reg] [reg]
    usageM (OpAddr ea)          = mkRUR (use_EA ea [])
    usageM _                    = panic "X86.RegInfo.usageM: no match"

    -- Registers defd when an operand is written.
    def_W (OpReg reg)           = [reg]
    def_W (OpAddr _ )           = []
    def_W _                     = panic "X86.RegInfo.def_W: no match"

    -- Registers used when an operand is read.
    use_R (OpReg reg)  tl       = reg : tl
    use_R (OpImm _)    tl       = tl
    use_R (OpAddr ea)  tl       = use_EA ea tl

    -- Registers used to compute an effective address.
    use_EA (ImmAddr _ _) tl = tl
    use_EA (AddrBaseIndex base index _) tl =
        use_base base $! use_index index tl
        where use_base (EABaseReg r)  tl = r : tl
              use_base _              tl = tl
              use_index EAIndexNone   tl = tl
              use_index (EAIndex i _) tl = i : tl

    -- Build the RegUsage, keeping only allocator-relevant registers.
    -- The filtered lists are forced to avoid retaining the unfiltered
    -- ones as thunks.
    mkRUR src = src' `seq` RU src' []
        where src' = filter (interesting platform) src

    mkRU src dst = src' `seq` dst' `seq` RU src' dst'
        where src' = filter (interesting platform) src
              dst' = filter (interesting platform) dst
498
-- | Is this register interesting for the register allocator?
-- Virtual registers always are; real registers only if the platform
-- makes them available for allocation.
interesting :: Platform -> Reg -> Bool
interesting platform reg
 = case reg of
        RegVirtual _                -> True
        RegReal (RealRegSingle i)   -> freeReg platform i
        RegReal (RealRegPair{})     -> panic "X86.interesting: no reg pairs on this arch"
504
505
506
-- | Applies the supplied function to all registers in instructions.
-- Typically used to change virtual registers to real registers.
x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
x86_patchRegsOfInstr instr env
 = case instr of
    MOV fmt src dst      -> patch2 (MOV fmt) src dst
    CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
    MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
    MOVSxL fmt src dst   -> patch2 (MOVSxL fmt) src dst
    LEA fmt src dst      -> patch2 (LEA fmt) src dst
    ADD fmt src dst      -> patch2 (ADD fmt) src dst
    ADC fmt src dst      -> patch2 (ADC fmt) src dst
    SUB fmt src dst      -> patch2 (SUB fmt) src dst
    SBB fmt src dst      -> patch2 (SBB fmt) src dst
    IMUL fmt src dst     -> patch2 (IMUL fmt) src dst
    IMUL2 fmt src        -> patch1 (IMUL2 fmt) src
    MUL fmt src dst      -> patch2 (MUL fmt) src dst
    MUL2 fmt src         -> patch1 (MUL2 fmt) src
    IDIV fmt op          -> patch1 (IDIV fmt) op
    DIV fmt op           -> patch1 (DIV fmt) op
    ADD_CC fmt src dst   -> patch2 (ADD_CC fmt) src dst
    SUB_CC fmt src dst   -> patch2 (SUB_CC fmt) src dst
    AND fmt src dst      -> patch2 (AND fmt) src dst
    OR  fmt src dst      -> patch2 (OR  fmt) src dst
    XOR fmt src dst      -> patch2 (XOR fmt) src dst
    NOT fmt op           -> patch1 (NOT fmt) op
    BSWAP fmt reg        -> BSWAP fmt (env reg)
    NEGI fmt op          -> patch1 (NEGI fmt) op
    SHL fmt imm dst      -> patch1 (SHL fmt imm) dst
    SAR fmt imm dst      -> patch1 (SAR fmt imm) dst
    SHR fmt imm dst      -> patch1 (SHR fmt imm) dst
    BT  fmt imm src      -> patch1 (BT  fmt imm) src
    TEST fmt src dst     -> patch2 (TEST fmt) src dst
    CMP  fmt src dst     -> patch2 (CMP  fmt) src dst
    PUSH fmt op          -> patch1 (PUSH fmt) op
    POP  fmt op          -> patch1 (POP  fmt) op
    SETCC cond op        -> patch1 (SETCC cond) op
    JMP op regs          -> JMP (patchOp op) regs
    JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl

    -- literally only support storing the top x87 stack value st(0)
    X87Store fmt dst     -> X87Store fmt (lookupAddr dst)

    CVTSS2SD src dst     -> CVTSS2SD (env src) (env dst)
    CVTSD2SS src dst     -> CVTSD2SS (env src) (env dst)
    CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
    CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
    CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
    CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
    FDIV fmt src dst     -> FDIV fmt (patchOp src) (patchOp dst)
    SQRT fmt src dst     -> SQRT fmt (patchOp src) (env dst)

    -- direct calls carry no registers to patch
    CALL (Left _)  _     -> instr
    CALL (Right reg) p   -> CALL (Right (env reg)) p

    FETCHGOT reg         -> FETCHGOT (env reg)
    FETCHPC  reg         -> FETCHPC  (env reg)

    NOP                  -> instr
    COMMENT _            -> instr
    LOCATION {}          -> instr
    UNWIND {}            -> instr
    DELTA _              -> instr

    JXX _ _              -> instr
    JXX_GBL _ _          -> instr
    CLTD _               -> instr

    POPCNT fmt src dst   -> POPCNT fmt (patchOp src) (env dst)
    LZCNT  fmt src dst   -> LZCNT  fmt (patchOp src) (env dst)
    TZCNT  fmt src dst   -> TZCNT  fmt (patchOp src) (env dst)
    PDEP   fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
    PEXT   fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
    BSF    fmt src dst   -> BSF    fmt (patchOp src) (env dst)
    BSR    fmt src dst   -> BSR    fmt (patchOp src) (env dst)

    PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)

    LOCK i               -> LOCK (x86_patchRegsOfInstr i env)
    XADD fmt src dst     -> patch2 (XADD fmt) src dst
    CMPXCHG fmt src dst  -> patch2 (CMPXCHG fmt) src dst
    MFENCE               -> instr

    _other               -> panic "patchRegs: unrecognised instr"

 where
    -- Strict application so patched operands are forced eagerly
    -- rather than accumulating thunks.
    patch1 :: (Operand -> a) -> Operand -> a
    patch1 insn op      = insn $! patchOp op
    patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
    patch2 insn src dst = (insn $! patchOp src) $! patchOp dst

    patchOp (OpReg  reg) = OpReg $! env reg
    patchOp (OpImm  imm) = OpImm imm
    patchOp (OpAddr ea)  = OpAddr $! lookupAddr ea

    -- Patch the base and index registers inside an addressing mode.
    lookupAddr (ImmAddr imm off) = ImmAddr imm off
    lookupAddr (AddrBaseIndex base index disp)
      = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
      where
        lookupBase EABaseNone       = EABaseNone
        lookupBase EABaseRip        = EABaseRip
        lookupBase (EABaseReg r)    = EABaseReg $! env r

        lookupIndex EAIndexNone     = EAIndexNone
        lookupIndex (EAIndex r i)   = (EAIndex $! env r) i
612
613
614 --------------------------------------------------------------------------------
-- | True for any instruction that transfers control:
-- jumps (local or global), table jumps and calls.
x86_isJumpishInstr :: Instr -> Bool
x86_isJumpishInstr JMP{}     = True
x86_isJumpishInstr JXX{}     = True
x86_isJumpishInstr JXX_GBL{} = True
x86_isJumpishInstr JMP_TBL{} = True
x86_isJumpishInstr CALL{}    = True
x86_isJumpishInstr _         = False
626
627
-- | The local basic blocks this instruction can branch to.
-- Only JXX and JMP_TBL target block ids; everything else yields none.
x86_jumpDestsOfInstr :: Instr -> [BlockId]
x86_jumpDestsOfInstr (JXX _ bid)           = [bid]
x86_jumpDestsOfInstr (JMP_TBL _ dests _ _) = [bid | Just (DestBlockId bid) <- dests]
x86_jumpDestsOfInstr _                     = []
637
638
-- | Rewrite the block ids targeted by a branch instruction.
-- Non-branching instructions are returned unchanged.
x86_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
x86_patchJumpInstr insn patchF =
    case insn of
        JXX cc bid                   -> JXX cc (patchF bid)
        JMP_TBL op dests section lbl ->
            JMP_TBL op (map (fmap retarget) dests) section lbl
        other                        -> other
  where
    -- Only block-id destinations are patched; immediate/label
    -- destinations pass through untouched.
    retarget (DestBlockId bid) = DestBlockId (patchF bid)
    retarget dest              = dest
651
652
653
654
655
656 -- -----------------------------------------------------------------------------
-- | Make a spill instruction: store a register into its spill slot on
-- the C stack, adjusting the slot offset by the current stack delta.
x86_mkSpillInstr
    :: DynFlags
    -> Reg          -- register to spill
    -> Int          -- current stack delta
    -> Int          -- spill slot to use
    -> Instr

x86_mkSpillInstr dflags reg delta slot
  = case targetClassOfReg platform reg of
        RcInteger -> storeAs (archWordFormat (target32Bit platform))
        RcDouble  -> storeAs FF64
        _         -> panic "X86.mkSpillInstr: no match"
  where
    platform    = targetPlatform dflags
    off         = spillSlotToOffset platform slot - delta
    -- store the register at the computed stack-relative address
    storeAs fmt = MOV fmt (OpReg reg) (OpAddr (spRel dflags off))
675
-- | Make a spill reload instruction: load a register back from its
-- spill slot on the C stack, adjusting for the current stack delta.
x86_mkLoadInstr
    :: DynFlags
    -> Reg          -- register to load
    -> Int          -- current stack delta
    -> Int          -- spill slot to use
    -> Instr

x86_mkLoadInstr dflags reg delta slot
  = let off = spillSlotToOffset platform slot - delta
    in
        case targetClassOfReg platform reg of
              RcInteger -> MOV (archWordFormat is32Bit)
                               (OpAddr (spRel dflags off)) (OpReg reg)
              RcDouble  -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
              _         -> panic "X86.x86_mkLoadInstr"
    where platform = targetPlatform dflags
          is32Bit  = target32Bit platform
694
-- | Size in bytes of one spill slot: 12 on 32-bit targets (room for an
-- 80-bit x87 spill -- see Note [x86 Floating point precision]), 8 on
-- x86_64.
--
-- (The parameter was previously named @dflags@ although its type is
-- 'Platform'; renamed to avoid confusion with 'DynFlags'.)
spillSlotSize :: Platform -> Int
spillSlotSize platform
  | target32Bit platform = 12
  | otherwise            = 8
698
-- | How many spill slots fit in the pre-reserved C stack area
-- (rESERVED_C_STACK_BYTES).  The first 64 bytes are left free (see
-- spillSlotToOffset) and one extra slot is subtracted as slack.
maxSpillSlots :: DynFlags -> Int
maxSpillSlots dflags
    = ((rESERVED_C_STACK_BYTES dflags - 64) `div` spillSlotSize (targetPlatform dflags)) - 1
--     = 0 -- useful for testing allocMoreStack
703
-- | Number of bytes that the stack pointer should be aligned to.
stackAlign :: Int
stackAlign = 16
707
-- | Convert a spill slot number to a *byte* offset, with no sign:
-- decide on a per arch basis whether you are spilling above or below
-- the C stack pointer.  The first 64 bytes are kept free (cf.
-- maxSpillSlots).
spillSlotToOffset :: Platform -> Int -> Int
spillSlotToOffset platform slot
   = 64 + spillSlotSize platform * slot
714
715 --------------------------------------------------------------------------------
716
-- | See if this instruction is telling us the current C stack delta.
x86_takeDeltaInstr :: Instr -> Maybe Int
x86_takeDeltaInstr (DELTA d) = Just d
x86_takeDeltaInstr _         = Nothing
726
727
-- | True for pseudo-instructions that never become machine code
-- (comments, location/unwind info, data, block markers, deltas).
x86_isMetaInstr :: Instr -> Bool
x86_isMetaInstr COMMENT{}  = True
x86_isMetaInstr LOCATION{} = True
x86_isMetaInstr LDATA{}    = True
x86_isMetaInstr NEWBLOCK{} = True
x86_isMetaInstr UNWIND{}   = True
x86_isMetaInstr DELTA{}    = True
x86_isMetaInstr _          = False
741
742
743
-- | Make a reg-reg move instruction.
-- The format is chosen from the register class of the source: a
-- word-sized integer MOV for integer registers, FF64 for
-- floating-point registers.
-- (An earlier comment here about SPARC v8 was copy-pasta from the
-- SPARC backend and did not apply to this file.)
x86_mkRegRegMoveInstr
    :: Platform
    -> Reg
    -> Reg
    -> Instr

x86_mkRegRegMoveInstr platform src dst
 = case targetClassOfReg platform src of
        RcInteger -> case platformArch platform of
                     ArchX86    -> MOV II32 (OpReg src) (OpReg dst)
                     ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
                     _          -> panic "x86_mkRegRegMoveInstr: Bad arch"
        RcDouble  -> MOV FF64 (OpReg src) (OpReg dst)
        -- this code is the lie we tell ourselves because both float and double
        -- use the same register class.on x86_64 and x86 32bit with SSE2,
        -- more plainly, both use the XMM registers
        _         -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
767
-- | Check whether an instruction represents a reg-reg move.
-- The register allocator attempts to eliminate reg->reg moves whenever
-- it can, by assigning the src and dest temporaries to the same real
-- register.
x86_takeRegRegMoveInstr :: Instr -> Maybe (Reg, Reg)
x86_takeRegRegMoveInstr insn =
    case insn of
        MOV _ (OpReg from) (OpReg to) -> Just (from, to)
        _                             -> Nothing
780
781
-- | Make an unconditional branch instruction (a single always-taken JXX).
x86_mkJumpInstr :: BlockId -> [Instr]
x86_mkJumpInstr target = [JXX ALWAYS target]
789
-- Note [Windows stack layout]
-- | On most OSes the kernel will place a guard page after the current stack
-- page.  If you allocate larger than a page worth you may jump over this
-- guard page.  Not only is this a security issue, but on certain OSes such
-- as Windows a new page won't be allocated if you don't hit the guard.  This
-- will cause a segfault or access fault.
--
-- This function defines if the current allocation amount requires a probe.
-- On Windows (for now) we emit a call to _chkstk for this.  For other OSes
-- this is not yet implemented.
-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
-- The Windows stack looks like this:
--
--        +-------------------+
--        |        SP         |
--        +-------------------+
--        |                   |
--        |    GUARD PAGE     |
--        |                   |
--        +-------------------+
--        |                   |
--        |                   |
--        |     UNMAPPED      |
--        |                   |
--        |                   |
--        +-------------------+
--
-- In essence each allocation larger than a page size needs to be chunked and
-- a probe emitted after each page allocation.  You have to hit the guard
-- page so the kernel can map in the next page, otherwise you'll segfault.
--
needs_probe_call :: Platform -> Int -> Bool
needs_probe_call platform amount
  = case platformOS platform of
     OSMinGW32 -> case platformArch platform of
                    ArchX86    -> amount > (4 * 1024)
                    ArchX86_64 -> amount > (8 * 1024)
                    _          -> False
     _         -> False
829
-- | Emit instructions that drop the C stack pointer by @amount@ bytes,
-- inserting a stack probe (__chkstk_ms) on Windows when the allocation
-- is large enough to skip the guard page.
x86_mkStackAllocInstr
        :: Platform
        -> Int
        -> [Instr]
x86_mkStackAllocInstr platform amount
  = case platformOS platform of
      OSMinGW32 ->
        -- These will clobber AX but this should be ok because
        --
        -- 1. It is the first thing we do when entering the closure and AX is
        --    a caller-saved register on Windows both on x86_64 and x86.
        --
        -- 2. The closures are only entered via a call or longjmp in which case
        --    there are no expectations for volatile registers.
        --
        -- 3. When the target is a local branch point it is re-targeted
        --    after the dealloc, preserving #2.  See note [extra spill slots].
        --
        -- We emit a call because the stack probes are quite involved and
        -- would bloat code size a lot.  GHC doesn't really have an -Os.
        -- __chkstk is guaranteed to leave all nonvolatile registers and AX
        -- untouched.  It's part of the standard prologue code for any Windows
        -- function dropping the stack more than a page.
        -- See Note [Windows stack layout]
        case platformArch platform of
            ArchX86    | needs_probe_call platform amount ->
                           [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
                           , CALL (Left $ strImmLit "___chkstk_ms") [eax]
                           , SUB II32 (OpReg eax) (OpReg esp)
                           ]
                       | otherwise ->
                           [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
                           , TEST II32 (OpReg esp) (OpReg esp)
                           ]
            ArchX86_64 | needs_probe_call platform amount ->
                           [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
                           , CALL (Left $ strImmLit "___chkstk_ms") [rax]
                           , SUB II64 (OpReg rax) (OpReg rsp)
                           ]
                       | otherwise ->
                           [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
                           , TEST II64 (OpReg rsp) (OpReg rsp)
                           ]
            _ -> panic "x86_mkStackAllocInstr"
      _       ->
        case platformArch platform of
          ArchX86    -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
          ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
          _ -> panic "x86_mkStackAllocInstr"
879
-- | Emit instructions undoing a stack allocation of @amount@ bytes:
-- a single ADD moving the stack pointer back up.  The inverse of
-- 'x86_mkStackAllocInstr'.
x86_mkStackDeallocInstr
   :: Platform
   -> Int
   -> [Instr]
x86_mkStackDeallocInstr platform amount
  = case platformArch platform of
      ArchX86    -> [bump II32 esp]
      ArchX86_64 -> [bump II64 rsp]
      _          -> panic "x86_mkStackDeallocInstr"
  where
    -- ADD $amount, %sp at the given operand width.
    bump fmt sp = ADD fmt (OpImm (ImmInt amount)) (OpReg sp)
889
890
891 --
892 -- Note [extra spill slots]
893 --
894 -- If the register allocator used more spill slots than we have
895 -- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
896 -- C stack space on entry and exit from this proc. Therefore we
897 -- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
898 -- before every non-local jump.
899 --
900 -- This became necessary when the new codegen started bundling entire
901 -- functions together into one proc, because the register allocator
902 -- assigns a different stack slot to each virtual reg within a proc.
903 -- To avoid using so many slots we could also:
904 --
-- - split up the proc into connected components before code generation
906 --
907 -- - rename the virtual regs, so that we re-use vreg names and hence
908 -- stack slots for non-overlapping vregs.
909 --
910 -- Note that when a block is both a non-local entry point (with an
911 -- info table) and a local branch target, we have to split it into
912 -- two, like so:
913 --
914 -- <info table>
915 -- L:
916 -- <code>
917 --
918 -- becomes
919 --
920 -- <info table>
921 -- L:
922 -- subl $rsp, N
923 -- jmp Lnew
924 -- Lnew:
925 -- <code>
926 --
-- and all branches pointing to L are retargeted to point to Lnew.
928 -- Otherwise, we would repeat the $rsp adjustment for each branch to
929 -- L.
930 --
931 -- Returns a list of (L,Lnew) pairs.
932 --
-- | Allocate extra C stack space (for spill slots beyond the reserved
-- area) on every entry to this proc, and deallocate it before every
-- non-local jump.  See Note [extra spill slots] above for the
-- block-splitting scheme.  Returns the rewritten proc together with
-- the (old entry label, new label) pairs.
allocMoreStack
  :: Platform
  -> Int        -- ^ number of spill slots needed
  -> NatCmmDecl statics X86.Instr.Instr
  -> UniqSM (NatCmmDecl statics X86.Instr.Instr, [(BlockId,BlockId)])

-- Data sections carry no code, so nothing to adjust.
allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
    let entries = entryBlocks proc

    -- One fresh unique per entry point, naming the post-alloc block.
    uniqs <- replicateM (length entries) getUniqueM

    let
      -- Extra stack in bytes, rounded up to the stack alignment.
      delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
        where x = slots * spillSlotSize platform -- sp delta

      alloc   = mkStackAllocInstr   platform delta
      dealloc = mkStackDeallocInstr platform delta

      -- Pairs (L, Lnew): old entry label mapped to its new split block.
      retargetList = (zip entries (map mkBlockId uniqs))

      new_blockmap :: LabelMap BlockId
      new_blockmap = mapFromList retargetList

      -- Split each entry block: the original label allocates, then
      -- jumps to the new label carrying the original (rewritten) code.
      -- Non-entry blocks keep their label, code rewritten only.
      insert_stack_insns (BasicBlock id insns)
         | Just new_blockid <- mapLookup id new_blockmap
         = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
           , BasicBlock new_blockid block' ]
         | otherwise
         = [ BasicBlock id block' ]
         where
           block' = foldr insert_dealloc [] insns

      -- Prepend the dealloc sequence before non-local jumps, and
      -- retarget local branches aimed at split entry blocks so they
      -- skip the alloc code.
      insert_dealloc insn r = case insn of
         JMP _ _     -> dealloc ++ (insn : r)
         JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
         _other      -> x86_patchJumpInstr insn retarget : r
             where retarget b = fromMaybe b (mapLookup b new_blockmap)

      new_code = concatMap insert_stack_insns code
    -- in
    return (CmmProc info lbl live (ListGraph new_code), retargetList)
975
-- | A branch target that shortcutting can redirect to: either a local
-- basic block, or an immediate (used for non-local destinations).
data JumpDest = DestBlockId BlockId | DestImm Imm

-- Debug Instance.  Note: the 'DestImm' payload is deliberately not
-- rendered.
instance Outputable JumpDest where
  ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
  ppr (DestImm _imm) = text "jd<imm>:noShow"
982
983
-- | Project the 'BlockId' out of a 'JumpDest', if it names a block.
getJumpDestBlockId :: JumpDest -> Maybe BlockId
getJumpDestBlockId dest = case dest of
  DestBlockId bid -> Just bid
  DestImm _       -> Nothing
987
-- | Can this instruction be bypassed by branch shortcutting?  Only
-- unconditional jumps with a statically known destination qualify.
canShortcut :: Instr -> Maybe JumpDest
canShortcut insn = case insn of
  JXX ALWAYS id     -> Just (DestBlockId id)
  JMP (OpImm imm) _ -> Just (DestImm imm)
  _                 -> Nothing
992
993
994 -- This helper shortcuts a sequence of branches.
995 -- The blockset helps avoid following cycles.
996 shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
997 shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
998 where
999 shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
1000 shortcutJump' fn seen insn@(JXX cc id) =
1001 if setMember id seen then insn
1002 else case fn id of
1003 Nothing -> insn
1004 Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
1005 Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
1006 where seen' = setInsert id seen
1007 shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
1008 let updateBlock (Just (DestBlockId bid)) =
1009 case fn bid of
1010 Nothing -> Just (DestBlockId bid )
1011 Just dest -> Just dest
1012 updateBlock dest = dest
1013 blocks' = map updateBlock blocks
1014 in JMP_TBL addr blocks' section tblId
1015 shortcutJump' _ _ other = other
1016
-- Here because it knows about JumpDest.
-- | Apply branch shortcutting to a static data block.  Jump tables
-- live in CmmData, so their label entries must be retargeted exactly
-- like the branches themselves.
shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, CmmStatics) -> (Alignment, CmmStatics)
shortcutStatics fn (align, Statics lbl statics)
  = (align, Statics lbl (map (shortcutStatic fn) statics))
1023
-- | Shortcut a label: labels naming local basic blocks are chased
-- through the shortcut mapping; any other label passes through.
shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
shortcutLabel fn lab =
  case maybeLocalBlockLabel lab of
    Just blkId -> shortBlockId fn emptyUniqSet blkId
    Nothing    -> lab
1028
-- | Rewrite one static datum, shortcutting any block label it mentions.
shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
shortcutStatic fn static = case static of
  CmmStaticLit (CmmLabel lab)
    -> CmmStaticLit (CmmLabel (shortcutLabel fn lab))
  CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w)
    -- Slightly dodgy: only the first label is shortcut, but this works
    -- with the way CmmLabelDiffOff is used for jump tables now.
    -> CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
  other
    -> other
1038
-- | Chase a block id through the shortcut mapping to its final label.
-- The @seen@ set of uniques breaks cycles in the mapping: a revisited
-- block resolves to its own label.
shortBlockId
  :: (BlockId -> Maybe JumpDest)
  -> UniqSet Unique
  -> BlockId
  -> CLabel

shortBlockId fn seen blockid
  | elementOfUniqSet uq seen = blockLbl blockid
  | otherwise
  = case fn blockid of
      Nothing                      -> blockLbl blockid
      Just (DestBlockId blockid')  -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
      Just (DestImm (ImmCLbl lbl)) -> lbl
      _other                       -> panic "shortBlockId"
  where uq = getUnique blockid