rts: Use .cfi_{start|end}proc directives
1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 1998-2011
4 *
5 * STG-to-C glue.
6 *
7 * To run an STG function from C land, call
8 *
9 * rv = StgRun(f,BaseReg);
10 *
11 * where "f" is the STG function to call, and BaseReg is the address of the
12 * RegTable for this run (we might have separate RegTables if we're running
13 * multiple threads on an SMP machine).
14 *
15 * In the end, "f" must JMP to StgReturn (defined below), passing the
 16 * return-value "rv" in R1, to return to the caller of StgRun, which then
 17 * returns "rv" in whatever way C returns a value.
18 *
19 * NOTE: StgRun/StgReturn do *NOT* load or store Hp or any other registers
20 * (other than saving the C callee-saves registers). Instead, the called
21 * function "f" must do that in STG land.
22 *
23 * We also initially make sure that there are @RESERVED_C_STACK_BYTES@ on the
24 * C-stack. This is done to reserve some space for the allocation of
25 * temporaries in STG code.
26 *
27 * -------------------------------------------------------------------------- */
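
/*
 * Purely as an illustration (a minimal sketch, not a definitive call site;
 * "entry" stands for some STG entry point and "cap" for a Capability - the
 * real call lives in the scheduler), entering STG land from C looks roughly
 * like this:
 *
 *     StgRegTable *r = StgRun(entry, &cap->r);
 *
 * Control only comes back once the STG code jumps to StgReturn, and "r" then
 * holds whatever the STG code left in R1.
 */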
28
29 #include "PosixSource.h"
30 #include "ghcconfig.h"
31
32 #if defined(sparc_HOST_ARCH) || defined(USE_MINIINTERPRETER)
33 /* include Stg.h first because we want real machine regs in here: we
34 * have to get the value of R1 back from Stg land to C land intact.
35 */
36
 37 /* We include windows.h very early, as on Win64 the CONTEXT type has
 38    fields "R8", "R9" and "R10", which clash with the macros we may already
 39    have #define'd for our own purposes (in stg/Regs.h). */
40 #if defined(HAVE_WINDOWS_H)
41 #include <windows.h>
42 #endif
43
44 #define IN_STGCRUN 1
45 #include "Stg.h"
46 #include "Rts.h"
47 #else
 48 /* The other architectures do not require the actual register macro definitions
 49  * here because they use hand-written assembly to implement the StgRun
 50  * function. Including Stg.h first will define the R1 values using GCC-specific
 51  * techniques, which we don't want for LLVM-based C compilers. Since we don't
 52  * actually need the real machine register definitions here, we include the
 53  * headers in the opposite order to allow LLVM-based C compilers to work.
 54  */
55 #include "Rts.h"
56 #include "Stg.h"
57 #endif
58
59 #include "StgRun.h"
60 #include "Capability.h"
61
62 #include "RtsUtils.h"
63 #if defined(DEBUG)
64 #include "Printer.h"
65 #endif
66
67 #if defined(USE_MINIINTERPRETER)
68
69 /* -----------------------------------------------------------------------------
70 any architecture (using miniinterpreter)
71 -------------------------------------------------------------------------- */
72
73 StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg STG_UNUSED)
74 {
75 while (f) {
76 IF_DEBUG(interpreter,
77 debugBelch("Jumping to ");
78 printPtr((P_)f); fflush(stdout);
79 debugBelch("\n");
80 );
81 f = (StgFunPtr) (f)();
82 }
83 return (StgRegTable *)R1.p;
84 }
85
86 StgFunPtr StgReturn(void)
87 {
88 return 0;
89 }
90
91 #else /* !USE_MINIINTERPRETER */
92
93 #if defined(mingw32_HOST_OS)
94 /*
95 * Note [Windows Stack allocations]
96 *
 97  * On Windows the stack has to be allocated 4k at a time, otherwise
 98  * we get a segfault. The C compiler knows how to do this (it calls
 99  * _alloca()), so we make sure that we can allocate as much stack as
 100  * we need. However, since we are doing a local stack allocation and the value
 101  * isn't valid outside the frame, compilers are free to optimize this allocation
 102  * and the corresponding stack check away. So, to prevent that, we request that
 103  * this function never be optimized (see #14669). */
104 STG_NO_OPTIMIZE StgWord8 *win32AllocStack(void)
105 {
106 StgWord8 stack[RESERVED_C_STACK_BYTES + 16 + 12];
107 return stack;
108 }
109 #endif
110
111 /* -----------------------------------------------------------------------------
112 x86 architecture
113 -------------------------------------------------------------------------- */
114
115 #if defined(i386_HOST_ARCH)
116
117 #if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
118 #define STG_GLOBAL ".globl "
119 #define STG_HIDDEN ".private_extern "
120 #else
121 #define STG_GLOBAL ".global "
122 #define STG_HIDDEN ".hidden "
123 #endif
124
125 /*
126 * Note [Stack Alignment on X86]
127 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
128 *
 129  * On X86 (both 32-bit and 64-bit) we keep the stack aligned on function calls
 130  * at a 16-byte boundary. This is done because a number of ABIs require it
 131  * (x64, Mac OS X 32-bit/64-bit), and because it is needed for interfacing
 132  * with other libraries through the FFI.
 133  *
 134  * As part of this arrangement we must maintain the stack at a 16-byte boundary
 135  * minus word_size bytes (so 16n - 4 for i386 and 16n - 8 for x64) on entry to
 136  * a procedure, since both GCC and LLVM expect this. This is because the stack
 137  * should have been aligned to a 16-byte boundary and a call then made, which
 138  * pushes a return address onto the stack (using word_size more space). In STG
 139  * code we only jump to other STG procedures, so we maintain the 16n - word_size
 140  * alignment for these jumps.
 141  *
 142  * This gives us binary compatibility with LLVM and GCC as well as correct
 143  * interaction with the FFI. Previously we just maintained a 16n-byte alignment
 144  * for procedure entry and calls, which led to bugs (see #4211 and #5250).
145 *
146 * To change this convention you need to change the code here, and in
147 * compiler/nativeGen/X86/CodeGen.hs::GenCCall, and maybe the adjustor
148 * code for thunks in rts/AdjustorAsm.s, rts/Adjustor.c.
149 *
150 * A quick way to see if this is wrong is to compile this code:
151 *
152 * main = System.Exit.exitWith ExitSuccess
153 *
154 * And run it with +RTS -sstderr. The stats code in the RTS, in
155 * particular statsPrintf(), relies on the stack alignment because
156 * it saves the %xmm regs on the stack, so it'll fall over if the
157 * stack isn't aligned, and calling exitWith from Haskell invokes
158 * shutdownHaskellAndExit using a C call.
159 *
160 * If you edit the sequence below be sure to update the unwinding information
161 * for stg_stop_thread in StgStartup.cmm.
162 */
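
/*
 * A worked example of the alignment arithmetic above (illustrative only; we
 * only assume here that RESERVED_C_STACK_BYTES is a multiple of 16): on entry
 * to StgRun on i386 the caller's call instruction has just pushed a 4-byte
 * return address, so %esp == 16n - 4. The sequence below subtracts
 * RESERVED_C_STACK_BYTES + 16, itself a multiple of 16, so %esp remains at
 * 16n - 4, which is the alignment STG code expects to find and to preserve
 * across its jumps.
 */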
163
164 static void GNUC3_ATTRIBUTE(used)
165 StgRunIsImplementedInAssembler(void)
166 {
167 __asm__ volatile (
168 STG_GLOBAL STG_RUN "\n"
169 #if !defined(mingw32_HOST_OS)
170 STG_HIDDEN STG_RUN "\n"
171 #endif
172 STG_RUN ":\n\t"
173
174 /*
175 * move %esp down to reserve an area for temporary storage
176 * during the execution of STG code.
177 *
178 * The stack pointer has to be aligned to a multiple of 16
179 * bytes from here - this is a requirement of the C ABI, so
180 * that C code can assign SSE2 registers directly to/from
181 * stack locations.
182 */
183 "subl %0, %%esp\n\t"
184
185 /*
186 * save callee-saves registers on behalf of the STG code.
187 */
188 "movl %%esp, %%eax\n\t"
189 "addl %0-16, %%eax\n\t"
190 "movl %%ebx,0(%%eax)\n\t"
191 "movl %%esi,4(%%eax)\n\t"
192 "movl %%edi,8(%%eax)\n\t"
193 "movl %%ebp,12(%%eax)\n\t"
194 /*
195 * Set BaseReg
196 */
197 "movl 24(%%eax),%%ebx\n\t"
198 /*
199 * grab the function argument from the stack
200 */
201 "movl 20(%%eax),%%eax\n\t"
202 /*
203 * jump to it
204 */
205 "jmp *%%eax\n\t"
206
207 STG_GLOBAL STG_RETURN "\n"
208 STG_RETURN ":\n\t"
209
210 "movl %%esi, %%eax\n\t" /* Return value in R1 */
211
212 /*
213 * restore callee-saves registers. (Don't stomp on %%eax!)
214 */
215 "movl %%esp, %%edx\n\t"
216 "addl %0-16, %%edx\n\t"
217 "movl 0(%%edx),%%ebx\n\t" /* restore the registers saved above */
218 "movl 4(%%edx),%%esi\n\t"
219 "movl 8(%%edx),%%edi\n\t"
220 "movl 12(%%edx),%%ebp\n\t"
221
222 "addl %0, %%esp\n\t"
223 "ret"
224
225 : : "i" (RESERVED_C_STACK_BYTES + 16)
226 // + 16 to make room for the 4 registers we have to save
227 // See Note [Stack Alignment on X86]
228 );
229 }
230
231 #endif // defined(i386_HOST_ARCH)
232
233 /* ----------------------------------------------------------------------------
234 x86-64 is almost the same as plain x86.
235
236 I've done it using entirely inline assembler, because I couldn't
237 get gcc to generate the correct subtraction from %rsp by using
238 the local array variable trick. It didn't seem to reserve
239 enough space. Oh well, it's not much harder this way.
240 ------------------------------------------------------------------------- */
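
/*
 * For reference, the "local array variable trick" mentioned above is the
 * approach the SPARC version further down still uses, roughly (a sketch
 * only):
 *
 *     StgRegTable *StgRun(StgFunPtr f, StgRegTable *basereg) {
 *         unsigned char space[RESERVED_C_STACK_BYTES];
 *         ...   // save registers, call f, land at StgReturn
 *     }
 *
 * i.e. the C compiler is left to perform the %rsp subtraction by virtue of
 * the large local array.
 */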
241
242 #if defined(x86_64_HOST_ARCH)
243
244 #define STG_GLOBAL ".globl "
245
246 #if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
247 #define STG_HIDDEN ".private_extern "
248 #else
249 #define STG_HIDDEN ".hidden "
250 #endif
251
252 /*
253 Note [Unwinding foreign exports on x86-64]
254 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 255 For foreign exports, that is, Haskell functions exported as C functions, when
 256 we unwind we have to unwind from Haskell code into C code. The current story
 257 is as follows:
258
 259 * The Haskell stack always has a stg_stop_thread_info frame at the bottom
260 * We annotate stg_stop_thread_info to unwind the instruction pointer to a
261 label inside StgRun called StgRunJmp. It's the last instruction before the
262 code jumps into Haskell.
 263 * StgRun - which is implemented in assembler - is annotated with some manual
264 unwinding information. It unwinds all the registers that it has saved
265 on the stack. This is important as rsp and rbp are often required for
266 getting to the next frame and the rest of the saved registers are useful
267 when inspecting locals in gdb.
268
269
270 Example x86-64 stack for an FFI call
271 from C into a Haskell function:
272
273
274 HASKELL HEAP
275 "ADDRESS SPACE"
276
277 +--------------------+ <------ rbp
278 | |
279 | |
280 | |
281 | |
282 | Haskell |
283 | evaluation stack |
284 | |
285 | |
286 |--------------------|
287 |stg_catch_frame_info|
288 |--------------------|
289 | stg_forceIO_info |
290 |--------------------|
291 |stg_stop_thread_info| -------
292 +--------------------+ |
293 ... |
294 (other heap objects) |
295 ... |
296 |
297 |
298 |
299 C STACK "ADDRESS SPACE" |
300 v
301 +-----------------------------+ <------ rsp
302 | |
303 | RESERVED_C_STACK_BYTES ~16k |
304 | |
305 |-----------------------------|
306 | rbx ||
307 |-----------------------------| \
308 | rbp | |
309 |-----------------------------| \
310 | r12 | |
311 |-----------------------------| \
312 | r13 | | STG_RUN_STACK_FRAME_SIZE
313 |-----------------------------| /
314 | r14 | |
315 |-----------------------------| /
316 | r15 | |
317 |-----------------------------|/
318 | rip saved by call StgRun |
319 | in schedule() |
320 +-----------------------------+
321 ...
322 schedule() stack frame
323
324
325 Lower addresses on the top
326
327 One little snag in this approach is that the annotations accepted by the
328 assembler are surprisingly unexpressive. I had to resort to a .cfi_escape
329 and hand-assemble a DWARF expression. What made it worse was that big numbers
 330 are LEB128 encoded, which makes them variable in length, with the number of
 331 bytes depending on the magnitude.
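
   To make the encoding concrete, here is a worked example with a purely
   illustrative value (the real RSP_DELTA below depends on
   RESERVED_C_STACK_BYTES and STG_RUN_STACK_FRAME_SIZE): suppose
   RSP_DELTA = 16456. Then

       byte 1: (16456 & 127)        | 128  =  0x48 | 0x80  =  0xc8
       byte 2: ((16456 >> 7) & 127) | 128  =  0x00 | 0x80  =  0x80
       byte 3: (16456 >> 14) & 127         =  0x01

   Each byte carries 7 bits of the value, with the high bit set on every byte
   except the last. This is exactly what the "i" constraints at the end of the
   asm block below compute, one byte per operand.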
332
333 Here's an example stack generated this way:
334
335 Thread 1 "m" hit Breakpoint 1, Fib_zdfstableZZC0ZZCmainZZCFibZZCfib1_info () at Fib.hs:9
336 9 fib a = return (a + 1)
337 #0 Fib_zdfstableZZC0ZZCmainZZCFibZZCfib1_info () at Fib.hs:9
338 #1 stg_catch_frame_info () at rts/Exception.cmm:372
339 #2 stg_forceIO_info () at rts/StgStartup.cmm:178
340 #3 stg_stop_thread_info () at rts/StgStartup.cmm:42
341 #4 0x00000000007048ab in StgRunIsImplementedInAssembler () at rts/StgCRun.c:255
342 #5 0x00000000006fcf42 in schedule (initialCapability=initialCapability@entry=0x8adac0 <MainCapability>, task=task@entry=0x8cf2a0) at rts/Schedule.c:451
343 #6 0x00000000006fe18e in scheduleWaitThread (tso=0x4200006388, ret=<optimized out>, pcap=0x7fffffffdac0) at rts/Schedule.c:2533
344 #7 0x000000000040a21e in hs_fib ()
345 #8 0x000000000040a083 in main (argc=1, argv=0x7fffffffdc48) at m.cpp:15
346
347 (This is from patched gdb. See Note [Info Offset].)
348
349 The previous approach was to encode the unwinding information for select
 350 registers in stg_stop_thread_info with Cmm annotations. The unfortunate thing
 351 about that approach was that it required the introduction of an artificial
 352 MachSp register that wasn't meaningful outside unwinding. I discovered that to
 353 get stack unwinding working under the -threaded runtime I also needed to unwind
 354 rbp, which would have required adding MachRbp. If we wanted to see saved locals in gdb,
355 we'd have to add more. The core of the problem is that Cmm is architecture
356 independent, while unwinding isn't.
357
358 Note [Unwinding foreign imports]
359 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 360 For unwinding foreign imports, that is, C functions exposed as Haskell functions,
361 no special handling is required. The C function unwinds according to the rip
362 saved on the stack by the call instruction. Then we perform regular Haskell
363 stack unwinding.
364 */
365
366 /*
367 * gcc automatically inserts .cfi_startproc/.cfi_endproc directives around
 368 * inline assembler, but clang does not. This caused the build to fail with
369 * Clang (see #15207).
370 */
371
372 #if defined(__clang__)
373 #define NEED_EXPLICIT_CFI_START_END
374 #endif
375
376 static void GNUC3_ATTRIBUTE(used)
377 StgRunIsImplementedInAssembler(void)
378 {
379 __asm__ volatile (
380 /*
381 * save callee-saves registers on behalf of the STG code.
382 */
383 STG_GLOBAL STG_RUN "\n"
384 #if !defined(mingw32_HOST_OS)
385 STG_HIDDEN STG_RUN "\n"
386 #endif
387 STG_RUN ":\n\t"
388
389 #if defined(NEED_EXPLICIT_CFI_START_END)
390 ".cfi_startproc simple\n\t"
391 #endif
392 "subq %1, %%rsp\n\t"
393 "movq %%rsp, %%rax\n\t"
394 "subq %0, %%rsp\n\t"
395 "movq %%rbx,0(%%rax)\n\t"
396 "movq %%rbp,8(%%rax)\n\t"
397 "movq %%r12,16(%%rax)\n\t"
398 "movq %%r13,24(%%rax)\n\t"
399 "movq %%r14,32(%%rax)\n\t"
400 "movq %%r15,40(%%rax)\n\t"
401 #if defined(mingw32_HOST_OS)
402 /*
403 * Additional callee saved registers on Win64. This must match
404 * callClobberedRegisters in compiler/nativeGen/X86/Regs.hs as
405 * both represent the Win64 calling convention.
406 */
407 "movq %%rdi,48(%%rax)\n\t"
408 "movq %%rsi,56(%%rax)\n\t"
409 "movq %%xmm6, 64(%%rax)\n\t"
410 "movq %%xmm7, 72(%%rax)\n\t"
411 "movq %%xmm8, 80(%%rax)\n\t"
412 "movq %%xmm9, 88(%%rax)\n\t"
413 "movq %%xmm10, 96(%%rax)\n\t"
414 "movq %%xmm11,104(%%rax)\n\t"
415 "movq %%xmm12,112(%%rax)\n\t"
416 "movq %%xmm13,120(%%rax)\n\t"
417 "movq %%xmm14,128(%%rax)\n\t"
418 "movq %%xmm15,136(%%rax)\n\t"
419 #endif
420
421 /*
422 * Let the unwinder know where we saved the registers
423 * See Note [Unwinding foreign exports on x86-64].
424 */
425 ".cfi_def_cfa rsp, 0\n\t"
426 ".cfi_offset rbx, %c2\n\t"
427 ".cfi_offset rbp, %c3\n\t"
428 ".cfi_offset r12, %c4\n\t"
429 ".cfi_offset r13, %c5\n\t"
430 ".cfi_offset r14, %c6\n\t"
431 ".cfi_offset r15, %c7\n\t"
432 ".cfi_offset rip, %c8\n\t"
433 ".cfi_escape " // DW_CFA_val_expression is not expressible otherwise
434 "0x16, " // DW_CFA_val_expression
435 "0x07, " // register num 7 - rsp
436 "0x04, " // block length
437 "0x77, " // DW_OP_breg7 - signed LEB128 offset from rsp
438 #define RSP_DELTA (RESERVED_C_STACK_BYTES + STG_RUN_STACK_FRAME_SIZE + 8)
439 "%c9" // signed LEB128 encoded delta - byte 1
440 #if (RSP_DELTA >> 7) > 0
441 ", %c10" // signed LEB128 encoded delta - byte 2
442 #endif
443
444 #if (RSP_DELTA >> 14) > 0
445 ", %c11" // signed LEB128 encoded delta - byte 3
446 #endif
447
448 #if (RSP_DELTA >> 21) > 0
449 ", %c12" // signed LEB128 encoded delta - byte 4
450 #endif
451
452 #if (RSP_DELTA >> 28) > 0
453 #error "RSP_DELTA too big"
454 #endif
455 "\n\t"
456
457 /*
458 * Set BaseReg
459 */
460 #if defined(mingw32_HOST_OS)
461 "movq %%rdx,%%r13\n\t"
462 #else
463 "movq %%rsi,%%r13\n\t"
464 #endif
465 /*
466 * grab the function argument from the stack, and jump to it.
467 */
468 #if defined(mingw32_HOST_OS)
469 "movq %%rcx,%%rax\n\t"
470 #else
471 "movq %%rdi,%%rax\n\t"
472 #endif
473
474 STG_GLOBAL xstr(STG_RUN_JMP) "\n"
475 #if !defined(mingw32_HOST_OS)
476 STG_HIDDEN xstr(STG_RUN_JMP) "\n"
477 #endif
478 #if defined(NEED_EXPLICIT_CFI_START_END)
479 ".cfi_endproc\n\t"
480 #endif
481
482 #if HAVE_SUBSECTIONS_VIA_SYMBOLS
 483 // If we have dead-stripping enabled and a label is detected as unused,
 484 // the code gets nop'd out.
485 ".no_dead_strip " xstr(STG_RUN_JMP) "\n"
486 #endif
487 xstr(STG_RUN_JMP) ":\n\t"
488 "jmp *%%rax\n\t"
489
490 ".globl " STG_RETURN "\n"
491 STG_RETURN ":\n\t"
492
493 "movq %%rbx, %%rax\n\t" /* Return value in R1 */
494
495 /*
496 * restore callee-saves registers. (Don't stomp on %%rax!)
497 */
498 "addq %0, %%rsp\n\t"
499 "movq 0(%%rsp),%%rbx\n\t" /* restore the registers saved above */
500 "movq 8(%%rsp),%%rbp\n\t"
501 "movq 16(%%rsp),%%r12\n\t"
502 "movq 24(%%rsp),%%r13\n\t"
503 "movq 32(%%rsp),%%r14\n\t"
504 "movq 40(%%rsp),%%r15\n\t"
505 #if defined(mingw32_HOST_OS)
506 "movq 48(%%rsp),%%rdi\n\t"
507 "movq 56(%%rsp),%%rsi\n\t"
508 "movq 64(%%rsp),%%xmm6\n\t"
509 "movq 72(%%rax),%%xmm7\n\t"
510 "movq 80(%%rax),%%xmm8\n\t"
511 "movq 88(%%rax),%%xmm9\n\t"
512 "movq 96(%%rax),%%xmm10\n\t"
513 "movq 104(%%rax),%%xmm11\n\t"
514 "movq 112(%%rax),%%xmm12\n\t"
515 "movq 120(%%rax),%%xmm13\n\t"
516 "movq 128(%%rax),%%xmm14\n\t"
517 "movq 136(%%rax),%%xmm15\n\t"
518 #endif
519 "addq %1, %%rsp\n\t"
520 "retq\n\t"
521
522 :
523 : "i"(RESERVED_C_STACK_BYTES),
524 "i"(STG_RUN_STACK_FRAME_SIZE /* stack frame size */),
525 "i"(RESERVED_C_STACK_BYTES /* rbx relative to cfa (rsp) */),
526 "i"(RESERVED_C_STACK_BYTES + 8 /* rbp relative to cfa (rsp) */),
527 "i"(RESERVED_C_STACK_BYTES + 16 /* r12 relative to cfa (rsp) */),
528 "i"(RESERVED_C_STACK_BYTES + 24 /* r13 relative to cfa (rsp) */),
529 "i"(RESERVED_C_STACK_BYTES + 32 /* r14 relative to cfa (rsp) */),
530 "i"(RESERVED_C_STACK_BYTES + 40 /* r15 relative to cfa (rsp) */),
531 "i"(RESERVED_C_STACK_BYTES + STG_RUN_STACK_FRAME_SIZE
532 /* rip relative to cfa */),
533 "i"((RSP_DELTA & 127) | (128 * ((RSP_DELTA >> 7) > 0)))
534 /* signed LEB128-encoded delta from rsp - byte 1 */
535 #if (RSP_DELTA >> 7) > 0
536 , "i"(((RSP_DELTA >> 7) & 127) | (128 * ((RSP_DELTA >> 14) > 0)))
537 /* signed LEB128-encoded delta from rsp - byte 2 */
538 #endif
539
540 #if (RSP_DELTA >> 14) > 0
541 , "i"(((RSP_DELTA >> 14) & 127) | (128 * ((RSP_DELTA >> 21) > 0)))
542 /* signed LEB128-encoded delta from rsp - byte 3 */
543 #endif
544
545 #if (RSP_DELTA >> 21) > 0
546 , "i"(((RSP_DELTA >> 21) & 127) | (128 * ((RSP_DELTA >> 28) > 0)))
547 /* signed LEB128-encoded delta from rsp - byte 4 */
548 #endif
549 #undef RSP_DELTA
550 );
551 /*
552 * See Note [Stack Alignment on X86]
553 */
554 }
555
556 #endif /* x86-64 */
557
558 /* -----------------------------------------------------------------------------
559 Sparc architecture
560
561 --
562 OLD COMMENT from GHC-3.02:
563
564 We want tailjumps to be calls, because `call xxx' is the only Sparc
565 branch that allows an arbitrary label as a target. (Gcc's ``goto
566 *target'' construct ends up loading the label into a register and
567 then jumping, at the cost of two extra instructions for the 32-bit
568 load.)
569
570 When entering the threaded world, we stash our return address in a
571 known location so that \tr{%i7} is available as an extra
572 callee-saves register. Of course, we have to restore this when
573 coming out of the threaded world.
574
575 I hate this god-forsaken architecture. Since the top of the
576 reserved stack space is used for globals and the bottom is reserved
577 for outgoing arguments, we have to stick our return address
578 somewhere in the middle. Currently, I'm allowing 100 extra
579 outgoing arguments beyond the first 6. --JSM
580
581 Updated info (GHC 4.06): we don't appear to use %i7 any more, so
 582 I'm not sure whether we still need to save it. Incidentally, what
583 does the last paragraph above mean when it says "the top of the
584 stack is used for globals"? What globals? --SDM
585
586 Updated info (GHC 4.08.2): not saving %i7 any more (see below).
587 -------------------------------------------------------------------------- */
588
589 #if defined(sparc_HOST_ARCH)
590
591 StgRegTable *
592 StgRun(StgFunPtr f, StgRegTable *basereg) {
593
594 unsigned char space[RESERVED_C_STACK_BYTES];
595 #if 0
596 register void *i7 __asm__("%i7");
597 ((void **)(space))[100] = i7;
598 #endif
599 f();
600 __asm__ volatile (
601 ".align 4\n"
602 ".global " STG_RETURN "\n"
603 STG_RETURN ":"
604 : : "p" (space) : "l0","l1","l2","l3","l4","l5","l6","l7");
605 /* we tell the C compiler that l0-l7 are clobbered on return to
606 * StgReturn, otherwise it tries to use these to save eg. the
607 * address of space[100] across the call. The correct thing
608 * to do would be to save all the callee-saves regs, but we
609 * can't be bothered to do that.
610 *
611 * We also explicitly mark space as used since gcc eliminates it
612 * otherwise.
613 *
614 * The code that gcc generates for this little fragment is now
615 * terrible. We could do much better by coding it directly in
616 * assembler.
617 */
618 #if 0
619 /* updated 4.08.2: we don't save %i7 in the middle of the reserved
620 * space any more, since gcc tries to save its address across the
621 * call to f(), this gets clobbered in STG land and we end up
622 * dereferencing a bogus pointer in StgReturn.
623 */
624 __asm__ volatile ("ld %1,%0"
625 : "=r" (i7) : "m" (((void **)(space))[100]));
626 #endif
627 return (StgRegTable *)R1.i;
628 }
629
630 #endif
631
632 /* -----------------------------------------------------------------------------
633 PowerPC architecture
634
635 Everything is in assembler, so we don't have to deal with GCC...
636 -------------------------------------------------------------------------- */
637
638 #if defined(powerpc_HOST_ARCH)
639
640 #define STG_GLOBAL ".globl "
641
642 #if defined(darwin_HOST_OS)
643 #define STG_HIDDEN ".private_extern "
644 #else
645 #define STG_HIDDEN ".hidden "
646 #endif
647
648 #if defined(aix_HOST_OS)
649
650 // implementation is in StgCRunAsm.S
651
652 #elif defined(darwin_HOST_OS)
653 void StgRunIsImplementedInAssembler(void)
654 {
655 #if HAVE_SUBSECTIONS_VIA_SYMBOLS
 656 // if the toolchain supports dead-stripping, we have to
 657 // prevent it here (it tends to get confused by this function).
658 __asm__ volatile (".no_dead_strip _StgRunIsImplementedInAssembler\n");
659 #endif
660 __asm__ volatile (
661 STG_GLOBAL STG_RUN "\n"
662 STG_HIDDEN STG_RUN "\n"
663 STG_RUN ":\n"
664 "\tmflr r0\n"
665 "\tbl saveFP # f14\n"
666 "\tstmw r13,-220(r1)\n"
667 "\tstwu r1,-%0(r1)\n"
668 "\tmr r27,r4\n" // BaseReg == r27
669 "\tmtctr r3\n"
670 "\tmr r12,r3\n"
671 "\tbctr\n"
672 ".globl _StgReturn\n"
673 "_StgReturn:\n"
674 "\tmr r3,r14\n"
675 "\tla r1,%0(r1)\n"
676 "\tlmw r13,-220(r1)\n"
677 "\tb restFP # f14\n"
678 : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
679 }
680 #else
681
682 // This version is for PowerPC Linux.
683
684 // Differences from the Darwin/Mac OS X version:
685 // *) Different Assembler Syntax
686 // *) Doesn't use Register Saving Helper Functions (although they exist somewhere)
687 // *) We may not access positive stack offsets
688 // (no "Red Zone" as in the Darwin ABI)
689 // *) The Link Register is saved to a different offset in the caller's stack frame
690 // (Linux: 4(r1), Darwin 8(r1))
691
692 static void GNUC3_ATTRIBUTE(used)
693 StgRunIsImplementedInAssembler(void)
694 {
695 __asm__ volatile (
696 "\t.globl StgRun\n"
697 "\t.hidden StgRun\n"
698 "\t.type StgRun,@function\n"
699 "StgRun:\n"
700 "\tmflr 0\n"
701 "\tstw 0,4(1)\n"
702 "\tmr 5,1\n"
703 "\tstwu 1,-%0(1)\n"
704 "\tstmw 13,-220(5)\n"
705 "\tstfd 14,-144(5)\n"
706 "\tstfd 15,-136(5)\n"
707 "\tstfd 16,-128(5)\n"
708 "\tstfd 17,-120(5)\n"
709 "\tstfd 18,-112(5)\n"
710 "\tstfd 19,-104(5)\n"
711 "\tstfd 20,-96(5)\n"
712 "\tstfd 21,-88(5)\n"
713 "\tstfd 22,-80(5)\n"
714 "\tstfd 23,-72(5)\n"
715 "\tstfd 24,-64(5)\n"
716 "\tstfd 25,-56(5)\n"
717 "\tstfd 26,-48(5)\n"
718 "\tstfd 27,-40(5)\n"
719 "\tstfd 28,-32(5)\n"
720 "\tstfd 29,-24(5)\n"
721 "\tstfd 30,-16(5)\n"
722 "\tstfd 31,-8(5)\n"
723 "\tmr 27,4\n" // BaseReg == r27
724 "\tmtctr 3\n"
725 "\tmr 12,3\n"
726 "\tbctr\n"
727 ".globl StgReturn\n"
728 "\t.type StgReturn,@function\n"
729 "StgReturn:\n"
730 "\tmr 3,14\n"
731 "\tla 5,%0(1)\n"
732 "\tlmw 13,-220(5)\n"
733 "\tlfd 14,-144(5)\n"
734 "\tlfd 15,-136(5)\n"
735 "\tlfd 16,-128(5)\n"
736 "\tlfd 17,-120(5)\n"
737 "\tlfd 18,-112(5)\n"
738 "\tlfd 19,-104(5)\n"
739 "\tlfd 20,-96(5)\n"
740 "\tlfd 21,-88(5)\n"
741 "\tlfd 22,-80(5)\n"
742 "\tlfd 23,-72(5)\n"
743 "\tlfd 24,-64(5)\n"
744 "\tlfd 25,-56(5)\n"
745 "\tlfd 26,-48(5)\n"
746 "\tlfd 27,-40(5)\n"
747 "\tlfd 28,-32(5)\n"
748 "\tlfd 29,-24(5)\n"
749 "\tlfd 30,-16(5)\n"
750 "\tlfd 31,-8(5)\n"
751 "\tmr 1,5\n"
752 "\tlwz 0,4(1)\n"
753 "\tmtlr 0\n"
754 "\tblr\n"
755 : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
756 }
757 #endif
758
759 #endif
760
761 /* -----------------------------------------------------------------------------
762 PowerPC 64 architecture
763
764 Everything is in assembler, so we don't have to deal with GCC...
765 -------------------------------------------------------------------------- */
766
767 #if defined(powerpc64_HOST_ARCH)
768
769 #if defined(linux_HOST_OS)
770 static void GNUC3_ATTRIBUTE(used)
771 StgRunIsImplementedInAssembler(void)
772 {
773 // r0 volatile
774 // r1 stack pointer
775 // r2 toc - needs to be saved
776 // r3-r10 argument passing, volatile
777 // r11, r12 very volatile (not saved across cross-module calls)
778 // r13 thread local state (never modified, don't need to save)
779 // r14-r31 callee-save
780 __asm__ volatile (
781 ".section \".opd\",\"aw\"\n"
782 ".align 3\n"
783 ".globl StgRun\n"
784 ".hidden StgRun\n"
785 "StgRun:\n"
786 "\t.quad\t.StgRun,.TOC.@tocbase,0\n"
787 "\t.size StgRun,24\n"
788 ".globl StgReturn\n"
789 "StgReturn:\n"
790 "\t.quad\t.StgReturn,.TOC.@tocbase,0\n"
791 "\t.size StgReturn,24\n"
792 ".previous\n"
793 ".globl .StgRun\n"
794 ".type .StgRun,@function\n"
795 ".StgRun:\n"
796 "\tmflr 0\n"
797 "\tmr 5, 1\n"
798 "\tstd 0, 16(1)\n"
799 "\tstdu 1, -%0(1)\n"
800 "\tstd 2, -296(5)\n"
801 "\tstd 14, -288(5)\n"
802 "\tstd 15, -280(5)\n"
803 "\tstd 16, -272(5)\n"
804 "\tstd 17, -264(5)\n"
805 "\tstd 18, -256(5)\n"
806 "\tstd 19, -248(5)\n"
807 "\tstd 20, -240(5)\n"
808 "\tstd 21, -232(5)\n"
809 "\tstd 22, -224(5)\n"
810 "\tstd 23, -216(5)\n"
811 "\tstd 24, -208(5)\n"
812 "\tstd 25, -200(5)\n"
813 "\tstd 26, -192(5)\n"
814 "\tstd 27, -184(5)\n"
815 "\tstd 28, -176(5)\n"
816 "\tstd 29, -168(5)\n"
817 "\tstd 30, -160(5)\n"
818 "\tstd 31, -152(5)\n"
819 "\tstfd 14, -144(5)\n"
820 "\tstfd 15, -136(5)\n"
821 "\tstfd 16, -128(5)\n"
822 "\tstfd 17, -120(5)\n"
823 "\tstfd 18, -112(5)\n"
824 "\tstfd 19, -104(5)\n"
825 "\tstfd 20, -96(5)\n"
826 "\tstfd 21, -88(5)\n"
827 "\tstfd 22, -80(5)\n"
828 "\tstfd 23, -72(5)\n"
829 "\tstfd 24, -64(5)\n"
830 "\tstfd 25, -56(5)\n"
831 "\tstfd 26, -48(5)\n"
832 "\tstfd 27, -40(5)\n"
833 "\tstfd 28, -32(5)\n"
834 "\tstfd 29, -24(5)\n"
835 "\tstfd 30, -16(5)\n"
836 "\tstfd 31, -8(5)\n"
837 "\tmr 27, 4\n" // BaseReg == r27
838 "\tld 2, 8(3)\n"
839 "\tld 3, 0(3)\n"
840 "\tmtctr 3\n"
841 "\tbctr\n"
842 ".globl .StgReturn\n"
843 ".type .StgReturn,@function\n"
844 ".StgReturn:\n"
845 "\tmr 3,14\n"
846 "\tla 5, %0(1)\n" // load address == addi r5, r1, %0
847 "\tld 2, -296(5)\n"
848 "\tld 14, -288(5)\n"
849 "\tld 15, -280(5)\n"
850 "\tld 16, -272(5)\n"
851 "\tld 17, -264(5)\n"
852 "\tld 18, -256(5)\n"
853 "\tld 19, -248(5)\n"
854 "\tld 20, -240(5)\n"
855 "\tld 21, -232(5)\n"
856 "\tld 22, -224(5)\n"
857 "\tld 23, -216(5)\n"
858 "\tld 24, -208(5)\n"
859 "\tld 25, -200(5)\n"
860 "\tld 26, -192(5)\n"
861 "\tld 27, -184(5)\n"
862 "\tld 28, -176(5)\n"
863 "\tld 29, -168(5)\n"
864 "\tld 30, -160(5)\n"
865 "\tld 31, -152(5)\n"
866 "\tlfd 14, -144(5)\n"
867 "\tlfd 15, -136(5)\n"
868 "\tlfd 16, -128(5)\n"
869 "\tlfd 17, -120(5)\n"
870 "\tlfd 18, -112(5)\n"
871 "\tlfd 19, -104(5)\n"
872 "\tlfd 20, -96(5)\n"
873 "\tlfd 21, -88(5)\n"
874 "\tlfd 22, -80(5)\n"
875 "\tlfd 23, -72(5)\n"
876 "\tlfd 24, -64(5)\n"
877 "\tlfd 25, -56(5)\n"
878 "\tlfd 26, -48(5)\n"
879 "\tlfd 27, -40(5)\n"
880 "\tlfd 28, -32(5)\n"
881 "\tlfd 29, -24(5)\n"
882 "\tlfd 30, -16(5)\n"
883 "\tlfd 31, -8(5)\n"
884 "\tmr 1, 5\n"
885 "\tld 0, 16(1)\n"
886 "\tmtlr 0\n"
887 "\tblr\n"
888 : : "i"(RESERVED_C_STACK_BYTES+304 /*stack frame size*/));
889 }
890
891 #else // linux_HOST_OS
 892 #error Only Linux is supported for powerpc64 right now.
893 #endif
894
895 #endif
896
897 #if defined(powerpc64le_HOST_ARCH)
898 /* -----------------------------------------------------------------------------
899 PowerPC 64 little endian architecture
900
901 Really everything is in assembler, so we don't have to deal with GCC...
902 -------------------------------------------------------------------------- */
903 #endif
904
905 /* -----------------------------------------------------------------------------
906 ARM architecture
907 -------------------------------------------------------------------------- */
908
909 #if defined(arm_HOST_ARCH)
910
911 #if defined(__thumb__)
912 #define THUMB_FUNC ".thumb\n\t.thumb_func\n\t"
913 #else
914 #define THUMB_FUNC
915 #endif
916
917 StgRegTable *
918 StgRun(StgFunPtr f, StgRegTable *basereg) {
919 StgRegTable * r;
920 __asm__ volatile (
921 /*
922 * save callee-saves registers on behalf of the STG code.
923 */
924 "stmfd sp!, {r4-r11, ip, lr}\n\t"
925 #if !defined(arm_HOST_ARCH_PRE_ARMv6)
926 "vstmdb sp!, {d8-d11}\n\t"
927 #endif
928 /*
929 * allocate some space for Stg machine's temporary storage.
930 * Note: RESERVED_C_STACK_BYTES has to be a round number here or
931 * the assembler can't assemble it.
932 */
933 "sub sp, sp, %3\n\t"
934 /*
935 * Set BaseReg
936 */
937 "mov r4, %2\n\t"
938 /*
939 * Jump to function argument.
940 */
941 "bx %1\n\t"
942
943 ".globl " STG_RETURN "\n\t"
944 THUMB_FUNC
945 #if !defined(ios_HOST_OS)
946 ".type " STG_RETURN ", %%function\n"
947 #endif
948 STG_RETURN ":\n\t"
949 /*
950 * Free the space we allocated
951 */
952 "add sp, sp, %3\n\t"
953 /*
954 * Return the new register table, taking it from Stg's R1 (ARM's R7).
955 */
956 "mov %0, r7\n\t"
957 /*
958 * restore callee-saves registers.
959 */
960 #if !defined(arm_HOST_ARCH_PRE_ARMv6)
961 "vldmia sp!, {d8-d11}\n\t"
962 #endif
963 "ldmfd sp!, {r4-r11, ip, lr}\n\t"
964 : "=r" (r)
965 : "r" (f), "r" (basereg), "i" (RESERVED_C_STACK_BYTES)
966 #if !defined(__thumb__)
967 /* In ARM mode, r11/fp is frame-pointer and so we cannot mark
 968 it as clobbered. If we do so, GCC complains with an error. */
969 : "%r4", "%r5", "%r6", "%r7", "%r8", "%r9", "%r10", "%ip", "%lr"
970 #else
971 /* In Thumb mode r7 is frame-pointer and so we cannot mark it
 972 as clobbered. On the other hand, we also mark as clobbered
 973 those regs not used in Thumb mode. It is hard to judge whether this is
 974 needed, but Haskell code certainly uses them for
 975 placing GHC's virtual registers there; see
 976 includes/stg/MachRegs.h. Please note that Haskell code is
 977 compiled by GHC/LLVM into ARM code (not Thumb!), at least
 978 as of February 2012. */
979 : "%r4", "%r5", "%r6", "%r8", "%r9", "%r10", "%11", "%ip", "%lr"
980 #endif
981 );
982 return r;
983 }
984 #endif
985
986 #if defined(aarch64_HOST_ARCH)
987
988 StgRegTable *
989 StgRun(StgFunPtr f, StgRegTable *basereg) {
990 StgRegTable * r;
991 __asm__ volatile (
992 /*
993 * Save callee-saves registers on behalf of the STG code.
994 * Floating point registers only need the bottom 64 bits preserved.
 995 * We need to use the names x16, x17, x29 and x30 instead of ip0,
 996 * ip1, fp and lr because one of clang and gcc doesn't understand
 997 * the latter names.
998 */
999 "stp x29, x30, [sp, #-16]!\n\t"
1000 "mov x29, sp\n\t"
1001 "stp x16, x17, [sp, #-16]!\n\t"
1002 "stp x19, x20, [sp, #-16]!\n\t"
1003 "stp x21, x22, [sp, #-16]!\n\t"
1004 "stp x23, x24, [sp, #-16]!\n\t"
1005 "stp x25, x26, [sp, #-16]!\n\t"
1006 "stp x27, x28, [sp, #-16]!\n\t"
1007 "stp d8, d9, [sp, #-16]!\n\t"
1008 "stp d10, d11, [sp, #-16]!\n\t"
1009 "stp d12, d13, [sp, #-16]!\n\t"
1010 "stp d14, d15, [sp, #-16]!\n\t"
1011
1012 /*
1013 * allocate some space for Stg machine's temporary storage.
1014 * Note: RESERVED_C_STACK_BYTES has to be a round number here or
1015 * the assembler can't assemble it.
1016 */
1017 "sub sp, sp, %3\n\t"
1018 /*
1019 * Set BaseReg
1020 */
1021 "mov x19, %2\n\t"
1022 /*
1023 * Jump to function argument.
1024 */
1025 "br %1\n\t"
1026
1027 ".globl " STG_RETURN "\n\t"
1028 #if !defined(ios_HOST_OS)
1029 ".type " STG_RETURN ", %%function\n"
1030 #endif
1031 STG_RETURN ":\n\t"
1032 /*
1033 * Free the space we allocated
1034 */
1035 "add sp, sp, %3\n\t"
1036 /*
 1037 * Return the new register table, taking it from Stg's R1 (AArch64's x22).
1038 */
1039 "mov %0, x22\n\t"
1040 /*
1041 * restore callee-saves registers.
1042 */
1043
1044 "ldp d14, d15, [sp], #16\n\t"
1045 "ldp d12, d13, [sp], #16\n\t"
1046 "ldp d10, d11, [sp], #16\n\t"
1047 "ldp d8, d9, [sp], #16\n\t"
1048 "ldp x27, x28, [sp], #16\n\t"
1049 "ldp x25, x26, [sp], #16\n\t"
1050 "ldp x23, x24, [sp], #16\n\t"
1051 "ldp x21, x22, [sp], #16\n\t"
1052 "ldp x19, x20, [sp], #16\n\t"
1053 "ldp x16, x17, [sp], #16\n\t"
1054 "ldp x29, x30, [sp], #16\n\t"
1055
1056 : "=r" (r)
1057 : "r" (f), "r" (basereg), "i" (RESERVED_C_STACK_BYTES)
1058 : "%x19", "%x20", "%x21", "%x22", "%x23", "%x24", "%x25", "%x26", "%x27", "%x28",
1059 "%x16", "%x17", "%x30"
1060 );
1061 return r;
1062 }
1063
1064 #endif
1065
1066 #endif /* !USE_MINIINTERPRETER */