CNF: Silence pointer fix-up message unless gc debugging is enabled
[ghc.git] / rts / StgCRun.c
index 98116ab..2166249 100644 (file)
@@ -1,82 +1,70 @@
 /* -----------------------------------------------------------------------------
  *
- * (c) The GHC Team, 1998-2003
+ * (c) The GHC Team, 1998-2011
  *
  * STG-to-C glue.
  *
  * To run an STG function from C land, call
  *
- *             rv = StgRun(f,BaseReg);
+ *              rv = StgRun(f,BaseReg);
  *
  * where "f" is the STG function to call, and BaseReg is the address of the
  * RegTable for this run (we might have separate RegTables if we're running
  * multiple threads on an SMP machine).
  *
- * In the end, "f" must JMP to StgReturn (defined below),
- * passing the return-value "rv" in R1,
- * to return to the caller of StgRun returning "rv" in
+ * In the end, "f" must JMP to StgReturn (defined below), passing the
+ * return-value "rv" in R1, to return to the caller of StgRun returning "rv" in
  * the whatever way C returns a value.
  *
- * NOTE: StgRun/StgReturn do *NOT* load or store Hp or any
- * other registers (other than saving the C callee-saves
- * registers).  Instead, the called function "f" must do that
- * in STG land.
+ * NOTE: StgRun/StgReturn do *NOT* load or store Hp or any other registers
+ * (other than saving the C callee-saves registers). Instead, the called
+ * function "f" must do that in STG land.
  *
- * GCC will have assumed that pushing/popping of C-stack frames is
- * going on when it generated its code, and used stack space
- * accordingly.  However, we actually {\em post-process away} all
- * such stack-framery (see \tr{ghc/driver/ghc-asm.lprl}). Things will
- * be OK however, if we initially make sure there are
- * @RESERVED_C_STACK_BYTES@ on the C-stack to begin with, for local
- * variables.
+ * We also initially make sure that there are @RESERVED_C_STACK_BYTES@ on the
+ * C-stack. This is done to reserve some space for the allocation of
+ * temporaries in STG code.
  *
  * -------------------------------------------------------------------------- */
 
 #include "PosixSource.h"
-
-
-/*
- * We define the following (unused) global register variables, because for
- * some reason gcc generates sub-optimal code for StgRun() on the Alpha
- * (unnecessarily saving extra registers on the stack) if we don't.
- *
- * Why do it at the top of this file, rather than near StgRun() below?  Because
- * gcc doesn't let us define global register variables after any function
- * definition has been read.  Any point after #include "Stg.h" would be too
- * late.
- *
- * We define alpha_EXTRA_CAREFUL here to save $s6, $f8 and $f9 -- registers
- * that we don't use but which are callee-save registers.  The __divq() routine
- * in libc.a clobbers $s6.
- */
 #include "ghcconfig.h"
-#ifdef alpha_HOST_ARCH
-#define alpha_EXTRA_CAREFUL
-register long   fake_ra __asm__("$26");
-register long   fake_gp __asm__("$29");
-#ifdef alpha_EXTRA_CAREFUL
-register long   fake_s6 __asm__("$15");
-register double fake_f8 __asm__("$f8");
-register double fake_f9 __asm__("$f9");
-#endif
-#endif
 
+#if defined(sparc_HOST_ARCH) || defined(USE_MINIINTERPRETER)
 /* include Stg.h first because we want real machine regs in here: we
  * have to get the value of R1 back from Stg land to C land intact.
  */
+
+/* We include windows.h very early, as on Win64 the CONTEXT type has
+   fields "R8", "R9" and "R10", which goes bad if we've already
+   #define'd those names for our own purposes (in stg/Regs.h) */
+#if defined(HAVE_WINDOWS_H)
+#include <windows.h>
+#endif
+
+#define IN_STGCRUN 1
 #include "Stg.h"
 #include "Rts.h"
+#else
+/* The other architectures do not require the actual register macro definitions
+ * here because they use hand written assembly to implement the StgRun
+ * function. Including Stg.h first will define the R1 values using GCC specific
+ * techniques, which we don't want for LLVM based C compilers. Since we don't
+ * actually need the real machine register definitions here, we include the
+ * headers in the opposite order to allow LLVM-based C compilers to work.
+ */
+#include "Rts.h"
+#include "Stg.h"
+#endif
+
 #include "StgRun.h"
-#include "RtsFlags.h"
-#include "OSThreads.h"
 #include "Capability.h"
 
-#ifdef DEBUG
+#if defined(DEBUG)
 #include "RtsUtils.h"
 #include "Printer.h"
 #endif
 
-#ifdef USE_MINIINTERPRETER
+#if defined(USE_MINIINTERPRETER)
 
 /* -----------------------------------------------------------------------------
    any architecture (using miniinterpreter)
@@ -85,14 +73,12 @@ register double fake_f9 __asm__("$f9");
 StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg STG_UNUSED)
 {
     while (f) {
-        /* XXX Disabled due to RtsFlags[]/RtsFlags mismatch
-       IF_DEBUG(interpreter,
-           debugBelch("Jumping to ");
-           printPtr((P_)f); fflush(stdout);
-           debugBelch("\n");
-           );
-        */
-       f = (StgFunPtr) (f)();
+        IF_DEBUG(interpreter,
+                 debugBelch("Jumping to ");
+                 printPtr((P_)f); fflush(stdout);
+                 debugBelch("\n");
+                 );
+        f = (StgFunPtr) (f)();
     }
     return (StgRegTable *)R1.p;
 }
@@ -104,82 +90,144 @@ StgFunPtr StgReturn(void)
 
 #else /* !USE_MINIINTERPRETER */
 
-#ifdef LEADING_UNDERSCORE
+#if defined(LEADING_UNDERSCORE)
+#define STG_RUN "_StgRun"
 #define STG_RETURN "_StgReturn"
 #else
+#define STG_RUN "StgRun"
 #define STG_RETURN "StgReturn"
 #endif
 
+#if defined(mingw32_HOST_OS)
+// On windows the stack has to be allocated 4k at a time, otherwise
+// we get a segfault.  The C compiler knows how to do this (it calls
+// _alloca()), so we make sure that we can allocate as much stack as
+// we need:
+StgWord8 *win32AllocStack(void)
+{
+    StgWord8 stack[RESERVED_C_STACK_BYTES + 16 + 12];
+    return stack;
+}
+#endif
+
 /* -----------------------------------------------------------------------------
    x86 architecture
    -------------------------------------------------------------------------- */
 
-#ifdef i386_HOST_ARCH
+#if defined(i386_HOST_ARCH)
 
-#ifdef darwin_TARGET_OS
+#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
 #define STG_GLOBAL ".globl "
+#define STG_HIDDEN ".private_extern "
 #else
 #define STG_GLOBAL ".global "
+#define STG_HIDDEN ".hidden "
 #endif
 
-StgRegTable *
-StgRun(StgFunPtr f, StgRegTable *basereg) {
-
-    unsigned char space[ RESERVED_C_STACK_BYTES + 4*sizeof(void *) ];
-    StgRegTable * r;
+/*
+ * Note [Stack Alignment on X86]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * On X86 (both 32bit and 64bit) we keep the stack aligned on function calls at
+ * a 16-byte boundary. This is done because on a number of architectures the
+ * ABI requires this (x64, Mac OSX 32bit/64bit) as well as interfacing with
+ * other libraries through the FFI.
+ *
+ * As part of this arrangement we must maintain the stack at a 16-byte boundary
+ * - word_size-bytes (so 16n - 4 for i386 and 16n - 8 for x64) on entry to a
+ * procedure since both GCC and LLVM expect this. This is because the stack
+ * should have been 16-byte boundary aligned and then a call made which pushes
+ * a return address onto the stack (so word_size more space used). In STG code
+ * we only jump to other STG procedures, so we maintain the 16n - word_size
+ * alignment for these jumps.
+ *
+ * This gives us binary compatibility with LLVM and GCC as well as dealing
+ * with the FFI. Previously we just maintained a 16n byte alignment for
+ * procedure entry and calls, which led to bugs (see #4211 and #5250).
+ *
+ * To change this convention you need to change the code here, and in
+ * compiler/nativeGen/X86/CodeGen.hs::GenCCall, and maybe the adjustor
+ * code for thunks in rts/AdjustorAsm.s, rts/Adjustor.c.
+ *
+ * A quick way to see if this is wrong is to compile this code:
+ *
+ *    main = System.Exit.exitWith ExitSuccess
+ *
+ * And run it with +RTS -sstderr.  The stats code in the RTS, in
+ * particular statsPrintf(), relies on the stack alignment because
+ * it saves the %xmm regs on the stack, so it'll fall over if the
+ * stack isn't aligned, and calling exitWith from Haskell invokes
+ * shutdownHaskellAndExit using a C call.
+ *
+ * If you edit the sequence below be sure to update the unwinding information
+ * for stg_stop_thread in StgStartup.cmm.
+ */
 
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
     __asm__ volatile (
-       /*
-        * save callee-saves registers on behalf of the STG code.
-        */
-       "movl %%esp, %%eax\n\t"
-       "addl %4, %%eax\n\t"
+        STG_GLOBAL STG_RUN "\n"
+#if !defined(mingw32_HOST_OS)
+        STG_HIDDEN STG_RUN "\n"
+#endif
+        STG_RUN ":\n\t"
+
+        /*
+         * move %esp down to reserve an area for temporary storage
+         * during the execution of STG code.
+         *
+         * The stack pointer has to be aligned to a multiple of 16
+         * bytes from here - this is a requirement of the C ABI, so
+         * that C code can assign SSE2 registers directly to/from
+         * stack locations.
+         */
+        "subl %0, %%esp\n\t"
+
+        /*
+         * save callee-saves registers on behalf of the STG code.
+         */
+        "movl %%esp, %%eax\n\t"
+        "addl %0-16, %%eax\n\t"
         "movl %%ebx,0(%%eax)\n\t"
         "movl %%esi,4(%%eax)\n\t"
         "movl %%edi,8(%%eax)\n\t"
         "movl %%ebp,12(%%eax)\n\t"
-       /*
-        * Set BaseReg
-        */
-       "movl %3,%%ebx\n\t"
-       /*
-        * grab the function argument from the stack
-        */
-        "movl %2,%%eax\n\t"
-        
-       /*
-        * Darwin note:
-        * The stack pointer has to be aligned to a multiple of 16 bytes at
-        * this point. This works out correctly with gcc 4.0.1, but it might
-        * break at any time in the future. TODO: Make this future-proof.
-        */
-
-       /*
-        * jump to it
-        */
+        /*
+         * Set BaseReg
+         */
+        "movl 24(%%eax),%%ebx\n\t"
+        /*
+         * grab the function argument from the stack
+         */
+        "movl 20(%%eax),%%eax\n\t"
+        /*
+         * jump to it
+         */
         "jmp *%%eax\n\t"
 
-       STG_GLOBAL STG_RETURN "\n"
-               STG_RETURN ":\n\t"
+        STG_GLOBAL STG_RETURN "\n"
+        STG_RETURN ":\n\t"
 
-       "movl %%esi, %%eax\n\t"   /* Return value in R1  */
+        "movl %%esi, %%eax\n\t"   /* Return value in R1  */
 
-       /*
-        * restore callee-saves registers.  (Don't stomp on %%eax!)
-        */
-       "movl %%esp, %%edx\n\t"
-       "addl %4, %%edx\n\t"
-        "movl 0(%%edx),%%ebx\n\t"      /* restore the registers saved above */
+        /*
+         * restore callee-saves registers.  (Don't stomp on %%eax!)
+         */
+        "movl %%esp, %%edx\n\t"
+        "addl %0-16, %%edx\n\t"
+        "movl 0(%%edx),%%ebx\n\t"       /* restore the registers saved above */
         "movl 4(%%edx),%%esi\n\t"
         "movl 8(%%edx),%%edi\n\t"
         "movl 12(%%edx),%%ebp\n\t"
 
-      : "=&a" (r), "=m" (space)
-      : "m" (f), "m" (basereg), "i" (RESERVED_C_STACK_BYTES)
-      : "edx" /* stomps on %edx */
-    );
+        "addl %0, %%esp\n\t"
+        "ret"
 
-    return r;
+      : : "i" (RESERVED_C_STACK_BYTES + 16)
+        // + 16 to make room for the 4 registers we have to save
+        // See Note [Stack Alignment on X86]
+    );
 }
 
 #endif
@@ -191,95 +239,92 @@ StgRun(StgFunPtr f, StgRegTable *basereg) {
    get gcc to generate the correct subtraction from %rsp by using
    the local array variable trick.  It didn't seem to reserve
    enough space.  Oh well, it's not much harder this way.
-
    ------------------------------------------------------------------------- */
 
-#ifdef x86_64_HOST_ARCH
+#if defined(x86_64_HOST_ARCH)
 
-extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+#define STG_GLOBAL ".globl "
+
+#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
+#define STG_HIDDEN ".private_extern "
+#else
+#define STG_HIDDEN ".hidden "
+#endif
 
 static void GNUC3_ATTRIBUTE(used)
 StgRunIsImplementedInAssembler(void)
 {
     __asm__ volatile (
-       /*
-        * save callee-saves registers on behalf of the STG code.
-        */
-       ".globl StgRun\n"
-       "StgRun:\n\t"
-       "subq %0, %%rsp\n\t"
-       "movq %%rsp, %%rax\n\t"
-       "addq %0-48, %%rax\n\t"
+        /*
+         * save callee-saves registers on behalf of the STG code.
+         */
+        STG_GLOBAL STG_RUN "\n"
+#if !defined(mingw32_HOST_OS)
+        STG_HIDDEN STG_RUN "\n"
+#endif
+        STG_RUN ":\n\t"
+        "subq %1, %%rsp\n\t"
+        "movq %%rsp, %%rax\n\t"
+        "subq %0, %%rsp\n\t"
         "movq %%rbx,0(%%rax)\n\t"
         "movq %%rbp,8(%%rax)\n\t"
         "movq %%r12,16(%%rax)\n\t"
         "movq %%r13,24(%%rax)\n\t"
         "movq %%r14,32(%%rax)\n\t"
         "movq %%r15,40(%%rax)\n\t"
-       /*
-        * Set BaseReg
-        */
-       "movq %%rsi,%%r13\n\t"
-       /*
-        * grab the function argument from the stack, and jump to it.
-        */
+#if defined(mingw32_HOST_OS)
+        "movq %%rdi,48(%%rax)\n\t"
+        "movq %%rsi,56(%%rax)\n\t"
+        "movq %%xmm6,64(%%rax)\n\t"
+#endif
+        /*
+         * Set BaseReg
+         */
+#if defined(mingw32_HOST_OS)
+        "movq %%rdx,%%r13\n\t"
+#else
+        "movq %%rsi,%%r13\n\t"
+#endif
+        /*
+         * grab the function argument from its argument register, and jump to it.
+         */
+#if defined(mingw32_HOST_OS)
+        "movq %%rcx,%%rax\n\t"
+#else
         "movq %%rdi,%%rax\n\t"
+#endif
         "jmp *%%rax\n\t"
 
-       ".global " STG_RETURN "\n"
-               STG_RETURN ":\n\t"
-
-       "movq %%rbx, %%rax\n\t"   /* Return value in R1  */
-
-       /*
-        * restore callee-saves registers.  (Don't stomp on %%rax!)
-        */
-       "movq %%rsp, %%rdx\n\t"
-       "addq %0-48, %%rdx\n\t"
-        "movq 0(%%rdx),%%rbx\n\t"      /* restore the registers saved above */
-        "movq 8(%%rdx),%%rbp\n\t"
-        "movq 16(%%rdx),%%r12\n\t"
-        "movq 24(%%rdx),%%r13\n\t"
-        "movq 32(%%rdx),%%r14\n\t"
-        "movq 40(%%rdx),%%r15\n\t"
-       "addq %0, %%rsp\n\t"
-       "retq"
-
-       : : "i"(RESERVED_C_STACK_BYTES+48+8 /*stack frame size*/));
-    /* 
-       HACK alert!
-
-       The x86_64 ABI specifies that on a procedure call, %rsp is
-       aligned on a 16-byte boundary + 8.  That is, the first
-       argument on the stack after the return address will be
-       16-byte aligned.  
-       
-       Which should be fine: RESERVED_C_STACK_BYTES+48 is a multiple
-       of 16 bytes.  
-       
-       BUT... when we do a C-call from STG land, gcc likes to put the
-       stack alignment adjustment in the prolog.  eg. if we're calling
-       a function with arguments in regs, gcc will insert 'subq $8,%rsp'
-       in the prolog, to keep %rsp aligned (the return address is 8
-       bytes, remember).  The mangler throws away the prolog, so we
-       lose the stack alignment.
-
-       The hack is to add this extra 8 bytes to our %rsp adjustment
-       here, so that throughout STG code, %rsp is 16-byte aligned,
-       ready for a C-call.  
-
-       A quick way to see if this is wrong is to compile this code:
-
-          main = System.Exit.exitWith ExitSuccess
-
-       And run it with +RTS -sstderr.  The stats code in the RTS, in
-       particular statsPrintf(), relies on the stack alignment because
-       it saves the %xmm regs on the stack, so it'll fall over if the
-       stack isn't aligned, and calling exitWith from Haskell invokes
-       shutdownHaskellAndExit using a C call.
-
-       Future gcc releases will almost certainly break this hack...
-    */
+        ".globl " STG_RETURN "\n"
+         STG_RETURN ":\n\t"
+
+        "movq %%rbx, %%rax\n\t"   /* Return value in R1  */
+
+        /*
+         * restore callee-saves registers.  (Don't stomp on %%rax!)
+         */
+        "addq %0, %%rsp\n\t"
+        "movq 0(%%rsp),%%rbx\n\t"       /* restore the registers saved above */
+        "movq 8(%%rsp),%%rbp\n\t"
+        "movq 16(%%rsp),%%r12\n\t"
+        "movq 24(%%rsp),%%r13\n\t"
+        "movq 32(%%rsp),%%r14\n\t"
+        "movq 40(%%rsp),%%r15\n\t"
+#if defined(mingw32_HOST_OS)
+        "movq 48(%%rsp),%%rdi\n\t"
+        "movq 56(%%rsp),%%rsi\n\t"
+        "movq 64(%%rsp),%%xmm6\n\t"
+#endif
+        "addq %1, %%rsp\n\t"
+        "retq"
+
+        :
+        : "i"(RESERVED_C_STACK_BYTES),
+          "i"(STG_RUN_STACK_FRAME_SIZE /* stack frame size */)
+        );
+        /*
+         * See Note [Stack Alignment on X86]
+         */
 }
 
 #endif /* x86-64 */
@@ -315,7 +360,7 @@ StgRunIsImplementedInAssembler(void)
    Updated info (GHC 4.08.2): not saving %i7 any more (see below).
    -------------------------------------------------------------------------- */
 
-#ifdef sparc_HOST_ARCH
+#if defined(sparc_HOST_ARCH)
 
 StgRegTable *
 StgRun(StgFunPtr f, StgRegTable *basereg) {
@@ -327,10 +372,10 @@ StgRun(StgFunPtr f, StgRegTable *basereg) {
 #endif
     f();
     __asm__ volatile (
-           ".align 4\n"
-            ".global " STG_RETURN "\n"
-                   STG_RETURN ":"
-           : : "p" (space) : "l0","l1","l2","l3","l4","l5","l6","l7");
+                 ".align 4\n"
+                 ".global " STG_RETURN "\n"
+                 STG_RETURN ":"
+                 : : "p" (space) : "l0","l1","l2","l3","l4","l5","l6","l7");
     /* we tell the C compiler that l0-l7 are clobbered on return to
      * StgReturn, otherwise it tries to use these to save eg. the
      * address of space[100] across the call.  The correct thing
@@ -351,7 +396,7 @@ StgRun(StgFunPtr f, StgRegTable *basereg) {
      * dereferencing a bogus pointer in StgReturn.
      */
     __asm__ volatile ("ld %1,%0"
-                     : "=r" (i7) : "m" (((void **)(space))[100]));
+                                : "=r" (i7) : "m" (((void **)(space))[100]));
 #endif
     return (StgRegTable *)R1.i;
 }
@@ -359,261 +404,52 @@ StgRun(StgFunPtr f, StgRegTable *basereg) {
 #endif
 
 /* -----------------------------------------------------------------------------
-   alpha architecture
-
-   "The stack pointer (SP) must at all times denote an address that has octaword
-    alignment. (This restriction has the side effect that the in-memory portion
-    of the argument list, if any, will start on an octaword boundary.) Note that
-    the stack grows toward lower addresses. During a procedure invocation, SP
-    can never be set to a value that is higher than the value of SP at entry to
-    that procedure invocation.
-
-   "The contents of the stack, located above the portion of the argument list
-    (if any) that is passed in memory, belong to the calling procedure. Because
-    they are part of the calling procedure, they should not be read or written
-    by the called procedure, except as specified by indirect arguments or
-    language-controlled up-level references.
-
-   "The SP value might be used by the hardware when raising exceptions and
-    asynchronous interrupts. It must be assumed that the contents of the stack
-    below the current SP value and within the stack for the current thread are
-    continually and unpredictably modified, as specified in the _Alpha
-    Architecture Reference Manual_, and as a result of asynchronous software
-    actions."
-
-   -- Compaq Computer Corporation, Houston. Tru64 UNIX Calling Standard for
-      Alpha Systems, 5.1 edition, August 2000, section 3.2.1.  http://www.
-      tru64unix.compaq.com/docs/base_doc/DOCUMENTATION/V51_PDF/ARH9MBTE.PDF
-   -------------------------------------------------------------------------- */
-
-#ifdef alpha_HOST_ARCH
-
-StgRegTable *
-StgRun(StgFunPtr f, StgRegTable *basereg)
-{
-    register long   real_ra __asm__("$26"); volatile long   save_ra;
-    register long   real_gp __asm__("$29"); volatile long   save_gp;
-
-    register long   real_s0 __asm__("$9" ); volatile long   save_s0;
-    register long   real_s1 __asm__("$10"); volatile long   save_s1;
-    register long   real_s2 __asm__("$11"); volatile long   save_s2;
-    register long   real_s3 __asm__("$12"); volatile long   save_s3;
-    register long   real_s4 __asm__("$13"); volatile long   save_s4;
-    register long   real_s5 __asm__("$14"); volatile long   save_s5;
-#ifdef alpha_EXTRA_CAREFUL
-    register long   real_s6 __asm__("$15"); volatile long   save_s6;
-#endif
-
-    register double real_f2 __asm__("$f2"); volatile double save_f2;
-    register double real_f3 __asm__("$f3"); volatile double save_f3;
-    register double real_f4 __asm__("$f4"); volatile double save_f4;
-    register double real_f5 __asm__("$f5"); volatile double save_f5;
-    register double real_f6 __asm__("$f6"); volatile double save_f6;
-    register double real_f7 __asm__("$f7"); volatile double save_f7;
-#ifdef alpha_EXTRA_CAREFUL
-    register double real_f8 __asm__("$f8"); volatile double save_f8;
-    register double real_f9 __asm__("$f9"); volatile double save_f9;
-#endif
-
-    register StgFunPtr real_pv __asm__("$27");
-
-    StgRegTable * ret;
-
-    save_ra = real_ra;
-    save_gp = real_gp;
-
-    save_s0 = real_s0;
-    save_s1 = real_s1;
-    save_s2 = real_s2;
-    save_s3 = real_s3;
-    save_s4 = real_s4;
-    save_s5 = real_s5;
-#ifdef alpha_EXTRA_CAREFUL
-    save_s6 = real_s6;
-#endif
-
-    save_f2 = real_f2;
-    save_f3 = real_f3;
-    save_f4 = real_f4;
-    save_f5 = real_f5;
-    save_f6 = real_f6;
-    save_f7 = real_f7;
-#ifdef alpha_EXTRA_CAREFUL
-    save_f8 = real_f8;
-    save_f9 = real_f9;
-#endif
-
-    real_pv = f;
-
-    __asm__ volatile(  "lda $30,-%0($30)"      "\n"
-               "\t"    "jmp ($27)"             "\n"
-               "\t"    ".align 3"              "\n"
-               ".globl " STG_RETURN            "\n"
-               STG_RETURN ":"                  "\n"
-               "\t"    "lda $30,%0($30)"       "\n"
-               : : "K" (RESERVED_C_STACK_BYTES));
-
-    ret = real_s5;
-
-    real_s0 = save_s0;
-    real_s1 = save_s1;
-    real_s2 = save_s2;
-    real_s3 = save_s3;
-    real_s4 = save_s4;
-    real_s5 = save_s5;
-#ifdef alpha_EXTRA_CAREFUL
-    real_s6 = save_s6;
-#endif
-
-    real_f2 = save_f2;
-    real_f3 = save_f3;
-    real_f4 = save_f4;
-    real_f5 = save_f5;
-    real_f6 = save_f6;
-    real_f7 = save_f7;
-#ifdef alpha_EXTRA_CAREFUL
-    real_f8 = save_f8;
-    real_f9 = save_f9;
-#endif
-
-    real_ra = save_ra;
-    real_gp = save_gp;
-
-    return ret;
-}
-
-#endif /* alpha_HOST_ARCH */
+   PowerPC architecture
 
-/* -----------------------------------------------------------------------------
-   HP-PA architecture
+   Everything is in assembler, so we don't have to deal with GCC...
    -------------------------------------------------------------------------- */
 
-#ifdef hppa1_1_HOST_ARCH
-
-StgRegTable *
-StgRun(StgFunPtr f, StgRegTable *basereg)
-{
-    StgChar space[RESERVED_C_STACK_BYTES+16*sizeof(long)+10*sizeof(double)];
-    StgRegTable * ret;
-
-    __asm__ volatile ("ldo %0(%%r30),%%r19\n"
-                     "\tstw %%r3, 0(0,%%r19)\n"
-                      "\tstw %%r4, 4(0,%%r19)\n"
-                      "\tstw %%r5, 8(0,%%r19)\n"
-                      "\tstw %%r6,12(0,%%r19)\n"
-                      "\tstw %%r7,16(0,%%r19)\n"
-                      "\tstw %%r8,20(0,%%r19)\n"
-                      "\tstw %%r9,24(0,%%r19)\n"
-                     "\tstw %%r10,28(0,%%r19)\n"
-                      "\tstw %%r11,32(0,%%r19)\n"
-                      "\tstw %%r12,36(0,%%r19)\n"
-                      "\tstw %%r13,40(0,%%r19)\n"
-                      "\tstw %%r14,44(0,%%r19)\n"
-                      "\tstw %%r15,48(0,%%r19)\n"
-                      "\tstw %%r16,52(0,%%r19)\n"
-                      "\tstw %%r17,56(0,%%r19)\n"
-                      "\tstw %%r18,60(0,%%r19)\n"
-                     "\tldo 80(%%r19),%%r19\n"
-                     "\tfstds %%fr12,-16(0,%%r19)\n"
-                     "\tfstds %%fr13, -8(0,%%r19)\n"
-                     "\tfstds %%fr14,  0(0,%%r19)\n"
-                     "\tfstds %%fr15,  8(0,%%r19)\n"
-                     "\tldo 32(%%r19),%%r19\n"
-                     "\tfstds %%fr16,-16(0,%%r19)\n"
-                     "\tfstds %%fr17, -8(0,%%r19)\n"
-                     "\tfstds %%fr18,  0(0,%%r19)\n"
-                     "\tfstds %%fr19,  8(0,%%r19)\n"
-                     "\tldo 32(%%r19),%%r19\n"
-                     "\tfstds %%fr20,-16(0,%%r19)\n"
-                     "\tfstds %%fr21, -8(0,%%r19)\n" : :
-                      "n" (-(116 * sizeof(long) + 10 * sizeof(double))) : "%r19"
-                     );
-
-    f();
+#if defined(powerpc_HOST_ARCH)
 
-    __asm__ volatile (".align 4\n"
-                             "\t.EXPORT " STG_RETURN ",CODE\n"
-                     "\t.EXPORT " STG_RETURN ",ENTRY,PRIV_LEV=3\n"
-                      STG_RETURN "\n"
-                      /* "\tldo %0(%%r3),%%r19\n" */
-                      "\tldo %1(%%r30),%%r19\n"
-                      "\tcopy %%r11, %0\n"  /* save R1 */
-                     "\tldw  0(0,%%r19),%%r3\n"
-                      "\tldw  4(0,%%r19),%%r4\n"
-                      "\tldw  8(0,%%r19),%%r5\n"
-                      "\tldw 12(0,%%r19),%%r6\n"
-                      "\tldw 16(0,%%r19),%%r7\n"
-                      "\tldw 20(0,%%r19),%%r8\n"
-                      "\tldw 24(0,%%r19),%%r9\n"
-                     "\tldw 28(0,%%r19),%%r10\n"
-                      "\tldw 32(0,%%r19),%%r11\n"
-                      "\tldw 36(0,%%r19),%%r12\n"
-                      "\tldw 40(0,%%r19),%%r13\n"
-                      "\tldw 44(0,%%r19),%%r14\n"
-                      "\tldw 48(0,%%r19),%%r15\n"
-                      "\tldw 52(0,%%r19),%%r16\n"
-                      "\tldw 56(0,%%r19),%%r17\n"
-                      "\tldw 60(0,%%r19),%%r18\n"
-                     "\tldo 80(%%r19),%%r19\n"
-                     "\tfldds -16(0,%%r19),%%fr12\n"
-                     "\tfldds  -8(0,%%r19),%%fr13\n"
-                     "\tfldds   0(0,%%r19),%%fr14\n"
-                     "\tfldds   8(0,%%r19),%%fr15\n"
-                     "\tldo 32(%%r19),%%r19\n"
-                     "\tfldds -16(0,%%r19),%%fr16\n"
-                     "\tfldds  -8(0,%%r19),%%fr17\n"
-                     "\tfldds   0(0,%%r19),%%fr18\n"
-                     "\tfldds   8(0,%%r19),%%fr19\n"
-                     "\tldo 32(%%r19),%%r19\n"
-                     "\tfldds -16(0,%%r19),%%fr20\n"
-                     "\tfldds  -8(0,%%r19),%%fr21\n"
-                        : "=r" (ret)
-                        : "n" (-(116 * sizeof(long) + 10 * sizeof(double)))
-                        : "%r19"
-                     );
-
-    return ret;
-}
-
-#endif /* hppa1_1_HOST_ARCH */
-
-/* -----------------------------------------------------------------------------
-   PowerPC architecture
+#define STG_GLOBAL ".globl "
 
-   Everything is in assembler, so we don't have to deal with GCC...
-   
-   -------------------------------------------------------------------------- */
+#if defined(darwin_HOST_OS)
+#define STG_HIDDEN ".private_extern "
+#else
+#define STG_HIDDEN ".hidden "
+#endif
 
-#ifdef powerpc_HOST_ARCH
+#if defined(aix_HOST_OS)
 
-extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+// implementation is in StgCRunAsm.S
 
-#ifdef darwin_HOST_OS
+#elif defined(darwin_HOST_OS)
 void StgRunIsImplementedInAssembler(void)
 {
 #if HAVE_SUBSECTIONS_VIA_SYMBOLS
             // if the toolchain supports deadstripping, we have to
             // prevent it here (it tends to get confused here).
-        __asm__ volatile (".no_dead_strip _StgRunIsImplementedInAssembler");
-#endif
-       __asm__ volatile (
-               "\n.globl _StgRun\n"
-               "_StgRun:\n"
-               "\tmflr r0\n"
-               "\tbl saveFP # f14\n"
-               "\tstmw r13,-220(r1)\n"
-               "\tstwu r1,-%0(r1)\n"
+        __asm__ volatile (".no_dead_strip _StgRunIsImplementedInAssembler\n");
+#endif
+        __asm__ volatile (
+                STG_GLOBAL STG_RUN "\n"
+                STG_HIDDEN STG_RUN "\n"
+                STG_RUN ":\n"
+                "\tmflr r0\n"
+                "\tbl saveFP # f14\n"
+                "\tstmw r13,-220(r1)\n"
+                "\tstwu r1,-%0(r1)\n"
                 "\tmr r27,r4\n" // BaseReg == r27
-               "\tmtctr r3\n"
-               "\tmr r12,r3\n"
-               "\tbctr\n"
-               ".globl _StgReturn\n"
-               "_StgReturn:\n"
-               "\tmr r3,r14\n"
-               "\tla r1,%0(r1)\n"
-               "\tlmw r13,-220(r1)\n"
-               "\tb restFP # f14\n"
-       : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
+                "\tmtctr r3\n"
+                "\tmr r12,r3\n"
+                "\tbctr\n"
+                ".globl _StgReturn\n"
+                "_StgReturn:\n"
+                "\tmr r3,r14\n"
+                "\tla r1,%0(r1)\n"
+                "\tlmw r13,-220(r1)\n"
+                "\tb restFP # f14\n"
+        : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
 }
 #else
 
@@ -630,66 +466,67 @@ void StgRunIsImplementedInAssembler(void)
 static void GNUC3_ATTRIBUTE(used)
 StgRunIsImplementedInAssembler(void)
 {
-       __asm__ volatile (
-               "\t.globl StgRun\n"
-               "\t.type StgRun,@function\n"
-               "StgRun:\n"
-               "\tmflr 0\n"
-               "\tstw 0,4(1)\n"
-               "\tmr 5,1\n"
-               "\tstwu 1,-%0(1)\n"
-               "\tstmw 13,-220(5)\n"
-               "\tstfd 14,-144(5)\n"
-               "\tstfd 15,-136(5)\n"
-               "\tstfd 16,-128(5)\n"
-               "\tstfd 17,-120(5)\n"
-               "\tstfd 18,-112(5)\n"
-               "\tstfd 19,-104(5)\n"
-               "\tstfd 20,-96(5)\n"
-               "\tstfd 21,-88(5)\n"
-               "\tstfd 22,-80(5)\n"
-               "\tstfd 23,-72(5)\n"
-               "\tstfd 24,-64(5)\n"
-               "\tstfd 25,-56(5)\n"
-               "\tstfd 26,-48(5)\n"
-               "\tstfd 27,-40(5)\n"
-               "\tstfd 28,-32(5)\n"
-               "\tstfd 29,-24(5)\n"
-               "\tstfd 30,-16(5)\n"
-               "\tstfd 31,-8(5)\n"
-               "\tmr 27,4\n"  // BaseReg == r27
-               "\tmtctr 3\n"
-               "\tmr 12,3\n"
-               "\tbctr\n"
-               ".globl StgReturn\n"
-               "\t.type StgReturn,@function\n"
-               "StgReturn:\n"
-               "\tmr 3,14\n"
-               "\tla 5,%0(1)\n"
-               "\tlmw 13,-220(5)\n"
-               "\tlfd 14,-144(5)\n"
-               "\tlfd 15,-136(5)\n"
-               "\tlfd 16,-128(5)\n"
-               "\tlfd 17,-120(5)\n"
-               "\tlfd 18,-112(5)\n"
-               "\tlfd 19,-104(5)\n"
-               "\tlfd 20,-96(5)\n"
-               "\tlfd 21,-88(5)\n"
-               "\tlfd 22,-80(5)\n"
-               "\tlfd 23,-72(5)\n"
-               "\tlfd 24,-64(5)\n"
-               "\tlfd 25,-56(5)\n"
-               "\tlfd 26,-48(5)\n"
-               "\tlfd 27,-40(5)\n"
-               "\tlfd 28,-32(5)\n"
-               "\tlfd 29,-24(5)\n"
-               "\tlfd 30,-16(5)\n"
-               "\tlfd 31,-8(5)\n"
-               "\tmr 1,5\n"
-               "\tlwz 0,4(1)\n"
-               "\tmtlr 0\n"
-               "\tblr\n"
-       : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
+        __asm__ volatile (
+                "\t.globl StgRun\n"
+                "\t.hidden StgRun\n"
+                "\t.type StgRun,@function\n"
+                "StgRun:\n"
+                "\tmflr 0\n"
+                "\tstw 0,4(1)\n"
+                "\tmr 5,1\n"
+                "\tstwu 1,-%0(1)\n"
+                "\tstmw 13,-220(5)\n"
+                "\tstfd 14,-144(5)\n"
+                "\tstfd 15,-136(5)\n"
+                "\tstfd 16,-128(5)\n"
+                "\tstfd 17,-120(5)\n"
+                "\tstfd 18,-112(5)\n"
+                "\tstfd 19,-104(5)\n"
+                "\tstfd 20,-96(5)\n"
+                "\tstfd 21,-88(5)\n"
+                "\tstfd 22,-80(5)\n"
+                "\tstfd 23,-72(5)\n"
+                "\tstfd 24,-64(5)\n"
+                "\tstfd 25,-56(5)\n"
+                "\tstfd 26,-48(5)\n"
+                "\tstfd 27,-40(5)\n"
+                "\tstfd 28,-32(5)\n"
+                "\tstfd 29,-24(5)\n"
+                "\tstfd 30,-16(5)\n"
+                "\tstfd 31,-8(5)\n"
+                "\tmr 27,4\n"  // BaseReg == r27
+                "\tmtctr 3\n"
+                "\tmr 12,3\n"
+                "\tbctr\n"
+                ".globl StgReturn\n"
+                "\t.type StgReturn,@function\n"
+                "StgReturn:\n"
+                "\tmr 3,14\n"
+                "\tla 5,%0(1)\n"
+                "\tlmw 13,-220(5)\n"
+                "\tlfd 14,-144(5)\n"
+                "\tlfd 15,-136(5)\n"
+                "\tlfd 16,-128(5)\n"
+                "\tlfd 17,-120(5)\n"
+                "\tlfd 18,-112(5)\n"
+                "\tlfd 19,-104(5)\n"
+                "\tlfd 20,-96(5)\n"
+                "\tlfd 21,-88(5)\n"
+                "\tlfd 22,-80(5)\n"
+                "\tlfd 23,-72(5)\n"
+                "\tlfd 24,-64(5)\n"
+                "\tlfd 25,-56(5)\n"
+                "\tlfd 26,-48(5)\n"
+                "\tlfd 27,-40(5)\n"
+                "\tlfd 28,-32(5)\n"
+                "\tlfd 29,-24(5)\n"
+                "\tlfd 30,-16(5)\n"
+                "\tlfd 31,-8(5)\n"
+                "\tmr 1,5\n"
+                "\tlwz 0,4(1)\n"
+                "\tmtlr 0\n"
+                "\tblr\n"
+        : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
 }
 #endif
 
@@ -699,202 +536,303 @@ StgRunIsImplementedInAssembler(void)
    PowerPC 64 architecture
 
    Everything is in assembler, so we don't have to deal with GCC...
-   
    -------------------------------------------------------------------------- */
 
-#ifdef powerpc64_HOST_ARCH
-
-#ifdef linux_HOST_OS
-extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+#if defined(powerpc64_HOST_ARCH)
 
+#if defined(linux_HOST_OS)
 static void GNUC3_ATTRIBUTE(used)
 StgRunIsImplementedInAssembler(void)
 {
         // r0 volatile
-       // r1 stack pointer
-       // r2 toc - needs to be saved
-       // r3-r10 argument passing, volatile
-       // r11, r12 very volatile (not saved across cross-module calls)
-       // r13 thread local state (never modified, don't need to save)
-       // r14-r31 callee-save
-       __asm__ volatile (
-               ".section \".opd\",\"aw\"\n"
-               ".align 3\n"
-               ".globl StgRun\n"
-               "StgRun:\n"
-                       "\t.quad\t.StgRun,.TOC.@tocbase,0\n"
-                       "\t.size StgRun,24\n"
-               ".globl StgReturn\n"
-               "StgReturn:\n"
-                       "\t.quad\t.StgReturn,.TOC.@tocbase,0\n"
-                       "\t.size StgReturn,24\n"
-               ".previous\n"
-               ".globl .StgRun\n"
-               ".type .StgRun,@function\n"
-               ".StgRun:\n"
-                       "\tmflr 0\n"
-                       "\tmr 5, 1\n"
-                       "\tstd 0, 16(1)\n"
-                       "\tstdu 1, -%0(1)\n"
-                       "\tstd 2, -296(5)\n"
-                       "\tstd 14, -288(5)\n"
-                       "\tstd 15, -280(5)\n"
-                       "\tstd 16, -272(5)\n"
-                       "\tstd 17, -264(5)\n"
-                       "\tstd 18, -256(5)\n"
-                       "\tstd 19, -248(5)\n"
-                       "\tstd 20, -240(5)\n"
-                       "\tstd 21, -232(5)\n"
-                       "\tstd 22, -224(5)\n"
-                       "\tstd 23, -216(5)\n"
-                       "\tstd 24, -208(5)\n"
-                       "\tstd 25, -200(5)\n"
-                       "\tstd 26, -192(5)\n"
-                       "\tstd 27, -184(5)\n"
-                       "\tstd 28, -176(5)\n"
-                       "\tstd 29, -168(5)\n"
-                       "\tstd 30, -160(5)\n"
-                       "\tstd 31, -152(5)\n"
-                       "\tstfd 14, -144(5)\n"
-                       "\tstfd 15, -136(5)\n"
-                       "\tstfd 16, -128(5)\n"
-                       "\tstfd 17, -120(5)\n"
-                       "\tstfd 18, -112(5)\n"
-                       "\tstfd 19, -104(5)\n"
-                       "\tstfd 20, -96(5)\n"
-                       "\tstfd 21, -88(5)\n"
-                       "\tstfd 22, -80(5)\n"
-                       "\tstfd 23, -72(5)\n"
-                       "\tstfd 24, -64(5)\n"
-                       "\tstfd 25, -56(5)\n"
-                       "\tstfd 26, -48(5)\n"
-                       "\tstfd 27, -40(5)\n"
-                       "\tstfd 28, -32(5)\n"
-                       "\tstfd 29, -24(5)\n"
-                       "\tstfd 30, -16(5)\n"
-                       "\tstfd 31, -8(5)\n"
-                       "\tmr 27, 4\n"  // BaseReg == r27
-                       "\tld 2, 8(3)\n"
-                       "\tld 3, 0(3)\n"
-                       "\tmtctr 3\n"
-                       "\tbctr\n"
-               ".globl .StgReturn\n"
-               ".type .StgReturn,@function\n"
-               ".StgReturn:\n"
-                       "\tmr 3,14\n"
-                       "\tla 5, %0(1)\n" // load address == addi r5, r1, %0
-                       "\tld 2, -296(5)\n"
-                       "\tld 14, -288(5)\n"
-                       "\tld 15, -280(5)\n"
-                       "\tld 16, -272(5)\n"
-                       "\tld 17, -264(5)\n"
-                       "\tld 18, -256(5)\n"
-                       "\tld 19, -248(5)\n"
-                       "\tld 20, -240(5)\n"
-                       "\tld 21, -232(5)\n"
-                       "\tld 22, -224(5)\n"
-                       "\tld 23, -216(5)\n"
-                       "\tld 24, -208(5)\n"
-                       "\tld 25, -200(5)\n"
-                       "\tld 26, -192(5)\n"
-                       "\tld 27, -184(5)\n"
-                       "\tld 28, -176(5)\n"
-                       "\tld 29, -168(5)\n"
-                       "\tld 30, -160(5)\n"
-                       "\tld 31, -152(5)\n"
-                       "\tlfd 14, -144(5)\n"
-                       "\tlfd 15, -136(5)\n"
-                       "\tlfd 16, -128(5)\n"
-                       "\tlfd 17, -120(5)\n"
-                       "\tlfd 18, -112(5)\n"
-                       "\tlfd 19, -104(5)\n"
-                       "\tlfd 20, -96(5)\n"
-                       "\tlfd 21, -88(5)\n"
-                       "\tlfd 22, -80(5)\n"
-                       "\tlfd 23, -72(5)\n"
-                       "\tlfd 24, -64(5)\n"
-                       "\tlfd 25, -56(5)\n"
-                       "\tlfd 26, -48(5)\n"
-                       "\tlfd 27, -40(5)\n"
-                       "\tlfd 28, -32(5)\n"
-                       "\tlfd 29, -24(5)\n"
-                       "\tlfd 30, -16(5)\n"
-                       "\tlfd 31, -8(5)\n"
-                       "\tmr 1, 5\n"
-                       "\tld 0, 16(1)\n"
-                       "\tmtlr 0\n"
-                       "\tblr\n"
-       : : "i"(RESERVED_C_STACK_BYTES+304 /*stack frame size*/));
+        // r1 stack pointer
+        // r2 toc - needs to be saved
+        // r3-r10 argument passing, volatile
+        // r11, r12 very volatile (not saved across cross-module calls)
+        // r13 thread local state (never modified, don't need to save)
+        // r14-r31 callee-save
+        __asm__ volatile (
+                ".section \".opd\",\"aw\"\n"  // descriptor entries: code address, TOC base, 0 (24 bytes each)
+                ".align 3\n"
+                ".globl StgRun\n"
+                ".hidden StgRun\n"  // NOTE(review): StgReturn below gets no .hidden -- confirm intended
+                "StgRun:\n"
+                "\t.quad\t.StgRun,.TOC.@tocbase,0\n"
+                "\t.size StgRun,24\n"
+                ".globl StgReturn\n"
+                "StgReturn:\n"
+                "\t.quad\t.StgReturn,.TOC.@tocbase,0\n"
+                "\t.size StgReturn,24\n"
+                ".previous\n"
+                ".globl .StgRun\n"
+                ".type .StgRun,@function\n"
+                ".StgRun:\n"
+                "\tmflr 0\n"  // r0 = link register (return address)
+                "\tmr 5, 1\n"  // r5 = incoming stack pointer; the save slots below are addressed from it
+                "\tstd 0, 16(1)\n"  // stash the return address at 16(r1); .StgReturn reloads it from there
+                "\tstdu 1, -%0(1)\n"  // push the frame; %0 is RESERVED_C_STACK_BYTES+304
+                "\tstd 2, -296(5)\n"  // save the TOC pointer (r2)
+                "\tstd 14, -288(5)\n"  // save callee-save GPRs r14-r31 ...
+                "\tstd 15, -280(5)\n"
+                "\tstd 16, -272(5)\n"
+                "\tstd 17, -264(5)\n"
+                "\tstd 18, -256(5)\n"
+                "\tstd 19, -248(5)\n"
+                "\tstd 20, -240(5)\n"
+                "\tstd 21, -232(5)\n"
+                "\tstd 22, -224(5)\n"
+                "\tstd 23, -216(5)\n"
+                "\tstd 24, -208(5)\n"
+                "\tstd 25, -200(5)\n"
+                "\tstd 26, -192(5)\n"
+                "\tstd 27, -184(5)\n"
+                "\tstd 28, -176(5)\n"
+                "\tstd 29, -168(5)\n"
+                "\tstd 30, -160(5)\n"
+                "\tstd 31, -152(5)\n"
+                "\tstfd 14, -144(5)\n"  // ... then floating-point registers 14-31 in the top 144 bytes
+                "\tstfd 15, -136(5)\n"
+                "\tstfd 16, -128(5)\n"
+                "\tstfd 17, -120(5)\n"
+                "\tstfd 18, -112(5)\n"
+                "\tstfd 19, -104(5)\n"
+                "\tstfd 20, -96(5)\n"
+                "\tstfd 21, -88(5)\n"
+                "\tstfd 22, -80(5)\n"
+                "\tstfd 23, -72(5)\n"
+                "\tstfd 24, -64(5)\n"
+                "\tstfd 25, -56(5)\n"
+                "\tstfd 26, -48(5)\n"
+                "\tstfd 27, -40(5)\n"
+                "\tstfd 28, -32(5)\n"
+                "\tstfd 29, -24(5)\n"
+                "\tstfd 30, -16(5)\n"
+                "\tstfd 31, -8(5)\n"
+                "\tmr 27, 4\n"  // BaseReg == r27
+                "\tld 2, 8(3)\n"  // r2 = TOC base from f's descriptor (second doubleword)
+                "\tld 3, 0(3)\n"  // r3 = code address from f's descriptor (first doubleword)
+                "\tmtctr 3\n"
+                "\tbctr\n"  // jump (no link) to f through the count register
+                ".globl .StgReturn\n"
+                ".type .StgReturn,@function\n"
+                ".StgReturn:\n"
+                "\tmr 3,14\n"  // return value: r14 -> r3 (presumably r14 is STG R1 -- confirm in MachRegs.h)
+                "\tla 5, %0(1)\n" // load address == addi r5, r1, %0
+                "\tld 2, -296(5)\n"  // restore everything .StgRun saved, from the same slots
+                "\tld 14, -288(5)\n"
+                "\tld 15, -280(5)\n"
+                "\tld 16, -272(5)\n"
+                "\tld 17, -264(5)\n"
+                "\tld 18, -256(5)\n"
+                "\tld 19, -248(5)\n"
+                "\tld 20, -240(5)\n"
+                "\tld 21, -232(5)\n"
+                "\tld 22, -224(5)\n"
+                "\tld 23, -216(5)\n"
+                "\tld 24, -208(5)\n"
+                "\tld 25, -200(5)\n"
+                "\tld 26, -192(5)\n"
+                "\tld 27, -184(5)\n"
+                "\tld 28, -176(5)\n"
+                "\tld 29, -168(5)\n"
+                "\tld 30, -160(5)\n"
+                "\tld 31, -152(5)\n"
+                "\tlfd 14, -144(5)\n"
+                "\tlfd 15, -136(5)\n"
+                "\tlfd 16, -128(5)\n"
+                "\tlfd 17, -120(5)\n"
+                "\tlfd 18, -112(5)\n"
+                "\tlfd 19, -104(5)\n"
+                "\tlfd 20, -96(5)\n"
+                "\tlfd 21, -88(5)\n"
+                "\tlfd 22, -80(5)\n"
+                "\tlfd 23, -72(5)\n"
+                "\tlfd 24, -64(5)\n"
+                "\tlfd 25, -56(5)\n"
+                "\tlfd 26, -48(5)\n"
+                "\tlfd 27, -40(5)\n"
+                "\tlfd 28, -32(5)\n"
+                "\tlfd 29, -24(5)\n"
+                "\tlfd 30, -16(5)\n"
+                "\tlfd 31, -8(5)\n"
+                "\tmr 1, 5\n"  // pop the frame: r1 = stack pointer on entry to .StgRun
+                "\tld 0, 16(1)\n"  // reload the return address stashed at 16(r1)
+                "\tmtlr 0\n"
+                "\tblr\n"  // return to StgRun's caller
+        : : "i"(RESERVED_C_STACK_BYTES+304 /*stack frame size*/));
 }
+
 #else // linux_HOST_OS
-#error Only linux support for power64 right now.
+#error Only Linux support for power64 right now.
 #endif
 
 #endif
 
+#if defined(powerpc64le_HOST_ARCH)
 /* -----------------------------------------------------------------------------
-   IA64 architecture
+   PowerPC 64 little endian architecture
 
-   Again, in assembler - so we can fiddle with the register stack, and because
-   gcc doesn't handle asm-clobbered callee-saves correctly.
+   Really everything is in assembler, so we don't have to deal with GCC...
+   -------------------------------------------------------------------------- */
+#endif
 
-   loc0  - loc15: preserved locals
-   loc16 - loc28: STG registers
-           loc29: saved ar.pfs
-           loc30: saved b0
-           loc31: saved gp (gcc 3.3 uses this slot)
+/* -----------------------------------------------------------------------------
+   ARM architecture
    -------------------------------------------------------------------------- */
 
-#ifdef ia64_HOST_ARCH
+#if defined(arm_HOST_ARCH)
 
-/* the memory stack is rarely used, so 16K is excessive */
-#undef RESERVED_C_STACK_BYTES
-#define RESERVED_C_STACK_BYTES 1024
+#if defined(__thumb__)
+#define THUMB_FUNC ".thumb\n\t.thumb_func\n\t"
+#else
+#define THUMB_FUNC
+#endif
 
-#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
-/* gcc 3.3+: leave an extra slot for gp saves */
-#define LOCALS 32
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg) {
+    StgRegTable * r;
+    __asm__ volatile (
+        /* StgRun: enter the STG function f with BaseReg set to basereg; the
+         * result is picked up at the STG_RETURN label below. First, save
+         * callee-saves registers on behalf of the STG code. */
+        "stmfd sp!, {r4-r11, ip, lr}\n\t"
+#if !defined(arm_HOST_ARCH_PRE_ARMv6)
+        "vstmdb sp!, {d8-d11}\n\t"
+#endif
+        /*
+         * Allocate RESERVED_C_STACK_BYTES (operand %3) of stack for the Stg
+         * machine's temporary storage. Note: the value has to be a round
+         * number here or the assembler can't assemble it.
+         */
+        "sub sp, sp, %3\n\t"
+        /*
+         * Set BaseReg (r4) to the basereg argument (operand %2).
+         */
+        "mov r4, %2\n\t"
+        /*
+         * Jump to the function argument f (operand %1).
+         */
+        "bx %1\n\t"
+
+        ".globl " STG_RETURN "\n\t"
+        THUMB_FUNC
+#if !defined(ios_HOST_OS)
+        ".type " STG_RETURN ", %%function\n"
+#endif
+        STG_RETURN ":\n\t"
+        /*
+         * Free the temporary space allocated above (operand %3 again).
+         */
+        "add sp, sp, %3\n\t"
+        /*
+         * Return the new register table, taking it from Stg's R1 (ARM's R7).
+         */
+        "mov %0, r7\n\t"
+        /*
+         * Restore the callee-saves registers in the reverse order of the saves.
+         */
+#if !defined(arm_HOST_ARCH_PRE_ARMv6)
+        "vldmia sp!, {d8-d11}\n\t"
+#endif
+        "ldmfd sp!, {r4-r11, ip, lr}\n\t"
+      : "=r" (r)
+      : "r" (f), "r" (basereg), "i" (RESERVED_C_STACK_BYTES)
+#if !defined(__thumb__)
+        /* In ARM mode r11/fp is the frame pointer, so we cannot mark it
+           as clobbered; if we do, GCC reports an error. */
+      : "%r4", "%r5", "%r6", "%r7", "%r8", "%r9", "%r10", "%ip", "%lr"
 #else
-#define LOCALS 31
+        /* In Thumb mode r7 is the frame pointer, so we cannot mark it as
+           clobbered. On the other hand we also mark as clobbered those
+           regs not used in Thumb mode. Hard to judge if this is needed,
+           but Haskell code certainly uses them to hold GHC's virtual
+           registers; see includes/stg/MachRegs.h. Please note that
+           Haskell code is compiled by GHC/LLVM into ARM code (not
+           Thumb!), at least as of February 2012.
+           NOTE(review): "%11" below lacks the 'r' of its neighbours -- presumably r11; confirm. */
+      : "%r4", "%r5", "%r6", "%r8", "%r9", "%r10", "%11", "%ip", "%lr"
+#endif
+    );
+    return r;
+}
 #endif
 
-static void GNUC3_ATTRIBUTE(used)
-StgRunIsImplementedInAssembler(void)
-{
-    __asm__ volatile(
-               ".global StgRun\n"
-               "StgRun:\n"
-               "\talloc loc29 = ar.pfs, 0, %1, 8, 0\n" /* setup register frame */
-               "\tld8 r18 = [r32],8\n"                 /* get procedure address */
-               "\tadds sp = -%0, sp ;;\n"              /* setup stack */
-               "\tld8 gp = [r32]\n"                    /* get procedure GP */
-               "\tadds r16 = %0-(6*16), sp\n"
-               "\tadds r17 = %0-(5*16), sp ;;\n"
-               "\tstf.spill [r16] = f16,32\n"          /* spill callee-saved fp regs */
-               "\tstf.spill [r17] = f17,32\n"
-               "\tmov b6 = r18 ;;\n"                   /* set target address */
-               "\tstf.spill [r16] = f18,32\n"
-               "\tstf.spill [r17] = f19,32\n"
-               "\tmov loc30 = b0 ;;\n"                 /* save return address */
-               "\tstf.spill [r16] = f20,32\n"
-               "\tstf.spill [r17] = f21,32\n"
-               "\tbr.few b6 ;;\n"                      /* branch to function */
-               ".global StgReturn\n"
-               "StgReturn:\n"
-               "\tmov r8 = loc16\n"            /* return value in r8 */
-               "\tadds r16 = %0-(6*16), sp\n"
-               "\tadds r17 = %0-(5*16), sp ;;\n"
-               "\tldf.fill f16 = [r16],32\n"   /* start restoring fp regs */
-               "\tldf.fill f17 = [r17],32\n"
-               "\tmov ar.pfs = loc29 ;;\n"     /* restore register frame */
-               "\tldf.fill f18 = [r16],32\n"
-               "\tldf.fill f19 = [r17],32\n"
-               "\tmov b0 = loc30 ;;\n"         /* restore return address */
-               "\tldf.fill f20 = [r16],32\n"
-               "\tldf.fill f21 = [r17],32\n"
-               "\tadds sp = %0, sp\n"          /* restore stack */
-               "\tbr.ret.sptk.many b0 ;;\n"    /* return */
-       : : "i"(RESERVED_C_STACK_BYTES + 6*16), "i"(LOCALS));
+#if defined(aarch64_HOST_ARCH)
+
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg) {
+    StgRegTable * r;
+    __asm__ volatile (
+        /* StgRun: enter the STG function f with BaseReg set to basereg.
+         * Save callee-saves registers on behalf of the STG code.
+         * Floating point registers only need the bottom 64 bits preserved.
+         * We need to use the names x16, x17, x29 and x30 instead of ip0,
+         * ip1, fp and lr because one of either clang or gcc doesn't understand
+         * the latter names.
+         */
+        "stp x29,  x30,  [sp, #-16]!\n\t"
+        "mov x29, sp\n\t"
+        "stp x16, x17, [sp, #-16]!\n\t"
+        "stp x19, x20, [sp, #-16]!\n\t"
+        "stp x21, x22, [sp, #-16]!\n\t"
+        "stp x23, x24, [sp, #-16]!\n\t"
+        "stp x25, x26, [sp, #-16]!\n\t"
+        "stp x27, x28, [sp, #-16]!\n\t"
+        "stp d8,  d9,  [sp, #-16]!\n\t"
+        "stp d10, d11, [sp, #-16]!\n\t"
+        "stp d12, d13, [sp, #-16]!\n\t"
+        "stp d14, d15, [sp, #-16]!\n\t"
+
+        /*
+         * Allocate RESERVED_C_STACK_BYTES (operand %3) of stack for the Stg
+         * machine's temporary storage. Note: the value has to be a round
+         * number here or the assembler can't assemble it.
+         */
+        "sub sp, sp, %3\n\t"
+        /*
+         * Set BaseReg (x19) to the basereg argument (operand %2).
+         */
+        "mov x19, %2\n\t"
+        /*
+         * Jump to the function argument f (operand %1).
+         */
+        "br %1\n\t"
+
+        ".globl " STG_RETURN "\n\t"
+#if !defined(ios_HOST_OS)
+        ".type " STG_RETURN ", %%function\n"
+#endif
+        STG_RETURN ":\n\t"
+        /*
+         * Free the temporary space allocated above (operand %3 again).
+         */
+        "add sp, sp, %3\n\t"
+        /*
+         * Return the new register table, taking it from Stg's R1 (ARM64's R22).
+         */
+        "mov %0, x22\n\t"
+        /*
+         * Restore the callee-saves registers in the reverse order of the saves.
+         */
+
+        "ldp d14, d15, [sp], #16\n\t"
+        "ldp d12, d13, [sp], #16\n\t"
+        "ldp d10, d11, [sp], #16\n\t"
+        "ldp d8,  d9,  [sp], #16\n\t"
+        "ldp x27, x28, [sp], #16\n\t"
+        "ldp x25, x26, [sp], #16\n\t"
+        "ldp x23, x24, [sp], #16\n\t"
+        "ldp x21, x22, [sp], #16\n\t"
+        "ldp x19, x20, [sp], #16\n\t"
+        "ldp x16, x17, [sp], #16\n\t"
+        "ldp x29,  x30,  [sp], #16\n\t"
+
+      : "=r" (r)
+      : "r" (f), "r" (basereg), "i" (RESERVED_C_STACK_BYTES)
+        : "%x19", "%x20", "%x21", "%x22", "%x23", "%x24", "%x25", "%x26", "%x27", "%x28",
+          "%x16", "%x17", "%x30"
+    );
+    return r;
 }
 
 #endif