UNREG: use __builtin___clear_cache where available
[ghc.git] / rts / Interpreter.c
index 8a608ec..1a883a5 100644 (file)
 #include "Prelude.h"
 #include "Stable.h"
 #include "Printer.h"
+#include "Profiling.h"
 #include "Disassembler.h"
 #include "Interpreter.h"
 #include "ThreadPaused.h"
 #include "Threads.h"
 
 #include <string.h>     /* for memcpy */
-#ifdef HAVE_ERRNO_H
+#if defined(HAVE_ERRNO_H)
 #include <errno.h>
 #endif
 
     SpLim = tso_SpLim(cap->r.rCurrentTSO);
 
 #define SAVE_STACK_POINTERS                     \
-    cap->r.rCurrentTSO->stackobj->sp = Sp
+    cap->r.rCurrentTSO->stackobj->sp = Sp;
+
+#if defined(PROFILING)
+#define LOAD_THREAD_STATE()                     \
+    LOAD_STACK_POINTERS                         \
+    cap->r.rCCCS = cap->r.rCurrentTSO->prof.cccs;
+#else
+#define LOAD_THREAD_STATE()                     \
+    LOAD_STACK_POINTERS
+#endif
+
+#if defined(PROFILING)
+#define SAVE_THREAD_STATE()                     \
+    SAVE_STACK_POINTERS                         \
+    cap->r.rCurrentTSO->prof.cccs = cap->r.rCCCS;
+#else
+#define SAVE_THREAD_STATE()                     \
+    SAVE_STACK_POINTERS
+#endif
 
 // Note [Not true: ASSERT(Sp > SpLim)]
 //
 // less than SpLim both when leaving to return to the scheduler.
 
 #define RETURN_TO_SCHEDULER(todo,retcode)       \
-   SAVE_STACK_POINTERS;                         \
+   SAVE_THREAD_STATE();                         \
    cap->r.rCurrentTSO->what_next = (todo);      \
-   threadPaused(cap,cap->r.rCurrentTSO);                \
+   threadPaused(cap,cap->r.rCurrentTSO);        \
    cap->r.rRet = (retcode);                     \
    return cap;
 
+// Note [avoiding threadPaused]
+//
+// Switching between the interpreter to compiled code can happen very
+// frequently, so we don't want to call threadPaused(), which is
+// expensive.  BUT we must be careful not to violate the invariant
+// that threadPaused() has been called on all threads before we GC
+// (see Note [upd-black-hole].  So the scheduler must ensure that when
+// we return in this way that we definitely immediately run the thread
+// again and don't GC or do something else.
+//
 #define RETURN_TO_SCHEDULER_NO_PAUSE(todo,retcode)      \
-   SAVE_STACK_POINTERS;                                 \
+   SAVE_THREAD_STATE();                                 \
    cap->r.rCurrentTSO->what_next = (todo);              \
    cap->r.rRet = (retcode);                             \
    return cap;
@@ -112,7 +141,7 @@ allocate_NONUPD (Capability *cap, int n_words)
 int rts_stop_next_breakpoint = 0;
 int rts_stop_on_exception = 0;
 
-#ifdef INTERP_STATS
+#if defined(INTERP_STATS)
 
 /* Hacky stats, for tuning the interpreter ... */
 int it_unknown_entries[N_CLOSURE_TYPES];
@@ -195,6 +224,48 @@ void interp_shutdown ( void )
 
 #endif
 
+#if defined(PROFILING)
+
+//
+// Build a zero-argument PAP with the current CCS
+// See Note [Evaluating functions with profiling] in Apply.cmm
+//
+STATIC_INLINE
+StgClosure * newEmptyPAP (Capability *cap,
+                          StgClosure *tagged_obj, // a FUN or a BCO
+                          uint32_t arity)
+{
+    StgPAP *pap = (StgPAP *)allocate(cap, sizeofW(StgPAP));
+    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
+    pap->arity = arity;
+    pap->n_args = 0;
+    pap->fun = tagged_obj;
+    return (StgClosure *)pap;
+}
+
+//
+// Make an exact copy of a PAP, except that we combine the current CCS with the
+// CCS in the PAP.  See Note [Evaluating functions with profiling] in Apply.cmm
+//
+STATIC_INLINE
+StgClosure * copyPAP  (Capability *cap, StgPAP *oldpap)
+{
+    uint32_t size = PAP_sizeW(oldpap->n_args);
+    StgPAP *pap = (StgPAP *)allocate(cap, size);
+    enterFunCCS(&cap->r, oldpap->header.prof.ccs);
+    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
+    pap->arity = oldpap->arity;
+    pap->n_args = oldpap->n_args;
+    pap->fun = oldpap->fun;
+    uint32_t i;
+    for (i = 0; i < ((StgPAP *)pap)->n_args; i++) {
+        pap->payload[i] = oldpap->payload[i];
+    }
+    return (StgClosure *)pap;
+}
+
+#endif
+
 static StgWord app_ptrs_itbl[] = {
     (W_)&stg_ap_p_info,
     (W_)&stg_ap_pp_info,
@@ -215,13 +286,26 @@ interpretBCO (Capability* cap)
     register StgPtr       Sp;    // local state -- stack pointer
     register StgPtr       SpLim; // local state -- stack lim pointer
     register StgClosure   *tagged_obj = 0, *obj;
-    nat n, m;
+    uint32_t n, m;
 
-    LOAD_STACK_POINTERS;
+    LOAD_THREAD_STATE();
 
     cap->r.rHpLim = (P_)1; // HpLim is the context-switch flag; when it
                            // goes to zero we must return to the scheduler.
 
+    IF_DEBUG(interpreter,
+             debugBelch(
+             "\n---------------------------------------------------------------\n");
+             debugBelch("Entering the interpreter, Sp = %p\n", Sp);
+#if defined(PROFILING)
+             fprintCCS(stderr, cap->r.rCCCS);
+             debugBelch("\n");
+#endif
+             debugBelch("\n");
+             printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size);
+             debugBelch("\n\n");
+            );
+
     // ------------------------------------------------------------------------
     // Case 1:
     //
@@ -231,6 +315,8 @@ interpretBCO (Capability* cap)
     //          +---------------+
     //       Sp |      -------------------> closure
     //          +---------------+
+    //          |   stg_enter   |
+    //          +---------------+
     //
     if (Sp[0] == (W_)&stg_enter_info) {
        Sp++;
@@ -280,6 +366,10 @@ eval_obj:
              "\n---------------------------------------------------------------\n");
              debugBelch("Evaluating: "); printObj(obj);
              debugBelch("Sp = %p\n", Sp);
+#if defined(PROFILING)
+             fprintCCS(stderr, cap->r.rCCCS);
+             debugBelch("\n");
+#endif
              debugBelch("\n" );
 
              printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size);
@@ -292,7 +382,6 @@ eval_obj:
     switch ( get_itbl(obj)->type ) {
 
     case IND:
-    case IND_PERM:
     case IND_STATIC:
     {
         tagged_obj = ((StgInd*)obj)->indirectee;
@@ -305,8 +394,9 @@ eval_obj:
     case CONSTR_2_0:
     case CONSTR_1_1:
     case CONSTR_0_2:
-    case CONSTR_STATIC:
-    case CONSTR_NOCAF_STATIC:
+    case CONSTR_NOCAF:
+        break;
+
     case FUN:
     case FUN_1_0:
     case FUN_0_1:
@@ -314,33 +404,54 @@ eval_obj:
     case FUN_1_1:
     case FUN_0_2:
     case FUN_STATIC:
+#if defined(PROFILING)
+        if (cap->r.rCCCS != obj->header.prof.ccs) {
+            tagged_obj =
+                newEmptyPAP(cap, tagged_obj, get_fun_itbl(obj)->f.arity);
+        }
+#endif
+        break;
+
     case PAP:
-        // already in WHNF
+#if defined(PROFILING)
+        if (cap->r.rCCCS != obj->header.prof.ccs) {
+            tagged_obj = copyPAP(cap, (StgPAP *)obj);
+        }
+#endif
         break;
 
     case BCO:
-    {
         ASSERT(((StgBCO *)obj)->arity > 0);
+#if defined(PROFILING)
+        if (cap->r.rCCCS != obj->header.prof.ccs) {
+            tagged_obj = newEmptyPAP(cap, tagged_obj, ((StgBCO *)obj)->arity);
+        }
+#endif
         break;
-    }
 
     case AP:    /* Copied from stg_AP_entry. */
     {
-        nat i, words;
+        uint32_t i, words;
         StgAP *ap;
 
         ap = (StgAP*)obj;
         words = ap->n_args;
 
         // Stack check
-        if (Sp - (words+sizeofW(StgUpdateFrame)) < SpLim) {
+        if (Sp - (words+sizeofW(StgUpdateFrame)+2) < SpLim) {
             Sp -= 2;
             Sp[1] = (W_)tagged_obj;
             Sp[0] = (W_)&stg_enter_info;
             RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
         }
 
-        /* Ok; we're safe.  Party on.  Push an update frame. */
+#if defined(PROFILING)
+        // restore the CCCS after evaluating the AP
+        Sp -= 2;
+        Sp[1] = (W_)cap->r.rCCCS;
+        Sp[0] = (W_)&stg_restore_cccs_eval_info;
+#endif
+
         Sp -= sizeofW(StgUpdateFrame);
         {
             StgUpdateFrame *__frame;
@@ -349,6 +460,8 @@ eval_obj:
             __frame->updatee = (StgClosure *)(ap);
         }
 
+        ENTER_CCS_THUNK(cap,ap);
+
         /* Reload the stack */
         Sp -= words;
         for (i=0; i < words; i++) {
@@ -361,7 +474,7 @@ eval_obj:
     }
 
     default:
-#ifdef INTERP_STATS
+#if defined(INTERP_STATS)
     {
         int j;
 
@@ -377,6 +490,12 @@ eval_obj:
                  debugBelch("evaluating unknown closure -- yielding to sched\n");
                  printObj(obj);
             );
+#if defined(PROFILING)
+        // restore the CCCS after evaluating the closure
+        Sp -= 2;
+        Sp[1] = (W_)cap->r.rCCCS;
+        Sp[0] = (W_)&stg_restore_cccs_eval_info;
+#endif
         Sp -= 2;
         Sp[1] = (W_)tagged_obj;
         Sp[0] = (W_)&stg_enter_info;
@@ -396,7 +515,11 @@ do_return:
              "\n---------------------------------------------------------------\n");
              debugBelch("Returning: "); printObj(obj);
              debugBelch("Sp = %p\n", Sp);
-             debugBelch("\n" );
+#if defined(PROFILING)
+             fprintCCS(stderr, cap->r.rCCCS);
+             debugBelch("\n");
+#endif
+             debugBelch("\n");
              printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size);
              debugBelch("\n\n");
             );
@@ -410,6 +533,14 @@ do_return:
 
         // NOTE: not using get_itbl().
         info = ((StgClosure *)Sp)->header.info;
+
+        if (info == (StgInfoTable *)&stg_restore_cccs_info ||
+            info == (StgInfoTable *)&stg_restore_cccs_eval_info) {
+            cap->r.rCCCS = (CostCentreStack*)Sp[1];
+            Sp += 2;
+            goto do_return;
+        }
+
         if (info == (StgInfoTable *)&stg_ap_v_info) {
             n = 1; m = 0; goto do_apply;
         }
@@ -526,10 +657,24 @@ do_return_unboxed:
                 || Sp[0] == (W_)&stg_ret_l_info
             );
 
+        IF_DEBUG(interpreter,
+             debugBelch(
+             "\n---------------------------------------------------------------\n");
+             debugBelch("Returning: "); printObj(obj);
+             debugBelch("Sp = %p\n", Sp);
+#if defined(PROFILING)
+             fprintCCS(stderr, cap->r.rCCCS);
+             debugBelch("\n");
+#endif
+             debugBelch("\n");
+             printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size);
+             debugBelch("\n\n");
+            );
+
         // get the offset of the stg_ctoi_ret_XXX itbl
         offset = stack_frame_sizeW((StgClosure *)Sp);
 
-        switch (get_itbl((StgClosure *)Sp+offset)->type) {
+        switch (get_itbl((StgClosure*)((StgPtr)Sp+offset))->type) {
 
         case RET_BCO:
             // Returning to an interpreted continuation: put the object on
@@ -567,7 +712,7 @@ do_apply:
 
         case PAP: {
             StgPAP *pap;
-            nat i, arity;
+            uint32_t i, arity;
 
             pap = (StgPAP *)obj;
 
@@ -598,7 +743,7 @@ do_apply:
                 // the appropriate info table in the gap.
                 for (i = 0; i < arity; i++) {
                     Sp[(int)i-1] = Sp[i];
-                    // ^^^^^ careful, i-1 might be negative, but i in unsigned
+                    // ^^^^^ careful, i-1 might be negative, but i is unsigned
                 }
                 Sp[arity-1] = app_ptrs_itbl[n-arity-1];
                 Sp--;
@@ -608,6 +753,10 @@ do_apply:
                     Sp[i] = (W_)pap->payload[i];
                 }
                 obj = UNTAG_CLOSURE(pap->fun);
+
+#if defined(PROFILING)
+                enterFunCCS(&cap->r, pap->header.prof.ccs);
+#endif
                 goto run_BCO_fun;
             }
             else if (arity == n) {
@@ -616,6 +765,9 @@ do_apply:
                     Sp[i] = (W_)pap->payload[i];
                 }
                 obj = UNTAG_CLOSURE(pap->fun);
+#if defined(PROFILING)
+                enterFunCCS(&cap->r, pap->header.prof.ccs);
+#endif
                 goto run_BCO_fun;
             }
             else /* arity > n */ {
@@ -639,7 +791,7 @@ do_apply:
         }
 
         case BCO: {
-            nat arity, i;
+            uint32_t arity, i;
 
             Sp++;
             arity = ((StgBCO *)obj)->arity;
@@ -653,7 +805,7 @@ do_apply:
                 // the appropriate info table in the gap.
                 for (i = 0; i < arity; i++) {
                     Sp[(int)i-1] = Sp[i];
-                    // ^^^^^ careful, i-1 might be negative, but i in unsigned
+                    // ^^^^^ careful, i-1 might be negative, but i is unsigned
                 }
                 Sp[arity-1] = app_ptrs_itbl[n-arity-1];
                 Sp--;
@@ -665,7 +817,7 @@ do_apply:
             else /* arity > n */ {
                 // build a PAP and return it.
                 StgPAP *pap;
-                nat i;
+                uint32_t i;
                 pap = (StgPAP *)allocate(cap, PAP_sizeW(m));
                 SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS);
                 pap->arity = arity - n;
@@ -683,6 +835,8 @@ do_apply:
         // No point in us applying machine-code functions
         default:
         defer_apply_to_sched:
+            IF_DEBUG(interpreter,
+                     debugBelch("Cannot apply compiled function; yielding to scheduler\n"));
             Sp -= 2;
             Sp[1] = (W_)tagged_obj;
             Sp[0] = (W_)&stg_enter_info;
@@ -792,13 +946,13 @@ run_BCO:
         register StgWord16* instrs    = (StgWord16*)(bco->instrs->payload);
         register StgWord*  literals   = (StgWord*)(&bco->literals->payload[0]);
         register StgPtr*   ptrs       = (StgPtr*)(&bco->ptrs->payload[0]);
-#ifdef DEBUG
+#if defined(DEBUG)
         int bcoSize;
         bcoSize = bco->instrs->bytes / sizeof(StgWord16);
 #endif
         IF_DEBUG(interpreter,debugBelch("bcoSize = %d\n", bcoSize));
 
-#ifdef INTERP_STATS
+#if defined(INTERP_STATS)
         it_lastopc = 0; /* no opcode */
 #endif
 
@@ -825,7 +979,7 @@ run_BCO:
 
         INTERP_TICK(it_insns);
 
-#ifdef INTERP_STATS
+#if defined(INTERP_STATS)
         ASSERT( (int)instrs[bciPtr] >= 0 && (int)instrs[bciPtr] < 27 );
         it_ofreq[ (int)instrs[bciPtr] ] ++;
         it_oofreq[ it_lastopc ][ (int)instrs[bciPtr] ] ++;
@@ -842,40 +996,59 @@ run_BCO:
         /* check for a breakpoint on the beginning of a let binding */
         case bci_BRK_FUN:
         {
-            int arg1_brk_array, arg2_array_index, arg3_freeVars;
-            StgArrWords *breakPoints;
-            int returning_from_break;     // are we resuming execution from a breakpoint?
-                                          //  if yes, then don't break this time around
-            StgClosure *ioAction;         // the io action to run at a breakpoint
+            int arg1_brk_array, arg2_array_index, arg3_module_uniq;
+#if defined(PROFILING)
+            int arg4_cc;
+#endif
+            StgArrBytes *breakPoints;
+            int returning_from_break;
+
+            // the io action to run at a breakpoint
+            StgClosure *ioAction;
+
+            // a closure to save the top stack frame on the heap
+            StgAP_STACK *new_aps;
 
-            StgAP_STACK *new_aps;         // a closure to save the top stack frame on the heap
             int i;
             int size_words;
 
-            arg1_brk_array      = BCO_GET_LARGE_ARG;  // 1st arg of break instruction
-            arg2_array_index    = BCO_NEXT;           // 2nd arg of break instruction
-            arg3_freeVars       = BCO_GET_LARGE_ARG;  // 3rd arg of break instruction
+            arg1_brk_array      = BCO_GET_LARGE_ARG;
+            arg2_array_index    = BCO_NEXT;
+            arg3_module_uniq    = BCO_GET_LARGE_ARG;
+#if defined(PROFILING)
+            arg4_cc             = BCO_GET_LARGE_ARG;
+#else
+            BCO_GET_LARGE_ARG;
+#endif
 
             // check if we are returning from a breakpoint - this info
-            // is stored in the flags field of the current TSO
-            returning_from_break = cap->r.rCurrentTSO->flags & TSO_STOPPED_ON_BREAKPOINT;
+            // is stored in the flags field of the current TSO. If true,
+            // then don't break this time around.
+            returning_from_break =
+                cap->r.rCurrentTSO->flags & TSO_STOPPED_ON_BREAKPOINT;
+
+#if defined(PROFILING)
+            cap->r.rCCCS = pushCostCentre(cap->r.rCCCS,
+                                          (CostCentre*)BCO_LIT(arg4_cc));
+#endif
 
             // if we are returning from a break then skip this section
             // and continue executing
             if (!returning_from_break)
             {
-               breakPoints = (StgArrWords *) BCO_PTR(arg1_brk_array);
+               breakPoints = (StgArrBytes *) BCO_PTR(arg1_brk_array);
 
                // stop the current thread if either the
                // "rts_stop_next_breakpoint" flag is true OR if the
                // breakpoint flag for this particular expression is
                // true
-               if (rts_stop_next_breakpoint == rtsTrue ||
-                   breakPoints->payload[arg2_array_index] == rtsTrue)
+               if (rts_stop_next_breakpoint == true ||
+                   ((StgWord8*)breakPoints->payload)[arg2_array_index]
+                     == true)
                {
                   // make sure we don't automatically stop at the
                   // next breakpoint
-                  rts_stop_next_breakpoint = rtsFalse;
+                  rts_stop_next_breakpoint = false;
 
                   // allocate memory for a new AP_STACK, enough to
                   // store the top stack frame plus an
@@ -883,7 +1056,7 @@ run_BCO:
                   // the BCO
                   size_words = BCO_BITMAP_SIZE(obj) + 2;
                   new_aps = (StgAP_STACK *) allocate(cap, AP_STACK_sizeW(size_words));
-                  SET_HDR(new_aps,&stg_AP_STACK_info,CCS_SYSTEM);
+                  SET_HDR(new_aps,&stg_AP_STACK_info,cap->r.rCCCS);
                   new_aps->size = size_words;
                   new_aps->fun = &stg_dummy_ret_closure;
 
@@ -897,20 +1070,31 @@ run_BCO:
                      new_aps->payload[i] = (StgClosure *)Sp[i-2];
                   }
 
-                  // prepare the stack so that we can call the
-                  // rts_breakpoint_io_action and ensure that the stack is
-                  // in a reasonable state for the GC and so that
-                  // execution of this BCO can continue when we resume
-                  ioAction = (StgClosure *) deRefStablePtr (rts_breakpoint_io_action);
-                  Sp -= 8;
-                  Sp[7] = (W_)obj;
-                  Sp[6] = (W_)&stg_apply_interp_info;
-                  Sp[5] = (W_)new_aps;                 // the AP_STACK
-                  Sp[4] = (W_)BCO_PTR(arg3_freeVars);  // the info about local vars of the breakpoint
-                  Sp[3] = (W_)False_closure;            // True <=> a breakpoint
-                  Sp[2] = (W_)&stg_ap_pppv_info;
-                  Sp[1] = (W_)ioAction;                // apply the IO action to its two arguments above
-                  Sp[0] = (W_)&stg_enter_info;         // get ready to run the IO action
+                  // Arrange the stack to call the breakpoint IO action, and
+                  // continue execution of this BCO when the IO action returns.
+                  //
+                  // ioAction :: Bool        -- exception?
+                  //          -> HValue      -- the AP_STACK, or exception
+                  //          -> Int         -- the breakpoint index (arg2)
+                  //          -> Int         -- the module uniq (arg3)
+                  //          -> IO ()
+                  //
+                  ioAction = (StgClosure *) deRefStablePtr (
+                      rts_breakpoint_io_action);
+
+                  Sp -= 11;
+                  Sp[10] = (W_)obj;
+                  Sp[9]  = (W_)&stg_apply_interp_info;
+                  Sp[8]  = (W_)new_aps;
+                  Sp[7]  = (W_)False_closure;         // True <=> a breakpoint
+                  Sp[6]  = (W_)&stg_ap_ppv_info;
+                  Sp[5]  = (W_)BCO_LIT(arg3_module_uniq);
+                  Sp[4]  = (W_)&stg_ap_n_info;
+                  Sp[3]  = (W_)arg2_array_index;
+                  Sp[2]  = (W_)&stg_ap_n_info;
+                  Sp[1]  = (W_)ioAction;
+                  Sp[0]  = (W_)&stg_enter_info;
+
                   // set the flag in the TSO to say that we are now
                   // stopping at a breakpoint so that when we resume
                   // we don't stop on the same breakpoint that we
@@ -981,9 +1165,14 @@ run_BCO:
 
         case bci_PUSH_ALTS: {
             int o_bco  = BCO_GET_LARGE_ARG;
-            Sp[-2] = (W_)&stg_ctoi_R1p_info;
-            Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+            Sp[1] = BCO_PTR(o_bco);
+            Sp[0] = (W_)&stg_ctoi_R1p_info;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -992,6 +1181,11 @@ run_BCO:
             Sp[-2] = (W_)&stg_ctoi_R1unpt_info;
             Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -1000,6 +1194,11 @@ run_BCO:
             Sp[-2] = (W_)&stg_ctoi_R1n_info;
             Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -1008,6 +1207,11 @@ run_BCO:
             Sp[-2] = (W_)&stg_ctoi_F1_info;
             Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -1016,6 +1220,11 @@ run_BCO:
             Sp[-2] = (W_)&stg_ctoi_D1_info;
             Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -1024,6 +1233,11 @@ run_BCO:
             Sp[-2] = (W_)&stg_ctoi_L1_info;
             Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -1032,6 +1246,11 @@ run_BCO:
             Sp[-2] = (W_)&stg_ctoi_V_info;
             Sp[-1] = BCO_PTR(o_bco);
             Sp -= 2;
+#if defined(PROFILING)
+            Sp -= 2;
+            Sp[1] = (W_)cap->r.rCCCS;
+            Sp[0] = (W_)&stg_restore_cccs_info;
+#endif
             goto nextInsn;
         }
 
@@ -1098,7 +1317,7 @@ run_BCO:
             ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
             Sp[-1] = (W_)ap;
             ap->n_args = n_payload;
-            SET_HDR(ap, &stg_AP_info, CCS_SYSTEM/*ToDo*/)
+            SET_HDR(ap, &stg_AP_info, cap->r.rCCCS)
             Sp --;
             goto nextInsn;
         }
@@ -1109,7 +1328,7 @@ run_BCO:
             ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
             Sp[-1] = (W_)ap;
             ap->n_args = n_payload;
-            SET_HDR(ap, &stg_AP_NOUPD_info, CCS_SYSTEM/*ToDo*/)
+            SET_HDR(ap, &stg_AP_NOUPD_info, cap->r.rCCCS)
             Sp --;
             goto nextInsn;
         }
@@ -1122,7 +1341,7 @@ run_BCO:
             Sp[-1] = (W_)pap;
             pap->n_args = n_payload;
             pap->arity = arity;
-            SET_HDR(pap, &stg_PAP_info, CCS_SYSTEM/*ToDo*/)
+            SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS)
             Sp --;
             goto nextInsn;
         }
@@ -1159,7 +1378,12 @@ run_BCO:
             pap->fun = (StgClosure*)Sp[0];
 
             // The function should be a BCO
-            ASSERT(get_itbl(pap->fun)->type == BCO);
+            if (get_itbl(pap->fun)->type != BCO) {
+#if defined(DEBUG)
+                printClosure(pap->fun);
+#endif
+                barf("bci_MKPAP");
+            }
 
             for (i = 0; i < n_payload; i++)
                 pap->payload[i] = (StgClosure*)Sp[i+1];
@@ -1192,7 +1416,7 @@ run_BCO:
                                                itbl->layout.payload.nptrs );
             StgClosure* con = (StgClosure*)allocate_NONUPD(cap,request);
             ASSERT( itbl->layout.payload.ptrs + itbl->layout.payload.nptrs > 0);
-            SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), CCS_SYSTEM/*ToDo*/);
+            SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), cap->r.rCCCS);
             for (i = 0; i < n_words; i++) {
                 con->payload[i] = (StgClosure*)Sp[i];
             }
@@ -1403,9 +1627,9 @@ run_BCO:
 #define ROUND_UP_WDS(p)  ((((StgWord)(p)) + sizeof(W_)-1)/sizeof(W_))
 
             ffi_cif *cif = (ffi_cif *)marshall_fn;
-            nat nargs = cif->nargs;
-            nat ret_size;
-            nat i;
+            uint32_t nargs = cif->nargs;
+            uint32_t ret_size;
+            uint32_t i;
             int j;
             StgPtr p;
             W_ ret[2];                  // max needed
@@ -1462,15 +1686,15 @@ run_BCO:
             Sp[1] = (W_)obj;
             Sp[0] = (W_)&stg_ret_p_info;
 
-            SAVE_STACK_POINTERS;
-            tok = suspendThread(&cap->r, interruptible ? rtsTrue : rtsFalse);
+            SAVE_THREAD_STATE();
+            tok = suspendThread(&cap->r, interruptible);
 
             // We already made a copy of the arguments above.
             ffi_call(cif, fn, ret, argptrs);
 
             // And restart the thread again, popping the stg_ret_p frame.
             cap = (Capability *)((void *)((unsigned char*)resumeThread(tok) - STG_FIELD_OFFSET(Capability,r)));
-            LOAD_STACK_POINTERS;
+            LOAD_THREAD_STATE();
 
             if (Sp[0] != (W_)&stg_ret_p_info) {
                 // the stack is not how we left it.  This probably