add numSparks# primop (#4167)
[ghc.git] / rts / PrimOps.cmm
index ad761ab..3c7dbdd 100644 (file)
 
 #include "Cmm.h"
 
+#ifdef __PIC__
+import pthread_mutex_lock;
+import pthread_mutex_unlock;
+#endif
+import base_ControlziExceptionziBase_nestedAtomically_closure;
+import EnterCriticalSection;
+import LeaveCriticalSection;
+import ghczmprim_GHCziBool_False_closure;
+#if !defined(mingw32_HOST_OS)
+import sm_mutex;
+#endif
+
 /*-----------------------------------------------------------------------------
   Array Primitives
 
  * round up to the nearest word for the size of the array.
  */
 
-newByteArrayzh_fast
+stg_newByteArrayzh
 {
     W_ words, payload_words, n, p;
-    MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
+    MAYBE_GC(NO_PTRS,stg_newByteArrayzh);
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
+    ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = payload_words;
+    StgArrWords_bytes(p) = n;
     RET_P(p);
 }
 
-newPinnedByteArrayzh_fast
+#define BA_ALIGN 16
+#define BA_MASK  (BA_ALIGN-1)
+
+stg_newPinnedByteArrayzh
 {
-    W_ words, payload_words, n, p;
+    W_ words, n, bytes, payload_words, p;
 
-    MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
+    MAYBE_GC(NO_PTRS,stg_newPinnedByteArrayzh);
     n = R1;
-    payload_words = ROUNDUP_BYTES_TO_WDS(n);
+    bytes = n;
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to BA_ALIGN bytes, so we need to allow space
+       to shift up to BA_ALIGN - 1 bytes: */
+    bytes = bytes + BA_ALIGN - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
+    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // We want an 8-byte aligned array.  allocatePinned() gives us
-    // 8-byte aligned memory by default, but we want to align the
-    // *goods* inside the ArrWords object, so we have to check the
-    // size of the ArrWords header and adjust our size accordingly.
-    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       words = words + 1;
-    }
+    /* Now we need to move p forward so that the payload is aligned
+       to BA_ALIGN bytes: */
+    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
+
+    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    StgArrWords_bytes(p) = n;
+    RET_P(p);
+}
+
+stg_newAlignedPinnedByteArrayzh
+{
+    W_ words, n, bytes, payload_words, p, alignment;
+
+    MAYBE_GC(NO_PTRS,stg_newAlignedPinnedByteArrayzh);
+    n = R1;
+    alignment = R2;
+
+    bytes = n;
 
-    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to <alignment> bytes, so we need to allow space
+       to shift up to <alignment - 1> bytes: */
+    bytes = bytes + alignment - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // Again, if the ArrWords header isn't a multiple of 8 bytes, we
-    // have to push the object forward one word so that the goods
-    // fall on an 8-byte boundary.
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       p = p + WDS(1);
-    }
+    /* Now we need to move p forward so that the payload is aligned
+       to <alignment> bytes. Note that we are assuming that
+       <alignment> is a power of 2, which is technically not guaranteed */
+    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = payload_words;
+    StgArrWords_bytes(p) = n;
     RET_P(p);
 }
 
-newArrayzh_fast
+stg_newArrayzh
 {
-    W_ words, n, init, arr, p;
+    W_ words, n, init, arr, p, size;
     /* Args: R1 = words, R2 = initialisation value */
 
     n = R1;
-    MAYBE_GC(R2_PTR,newArrayzh_fast);
-
-    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
-    ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
+    MAYBE_GC(R2_PTR,stg_newArrayzh);
+
+    // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
+    // in the array, making sure we round up, and then rounding up to a whole
+    // number of words.
+    size = n + mutArrPtrsCardWords(n);
+    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
+    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2];
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(arr) = n;
+    StgMutArrPtrs_size(arr) = size;
 
     // Initialise all elements of the the array with the value in R2
     init = R2;
@@ -112,18 +165,25 @@ newArrayzh_fast
        p = p + WDS(1);
        goto for;
     }
+    // Initialise the mark bits with 0
+  for2:
+    if (p < arr + WDS(size)) {
+       W_[p] = 0;
+       p = p + WDS(1);
+       goto for2;
+    }
 
     RET_P(arr);
 }
 
-unsafeThawArrayzh_fast
+stg_unsafeThawArrayzh
 {
   // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
   //
   // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
   // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
   // it on the mutable list for the GC to remove (removing something from
-  // the mutable list is not easy, because the mut_list is only singly-linked).
+  // the mutable list is not easy).
   // 
   // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
   // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
@@ -151,12 +211,12 @@ unsafeThawArrayzh_fast
    MutVar primitives
    -------------------------------------------------------------------------- */
 
-newMutVarzh_fast
+stg_newMutVarzh
 {
     W_ mv;
     /* Args: R1 = initialisation value */
 
-    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);
+    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
     SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
@@ -165,9 +225,9 @@ newMutVarzh_fast
     RET_P(mv);
 }
 
-atomicModifyMutVarzh_fast
+stg_atomicModifyMutVarzh
 {
-    W_ mv, z, x, y, r;
+    W_ mv, f, z, x, y, r, h;
     /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */
 
     /* If x is the current contents of the MutVar#, then 
@@ -204,21 +264,17 @@ atomicModifyMutVarzh_fast
 
 #define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)
 
-   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
-
-#if defined(THREADED_RTS)
-    foreign "C" ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
-#endif
+   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, stg_atomicModifyMutVarzh);
 
-   x = StgMutVar_var(R1);
+   mv = R1;
+   f = R2;
 
    TICK_ALLOC_THUNK_2();
    CCCS_ALLOC(THUNK_2_SIZE);
    z = Hp - THUNK_2_SIZE + WDS(1);
    SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
    LDV_RECORD_CREATE(z);
-   StgThunk_payload(z,0) = R2;
-   StgThunk_payload(z,1) = x;
+   StgThunk_payload(z,0) = f;
 
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
@@ -227,9 +283,6 @@ atomicModifyMutVarzh_fast
    LDV_RECORD_CREATE(y);
    StgThunk_payload(y,0) = z;
 
-   StgMutVar_var(R1) = y;
-   foreign "C" dirty_MUT_VAR(BaseReg "ptr", R1 "ptr") [R1];
-
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
    r = y - THUNK_1_SIZE;
@@ -237,10 +290,20 @@ atomicModifyMutVarzh_fast
    LDV_RECORD_CREATE(r);
    StgThunk_payload(r,0) = z;
 
-#if defined(THREADED_RTS)
-    foreign "C" RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+ retry:
+   x = StgMutVar_var(mv);
+   StgThunk_payload(z,1) = x;
+#ifdef THREADED_RTS
+   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
+   if (h != x) { goto retry; }
+#else
+   StgMutVar_var(mv) = y;
 #endif
 
+   if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
+     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
+   }
+
    RET_P(r);
 }
 
@@ -250,7 +313,7 @@ atomicModifyMutVarzh_fast
 
 STRING(stg_weak_msg,"New weak pointer at %p\n")
 
-mkWeakzh_fast
+stg_mkWeakzh
 {
   /* R1 = key
      R2 = value
@@ -262,29 +325,91 @@ mkWeakzh_fast
     R3 = stg_NO_FINALIZER_closure;
   }
 
-  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh );
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
 
-  StgWeak_key(w)       = R1;
-  StgWeak_value(w)     = R2;
-  StgWeak_finalizer(w) = R3;
+  // We don't care about cfinalizer here.
+  // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
+  // something else?
+
+  StgWeak_key(w)        = R1;
+  StgWeak_value(w)      = R2;
+  StgWeak_finalizer(w)  = R3;
+  StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
 
+  ACQUIRE_LOCK(sm_mutex);
   StgWeak_link(w)      = W_[weak_ptr_list];
   W_[weak_ptr_list]    = w;
+  RELEASE_LOCK(sm_mutex);
 
   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
   RET_P(w);
 }
 
+stg_mkWeakForeignEnvzh
+{
+  /* R1 = key
+     R2 = value
+     R3 = finalizer
+     R4 = pointer
+     R5 = has environment (0 or 1)
+     R6 = environment
+  */
+  W_ w, payload_words, words, p;
+
+  W_ key, val, fptr, ptr, flag, eptr;
+
+  key  = R1;
+  val  = R2;
+  fptr = R3;
+  ptr  = R4;
+  flag = R5;
+  eptr = R6;
+
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh );
+
+  w = Hp - SIZEOF_StgWeak + WDS(1);
+  SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+
+  payload_words = 4;
+  words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+  ("ptr" p)     = foreign "C" allocate(MyCapability() "ptr", words) [];
+
+  TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+  SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+
+  StgArrWords_bytes(p)     = WDS(payload_words);
+  StgArrWords_payload(p,0) = fptr;
+  StgArrWords_payload(p,1) = ptr;
+  StgArrWords_payload(p,2) = eptr;
+  StgArrWords_payload(p,3) = flag;
+
+  // We don't care about the value here.
+  // Should StgWeak_value(w) be stg_NO_FINALIZER_closure or something else?
+
+  StgWeak_key(w)        = key;
+  StgWeak_value(w)      = val;
+  StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
+  StgWeak_cfinalizer(w) = p;
+
+  ACQUIRE_LOCK(sm_mutex);
+  StgWeak_link(w)   = W_[weak_ptr_list];
+  W_[weak_ptr_list] = w;
+  RELEASE_LOCK(sm_mutex);
+
+  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
-finalizzeWeakzh_fast
+  RET_P(w);
+}
+
+stg_finalizzeWeakzh
 {
   /* R1 = weak ptr
    */
-  W_ w, f;
+  W_ w, f, arr;
 
   w = R1;
 
@@ -312,9 +437,18 @@ finalizzeWeakzh_fast
   SET_INFO(w,stg_DEAD_WEAK_info);
   LDV_RECORD_CREATE(w);
 
-  f = StgWeak_finalizer(w);
+  f   = StgWeak_finalizer(w);
+  arr = StgWeak_cfinalizer(w);
+
   StgDeadWeak_link(w) = StgWeak_link(w);
 
+  if (arr != stg_NO_FINALIZER_closure) {
+    foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
+                              StgArrWords_payload(arr,1),
+                              StgArrWords_payload(arr,2),
+                              StgArrWords_payload(arr,3)) [];
+  }
+
   /* return the finalizer */
   if (f == stg_NO_FINALIZER_closure) {
       RET_NP(0,stg_NO_FINALIZER_closure);
@@ -323,7 +457,7 @@ finalizzeWeakzh_fast
   }
 }
 
-deRefWeakzh_fast
+stg_deRefWeakzh
 {
   /* R1 = weak ptr */
   W_ w, code, val;
@@ -340,554 +474,69 @@ deRefWeakzh_fast
 }
 
 /* -----------------------------------------------------------------------------
-   Arbitrary-precision Integer operations.
-
-   There are some assumptions in this code that mp_limb_t == W_.  This is
-   the case for all the platforms that GHC supports, currently.
+   Floating point operations.
    -------------------------------------------------------------------------- */
 
-int2Integerzh_fast
-{
-   /* arguments: R1 = Int# */
-
-   W_ val, s, p;       /* to avoid aliasing */
-
-   val = R1;
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords;
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = 1;
-
-   /* mpz_set_si is inlined here, makes things simpler */
-   if (%lt(val,0)) { 
-       s  = -1;
-       Hp(0) = -val;
-   } else { 
-     if (%gt(val,0)) {
-       s = 1;
-       Hp(0) = val;
-     } else {
-       s = 0;
-     }
-  }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# 
-              #)
-   */
-   RET_NP(s,p);
-}
-
-word2Integerzh_fast
-{
-   /* arguments: R1 = Word# */
-
-   W_ val, s, p;       /* to avoid aliasing */
-
-   val = R1;
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);
-
-   p = Hp - SIZEOF_StgArrWords;
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = 1;
-
-   if (val != 0) {
-       s = 1;
-       W_[Hp] = val;
-   } else {
-       s = 0;
-   }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-
-
-/*
- * 'long long' primops for converting to/from Integers.
- */
-
-#ifdef SUPPORT_LONG_LONGS
-
-int64ToIntegerzh_fast
-{
-   /* arguments: L1 = Int64# */
-
-   L_ val;
-   W_ hi, lo, s, neg, words_needed, p;
-
-   val = L1;
-   neg = 0;
-
-   hi = TO_W_(val >> 32);
-   lo = TO_W_(val);
-
-   if ( hi != 0 && hi != 0xFFFFFFFF )  { 
-       words_needed = 2;
-   } else { 
-       // minimum is one word
-       words_needed = 1;
-   }
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
-              NO_PTRS, int64ToIntegerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = words_needed;
-
-   if ( %lt(hi,0) ) {
-     neg = 1;
-     lo = -lo;
-     if(lo == 0) {
-       hi = -hi;
-     } else {
-       hi = -hi - 1;
-     }
-   }
-
-   if ( words_needed == 2 )  { 
-      s = 2;
-      Hp(-1) = lo;
-      Hp(0) = hi;
-   } else { 
-       if ( lo != 0 ) {
-          s = 1;
-          Hp(0) = lo;
-       } else /* val==0 */  {
-          s = 0;
-       }
-   }
-   if ( neg != 0 ) {
-       s = -s;
-   }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-word64ToIntegerzh_fast
-{
-   /* arguments: L1 = Word64# */
-
-   L_ val;
-   W_ hi, lo, s, words_needed, p;
-
-   val = L1;
-   hi = TO_W_(val >> 32);
-   lo = TO_W_(val);
-
-   if ( hi != 0 ) {
-      words_needed = 2;
-   } else {
-      words_needed = 1;
-   }
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
-              NO_PTRS, word64ToIntegerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = words_needed;
-
-   if ( hi != 0 ) { 
-     s = 2;
-     Hp(-1) = lo;
-     Hp(0)  = hi;
-   } else {
-      if ( lo != 0 ) {
-        s = 1;
-        Hp(0) = lo;
-     } else /* val==0 */  {
-      s = 0;
-     }
-  }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-
-
-
-#endif /* SUPPORT_LONG_LONGS */
-
-/* ToDo: this is shockingly inefficient */
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_tmp2:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result2:
-    bits8 [SIZEOF_MP_INT];
-}
-#endif
-
-#ifdef THREADED_RTS
-#define FETCH_MP_TEMP(X) \
-W_ X; \
-X = BaseReg + (OFFSET_StgRegTable_r ## X);
-#else
-#define FETCH_MP_TEMP(X) /* Nothing */
-#endif
-
-#define GMP_TAKE2_RET1(name,mp_fun)                                     \
-name                                                                    \
-{                                                                       \
-  CInt s1, s2;                                                          \
-  W_ d1, d2;                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-  FETCH_MP_TEMP(mp_result2);                                            \
-                                                                        \
-  /* call doYouWantToGC() */                                            \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
-                                                                        \
-  s1 = W_TO_INT(R1);                                                    \
-  d1 = R2;                                                              \
-  s2 = W_TO_INT(R3);                                                    \
-  d2 = R4;                                                              \
-                                                                        \
-  MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
-  MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
-  MP_INT__mp_d(mp_tmp1)            = BYTE_ARR_CTS(d1);                         \
-  MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
-  MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
-  MP_INT__mp_d(mp_tmp2)            = BYTE_ARR_CTS(d2);                         \
-                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
-                                                                        \
-  /* Perform the operation */                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
-                                                                        \
-  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
-         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
-}
-
-#define GMP_TAKE1_RET1(name,mp_fun)                                     \
-name                                                                    \
-{                                                                       \
-  CInt s1;                                                              \
-  W_ d1;                                                                \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-                                                                        \
-  /* call doYouWantToGC() */                                            \
-  MAYBE_GC(R2_PTR, name);                                               \
-                                                                        \
-  d1 = R2;                                                              \
-  s1 = W_TO_INT(R1);                                                    \
-                                                                        \
-  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));      \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                                 \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                     \
-                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
-                                                                        \
-  /* Perform the operation */                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
-                                                                        \
-  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
-         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
-}
-
-#define GMP_TAKE2_RET2(name,mp_fun)                                                     \
-name                                                                                    \
-{                                                                                       \
-  CInt s1, s2;                                                                          \
-  W_ d1, d2;                                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                                               \
-  FETCH_MP_TEMP(mp_result1)                                                             \
-  FETCH_MP_TEMP(mp_result2)                                                             \
-                                                                                        \
-  /* call doYouWantToGC() */                                                            \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
-                                                                                        \
-  s1 = W_TO_INT(R1);                                                                    \
-  d1 = R2;                                                                              \
-  s2 = W_TO_INT(R3);                                                                    \
-  d2 = R4;                                                                              \
-                                                                                        \
-  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));                      \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                                                 \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                                     \
-  MP_INT__mp_alloc(mp_tmp2)    = W_TO_INT(StgArrWords_words(d2));                      \
-  MP_INT__mp_size(mp_tmp2)     = (s2);                                                 \
-  MP_INT__mp_d(mp_tmp2)                = BYTE_ARR_CTS(d2);                                     \
-                                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                                               \
-  foreign "C" __gmpz_init(mp_result2 "ptr") [];                                               \
-                                                                                        \
-  /* Perform the operation */                                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
-                                                                                        \
-  RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
-           MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
-          TO_W_(MP_INT__mp_size(mp_result2)),                                          \
-           MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
-}
-
-GMP_TAKE2_RET1(plusIntegerzh_fast,     __gmpz_add)
-GMP_TAKE2_RET1(minusIntegerzh_fast,    __gmpz_sub)
-GMP_TAKE2_RET1(timesIntegerzh_fast,    __gmpz_mul)
-GMP_TAKE2_RET1(gcdIntegerzh_fast,      __gmpz_gcd)
-GMP_TAKE2_RET1(quotIntegerzh_fast,     __gmpz_tdiv_q)
-GMP_TAKE2_RET1(remIntegerzh_fast,      __gmpz_tdiv_r)
-GMP_TAKE2_RET1(divExactIntegerzh_fast, __gmpz_divexact)
-GMP_TAKE2_RET1(andIntegerzh_fast,      __gmpz_and)
-GMP_TAKE2_RET1(orIntegerzh_fast,       __gmpz_ior)
-GMP_TAKE2_RET1(xorIntegerzh_fast,      __gmpz_xor)
-GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)
-
-GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
-GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp_w:  W_; // NB. mp_tmp_w is really an here mp_limb_t
-}
-#endif
-
-gcdIntzh_fast
-{
-    /* R1 = the first Int#; R2 = the second Int# */
-    W_ r; 
-    FETCH_MP_TEMP(mp_tmp_w);
-
-    W_[mp_tmp_w] = R1;
-    (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
-
-    R1 = r;
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-
-gcdIntegerIntzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = the int */
-    W_ s1;
-    (s1) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
-    R1 = s1;
-    
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-
-cmpIntegerIntzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = the int */
-    W_ usize, vsize, v_digit, u_digit;
-
-    usize = R1;
-    vsize = 0;
-    v_digit = R3;
-
-    // paraphrased from __gmpz_cmp_si() in the GMP sources
-    if (%gt(v_digit,0)) {
-       vsize = 1;
-    } else { 
-       if (%lt(v_digit,0)) {
-           vsize = -1;
-           v_digit = -v_digit;
-       }
-    }
-
-    if (usize != vsize) {
-       R1 = usize - vsize; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (usize == 0) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    u_digit = W_[BYTE_ARR_CTS(R2)];
-
-    if (u_digit == v_digit) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
-       R1 = usize; 
-    } else {
-       R1 = -usize; 
-    }
-
-    jump %ENTRY_CODE(Sp(0));
-}
-
-cmpIntegerzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
-    W_ usize, vsize, size, up, vp;
-    CInt cmp;
-
-    // paraphrased from __gmpz_cmp() in the GMP sources
-    usize = R1;
-    vsize = R3;
-
-    if (usize != vsize) {
-       R1 = usize - vsize; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (usize == 0) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%lt(usize,0)) { // NB. not <, which is unsigned
-       size = -usize;
-    } else {
-       size = usize;
-    }
-
-    up = BYTE_ARR_CTS(R2);
-    vp = BYTE_ARR_CTS(R4);
-
-    (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];
-
-    if (cmp == 0 :: CInt) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
-       R1 = 1;
-    } else {
-       R1 = (-1); 
-    }
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-integer2Intzh_fast
-{
-    /* R1 = s; R2 = d */
-    W_ r, s;
-
-    s = R1;
-    if (s == 0) {
-       r = 0;
-    } else {
-       r = W_[R2 + SIZEOF_StgArrWords];
-       if (%lt(s,0)) {
-           r = -r;
-       }
-    }
-    /* Result parked in R1, return via info-pointer at TOS */
-    R1 = r;
-    jump %ENTRY_CODE(Sp(0));
-}
-
-integer2Wordzh_fast
-{
-  /* R1 = s; R2 = d */
-  W_ r, s;
-
-  s = R1;
-  if (s == 0) {
-    r = 0;
-  } else {
-    r = W_[R2 + SIZEOF_StgArrWords];
-    if (%lt(s,0)) {
-       r = -r;
-    }
-  }
-  /* Result parked in R1, return via info-pointer at TOS */
-  R1 = r;
-  jump %ENTRY_CODE(Sp(0));
-}
-
-decodeFloatzh_fast
+stg_decodeFloatzuIntzh
 { 
     W_ p;
     F_ arg;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
+    W_ mp_tmp1;
+    W_ mp_tmp_w;
+
+    STK_CHK_GEN( WDS(2), NO_PTRS, stg_decodeFloatzuIntzh );
+
+    mp_tmp1  = Sp - WDS(1);
+    mp_tmp_w = Sp - WDS(2);
     
     /* arguments: F1 = Float# */
     arg = F1;
     
-    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeFloat
-       where mantissa._mp_d can be put (it does not care about the rest) */
-    p = Hp - SIZEOF_StgArrWords;
-    SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
-    StgArrWords_words(p) = 1;
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
-    
     /* Perform the operation */
-    foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
+    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];
     
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+    /* returns: (Int# (mantissa), Int# (exponent)) */
+    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
 }
 
-#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
-#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
-
-decodeDoublezh_fast
+stg_decodeDoublezu2Intzh
 { 
     D_ arg;
     W_ p;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
+    W_ mp_tmp1;
+    W_ mp_tmp2;
+    W_ mp_result1;
+    W_ mp_result2;
+
+    STK_CHK_GEN( WDS(4), NO_PTRS, stg_decodeDoublezu2Intzh );
+
+    mp_tmp1    = Sp - WDS(1);
+    mp_tmp2    = Sp - WDS(2);
+    mp_result1 = Sp - WDS(3);
+    mp_result2 = Sp - WDS(4);
 
     /* arguments: D1 = Double# */
     arg = D1;
 
-    ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeDouble
-       where mantissa.d can be put (it does not care about the rest) */
-    p = Hp - ARR_SIZE + WDS(1);
-    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
-
     /* Perform the operation */
-    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
-    
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
+                                    mp_result1 "ptr", mp_result2 "ptr",
+                                    arg) [];
+
+    /* returns:
+       (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
+    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
 }
 
 /* -----------------------------------------------------------------------------
  * Concurrency primitives
  * -------------------------------------------------------------------------- */
 
-forkzh_fast
+stg_forkzh
 {
   /* args: R1 = closure to spark */
 
-  MAYBE_GC(R1_PTR, forkzh_fast);
+  MAYBE_GC(R1_PTR, stg_forkzh);
 
   W_ closure;
   W_ threadid;
@@ -896,19 +545,26 @@ forkzh_fast
   ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];
+
+  /* start blocked if the current thread is blocked */
+  StgTSO_flags(threadid) = %lobits16(
+     TO_W_(StgTSO_flags(threadid)) | 
+     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
+
   foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
 
-  // switch at the earliest opportunity
-  CInt[context_switch] = 1 :: CInt;
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
+  Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
 }
 
-forkOnzh_fast
+stg_forkOnzh
 {
   /* args: R1 = cpu, R2 = closure to spark */
 
-  MAYBE_GC(R2_PTR, forkOnzh_fast);
+  MAYBE_GC(R2_PTR, stg_forkOnzh);
 
   W_ cpu;
   W_ closure;
@@ -919,26 +575,33 @@ forkOnzh_fast
   ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];
+
+  /* start blocked if the current thread is blocked */
+  StgTSO_flags(threadid) = %lobits16(
+     TO_W_(StgTSO_flags(threadid)) | 
+     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
+
   foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
 
-  // switch at the earliest opportunity
-  CInt[context_switch] = 1 :: CInt;
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
+  Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
 }
 
-yieldzh_fast
+stg_yieldzh
 {
   jump stg_yield_noregs;
 }
 
-myThreadIdzh_fast
+stg_myThreadIdzh
 {
   /* no args. */
   RET_P(CurrentTSO);
 }
 
-labelThreadzh_fast
+stg_labelThreadzh
 {
   /* args: 
        R1 = ThreadId#
@@ -949,7 +612,7 @@ labelThreadzh_fast
   jump %ENTRY_CODE(Sp(0));
 }
 
-isCurrentThreadBoundzh_fast
+stg_isCurrentThreadBoundzh
 {
   /* no args */
   W_ r;
@@ -957,18 +620,45 @@ isCurrentThreadBoundzh_fast
   RET_N(r);
 }
 
+stg_threadStatuszh
+{
+    /* args: R1 :: ThreadId# */
+    W_ tso;
+    W_ why_blocked;
+    W_ what_next;
+    W_ ret;
+
+    tso = R1;
+    loop:
+      if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
+          tso = StgTSO__link(tso);
+          goto loop;
+      }
+
+    what_next   = TO_W_(StgTSO_what_next(tso));
+    why_blocked = TO_W_(StgTSO_why_blocked(tso));
+    // Note: these two reads are not atomic, so they might end up
+    // being inconsistent.  It doesn't matter, since we
+    // only return one or the other.  If we wanted to return the
+    // contents of block_info too, then we'd have to do some synchronisation.
+
+    if (what_next == ThreadComplete) {
+        ret = 16;  // NB. magic, matches up with GHC.Conc.threadStatus
+    } else {
+        if (what_next == ThreadKilled) {
+            ret = 17;
+        } else {
+            ret = why_blocked;
+        }
+    }
+    RET_N(ret);
+}
 
 /* -----------------------------------------------------------------------------
  * TVar primitives
  * -------------------------------------------------------------------------- */
 
-#ifdef REG_R1
 #define SP_OFF 0
-#define IF_NOT_REG_R1(x) 
-#else
-#define SP_OFF 1
-#define IF_NOT_REG_R1(x) x
-#endif
 
 // Catch retry frame ------------------------------------------------------------
 
@@ -976,20 +666,18 @@ INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  W_ unused3, "ptr" W_ unused4, "ptr" W_ unused5)
+  W_ unused3, P_ unused4, P_ unused5)
 {
    W_ r, frame, trec, outer;
-   IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
    frame = Sp;
    trec = StgTSO_trec(CurrentTSO);
-   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+   outer  = StgTRecHeader_enclosing_trec(trec);
    (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
    if (r != 0) {
      /* Succeeded (either first branch or second branch) */
      StgTSO_trec(CurrentTSO) = outer;
      Sp = Sp + SIZEOF_StgCatchRetryFrame;
-     IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
      /* Did not commit: re-execute */
@@ -1012,19 +700,20 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ code, P_ next_invariant_to_check, P_ result)
 {
   W_ frame, trec, valid, next_invariant, q, outer;
-  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
-  frame = Sp;
-  trec = StgTSO_trec(CurrentTSO);
-  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  frame  = Sp;
+  trec   = StgTSO_trec(CurrentTSO);
+  result = R1;
+  outer  = StgTRecHeader_enclosing_trec(trec);
 
   if (outer == NO_TREC) {
     /* First time back at the atomically frame -- pick up invariants */
     ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
     StgAtomicallyFrame_next_invariant_to_check(frame) = q;
+    StgAtomicallyFrame_result(frame) = result;
 
   } else {
     /* Second/subsequent time back at the atomically frame -- abort the
@@ -1058,8 +747,8 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
     if (valid != 0) {
       /* Transaction was valid: commit succeeded */
       StgTSO_trec(CurrentTSO) = NO_TREC;
+      R1 = StgAtomicallyFrame_result(frame);
       Sp = Sp + SIZEOF_StgAtomicallyFrame;
-      IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
       jump %ENTRY_CODE(Sp(SP_OFF));
     } else {
       /* Transaction was not valid: try again */
@@ -1076,10 +765,9 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ code, P_ next_invariant_to_check, P_ result)
 {
   W_ frame, trec, valid;
-  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
   frame = Sp;
 
@@ -1087,9 +775,6 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
   (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
   if (valid != 0) {
     /* Previous attempt is still valid: no point trying again yet */
-         IF_NOT_REG_R1(Sp_adj(-2);
-                       Sp(1) = stg_NO_FINALIZER_closure;
-                       Sp(0) = stg_ut_1_0_unreg_info;)
     jump stg_block_noregs;
   } else {
     /* Previous attempt is no longer valid: try again */
@@ -1103,11 +788,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
 
 // STM catch frame --------------------------------------------------------------
 
-#ifdef REG_R1
 #define SP_OFF 0
-#else
-#define SP_OFF 1
-#endif
 
 /* Catch frames are very similar to update frames, but when entering
  * one we just pop the frame off the stack and perform the correct
@@ -1118,19 +799,17 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
    {
-      IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
       W_ r, frame, trec, outer;
       frame = Sp;
       trec = StgTSO_trec(CurrentTSO);
-      ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+      outer  = StgTRecHeader_enclosing_trec(trec);
       (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
       if (r != 0) {
         /* Commit succeeded */
         StgTSO_trec(CurrentTSO) = outer;
         Sp = Sp + SIZEOF_StgCatchSTMFrame;
-        IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
         jump Sp(SP_OFF);
       } else {
         /* Commit failed */
@@ -1145,24 +824,24 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
 
 // Primop definition ------------------------------------------------------------
 
-atomicallyzh_fast
+stg_atomicallyzh
 {
   W_ frame;
   W_ old_trec;
   W_ new_trec;
   
   // stmStartTransaction may allocate
-  MAYBE_GC (R1_PTR, atomicallyzh_fast); 
+  MAYBE_GC (R1_PTR, stg_atomicallyzh); 
 
   /* Args: R1 = m :: STM a */
-  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);
+  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, stg_atomicallyzh);
 
   old_trec = StgTSO_trec(CurrentTSO);
 
   /* Nested transactions are not allowed; raise an exception */
   if (old_trec != NO_TREC) {
-     R1 = base_GHCziIOBase_NestedAtomically_closure;
-     jump raisezh_fast;
+     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
+     jump stg_raisezh;
   }
 
   /* Set up the atomically frame */
@@ -1171,6 +850,7 @@ atomicallyzh_fast
 
   SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
   StgAtomicallyFrame_code(frame) = R1;
+  StgAtomicallyFrame_result(frame) = NO_TREC;
   StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
 
   /* Start the memory transcation */
@@ -1182,13 +862,13 @@ atomicallyzh_fast
 }
 
 
-catchSTMzh_fast
+stg_catchSTMzh
 {
   W_ frame;
   
   /* Args: R1 :: STM a */
   /* Args: R2 :: Exception -> STM a */
-  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);
+  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, stg_catchSTMzh);
 
   /* Set up the catch frame */
   Sp = Sp - SIZEOF_StgCatchSTMFrame;
@@ -1210,18 +890,18 @@ catchSTMzh_fast
 }
 
 
-catchRetryzh_fast
+stg_catchRetryzh
 {
   W_ frame;
   W_ new_trec;
   W_ trec;
 
   // stmStartTransaction may allocate
-  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast); 
+  MAYBE_GC (R1_PTR & R2_PTR, stg_catchRetryzh); 
 
   /* Args: R1 :: STM a */
   /* Args: R2 :: STM a */
-  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);
+  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, stg_catchRetryzh);
 
   /* Start a nested transaction within which to run the first code */
   trec = StgTSO_trec(CurrentTSO);
@@ -1242,7 +922,7 @@ catchRetryzh_fast
 }
 
 
-retryzh_fast
+stg_retryzh
 {
   W_ frame_type;
   W_ frame;
@@ -1250,7 +930,7 @@ retryzh_fast
   W_ outer;
   W_ r;
 
-  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate
+  MAYBE_GC (NO_PTRS, stg_retryzh); // STM operations may allocate
 
   // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
 retry_pop_stack:
@@ -1259,7 +939,7 @@ retry_pop_stack:
   Sp = StgTSO_sp(CurrentTSO);
   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
-  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  outer  = StgTRecHeader_enclosing_trec(trec);
 
   if (frame_type == CATCH_RETRY_FRAME) {
     // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
@@ -1292,7 +972,7 @@ retry_pop_stack:
     foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
     trec = outer;
     StgTSO_trec(CurrentTSO) = trec;
-    ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+    outer  = StgTRecHeader_enclosing_trec(trec);
   }
   ASSERT(outer == NO_TREC);
 
@@ -1302,9 +982,6 @@ retry_pop_stack:
     StgHeader_info(frame) = stg_atomically_waiting_frame_info;
     Sp = frame;
     // Fix up the stack in the unregisterised case: the return convention is different.
-    IF_NOT_REG_R1(Sp_adj(-2); 
-                 Sp(1) = stg_NO_FINALIZER_closure;
-                 Sp(0) = stg_ut_1_0_unreg_info;)
     R3 = trec; // passing to stmWaitUnblock()
     jump stg_block_stmwait;
   } else {
@@ -1318,12 +995,12 @@ retry_pop_stack:
 }
 
 
-checkzh_fast
+stg_checkzh
 {
   W_ trec, closure;
 
   /* Args: R1 = invariant closure */
-  MAYBE_GC (R1_PTR, checkzh_fast); 
+  MAYBE_GC (R1_PTR, stg_checkzh); 
 
   trec = StgTSO_trec(CurrentTSO);
   closure = R1;
@@ -1335,21 +1012,21 @@ checkzh_fast
 }
 
 
-newTVarzh_fast
+stg_newTVarzh
 {
   W_ tv;
   W_ new_value;
 
   /* Args: R1 = initialisation value */
 
-  MAYBE_GC (R1_PTR, newTVarzh_fast); 
+  MAYBE_GC (R1_PTR, stg_newTVarzh); 
   new_value = R1;
   ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
   RET_P(tv);
 }
 
 
-readTVarzh_fast
+stg_readTVarzh
 {
   W_ trec;
   W_ tvar;
@@ -1357,7 +1034,7 @@ readTVarzh_fast
 
   /* Args: R1 = TVar closure */
 
-  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
+  MAYBE_GC (R1_PTR, stg_readTVarzh); // Call to stmReadTVar may allocate
   trec = StgTSO_trec(CurrentTSO);
   tvar = R1;
   ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
@@ -1365,8 +1042,19 @@ readTVarzh_fast
   RET_P(result);
 }
 
+stg_readTVarIOzh
+{
+    W_ result;
 
-writeTVarzh_fast
+again:
+    result = StgTVar_current_value(R1);
+    if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
+        goto again;
+    }
+    RET_P(result);
+}
+
+stg_writeTVarzh
 {
   W_ trec;
   W_ tvar;
@@ -1375,7 +1063,7 @@ writeTVarzh_fast
   /* Args: R1 = TVar closure */
   /*       R2 = New value    */
 
-  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
+  MAYBE_GC (R1_PTR & R2_PTR, stg_writeTVarzh); // Call to stmWriteTVar may allocate
   trec = StgTSO_trec(CurrentTSO);
   tvar = R1;
   new_value = R2;
@@ -1417,26 +1105,27 @@ writeTVarzh_fast
  *
  * -------------------------------------------------------------------------- */
 
-isEmptyMVarzh_fast
+stg_isEmptyMVarzh
 {
     /* args: R1 = MVar closure */
 
-    if (GET_INFO(R1) == stg_EMPTY_MVAR_info) {
+    if (StgMVar_value(R1) == stg_END_TSO_QUEUE_closure) {
        RET_N(1);
     } else {
        RET_N(0);
     }
 }
 
-newMVarzh_fast
+stg_newMVarzh
 {
     /* args: none */
     W_ mvar;
 
-    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
+    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh );
   
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
-    SET_HDR(mvar,stg_EMPTY_MVAR_info,W_[CCCS]);
+    SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]);
+        // MVARs start dirty: generation 0 has no mutable list
     StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
@@ -1444,24 +1133,17 @@ newMVarzh_fast
 }
 
 
-/* If R1 isn't available, pass it on the stack */
-#ifdef REG_R1
 #define PerformTake(tso, value)                                \
     W_[StgTSO_sp(tso) + WDS(1)] = value;               \
     W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;
-#else
-#define PerformTake(tso, value)                                        \
-    W_[StgTSO_sp(tso) + WDS(1)] = value;                       \
-    W_[StgTSO_sp(tso) + WDS(0)] = stg_ut_1_0_unreg_info;
-#endif
 
 #define PerformPut(tso,lval)                   \
     StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);  \
     lval = W_[StgTSO_sp(tso) - WDS(1)];
 
-takeMVarzh_fast
+stg_takeMVarzh
 {
-    W_ mvar, val, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar closure */
     mvar = R1;
@@ -1471,80 +1153,101 @@ takeMVarzh_fast
 #else
     info = GET_INFO(mvar);
 #endif
+        
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
+    }
 
     /* If the MVar is empty, put ourselves on its blocking queue,
      * and wait until we're woken up.
      */
-    if (info == stg_EMPTY_MVAR_info) {
+    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
+        
+        // Note [mvar-heap-check] We want to do the heap check in the
+        // branch here, to avoid the conditional in the common case.
+        // However, we've already locked the MVar above, so we better
+        // be careful to unlock it again if the the heap check fails.
+        // Unfortunately we don't have an easy way to inject any code
+        // into the heap check generated by the code generator, so we
+        // have to do it in stg_gc_gen (see HeapStackCheck.cmm).
+        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR, stg_takeMVarzh);
+
+        q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
+
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+        StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
+        StgMVarTSOQueue_tso(q)  = CurrentTSO;
+
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-           StgMVar_head(mvar) = CurrentTSO;
+           StgMVar_head(mvar) = q;
        } else {
-           StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+            StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
+            foreign "C" recordClosureMutated(MyCapability() "ptr",
+                                             StgMVar_tail(mvar)) [];
        }
-       StgTSO_link(CurrentTSO)        = stg_END_TSO_QUEUE_closure;
-       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+       StgTSO__link(CurrentTSO)       = q;
        StgTSO_block_info(CurrentTSO)  = mvar;
-       StgMVar_tail(mvar) = CurrentTSO;
+       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+       StgMVar_tail(mvar)             = q;
        
+        R1 = mvar;
        jump stg_block_takemvar;
-  }
+    }
+    
+    /* we got the value... */
+    val = StgMVar_value(mvar);
+    
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+        /* No further putMVars, MVar is now empty */
+        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+        unlockClosure(mvar, stg_MVAR_DIRTY_info);
+        RET_P(val);
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
+    
+    // There are putMVar(s) waiting... wake up the first thread on the queue
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-  /* we got the value... */
-  val = StgMVar_value(mvar);
-
-  if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
-  {
-      /* There are putMVar(s) waiting... 
-       * wake up the first thread on the queue
-       */
-      ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
-
-      /* actually perform the putMVar for the thread that we just woke up */
-      tso = StgMVar_head(mvar);
-      PerformPut(tso,StgMVar_value(mvar));
-      dirtyTSO(tso);
-
-#if defined(GRAN) || defined(PAR)
-      /* ToDo: check 2nd arg (mvar) is right */
-      ("ptr" tso) = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
-      StgMVar_head(mvar) = tso;
-#else
-      ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", 
-                                        StgMVar_head(mvar) "ptr") [];
-      StgMVar_head(mvar) = tso;
-#endif
+loop2:
+    if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
+        tso = StgTSO__link(tso);
+        goto loop2;
+    }
 
-      if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-         StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-      }
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
 
-#if defined(THREADED_RTS)
-      unlockClosure(mvar, stg_FULL_MVAR_info);
-#endif
-      RET_P(val);
-  } 
-  else
-  {
-      /* No further putMVars, MVar is now empty */
-      StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-#if defined(THREADED_RTS)
-      unlockClosure(mvar, stg_EMPTY_MVAR_info);
-#else
-      SET_INFO(mvar,stg_EMPTY_MVAR_info);
-#endif
+    // actually perform the putMVar for the thread that we just woke up
+    PerformPut(tso,StgMVar_value(mvar));
 
-      RET_P(val);
-  }
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.
+
+    foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) [];
+    
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    RET_P(val);
 }
 
 
-tryTakeMVarzh_fast
+stg_tryTakeMVarzh
 {
-    W_ mvar, val, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar closure */
-
     mvar = R1;
 
 #if defined(THREADED_RTS)
@@ -1552,199 +1255,237 @@ tryTakeMVarzh_fast
 #else
     info = GET_INFO(mvar);
 #endif
-
-    if (info == stg_EMPTY_MVAR_info) {
+        
+    /* If the MVar is empty, put ourselves on its blocking queue,
+     * and wait until we're woken up.
+     */
+    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
 #if defined(THREADED_RTS)
-        unlockClosure(mvar, stg_EMPTY_MVAR_info);
+        unlockClosure(mvar, info);
 #endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
         */
        RET_NP(0, stg_NO_FINALIZER_closure);
     }
+    
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
+    }
 
     /* we got the value... */
     val = StgMVar_value(mvar);
+    
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+        /* No further putMVars, MVar is now empty */
+        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+        unlockClosure(mvar, stg_MVAR_DIRTY_info);
+        RET_NP(1, val);
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
+    
+    // There are putMVar(s) waiting... wake up the first thread on the queue
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+loop2:
+    if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
+        tso = StgTSO__link(tso);
+        goto loop2;
+    }
 
-       /* There are putMVar(s) waiting... 
-        * wake up the first thread on the queue
-        */
-       ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
 
-       /* actually perform the putMVar for the thread that we just woke up */
-       tso = StgMVar_head(mvar);
-       PerformPut(tso,StgMVar_value(mvar));
-        dirtyTSO(tso);
+    // actually perform the putMVar for the thread that we just woke up
+    PerformPut(tso,StgMVar_value(mvar));
 
-#if defined(GRAN) || defined(PAR)
-       /* ToDo: check 2nd arg (mvar) is right */
-       ("ptr" tso) = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
-       StgMVar_head(mvar) = tso;
-#else
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr",
-                                          StgMVar_head(mvar) "ptr") [];
-       StgMVar_head(mvar) = tso;
-#endif
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.
 
-       if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-           StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-       }
-#if defined(THREADED_RTS)
-        unlockClosure(mvar, stg_FULL_MVAR_info);
-#endif
-    }
-    else 
-    {
-       /* No further putMVars, MVar is now empty */
-       StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-#if defined(THREADED_RTS)
-       unlockClosure(mvar, stg_EMPTY_MVAR_info);
-#else
-       SET_INFO(mvar,stg_EMPTY_MVAR_info);
-#endif
-    }
+    foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) [];
     
-    RET_NP(1, val);
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    RET_P(val);
 }
 
 
-putMVarzh_fast
+stg_putMVarzh
 {
-    W_ mvar, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
+    val  = R2;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
 #else
     info = GET_INFO(mvar);
 #endif
 
-    if (info == stg_FULL_MVAR_info) {
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+    }
+
+    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
+
+        // see Note [mvar-heap-check] above
+        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR & R2_PTR, stg_putMVarzh);
+
+        q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
+
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+        StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
+        StgMVarTSOQueue_tso(q)  = CurrentTSO;
+
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-           StgMVar_head(mvar) = CurrentTSO;
+           StgMVar_head(mvar) = q;
        } else {
-           StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+            StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
+            foreign "C" recordClosureMutated(MyCapability() "ptr",
+                                             StgMVar_tail(mvar)) [];
        }
-       StgTSO_link(CurrentTSO)        = stg_END_TSO_QUEUE_closure;
-       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+       StgTSO__link(CurrentTSO)       = q;
        StgTSO_block_info(CurrentTSO)  = mvar;
-       StgMVar_tail(mvar) = CurrentTSO;
-       
+       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+       StgMVar_tail(mvar)             = q;
+
+        R1 = mvar;
+        R2 = val;
        jump stg_block_putmvar;
     }
   
-    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
-
-       /* There are takeMVar(s) waiting: wake up the first one
-        */
-       ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
-
-       /* actually perform the takeMVar */
-       tso = StgMVar_head(mvar);
-       PerformTake(tso, R2);
-        dirtyTSO(tso);
-      
-#if defined(GRAN) || defined(PAR)
-       /* ToDo: check 2nd arg (mvar) is right */
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
-       StgMVar_head(mvar) = tso;
-#else
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
-       StgMVar_head(mvar) = tso;
-#endif
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+       /* No further takes, the MVar is now full. */
+       StgMVar_value(mvar) = val;
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
+       jump %ENTRY_CODE(Sp(0));
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
 
-       if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-           StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-       }
+    // There are takeMVar(s) waiting: wake up the first one
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-#if defined(THREADED_RTS)
-       unlockClosure(mvar, stg_EMPTY_MVAR_info);
-#endif
-       jump %ENTRY_CODE(Sp(0));
+loop2:
+    if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
+        tso = StgTSO__link(tso);
+        goto loop2;
     }
-    else
-    {
-       /* No further takes, the MVar is now full. */
-       StgMVar_value(mvar) = R2;
 
-#if defined(THREADED_RTS)
-       unlockClosure(mvar, stg_FULL_MVAR_info);
-#else
-       SET_INFO(mvar,stg_FULL_MVAR_info);
-#endif
-       jump %ENTRY_CODE(Sp(0));
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
+
+    // actually perform the takeMVar
+    PerformTake(tso, val);
+
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    if (TO_W_(StgTSO_dirty(tso)) == 0) {
+        foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
     }
     
-    /* ToDo: yield afterward for better communication performance? */
+    foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) [];
+
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    jump %ENTRY_CODE(Sp(0));
 }
 
 
-tryPutMVarzh_fast
+stg_tryPutMVarzh
 {
-    W_ mvar, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
+    val  = R2;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
 #else
     info = GET_INFO(mvar);
 #endif
 
-    if (info == stg_FULL_MVAR_info) {
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+    }
+
+    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
 #if defined(THREADED_RTS)
-       unlockClosure(mvar, stg_FULL_MVAR_info);
+       unlockClosure(mvar, info);
 #endif
        RET_N(0);
     }
   
-    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
-
-       /* There are takeMVar(s) waiting: wake up the first one
-        */
-       ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
-       
-       /* actually perform the takeMVar */
-       tso = StgMVar_head(mvar);
-       PerformTake(tso, R2);
-        dirtyTSO(tso);
-      
-#if defined(GRAN) || defined(PAR)
-       /* ToDo: check 2nd arg (mvar) is right */
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
-       StgMVar_head(mvar) = tso;
-#else
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
-       StgMVar_head(mvar) = tso;
-#endif
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+       /* No further takes, the MVar is now full. */
+       StgMVar_value(mvar) = val;
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
+       jump %ENTRY_CODE(Sp(0));
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
 
-       if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-           StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-       }
+    // There are takeMVar(s) waiting: wake up the first one
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-#if defined(THREADED_RTS)
-       unlockClosure(mvar, stg_EMPTY_MVAR_info);
-#endif
+loop2:
+    if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
+        tso = StgTSO__link(tso);
+        goto loop2;
     }
-    else
-    {
-       /* No further takes, the MVar is now full. */
-       StgMVar_value(mvar) = R2;
 
-#if defined(THREADED_RTS)
-       unlockClosure(mvar, stg_FULL_MVAR_info);
-#else
-       SET_INFO(mvar,stg_FULL_MVAR_info);
-#endif
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
+
+    // actually perform the takeMVar
+    PerformTake(tso, val);
+
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    if (TO_W_(StgTSO_dirty(tso)) == 0) {
+        foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
     }
     
-    RET_N(1);
-    /* ToDo: yield afterward for better communication performance? */
+    foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) [];
+
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    jump %ENTRY_CODE(Sp(0));
 }
 
 
@@ -1752,11 +1493,11 @@ tryPutMVarzh_fast
    Stable pointer primitives
    -------------------------------------------------------------------------  */
 
-makeStableNamezh_fast
+stg_makeStableNamezh
 {
     W_ index, sn_obj;
 
-    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
+    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, stg_makeStableNamezh );
   
     (index) = foreign "C" lookupStableName(R1 "ptr") [];
 
@@ -1776,16 +1517,16 @@ makeStableNamezh_fast
 }
 
 
-makeStablePtrzh_fast
+stg_makeStablePtrzh
 {
     /* Args: R1 = a */
     W_ sp;
-    MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
+    MAYBE_GC(R1_PTR, stg_makeStablePtrzh);
     ("ptr" sp) = foreign "C" getStablePtr(R1 "ptr") [];
     RET_N(sp);
 }
 
-deRefStablePtrzh_fast
+stg_deRefStablePtrzh
 {
     /* Args: R1 = the stable ptr */
     W_ r, sp;
@@ -1798,7 +1539,7 @@ deRefStablePtrzh_fast
    Bytecode object primitives
    -------------------------------------------------------------------------  */
 
-newBCOzh_fast
+stg_newBCOzh
 {
     /* R1 = instrs
        R2 = literals
@@ -1810,10 +1551,10 @@ newBCOzh_fast
     
     bitmap_arr = R5;
 
-    words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
+    words = BYTES_TO_WDS(SIZEOF_StgBCO) + BYTE_ARR_WDS(bitmap_arr);
     bytes = WDS(words);
 
-    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, newBCOzh_fast );
+    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh );
 
     bco = Hp - bytes + WDS(1);
     SET_HDR(bco, stg_BCO_info, W_[CCCS]);
@@ -1828,7 +1569,7 @@ newBCOzh_fast
     W_ i;
     i = 0;
 for:
-    if (i < StgArrWords_words(bitmap_arr)) {
+    if (i < BYTE_ARR_WDS(bitmap_arr)) {
        StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
        i = i + 1;
        goto for;
@@ -1838,7 +1579,7 @@ for:
 }
 
 
-mkApUpd0zh_fast
+stg_mkApUpd0zh
 {
     // R1 = the BCO# for the AP
     // 
@@ -1850,7 +1591,7 @@ mkApUpd0zh_fast
     ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
           StgBCO_arity(R1) == HALF_W_(0));
 
-    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
+    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, stg_mkApUpd0zh);
     TICK_ALLOC_UP_THK(0, 0);
     CCCS_ALLOC(SIZEOF_StgAP);
 
@@ -1863,13 +1604,13 @@ mkApUpd0zh_fast
     RET_P(ap);
 }
 
-unpackClosurezh_fast
+stg_unpackClosurezh
 {
 /* args: R1 = closure to analyze */
 // TODO: Consider the absence of ptrs or nonptrs as a special case ?
 
     W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
-    info  = %GET_STD_INFO(R1);
+    info  = %GET_STD_INFO(UNTAG(R1));
 
     // Some closures have non-standard layout, so we omit those here.
     W_ type;
@@ -1893,31 +1634,40 @@ unpackClosurezh_fast
     }}
 out:
 
-    W_ ptrs_arr_sz, nptrs_arr_sz;
+    W_ ptrs_arr_sz, ptrs_arr_cards, nptrs_arr_sz;
     nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
-    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);
+    ptrs_arr_cards = mutArrPtrsCardWords(ptrs);
+    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs) + WDS(ptrs_arr_cards);
+
+    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, stg_unpackClosurezh);
 
-    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, unpackClosurezh_fast);
+    W_ clos;
+    clos = UNTAG(R1);
 
     ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
     nptrs_arr = Hp - nptrs_arr_sz + WDS(1);
 
     SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
+    StgMutArrPtrs_size(ptrs_arr) = ptrs + ptrs_arr_cards;
+
     p = 0;
 for:
     if(p < ptrs) {
-        W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(R1,p);
+        W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(clos,p);
         p = p + 1;
         goto for;
     }
+    /* We can leave the card table uninitialised, since the array is
+       allocated in the nursery.  The GC will fill it in if/when the array
+       is promoted. */
     
     SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(nptrs_arr) = nptrs;
+    StgArrWords_bytes(nptrs_arr) = WDS(nptrs);
     p = 0;
 for2:
     if(p < nptrs) {
-        W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(R1, p+ptrs);
+        W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(clos, p+ptrs);
         p = p + 1;
         goto for2;
     }
@@ -1932,19 +1682,19 @@ for2:
  * macro in Schedule.h).
  */
 #define APPEND_TO_BLOCKED_QUEUE(tso)                   \
-    ASSERT(StgTSO_link(tso) == END_TSO_QUEUE);         \
+    ASSERT(StgTSO__link(tso) == END_TSO_QUEUE);                \
     if (W_[blocked_queue_hd] == END_TSO_QUEUE) {       \
       W_[blocked_queue_hd] = tso;                      \
     } else {                                           \
-      StgTSO_link(W_[blocked_queue_tl]) = tso;         \
+      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
     }                                                  \
     W_[blocked_queue_tl] = tso;
 
-waitReadzh_fast
+stg_waitReadzh
 {
     /* args: R1 */
 #ifdef THREADED_RTS
-    foreign "C" barf("waitRead# on threaded RTS");
+    foreign "C" barf("waitRead# on threaded RTS") never returns;
 #else
 
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
@@ -1957,11 +1707,11 @@ waitReadzh_fast
 #endif
 }
 
-waitWritezh_fast
+stg_waitWritezh
 {
     /* args: R1 */
 #ifdef THREADED_RTS
-    foreign "C" barf("waitWrite# on threaded RTS");
+    foreign "C" barf("waitWrite# on threaded RTS") never returns;
 #else
 
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
@@ -1975,8 +1725,8 @@ waitWritezh_fast
 }
 
 
-STRING(stg_delayzh_malloc_str, "delayzh_fast")
-delayzh_fast
+STRING(stg_delayzh_malloc_str, "stg_delayzh")
+stg_delayzh
 {
 #ifdef mingw32_HOST_OS
     W_ ares;
@@ -1986,7 +1736,7 @@ delayzh_fast
 #endif
 
 #ifdef THREADED_RTS
-    foreign "C" barf("delay# on threaded RTS");
+    foreign "C" barf("delay# on threaded RTS") never returns;
 #else
 
     /* args: R1 (microsecond delay amount) */
@@ -1998,7 +1748,7 @@ delayzh_fast
     /* could probably allocate this on the heap instead */
     ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_delayzh_malloc_str);
-    reqID = foreign "C" addDelayRequest(R1);
+    (reqID) = foreign "C" addDelayRequest(R1);
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -2017,7 +1767,11 @@ delayzh_fast
     W_ time;
     W_ divisor;
     (time) = foreign "C" getourtimeofday() [R1];
-    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags))*1000;
+    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags));
+    if (divisor == 0) {
+        divisor = 50;
+    }
+    divisor = divisor * 1000;
     target = ((R1 + divisor - 1) / divisor) /* divide rounding up */
            + time + 1; /* Add 1 as getourtimeofday rounds down */
     StgTSO_block_info(CurrentTSO) = target;
@@ -2028,15 +1782,15 @@ delayzh_fast
 while:
     if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
        prev = t;
-       t = StgTSO_link(t);
+       t = StgTSO__link(t);
        goto while;
     }
 
-    StgTSO_link(CurrentTSO) = t;
+    StgTSO__link(CurrentTSO) = t;
     if (prev == NULL) {
        W_[sleeping_queue] = CurrentTSO;
     } else {
-       StgTSO_link(prev) = CurrentTSO;
+        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
     }
     jump stg_block_noregs;
 #endif
@@ -2045,14 +1799,14 @@ while:
 
 
 #ifdef mingw32_HOST_OS
-STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
-asyncReadzh_fast
+STRING(stg_asyncReadzh_malloc_str, "stg_asyncReadzh")
+stg_asyncReadzh
 {
     W_ ares;
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncRead# on threaded RTS");
+    foreign "C" barf("asyncRead# on threaded RTS") never returns;
 #else
 
     /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
@@ -2060,10 +1814,10 @@ asyncReadzh_fast
     StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
 
     /* could probably allocate this on the heap instead */
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncReadzh_malloc_str)
                        [R1,R2,R3,R4];
-    reqID = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
+    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -2073,24 +1827,24 @@ asyncReadzh_fast
 #endif
 }
 
-STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
-asyncWritezh_fast
+STRING(stg_asyncWritezh_malloc_str, "stg_asyncWritezh")
+stg_asyncWritezh
 {
     W_ ares;
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncWrite# on threaded RTS");
+    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
 #else
 
     /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
 
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncWritezh_malloc_str)
                        [R1,R2,R3,R4];
-    reqID = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
+    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
 
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
@@ -2101,14 +1855,14 @@ asyncWritezh_fast
 #endif
 }
 
-STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
-asyncDoProczh_fast
+STRING(stg_asyncDoProczh_malloc_str, "stg_asyncDoProczh")
+stg_asyncDoProczh
 {
     W_ ares;
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncDoProc# on threaded RTS");
+    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
 #else
 
     /* args: R1 = proc, R2 = param */
@@ -2116,10 +1870,10 @@ asyncDoProczh_fast
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
 
     /* could probably allocate this on the heap instead */
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncDoProczh_malloc_str) 
                                [R1,R2];
-    reqID = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
+    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -2130,12 +1884,71 @@ asyncDoProczh_fast
 }
 #endif
 
-// noDuplicate# tries to ensure that none of the thunks under
-// evaluation by the current thread are also under evaluation by
-// another thread.  It relies on *both* threads doing noDuplicate#;
-// the second one will get blocked if they are duplicating some work.
-noDuplicatezh_fast
+/* -----------------------------------------------------------------------------
+ * noDuplicate#
+ *
+ * noDuplicate# tries to ensure that none of the thunks under
+ * evaluation by the current thread are also under evaluation by
+ * another thread.  It relies on *both* threads doing noDuplicate#;
+ * the second one will get blocked if they are duplicating some work.
+ *
+ * The idea is that noDuplicate# is used within unsafePerformIO to
+ * ensure that the IO operation is performed at most once.
+ * noDuplicate# calls threadPaused which acquires an exclusive lock on
+ * all the thunks currently under evaluation by the current thread.
+ *
+ * Consider the following scenario.  There is a thunk A, whose
+ * evaluation requires evaluating thunk B, where thunk B is an
+ * unsafePerformIO.  Two threads, 1 and 2, bother enter A.  Thread 2
+ * is pre-empted before it enters B, and claims A by blackholing it
+ * (in threadPaused).  Thread 1 now enters B, and calls noDuplicate#.
+ *
+ *      thread 1                      thread 2
+ *   +-----------+                 +---------------+
+ *   |    -------+-----> A <-------+-------        |
+ *   |  update   |   BLACKHOLE     | marked_update |
+ *   +-----------+                 +---------------+
+ *   |           |                 |               | 
+ *        ...                             ...
+ *   |           |                 +---------------+
+ *   +-----------+
+ *   |     ------+-----> B
+ *   |  update   |   BLACKHOLE
+ *   +-----------+
+ *
+ * At this point: A is a blackhole, owned by thread 2.  noDuplicate#
+ * calls threadPaused, which walks up the stack and
+ *  - claims B on behalf of thread 1
+ *  - then it reaches the update frame for A, which it sees is already
+ *    a BLACKHOLE and is therefore owned by another thread.  Since
+ *    thread 1 is duplicating work, the computation up to the update
+ *    frame for A is suspended, including thunk B.
+ *  - thunk B, which is an unsafePerformIO, has now been reverted to
+ *    an AP_STACK which could be duplicated - BAD!
+ *  - The solution is as follows: before calling threadPaused, we
+ *    leave a frame on the stack (stg_noDuplicate_info) that will call
+ *    noDuplicate# again if the current computation is suspended and
+ *    restarted.
+ *
+ * See the test program in concurrent/prog003 for a way to demonstrate
+ * this.  It needs to be run with +RTS -N3 or greater, and the bug
+ * only manifests occasionally (once very 10 runs or so).
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET(stg_noDuplicate, RET_SMALL)
 {
+    Sp_adj(1);
+    jump stg_noDuplicatezh;
+}
+
+stg_noDuplicatezh
+{
+    STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh );
+    // leave noDuplicate frame in case the current
+    // computation is suspended and restarted (see above).
+    Sp_adj(-1);
+    Sp(0) = stg_noDuplicate_info;
+
     SAVE_THREAD_STATE();
     ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
     foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
@@ -2145,11 +1958,19 @@ noDuplicatezh_fast
     } else {
         LOAD_THREAD_STATE();
         ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+        // remove the stg_noDuplicate frame if it is still there.
+        if (Sp(0) == stg_noDuplicate_info) {
+            Sp_adj(1);
+        }
         jump %ENTRY_CODE(Sp(0));
     }
 }
 
-getApStackValzh_fast
+/* -----------------------------------------------------------------------------
+   Misc. primitives
+   -------------------------------------------------------------------------- */
+
+stg_getApStackValzh
 {
    W_ ap_stack, offset, val, ok;
 
@@ -2166,3 +1987,71 @@ getApStackValzh_fast
    }
    RET_NP(ok,val);
 }
+
+// Write the cost center stack of the first argument on stderr; return
+// the second.  Possibly only makes sense for already evaluated
+// things?
+stg_traceCcszh
+{
+    W_ ccs;
+
+#ifdef PROFILING
+    ccs = StgHeader_ccs(UNTAG(R1));
+    foreign "C" fprintCCS_stderr(ccs "ptr") [R2];
+#endif
+
+    R1 = R2;
+    ENTER();
+}
+
+stg_getSparkzh
+{
+   W_ spark;
+
+#ifndef THREADED_RTS
+   RET_NP(0,ghczmprim_GHCziBool_False_closure);
+#else
+   (spark) = foreign "C" findSpark(MyCapability());
+   if (spark != 0) {
+      RET_NP(1,spark);
+   } else {
+      RET_NP(0,ghczmprim_GHCziBool_False_closure);
+   }
+#endif
+}
+
+stg_numSparkszh
+{
+  W_ n;
+#ifdef THREADED_RTS
+  (n) = foreign "C" dequeElements(Capability_sparks(MyCapability()));
+#else
+  n = 0;
+#endif
+  RET_N(n);
+}
+
+stg_traceEventzh
+{
+   W_ msg;
+   msg = R1;
+
+#if defined(TRACING) || defined(DEBUG)
+
+   foreign "C" traceUserMsg(MyCapability() "ptr", msg "ptr") [];
+
+#elif defined(DTRACE)
+
+   W_ enabled;
+
+   // We should go through the macro HASKELLEVENT_USER_MSG_ENABLED from
+   // RtsProbes.h, but that header file includes unistd.h, which doesn't
+   // work in Cmm
+   (enabled) = foreign "C" __dtrace_isenabled$HaskellEvent$user__msg$v1() [];
+   if (enabled != 0) {
+     foreign "C" dtraceUserMsgWrapper(MyCapability() "ptr", msg "ptr") [];
+   }
+
+#endif
+   jump %ENTRY_CODE(Sp(0));
+}