Fix memory leak from #12664
[ghc.git] / rts / RtsAPI.c
index 0748871..e724307 100644 (file)
@@ -8,29 +8,28 @@
 
 #include "PosixSource.h"
 #include "Rts.h"
-#include "OSThreads.h"
 #include "RtsAPI.h"
-#include "SchedAPI.h"
-#include "RtsFlags.h"
+#include "HsFFI.h"
+
 #include "RtsUtils.h"
 #include "Prelude.h"
 #include "Schedule.h"
 #include "Capability.h"
 #include "Stable.h"
-
-#include <stdlib.h>
+#include "Threads.h"
+#include "Weak.h"
 
 /* ----------------------------------------------------------------------------
    Building Haskell objects from C datatypes.
 
    TODO: Currently this code does not tag created pointers,
-         however it is not unsafe (the contructor code will do it)
+         however it is not unsafe (the constructor code will do it)
          just inefficient.
    ------------------------------------------------------------------------- */
 HaskellObj
 rts_mkChar (Capability *cap, HsChar c)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap, CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap, CONSTR_sizeW(0,1));
   SET_HDR(p, Czh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(StgChar)c;
   return p;
@@ -39,7 +38,7 @@ rts_mkChar (Capability *cap, HsChar c)
 HaskellObj
 rts_mkInt (Capability *cap, HsInt i)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, Izh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgInt)i;
   return p;
@@ -48,7 +47,7 @@ rts_mkInt (Capability *cap, HsInt i)
 HaskellObj
 rts_mkInt8 (Capability *cap, HsInt8 i)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
   /* Make sure we mask out the bits above the lowest 8 */
   p->payload[0]  = (StgClosure *)(StgInt)i;
@@ -58,7 +57,7 @@ rts_mkInt8 (Capability *cap, HsInt8 i)
 HaskellObj
 rts_mkInt16 (Capability *cap, HsInt16 i)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
   /* Make sure we mask out the relevant bits */
   p->payload[0]  = (StgClosure *)(StgInt)i;
@@ -68,55 +67,25 @@ rts_mkInt16 (Capability *cap, HsInt16 i)
 HaskellObj
 rts_mkInt32 (Capability *cap, HsInt32 i)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, I32zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgInt)i;
   return p;
 }
 
-
-#ifdef sparc_HOST_ARCH
-/* The closures returned by allocateLocal are only guaranteed to be 32 bit
-   aligned, because that's the size of pointers. SPARC v9 can't do
-   misaligned loads/stores, so we have to write the 64bit word in chunks         */
-
-HaskellObj
-rts_mkInt64 (Capability *cap, HsInt64 i_)
-{
-  StgInt64 i   = (StgInt64)i_;
-  StgInt32 *tmp;
-
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
-  SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
-
-  tmp    = (StgInt32*)&(p->payload[0]);
-
-  tmp[0] = (StgInt32)((StgInt64)i >> 32);
-  tmp[1] = (StgInt32)i;                /* truncate high 32 bits */
-
-  return p;
-}
-
-#else
-
 HaskellObj
 rts_mkInt64 (Capability *cap, HsInt64 i)
 {
-  llong *tmp;
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2));
   SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
-  tmp  = (llong*)&(p->payload[0]);
-  *tmp = (StgInt64)i;
+  ASSIGN_Int64((P_)&(p->payload[0]), i);
   return p;
 }
 
-#endif /* sparc_HOST_ARCH */
-
-
 HaskellObj
 rts_mkWord (Capability *cap, HsWord i)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, Wzh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)i;
   return p;
@@ -126,7 +95,7 @@ HaskellObj
 rts_mkWord8 (Capability *cap, HsWord8 w)
 {
   /* see rts_mkInt* comments */
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, W8zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(w & 0xff);
   return p;
@@ -136,7 +105,7 @@ HaskellObj
 rts_mkWord16 (Capability *cap, HsWord16 w)
 {
   /* see rts_mkInt* comments */
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, W16zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffff);
   return p;
@@ -146,57 +115,27 @@ HaskellObj
 rts_mkWord32 (Capability *cap, HsWord32 w)
 {
   /* see rts_mkInt* comments */
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, W32zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffffffff);
   return p;
 }
 
-
-#ifdef sparc_HOST_ARCH
-/* The closures returned by allocateLocal are only guaranteed to be 32 bit
-   aligned, because that's the size of pointers. SPARC v9 can't do
-   misaligned loads/stores, so we have to write the 64bit word in chunks         */
-
-HaskellObj
-rts_mkWord64 (Capability *cap, HsWord64 w_)
-{
-  StgWord64 w = (StgWord64)w_;
-  StgWord32 *tmp;
-
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
-  /* see mk_Int8 comment */
-  SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
-  
-  tmp    = (StgWord32*)&(p->payload[0]);
-
-  tmp[0] = (StgWord32)((StgWord64)w >> 32);
-  tmp[1] = (StgWord32)w;       /* truncate high 32 bits */
-  return p;
-}
-
-#else
-
 HaskellObj
 rts_mkWord64 (Capability *cap, HsWord64 w)
 {
-  ullong *tmp;
-
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2));
   /* see mk_Int8 comment */
   SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
-  tmp  = (ullong*)&(p->payload[0]);
-  *tmp = (StgWord64)w;
+  ASSIGN_Word64((P_)&(p->payload[0]), w);
   return p;
 }
 
-#endif
-
 
 HaskellObj
 rts_mkFloat (Capability *cap, HsFloat f)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
   SET_HDR(p, Fzh_con_info, CCS_SYSTEM);
   ASSIGN_FLT((P_)p->payload, (StgFloat)f);
   return p;
@@ -205,7 +144,7 @@ rts_mkFloat (Capability *cap, HsFloat f)
 HaskellObj
 rts_mkDouble (Capability *cap, HsDouble d)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,sizeofW(StgDouble)));
+  StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,sizeofW(StgDouble)));
   SET_HDR(p, Dzh_con_info, CCS_SYSTEM);
   ASSIGN_DBL((P_)p->payload, (StgDouble)d);
   return p;
@@ -214,7 +153,7 @@ rts_mkDouble (Capability *cap, HsDouble d)
 HaskellObj
 rts_mkStablePtr (Capability *cap, HsStablePtr s)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
+  StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
   SET_HDR(p, StablePtr_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)s;
   return p;
@@ -223,7 +162,7 @@ rts_mkStablePtr (Capability *cap, HsStablePtr s)
 HaskellObj
 rts_mkPtr (Capability *cap, HsPtr a)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
+  StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
   SET_HDR(p, Ptr_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)a;
   return p;
@@ -232,7 +171,7 @@ rts_mkPtr (Capability *cap, HsPtr a)
 HaskellObj
 rts_mkFunPtr (Capability *cap, HsFunPtr a)
 {
-  StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
+  StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
   SET_HDR(p, FunPtr_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)a;
   return p;
@@ -259,8 +198,11 @@ rts_apply (Capability *cap, HaskellObj f, HaskellObj arg)
 {
     StgThunk *ap;
 
-    ap = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk) + 2);
-    SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_SYSTEM);
+    ap = (StgThunk *)allocate(cap,sizeofW(StgThunk) + 2);
+    // Here we don't want to use CCS_SYSTEM, because it's a hidden cost centre,
+    // and evaluating Haskell code under a hidden cost centre leads to
+    // confusing profiling output. (#7753)
+    SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_MAIN);
     ap->payload[0] = f;
     ap->payload[1] = arg;
     return (StgClosure *)ap;
@@ -320,41 +262,15 @@ rts_getInt32 (HaskellObj p)
   return (HsInt32)(HsInt)(UNTAG_CLOSURE(p)->payload[0]);
 }
 
-
-#ifdef sparc_HOST_ARCH
-/* The closures returned by allocateLocal are only guaranteed to be 32 bit
-   aligned, because that's the size of pointers. SPARC v9 can't do
-   misaligned loads/stores, so we have to read the 64bit word in chunks         */
-
 HsInt64
 rts_getInt64 (HaskellObj p)
 {
-    HsInt32* tmp;
     // See comment above:
     // ASSERT(p->header.info == I64zh_con_info ||
     //        p->header.info == I64zh_static_info);
-    tmp = (HsInt32*)&(UNTAG_CLOSURE(p)->payload[0]);
-
-    HsInt64 i  = (HsInt64)((HsInt64)(tmp[0]) << 32) | (HsInt64)tmp[1];
-    return i;
+    return PK_Int64((P_)&(UNTAG_CLOSURE(p)->payload[0]));
 }
 
-#else
-
-HsInt64
-rts_getInt64 (HaskellObj p)
-{
-    HsInt64* tmp;
-    // See comment above:
-    // ASSERT(p->header.info == I64zh_con_info ||
-    //        p->header.info == I64zh_static_info);
-    tmp = (HsInt64*)&(UNTAG_CLOSURE(p)->payload[0]);
-    return *tmp;
-}
-
-#endif /* sparc_HOST_ARCH */
-
-
 HsWord
 rts_getWord (HaskellObj p)
 {
@@ -391,41 +307,15 @@ rts_getWord32 (HaskellObj p)
     return (HsWord32)(HsWord)(UNTAG_CLOSURE(p)->payload[0]);
 }
 
-
-#ifdef sparc_HOST_ARCH
-/* The closures returned by allocateLocal are only guaranteed to be 32 bit
-   aligned, because that's the size of pointers. SPARC v9 can't do
-   misaligned loads/stores, so we have to write the 64bit word in chunks         */
-
-HsWord64
-rts_getWord64 (HaskellObj p)
-{
-    HsInt32* tmp;
-    // See comment above:
-    // ASSERT(p->header.info == I64zh_con_info ||
-    //        p->header.info == I64zh_static_info);
-    tmp = (HsInt32*)&(UNTAG_CLOSURE(p)->payload[0]);
-
-    HsInt64 i  = (HsWord64)((HsWord64)(tmp[0]) << 32) | (HsWord64)tmp[1];
-    return i;
-}
-
-#else
-
 HsWord64
 rts_getWord64 (HaskellObj p)
 {
-    HsWord64* tmp;
     // See comment above:
     // ASSERT(p->header.info == W64zh_con_info ||
     //        p->header.info == W64zh_static_info);
-    tmp = (HsWord64*)&(UNTAG_CLOSURE(p)->payload[0]);
-    return *tmp;
+    return PK_Word64((P_)&(UNTAG_CLOSURE(p)->payload[0]));
 }
 
-#endif
-
-
 HsFloat
 rts_getFloat (HaskellObj p)
 {
@@ -474,13 +364,13 @@ rts_getFunPtr (HaskellObj p)
 HsBool
 rts_getBool (HaskellObj p)
 {
-    StgInfoTable *info;
+    const StgInfoTable *info;
 
-    info = get_itbl((StgClosure *)UNTAG_CLOSURE(p));
+    info = get_itbl((const StgClosure *)UNTAG_CONST_CLOSURE(p));
     if (info->srt_bitmap == 0) { // srt_bitmap is the constructor tag
-       return 0;
+        return 0;
     } else {
-       return 1;
+        return 1;
     }
 }
 
@@ -489,12 +379,12 @@ rts_getBool (HaskellObj p)
    -------------------------------------------------------------------------- */
 
 INLINE_HEADER void pushClosure   (StgTSO *tso, StgWord c) {
-  tso->sp--;
-  tso->sp[0] = (W_) c;
+  tso->stackobj->sp--;
+  tso->stackobj->sp[0] = (W_) c;
 }
 
 StgTSO *
-createGenThread (Capability *cap, nat stack_size,  StgClosure *closure)
+createGenThread (Capability *cap, W_ stack_size,  StgClosure *closure)
 {
   StgTSO *t;
   t = createThread (cap, stack_size);
@@ -504,11 +394,10 @@ createGenThread (Capability *cap, nat stack_size,  StgClosure *closure)
 }
 
 StgTSO *
-createIOThread (Capability *cap, nat stack_size,  StgClosure *closure)
+createIOThread (Capability *cap, W_ stack_size,  StgClosure *closure)
 {
   StgTSO *t;
   t = createThread (cap, stack_size);
-  pushClosure(t, (W_)&stg_noforceIO_info);
   pushClosure(t, (W_)&stg_ap_v_info);
   pushClosure(t, (W_)closure);
   pushClosure(t, (W_)&stg_enter_info);
@@ -521,7 +410,7 @@ createIOThread (Capability *cap, nat stack_size,  StgClosure *closure)
  */
 
 StgTSO *
-createStrictIOThread(Capability *cap, nat stack_size,  StgClosure *closure)
+createStrictIOThread(Capability *cap, W_ stack_size,  StgClosure *closure)
 {
   StgTSO *t;
   t = createThread(cap, stack_size);
@@ -536,36 +425,39 @@ createStrictIOThread(Capability *cap, nat stack_size,  StgClosure *closure)
    Evaluating Haskell expressions
    ------------------------------------------------------------------------- */
 
-Capability *
-rts_eval (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+void rts_eval (/* inout */ Capability **cap,
+               /* in    */ HaskellObj p,
+               /* out */   HaskellObj *ret)
 {
     StgTSO *tso;
-    
-    tso = createGenThread(cap, RtsFlags.GcFlags.initialStkSize, p);
-    return scheduleWaitThread(tso,ret,cap);
+
+    tso = createGenThread(*cap, RtsFlags.GcFlags.initialStkSize, p);
+    scheduleWaitThread(tso,ret,cap);
 }
 
-Capability *
-rts_eval_ (Capability *cap, HaskellObj p, unsigned int stack_size, 
-          /*out*/HaskellObj *ret)
+void rts_eval_ (/* inout */ Capability **cap,
+                /* in    */ HaskellObj p,
+                /* in    */ unsigned int stack_size,
+                /* out   */ HaskellObj *ret)
 {
     StgTSO *tso;
 
-    tso = createGenThread(cap, stack_size, p);
-    return scheduleWaitThread(tso,ret,cap);
+    tso = createGenThread(*cap, stack_size, p);
+    scheduleWaitThread(tso,ret,cap);
 }
 
 /*
  * rts_evalIO() evaluates a value of the form (IO a), forcing the action's
  * result to WHNF before returning.
  */
-Capability *
-rts_evalIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+void rts_evalIO (/* inout */ Capability **cap,
+                 /* in    */ HaskellObj p,
+                 /* out */   HaskellObj *ret)
 {
-    StgTSO* tso; 
-    
-    tso = createStrictIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
-    return scheduleWaitThread(tso,ret,cap);
+    StgTSO* tso;
+
+    tso = createStrictIOThread(*cap, RtsFlags.GcFlags.initialStkSize, p);
+    scheduleWaitThread(tso,ret,cap);
 }
 
 /*
@@ -574,49 +466,50 @@ rts_evalIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
  * action's result to WHNF before returning.  The result is returned
  * in a StablePtr.
  */
-Capability *
-rts_evalStableIO (Capability *cap, HsStablePtr s, /*out*/HsStablePtr *ret)
+void rts_evalStableIO (/* inout */ Capability **cap,
+                       /* in    */ HsStablePtr s,
+                       /* out */   HsStablePtr *ret)
 {
     StgTSO* tso;
     StgClosure *p, *r;
     SchedulerStatus stat;
-    
+
     p = (StgClosure *)deRefStablePtr(s);
-    tso = createStrictIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+    tso = createStrictIOThread(*cap, RtsFlags.GcFlags.initialStkSize, p);
     // async exceptions are always blocked by default in the created
     // thread.  See #1048.
     tso->flags |= TSO_BLOCKEX | TSO_INTERRUPTIBLE;
-    cap = scheduleWaitThread(tso,&r,cap);
-    stat = rts_getSchedStatus(cap);
+    scheduleWaitThread(tso,&r,cap);
+    stat = rts_getSchedStatus(*cap);
 
     if (stat == Success && ret != NULL) {
-       ASSERT(r != NULL);
-       *ret = getStablePtr((StgPtr)r);
+        ASSERT(r != NULL);
+        *ret = getStablePtr((StgPtr)r);
     }
-
-    return cap;
 }
 
 /*
  * Like rts_evalIO(), but doesn't force the action's result.
  */
-Capability *
-rts_evalLazyIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+void rts_evalLazyIO (/* inout */ Capability **cap,
+                     /* in    */ HaskellObj p,
+                     /* out */   HaskellObj *ret)
 {
     StgTSO *tso;
 
-    tso = createIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
-    return scheduleWaitThread(tso,ret,cap);
+    tso = createIOThread(*cap, RtsFlags.GcFlags.initialStkSize, p);
+    scheduleWaitThread(tso,ret,cap);
 }
 
-Capability *
-rts_evalLazyIO_ (Capability *cap, HaskellObj p, unsigned int stack_size, 
-                /*out*/HaskellObj *ret)
+void rts_evalLazyIO_ (/* inout */ Capability **cap,
+                      /* in    */ HaskellObj p,
+                      /* in    */ unsigned int stack_size,
+                      /* out   */ HaskellObj *ret)
 {
     StgTSO *tso;
 
-    tso = createIOThread(cap, stack_size, p);
-    return scheduleWaitThread(tso,ret,cap);
+    tso = createIOThread(*cap, stack_size, p);
+    scheduleWaitThread(tso,ret,cap);
 }
 
 /* Convenience function for decoding the returned status. */
@@ -624,26 +517,35 @@ rts_evalLazyIO_ (Capability *cap, HaskellObj p, unsigned int stack_size,
 void
 rts_checkSchedStatus (char* site, Capability *cap)
 {
-    SchedulerStatus rc = cap->running_task->stat;
+    SchedulerStatus rc = cap->running_task->incall->rstat;
     switch (rc) {
     case Success:
-       return;
+        return;
     case Killed:
-       errorBelch("%s: uncaught exception",site);
-       stg_exit(EXIT_FAILURE);
+        errorBelch("%s: uncaught exception",site);
+        stg_exit(EXIT_FAILURE);
     case Interrupted:
-       errorBelch("%s: interrupted", site);
-       stg_exit(EXIT_FAILURE);
+        errorBelch("%s: interrupted", site);
+#ifdef THREADED_RTS
+        // The RTS is shutting down, and the process will probably
+        // soon exit.  We don't want to preempt the shutdown
+        // by exiting the whole process here, so we just terminate the
+        // current thread.  Don't forget to release the cap first though.
+        rts_unlock(cap);
+        shutdownThread();
+#else
+        stg_exit(EXIT_FAILURE);
+#endif
     default:
-       errorBelch("%s: Return code (%d) not ok",(site),(rc));  
-       stg_exit(EXIT_FAILURE);
+        errorBelch("%s: Return code (%d) not ok",(site),(rc));
+        stg_exit(EXIT_FAILURE);
     }
 }
 
 SchedulerStatus
 rts_getSchedStatus (Capability *cap)
 {
-    return cap->running_task->stat;
+    return cap->running_task->incall->rstat;
 }
 
 Capability *
@@ -654,8 +556,24 @@ rts_lock (void)
 
     task = newBoundTask();
 
+    if (task->running_finalizers) {
+        errorBelch("error: a C finalizer called back into Haskell.\n"
+                   "   This was previously allowed, but is disallowed in GHC 6.10.2 and later.\n"
+                   "   To create finalizers that may call back into Haskell, use\n"
+                   "   Foreign.Concurrent.newForeignPtr instead of Foreign.newForeignPtr.");
+        stg_exit(EXIT_FAILURE);
+    }
+
     cap = NULL;
-    waitForReturnCapability(&cap, task);
+    waitForCapability(&cap, task);
+
+    if (task->incall->prev_stack == NULL) {
+      // This is a new outermost call from C into Haskell land.
+      // Until the corresponding call to rts_unlock, this task
+      // is doing work on behalf of the RTS.
+      traceTaskCreate(task, cap);
+    }
+
     return (Capability *)cap;
 }
 
@@ -689,4 +607,91 @@ rts_unlock (Capability *cap)
     // Finally, we can release the Task to the free list.
     boundTaskExiting(task);
     RELEASE_LOCK(&cap->lock);
+
+    if (task->incall == NULL) {
+      // This is the end of an outermost call from C into Haskell land.
+      // From here on, the task goes back to C land and we should not count
+      // it as doing work on behalf of the RTS.
+      traceTaskDelete(task);
+    }
+}
+
+void rts_done (void)
+{
+    freeMyTask();
+}
+
+/* -----------------------------------------------------------------------------
+   tryPutMVar from outside Haskell
+
+   The C call
+
+      hs_try_putmvar(cap, mvar)
+
+   is equivalent to the Haskell call
+
+      tryPutMVar mvar ()
+
+   but it is
+
+     * non-blocking: takes a bounded, short amount of time
+     * asynchronous: the actual putMVar may be performed after the
+       call returns.  That's why hs_try_putmvar() doesn't return a
+       result to say whether the put succeeded.
+
+   NOTE: this call transfers ownership of the StablePtr to the RTS, which will
+   free it after the tryPutMVar has taken place.  The reason is that otherwise,
+   it would be very difficult for the caller to arrange to free the StablePtr
+   in all circumstances.
+
+   For more details, see the section "Waking up Haskell threads from C" in the
+   User's Guide.
+   -------------------------------------------------------------------------- */
+
+void hs_try_putmvar (/* in */ int capability,
+                     /* in */ HsStablePtr mvar)
+{
+    Task *task = getTask();
+    Capability *cap;
+
+    if (capability < 0) {
+        capability = task->preferred_capability;
+        if (capability < 0) {
+            capability = 0;
+        }
+    }
+    cap = capabilities[capability % enabled_capabilities];
+
+#if !defined(THREADED_RTS)
+
+    performTryPutMVar(cap, (StgMVar*)deRefStablePtr(mvar), Unit_closure);
+    freeStablePtr(mvar);
+
+#else
+
+    ACQUIRE_LOCK(&cap->lock);
+    // If the capability is free, we can perform the tryPutMVar immediately
+    if (cap->running_task == NULL) {
+        cap->running_task = task;
+        task->cap = cap;
+        RELEASE_LOCK(&cap->lock);
+
+        performTryPutMVar(cap, (StgMVar*)deRefStablePtr(mvar), Unit_closure);
+
+        freeStablePtr(mvar);
+
+        // Wake up the capability, which will start running the thread that we
+        // just awoke (if there was one).
+        releaseCapability(cap);
+    } else {
+        PutMVar *p = stgMallocBytes(sizeof(PutMVar),"hs_try_putmvar");
+        // We cannot deref the StablePtr if we don't have a capability,
+        // so we have to store it and deref it later.
+        p->mvar = mvar;
+        p->link = cap->putMVars;
+        cap->putMVars = p;
+        RELEASE_LOCK(&cap->lock);
+    }
+
+#endif
 }