Make start address of `osReserveHeapMemory` tunable via command line -xb
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 447b70e..544b9c2 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -91,12 +91,6 @@ volatile StgWord recent_activity = ACTIVITY_YES;
  */
 volatile StgWord sched_state = SCHED_RUNNING;
 
-/*  This is used in `TSO.h' and gcc 2.96 insists that this variable actually
- *  exists - earlier gccs apparently didn't.
- *  -= chak
- */
-StgTSO dummy_tso;
-
 /*
  * This mutex protects most of the global scheduler data in
  * the THREADED_RTS runtime.
@@ -109,11 +103,6 @@ Mutex sched_mutex;
 #define FORKPROCESS_PRIMOP_SUPPORTED
 #endif
 
-// Local stats
-#ifdef THREADED_RTS
-static nat n_failed_trygrab_idles = 0, n_idle_caps = 0;
-#endif
-
 /* -----------------------------------------------------------------------------
  * static function prototypes
  * -------------------------------------------------------------------------- */
@@ -131,10 +120,12 @@ static void scheduleFindWork (Capability **pcap);
 static void scheduleYield (Capability **pcap, Task *task);
 #endif
 #if defined(THREADED_RTS)
-static nat requestSync (Capability **pcap, Task *task, nat sync_type);
+static rtsBool requestSync (Capability **pcap, Task *task,
+                            PendingSync *new_sync, SyncType *prev_sync_type);
 static void acquireAllCapabilities(Capability *cap, Task *task);
-static void releaseAllCapabilities(nat n, Capability *cap, Task *task);
-static void startWorkerTasks (nat from USED_IF_THREADS, nat to USED_IF_THREADS);
+static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task);
+static void startWorkerTasks (uint32_t from USED_IF_THREADS,
+                                uint32_t to USED_IF_THREADS);
 #endif
 static void scheduleStartSignalHandlers (Capability *cap);
 static void scheduleCheckBlockedThreads (Capability *cap);
@@ -147,7 +138,7 @@ static void scheduleActivateSpark(Capability *cap);
 static void schedulePostRunThread(Capability *cap, StgTSO *t);
 static rtsBool scheduleHandleHeapOverflow( Capability *cap, StgTSO *t );
 static rtsBool scheduleHandleYield( Capability *cap, StgTSO *t,
-                                    nat prev_what_next );
+                                    uint32_t prev_what_next );
 static void scheduleHandleThreadBlocked( StgTSO *t );
 static rtsBool scheduleHandleThreadFinished( Capability *cap, Task *task,
                                              StgTSO *t );
@@ -181,7 +172,7 @@ schedule (Capability *initialCapability, Task *task)
   StgTSO *t;
   Capability *cap;
   StgThreadReturnCode ret;
-  nat prev_what_next;
+  uint32_t prev_what_next;
   rtsBool ready_to_gc;
 #if defined(THREADED_RTS)
   rtsBool first = rtsTrue;
@@ -211,7 +202,7 @@ schedule (Capability *initialCapability, Task *task)
           stg_exit(EXIT_FAILURE);
     }
 
-    // The interruption / shutdown sequence.
+    // Note [shutdown]: The interruption / shutdown sequence.
     //
     // In order to cleanly shut down the runtime, we want to:
     //   * make sure that all main threads return to their callers
@@ -219,21 +210,25 @@ schedule (Capability *initialCapability, Task *task)
    //   * clean up all OS threads associated with the runtime
     //   * free all memory etc.
     //
-    // So the sequence for ^C goes like this:
+    // So the sequence goes like this:
+    //
+    //   * The shutdown sequence is initiated by calling hs_exit(),
+    //     interruptStgRts(), or running out of memory in the GC.
     //
-    //   * ^C handler sets sched_state := SCHED_INTERRUPTING and
-    //     arranges for some Capability to wake up
+    //   * Set sched_state = SCHED_INTERRUPTING
     //
-    //   * all threads in the system are halted, and the zombies are
-    //     placed on the run queue for cleaning up.  We acquire all
-    //     the capabilities in order to delete the threads, this is
-    //     done by scheduleDoGC() for convenience (because GC already
-    //     needs to acquire all the capabilities).  We can't kill
-    //     threads involved in foreign calls.
+    //   * The scheduler notices sched_state = SCHED_INTERRUPTING and calls
+    //     scheduleDoGC(), which halts the whole runtime by acquiring all the
+    //     capabilities, does a GC and then calls deleteAllThreads() to kill all
+    //     the remaining threads.  The zombies are left on the run queue for
+    //     cleaning up.  We can't kill threads involved in foreign calls.
     //
-    //   * somebody calls shutdownHaskell(), which calls exitScheduler()
+    //   * scheduleDoGC() sets sched_state = SCHED_SHUTTING_DOWN
     //
-    //   * sched_state := SCHED_SHUTTING_DOWN
+    //   * After this point, there can be NO MORE HASKELL EXECUTION.  This is
+    //     enforced by the scheduler, which won't run any Haskell code when
+    //     sched_state >= SCHED_INTERRUPTING, and we already sync'd with the
+    //     other capabilities by doing the GC earlier.
     //
     //   * all workers exit when the run queue on their capability
     //     drains.  All main threads will also exit when their TSO
@@ -243,7 +238,7 @@ schedule (Capability *initialCapability, Task *task)
     //     exit.
     //
     //   * We might be left with threads blocked in foreign calls,
-    //     we should really attempt to kill these somehow (TODO);
+    //     we should really attempt to kill these somehow (TODO).
 
     switch (sched_state) {
     case SCHED_RUNNING:
@@ -251,7 +246,7 @@ schedule (Capability *initialCapability, Task *task)
     case SCHED_INTERRUPTING:
         debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
         /* scheduleDoGC() deletes all the threads */
-        scheduleDoGC(&cap,task,rtsFalse);
+        scheduleDoGC(&cap,task,rtsTrue);
 
         // after scheduleDoGC(), we must be shutting down.  Either some
         // other Capability did the final GC, or we did it above,
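To make the sequence in Note [shutdown] concrete, here is a condensed
sketch of the driver side (cf. the exitScheduler() hunk later in this
diff; assertions and locking elided):

    static void exitScheduler_sketch (Task *task)
    {
        if (sched_state < SCHED_SHUTTING_DOWN) {
            sched_state = SCHED_INTERRUPTING;   // request interruption
            Capability *cap = task->cap;
            waitForCapability(&cap, task);
            scheduleDoGC(&cap, task, rtsTrue);  // deletes all threads, then
                                                // sets SCHED_SHUTTING_DOWN
            releaseCapability(cap);
        }
    }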
@@ -428,7 +423,7 @@ run_thread:
     case ACTIVITY_DONE_GC: {
         // ACTIVITY_DONE_GC means we turned off the timer signal to
         // conserve power (see #1623).  Re-enable it here.
-        nat prev;
+        uint32_t prev;
         prev = xchg((P_)&recent_activity, ACTIVITY_YES);
         if (prev == ACTIVITY_DONE_GC) {
 #ifndef PROFILING
@@ -563,7 +558,7 @@ run_thread:
  * Run queue operations
  * -------------------------------------------------------------------------- */
 
-void
+static void
 removeFromRunQueue (Capability *cap, StgTSO *tso)
 {
     if (tso->block_info.prev == END_TSO_QUEUE) {
@@ -579,6 +574,7 @@ removeFromRunQueue (Capability *cap, StgTSO *tso)
         setTSOPrev(cap, tso->_link, tso->block_info.prev);
     }
     tso->_link = tso->block_info.prev = END_TSO_QUEUE;
+    cap->n_run_queue--;
 
     IF_DEBUG(sanity, checkRunQueue(cap));
 }
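The new cap->n_run_queue counter has to be maintained by every queue
operation, not just removeFromRunQueue(). As an illustration, the
enqueue side might look like this (a sketch of appendToRunQueue, which
lives as an inline in rts/Schedule.h; details may differ):

    static void appendToRunQueue_sketch (Capability *cap, StgTSO *tso)
    {
        if (cap->run_queue_hd == END_TSO_QUEUE) {
            cap->run_queue_hd = tso;
            tso->block_info.prev = END_TSO_QUEUE;
        } else {
            setTSOLink(cap, cap->run_queue_tl, tso);
            setTSOPrev(cap, tso, cap->run_queue_tl);
        }
        cap->run_queue_tl = tso;
        cap->n_run_queue++;     // keep the new counter in sync
    }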
@@ -644,7 +640,7 @@ shouldYieldCapability (Capability *cap, Task *task, rtsBool didGcLast)
     // progress at all.
 
     return ((pending_sync && !didGcLast) ||
-            cap->returning_tasks_hd != NULL ||
+            cap->n_returning_tasks != 0 ||
             (!emptyRunQueue(cap) && (task->incall->tso == NULL
                                      ? peekRunQueue(cap)->bound != NULL
                                      : peekRunQueue(cap)->bound != task->incall)));
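The yield condition above is dense; restated with named predicates (an
illustrative sketch only, not a proposed rewrite):

    static rtsBool shouldYield_sketch (Capability *cap, Task *task,
                                       rtsBool didGcLast)
    {
        rtsBool sync_pending   = pending_sync && !didGcLast;
        rtsBool task_returning = cap->n_returning_tasks != 0;
        StgTSO *next = emptyRunQueue(cap) ? NULL : peekRunQueue(cap);
        rtsBool next_bound_elsewhere =
            next != NULL &&
            (task->incall->tso == NULL ? next->bound != NULL
                                       : next->bound != task->incall);
        return sync_pending || task_returning || next_bound_elsewhere;
    }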
@@ -704,26 +700,29 @@ schedulePushWork(Capability *cap USED_IF_THREADS,
 #if defined(THREADED_RTS)
 
     Capability *free_caps[n_capabilities], *cap0;
-    nat i, n_free_caps;
+    uint32_t i, n_wanted_caps, n_free_caps;
 
-    // migration can be turned off with +RTS -qm
-    if (!RtsFlags.ParFlags.migrate) return;
+    uint32_t spare_threads = cap->n_run_queue > 0 ? cap->n_run_queue - 1 : 0;
 
-    // Check whether we have more threads on our run queue, or sparks
-    // in our pool, that we could hand to another Capability.
-    if (emptyRunQueue(cap)) {
-        if (sparkPoolSizeCap(cap) < 2) return;
-    } else {
-        if (singletonRunQueue(cap) &&
-            sparkPoolSizeCap(cap) < 1) return;
+    // migration can be turned off with +RTS -qm
+    if (!RtsFlags.ParFlags.migrate) {
+        spare_threads = 0;
     }
 
-    // First grab as many free Capabilities as we can.
-    for (i=0, n_free_caps=0; i < n_capabilities; i++) {
+    // Figure out how many capabilities we want to wake up.  We need at least
+    // sparkPoolSizeCap(cap) plus the number of spare threads we have.
+    n_wanted_caps = sparkPoolSizeCap(cap) + spare_threads;
+    if (n_wanted_caps == 0) return;
+
+    // First grab as many free Capabilities as we can.  ToDo: we should use
+    // capabilities on the same NUMA node preferably, but not exclusively.
+    for (i = (cap->no + 1) % n_capabilities, n_free_caps=0;
+         n_free_caps < n_wanted_caps && i != cap->no;
+         i = (i + 1) % n_capabilities) {
         cap0 = capabilities[i];
         if (cap != cap0 && !cap0->disabled && tryGrabCapability(cap0,task)) {
             if (!emptyRunQueue(cap0)
-                || cap0->returning_tasks_hd != NULL
+                || cap0->n_returning_tasks != 0
                 || cap0->inbox != (Message*)END_TSO_QUEUE) {
                 // it already has some work, we just grabbed it at
                 // the wrong moment.  Or maybe it's deadlocked!
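Note that the grab loop now starts at the capability after ours and
wraps around, rather than always scanning from capability 0, so
concurrent pushers spread their wakeups. A worked example of the visit
order (a hypothetical standalone demo, not RTS code):

    #include <stdio.h>

    int main(void)
    {
        unsigned int n_capabilities = 4, my_no = 2;
        // same index arithmetic as the loop above
        for (unsigned int i = (my_no + 1) % n_capabilities;
             i != my_no;
             i = (i + 1) % n_capabilities) {
            printf("visit cap %u\n", i);    // prints 3, 0, 1
        }
        return 0;
    }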
@@ -734,10 +733,16 @@ schedulePushWork(Capability *cap USED_IF_THREADS,
         }
     }
 
-    // we now have n_free_caps free capabilities stashed in
-    // free_caps[].  Share our run queue equally with them.  This is
-    // probably the simplest thing we could do; improvements we might
-    // want to do include:
+    // We now have n_free_caps free capabilities stashed in
+    // free_caps[].  Attempt to share our run queue equally with them.
+    // This is complicated slightly by the fact that we can't move
+    // some threads:
+    //
+    //  - threads that have TSO_LOCKED cannot migrate
+    //  - a thread that is bound to the current Task cannot be migrated
+    //
+    // This is about the simplest thing we could do; improvements we
+    // might want to do include:
     //
     //   - giving high priority to moving relatively new threads, on
     //     the grounds that they haven't had time to build up a
@@ -747,89 +752,92 @@ schedulePushWork(Capability *cap USED_IF_THREADS,
 
     if (n_free_caps > 0) {
         StgTSO *prev, *t, *next;
-#ifdef SPARK_PUSHING
-        rtsBool pushed_to_all;
-#endif
 
         debugTrace(DEBUG_sched,
-                   "cap %d: %s and %d free capabilities, sharing...",
-                   cap->no,
-                   (!emptyRunQueue(cap) && !singletonRunQueue(cap))?
-                   "excess threads on run queue":"sparks to share (>=2)",
+                   "cap %d: %d threads, %d sparks, and %d free capabilities, sharing...",
+                   cap->no, cap->n_run_queue, sparkPoolSizeCap(cap),
                    n_free_caps);
 
-        i = 0;
-#ifdef SPARK_PUSHING
-        pushed_to_all = rtsFalse;
-#endif
-
-        if (cap->run_queue_hd != END_TSO_QUEUE) {
-            prev = cap->run_queue_hd;
-            t = prev->_link;
-            prev->_link = END_TSO_QUEUE;
-            for (; t != END_TSO_QUEUE; t = next) {
-                next = t->_link;
-                t->_link = END_TSO_QUEUE;
-                if (t->bound == task->incall // don't move my bound thread
-                    || tsoLocked(t)) {  // don't move a locked thread
-                    setTSOLink(cap, prev, t);
-                    setTSOPrev(cap, t, prev);
-                    prev = t;
-                } else if (i == n_free_caps) {
-#ifdef SPARK_PUSHING
-                    pushed_to_all = rtsTrue;
-#endif
-                    i = 0;
-                    // keep one for us
-                    setTSOLink(cap, prev, t);
-                    setTSOPrev(cap, t, prev);
-                    prev = t;
+        // There are n_free_caps+1 caps in total.  We will share the threads
+        // evenly between them, *except* that if the run queue does not divide
+        // evenly by n_free_caps+1 then we bias towards the current capability.
+        // e.g. with n_run_queue=4, n_free_caps=2, we will keep 2.
+        uint32_t keep_threads =
+            (cap->n_run_queue + n_free_caps) / (n_free_caps + 1);
+
+        // This also ensures that we don't give away all our threads, since
+        // (x + y) / (y + 1) >= 1 when x >= 1.
+
+        // The number of threads we have left.
+        uint32_t n = cap->n_run_queue;
+
+        // prev = the previous thread on this cap's run queue
+        prev = END_TSO_QUEUE;
+
+        // We're going to walk through the run queue, migrating threads to other
+        // capabilities until we have only keep_threads left.  We might
+        // encounter a thread that cannot be migrated, in which case we add it
+        // to the current run queue and decrement keep_threads.
+        for (t = cap->run_queue_hd, i = 0;
+             t != END_TSO_QUEUE && n > keep_threads;
+             t = next)
+        {
+            next = t->_link;
+            t->_link = END_TSO_QUEUE;
+
+            // Should we keep this thread?
+            if (t->bound == task->incall // don't move my bound thread
+                || tsoLocked(t) // don't move a locked thread
+                ) {
+                if (prev == END_TSO_QUEUE) {
+                    cap->run_queue_hd = t;
                 } else {
-                    appendToRunQueue(free_caps[i],t);
-
-                    traceEventMigrateThread (cap, t, free_caps[i]->no);
-
-                    if (t->bound) { t->bound->task->cap = free_caps[i]; }
-                    t->cap = free_caps[i];
-                    i++;
+                    setTSOLink(cap, prev, t);
                 }
+                setTSOPrev(cap, t, prev);
+                prev = t;
+                if (keep_threads > 0) keep_threads--;
             }
-            cap->run_queue_tl = prev;
 
-            IF_DEBUG(sanity, checkRunQueue(cap));
-        }
+            // Or migrate it?
+            else {
+                appendToRunQueue(free_caps[i],t);
+                traceEventMigrateThread (cap, t, free_caps[i]->no);
 
-#ifdef SPARK_PUSHING
-        /* JB I left this code in place, it would work but is not necessary */
-
-        // If there are some free capabilities that we didn't push any
-        // threads to, then try to push a spark to each one.
-        if (!pushed_to_all) {
-            StgClosure *spark;
-            // i is the next free capability to push to
-            for (; i < n_free_caps; i++) {
-                if (emptySparkPoolCap(free_caps[i])) {
-                    spark = tryStealSpark(cap->sparks);
-                    if (spark != NULL) {
-                        /* TODO: if anyone wants to re-enable this code then
-                         * they must consider the fizzledSpark(spark) case
-                         * and update the per-cap spark statistics.
-                         */
-                        debugTrace(DEBUG_sched, "pushing spark %p to capability %d", spark, free_caps[i]->no);
-
-            traceEventStealSpark(free_caps[i], t, cap->no);
-
-                        newSpark(&(free_caps[i]->r), spark);
-                    }
-                }
+                if (t->bound) { t->bound->task->cap = free_caps[i]; }
+                t->cap = free_caps[i];
+                n--; // we have one fewer threads now
+                i++; // move on to the next free_cap
+                if (i == n_free_caps) i = 0;
             }
         }
-#endif /* SPARK_PUSHING */
+
+        // Join up the beginning of the queue (prev)
+        // with the rest of the queue (t)
+        if (t == END_TSO_QUEUE) {
+            cap->run_queue_tl = prev;
+        } else {
+            setTSOPrev(cap, t, prev);
+        }
+        if (prev == END_TSO_QUEUE) {
+            cap->run_queue_hd = t;
+        } else {
+            setTSOLink(cap, prev, t);
+        }
+        cap->n_run_queue = n;
+
+        IF_DEBUG(sanity, checkRunQueue(cap));
 
         // release the capabilities
         for (i = 0; i < n_free_caps; i++) {
             task->cap = free_caps[i];
-            releaseAndWakeupCapability(free_caps[i]);
+            if (sparkPoolSizeCap(cap) > 0) {
+                // If we have sparks to steal, wake up a worker on the
+                // capability, even if it has no threads to run.
+                releaseAndWakeupCapability(free_caps[i]);
+            } else {
+                releaseCapability(free_caps[i]);
+            }
         }
     }
     task->cap = cap; // reset to point to our Capability.
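The keep_threads formula is just a ceiling division, biasing the split
towards the pushing capability. Some worked values:

    /* keep_threads = (n_run_queue + n_free_caps) / (n_free_caps + 1)
     *              = ceil(n_run_queue / (n_free_caps + 1))
     *
     *   n_run_queue = 4, n_free_caps = 2  ->  (4+2)/3 = 2  keep 2, push 2
     *   n_run_queue = 5, n_free_caps = 2  ->  (5+2)/3 = 2  keep 2, push 3
     *   n_run_queue = 1, n_free_caps = 3  ->  (1+3)/4 = 1  the last thread
     *                                                      is never pushed
     */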
@@ -966,29 +974,6 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
 
 
 /* ----------------------------------------------------------------------------
- * Send pending messages (PARALLEL_HASKELL only)
- * ------------------------------------------------------------------------- */
-
-#if defined(PARALLEL_HASKELL)
-static void
-scheduleSendPendingMessages(void)
-{
-
-# if defined(PAR) // global Mem.Mgmt., omit for now
-    if (PendingFetches != END_BF_QUEUE) {
-        processFetches();
-    }
-# endif
-
-    if (RtsFlags.ParFlags.BufferTime) {
-        // if we use message buffering, we must send away all message
-        // packets which have become too old...
-        sendOldBuffers();
-    }
-}
-#endif
-
-/* ----------------------------------------------------------------------------
  * Process message in the current Capability's inbox
  * ------------------------------------------------------------------------- */
 
@@ -1035,7 +1020,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
 }
 
 /* ----------------------------------------------------------------------------
- * Activate spark threads (PARALLEL_HASKELL and THREADED_RTS)
+ * Activate spark threads (THREADED_RTS)
  * ------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
@@ -1048,7 +1033,7 @@ scheduleActivateSpark(Capability *cap)
         debugTrace(DEBUG_sched, "creating a spark thread");
     }
 }
-#endif // PARALLEL_HASKELL || THREADED_RTS
+#endif // THREADED_RTS
 
 /* ----------------------------------------------------------------------------
  * After running a thread...
@@ -1086,15 +1071,15 @@ schedulePostRunThread (Capability *cap, StgTSO *t)
     // If the current thread's allocation limit has run out, send it
     // the AllocationLimitExceeded exception.
 
-    if (t->alloc_limit < 0 && (t->flags & TSO_ALLOC_LIMIT)) {
+    if (PK_Int64((W_*)&(t->alloc_limit)) < 0 && (t->flags & TSO_ALLOC_LIMIT)) {
         // Use a throwToSelf rather than a throwToSingleThreaded, because
         // it correctly handles the case where the thread is currently
         // inside mask.  Also the thread might be blocked (e.g. on an
         // MVar), and throwToSingleThreaded doesn't unblock it
         // correctly in that case.
         throwToSelf(cap, t, allocationLimitExceeded_closure);
-        t->alloc_limit = (StgInt64)RtsFlags.GcFlags.allocLimitGrace
-            * BLOCK_SIZE;
+        ASSIGN_Int64((W_*)&(t->alloc_limit),
+                     (StgInt64)RtsFlags.GcFlags.allocLimitGrace * BLOCK_SIZE);
     }
 
   /* some statistics gathering in the parallel case */
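The PK_Int64/ASSIGN_Int64 accessors matter on 32-bit targets, where
t->alloc_limit is only word-aligned and a direct 64-bit load or store
could be misaligned (or torn). Roughly what the pair does there (a
sketch; the real macros come from the stg headers, are platform- and
endian-dependent, and may differ):

    static inline StgInt64 pk_int64_sketch (W_ *p)
    {
        // two aligned word loads instead of one 64-bit load
        return (StgInt64)((StgWord64)p[0] | ((StgWord64)p[1] << 32));
    }

    static inline void assign_int64_sketch (W_ *p, StgInt64 v)
    {
        // two aligned word stores instead of one 64-bit store
        p[0] = (W_)(StgWord64)v;
        p[1] = (W_)((StgWord64)v >> 32);
    }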
@@ -1107,6 +1092,17 @@ schedulePostRunThread (Capability *cap, StgTSO *t)
 static rtsBool
 scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
 {
+    if (cap->r.rHpLim == NULL || cap->context_switch) {
+        // Sometimes we miss a context switch, e.g. when calling
+        // primitives in a tight loop, MAYBE_GC() doesn't check the
+        // context switch flag, and we end up waiting for a GC.
+        // See #1984, and concurrent/should_run/1984
+        cap->context_switch = 0;
+        appendToRunQueue(cap,t);
+    } else {
+        pushOnRunQueue(cap,t);
+    }
+
     // did the task ask for a large block?
     if (cap->r.rHpAlloc > BLOCK_SIZE) {
         // if so, get one and push it on the front of the nursery.
@@ -1130,7 +1126,7 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
                                                // nursery has only one
                                                // block.
 
-            bd = allocGroup_lock(blocks);
+            bd = allocGroupOnNode_lock(cap->node,blocks);
             cap->r.rNursery->n_blocks += blocks;
 
             // link the new group after CurrentNursery
@@ -1164,21 +1160,23 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
             // run queue before us and steal the large block, but in that
             // case the thread will just end up requesting another large
             // block.
-            pushOnRunQueue(cap,t);
             return rtsFalse;  /* not actually GC'ing */
         }
     }
 
-    if (cap->r.rHpLim == NULL || cap->context_switch) {
-        // Sometimes we miss a context switch, e.g. when calling
-        // primitives in a tight loop, MAYBE_GC() doesn't check the
-        // context switch flag, and we end up waiting for a GC.
-        // See #1984, and concurrent/should_run/1984
-        cap->context_switch = 0;
-        appendToRunQueue(cap,t);
-    } else {
-        pushOnRunQueue(cap,t);
+    // if we got here because we exceeded large_alloc_lim, then
+    // proceed straight to GC.
+    if (g0->n_new_large_words >= large_alloc_lim) {
+        return rtsTrue;
     }
+
+    // Otherwise, we just ran out of space in the current nursery.
+    // Grab another nursery if we can.
+    if (getNewNursery(cap)) {
+        debugTrace(DEBUG_sched, "thread %ld got a new nursery", t->id);
+        return rtsFalse;
+    }
+
     return rtsTrue;
     /* actual GC is done at the end of the while loop in schedule() */
 }
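After this reshuffle the function's control flow reads as a small
decision tree (a summary of the hunks above, bookkeeping elided):

    /*  1. requeue t: at the back if a context switch is due,
     *     otherwise at the front;
     *  2. rHpAlloc > BLOCK_SIZE                 -> attach a large block, no GC;
     *  3. n_new_large_words >= large_alloc_lim  -> GC now;
     *  4. getNewNursery(cap) succeeds           -> keep running, no GC;
     *  5. otherwise                             -> GC.
     */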
@@ -1188,7 +1186,7 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
  * -------------------------------------------------------------------------- */
 
 static rtsBool
-scheduleHandleYield( Capability *cap, StgTSO *t, nat prev_what_next )
+scheduleHandleYield( Capability *cap, StgTSO *t, uint32_t prev_what_next )
 {
     /* put the thread back on the run queue.  Then, if we're ready to
      * GC, check whether this is the last task to stop.  If so, wake
@@ -1309,19 +1307,19 @@ scheduleHandleThreadFinished (Capability *cap STG_UNUSED, Task *task, StgTSO *t)
                   // NOTE: return val is stack->sp[1] (see StgStartup.hc)
                   *(task->incall->ret) = (StgClosure *)task->incall->tso->stackobj->sp[1];
               }
-              task->incall->stat = Success;
+              task->incall->rstat = Success;
           } else {
               if (task->incall->ret) {
                   *(task->incall->ret) = NULL;
               }
               if (sched_state >= SCHED_INTERRUPTING) {
                   if (heap_overflow) {
-                      task->incall->stat = HeapExhausted;
+                      task->incall->rstat = HeapExhausted;
                   } else {
-                      task->incall->stat = Interrupted;
+                      task->incall->rstat = Interrupted;
                   }
               } else {
-                  task->incall->stat = Killed;
+                  task->incall->rstat = Killed;
               }
           }
 #ifdef DEBUG
@@ -1363,53 +1361,107 @@ scheduleNeedHeapProfile( rtsBool ready_to_gc STG_UNUSED )
 }
 
 /* -----------------------------------------------------------------------------
- * Start a synchronisation of all capabilities
+ * stopAllCapabilities()
+ *
+ * Stop all Haskell execution.  This is used when we need to make some global
+ * change to the system, such as altering the number of capabilities, or
+ * forking.
+ *
+ * To resume after stopAllCapabilities(), use releaseAllCapabilities().
+ * -------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+static void stopAllCapabilities (Capability **pCap, Task *task)
+{
+    rtsBool was_syncing;
+    SyncType prev_sync_type;
+
+    PendingSync sync = {
+        .type = SYNC_OTHER,
+        .idle = NULL,
+        .task = task
+    };
+
+    do {
+        was_syncing = requestSync(pCap, task, &sync, &prev_sync_type);
+    } while (was_syncing);
+
+    acquireAllCapabilities(*pCap,task);
+
+    pending_sync = 0;
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * requestSync()
+ *
+ * Commence a synchronisation between all capabilities.  Normally not called
+ * directly, instead use stopAllCapabilities().  This is used by the GC, which
+ * has some special synchronisation requirements.
+ *
+ * Returns:
+ *    rtsFalse if we successfully got a sync
+ *    rtsTrue  if there was another sync request in progress,
+ *             and we yielded to it.  The type of the other sync
+ *             request is returned in *prev_sync_type.
  * -------------------------------------------------------------------------- */
 
-// Returns:
-//    0      if we successfully got a sync
-//    non-0  if there was another sync request in progress,
-//           and we yielded to it.  The value returned is the
-//           type of the other sync request.
-//
 #if defined(THREADED_RTS)
-static nat requestSync (Capability **pcap, Task *task, nat sync_type)
+static rtsBool requestSync (
+    Capability **pcap, Task *task, PendingSync *new_sync,
+    SyncType *prev_sync_type)
 {
-    nat prev_pending_sync;
+    PendingSync *sync;
 
-    prev_pending_sync = cas(&pending_sync, 0, sync_type);
+    sync = (PendingSync*)cas((StgVolatilePtr)&pending_sync,
+                             (StgWord)NULL,
+                             (StgWord)new_sync);
 
-    if (prev_pending_sync)
+    if (sync != NULL)
     {
+        // sync is valid until we have called yieldCapability().
+        // After the sync is completed, we cannot read that struct any
+        // more because it has been freed.
+        *prev_sync_type = sync->type;
         do {
             debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...",
-                       prev_pending_sync);
+                       sync->type);
             ASSERT(*pcap);
             yieldCapability(pcap,task,rtsTrue);
-        } while (pending_sync);
-        return prev_pending_sync; // NOTE: task->cap might have changed now
+            sync = pending_sync;
+        } while (sync != NULL);
+
+        // NOTE: task->cap might have changed now
+        return rtsTrue;
     }
     else
     {
-        return 0;
+        return rtsFalse;
     }
 }
+#endif
 
-//
-// Grab all the capabilities except the one we already hold.  Used
-// when synchronising before a single-threaded GC (SYNC_SEQ_GC), and
-// before a fork (SYNC_OTHER).
-//
-// Only call this after requestSync(), otherwise a deadlock might
-// ensue if another thread is trying to synchronise.
-//
+/* -----------------------------------------------------------------------------
+ * acquireAllCapabilities()
+ *
+ * Grab all the capabilities except the one we already hold.  Used
+ * when synchronising before a single-threaded GC (SYNC_SEQ_GC), and
+ * before a fork (SYNC_OTHER).
+ *
+ * Only call this after requestSync(), otherwise a deadlock might
+ * ensue if another thread is trying to synchronise.
+ * -------------------------------------------------------------------------- */
+
+#ifdef THREADED_RTS
 static void acquireAllCapabilities(Capability *cap, Task *task)
 {
     Capability *tmpcap;
-    nat i;
+    uint32_t i;
 
+    ASSERT(pending_sync != NULL);
     for (i=0; i < n_capabilities; i++) {
-        debugTrace(DEBUG_sched, "grabbing all the capabilies (%d/%d)", i, n_capabilities);
+        debugTrace(DEBUG_sched, "grabbing all the capabilies (%d/%d)",
+                   i, n_capabilities);
         tmpcap = capabilities[i];
         if (tmpcap != cap) {
             // we better hope this task doesn't get migrated to
@@ -1418,7 +1470,7 @@ static void acquireAllCapabilities(Capability *cap, Task *task)
             // all the Capabilities, but even so it's a slightly
             // unsavoury invariant.
             task->cap = tmpcap;
-            waitForReturnCapability(&tmpcap, task);
+            waitForCapability(&tmpcap, task);
             if (tmpcap->no != i) {
                 barf("acquireAllCapabilities: got the wrong capability");
             }
@@ -1426,10 +1478,19 @@ static void acquireAllCapabilities(Capability *cap, Task *task)
     }
     task->cap = cap;
 }
+#endif
+
+/* -----------------------------------------------------------------------------
+ * releaseAllCapabilities()
+ *
+ * Assuming this thread holds all the capabilities, release them all except for
+ * the one passed in as cap.
+ * -------------------------------------------------------------------------- */
 
-static void releaseAllCapabilities(nat n, Capability *cap, Task *task)
+#ifdef THREADED_RTS
+static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task)
 {
-    nat i;
+    uint32_t i;
 
     for (i = 0; i < n; i++) {
         if (cap->no != i) {
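Together these form a simple bracket for global operations; the
forkProcess() and setNumCapabilities() hunks below are rewritten to use
it. Sketched usage:

    stopAllCapabilities(&cap, task);    // halt all Haskell execution
    // ... perform the global change (fork, resize the capability
    //     array, ...) while no Haskell code can run ...
    releaseAllCapabilities(n_capabilities, cap, task);  // resume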
@@ -1451,11 +1512,15 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
 {
     Capability *cap = *pcap;
     rtsBool heap_census;
-    nat collect_gen;
+    uint32_t collect_gen;
+    rtsBool major_gc;
 #ifdef THREADED_RTS
-    nat gc_type;
-    nat i, sync;
+    uint32_t gc_type;
+    uint32_t i;
+    uint32_t need_idle;
+    uint32_t n_idle_caps = 0, n_failed_trygrab_idles = 0;
     StgTSO *tso;
+    rtsBool *idle_cap;
 #endif
 
     if (sched_state == SCHED_SHUTTING_DOWN) {
@@ -1470,6 +1535,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
     // Figure out which generation we are collecting, so that we can
     // decide whether this is a parallel GC or not.
     collect_gen = calcNeeded(force_major || heap_census, NULL);
+    major_gc = (collect_gen == RtsFlags.GcFlags.generations-1);
 
 #ifdef THREADED_RTS
     if (sched_state < SCHED_INTERRUPTING
@@ -1482,6 +1548,13 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
         gc_type = SYNC_GC_SEQ;
     }
 
+    if (gc_type == SYNC_GC_PAR && RtsFlags.ParFlags.parGcThreads > 0) {
+        need_idle = stg_max(0, enabled_capabilities -
+                            RtsFlags.ParFlags.parGcThreads);
+    } else {
+        need_idle = 0;
+    }
+
     // In order to GC, there must be no threads running Haskell code.
     // Therefore, the GC thread needs to hold *all* the capabilities,
     // and release them after the GC has completed.
@@ -1497,26 +1570,75 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
         threads if pending_sync is set. Tested inside
         yieldCapability() and releaseCapability() in Capability.c */
 
-    do {
-        sync = requestSync(pcap, task, gc_type);
-        cap = *pcap;
-        if (sync == SYNC_GC_SEQ || sync == SYNC_GC_PAR) {
-            // someone else had a pending sync request for a GC, so
-            // let's assume GC has been done and we don't need to GC
-            // again.
-            return;
-        }
-        if (sched_state == SCHED_SHUTTING_DOWN) {
-            // The scheduler might now be shutting down.  We tested
-            // this above, but it might have become true since then as
-            // we yielded the capability in requestSync().
-            return;
-        }
-    } while (sync);
+    PendingSync sync = {
+        .type = gc_type,
+        .idle = NULL,
+        .task = task
+    };
+
+    {
+        SyncType prev_sync = 0;
+        rtsBool was_syncing;
+        do {
+            // We need an array of size n_capabilities, but since this may
+            // change each time around the loop we must allocate it afresh.
+            idle_cap = (rtsBool *)stgMallocBytes(n_capabilities *
+                                                  sizeof(rtsBool),
+                                                  "scheduleDoGC");
+            sync.idle = idle_cap;
+
+            // When using +RTS -qn, we need some capabilities to be idle during
+            // GC.  The best bet is to choose some inactive ones, so we look for
+            // those first:
+            uint32_t n_idle = need_idle;
+            for (i=0; i < n_capabilities; i++) {
+                if (capabilities[i]->disabled) {
+                    idle_cap[i] = rtsTrue;
+                } else if (n_idle > 0 &&
+                           capabilities[i]->running_task == NULL) {
+                    debugTrace(DEBUG_sched, "asking for cap %d to be idle", i);
+                    n_idle--;
+                    idle_cap[i] = rtsTrue;
+                } else {
+                    idle_cap[i] = rtsFalse;
+                }
+            }
+            // If we didn't find enough inactive capabilities, just pick some
+            // more to be idle.
+            for (i=0; n_idle > 0 && i < n_capabilities; i++) {
+                if (!idle_cap[i] && i != cap->no) {
+                    idle_cap[i] = rtsTrue;
+                    n_idle--;
+                }
+            }
+            ASSERT(n_idle == 0);
+
+            was_syncing = requestSync(pcap, task, &sync, &prev_sync);
+            cap = *pcap;
+            if (was_syncing) {
+                stgFree(idle_cap);
+            }
+            if (was_syncing &&
+                (prev_sync == SYNC_GC_SEQ || prev_sync == SYNC_GC_PAR) &&
+                !(sched_state == SCHED_INTERRUPTING && force_major)) {
+                // someone else had a pending sync request for a GC, so
+                // let's assume GC has been done and we don't need to GC
+                // again.
+                // Exception to this: if SCHED_INTERRUPTING, then we still
+                // need to do the final GC.
+                return;
+            }
+            if (sched_state == SCHED_SHUTTING_DOWN) {
+                // The scheduler might now be shutting down.  We tested
+                // this above, but it might have become true since then as
+                // we yielded the capability in requestSync().
+                return;
+            }
+        } while (was_syncing);
+    }
+
+    stat_startGCSync(gc_threads[cap->no]);
 
-    // don't declare this until after we have sync'd, because
-    // n_capabilities may change.
-    rtsBool idle_cap[n_capabilities];
 #ifdef DEBUG
     unsigned int old_n_capabilities = n_capabilities;
 #endif
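The need_idle computation implements the +RTS -qn flag (limit the
number of threads used for parallel GC). A worked example, assuming
+RTS -N8 -qn4 with all eight capabilities enabled:

    /* enabled_capabilities = 8, parGcThreads = 4
     * need_idle = max(0, 8 - 4) = 4
     *
     * idle_cap[] is then seeded with four rtsTrue entries, preferring
     * capabilities that are disabled or have no running task, and the
     * parallel GC runs on the remaining four. */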
@@ -1526,18 +1648,13 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
     // The final shutdown GC is always single-threaded, because it's
     // possible that some of the Capabilities have no worker threads.
 
-    if (gc_type == SYNC_GC_SEQ)
-    {
+    if (gc_type == SYNC_GC_SEQ) {
         traceEventRequestSeqGc(cap);
-    }
-    else
-    {
+    } else {
         traceEventRequestParGc(cap);
-        debugTrace(DEBUG_sched, "ready_to_gc, grabbing GC threads");
     }
 
-    if (gc_type == SYNC_GC_SEQ)
-    {
+    if (gc_type == SYNC_GC_SEQ) {
         // single-threaded GC: grab all the capabilities
         acquireAllCapabilities(cap,task);
     }
@@ -1553,31 +1670,45 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
 
         if (RtsFlags.ParFlags.parGcNoSyncWithIdle == 0
             || (RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
-                collect_gen >= RtsFlags.ParFlags.parGcLoadBalancingGen)) {
+                collect_gen >= RtsFlags.ParFlags.parGcLoadBalancingGen))
+        {
             for (i=0; i < n_capabilities; i++) {
                 if (capabilities[i]->disabled) {
                     idle_cap[i] = tryGrabCapability(capabilities[i], task);
+                    if (idle_cap[i]) {
+                        n_idle_caps++;
+                    }
                 } else {
-                    idle_cap[i] = rtsFalse;
+                    if (i != cap->no && idle_cap[i]) {
+                        Capability *tmpcap = capabilities[i];
+                        task->cap = tmpcap;
+                        waitForCapability(&tmpcap, task);
+                        n_idle_caps++;
+                    }
                 }
             }
-        } else {
+        }
+        else
+        {
             for (i=0; i < n_capabilities; i++) {
                 if (capabilities[i]->disabled) {
                     idle_cap[i] = tryGrabCapability(capabilities[i], task);
-                } else if (i == cap->no ||
-                           capabilities[i]->idle < RtsFlags.ParFlags.parGcNoSyncWithIdle) {
-                    idle_cap[i] = rtsFalse;
-                } else {
+                    if (idle_cap[i]) {
+                        n_idle_caps++;
+                    }
+                } else if (i != cap->no &&
+                           capabilities[i]->idle >=
+                           RtsFlags.ParFlags.parGcNoSyncWithIdle) {
                     idle_cap[i] = tryGrabCapability(capabilities[i], task);
-                    if (!idle_cap[i]) {
-                        n_failed_trygrab_idles++;
-                    } else {
+                    if (idle_cap[i]) {
                         n_idle_caps++;
+                    } else {
+                        n_failed_trygrab_idles++;
                     }
                 }
             }
         }
+        debugTrace(DEBUG_sched, "%d idle caps", n_idle_caps);
 
         // We set the gc_thread[i]->idle flag if that
         // capability/thread is not participating in this collection.
@@ -1612,8 +1743,9 @@ delete_threads_and_gc:
      * We now have all the capabilities; if we're in an interrupting
      * state, then we should take the opportunity to delete all the
      * threads in the system.
+     * Checking for major_gc ensures that the last GC is major.
      */
-    if (sched_state == SCHED_INTERRUPTING) {
+    if (sched_state == SCHED_INTERRUPTING && major_gc) {
         deleteAllThreads(cap);
 #if defined(THREADED_RTS)
         // Discard all the sparks from every Capability.  Why?
@@ -1756,6 +1888,8 @@ delete_threads_and_gc:
 #endif
 
 #if defined(THREADED_RTS)
+    stgFree(idle_cap);
+
     if (gc_type == SYNC_GC_SEQ) {
         // release our stash of capabilities.
         releaseAllCapabilities(n_capabilities, cap, task);
@@ -1780,28 +1914,19 @@ forkProcess(HsStablePtr *entry
     pid_t pid;
     StgTSO* t,*next;
     Capability *cap;
-    nat g;
+    uint32_t g;
     Task *task = NULL;
-    nat i;
-#ifdef THREADED_RTS
-    nat sync;
-#endif
+    uint32_t i;
 
     debugTrace(DEBUG_sched, "forking!");
 
     task = newBoundTask();
 
     cap = NULL;
-    waitForReturnCapability(&cap, task);
+    waitForCapability(&cap, task);
 
 #ifdef THREADED_RTS
-    do {
-        sync = requestSync(&cap, task, SYNC_OTHER);
-    } while (sync);
-
-    acquireAllCapabilities(cap,task);
-
-    pending_sync = 0;
+    stopAllCapabilities(&cap, task);
 #endif
 
     // no funny business: hold locks while we fork, otherwise if some
@@ -1905,10 +2030,12 @@ forkProcess(HsStablePtr *entry
             // bound threads for which the corresponding Task does not
             // exist.
             truncateRunQueue(cap);
+            cap->n_run_queue = 0;
 
             // Any suspended C-calling Tasks are no more, their OS threads
             // don't exist now:
             cap->suspended_ccalls = NULL;
+            cap->n_suspended_ccalls = 0;
 
 #if defined(THREADED_RTS)
             // Wipe our spare workers list, they no longer exist.  New
@@ -1917,6 +2044,7 @@ forkProcess(HsStablePtr *entry
             cap->n_spare_workers = 0;
             cap->returning_tasks_hd = NULL;
             cap->returning_tasks_tl = NULL;
+            cap->n_returning_tasks = 0;
 #endif
 
             // Release all caps except 0, we'll use that for starting
@@ -1974,7 +2102,7 @@ forkProcess(HsStablePtr *entry
  * ------------------------------------------------------------------------- */
 
 void
-setNumCapabilities (nat new_n_capabilities USED_IF_THREADS)
+setNumCapabilities (uint32_t new_n_capabilities USED_IF_THREADS)
 {
 #if !defined(THREADED_RTS)
     if (new_n_capabilities != 1) {
@@ -1989,10 +2117,9 @@ setNumCapabilities (nat new_n_capabilities USED_IF_THREADS)
 #else
     Task *task;
     Capability *cap;
-    nat sync;
-    nat n;
+    uint32_t n;
     Capability *old_capabilities = NULL;
-    nat old_n_capabilities = n_capabilities;
+    uint32_t old_n_capabilities = n_capabilities;
 
     if (new_n_capabilities == enabled_capabilities) return;
 
@@ -2002,13 +2129,7 @@ setNumCapabilities (nat new_n_capabilities USED_IF_THREADS)
     cap = rts_lock();
     task = cap->running_task;
 
-    do {
-        sync = requestSync(&cap, task, SYNC_OTHER);
-    } while (sync);
-
-    acquireAllCapabilities(cap,task);
-
-    pending_sync = 0;
+    stopAllCapabilities(&cap, task);
 
     if (new_n_capabilities < enabled_capabilities)
     {
@@ -2078,9 +2199,6 @@ setNumCapabilities (nat new_n_capabilities USED_IF_THREADS)
         n_capabilities = enabled_capabilities = new_n_capabilities;
     }
 
-    // Start worker tasks on the new Capabilities
-    startWorkerTasks(old_n_capabilities, new_n_capabilities);
-
     // We're done: release the original Capabilities
     releaseAllCapabilities(old_n_capabilities, cap,task);
 
@@ -2110,7 +2228,7 @@ deleteAllThreads ( Capability *cap )
     // NOTE: only safe to call if we own all capabilities.
 
     StgTSO* t, *next;
-    nat g;
+    uint32_t g;
 
     debugTrace(DEBUG_sched,"deleting all threads");
     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
@@ -2150,6 +2268,7 @@ suspendTask (Capability *cap, Task *task)
         cap->suspended_ccalls->prev = incall;
     }
     cap->suspended_ccalls = incall;
+    cap->n_suspended_ccalls++;
 }
 
 STATIC_INLINE void
@@ -2168,6 +2287,7 @@ recoverSuspendedTask (Capability *cap, Task *task)
         incall->next->prev = incall->prev;
     }
     incall->next = incall->prev = NULL;
+    cap->n_suspended_ccalls--;
 }
 
 /* ---------------------------------------------------------------------------
@@ -2269,7 +2389,7 @@ resumeThread (void *task_)
     task->cap = cap;
 
     // Wait for permission to re-enter the RTS with the result.
-    waitForReturnCapability(&cap,task);
+    waitForCapability(&cap,task);
     // we might be on a different capability now... but if so, our
     // entry on the suspended_ccalls list will also have been
     // migrated.
@@ -2364,7 +2484,7 @@ scheduleWaitThread (StgTSO* tso, /*[out]*/HaskellObj* ret, Capability **pcap)
 
     task->incall->tso = tso;
     task->incall->ret = ret;
-    task->incall->stat = NoStatus;
+    task->incall->rstat = NoStatus;
 
     appendToRunQueue(cap,tso);
 
@@ -2373,7 +2493,7 @@ scheduleWaitThread (StgTSO* tso, /*[out]*/HaskellObj* ret, Capability **pcap)
 
     cap = schedule(cap,task);
 
-    ASSERT(task->incall->stat != NoStatus);
+    ASSERT(task->incall->rstat != NoStatus);
     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 
     debugTrace(DEBUG_sched, "bound thread (%lu) finished", (unsigned long)id);
@@ -2399,7 +2519,7 @@ void scheduleWorker (Capability *cap, Task *task)
     // cap->lock until we've finished workerTaskStop() below.
     //
     // There may be workers still involved in foreign calls; those
-    // will just block in waitForReturnCapability() because the
+    // will just block in waitForCapability() because the
     // Capability has been shut down.
     //
     ACQUIRE_LOCK(&cap->lock);
@@ -2414,10 +2534,10 @@ void scheduleWorker (Capability *cap, Task *task)
  * -------------------------------------------------------------------------- */
 
 static void
-startWorkerTasks (nat from USED_IF_THREADS, nat to USED_IF_THREADS)
+startWorkerTasks (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
 {
 #if defined(THREADED_RTS)
-    nat i;
+    uint32_t i;
     Capability *cap;
 
     for (i = from; i < to; i++) {
@@ -2490,7 +2610,7 @@ exitScheduler (rtsBool wait_foreign USED_IF_THREADS)
     if (sched_state < SCHED_SHUTTING_DOWN) {
         sched_state = SCHED_INTERRUPTING;
         Capability *cap = task->cap;
-        waitForReturnCapability(&cap,task);
+        waitForCapability(&cap,task);
         scheduleDoGC(&cap,task,rtsTrue);
         ASSERT(task->incall->tso == NULL);
         releaseCapability(cap);
@@ -2508,13 +2628,13 @@ exitScheduler (rtsBool wait_foreign USED_IF_THREADS)
 void
 freeScheduler( void )
 {
-    nat still_running;
+    uint32_t still_running;
 
     ACQUIRE_LOCK(&sched_mutex);
     still_running = freeTaskManager();
     // We can only free the Capabilities if there are no Tasks still
     // running.  We might have a Task about to return from a foreign
-    // call into waitForReturnCapability(), for example (actually,
+    // call into waitForCapability(), for example (actually,
     // this should be the *only* thing that a still-running Task can
     // do at this point, and it will block waiting for the
     // Capability).
@@ -2558,7 +2678,7 @@ performGC_(rtsBool force_major)
 
     // TODO: do we need to traceTask*() here?
 
-    waitForReturnCapability(&cap,task);
+    waitForCapability(&cap,task);
     scheduleDoGC(&cap,task,force_major);
     releaseCapability(cap);
     boundTaskExiting(task);
@@ -2577,8 +2697,8 @@ performMajorGC(void)
 }
 
 /* ---------------------------------------------------------------------------
-   Interrupt execution
-   - usually called inside a signal handler so it mustn't do anything fancy.
+   Interrupt execution.
+   Might be called inside a signal handler so it mustn't do anything fancy.
    ------------------------------------------------------------------------ */
 
 void
@@ -2666,7 +2786,7 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
     Capability *cap = regTableToCapability(reg);
     StgThunk *raise_closure = NULL;
     StgPtr p, next;
-    StgRetInfoTable *info;
+    const StgRetInfoTable *info;
     //
     // This closure represents the expression 'raise# E' where E
    // is the exception to raise.  It is used to overwrite all the
@@ -2775,12 +2895,12 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
 StgWord
 findRetryFrameHelper (Capability *cap, StgTSO *tso)
 {
-  StgPtr           p, next;
-  StgRetInfoTable *info;
+  const StgRetInfoTable *info;
+  StgPtr    p, next;
 
   p = tso->stackobj->sp;
   while (1) {
-    info = get_ret_itbl((StgClosure *)p);
+    info = get_ret_itbl((const StgClosure *)p);
     next = p + stack_frame_sizeW((StgClosure *)p);
     switch (info->i.type) {