Make clearNursery free
authorSimon Marlow <marlowsd@gmail.com>
Tue, 7 Oct 2014 09:30:36 +0000 (10:30 +0100)
committerSimon Marlow <marlowsd@gmail.com>
Tue, 25 Nov 2014 14:37:26 +0000 (14:37 +0000)
Summary:
clearNursery resets all the bd->free pointers of nursery blocks to
make the blocks empty.  In profiles we've seen clearNursery taking
significant amounts of time particularly with large -N and -A values.

This patch moves the work of clearNursery to the point at which we
actually need the new block, thereby introducing an invariant that
blocks to the right of the CurrentNursery pointer still need their
bd->free pointer reset.  This should make things faster overall,
because we don't need to clear blocks that we don't use.

Test Plan: validate

Reviewers: AndreasVoellmy, ezyang, austin

Subscribers: thomie, carter, ezyang, simonmar

Differential Revision: https://phabricator.haskell.org/D318

includes/rts/storage/GC.h
rts/Capability.h
rts/HeapStackCheck.cmm
rts/Schedule.c
rts/Stats.c
rts/sm/Storage.c
rts/sm/Storage.h
testsuite/tests/codeGen/should_run/all.T
testsuite/tests/codeGen/should_run/cgrun074.hs [new file with mode: 0644]
testsuite/tests/codeGen/should_run/cgrun074.stdout [new file with mode: 0644]
utils/deriveConstants/DeriveConstants.hs

index c171b67..db71e95 100644 (file)
@@ -66,6 +66,27 @@ typedef struct nursery_ {
     memcount       n_blocks;
 } nursery;
 
+// Nursery invariants:
+//
+//  - cap->r.rNursery points to the nursery for this capability
+//
+//  - cap->r.rCurrentNursery points to the block in the nursery that we are
+//    currently allocating into.  While in Haskell the current heap pointer is
+//    in Hp, outside Haskell it is stored in cap->r.rCurrentNursery->free.
+//
+//  - the blocks *after* cap->rCurrentNursery in the chain are empty
+//    (although their bd->free pointers have not been updated to
+//    reflect that)
+//
+//  - the blocks *before* cap->rCurrentNursery have been used.  Except
+//    for rCurrentAlloc.
+//
+//  - cap->r.rCurrentAlloc is either NULL, or it points to a block in
+//    the nursery *before* cap->r.rCurrentNursery.
+//
+// See also Note [allocation accounting] to understand how total
+// memory allocation is tracked.
+
 typedef struct generation_ {
     nat            no;                  // generation number
 
index 910c92c..420bfd5 100644 (file)
@@ -98,6 +98,10 @@ struct Capability_ {
     // reset after we have executed the context switch.
     int interrupt;
 
+    // Total words allocated by this cap since rts start
+    // See [Note allocation accounting] in Storage.c
+    W_ total_allocated;
+
 #if defined(THREADED_RTS)
     // Worker Tasks waiting in the wings.  Singly-linked.
     Task *spare_workers;
@@ -131,8 +135,6 @@ struct Capability_ {
     int io_manager_control_wr_fd;
 #endif
 #endif
-    // Total words allocated by this cap since rts start
-    W_ total_allocated;
 
     // Per-capability STM-related data
     StgTVarWatchQueue *free_tvar_watch_queues;
index a1fb5d4..a1e18ca 100644 (file)
@@ -97,7 +97,12 @@ stg_gc_noregs
             && bdescr_link(CurrentNursery) != NULL) {
             HpAlloc = 0;
             CLOSE_NURSERY();
+            Capability_total_allocated(MyCapability()) =
+              Capability_total_allocated(MyCapability()) +
+              BYTES_TO_WDS(bdescr_free(CurrentNursery) -
+                           bdescr_start(CurrentNursery));
             CurrentNursery = bdescr_link(CurrentNursery);
+            bdescr_free(CurrentNursery) = bdescr_start(CurrentNursery);
             OPEN_NURSERY();
             if (Capability_context_switch(MyCapability()) != 0 :: CInt ||
                 Capability_interrupt(MyCapability())      != 0 :: CInt ||
index 6a06f79..447b70e 100644 (file)
@@ -1125,21 +1125,16 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
 
         // don't do this if the nursery is (nearly) full, we'll GC first.
         if (cap->r.rCurrentNursery->link != NULL ||
-            cap->r.rNursery->n_blocks == 1) {  // paranoia to prevent infinite loop
-                                               // if the nursery has only one block.
+            cap->r.rNursery->n_blocks == 1) {  // paranoia to prevent
+                                               // infinite loop if the
+                                               // nursery has only one
+                                               // block.
 
             bd = allocGroup_lock(blocks);
             cap->r.rNursery->n_blocks += blocks;
 
-            // link the new group into the list
-            bd->link = cap->r.rCurrentNursery;
-            bd->u.back = cap->r.rCurrentNursery->u.back;
-            if (cap->r.rCurrentNursery->u.back != NULL) {
-                cap->r.rCurrentNursery->u.back->link = bd;
-            } else {
-                cap->r.rNursery->blocks = bd;
-            }
-            cap->r.rCurrentNursery->u.back = bd;
+            // link the new group after CurrentNursery
+            dbl_link_insert_after(bd, cap->r.rCurrentNursery);
 
             // initialise it as a nursery block.  We initialise the
             // step, gen_no, and flags field of *every* sub-block in
@@ -1162,6 +1157,7 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
             IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
 
             // now update the nursery to point to the new block
+            finishedNurseryBlock(cap, cap->r.rCurrentNursery);
             cap->r.rCurrentNursery = bd;
 
             // we might be unlucky and have another thread get on the
index ed345c2..d5efaa2 100644 (file)
@@ -291,27 +291,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
     {
         gct->gc_start_faults = getPageFaults();
     }
-}
-
-/* -----------------------------------------------------------------------------
- * Calculate the total allocated memory since the start of the
- * program.  Also emits events reporting the per-cap allocation
- * totals.
- * -------------------------------------------------------------------------- */
-
-static StgWord
-calcTotalAllocated(void)
-{
-    W_ tot_alloc = 0;
-    W_ n;
-    for (n = 0; n < n_capabilities; n++) {
-        tot_alloc += capabilities[n]->total_allocated;
-        traceEventHeapAllocated(capabilities[n],
-                                CAPSET_HEAP_DEFAULT,
-                                capabilities[n]->total_allocated * sizeof(W_));
-    }
 
-    return tot_alloc;
+    updateNurseriesStats();
 }
 
 /* -----------------------------------------------------------------------------
index 7174425..e4a6984 100644 (file)
@@ -532,6 +532,7 @@ assignNurseriesToCapabilities (nat from, nat to)
 
     for (i = from; i < to; i++) {
         capabilities[i]->r.rCurrentNursery = nurseries[i].blocks;
+        newNurseryBlock(nurseries[i].blocks);
         capabilities[i]->r.rCurrentAlloc   = NULL;
     }
 }
@@ -551,17 +552,16 @@ allocNurseries (nat from, nat to)
 }
       
 void
-clearNursery (Capability *cap)
+clearNursery (Capability *cap USED_IF_DEBUG)
 {
+#ifdef DEBUG
     bdescr *bd;
-
     for (bd = nurseries[cap->no].blocks; bd; bd = bd->link) {
-        cap->total_allocated += (W_)(bd->free - bd->start);
-        bd->free = bd->start;
         ASSERT(bd->gen_no == 0);
         ASSERT(bd->gen == g0);
-        IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
+        IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE));
     }
+#endif
 }
 
 void
@@ -734,14 +734,16 @@ StgPtr allocate (Capability *cap, W_ n)
     bd = cap->r.rCurrentAlloc;
     if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
         
+        if (bd) finishedNurseryBlock(cap,bd);
+
         // The CurrentAlloc block is full, we need to find another
         // one.  First, we try taking the next block from the
         // nursery:
         bd = cap->r.rCurrentNursery->link;
         
-        if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
-            // The nursery is empty, or the next block is already
-            // full: allocate a fresh block (we can't fail here).
+        if (bd == NULL) {
+            // The nursery is empty: allocate a fresh block (we can't
+            // fail here).
             ACQUIRE_SM_LOCK;
             bd = allocBlock();
             cap->r.rNursery->n_blocks++;
@@ -752,6 +754,7 @@ StgPtr allocate (Capability *cap, W_ n)
             // pretty quickly now, because MAYBE_GC() will
             // notice that CurrentNursery->link is NULL.
         } else {
+            newNurseryBlock(bd);
             // we have a block in the nursery: take it and put
             // it at the *front* of the nursery list, and use it
             // to allocate() from.
@@ -846,9 +849,9 @@ allocatePinned (Capability *cap, W_ n)
         // next GC cycle these objects will be moved to
         // g0->large_objects.
         if (bd != NULL) {
-            dbl_link_onto(bd, &cap->pinned_object_blocks);
             // add it to the allocation stats when the block is full
-            cap->total_allocated += bd->free - bd->start;
+            finishedNurseryBlock(cap, bd);
+            dbl_link_onto(bd, &cap->pinned_object_blocks);
         }
 
         // We need to find another block.  We could just allocate one,
@@ -861,7 +864,7 @@ allocatePinned (Capability *cap, W_ n)
         // an *empty* block, because we're about to mark it as
         // BF_PINNED | BF_LARGE.
         bd = cap->r.rCurrentNursery->link;
-        if (bd == NULL || bd->free != bd->start) { // must be empty!
+        if (bd == NULL) { // must be empty!
             // The nursery is empty, or the next block is non-empty:
             // allocate a fresh block (we can't fail here).
 
@@ -878,6 +881,7 @@ allocatePinned (Capability *cap, W_ n)
             RELEASE_SM_LOCK;
             initBdescr(bd, g0, g0);
         } else {
+            newNurseryBlock(bd);
             // we have a block in the nursery: steal it
             cap->r.rCurrentNursery->link = bd->link;
             if (bd->link != NULL) {
@@ -1001,21 +1005,57 @@ dirty_MVAR(StgRegTable *reg, StgClosure *p)
  * -------------------------------------------------------------------------- */
 
 /* -----------------------------------------------------------------------------
- * updateNurseriesStats()
+ * [Note allocation accounting]
  *
- * Update the per-cap total_allocated numbers with an approximation of
- * the amount of memory used in each cap's nursery.
+ *   - When cap->r.rCurrentNusery moves to a new block in the nursery,
+ *     we add the size of the used portion of the previous block to
+ *     cap->total_allocated. (see finishedNurseryBlock())
+ *
+ *   - When we start a GC, the allocated portion of CurrentNursery and
+ *     CurrentAlloc are added to cap->total_allocated. (see
+ *     updateNurseriesStats())
  *
- * Since this update is also performed by clearNurseries() then we only
- * need this function for the final stats when the RTS is shutting down.
  * -------------------------------------------------------------------------- */
 
-void updateNurseriesStats (void)
+//
+// Calculate the total allocated memory since the start of the
+// program.  Also emits events reporting the per-cap allocation
+// totals.
+//
+StgWord
+calcTotalAllocated (void)
+{
+    W_ tot_alloc = 0;
+    W_ n;
+
+    for (n = 0; n < n_capabilities; n++) {
+        tot_alloc += capabilities[n]->total_allocated;
+
+        traceEventHeapAllocated(capabilities[n],
+                                CAPSET_HEAP_DEFAULT,
+                                capabilities[n]->total_allocated * sizeof(W_));
+    }
+
+    return tot_alloc;
+}
+
+//
+// Update the per-cap total_allocated numbers with an approximation of
+// the amount of memory used in each cap's nursery.
+//
+void
+updateNurseriesStats (void)
 {
     nat i;
+    bdescr *bd;
 
     for (i = 0; i < n_capabilities; i++) {
-        capabilities[i]->total_allocated += countOccupied(nurseries[i].blocks);
+        // The current nursery block and the current allocate block have not
+        // yet been accounted for in cap->total_allocated, so we add them here.
+        bd = capabilities[i]->r.rCurrentNursery;
+        if (bd) finishedNurseryBlock(capabilities[i], bd);
+        bd = capabilities[i]->r.rCurrentAlloc;
+        if (bd) finishedNurseryBlock(capabilities[i], bd);
     }
 }
 
index 0016876..943c3e3 100644 (file)
@@ -88,10 +88,30 @@ void     resizeNurseriesFixed ( W_ blocks );
 W_       countNurseryBlocks   ( void );
 
 /* -----------------------------------------------------------------------------
+   Allocation accounting
+
+   See [Note allocation accounting] in Storage.c
+   -------------------------------------------------------------------------- */
+
+//
+// Called when we are finished allocating into a block; account for the amount
+// allocated in cap->total_allocated.
+//
+INLINE_HEADER void finishedNurseryBlock (Capability *cap, bdescr *bd) {
+    cap->total_allocated += bd->free - bd->start;
+}
+
+INLINE_HEADER void newNurseryBlock (bdescr *bd) {
+    bd->free = bd->start;
+}
+
+void    updateNurseriesStats (void);
+StgWord calcTotalAllocated   (void);
+
+/* -----------------------------------------------------------------------------
    Stats 'n' DEBUG stuff
    -------------------------------------------------------------------------- */
 
-void  updateNurseriesStats (void);
 W_    countLargeAllocated  (void);
 W_    countOccupied        (bdescr *bd);
 W_    calcNeeded           (rtsBool force_major, W_ *blocks_needed);
index 03106d4..f157287 100644 (file)
@@ -123,3 +123,4 @@ test('T9001', normal, compile_and_run, [''])
 test('T9013', omit_ways(['ghci']),  # ghci doesn't support unboxed tuples
      compile_and_run, [''])
 test('T9340', normal, compile_and_run, [''])
+test('cgrun074', normal, compile_and_run, [''])
diff --git a/testsuite/tests/codeGen/should_run/cgrun074.hs b/testsuite/tests/codeGen/should_run/cgrun074.hs
new file mode 100644 (file)
index 0000000..31328a0
--- /dev/null
@@ -0,0 +1,24 @@
+-- This test exercises the "large block" allocation code in the
+-- scheduler, where the mutator requests more than a block's worth of
+-- memory.
+
+longlistof x = [
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
+  x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x
+  ]
+
+main = print (sum (concat (map longlistof [1..100])))
diff --git a/testsuite/tests/codeGen/should_run/cgrun074.stdout b/testsuite/tests/codeGen/should_run/cgrun074.stdout
new file mode 100644 (file)
index 0000000..72e9f31
--- /dev/null
@@ -0,0 +1 @@
+2828000
index 486f497..c793e84 100644 (file)
@@ -349,6 +349,7 @@ wanteds = concat
           ,structField C    "Capability" "context_switch"
           ,structField C    "Capability" "interrupt"
           ,structField C    "Capability" "sparks"
+          ,structField C    "Capability" "total_allocated"
           ,structField C    "Capability" "weak_ptr_list_hd"
           ,structField C    "Capability" "weak_ptr_list_tl"