Turn on -n4m with -A16m or greater
author Simon Marlow <marlowsd@gmail.com>
Sun, 9 Oct 2016 22:21:35 +0000 (18:21 -0400)
committer Ben Gamari <ben@smart-cactus.org>
Sun, 9 Oct 2016 22:21:36 +0000 (18:21 -0400)
Nursery chunks help reduce the cost of GC when capabilities are unevenly
loaded, by ensuring that we use more of the available nursery.

The rationale for enabling this at -A16m is that any negative effects
due to loss of cache locality are less likely to be an issue at -A16m
and above.  It's a conservative guess.  If we had a lot of benchmark
data we could probably do better.

Results for nofib/parallel at -N4 -A32m with and without -n4m:

```
------------------------------------------------------------------------
        Program           Size    Allocs   Runtime   Elapsed  TotalMem
------------------------------------------------------------------------
   blackscholes           0.0%     -9.5%     -9.0%    -15.0%     -2.2%
          coins           0.0%     -4.7%     -3.6%     -0.6%    -13.6%
         mandel           0.0%     -0.3%     +7.7%    +13.1%     +0.1%
        matmult           0.0%     +1.5%    +10.0%     +7.7%     +0.1%
          nbody           0.0%     -4.1%     -2.9%     0.085      0.0%
         parfib           0.0%     -1.4%     +1.0%     +1.5%     +0.2%
        partree           0.0%     -0.3%     +0.8%     +2.9%     -0.8%
           prsa           0.0%     -0.5%     -2.1%     -7.6%      0.0%
         queens           0.0%     -3.2%     -1.4%     +2.2%     +1.3%
            ray           0.0%     -5.6%    -14.5%     -7.6%     +0.8%
       sumeuler           0.0%     -0.4%     +2.4%     +1.1%      0.0%
------------------------------------------------------------------------
            Min           0.0%     -9.5%    -14.5%    -15.0%    -13.6%
            Max           0.0%     +1.5%    +10.0%    +13.1%     +1.3%
 Geometric Mean          +0.0%     -2.6%     -1.3%     -0.5%     -1.4%
```

Not conclusive, but slightly better.  This matters a lot more when you
have more cores.

Test Plan: validate, nofib/parallel

Reviewers: niteria, ezyang, nh2, trofi, austin, erikd, bgamari

Reviewed By: bgamari

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2581

GHC Trac Issues: #9221

docs/users_guide/runtime_control.rst
rts/RtsFlags.c
rts/sm/Storage.c

index 0ffb1d8..54c7508 100644 (file)
@@ -327,7 +327,7 @@ performance.
 
 .. rts-flag:: -n ⟨size⟩
 
-    :default: 0
+    :default: 4m with ``-A16m`` or larger, otherwise 0.
 
     .. index::
        single: allocation area, chunk size
index 4bd544e..d86b154 100644 (file)
@@ -1454,6 +1454,24 @@ static void normaliseRtsOpts (void)
         errorUsage();
     }
 
+    if (RtsFlags.GcFlags.maxHeapSize != 0 &&
+        RtsFlags.GcFlags.heapSizeSuggestion >
+        RtsFlags.GcFlags.maxHeapSize) {
+        RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
+    }
+
+    if (RtsFlags.GcFlags.maxHeapSize != 0 &&
+        RtsFlags.GcFlags.minAllocAreaSize >
+        RtsFlags.GcFlags.maxHeapSize) {
+        errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
+        RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize;
+    }
+
+    // If we have -A16m or larger, use -n4m.
+    if (RtsFlags.GcFlags.minAllocAreaSize >= (16*1024*1024) / BLOCK_SIZE) {
+        RtsFlags.GcFlags.nurseryChunkSize = (4*1024*1024) / BLOCK_SIZE;
+    }
+
     if (RtsFlags.ParFlags.parGcLoadBalancingGen == ~0u) {
         StgWord alloc_area_bytes
             = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE;
index 4d0c8d5..357e018 100644 (file)
@@ -140,19 +140,6 @@ initStorage (void)
   ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
   ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
 
-  if (RtsFlags.GcFlags.maxHeapSize != 0 &&
-      RtsFlags.GcFlags.heapSizeSuggestion >
-      RtsFlags.GcFlags.maxHeapSize) {
-      RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
-  }
-
-  if (RtsFlags.GcFlags.maxHeapSize != 0 &&
-      RtsFlags.GcFlags.minAllocAreaSize >
-      RtsFlags.GcFlags.maxHeapSize) {
-      errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
-      RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize;
-  }
-
   initBlockAllocator();
 
 #if defined(THREADED_RTS)