rts, base: Refactor stats.c to improve --machine-readable report
authorDouglas Wilson <douglas.wilson@gmail.com>
Mon, 19 Mar 2018 17:26:41 +0000 (13:26 -0400)
committerBen Gamari <ben@smart-cactus.org>
Mon, 19 Mar 2018 17:26:42 +0000 (13:26 -0400)
There should be no change in the output of the '+RTS -s' (summary)
report, or the 'RTS -t' (one-line) report.

All data shown in the summary report is now shown in the machine
readable report.

All data in RTSStats is now shown in the machine readable report.

init times are added to RTSStats and added to GHC.Stats.

Example of the new output:
```
 [("bytes allocated", "375016384")
 ,("num_GCs", "113")
 ,("average_bytes_used", "148348")
 ,("max_bytes_used", "206552")
 ,("num_byte_usage_samples", "2")
 ,("peak_megabytes_allocated", "6")
 ,("init_cpu_seconds", "0.001642")
 ,("init_wall_seconds", "0.001027")
 ,("mut_cpu_seconds", "3.020166")
 ,("mut_wall_seconds", "0.757244")
 ,("GC_cpu_seconds", "0.037750")
 ,("GC_wall_seconds", "0.009569")
 ,("exit_cpu_seconds", "0.000890")
 ,("exit_wall_seconds", "0.002551")
 ,("total_cpu_seconds", "3.060452")
 ,("total_wall_seconds", "0.770395")
 ,("major_gcs", "2")
 ,("allocated_bytes", "375016384")
 ,("max_live_bytes", "206552")
 ,("max_large_objects_bytes", "159344")
 ,("max_compact_bytes", "0")
 ,("max_slop_bytes", "59688")
 ,("max_mem_in_use_bytes", "6291456")
 ,("cumulative_live_bytes", "296696")
 ,("copied_bytes", "541024")
 ,("par_copied_bytes", "493976")
 ,("cumulative_par_max_copied_bytes", "104104")
 ,("cumulative_par_balanced_copied_bytes", "274456")
 ,("fragmentation_bytes", "2112")
 ,("alloc_rate", "124170795")
 ,("productivity_cpu_percent", "0.986838")
 ,("productivity_wall_percent", "0.982935")
 ,("bound_task_count", "1")
 ,("sparks_count", "5836258")
 ,("sparks_converted", "237")
 ,("sparks_overflowed", "1990408")
 ,("sparks_dud ", "0")
 ,("sparks_gcd", "3455553")
 ,("sparks_fizzled", "390060")
 ,("work_balance", "0.555606")
 ,("n_capabilities", "4")
 ,("task_count", "10")
 ,("peak_worker_count", "9")
 ,("worker_count", "9")
 ,("gc_alloc_block_sync_spin", "162")
 ,("gc_alloc_block_sync_yield", "0")
 ,("gc_alloc_block_sync_spin", "162")
 ,("gc_spin_spin", "18840855")
 ,("gc_spin_yield", "10355")
 ,("mut_spin_spin", "70331392")
 ,("mut_spin_yield", "61700")
 ,("waitForGcThreads_spin", "241")
 ,("waitForGcThreads_yield", "2797")
 ,("whitehole_gc_spin", "0")
 ,("whitehole_lockClosure_spin", "0")
 ,("whitehole_lockClosure_yield", "0")
 ,("whitehole_executeMessage_spin", "0")
 ,("whitehole_threadPaused_spin", "0")
 ,("any_work", "1667")
 ,("no_work", "1662")
 ,("scav_find_work", "1026")
 ,("gen_0_collections", "111")
 ,("gen_0_par_collections", "111")
 ,("gen_0_cpu_seconds", "0.036126")
 ,("gen_0_wall_seconds", "0.036126")
 ,("gen_0_max_pause_seconds", "0.036126")
 ,("gen_0_avg_pause_seconds", "0.000081")
 ,("gen_0_sync_spin", "21")
 ,("gen_0_sync_yield", "0")
 ,("gen_1_collections", "2")
 ,("gen_1_par_collections", "1")
 ,("gen_1_cpu_seconds", "0.001624")
 ,("gen_1_wall_seconds", "0.001624")
 ,("gen_1_max_pause_seconds", "0.001624")
 ,("gen_1_avg_pause_seconds", "0.000272")
 ,("gen_1_sync_spin", "3")
 ,("gen_1_sync_yield", "0")
 ]
```

Test Plan: Ensure that one-line and summary reports are unchanged.

Reviewers: bgamari, erikd, simonmar, hvr

Reviewed By: simonmar

Subscribers: rwbarton, thomie, carter

GHC Trac Issues: #14660

Differential Revision: https://phabricator.haskell.org/D4303

includes/RtsAPI.h
libraries/base/GHC/Stats.hsc
rts/Stats.c
rts/Stats.h

index 6f011cb..69ec607 100644 (file)
@@ -184,7 +184,7 @@ typedef struct _RTSStats {
   uint64_t par_copied_bytes;
     // Sum of par_max_copied_bytes across all parallel GCs
   uint64_t cumulative_par_max_copied_bytes;
-  // Sum of par_balanced_copied_byes across all parallel GCs.
+    // Sum of par_balanced_copied_byes across all parallel GCs.
   uint64_t cumulative_par_balanced_copied_bytes;
 
   // -----------------------------------
@@ -192,6 +192,10 @@ typedef struct _RTSStats {
   // (we use signed values here because due to inaccuracies in timers
   // the values can occasionally go slightly negative)
 
+    // Total CPU time used by the init phase
+  Time init_cpu_ns;
+    // Total elapsed time used by the init phase
+  Time init_elapsed_ns;
     // Total CPU time used by the mutator
   Time mutator_cpu_ns;
     // Total elapsed time used by the mutator
@@ -213,26 +217,26 @@ typedef struct _RTSStats {
   // -----------------------------------
   // Internal Counters
 
-  // The number of times a GC thread spun on its 'gc_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread spun on its 'gc_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t gc_spin_spin;
-  // The number of times a GC thread yielded on its 'gc_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread yielded on its 'gc_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t gc_spin_yield;
-  // The number of times a GC thread spun on its 'mut_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread spun on its 'mut_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t mut_spin_spin;
-  // The number of times a GC thread yielded on its 'mut_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread yielded on its 'mut_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t mut_spin_yield;
-  // The number of times a GC thread has checked for work across all parallel
-  // GCs
+    // The number of times a GC thread has checked for work across all parallel
+    // GCs
   uint64_t any_work;
-  // The number of times a GC thread has checked for work and found none across
-  // all parallel GCs
+    // The number of times a GC thread has checked for work and found none
+    // across all parallel GCs
   uint64_t no_work;
-  // The number of times a GC thread has iterated it's outer loop across all
-  // parallel GCs
+    // The number of times a GC thread has iterated it's outer loop across all
+    // parallel GCs
   uint64_t scav_find_work;
 } RTSStats;
 
index 3497ff5..d81173f 100644 (file)
@@ -83,6 +83,12 @@ data RTSStats = RTSStats {
   -- (we use signed values here because due to inaccuracies in timers
   -- the values can occasionally go slightly negative)
 
+    -- | Total CPU time used by the init phase
+    -- @since 4.12.0.0
+  , init_cpu_ns :: RtsTime
+    -- | Total elapsed time used by the init phase
+    -- @since 4.12.0.0
+  , init_elapsed_ns :: RtsTime
     -- | Total CPU time used by the mutator
   , mutator_cpu_ns :: RtsTime
     -- | Total elapsed time used by the mutator
@@ -175,6 +181,8 @@ getRTSStats = do
       (# peek RTSStats, cumulative_par_max_copied_bytes) p
     cumulative_par_balanced_copied_bytes <-
       (# peek RTSStats, cumulative_par_balanced_copied_bytes) p
+    init_cpu_ns <- (# peek RTSStats, init_cpu_ns) p
+    init_elapsed_ns <- (# peek RTSStats, init_elapsed_ns) p
     mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p
     mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p
     gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p
index 7eb93be..8769551 100644 (file)
@@ -24,6 +24,7 @@
 #include "ThreadPaused.h"
 #include "Messages.h"
 
+#include <string.h> // for memset
 
 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
 
@@ -92,6 +93,9 @@ mut_user_time_until( Time t )
     return TimeToSecondsDbl(t - stats.gc_cpu_ns);
     // heapCensus() time is included in GC_tot_cpu, so we don't need
     // to subtract it here.
+
+    // TODO: This seems wrong to me. Surely we should be subtracting
+    // (at least) start_init_cpu?
 }
 
 double
@@ -169,6 +173,8 @@ initStats0(void)
         .any_work = 0,
         .no_work = 0,
         .scav_find_work = 0,
+        .init_cpu_ns = 0,
+        .init_elapsed_ns = 0,
         .mutator_cpu_ns = 0,
         .mutator_elapsed_ns = 0,
         .gc_cpu_ns = 0,
@@ -240,6 +246,8 @@ void
 stat_endInit(void)
 {
     getProcessTimes(&end_init_cpu, &end_init_elapsed);
+    stats.init_cpu_ns = end_init_cpu - start_init_cpu;
+    stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
 }
 
 /* -----------------------------------------------------------------------------
@@ -419,8 +427,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
                           stats.gc.copied_bytes,
                           stats.gc.slop_bytes,
                           /* current loss due to fragmentation */
-                          (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
-                                 * BLOCK_SIZE,
+                          (mblocks_allocated * BLOCKS_PER_MBLOCK
+                           - n_alloc_blocks) * BLOCK_SIZE,
                           par_n_threads,
                           stats.gc.par_max_copied_bytes,
                           stats.gc.copied_bytes,
@@ -515,7 +523,8 @@ stat_endRP(
   fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
   fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
 #endif
-  fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
+  fprintf(prof_file, "\tAverage number of visits per object = %f\n",
+          averageNumVisit);
 }
 #endif /* PROFILING */
 
@@ -573,7 +582,7 @@ StgInt TOTAL_CALLS=1;
 #define REPORT(counter) \
   { \
     showStgWord64(counter,temp,true/*commas*/); \
-    statsPrintf("  (" #counter ")  : %s\n",temp);                               \
+    statsPrintf("  (" #counter ")  : %s\n",temp); \
   }
 
 /* Report the value of a counter as a percentage of another counter */
@@ -592,340 +601,647 @@ StgInt TOTAL_CALLS=1;
   statsPrintf("  (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
               SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
 
-STATIC_INLINE Time get_init_cpu(void) {
-    return end_init_cpu - start_init_cpu;
-}
+/*
+Note [RTS Stats Reporting]
+==========================
+
+There are currently three reporting functions:
+  * report_summary:
+      Responsible for producing '+RTS -s' output.
+      Will report internal counters if the RTS flag --internal-counters is
+      passed. See [Internal Counters Stats]
+  * report_machine_readable:
+      Responsible for producing '+RTS -t --machine-readable' output.
+  * report_one_line:
+      Responsible for productin '+RTS -t' output
+
+Stats are accumulated into the global variable 'stats' as the program runs, then
+in 'stat_exit' we do the following:
+  * Finalise 'stats'. This involves setting final running times and allocations
+    that have not yet been accounted for.
+  * Create a RTSSummaryStats. This contains all data for reports that is not
+    included in stats (because they do not make sense before the program has
+    completed) or in a global variable.
+  * call the appropriate report_* function, passing the newly constructed
+    RTSSummaryStats.
+
+To ensure that the data in the different reports is kept consistent, the
+report_* functions should not do any calculation, excepting unit changes and
+formatting. If you need to add a new calculated field, add it to
+RTSSummaryStats.
+
+*/
+
 
-STATIC_INLINE Time get_init_elapsed(void) {
-    return end_init_elapsed - start_init_elapsed;
+static void init_RTSSummaryStats(RTSSummaryStats* sum)
+{
+    const size_t sizeof_gc_summary_stats =
+      RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);
+
+    memset(sum, 0, sizeof(RTSSummaryStats));
+    sum->gc_summary_stats =
+      stgMallocBytes(sizeof_gc_summary_stats,
+                     "alloc_RTSSummaryStats.gc_summary_stats");
+    memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
 }
 
+static void free_RTSSummaryStats(RTSSummaryStats * sum)
+{
+    if (!sum) { return; }
+    if (!sum->gc_summary_stats) {
+        stgFree(sum->gc_summary_stats);
+        sum->gc_summary_stats = NULL;
+    }
+}
 
-void
-stat_exit (void)
+static void report_summary(const RTSSummaryStats* sum)
 {
-    generation *gen;
-    Time gc_cpu = 0;
-    Time gc_elapsed = 0;
-    Time init_cpu = 0;
-    Time init_elapsed = 0;
-    Time mut_cpu = 0;
-    Time mut_elapsed = 0;
-    Time exit_cpu = 0;
-    Time exit_elapsed = 0;
-    Time exit_gc_cpu = 0;
-    Time exit_gc_elapsed = 0;
+    // We should do no calculation, other than unit changes and formatting, and
+    // we should not not use any data from outside of globals, sum and stats
+    // here. See Note [RTS Stats Reporting]
+
+    uint32_t g;
+    char temp[512];
+    showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
+    statsPrintf("%16s bytes allocated in the heap\n", temp);
+
+    showStgWord64(stats.copied_bytes, temp, true/*commas*/);
+    statsPrintf("%16s bytes copied during GC\n", temp);
+
+    if ( stats.major_gcs > 0 ) {
+        showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
+        statsPrintf("%16s bytes maximum residency (%" FMT_Word32
+                    " sample(s))\n",
+                    temp, stats.major_gcs);
+    }
 
-    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+    showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
+    statsPrintf("%16s bytes maximum slop\n", temp);
+
+    statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
+                FMT_SizeT " MB lost due to fragmentation)\n\n",
+                stats.max_live_bytes  / (1024 * 1024),
+                sum->fragmentation_bytes / (1024 * 1024));
+
+    /* Print garbage collections in each gen */
+    statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        const GenerationSummaryStats * gen_stats =
+            &sum->gc_summary_stats[g];
+        statsPrintf("  Gen %2d     %5d colls"
+                    ", %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
+                    g, // REVIEWERS: this used to be gen->no
+                    //, this can't ever be different right?
+                    gen_stats->collections,
+                    gen_stats->par_collections,
+                    TimeToSecondsDbl(gen_stats->cpu_ns),
+                    TimeToSecondsDbl(gen_stats->elapsed_ns),
+                    TimeToSecondsDbl(gen_stats->avg_pause_ns),
+                    TimeToSecondsDbl(gen_stats->max_pause_ns));
+    }
 
-        char temp[512];
-        Time tot_cpu;
-        Time tot_elapsed;
-        uint32_t g;
+    statsPrintf("\n");
 
-        getProcessTimes( &tot_cpu, &tot_elapsed );
-        tot_cpu -= start_init_cpu;
-        tot_elapsed -= start_init_elapsed;
+#if defined(THREADED_RTS)
+    if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
+        // See Note [Work Balance]
+        statsPrintf("  Parallel GC work balance: "
+                    "%.2f%% (serial 0%%, perfect 100%%)\n\n",
+                    sum->work_balance * 100);
+    }
 
-        uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
+    statsPrintf("  TASKS: %d "
+                "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
+                taskCount, sum->bound_task_count,
+                peakWorkerCount, workerCount,
+                n_capabilities);
+
+    statsPrintf("  SPARKS: %" FMT_Word
+                "(%" FMT_Word " converted, %" FMT_Word " overflowed, %"
+                FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
+                sum->sparks_count,
+                sum->sparks.converted, sum->sparks.overflowed,
+                sum->sparks.dud, sum->sparks.gcd,
+                sum->sparks.fizzled);
+#endif
 
-        // allocated since the last GC
-        stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
-        stats.allocated_bytes = tot_alloc_bytes;
+    statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.init_cpu_ns),
+                TimeToSecondsDbl(stats.init_elapsed_ns));
 
-        /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
-        if (tot_cpu <= 0)  tot_cpu = 1;
-        if (tot_elapsed <= 0) tot_elapsed = 1;
+    statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.mutator_cpu_ns),
+                TimeToSecondsDbl(stats.mutator_elapsed_ns));
+    statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.gc_cpu_ns),
+                TimeToSecondsDbl(stats.gc_elapsed_ns));
 
-        if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
-            statsPrintf("%9" FMT_Word " %9.9s %9.9s",
-                        (W_)stats.gc.allocated_bytes, "", "");
-            statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
-        }
+#if defined(PROFILING)
+    statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(sum->rc_cpu_ns),
+                TimeToSecondsDbl(sum->rc_elapsed_ns));
+    statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(sum->hc_cpu_ns),
+                TimeToSecondsDbl(sum->hc_elapsed_ns));
+#endif
+    statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(sum->exit_cpu_ns),
+                TimeToSecondsDbl(sum->exit_elapsed_ns));
+    statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
+                TimeToSecondsDbl(stats.cpu_ns),
+                TimeToSecondsDbl(stats.elapsed_ns));
+#if !defined(THREADED_RTS)
+    statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
+                sum->gc_cpu_percent * 100,
+                sum->gc_elapsed_percent * 100);
+#endif
 
-        // heapCensus() is called by the GC, so RP and HC time are
-        // included in the GC stats.  We therefore subtract them to
-        // obtain the actual GC cpu time.
-        gc_cpu     = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time);
-        gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time);
+    showStgWord64(sum->alloc_rate, temp, true/*commas*/);
 
-        init_cpu     = get_init_cpu();
-        init_elapsed = get_init_elapsed();
+    statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);
 
-        // We do a GC during the EXIT phase.  We'll attribute the cost of that
-        // to GC instead of EXIT, so carefully subtract it from the EXIT time.
-        exit_gc_cpu  = stats.gc_cpu_ns - start_exit_gc_cpu;
-        exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
-        exit_cpu     = end_exit_cpu - start_exit_cpu - exit_gc_cpu;
-        exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed;
+    statsPrintf("  Productivity %5.1f%% of total user, "
+                "%.1f%% of total elapsed\n\n",
+                sum->productivity_cpu_percent * 100,
+                sum->productivity_elapsed_percent * 100);
 
-        mut_elapsed = start_exit_elapsed - end_init_elapsed -
-            (gc_elapsed - exit_gc_elapsed) -
-            PROF_VAL(RPe_tot_time + HCe_tot_time);
+    // See Note [Internal Counter Stats] for a description of the
+    // following counters. If you add a counter here, please remember
+    // to update the Note.
+    if (RtsFlags.MiscFlags.internalCounters) {
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+        const int32_t col_width[] = {4, -30, 14, 14};
+        statsPrintf("Internal Counters:\n");
+        statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+                    , col_width[0], ""
+                    , col_width[1], "SpinLock"
+                    , col_width[2], "Spins"
+                    , col_width[3], "Yields");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_alloc_block_sync"
+                    , col_width[2], gc_alloc_block_sync.spin
+                    , col_width[3], gc_alloc_block_sync.yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_spin"
+                    , col_width[2], stats.gc_spin_spin
+                    , col_width[3], stats.gc_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "mut_spin"
+                    , col_width[2], stats.mut_spin_spin
+                    , col_width[3], stats.mut_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_gc"
+                    , col_width[2], whitehole_gc_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_threadPaused"
+                    , col_width[2], whitehole_threadPaused_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_executeMessage"
+                    , col_width[2], whitehole_executeMessage_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_lockClosure"
+                    , col_width[2], whitehole_lockClosure_spin
+                    , col_width[3], whitehole_lockClosure_yield);
+        // waitForGcThreads isn't really spin-locking(see the function)
+        // but these numbers still seem useful.
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "waitForGcThreads"
+                    , col_width[2], waitForGcThreads_spin
+                    , col_width[3], waitForGcThreads_yield);
+
+        for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+            int prefix_length = 0;
+            statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+                        col_width[0], "", g, &prefix_length);
+            prefix_length -= col_width[0];
+            int suffix_length = col_width[1] + prefix_length;
+            suffix_length =
+                  suffix_length > 0 ? col_width[1] : suffix_length;
+
+            statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                        , suffix_length, "].sync"
+                        , col_width[2], generations[g].sync.spin
+                        , col_width[3], generations[g].sync.yield);
+        }
+        statsPrintf("\n");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "any_work"
+                    , col_width[2], stats.any_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "no_work"
+                    , col_width[2], stats.no_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "scav_find_work"
+                    , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+        statsPrintf("Internal Counters require the RTS to be built "
+                "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+        statsPrintf("Internal Counters require the threaded RTS");
+#endif
+    }
+}
 
-        mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu)
-            - PROF_VAL(RP_tot_time + HC_tot_time);
-        if (mut_cpu < 0) { mut_cpu = 0; }
+static void report_machine_readable (const RTSSummaryStats * sum)
+{
+    // We should do no calculation, other than unit changes and formatting, and
+    // we should not not use any data from outside of globals, sum and stats
+    // here. See Note [RTS Stats Reporting]
+    uint32_t g;
+
+#define MR_STAT(field_name,format,value) \
+    statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
+#define MR_STAT_GEN(gen,field_name,format,value) \
+    statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
+      format "\")\n", g, value)
+
+    // These first values are for backwards compatibility.
+    // Some of these first fields are duplicated with more machine-readable
+    // names, or to match the name in RtsStats.
+
+    // we don't use for the first field helper macro here because the prefix is
+    // different
+    statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
+                stats.allocated_bytes);
+    MR_STAT("num_GCs", FMT_Word32, stats.gcs);
+    MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
+    MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
+    MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
+    MR_STAT("peak_megabytes_allocated", FMT_SizeT,
+      stats.max_mem_in_use_bytes / (1024 * 1024));
+
+    MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
+    MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
+    MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
+    MR_STAT("mut_wall_seconds", "f",
+            TimeToSecondsDbl(stats.mutator_elapsed_ns));
+    MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
+    MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));
+
+    // end backward compatibility
+
+    // First, the rest of the times
+
+    MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
+    MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
+#if defined(PROFILING)
+    MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
+    MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
+    MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hp_cpu_ns));
+    MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hp_elapsed_ns));
+#endif
+    MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
+    MR_STAT("total_wall_seconds", "f",
+            TimeToSecondsDbl(stats.elapsed_ns));
+
+    // next, the remainder of the fields of RTSStats, except internal counters
+
+    // The first two are duplicates of those above, but have more machine
+    // readable names that match the field names in RTSStats.
+
+
+    // gcs has been done as num_GCs above
+    MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
+    MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
+    MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
+    MR_STAT("max_large_objects_bytes", FMT_Word64,
+            stats.max_large_objects_bytes);
+    MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
+    MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
+    // This duplicates, except for unit, peak_megabytes_allocated above
+    MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
+    MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
+    MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
+    MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
+    MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
+            stats.cumulative_par_max_copied_bytes);
+    MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
+            stats.cumulative_par_balanced_copied_bytes);
+
+    // next, the computed fields in RTSSummaryStats
+#if !defined(THREADED_RTS) // THREADED_RTS
+    MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
+    MR_STAT("gc_wall_percent", "f", sum->gc_cpu_percent);
+#endif
+    MR_STAT("fragmentation_bytes", FMT_SizeT, sum->fragmentation_bytes);
+    // average_bytes_used is done above
+    MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
+    MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
+    MR_STAT("productivity_wall_percent", "f",
+            sum->productivity_elapsed_percent);
 
-        // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and
-        // subtracting, so the parts should add up to the total exactly.  Note
-        // that tot_elapsed is captured a tiny bit later than end_exit_elapsed,
-        // so we don't use it here.
-        ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed
-               == end_exit_elapsed - start_init_elapsed);
+    // next, the THREADED_RTS fields in RTSSummaryStats
 
+#if defined(THREADED_RTS)
+    MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
+    MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
+    MR_STAT("sparks_converted", FMT_Word64, sum->sparks.converted);
+    MR_STAT("sparks_overflowed", FMT_Word64, sum->sparks.overflowed);
+    MR_STAT("sparks_dud ", FMT_Word64, sum->sparks.dud);
+    MR_STAT("sparks_gcd", FMT_Word64, sum->sparks.gcd);
+    MR_STAT("sparks_fizzled", FMT_Word64, sum->sparks.fizzled);
+    MR_STAT("work_balance", "f", sum->work_balance);
+
+    // next, globals (other than internal counters)
+    MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
+    MR_STAT("task_count", FMT_Word32, taskCount);
+    MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
+    MR_STAT("worker_count", FMT_Word32, workerCount);
+
+    // next, internal counters
+#if defined(PROF_SPIN)
+    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
+    MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
+            gc_alloc_block_sync.yield);
+    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
+    MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
+    MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
+    MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
+    MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
+    MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
+    MR_STAT("waitForGcThreads_yield", FMT_Word64,
+            waitForGcThreads_yield);
+    MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
+    MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
+            whitehole_lockClosure_spin);
+    MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
+            whitehole_lockClosure_yield);
+    MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
+            whitehole_executeMessage_spin);
+    MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
+            whitehole_threadPaused_spin);
+    MR_STAT("any_work", FMT_Word64,
+            stats.any_work);
+    MR_STAT("no_work", FMT_Word64,
+            stats.no_work);
+    MR_STAT("scav_find_work", FMT_Word64,
+            stats.scav_find_work);
+#endif // PROF_SPIN
+#endif // THREADED_RTS
+
+    // finally, per-generation stats. Named as, for example for generation 0,
+    // gen_0_collections
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
+        MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
+        MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
+        MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
+        MR_STAT_GEN(g, "wall_seconds", "f",
+                    TimeToSecondsDbl(gc_sum->elapsed_ns));
+        MR_STAT_GEN(g, "max_pause_seconds", "f",
+                    TimeToSecondsDbl(gc_sum->max_pause_ns));
+        MR_STAT_GEN(g, "avg_pause_seconds", "f",
+                    TimeToSecondsDbl(gc_sum->avg_pause_ns));
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+        MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
+        MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
+#endif
+    }
 
-        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
-            showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
-            statsPrintf("%16s bytes allocated in the heap\n", temp);
+    statsPrintf(" ]\n");
+}
 
-            showStgWord64(stats.copied_bytes, temp, true/*commas*/);
-            statsPrintf("%16s bytes copied during GC\n", temp);
+static void report_one_line(const RTSSummaryStats * sum)
+{
+    // We should do no calculation, other than unit changes and formatting, and
+    // we should not not use any data from outside of globals, sum and stats
+    // here. See Note [RTS Stats Reporting]
 
-            if ( stats.major_gcs > 0 ) {
-                showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
-                statsPrintf("%16s bytes maximum residency (%" FMT_Word32
-                            " sample(s))\n",
-                            temp, stats.major_gcs);
-            }
+    /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
+    mingw-0.5) */
+    statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
+                "%" FMT_Word32 " GCs, "
+                "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
+                "(%" FMT_Word32 " samples), "
+                "%" FMT_Word64 "M in use, "
+                "%.3f INIT (%.3f elapsed), "
+                "%.3f MUT (%.3f elapsed), "
+                "%.3f GC (%.3f elapsed) :ghc>>\n",
+                stats.allocated_bytes,
+                stats.gcs,
+                sum->average_bytes_used,
+                stats.max_live_bytes,
+                stats.major_gcs,
+                stats.max_mem_in_use_bytes / (1024 * 1024),
+                TimeToSecondsDbl(stats.init_cpu_ns),
+                TimeToSecondsDbl(stats.init_elapsed_ns),
+                TimeToSecondsDbl(stats.mutator_cpu_ns),
+                TimeToSecondsDbl(stats.mutator_elapsed_ns),
+                TimeToSecondsDbl(stats.gc_cpu_ns),
+                TimeToSecondsDbl(stats.gc_elapsed_ns));
+}
 
-            showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
-            statsPrintf("%16s bytes maximum slop\n", temp);
-
-            statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
-                        FMT_SizeT " MB lost due to fragmentation)\n\n",
-                        (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)),
-                        (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
-
-            /* Print garbage collections in each gen */
-            statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
-            for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-                gen = &generations[g];
-                statsPrintf("  Gen %2d     %5d colls, %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
-                            gen->no,
-                            gen->collections,
-                            gen->par_collections,
-                            TimeToSecondsDbl(GC_coll_cpu[g]),
-                            TimeToSecondsDbl(GC_coll_elapsed[g]),
-                            gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections),
-                            TimeToSecondsDbl(GC_coll_max_pause[g]));
-            }
+void
+stat_exit (void)
+{
+    RTSSummaryStats sum;
+    uint32_t g;
 
-#if defined(THREADED_RTS)
-            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
-                // See Note [Work Balance]
-                statsPrintf("\n  Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
-                    100 * (double)stats.cumulative_par_balanced_copied_bytes /
-                          (double)stats.par_copied_bytes);
-            }
-#endif
-            statsPrintf("\n");
+    init_RTSSummaryStats(&sum);
+    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+        // First we tidy the times in stats, and populate the times in sum.
+        // In particular, we adjust the gc_* time in stats to remove
+        // profiling times.
+        {
+            Time now_cpu_ns, now_elapsed_ns;
+            Time exit_gc_cpu = 0;
+            Time exit_gc_elapsed = 0;
+            Time prof_cpu = 0;
+            Time prof_elapsed = 0;
+
+            getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
+
+            stats.cpu_ns = now_cpu_ns - start_init_cpu;
+            stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
+            /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00
+               seconds -- SDM */
+            if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
+            if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
+
+            prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
+            prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
+
+            // heapCensus() is called by the GC, so RP and HC time are
+            // included in the GC stats.  We therefore subtract them to
+            // obtain the actual GC cpu time.
+            stats.gc_cpu_ns      -=  prof_cpu;
+            stats.gc_elapsed_ns  -=  prof_elapsed;
 
-#if defined(THREADED_RTS)
-            statsPrintf("  TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
-                        taskCount, taskCount - workerCount,
-                        peakWorkerCount, workerCount,
-                        n_capabilities);
-
-            statsPrintf("\n");
-
-            {
-                uint32_t i;
-                SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
-                for (i = 0; i < n_capabilities; i++) {
-                    sparks.created   += capabilities[i]->spark_stats.created;
-                    sparks.dud       += capabilities[i]->spark_stats.dud;
-                    sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
-                    sparks.converted += capabilities[i]->spark_stats.converted;
-                    sparks.gcd       += capabilities[i]->spark_stats.gcd;
-                    sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
-                }
-
-                statsPrintf("  SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
-                            sparks.created + sparks.dud + sparks.overflowed,
-                            sparks.converted, sparks.overflowed, sparks.dud,
-                            sparks.gcd, sparks.fizzled);
-            }
-#endif
+#if defined(PROFILING)
+            sum.rp_cpu_ns = RP_tot_time;
+            sum.rp_elapsed_ns = RPe_tot_time;
+            sum.hc_cpu_ns = HC_tot_time;
+            sum.hc_elapsed_ns = HCe_tot_time;
+#endif // PROFILING
+
+            // We do a GC during the EXIT phase. We'll attribute the cost of
+            // that to GC instead of EXIT, so carefully subtract it from the
+            // EXIT time.
+            exit_gc_cpu          = stats.gc_cpu_ns - start_exit_gc_cpu;
+            exit_gc_elapsed      = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+
+            sum.exit_cpu_ns     = end_exit_cpu
+                                      - start_exit_cpu
+                                      - exit_gc_cpu;
+            sum.exit_elapsed_ns = end_exit_elapsed
+                                       - start_exit_elapsed
+                                       - exit_gc_elapsed;
+
+            stats.mutator_cpu_ns     = start_exit_cpu
+                                 - end_init_cpu
+                                 - (stats.gc_cpu_ns - exit_gc_cpu)
+                                 - prof_cpu;
+            stats.mutator_elapsed_ns = start_exit_elapsed
+                                 - end_init_elapsed
+                                 - (stats.gc_elapsed_ns - exit_gc_elapsed)
+                                 - prof_elapsed;
+
+            if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
+
+            // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
+            // and subtracting, so the parts should add up to the total exactly.
+            // Note that stats->total_ns is captured a tiny bit later than
+            // end_exit_elapsed, so we don't use it here.
+            ASSERT(stats.init_elapsed_ns \
+                   + stats.mutator_elapsed_ns \
+                   + stats.gc_elapsed_ns \
+                   + sum.exit_elapsed_ns \
+                   == end_exit_elapsed - start_init_elapsed);
 
-            statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
-                        TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));
+        }
 
-            statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
-                        TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
-            statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
-                        TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+        // REVIEWERS: it's not clear to me why the following isn't done in
+        // stat_endGC of the last garbage collection?
+
+        // We account for the last garbage collection
+        {
+            uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
+            stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
+            stats.allocated_bytes = tot_alloc_bytes;
+            if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+                statsPrintf("%9" FMT_Word " %9.9s %9.9s",
+                            (W_)stats.gc.allocated_bytes, "", "");
+                statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
+            }
+        }
 
-#if defined(PROFILING)
-            statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
-                    TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
-            statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
-                    TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
-#endif
-            statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
-                    TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
-            statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
-                    TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
-#if !defined(THREADED_RTS)
-            statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
-                    TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
-                    TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
-#endif
+        // We populate the remainder (non-time elements) of sum
+        {
+    #if defined(THREADED_RTS)
+            uint32_t i;
+            sum.bound_task_count = taskCount - workerCount;
+
+            for (i = 0; i < n_capabilities; i++) {
+                sum.sparks.created   += capabilities[i]->spark_stats.created;
+                sum.sparks.dud       += capabilities[i]->spark_stats.dud;
+                sum.sparks.overflowed+=
+                  capabilities[i]->spark_stats.overflowed;
+                sum.sparks.converted +=
+                  capabilities[i]->spark_stats.converted;
+                sum.sparks.gcd       += capabilities[i]->spark_stats.gcd;
+                sum.sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
+            }
 
-            if (mut_cpu == 0) {
-                showStgWord64(0, temp, true/*commas*/);
+            sum.sparks_count = sum.sparks.created
+                + sum.sparks.dud
+                + sum.sparks.overflowed;
+
+            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
+                // See Note [Work Balance]
+                sum.work_balance =
+                    (double)stats.cumulative_par_balanced_copied_bytes
+                    / (double)stats.par_copied_bytes;
             } else {
-                showStgWord64(
-                    (StgWord64)((double)stats.allocated_bytes /
-                                TimeToSecondsDbl(mut_cpu)),
-                    temp, true/*commas*/);
+                sum.work_balance = 0;
             }
 
-            statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);
 
-            statsPrintf("  Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
-                    TimeToSecondsDbl(tot_cpu - gc_cpu -
-                                PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100
-                    / TimeToSecondsDbl(tot_cpu),
-                    TimeToSecondsDbl(tot_elapsed - gc_elapsed -
-                                PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
-                    / TimeToSecondsDbl(tot_elapsed));
-
-            // See Note [Internal Counter Stats] for a description of the
-            // following counters. If you add a counter here, please remember
-            // to update the Note.
-            if (RtsFlags.MiscFlags.internalCounters) {
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
-                uint32_t g;
-                const int32_t col_width[] = {4, -30, 14, 14};
-                statsPrintf("Internal Counters:\n");
-                statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
-                            , col_width[0], ""
-                            , col_width[1], "SpinLock"
-                            , col_width[2], "Spins"
-                            , col_width[3], "Yields");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "gc_alloc_block_sync"
-                            , col_width[2], gc_alloc_block_sync.spin
-                            , col_width[3], gc_alloc_block_sync.yield);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "gc_spin"
-                            , col_width[2], stats.gc_spin_spin
-                            , col_width[3], stats.gc_spin_yield);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "mut_spin"
-                            , col_width[2], stats.mut_spin_spin
-                            , col_width[3], stats.mut_spin_yield);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_gc"
-                            , col_width[2], whitehole_gc_spin
-                            , col_width[3], "n/a");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_threadPaused"
-                            , col_width[2], whitehole_threadPaused_spin
-                            , col_width[3], "n/a");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_executeMessage"
-                            , col_width[2], whitehole_executeMessage_spin
-                            , col_width[3], "n/a");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_lockClosure"
-                            , col_width[2], whitehole_lockClosure_spin
-                            , col_width[3], whitehole_lockClosure_yield);
-                // waitForGcThreads isn't really spin-locking(see the function)
-                // but these numbers still seem useful.
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "waitForGcThreads"
-                            , col_width[2], waitForGcThreads_spin
-                            , col_width[3], waitForGcThreads_yield);
-
-                for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-                    int prefix_length = 0;
-                    statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
-                                col_width[0], "", g, &prefix_length);
-                    prefix_length -= col_width[0];
-                    int suffix_length = col_width[1] + prefix_length;
-                    suffix_length =
-                      suffix_length > 0 ? col_width[1] : suffix_length;
-
-                    statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                                , suffix_length, "].sync"
-                                , col_width[2], generations[g].sync.spin
-                                , col_width[3], generations[g].sync.yield);
-                }
-                statsPrintf("\n");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "any_work"
-                            , col_width[2], stats.any_work);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "no_work"
-                            , col_width[2], stats.no_work);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "scav_find_work"
-                            , col_width[2], stats.scav_find_work);
-#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
-                statsPrintf("Internal Counters require the RTS to be built "
-                        "with PROF_SPIN"); // PROF_SPIN is not #defined here
-#else // THREADED_RTS
-                statsPrintf("Internal Counters require the threaded RTS");
-#endif
+    #else // THREADED_RTS
+            sum.gc_cpu_percent     = stats.gc_cpu_ns
+                                  / stats.cpu_ns;
+            sum.gc_elapsed_percent = stats.gc_elapsed_ns
+                                  / stats.elapsed_ns;
+    #endif // THREADED_RTS
+
+            sum.fragmentation_bytes =
+                (uint64_t)(peak_mblocks_allocated
+                         * BLOCKS_PER_MBLOCK
+                         * BLOCK_SIZE_W
+                         - hw_alloc_blocks * BLOCK_SIZE_W)
+                / (uint64_t)sizeof(W_);
+
+            sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
+                 stats.cumulative_live_bytes/stats.major_gcs,
+
+            sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
+                (uint64_t)((double)stats.allocated_bytes
+                / TimeToSecondsDbl(stats.mutator_cpu_ns));
+
+            // REVIEWERS: These two values didn't used to include the exit times
+            sum.productivity_cpu_percent =
+                TimeToSecondsDbl(stats.cpu_ns
+                                - stats.gc_cpu_ns
+                                - sum.rp_cpu_ns
+                                - sum.hc_cpu_ns
+                                - stats.init_cpu_ns
+                                - sum.exit_cpu_ns)
+                / TimeToSecondsDbl(stats.cpu_ns);
+
+            sum.productivity_elapsed_percent =
+                TimeToSecondsDbl(stats.elapsed_ns
+                                - stats.gc_elapsed_ns
+                                - sum.rp_elapsed_ns
+                                - sum.hc_elapsed_ns
+                                - stats.init_elapsed_ns
+                                - sum.exit_elapsed_ns)
+                / TimeToSecondsDbl(stats.elapsed_ns);
+
+            for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
+                const generation* gen = &generations[g];
+                GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
+                gen_stats->collections = gen->collections;
+                gen_stats->par_collections = gen->par_collections;
+                gen_stats->cpu_ns = GC_coll_cpu[g];
+                gen_stats->elapsed_ns = GC_coll_elapsed[g];
+                gen_stats->max_pause_ns = GC_coll_max_pause[g];
+                gen_stats->avg_pause_ns = gen->collections == 0 ?
+                    0 : (GC_coll_elapsed[g] / gen->collections);
+    #if defined(THREADED_RTS) && defined(PROF_SPIN)
+                gen_stats->sync_spin = gen->sync.spin;
+                gen_stats->sync_yield = gen->sync.yield;
+    #endif // PROF_SPIN
             }
         }
 
+        // Now we generate the report
+        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
+            report_summary(&sum);
+        }
+
         if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
-          char *fmt;
-          if (RtsFlags.MiscFlags.machineReadable) {
-            fmt =
-                " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n"
-                " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n"
-                " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n"
-                " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n"
-                " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n"
-                " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n"
-                " ,(\"init_cpu_seconds\", \"%.3f\")\n"
-                " ,(\"init_wall_seconds\", \"%.3f\")\n"
-                " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
-                " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
-                " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
-                " ,(\"GC_wall_seconds\", \"%.3f\")\n"
-                " ]\n";
-          }
-          else {
-            fmt =
-                "<<ghc: %" FMT_Word64 " bytes, "
-                "%" FMT_Word32 " GCs, "
-                "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), "
-                "%" FMT_Word64 "M in use, "
-                "%.3f INIT (%.3f elapsed), "
-                "%.3f MUT (%.3f elapsed), "
-                "%.3f GC (%.3f elapsed) :ghc>>\n";
-          }
-          /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
-          statsPrintf(fmt,
-                    stats.allocated_bytes,
-                    stats.gcs,
-                     (uint64_t)
-                      (stats.major_gcs == 0 ? 0 :
-                       stats.cumulative_live_bytes/stats.major_gcs),
-                    stats.max_live_bytes,
-                    stats.major_gcs,
-                    (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
-                    TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
-                    TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
-                    TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+            if (RtsFlags.MiscFlags.machineReadable) {
+                report_machine_readable(&sum);
+            }
+            else {
+                report_one_line(&sum);
+            }
         }
 
+        free_RTSSummaryStats(&sum);
         statsFlush();
         statsClose();
     }
@@ -1055,7 +1371,7 @@ waitForGcThreads:
   for a parallel garbage collection. We yield more than we spin in this case.
 
 In several places in the runtime we must take a lock on a closure. To do this,
-we replace it's info table with stg_WHITEHOLE_info, spinning if it is already
+we replace its info table with stg_WHITEHOLE_info, spinning if it is already
 a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
 don't. We count these white-hole spins and include them in the SpinLocks table.
 If a particular loop does not yield, we put "n/a" in the table. They are named
@@ -1141,8 +1457,9 @@ statDescribeGens(void)
 
       gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
 
-      debugBelch("%8" FMT_Word " %8d  %8d %9" FMT_Word " %9" FMT_Word "\n", gen_blocks, lge, compacts,
-                 gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_));
+      debugBelch("%8" FMT_Word " %8d  %8d %9" FMT_Word " %9" FMT_Word "\n",
+                 gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
+                 gen_slop*(W_)sizeof(W_));
       tot_live += gen_live;
       tot_slop += gen_slop;
   }
index 1c56344..7cd49fc 100644 (file)
@@ -66,4 +66,49 @@ void      statDescribeGens( void );
 Time      stat_getElapsedGCTime(void);
 Time      stat_getElapsedTime(void);
 
+typedef struct GenerationSummaryStats_ {
+    uint32_t collections;
+    uint32_t par_collections;
+    Time cpu_ns;
+    Time elapsed_ns;
+    Time max_pause_ns;
+    Time avg_pause_ns;
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+    uint64_t sync_spin;
+    uint64_t sync_yield;
+#endif
+} GenerationSummaryStats;
+
+typedef struct RTSSummaryStats_ {
+    // These profiling times could potentially be in RTSStats. However, I'm not
+    // confident enough to do this now, since there is some logic depending on
+    // global state that I do not understand. (Or if I do understand it, it's
+    // wrong)
+    Time rp_cpu_ns;
+    Time rp_elapsed_ns;
+    Time hc_cpu_ns;
+    Time hc_elapsed_ns;
+
+    Time exit_cpu_ns;
+    Time exit_elapsed_ns;
+
+#if defined(THREADED_RTS)
+    uint32_t bound_task_count;
+    uint64_t sparks_count;
+    SparkCounters sparks;
+    double work_balance;
+#else // THREADED_RTS
+    double gc_cpu_percent;
+    double gc_elapsed_percent;
+#endif
+    uint64_t fragmentation_bytes;
+    uint64_t average_bytes_used; // This is not shown in the '+RTS -s' report
+    uint64_t alloc_rate;
+    double productivity_cpu_percent;
+    double productivity_elapsed_percent;
+
+    // one for each generation, 0 first
+    GenerationSummaryStats* gc_summary_stats;
+} RTSSummaryStats;
+
 #include "EndPrivate.h"