rts: Properly free the RTSSummaryStats structure
[ghc.git] / rts / Stats.c
/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 1998-2005
 *
 * Statistics and timing-related functions.
 *
 * ---------------------------------------------------------------------------*/

#include "PosixSource.h"
#include "Rts.h"

#include "RtsFlags.h"
#include "RtsUtils.h"
#include "Schedule.h"
#include "Stats.h"
#include "Profiling.h"
#include "GetTime.h"
#include "sm/Storage.h"
#include "sm/GCThread.h"
#include "sm/BlockAlloc.h"

// for spin/yield counters
#include "sm/GC.h"
#include "ThreadPaused.h"
#include "Messages.h"

#include <string.h> // for memset

#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)

static Time
    start_init_cpu, start_init_elapsed,
    end_init_cpu, end_init_elapsed,
    start_exit_cpu, start_exit_elapsed,
    start_exit_gc_elapsed, start_exit_gc_cpu,
    end_exit_cpu, end_exit_elapsed;

#if defined(PROFILING)
static Time RP_start_time = 0, RP_tot_time = 0;   // retainer prof user time
static Time RPe_start_time = 0, RPe_tot_time = 0; // retainer prof elap time

static Time HC_start_time, HC_tot_time = 0;   // heap census prof user time
static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
#endif

#if defined(PROF_SPIN)
volatile StgWord64 whitehole_lockClosure_spin = 0;
volatile StgWord64 whitehole_lockClosure_yield = 0;
volatile StgWord64 whitehole_threadPaused_spin = 0;
volatile StgWord64 whitehole_executeMessage_spin = 0;
#endif

//
// All the stats!
//
// This is where we accumulate all the stats during execution, and it's also
// in a convenient form that we can copy over to a caller of getRTSStats().
//
static RTSStats stats;

static W_ GC_end_faults = 0;

static Time *GC_coll_cpu = NULL;
static Time *GC_coll_elapsed = NULL;
static Time *GC_coll_max_pause = NULL;

static void statsPrintf( char *s, ... ) GNUC3_ATTRIBUTE(format (PRINTF, 1, 2));
static void statsFlush( void );
static void statsClose( void );

/* -----------------------------------------------------------------------------
   Current elapsed time
   ------------------------------------------------------------------------- */

Time stat_getElapsedTime(void)
{
    return getProcessElapsedTime() - start_init_elapsed;
}

/* ---------------------------------------------------------------------------
   Measure the current MUT time, for profiling
   ------------------------------------------------------------------------ */

double
mut_user_time_until( Time t )
{
    return TimeToSecondsDbl(t - stats.gc_cpu_ns);
    // heapCensus() time is included in GC_tot_cpu, so we don't need
    // to subtract it here.

    // TODO: This seems wrong to me. Surely we should be subtracting
    // (at least) start_init_cpu?
}

double
mut_user_time( void )
{
    Time cpu;
    cpu = getProcessCPUTime();
    return mut_user_time_until(cpu);
}

#if defined(PROFILING)
/*
  mut_user_time_during_RP() returns the MUT time during retainer profiling.
  Likewise for mut_user_time_during_HC().
*/
static double
mut_user_time_during_RP( void )
{
    return TimeToSecondsDbl(RP_start_time - stats.gc_cpu_ns - RP_tot_time);
}

#endif /* PROFILING */

/* ---------------------------------------------------------------------------
   initStats0() has no dependencies, it can be called right at the beginning
   ------------------------------------------------------------------------ */

void
initStats0(void)
{
    start_init_cpu = 0;
    start_init_elapsed = 0;
    end_init_cpu = 0;
    end_init_elapsed = 0;

    start_exit_cpu = 0;
    start_exit_elapsed = 0;
    start_exit_gc_cpu = 0;
    start_exit_gc_elapsed = 0;
    end_exit_cpu = 0;
    end_exit_elapsed = 0;

#if defined(PROFILING)
    RP_start_time = 0;
    RP_tot_time = 0;
    RPe_start_time = 0;
    RPe_tot_time = 0;

    HC_start_time = 0;
    HC_tot_time = 0;
    HCe_start_time = 0;
    HCe_tot_time = 0;
#endif

    GC_end_faults = 0;

    stats = (RTSStats) {
        .gcs = 0,
        .major_gcs = 0,
        .allocated_bytes = 0,
        .max_live_bytes = 0,
        .max_large_objects_bytes = 0,
        .max_compact_bytes = 0,
        .max_slop_bytes = 0,
        .max_mem_in_use_bytes = 0,
        .cumulative_live_bytes = 0,
        .copied_bytes = 0,
        .par_copied_bytes = 0,
        .cumulative_par_max_copied_bytes = 0,
        .cumulative_par_balanced_copied_bytes = 0,
        .gc_spin_spin = 0,
        .gc_spin_yield = 0,
        .mut_spin_spin = 0,
        .mut_spin_yield = 0,
        .any_work = 0,
        .no_work = 0,
        .scav_find_work = 0,
        .init_cpu_ns = 0,
        .init_elapsed_ns = 0,
        .mutator_cpu_ns = 0,
        .mutator_elapsed_ns = 0,
        .gc_cpu_ns = 0,
        .gc_elapsed_ns = 0,
        .cpu_ns = 0,
        .elapsed_ns = 0,
        .gc = {
            .gen = 0,
            .threads = 0,
            .allocated_bytes = 0,
            .live_bytes = 0,
            .large_objects_bytes = 0,
            .compact_bytes = 0,
            .slop_bytes = 0,
            .mem_in_use_bytes = 0,
            .copied_bytes = 0,
            .par_max_copied_bytes = 0,
            .par_balanced_copied_bytes = 0,
            .sync_elapsed_ns = 0,
            .cpu_ns = 0,
            .elapsed_ns = 0
        }
    };
}

/* ---------------------------------------------------------------------------
   initStats1() can be called after setupRtsFlags()
   ------------------------------------------------------------------------ */

void initGenerationStats(void);

void
initStats1 (void)
{
    if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
        statsPrintf("    Alloc    Copied     Live     GC     GC      TOT      TOT  Page Flts\n");
        statsPrintf("    bytes     bytes     bytes   user   elap     user     elap\n");
    }
    GC_coll_cpu =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    GC_coll_elapsed =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    GC_coll_max_pause =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    initGenerationStats();
}

void
initGenerationStats(void)
{
    for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
        GC_coll_cpu[i] = 0;
        GC_coll_elapsed[i] = 0;
        GC_coll_max_pause[i] = 0;
    }
}

/* ---------------------------------------------------------------------------
   Reset stats of child process after fork()
   ------------------------------------------------------------------------ */

void resetChildProcessStats(void)
{
    initStats0();
    initGenerationStats();
}

/* -----------------------------------------------------------------------------
   Initialisation time...
   -------------------------------------------------------------------------- */

void
stat_startInit(void)
{
    getProcessTimes(&start_init_cpu, &start_init_elapsed);
}

void
stat_endInit(void)
{
    getProcessTimes(&end_init_cpu, &end_init_elapsed);
    stats.init_cpu_ns = end_init_cpu - start_init_cpu;
    stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
}

/* -----------------------------------------------------------------------------
   stat_startExit and stat_endExit

   These two measure the time taken in shutdownHaskell().
   -------------------------------------------------------------------------- */

void
stat_startExit(void)
{
    getProcessTimes(&start_exit_cpu, &start_exit_elapsed);
    start_exit_gc_elapsed = stats.gc_elapsed_ns;
    start_exit_gc_cpu = stats.gc_cpu_ns;
}

void
stat_endExit(void)
{
    getProcessTimes(&end_exit_cpu, &end_exit_elapsed);
}

void
stat_startGCSync (gc_thread *gct)
{
    gct->gc_sync_start_elapsed = getProcessElapsedTime();
}

/* -----------------------------------------------------------------------------
   Called at the beginning of each GC
   -------------------------------------------------------------------------- */

void
stat_startGC (Capability *cap, gc_thread *gct)
{
    if (RtsFlags.GcFlags.ringBell) {
        debugBelch("\007");
    }

    getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);

    // Post EVENT_GC_START with the same timestamp as used for stats
    // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
    // Here, as opposed to other places, the event is emitted on the cap
    // that initiates the GC and external tools expect it to have the same
    // timestamp as used in +RTS -s calculations.
    traceEventGcStartAtT(cap,
                         TimeToNS(gct->gc_start_elapsed - start_init_elapsed));

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
    {
        gct->gc_start_faults = getPageFaults();
    }

    updateNurseriesStats();
}

/* -----------------------------------------------------------------------------
   Called at the end of each GC
   -------------------------------------------------------------------------- */

void
stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
            uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
            W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
            W_ scav_find_work)
{
    // -------------------------------------------------
    // Collect all the stats about this GC in stats.gc. We always do this since
    // it's relatively cheap and we need allocated_bytes to catch heap
    // overflows.

    stats.gc.gen = gen;
    stats.gc.threads = par_n_threads;

    uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);

    // allocated since the last GC
    stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;

    stats.gc.live_bytes = live * sizeof(W_);
    stats.gc.large_objects_bytes = calcTotalLargeObjectsW() * sizeof(W_);
    stats.gc.compact_bytes = calcTotalCompactW() * sizeof(W_);
    stats.gc.slop_bytes = slop * sizeof(W_);
    stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
    stats.gc.copied_bytes = copied * sizeof(W_);
    stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
    stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);

    bool stats_enabled =
        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
        rtsConfig.gcDoneHook != NULL;

    if (stats_enabled
        || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs GC_tot_time
    {
        // We only update the times when stats are explicitly enabled since
        // getProcessTimes (which may require a system call) can be expensive
        // on some platforms.
        Time current_cpu, current_elapsed;
        getProcessTimes(&current_cpu, &current_elapsed);
        stats.cpu_ns = current_cpu - start_init_cpu;
        stats.elapsed_ns = current_elapsed - start_init_elapsed;

        stats.gc.sync_elapsed_ns =
            gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
        stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
        stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
    }
    // -------------------------------------------------
    // Update the cumulative stats

    stats.gcs++;
    stats.allocated_bytes = tot_alloc_bytes;
    stats.max_mem_in_use_bytes = peak_mblocks_allocated * MBLOCK_SIZE;

    GC_coll_cpu[gen] += stats.gc.cpu_ns;
    GC_coll_elapsed[gen] += stats.gc.elapsed_ns;
    if (GC_coll_max_pause[gen] < stats.gc.elapsed_ns) {
        GC_coll_max_pause[gen] = stats.gc.elapsed_ns;
    }

    stats.copied_bytes += stats.gc.copied_bytes;
    if (par_n_threads > 1) {
        stats.par_copied_bytes += stats.gc.copied_bytes;
        stats.cumulative_par_max_copied_bytes +=
            stats.gc.par_max_copied_bytes;
        stats.cumulative_par_balanced_copied_bytes +=
            stats.gc.par_balanced_copied_bytes;
        stats.any_work += any_work;
        stats.no_work += no_work;
        stats.scav_find_work += scav_find_work;
        stats.gc_spin_spin += gc_spin_spin;
        stats.gc_spin_yield += gc_spin_yield;
        stats.mut_spin_spin += mut_spin_spin;
        stats.mut_spin_yield += mut_spin_yield;
    }
    stats.gc_cpu_ns += stats.gc.cpu_ns;
    stats.gc_elapsed_ns += stats.gc.elapsed_ns;

    if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
        stats.major_gcs++;
        if (stats.gc.live_bytes > stats.max_live_bytes) {
            stats.max_live_bytes = stats.gc.live_bytes;
        }
        if (stats.gc.large_objects_bytes > stats.max_large_objects_bytes) {
            stats.max_large_objects_bytes = stats.gc.large_objects_bytes;
        }
        if (stats.gc.compact_bytes > stats.max_compact_bytes) {
            stats.max_compact_bytes = stats.gc.compact_bytes;
        }
        if (stats.gc.slop_bytes > stats.max_slop_bytes) {
            stats.max_slop_bytes = stats.gc.slop_bytes;
        }
        stats.cumulative_live_bytes += stats.gc.live_bytes;
    }

    // -------------------------------------------------
    // Do the more expensive bits only when stats are enabled.

    if (stats_enabled)
    {
        // -------------------------------------------------
        // Emit events to the event log

        // Has to be emitted while all caps stopped for GC, but before GC_END.
        // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
        // for a detailed design rationale of the current setup
        // of GC eventlog events.
        traceEventGcGlobalSync(cap);

        // Emitted before GC_END on all caps, which simplifies tools code.
        traceEventGcStats(cap,
                          CAPSET_HEAP_DEFAULT,
                          stats.gc.gen,
                          stats.gc.copied_bytes,
                          stats.gc.slop_bytes,
                          /* current loss due to fragmentation */
                          (mblocks_allocated * BLOCKS_PER_MBLOCK
                           - n_alloc_blocks) * BLOCK_SIZE,
                          par_n_threads,
                          stats.gc.par_max_copied_bytes,
                          stats.gc.copied_bytes,
                          stats.gc.par_balanced_copied_bytes);

        // Post EVENT_GC_END with the same timestamp as used for stats
        // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
        // Here, as opposed to other places, the event is emitted on the cap
        // that initiates the GC and external tools expect it to have the same
        // timestamp as used in +RTS -s calculations.
        traceEventGcEndAtT(cap, TimeToNS(stats.elapsed_ns));

        if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
            traceEventHeapLive(cap,
                               CAPSET_HEAP_DEFAULT,
                               stats.gc.live_bytes);
        }

        // -------------------------------------------------
        // Print GC stats to stdout or a file (+RTS -S/-s)

        if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
            W_ faults = getPageFaults();

            statsPrintf("%9" FMT_Word64 " %9" FMT_Word64 " %9" FMT_Word64,
                        stats.gc.allocated_bytes, stats.gc.copied_bytes,
                        stats.gc.live_bytes);

            statsPrintf(" %6.3f %6.3f %8.3f %8.3f %4"
                        FMT_Word " %4" FMT_Word " (Gen: %2d)\n",
                        TimeToSecondsDbl(stats.gc.cpu_ns),
                        TimeToSecondsDbl(stats.gc.elapsed_ns),
                        TimeToSecondsDbl(stats.cpu_ns),
                        TimeToSecondsDbl(stats.elapsed_ns),
                        faults - gct->gc_start_faults,
                        gct->gc_start_faults - GC_end_faults,
                        gen);

            GC_end_faults = faults;
            statsFlush();
        }


        if (rtsConfig.gcDoneHook != NULL) {
            rtsConfig.gcDoneHook(&stats.gc);
        }

        traceEventHeapSize(cap,
                           CAPSET_HEAP_DEFAULT,
                           mblocks_allocated * MBLOCK_SIZE);
    }
}

/* -----------------------------------------------------------------------------
   Called at the beginning of each Retainer Profiling
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_startRP(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    RP_start_time = user;
    RPe_start_time = elapsed;
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of each Retainer Profiling
   -------------------------------------------------------------------------- */

#if defined(PROFILING)
void
stat_endRP(
    uint32_t retainerGeneration,
#if defined(DEBUG_RETAINER)
    uint32_t maxCStackSize,
    int maxStackSize,
#endif
    double averageNumVisit)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    RP_tot_time += user - RP_start_time;
    RPe_tot_time += elapsed - RPe_start_time;

    fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
            retainerGeneration, mut_user_time_during_RP());
#if defined(DEBUG_RETAINER)
    fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
    fprintf(prof_file, "\tMax auxiliary stack size = %d\n", maxStackSize);
#endif
    fprintf(prof_file, "\tAverage number of visits per object = %f\n",
            averageNumVisit);
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the beginning of each heap census
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_startHeapCensus(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    HC_start_time = user;
    HCe_start_time = elapsed;
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of each heap census
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_endHeapCensus(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    HC_tot_time += user - HC_start_time;
    HCe_tot_time += elapsed - HCe_start_time;
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of execution

   NOTE: number of allocations is not entirely accurate: it doesn't
   take into account the few bytes at the end of the heap that
   were left unused when the heap-check failed.
   -------------------------------------------------------------------------- */

#if defined(DEBUG)
#define TICK_VAR_INI(arity) \
  StgInt SLOW_CALLS_##arity = 1; \
  StgInt RIGHT_ARITY_##arity = 1; \
  StgInt TAGGED_PTR_##arity = 0;

TICK_VAR_INI(1)
TICK_VAR_INI(2)

StgInt TOTAL_CALLS=1;
#endif

/* Report the value of a counter */
#define REPORT(counter) \
  { \
    showStgWord64(counter,temp,true/*commas*/); \
    statsPrintf("  (" #counter ")  : %s\n",temp); \
  }

/* Report the value of a counter as a percentage of another counter */
#define REPORT_PCT(counter,countertot) \
  statsPrintf("  (" #counter ") %% of (" #countertot ") : %.1f%%\n", \
              counter*100.0/countertot)

#define TICK_PRINT(arity) \
  REPORT(SLOW_CALLS_##arity); \
  REPORT_PCT(RIGHT_ARITY_##arity,SLOW_CALLS_##arity); \
  REPORT_PCT(TAGGED_PTR_##arity,RIGHT_ARITY_##arity); \
  REPORT(RIGHT_ARITY_##arity); \
  REPORT(TAGGED_PTR_##arity)

#define TICK_PRINT_TOT(arity) \
  statsPrintf("  (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
              SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
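
/* For illustration (values hypothetical): REPORT(TOTAL_CALLS) prints a line
 * of the form
 *     (TOTAL_CALLS)  : 1,234
 * and REPORT_PCT(RIGHT_ARITY_1,SLOW_CALLS_1) prints a line of the form
 *     (RIGHT_ARITY_1) % of (SLOW_CALLS_1) : 95.0%
 */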

/*
Note [RTS Stats Reporting]
==========================

There are currently three reporting functions:
  * report_summary:
      Responsible for producing '+RTS -s' output.
      Will report internal counters if the RTS flag --internal-counters is
      passed. See Note [Internal Counter Stats].
  * report_machine_readable:
      Responsible for producing '+RTS -t --machine-readable' output.
  * report_one_line:
      Responsible for producing '+RTS -t' output.

Stats are accumulated into the global variable 'stats' as the program runs,
then in 'stat_exit' we do the following:
  * Finalise 'stats'. This involves setting final running times and
    allocations that have not yet been accounted for.
  * Create an RTSSummaryStats. This contains all data for reports that is not
    included in stats (because they do not make sense before the program has
    completed) or in a global variable.
  * Call the appropriate report_* function, passing the newly constructed
    RTSSummaryStats.

To ensure that the data in the different reports is kept consistent, the
report_* functions should not do any calculation, excepting unit changes and
formatting. If you need to add a new calculated field, add it to
RTSSummaryStats.

*/
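
/* A minimal sketch (simplified from stat_exit() below) of the flow this
 * note describes; flag checks and the individual calculations are elided:
 *
 *     RTSSummaryStats sum;
 *     init_RTSSummaryStats(&sum);   // zero 'sum', allocate its per-gen array
 *     // ... finalise 'stats', populate 'sum' ...
 *     report_summary(&sum);         // or report_machine_readable(&sum),
 *                                   // or report_one_line(&sum)
 *     free_RTSSummaryStats(&sum);   // always release the per-gen array
 */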


static void init_RTSSummaryStats(RTSSummaryStats* sum)
{
    const size_t sizeof_gc_summary_stats =
        RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);

    memset(sum, 0, sizeof(RTSSummaryStats));
    sum->gc_summary_stats =
        stgMallocBytes(sizeof_gc_summary_stats,
                       "alloc_RTSSummaryStats.gc_summary_stats");
    memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
}

static void free_RTSSummaryStats(RTSSummaryStats * sum)
{
    stgFree(sum->gc_summary_stats);
    sum->gc_summary_stats = NULL;
}
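
// Every init_RTSSummaryStats must be paired with a free_RTSSummaryStats.
// stat_exit() below does this on all paths out of the function, so the
// gc_summary_stats array is released even when no report is printed
// (which is what the commit title above refers to).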

static void report_summary(const RTSSummaryStats* sum)
{
    // We should do no calculation, other than unit changes and formatting,
    // and we should not use any data from outside of globals, sum and stats
    // here. See Note [RTS Stats Reporting]

    uint32_t g;
    char temp[512];
    showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
    statsPrintf("%16s bytes allocated in the heap\n", temp);

    showStgWord64(stats.copied_bytes, temp, true/*commas*/);
    statsPrintf("%16s bytes copied during GC\n", temp);

    if ( stats.major_gcs > 0 ) {
        showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
        statsPrintf("%16s bytes maximum residency (%" FMT_Word32
                    " sample(s))\n",
                    temp, stats.major_gcs);
    }

    showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
    statsPrintf("%16s bytes maximum slop\n", temp);

    statsPrintf("%16" FMT_Word64 " MB total memory in use (%"
                FMT_Word64 " MB lost due to fragmentation)\n\n",
                stats.max_mem_in_use_bytes / (1024 * 1024),
                sum->fragmentation_bytes / (1024 * 1024));

    /* Print garbage collections in each gen */
    statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        const GenerationSummaryStats * gen_stats =
            &sum->gc_summary_stats[g];
        statsPrintf("  Gen %2d     %5d colls"
                    ", %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
                    g, // REVIEWERS: this used to be gen->no;
                       // this can't ever be different, right?
                    gen_stats->collections,
                    gen_stats->par_collections,
                    TimeToSecondsDbl(gen_stats->cpu_ns),
                    TimeToSecondsDbl(gen_stats->elapsed_ns),
                    TimeToSecondsDbl(gen_stats->avg_pause_ns),
                    TimeToSecondsDbl(gen_stats->max_pause_ns));
    }

    statsPrintf("\n");

#if defined(THREADED_RTS)
    if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
        // See Note [Work Balance]
        statsPrintf("  Parallel GC work balance: "
                    "%.2f%% (serial 0%%, perfect 100%%)\n\n",
                    sum->work_balance * 100);
    }

    statsPrintf("  TASKS: %d "
                "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
                taskCount, sum->bound_task_count,
                peakWorkerCount, workerCount,
                n_capabilities);

    statsPrintf("  SPARKS: %" FMT_Word64
                " (%" FMT_Word " converted, %" FMT_Word " overflowed, %"
                FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
                sum->sparks_count,
                sum->sparks.converted, sum->sparks.overflowed,
                sum->sparks.dud, sum->sparks.gcd,
                sum->sparks.fizzled);
#endif

    statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.init_cpu_ns),
                TimeToSecondsDbl(stats.init_elapsed_ns));

    statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.mutator_cpu_ns),
                TimeToSecondsDbl(stats.mutator_elapsed_ns));
    statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.gc_cpu_ns),
                TimeToSecondsDbl(stats.gc_elapsed_ns));

#if defined(PROFILING)
    statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(sum->rp_cpu_ns),
                TimeToSecondsDbl(sum->rp_elapsed_ns));
    statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(sum->hc_cpu_ns),
                TimeToSecondsDbl(sum->hc_elapsed_ns));
#endif
    statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(sum->exit_cpu_ns),
                TimeToSecondsDbl(sum->exit_elapsed_ns));
    statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
                TimeToSecondsDbl(stats.cpu_ns),
                TimeToSecondsDbl(stats.elapsed_ns));
#if !defined(THREADED_RTS)
    statsPrintf("  %%GC     time  %5.1f%%  (%.1f%% elapsed)\n\n",
                sum->gc_cpu_percent * 100,
                sum->gc_elapsed_percent * 100);
#endif

    showStgWord64(sum->alloc_rate, temp, true/*commas*/);

    statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);

    statsPrintf("  Productivity %5.1f%% of total user, "
                "%.1f%% of total elapsed\n\n",
                sum->productivity_cpu_percent * 100,
                sum->productivity_elapsed_percent * 100);

    // See Note [Internal Counter Stats] for a description of the
    // following counters. If you add a counter here, please remember
    // to update the Note.
    if (RtsFlags.MiscFlags.internalCounters) {
#if defined(THREADED_RTS) && defined(PROF_SPIN)
        const int32_t col_width[] = {4, -30, 14, 14};
        statsPrintf("Internal Counters:\n");
        statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
                    , col_width[0], ""
                    , col_width[1], "SpinLock"
                    , col_width[2], "Spins"
                    , col_width[3], "Yields");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "gc_alloc_block_sync"
                    , col_width[2], gc_alloc_block_sync.spin
                    , col_width[3], gc_alloc_block_sync.yield);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "gc_spin"
                    , col_width[2], stats.gc_spin_spin
                    , col_width[3], stats.gc_spin_yield);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "mut_spin"
                    , col_width[2], stats.mut_spin_spin
                    , col_width[3], stats.mut_spin_yield);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_gc"
                    , col_width[2], whitehole_gc_spin
                    , col_width[3], "n/a");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_threadPaused"
                    , col_width[2], whitehole_threadPaused_spin
                    , col_width[3], "n/a");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_executeMessage"
                    , col_width[2], whitehole_executeMessage_spin
                    , col_width[3], "n/a");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_lockClosure"
                    , col_width[2], whitehole_lockClosure_spin
                    , col_width[3], whitehole_lockClosure_yield);
        // waitForGcThreads isn't really spin-locking (see the function)
        // but these numbers still seem useful.
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "waitForGcThreads"
                    , col_width[2], waitForGcThreads_spin
                    , col_width[3], waitForGcThreads_yield);

        for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
            int prefix_length = 0;
            statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
                        col_width[0], "", g, &prefix_length);
            prefix_length -= col_width[0];
            int suffix_length = col_width[1] + prefix_length;
            suffix_length =
                suffix_length > 0 ? col_width[1] : suffix_length;

            statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                        , suffix_length, "].sync"
                        , col_width[2], generations[g].sync.spin
                        , col_width[3], generations[g].sync.yield);
        }
        statsPrintf("\n");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "any_work"
                    , col_width[2], stats.any_work);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "no_work"
                    , col_width[2], stats.no_work);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "scav_find_work"
                    , col_width[2], stats.scav_find_work);
#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
        statsPrintf("Internal Counters require the RTS to be built "
                    "with PROF_SPIN"); // PROF_SPIN is not #defined here
#else // THREADED_RTS
        statsPrintf("Internal Counters require the threaded RTS");
#endif
    }
}

static void report_machine_readable (const RTSSummaryStats * sum)
{
    // We should do no calculation, other than unit changes and formatting,
    // and we should not use any data from outside of globals, sum and stats
    // here. See Note [RTS Stats Reporting]
    uint32_t g;

#define MR_STAT(field_name,format,value) \
    statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
#define MR_STAT_GEN(gen,field_name,format,value) \
    statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
                format "\")\n", g, value)
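
    // For illustration (value hypothetical): MR_STAT("num_GCs", FMT_Word32,
    // stats.gcs) emits a line of the form
    //      ,("num_GCs", "42")
    // extending the Haskell-readable association list that is opened with
    // '[' just below and closed with ']' at the end of this function.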

    // These first values are for backwards compatibility.
    // Some of these first fields are duplicated with more machine-readable
    // names, or to match the name in RtsStats.

    // We don't use the helper macro for the first field because the prefix
    // is different.
    statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
                stats.allocated_bytes);
    MR_STAT("num_GCs", FMT_Word32, stats.gcs);
    MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
    MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
    MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
    MR_STAT("peak_megabytes_allocated", FMT_Word64,
            stats.max_mem_in_use_bytes / (1024 * 1024));

    MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
    MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
    MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
    MR_STAT("mut_wall_seconds", "f",
            TimeToSecondsDbl(stats.mutator_elapsed_ns));
    MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
    MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));

    // end backward compatibility

    // First, the rest of the times

    MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
    MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
#if defined(PROFILING)
    MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
    MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
    MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hc_cpu_ns));
    MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hc_elapsed_ns));
#endif
    MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
    MR_STAT("total_wall_seconds", "f",
            TimeToSecondsDbl(stats.elapsed_ns));

    // next, the remainder of the fields of RTSStats, except internal counters

    // The first two are duplicates of those above, but have more machine
    // readable names that match the field names in RTSStats.


    // gcs has been done as num_GCs above
    MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
    MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
    MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
    MR_STAT("max_large_objects_bytes", FMT_Word64,
            stats.max_large_objects_bytes);
    MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
    MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
    // This duplicates, except for unit, peak_megabytes_allocated above
    MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
    MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
    MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
    MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
    MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
            stats.cumulative_par_max_copied_bytes);
    MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
            stats.cumulative_par_balanced_copied_bytes);

    // next, the computed fields in RTSSummaryStats
#if !defined(THREADED_RTS)
    MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
    MR_STAT("gc_wall_percent", "f", sum->gc_elapsed_percent);
#endif
    MR_STAT("fragmentation_bytes", FMT_Word64, sum->fragmentation_bytes);
    // average_bytes_used is done above
    MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
    MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
    MR_STAT("productivity_wall_percent", "f",
            sum->productivity_elapsed_percent);

    // next, the THREADED_RTS fields in RTSSummaryStats

#if defined(THREADED_RTS)
    MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
    MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
    MR_STAT("sparks_converted", FMT_Word, sum->sparks.converted);
    MR_STAT("sparks_overflowed", FMT_Word, sum->sparks.overflowed);
    MR_STAT("sparks_dud", FMT_Word, sum->sparks.dud);
    MR_STAT("sparks_gcd", FMT_Word, sum->sparks.gcd);
    MR_STAT("sparks_fizzled", FMT_Word, sum->sparks.fizzled);
    MR_STAT("work_balance", "f", sum->work_balance);

    // next, globals (other than internal counters)
    MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
    MR_STAT("task_count", FMT_Word32, taskCount);
    MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
    MR_STAT("worker_count", FMT_Word32, workerCount);

    // next, internal counters
#if defined(PROF_SPIN)
    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
    MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
            gc_alloc_block_sync.yield);
    MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
    MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
    MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
    MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
    MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
    MR_STAT("waitForGcThreads_yield", FMT_Word64,
            waitForGcThreads_yield);
    MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
    MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
            whitehole_lockClosure_spin);
    MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
            whitehole_lockClosure_yield);
    MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
            whitehole_executeMessage_spin);
    MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
            whitehole_threadPaused_spin);
    MR_STAT("any_work", FMT_Word64,
            stats.any_work);
    MR_STAT("no_work", FMT_Word64,
            stats.no_work);
    MR_STAT("scav_find_work", FMT_Word64,
            stats.scav_find_work);
#endif // PROF_SPIN
#endif // THREADED_RTS

    // finally, per-generation stats. Named as, for example for generation 0,
    // gen_0_collections
    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
        MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
        MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
        MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
        MR_STAT_GEN(g, "wall_seconds", "f",
                    TimeToSecondsDbl(gc_sum->elapsed_ns));
        MR_STAT_GEN(g, "max_pause_seconds", "f",
                    TimeToSecondsDbl(gc_sum->max_pause_ns));
        MR_STAT_GEN(g, "avg_pause_seconds", "f",
                    TimeToSecondsDbl(gc_sum->avg_pause_ns));
#if defined(THREADED_RTS) && defined(PROF_SPIN)
        MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
        MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
#endif
    }

    statsPrintf(" ]\n");
}

static void report_one_line(const RTSSummaryStats * sum)
{
    // We should do no calculation, other than unit changes and formatting,
    // and we should not use any data from outside of globals, sum and stats
    // here. See Note [RTS Stats Reporting]

    /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
       mingw-0.5) */
    statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
                "%" FMT_Word32 " GCs, "
                "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
                "(%" FMT_Word32 " samples), "
                "%" FMT_Word64 "M in use, "
                "%.3f INIT (%.3f elapsed), "
                "%.3f MUT (%.3f elapsed), "
                "%.3f GC (%.3f elapsed) :ghc>>\n",
                stats.allocated_bytes,
                stats.gcs,
                sum->average_bytes_used,
                stats.max_live_bytes,
                stats.major_gcs,
                stats.max_mem_in_use_bytes / (1024 * 1024),
                TimeToSecondsDbl(stats.init_cpu_ns),
                TimeToSecondsDbl(stats.init_elapsed_ns),
                TimeToSecondsDbl(stats.mutator_cpu_ns),
                TimeToSecondsDbl(stats.mutator_elapsed_ns),
                TimeToSecondsDbl(stats.gc_cpu_ns),
                TimeToSecondsDbl(stats.gc_elapsed_ns));
}

void
stat_exit (void)
{
    RTSSummaryStats sum;
    init_RTSSummaryStats(&sum);

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
        // First we tidy the times in stats, and populate the times in sum.
        // In particular, we adjust the gc_* time in stats to remove
        // profiling times.
        {
            Time now_cpu_ns, now_elapsed_ns;
            getProcessTimes(&now_cpu_ns, &now_elapsed_ns);

            stats.cpu_ns = now_cpu_ns - start_init_cpu;
            stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
            /* avoid divide by zero if stats.cpu_ns is measured as 0.00
               seconds -- SDM */
            if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
            if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }

#if defined(PROFILING)
            sum.rp_cpu_ns = RP_tot_time;
            sum.rp_elapsed_ns = RPe_tot_time;
            sum.hc_cpu_ns = HC_tot_time;
            sum.hc_elapsed_ns = HCe_tot_time;
#endif // PROFILING

            // We do a GC during the EXIT phase. We'll attribute the cost of
            // that to GC instead of EXIT, so carefully subtract it from the
            // EXIT time.
            // Note that exit_gc includes RP and HC for the exit GC too.
            Time exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
            Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;

            ASSERT(exit_gc_elapsed > 0);

            sum.exit_cpu_ns = end_exit_cpu
                - start_exit_cpu
                - exit_gc_cpu;
            sum.exit_elapsed_ns = end_exit_elapsed
                - start_exit_elapsed
                - exit_gc_elapsed;

            ASSERT(sum.exit_elapsed_ns >= 0);

            stats.mutator_cpu_ns = start_exit_cpu
                - end_init_cpu
                - (stats.gc_cpu_ns - exit_gc_cpu);
            stats.mutator_elapsed_ns = start_exit_elapsed
                - end_init_elapsed
                - (stats.gc_elapsed_ns - exit_gc_elapsed);

            ASSERT(stats.mutator_elapsed_ns >= 0);

            if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }

            // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
            // and subtracting, so the parts should add up to the total exactly.
            // Note that stats.cpu_ns and stats.elapsed_ns are captured a tiny
            // bit later than end_exit_elapsed, so we don't use them here.
            ASSERT(stats.init_elapsed_ns // INIT
                   + stats.mutator_elapsed_ns // MUT
                   + stats.gc_elapsed_ns // GC
                   + sum.exit_elapsed_ns // EXIT
                   == end_exit_elapsed - start_init_elapsed);

            // heapCensus() is called by the GC, so RP and HC time are
            // included in the GC stats. We therefore subtract them to
            // obtain the actual GC cpu time.
            Time prof_cpu = sum.rp_cpu_ns + sum.hc_cpu_ns;
            Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;

            stats.gc_cpu_ns -= prof_cpu;
            stats.gc_elapsed_ns -= prof_elapsed;

            // This assertion is probably not necessary; make sure the
            // subdivision with PROF also makes sense
            ASSERT(stats.init_elapsed_ns // INIT
                   + stats.mutator_elapsed_ns // MUT
                   + stats.gc_elapsed_ns // GC
                   + sum.exit_elapsed_ns // EXIT
                   + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
                   == end_exit_elapsed - start_init_elapsed);
        }

        // REVIEWERS: it's not clear to me why the following isn't done in
        // stat_endGC of the last garbage collection.

        // We account for the last garbage collection
        {
            uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
            stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
            stats.allocated_bytes = tot_alloc_bytes;
            if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
                statsPrintf("%9" FMT_Word " %9.9s %9.9s",
                            (W_)stats.gc.allocated_bytes, "", "");
                statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
            }
        }

        // We populate the remainder (non-time elements) of sum
        {
#if defined(THREADED_RTS)
            sum.bound_task_count = taskCount - workerCount;

            for (uint32_t i = 0; i < n_capabilities; i++) {
                sum.sparks.created += capabilities[i]->spark_stats.created;
                sum.sparks.dud += capabilities[i]->spark_stats.dud;
                sum.sparks.overflowed +=
                    capabilities[i]->spark_stats.overflowed;
                sum.sparks.converted +=
                    capabilities[i]->spark_stats.converted;
                sum.sparks.gcd += capabilities[i]->spark_stats.gcd;
                sum.sparks.fizzled += capabilities[i]->spark_stats.fizzled;
            }

            sum.sparks_count = sum.sparks.created
                + sum.sparks.dud
                + sum.sparks.overflowed;

            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
                // See Note [Work Balance]
                sum.work_balance =
                    (double)stats.cumulative_par_balanced_copied_bytes
                    / (double)stats.par_copied_bytes;
            } else {
                sum.work_balance = 0;
            }


#else // THREADED_RTS
            // Cast to double before dividing: these are integer Time values,
            // and integer division would truncate the ratio to zero.
            sum.gc_cpu_percent = (double)stats.gc_cpu_ns
                / (double)stats.cpu_ns;
            sum.gc_elapsed_percent = (double)stats.gc_elapsed_ns
                / (double)stats.elapsed_ns;
#endif // THREADED_RTS

            sum.fragmentation_bytes =
                (uint64_t)(peak_mblocks_allocated
                           * BLOCKS_PER_MBLOCK
                           * BLOCK_SIZE_W
                           - hw_alloc_blocks * BLOCK_SIZE_W)
                * (uint64_t)sizeof(W_);

            sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
                stats.cumulative_live_bytes/stats.major_gcs;

            sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
                (uint64_t)((double)stats.allocated_bytes
                           / TimeToSecondsDbl(stats.mutator_cpu_ns));

            // REVIEWERS: these two values didn't use to include the exit times
            sum.productivity_cpu_percent =
                TimeToSecondsDbl(stats.cpu_ns
                                 - stats.gc_cpu_ns
                                 - stats.init_cpu_ns
                                 - sum.exit_cpu_ns)
                / TimeToSecondsDbl(stats.cpu_ns);

            ASSERT(sum.productivity_cpu_percent >= 0);

            sum.productivity_elapsed_percent =
                TimeToSecondsDbl(stats.elapsed_ns
                                 - stats.gc_elapsed_ns
                                 - stats.init_elapsed_ns
                                 - sum.exit_elapsed_ns)
                / TimeToSecondsDbl(stats.elapsed_ns);

            ASSERT(sum.productivity_elapsed_percent >= 0);

            for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
                const generation* gen = &generations[g];
                GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
                gen_stats->collections = gen->collections;
                gen_stats->par_collections = gen->par_collections;
                gen_stats->cpu_ns = GC_coll_cpu[g];
                gen_stats->elapsed_ns = GC_coll_elapsed[g];
                gen_stats->max_pause_ns = GC_coll_max_pause[g];
                gen_stats->avg_pause_ns = gen->collections == 0 ?
                    0 : (GC_coll_elapsed[g] / gen->collections);
#if defined(THREADED_RTS) && defined(PROF_SPIN)
                gen_stats->sync_spin = gen->sync.spin;
                gen_stats->sync_yield = gen->sync.yield;
#endif // PROF_SPIN
            }
        }

        // Now we generate the report
        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
            report_summary(&sum);
        }

        if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
            if (RtsFlags.MiscFlags.machineReadable) {
                report_machine_readable(&sum);
            }
            else {
                report_one_line(&sum);
            }
        }

        statsFlush();
        statsClose();
    }

    free_RTSSummaryStats(&sum);

    if (GC_coll_cpu) {
        stgFree(GC_coll_cpu);
        GC_coll_cpu = NULL;
    }
    if (GC_coll_elapsed) {
        stgFree(GC_coll_elapsed);
        GC_coll_elapsed = NULL;
    }
    if (GC_coll_max_pause) {
        stgFree(GC_coll_max_pause);
        GC_coll_max_pause = NULL;
    }
}

/* Note [Work Balance]
   ----------------------
Work balance is a measure of how evenly the work done during parallel garbage
collection is spread across threads. To compute work balance we must take care
to account for the number of GC threads changing between GCs. The statistics we
track must have the number of GC threads "integrated out".

We accumulate two values from each garbage collection:
* par_copied: a measure of the total work done during parallel garbage
  collection
* par_balanced_copied: a measure of the balanced work done
  during parallel garbage collection.

par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more
complicated:

For a given garbage collection:
Let gc_copied       := total copies during the gc
    gc_copied_i     := copies by the ith thread during the gc
    num_gc_threads  := the number of threads participating in the gc
    balance_limit   := (gc_copied / num_gc_threads)

If we were to graph gc_copied_i, sorted from largest to smallest we would see
something like:

          |X
    ^     |X X
    |     |X X X            X: unbalanced copies
   copies |-----------      Y: balanced copies by the busiest GC thread
          |Y Z Z Z          Z: other balanced copies
          |Y Z Z Z
          |Y Z Z Z Z
          |Y Z Z Z Z Z
          |===========
          |1 2 3 4 5 6
              i ->

where the --- line is at balance_limit. Balanced copies are those under the ---
line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
always equal balance_limit. Completely balanced gc has every thread copying
balance_limit and a completely unbalanced gc has a single thread copying
gc_copied.

One could define par_balance_copied as the areas of the Ys and Zs in the graph
above, however we would like the ratio of (par_balance_copied / gc_copied) to
range from 0 to 1, so that work_balance will be a nice percentage, also ranging
from 0 to 1. We therefore define par_balanced_copied as:

                                                           ( num_gc_threads )
  {Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
    i                                                      (num_gc_threads - 1)
                                             vvv                   vvv
                                              S                     T

Where the S and T terms serve to remove the area of the Ys, and
to normalize the result to lie between 0 and gc_copied.

Note that the implementation orders these operations differently to minimize
error due to integer rounding.
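
As a worked example (numbers invented for illustration): suppose
num_gc_threads = 2 and the two threads copy 100 and 50 bytes respectively,
so gc_copied = 150 and balance_limit = 75. Then

  Sum[Min(gc_copied_i, balance_limit)] = min(100,75) + min(50,75) = 125
  par_balanced_copied = (125 - 75) * (2 / (2 - 1)) = 100

giving a work balance of 100/150 ~= 0.67. A perfectly balanced collection
(75 and 75) gives 150/150 = 1, and a completely unbalanced one (150 and 0)
gives 0/150 = 0, as intended.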

Then cumulative work balance is computed as
(cumulative_par_balanced_copied_bytes / par_copied_bytes)

Previously, cumulative work balance was computed as:

     (cumulative_par_max_copied_bytes)
     (-------------------------------) - 1
     (      par_copied_bytes         )
     -------------------------------------
             (n_capabilities - 1)

This was less accurate than the current method, and invalid whenever a garbage
collection had occurred with num_gc_threads /= n_capabilities; which can happen
when setNumCapabilities is called, when -qn is passed as an RTS option, or when
the number of gc threads is limited to the number of cores.
See #13830
*/

/*
Note [Internal Counter Stats]
-----------------------------
What do the counts at the end of a '+RTS -s --internal-counters' report mean?
They are detailed below. Most of these counters are used by multiple threads
with no attempt at synchronisation. This means that reported values may be
lower than the true value and this becomes more likely and more severe as
contention increases.

The first counters are for various SpinLock-like constructs in the RTS. See
Spinlock.h for the definition of a SpinLock. We maintain up to two counters
per SpinLock:
* spin: The number of busy-spins over the length of the program.
* yield: The number of times the SpinLock spun SPIN_COUNT times without success
  and called yieldThread().
Not all of these are actual SpinLocks, see the details below.

Actual SpinLocks:
* gc_alloc_block:
    This SpinLock protects the block allocator and free list manager. See
    BlockAlloc.c.
* gc_spin and mut_spin:
    These SpinLocks are used to herd gc worker threads during parallel garbage
    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
* gen[g].sync:
    These SpinLocks, one per generation, protect the generations[g] data
    structure during garbage collection.

waitForGcThreads:
  These counters are incremented while we wait for all threads to be ready
  for a parallel garbage collection. We yield more than we spin in this case.

In several places in the runtime we must take a lock on a closure. To do this,
we replace its info table with stg_WHITEHOLE_info, spinning if it is already
a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
don't. We count these white-hole spins and include them in the SpinLocks table.
If a particular loop does not yield, we put "n/a" in the table. They are named
for the function that has the spinning loop, except that several loops in the
garbage collector accumulate into whitehole_gc.
TODO: Should these counters be more or less granular?

white-hole spin counters:
* whitehole_gc
* whitehole_lockClosure
* whitehole_executeMessage
* whitehole_threadPaused


We count the number of calls of several functions in the parallel garbage
collector.

Parallel garbage collector counters:
* any_work:
    A cheap function called whenever a gc_thread is ready for work. Does
    not do any work.
* no_work:
    Incremented whenever any_work finds no work.
* scav_find_work:
    Called to do work when any_work returns true.

*/
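
/* As a hedged sketch only (the real code lives in Spinlock.h and differs in
 * detail; 'try_acquire' here is a hypothetical stand-in for the CAS-based
 * acquisition), the spin and yield counters described above correspond to a
 * loop of roughly this shape:
 *
 *     while (1) {
 *         for (int i = 0; i < SPIN_COUNT; i++) {
 *             if (try_acquire(lock)) return;   // got the lock
 *             lock->spin++;                    // one busy-spin
 *         }
 *         lock->yield++;   // spun SPIN_COUNT times without success
 *         yieldThread();   // give up the core before trying again
 *     }
 */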

/* -----------------------------------------------------------------------------
   stat_describe_gens

   Produce some detailed info on the state of the generational GC.
   -------------------------------------------------------------------------- */
void
statDescribeGens(void)
{
    uint32_t g, mut, lge, compacts, i;
    W_ gen_slop;
    W_ tot_live, tot_slop;
    W_ gen_live, gen_blocks;
    bdescr *bd;
    generation *gen;

    debugBelch(
"----------------------------------------------------------------------\n"
"  Gen     Max  Mut-list  Blocks    Large  Compacts      Live      Slop\n"
"       Blocks     Bytes          Objects                              \n"
"----------------------------------------------------------------------\n");

    tot_live = 0;
    tot_slop = 0;

    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        gen = &generations[g];

        for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
            lge++;
        }

        for (bd = gen->compact_objects, compacts = 0; bd; bd = bd->link) {
            compacts++;
        }

        gen_live = genLiveWords(gen);
        gen_blocks = genLiveBlocks(gen);

        mut = 0;
        for (i = 0; i < n_capabilities; i++) {
            mut += countOccupied(capabilities[i]->mut_lists[g]);

            // Add the pinned object block.
            bd = capabilities[i]->pinned_object_block;
            if (bd != NULL) {
                gen_live += bd->free - bd->start;
                gen_blocks += bd->blocks;
            }

            gen_live += gcThreadLiveWords(i,g);
            gen_blocks += gcThreadLiveBlocks(i,g);
        }

        debugBelch("%5d %7" FMT_Word " %9d", g, (W_)gen->max_blocks, mut);

        gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;

        debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n",
                   gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
                   gen_slop*(W_)sizeof(W_));
        tot_live += gen_live;
        tot_slop += gen_slop;
    }
    debugBelch("----------------------------------------------------------------------\n");
    debugBelch("%51s%9" FMT_Word " %9" FMT_Word "\n",
               "",tot_live*(W_)sizeof(W_),tot_slop*(W_)sizeof(W_));
    debugBelch("----------------------------------------------------------------------\n");
    debugBelch("\n");
}

/* -----------------------------------------------------------------------------
   Stats available via a programmatic interface, so eg. GHCi can time
   each compilation and expression evaluation.
   -------------------------------------------------------------------------- */

uint64_t getAllocations( void )
{
    return stats.allocated_bytes;
}

int getRTSStatsEnabled( void )
{
    return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
}

void getRTSStats( RTSStats *s )
{
    Time current_elapsed = 0;
    Time current_cpu = 0;

    *s = stats;

    getProcessTimes(&current_cpu, &current_elapsed);
    s->cpu_ns = current_cpu - end_init_cpu;
    s->elapsed_ns = current_elapsed - end_init_elapsed;

    s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns;
    s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
        stats.gc_elapsed_ns;
}
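
/* Example use of the programmatic interface above (hypothetical caller):
 *
 *     RTSStats s;
 *     if (getRTSStatsEnabled()) {
 *         getRTSStats(&s);
 *         // e.g. s.allocated_bytes, s.gcs and s.gc_cpu_ns are now populated;
 *         // s.cpu_ns and s.elapsed_ns are measured from the end of RTS init.
 *     }
 */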

/* -----------------------------------------------------------------------------
   Dumping stuff in the stats file, or via the debug message interface
   -------------------------------------------------------------------------- */

void
statsPrintf( char *s, ... )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    va_list ap;

    va_start(ap,s);
    if (sf == NULL) {
        vdebugBelch(s,ap);
    } else {
        vfprintf(sf, s, ap);
    }
    va_end(ap);
}

static void
statsFlush( void )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    if (sf != NULL) {
        fflush(sf);
    }
}

static void
statsClose( void )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    if (sf != NULL) {
        fclose(sf);
    }
}