1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 1998-2005
4 *
5 * Statistics and timing-related functions.
6 *
7 * ---------------------------------------------------------------------------*/
8
9 #include "PosixSource.h"
10 #include "Rts.h"
11
12 #include "RtsFlags.h"
13 #include "RtsUtils.h"
14 #include "Schedule.h"
15 #include "Stats.h"
16 #include "Profiling.h"
17 #include "GetTime.h"
18 #include "sm/Storage.h"
19 #include "sm/GCThread.h"
20 #include "sm/BlockAlloc.h"
21
22 // for spin/yield counters
23 #include "sm/GC.h"
24 #include "ThreadPaused.h"
25 #include "Messages.h"
26
27 #include <string.h> // for memset
28
29 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
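// For example, with TIME_RESOLUTION = 1000000000 (i.e. Time measured in
// nanoseconds, as the *_ns field names below suggest),
// TimeToSecondsDbl(2500000000) is 2.5.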
30
31 static Time
32 start_init_cpu, start_init_elapsed,
33 end_init_cpu, end_init_elapsed,
34 start_exit_cpu, start_exit_elapsed,
35 start_exit_gc_elapsed, start_exit_gc_cpu,
36 end_exit_cpu, end_exit_elapsed;
37
38 #if defined(PROFILING)
39 static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time
40 static Time RPe_start_time = 0, RPe_tot_time = 0; // retainer prof elap time
41
42 static Time HC_start_time, HC_tot_time = 0; // heap census prof user time
43 static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
44 #endif
45
46 #if defined(PROFILING)
47 #define PROF_VAL(x) (x)
48 #else
49 #define PROF_VAL(x) 0
50 #endif
51
52 #if defined(PROF_SPIN)
53 volatile StgWord64 whitehole_lockClosure_spin = 0;
54 volatile StgWord64 whitehole_lockClosure_yield = 0;
55 volatile StgWord64 whitehole_threadPaused_spin = 0;
56 volatile StgWord64 whitehole_executeMessage_spin = 0;
57 #endif
58
59 //
60 // All the stats!
61 //
62 // This is where we accumulate all the stats during execution, and it's also
63 // in a convenient form that we can copy over to a caller of getRTSStats().
64 //
65 static RTSStats stats;
66
67 static W_ GC_end_faults = 0;
68
69 static Time *GC_coll_cpu = NULL;
70 static Time *GC_coll_elapsed = NULL;
71 static Time *GC_coll_max_pause = NULL;
72
73 static void statsPrintf( char *s, ... ) GNUC3_ATTRIBUTE(format (PRINTF, 1, 2));
74 static void statsFlush( void );
75 static void statsClose( void );
76
77 /* -----------------------------------------------------------------------------
78 Current elapsed time
79 ------------------------------------------------------------------------- */
80
81 Time stat_getElapsedTime(void)
82 {
83 return getProcessElapsedTime() - start_init_elapsed;
84 }
85
86 /* ---------------------------------------------------------------------------
87 Measure the current MUT time, for profiling
88 ------------------------------------------------------------------------ */
89
90 double
91 mut_user_time_until( Time t )
92 {
93 return TimeToSecondsDbl(t - stats.gc_cpu_ns);
94 // heapCensus() time is included in GC_tot_cpu, so we don't need
95 // to subtract it here.
96
97 // TODO: This seems wrong to me. Surely we should be subtracting
98 // (at least) start_init_cpu?
99 }
100
101 double
102 mut_user_time( void )
103 {
104 Time cpu;
105 cpu = getProcessCPUTime();
106 return mut_user_time_until(cpu);
107 }
108
109 #if defined(PROFILING)
110 /*
111 mut_user_time_during_RP() returns the MUT time during retainer profiling.
112 Likewise for mut_user_time_during_HC().
113 */
114 static double
115 mut_user_time_during_RP( void )
116 {
117 return TimeToSecondsDbl(RP_start_time - stats.gc_cpu_ns - RP_tot_time);
118 }
119
120 #endif /* PROFILING */
121
122 /* ---------------------------------------------------------------------------
123 initStats0() has no dependencies, it can be called right at the beginning
124 ------------------------------------------------------------------------ */
125
126 void
127 initStats0(void)
128 {
129 start_init_cpu = 0;
130 start_init_elapsed = 0;
131 end_init_cpu = 0;
132 end_init_elapsed = 0;
133
134 start_exit_cpu = 0;
135 start_exit_elapsed = 0;
136 start_exit_gc_cpu = 0;
137 start_exit_gc_elapsed = 0;
138 end_exit_cpu = 0;
139 end_exit_elapsed = 0;
140
141 #if defined(PROFILING)
142 RP_start_time = 0;
143 RP_tot_time = 0;
144 RPe_start_time = 0;
145 RPe_tot_time = 0;
146
147 HC_start_time = 0;
148 HC_tot_time = 0;
149 HCe_start_time = 0;
150 HCe_tot_time = 0;
151 #endif
152
153 GC_end_faults = 0;
154
155 stats = (RTSStats) {
156 .gcs = 0,
157 .major_gcs = 0,
158 .allocated_bytes = 0,
159 .max_live_bytes = 0,
160 .max_large_objects_bytes = 0,
161 .max_compact_bytes = 0,
162 .max_slop_bytes = 0,
163 .max_mem_in_use_bytes = 0,
164 .cumulative_live_bytes = 0,
165 .copied_bytes = 0,
166 .par_copied_bytes = 0,
167 .cumulative_par_max_copied_bytes = 0,
168 .cumulative_par_balanced_copied_bytes = 0,
169 .gc_spin_spin = 0,
170 .gc_spin_yield = 0,
171 .mut_spin_spin = 0,
172 .mut_spin_yield = 0,
173 .any_work = 0,
174 .no_work = 0,
175 .scav_find_work = 0,
176 .init_cpu_ns = 0,
177 .init_elapsed_ns = 0,
178 .mutator_cpu_ns = 0,
179 .mutator_elapsed_ns = 0,
180 .gc_cpu_ns = 0,
181 .gc_elapsed_ns = 0,
182 .cpu_ns = 0,
183 .elapsed_ns = 0,
184 .gc = {
185 .gen = 0,
186 .threads = 0,
187 .allocated_bytes = 0,
188 .live_bytes = 0,
189 .large_objects_bytes = 0,
190 .compact_bytes = 0,
191 .slop_bytes = 0,
192 .mem_in_use_bytes = 0,
193 .copied_bytes = 0,
194 .par_max_copied_bytes = 0,
195 .par_balanced_copied_bytes = 0,
196 .sync_elapsed_ns = 0,
197 .cpu_ns = 0,
198 .elapsed_ns = 0
199 }
200 };
201 }
202
203 /* ---------------------------------------------------------------------------
204 initStats1() can be called after setupRtsFlags()
205 ------------------------------------------------------------------------ */
206
207 void
208 initStats1 (void)
209 {
210 uint32_t i;
211
212 if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
213 statsPrintf("    Alloc    Copied     Live     GC     GC      TOT      TOT  Page Flts\n");
214 statsPrintf("    bytes     bytes     bytes   user   elap     user     elap\n");
215 }
216 GC_coll_cpu =
217 (Time *)stgMallocBytes(
218 sizeof(Time)*RtsFlags.GcFlags.generations,
219 "initStats");
220 GC_coll_elapsed =
221 (Time *)stgMallocBytes(
222 sizeof(Time)*RtsFlags.GcFlags.generations,
223 "initStats");
224 GC_coll_max_pause =
225 (Time *)stgMallocBytes(
226 sizeof(Time)*RtsFlags.GcFlags.generations,
227 "initStats");
228 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
229 GC_coll_cpu[i] = 0;
230 GC_coll_elapsed[i] = 0;
231 GC_coll_max_pause[i] = 0;
232 }
233 }
234
235 /* -----------------------------------------------------------------------------
236 Initialisation time...
237 -------------------------------------------------------------------------- */
238
239 void
240 stat_startInit(void)
241 {
242 getProcessTimes(&start_init_cpu, &start_init_elapsed);
243 }
244
245 void
246 stat_endInit(void)
247 {
248 getProcessTimes(&end_init_cpu, &end_init_elapsed);
249 stats.init_cpu_ns = end_init_cpu - start_init_cpu;
250 stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
251 }
252
253 /* -----------------------------------------------------------------------------
254 stat_startExit and stat_endExit
255
256 These two measure the time taken in shutdownHaskell().
257 -------------------------------------------------------------------------- */
258
259 void
260 stat_startExit(void)
261 {
262 getProcessTimes(&start_exit_cpu, &start_exit_elapsed);
263 start_exit_gc_elapsed = stats.gc_elapsed_ns;
264 start_exit_gc_cpu = stats.gc_cpu_ns;
265 }
266
267 void
268 stat_endExit(void)
269 {
270 getProcessTimes(&end_exit_cpu, &end_exit_elapsed);
271 }
272
273 void
274 stat_startGCSync (gc_thread *gct)
275 {
276 gct->gc_sync_start_elapsed = getProcessElapsedTime();
277 }
278
279 /* -----------------------------------------------------------------------------
280 Called at the beginning of each GC
281 -------------------------------------------------------------------------- */
282
283 void
284 stat_startGC (Capability *cap, gc_thread *gct)
285 {
286 if (RtsFlags.GcFlags.ringBell) {
287 debugBelch("\007");
288 }
289
290 getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
291
292 // Post EVENT_GC_START with the same timestamp as used for stats
293 // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
294 // Here, as opposed to other places, the event is emitted on the cap
295 // that initiates the GC and external tools expect it to have the same
296 // timestamp as used in +RTS -s calculations.
297 traceEventGcStartAtT(cap,
298 TimeToNS(gct->gc_start_elapsed - start_init_elapsed));
299
300 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
301 {
302 gct->gc_start_faults = getPageFaults();
303 }
304
305 updateNurseriesStats();
306 }
307
308 /* -----------------------------------------------------------------------------
309 Called at the end of each GC
310 -------------------------------------------------------------------------- */
311
312 void
313 stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
314 uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
315 W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
316 W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
317 W_ scav_find_work)
318 {
319 // -------------------------------------------------
320 // Collect all the stats about this GC in stats.gc. We always do this since
321 // it's relatively cheap and we need allocated_bytes to catch heap
322 // overflows.
323
324 stats.gc.gen = gen;
325 stats.gc.threads = par_n_threads;
326
327 uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
328
329 // allocated since the last GC
330 stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
331
332 stats.gc.live_bytes = live * sizeof(W_);
333 stats.gc.large_objects_bytes = calcTotalLargeObjectsW() * sizeof(W_);
334 stats.gc.compact_bytes = calcTotalCompactW() * sizeof(W_);
335 stats.gc.slop_bytes = slop * sizeof(W_);
336 stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
337 stats.gc.copied_bytes = copied * sizeof(W_);
338 stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
339 stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);
340
341 bool stats_enabled =
342 RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
343 rtsConfig.gcDoneHook != NULL;
344
345 if (stats_enabled
346 || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs GC_tot_time
347 {
348 // We only update the times when stats are explicitly enabled since
349 // getProcessTimes (which may require a system call) can be expensive on
350 // some platforms.
351 Time current_cpu, current_elapsed;
352 getProcessTimes(&current_cpu, &current_elapsed);
353 stats.cpu_ns = current_cpu - start_init_cpu;
354 stats.elapsed_ns = current_elapsed - start_init_elapsed;
355
356 stats.gc.sync_elapsed_ns =
357 gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
358 stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
359 stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
360 }
361 // -------------------------------------------------
362 // Update the cumulative stats
363
364 stats.gcs++;
365 stats.allocated_bytes = tot_alloc_bytes;
366 stats.max_mem_in_use_bytes = peak_mblocks_allocated * MBLOCK_SIZE;
367
368 GC_coll_cpu[gen] += stats.gc.cpu_ns;
369 GC_coll_elapsed[gen] += stats.gc.elapsed_ns;
370 if (GC_coll_max_pause[gen] < stats.gc.elapsed_ns) {
371 GC_coll_max_pause[gen] = stats.gc.elapsed_ns;
372 }
373
374 stats.copied_bytes += stats.gc.copied_bytes;
375 if (par_n_threads > 1) {
376 stats.par_copied_bytes += stats.gc.copied_bytes;
377 stats.cumulative_par_max_copied_bytes +=
378 stats.gc.par_max_copied_bytes;
379 stats.cumulative_par_balanced_copied_bytes +=
380 stats.gc.par_balanced_copied_bytes;
381 stats.any_work += any_work;
382 stats.no_work += no_work;
383 stats.scav_find_work += scav_find_work;
384 stats.gc_spin_spin += gc_spin_spin;
385 stats.gc_spin_yield += gc_spin_yield;
386 stats.mut_spin_spin += mut_spin_spin;
387 stats.mut_spin_yield += mut_spin_yield;
388 }
389 stats.gc_cpu_ns += stats.gc.cpu_ns;
390 stats.gc_elapsed_ns += stats.gc.elapsed_ns;
391
392 if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
393 stats.major_gcs++;
394 if (stats.gc.live_bytes > stats.max_live_bytes) {
395 stats.max_live_bytes = stats.gc.live_bytes;
396 }
397 if (stats.gc.large_objects_bytes > stats.max_large_objects_bytes) {
398 stats.max_large_objects_bytes = stats.gc.large_objects_bytes;
399 }
400 if (stats.gc.compact_bytes > stats.max_compact_bytes) {
401 stats.max_compact_bytes = stats.gc.compact_bytes;
402 }
403 if (stats.gc.slop_bytes > stats.max_slop_bytes) {
404 stats.max_slop_bytes = stats.gc.slop_bytes;
405 }
406 stats.cumulative_live_bytes += stats.gc.live_bytes;
407 }
408
409 // -------------------------------------------------
410 // Do the more expensive bits only when stats are enabled.
411
412 if (stats_enabled)
413 {
414 // -------------------------------------------------
415 // Emit events to the event log
416
417 // Has to be emitted while all caps stopped for GC, but before GC_END.
418 // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
419 // for a detailed design rationale of the current setup
420 // of GC eventlog events.
421 traceEventGcGlobalSync(cap);
422
423 // Emitted before GC_END on all caps, which simplifies tools code.
424 traceEventGcStats(cap,
425 CAPSET_HEAP_DEFAULT,
426 stats.gc.gen,
427 stats.gc.copied_bytes,
428 stats.gc.slop_bytes,
429 /* current loss due to fragmentation */
430 (mblocks_allocated * BLOCKS_PER_MBLOCK
431 - n_alloc_blocks) * BLOCK_SIZE,
432 par_n_threads,
433 stats.gc.par_max_copied_bytes,
434 stats.gc.copied_bytes,
435 stats.gc.par_balanced_copied_bytes);
436
437 // Post EVENT_GC_END with the same timestamp as used for stats
438 // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
439 // Here, as opposed to other places, the event is emitted on the cap
440 // that initiates the GC and external tools expect it to have the same
441 // timestamp as used in +RTS -s calculations.
442 traceEventGcEndAtT(cap, TimeToNS(stats.elapsed_ns));
443
444 if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
445 traceEventHeapLive(cap,
446 CAPSET_HEAP_DEFAULT,
447 stats.gc.live_bytes);
448 }
449
450 // -------------------------------------------------
451 // Print GC stats to stdout or a file (+RTS -S/-s)
452
453 if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
454 W_ faults = getPageFaults();
455
456 statsPrintf("%9" FMT_Word64 " %9" FMT_Word64 " %9" FMT_Word64,
457 stats.gc.allocated_bytes, stats.gc.copied_bytes,
458 stats.gc.live_bytes);
459
460 statsPrintf(" %6.3f %6.3f %8.3f %8.3f %4"
461 FMT_Word " %4" FMT_Word " (Gen: %2d)\n",
462 TimeToSecondsDbl(stats.gc.cpu_ns),
463 TimeToSecondsDbl(stats.gc.elapsed_ns),
464 TimeToSecondsDbl(stats.cpu_ns),
465 TimeToSecondsDbl(stats.elapsed_ns),
466 faults - gct->gc_start_faults,
467 gct->gc_start_faults - GC_end_faults,
468 gen);
469
470 GC_end_faults = faults;
471 statsFlush();
472 }
473
474
475 if (rtsConfig.gcDoneHook != NULL) {
476 rtsConfig.gcDoneHook(&stats.gc);
477 }
478
479 traceEventHeapSize(cap,
480 CAPSET_HEAP_DEFAULT,
481 mblocks_allocated * MBLOCK_SIZE);
482 }
483 }
484
485 /* -----------------------------------------------------------------------------
486 Called at the beginning of each retainer profiling run
487 -------------------------------------------------------------------------- */
488 #if defined(PROFILING)
489 void
490 stat_startRP(void)
491 {
492 Time user, elapsed;
493 getProcessTimes( &user, &elapsed );
494
495 RP_start_time = user;
496 RPe_start_time = elapsed;
497 }
498 #endif /* PROFILING */
499
500 /* -----------------------------------------------------------------------------
501 Called at the end of each retainer profiling run
502 -------------------------------------------------------------------------- */
503
504 #if defined(PROFILING)
505 void
506 stat_endRP(
507 uint32_t retainerGeneration,
508 #if defined(DEBUG_RETAINER)
509 uint32_t maxCStackSize,
510 int maxStackSize,
511 #endif
512 double averageNumVisit)
513 {
514 Time user, elapsed;
515 getProcessTimes( &user, &elapsed );
516
517 RP_tot_time += user - RP_start_time;
518 RPe_tot_time += elapsed - RPe_start_time;
519
520 fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
521 retainerGeneration, mut_user_time_during_RP());
522 #if defined(DEBUG_RETAINER)
523 fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
524 fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
525 #endif
526 fprintf(prof_file, "\tAverage number of visits per object = %f\n",
527 averageNumVisit);
528 }
529 #endif /* PROFILING */
530
531 /* -----------------------------------------------------------------------------
532 Called at the beginning of each heap census
533 -------------------------------------------------------------------------- */
534 #if defined(PROFILING)
535 void
536 stat_startHeapCensus(void)
537 {
538 Time user, elapsed;
539 getProcessTimes( &user, &elapsed );
540
541 HC_start_time = user;
542 HCe_start_time = elapsed;
543 }
544 #endif /* PROFILING */
545
546 /* -----------------------------------------------------------------------------
547 Called at the end of each heap census
548 -------------------------------------------------------------------------- */
549 #if defined(PROFILING)
550 void
551 stat_endHeapCensus(void)
552 {
553 Time user, elapsed;
554 getProcessTimes( &user, &elapsed );
555
556 HC_tot_time += user - HC_start_time;
557 HCe_tot_time += elapsed - HCe_start_time;
558 }
559 #endif /* PROFILING */
560
561 /* -----------------------------------------------------------------------------
562 Called at the end of execution
563
564 NOTE: number of allocations is not entirely accurate: it doesn't
565 take into account the few bytes at the end of the heap that
566 were left unused when the heap-check failed.
567 -------------------------------------------------------------------------- */
568
569 #if defined(DEBUG)
570 #define TICK_VAR_INI(arity) \
571 StgInt SLOW_CALLS_##arity = 1; \
572 StgInt RIGHT_ARITY_##arity = 1; \
573 StgInt TAGGED_PTR_##arity = 0;
574
575 TICK_VAR_INI(1)
576 TICK_VAR_INI(2)
577
578 StgInt TOTAL_CALLS=1;
579 #endif
580
581 /* Report the value of a counter */
582 #define REPORT(counter) \
583 { \
584 showStgWord64(counter,temp,true/*commas*/); \
585 statsPrintf(" (" #counter ") : %s\n",temp); \
586 }
587
588 /* Report the value of a counter as a percentage of another counter */
589 #define REPORT_PCT(counter,countertot) \
590 statsPrintf(" (" #counter ") %% of (" #countertot ") : %.1f%%\n", \
591 counter*100.0/countertot)
592
593 #define TICK_PRINT(arity) \
594 REPORT(SLOW_CALLS_##arity); \
595 REPORT_PCT(RIGHT_ARITY_##arity,SLOW_CALLS_##arity); \
596 REPORT_PCT(TAGGED_PTR_##arity,RIGHT_ARITY_##arity); \
597 REPORT(RIGHT_ARITY_##arity); \
598 REPORT(TAGGED_PTR_##arity)
599
600 #define TICK_PRINT_TOT(arity) \
601 statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
602 SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
603
604 /*
605 Note [RTS Stats Reporting]
606 ==========================
607
608 There are currently three reporting functions:
609 * report_summary:
610 Responsible for producing '+RTS -s' output.
611 Will report internal counters if the RTS flag --internal-counters is
612 passed. See Note [Internal Counter Stats].
613 * report_machine_readable:
614 Responsible for producing '+RTS -t --machine-readable' output.
615 * report_one_line:
616 Responsible for producing '+RTS -t' output.
617
618 Stats are accumulated into the global variable 'stats' as the program runs; then
619 in 'stat_exit' we do the following:
620 * Finalise 'stats'. This involves setting final running times and allocations
621 that have not yet been accounted for.
622 * Create an RTSSummaryStats. This contains all data for reports that is not
623 included in stats (because they do not make sense before the program has
624 completed) or in a global variable.
625 * call the appropriate report_* function, passing the newly constructed
626 RTSSummaryStats.
627
628 To ensure that the data in the different reports is kept consistent, the
629 report_* functions should not do any calculation, excepting unit changes and
630 formatting. If you need to add a new calculated field, add it to
631 RTSSummaryStats.
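
For instance (an illustrative, hypothetical field): to also report an average
GC pause across all generations, compute it once in stat_exit when populating
RTSSummaryStats, say

    sum.avg_pause_ns = ...;   // hypothetical field

and have report_summary and report_machine_readable merely format
sum.avg_pause_ns, rather than each recomputing it from GC_coll_elapsed.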
632
633 */
634
635
636 static void init_RTSSummaryStats(RTSSummaryStats* sum)
637 {
638 const size_t sizeof_gc_summary_stats =
639 RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);
640
641 memset(sum, 0, sizeof(RTSSummaryStats));
642 sum->gc_summary_stats =
643 stgMallocBytes(sizeof_gc_summary_stats,
644 "alloc_RTSSummaryStats.gc_summary_stats");
645 memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
646 }
647
648 static void free_RTSSummaryStats(RTSSummaryStats * sum)
649 {
650 if (!sum) { return; }
651 if (sum->gc_summary_stats) {
652 stgFree(sum->gc_summary_stats);
653 sum->gc_summary_stats = NULL;
654 }
655 }
656
657 static void report_summary(const RTSSummaryStats* sum)
658 {
659 // We should do no calculation, other than unit changes and formatting, and
660 // we should not use any data from outside of globals, sum and stats
661 // here. See Note [RTS Stats Reporting]
662
663 uint32_t g;
664 char temp[512];
665 showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
666 statsPrintf("%16s bytes allocated in the heap\n", temp);
667
668 showStgWord64(stats.copied_bytes, temp, true/*commas*/);
669 statsPrintf("%16s bytes copied during GC\n", temp);
670
671 if ( stats.major_gcs > 0 ) {
672 showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
673 statsPrintf("%16s bytes maximum residency (%" FMT_Word32
674 " sample(s))\n",
675 temp, stats.major_gcs);
676 }
677
678 showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
679 statsPrintf("%16s bytes maximum slop\n", temp);
680
681 statsPrintf("%16" FMT_Word64 " MB total memory in use (%"
682 FMT_Word64 " MB lost due to fragmentation)\n\n",
683 stats.max_mem_in_use_bytes / (1024 * 1024),
684 sum->fragmentation_bytes / (1024 * 1024));
685
686 /* Print garbage collections in each gen */
687 statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
688 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
689 const GenerationSummaryStats * gen_stats =
690 &sum->gc_summary_stats[g];
691 statsPrintf(" Gen %2d %5d colls"
692 ", %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
693 g, // REVIEWERS: this used to be gen->no;
694 // can it ever be different?
695 gen_stats->collections,
696 gen_stats->par_collections,
697 TimeToSecondsDbl(gen_stats->cpu_ns),
698 TimeToSecondsDbl(gen_stats->elapsed_ns),
699 TimeToSecondsDbl(gen_stats->avg_pause_ns),
700 TimeToSecondsDbl(gen_stats->max_pause_ns));
701 }
702
703 statsPrintf("\n");
704
705 #if defined(THREADED_RTS)
706 if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
707 // See Note [Work Balance]
708 statsPrintf(" Parallel GC work balance: "
709 "%.2f%% (serial 0%%, perfect 100%%)\n\n",
710 sum->work_balance * 100);
711 }
712
713 statsPrintf(" TASKS: %d "
714 "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
715 taskCount, sum->bound_task_count,
716 peakWorkerCount, workerCount,
717 n_capabilities);
718
719 statsPrintf(" SPARKS: %" FMT_Word64
720 " (%" FMT_Word " converted, %" FMT_Word " overflowed, %"
721 FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
722 sum->sparks_count,
723 sum->sparks.converted, sum->sparks.overflowed,
724 sum->sparks.dud, sum->sparks.gcd,
725 sum->sparks.fizzled);
726 #endif
727
728 statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
729 TimeToSecondsDbl(stats.init_cpu_ns),
730 TimeToSecondsDbl(stats.init_elapsed_ns));
731
732 statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
733 TimeToSecondsDbl(stats.mutator_cpu_ns),
734 TimeToSecondsDbl(stats.mutator_elapsed_ns));
735 statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
736 TimeToSecondsDbl(stats.gc_cpu_ns),
737 TimeToSecondsDbl(stats.gc_elapsed_ns));
738
739 #if defined(PROFILING)
740 statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
741 TimeToSecondsDbl(sum->rp_cpu_ns),
742 TimeToSecondsDbl(sum->rp_elapsed_ns));
743 statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
744 TimeToSecondsDbl(sum->hc_cpu_ns),
745 TimeToSecondsDbl(sum->hc_elapsed_ns));
746 #endif
747 statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
748 TimeToSecondsDbl(sum->exit_cpu_ns),
749 TimeToSecondsDbl(sum->exit_elapsed_ns));
750 statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
751 TimeToSecondsDbl(stats.cpu_ns),
752 TimeToSecondsDbl(stats.elapsed_ns));
753 #if !defined(THREADED_RTS)
754 statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
755 sum->gc_cpu_percent * 100,
756 sum->gc_elapsed_percent * 100);
757 #endif
758
759 showStgWord64(sum->alloc_rate, temp, true/*commas*/);
760
761 statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
762
763 statsPrintf(" Productivity %5.1f%% of total user, "
764 "%.1f%% of total elapsed\n\n",
765 sum->productivity_cpu_percent * 100,
766 sum->productivity_elapsed_percent * 100);
767
768 // See Note [Internal Counter Stats] for a description of the
769 // following counters. If you add a counter here, please remember
770 // to update the Note.
771 if (RtsFlags.MiscFlags.internalCounters) {
772 #if defined(THREADED_RTS) && defined(PROF_SPIN)
773 const int32_t col_width[] = {4, -30, 14, 14};
774 statsPrintf("Internal Counters:\n");
775 statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
776 , col_width[0], ""
777 , col_width[1], "SpinLock"
778 , col_width[2], "Spins"
779 , col_width[3], "Yields");
780 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
781 , col_width[0], ""
782 , col_width[1], "gc_alloc_block_sync"
783 , col_width[2], gc_alloc_block_sync.spin
784 , col_width[3], gc_alloc_block_sync.yield);
785 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
786 , col_width[0], ""
787 , col_width[1], "gc_spin"
788 , col_width[2], stats.gc_spin_spin
789 , col_width[3], stats.gc_spin_yield);
790 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
791 , col_width[0], ""
792 , col_width[1], "mut_spin"
793 , col_width[2], stats.mut_spin_spin
794 , col_width[3], stats.mut_spin_yield);
795 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
796 , col_width[0], ""
797 , col_width[1], "whitehole_gc"
798 , col_width[2], whitehole_gc_spin
799 , col_width[3], "n/a");
800 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
801 , col_width[0], ""
802 , col_width[1], "whitehole_threadPaused"
803 , col_width[2], whitehole_threadPaused_spin
804 , col_width[3], "n/a");
805 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
806 , col_width[0], ""
807 , col_width[1], "whitehole_executeMessage"
808 , col_width[2], whitehole_executeMessage_spin
809 , col_width[3], "n/a");
810 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
811 , col_width[0], ""
812 , col_width[1], "whitehole_lockClosure"
813 , col_width[2], whitehole_lockClosure_spin
814 , col_width[3], whitehole_lockClosure_yield);
815 // waitForGcThreads isn't really spin-locking (see the function)
816 // but these numbers still seem useful.
817 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
818 , col_width[0], ""
819 , col_width[1], "waitForGcThreads"
820 , col_width[2], waitForGcThreads_spin
821 , col_width[3], waitForGcThreads_yield);
822
823 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
824 int prefix_length = 0;
825 statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
826 col_width[0], "", g, &prefix_length);
827 prefix_length -= col_width[0];
828 int suffix_length = col_width[1] + prefix_length;
829 suffix_length =
830 suffix_length > 0 ? col_width[1] : suffix_length;
831
832 statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
833 , suffix_length, "].sync"
834 , col_width[2], generations[g].sync.spin
835 , col_width[3], generations[g].sync.yield);
836 }
837 statsPrintf("\n");
838 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
839 , col_width[0], ""
840 , col_width[1], "any_work"
841 , col_width[2], stats.any_work);
842 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
843 , col_width[0], ""
844 , col_width[1], "no_work"
845 , col_width[2], stats.no_work);
846 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
847 , col_width[0], ""
848 , col_width[1], "scav_find_work"
849 , col_width[2], stats.scav_find_work);
850 #elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
851 statsPrintf("Internal Counters require the RTS to be built "
852 "with PROF_SPIN"); // PROF_SPIN is not #defined here
853 #else // THREADED_RTS
854 statsPrintf("Internal Counters require the threaded RTS");
855 #endif
856 }
857 }
858
859 static void report_machine_readable (const RTSSummaryStats * sum)
860 {
861 // We should do no calculation, other than unit changes and formatting, and
862 // we should not use any data from outside of globals, sum and stats
863 // here. See Note [RTS Stats Reporting]
864 uint32_t g;
865
866 #define MR_STAT(field_name,format,value) \
867 statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
868 #define MR_STAT_GEN(gen,field_name,format,value) \
869 statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
870 format "\")\n", gen, value)
871
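// For orientation, the emitted report is one Haskell-readable association
// list; with illustrative (made-up) numbers it begins:
//
//   [("bytes allocated", "365381624")
//   ,("num_GCs", "310")
//   ,("average_bytes_used", "1032768")
//   ...
//   ]
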
872 // These first values are for backwards compatibility.
873 // Some of these first fields are duplicated with more machine-readable
874 // names, or to match the name in RTSStats.
875 
876 // We don't use the helper macro for the first field because its prefix
877 // is different.
878 statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
879 stats.allocated_bytes);
880 MR_STAT("num_GCs", FMT_Word32, stats.gcs);
881 MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
882 MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
883 MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
884 MR_STAT("peak_megabytes_allocated", FMT_Word64,
885 stats.max_mem_in_use_bytes / (1024 * 1024));
886
887 MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
888 MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
889 MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
890 MR_STAT("mut_wall_seconds", "f",
891 TimeToSecondsDbl(stats.mutator_elapsed_ns));
892 MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
893 MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));
894
895 // end backward compatibility
896
897 // First, the rest of the times
898
899 MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
900 MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
901 #if defined(PROFILING)
902 MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
903 MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
904 MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hc_cpu_ns));
905 MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hc_elapsed_ns));
906 #endif
907 MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
908 MR_STAT("total_wall_seconds", "f",
909 TimeToSecondsDbl(stats.elapsed_ns));
910
911 // next, the remainder of the fields of RTSStats, except internal counters
912
913 // The first two are duplicates of those above, but have more machine
914 // readable names that match the field names in RTSStats.
915
916
917 // gcs has been done as num_GCs above
918 MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
919 MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
920 MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
921 MR_STAT("max_large_objects_bytes", FMT_Word64,
922 stats.max_large_objects_bytes);
923 MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
924 MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
925 // This duplicates, except for unit, peak_megabytes_allocated above
926 MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
927 MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
928 MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
929 MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
930 MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
931 stats.cumulative_par_max_copied_bytes);
932 MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
933 stats.cumulative_par_balanced_copied_bytes);
934
935 // next, the computed fields in RTSSummaryStats
936 #if !defined(THREADED_RTS) // !THREADED_RTS
937 MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
938 MR_STAT("gc_wall_percent", "f", sum->gc_elapsed_percent);
939 #endif
940 MR_STAT("fragmentation_bytes", FMT_Word64, sum->fragmentation_bytes);
941 // average_bytes_used is done above
942 MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
943 MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
944 MR_STAT("productivity_wall_percent", "f",
945 sum->productivity_elapsed_percent);
946
947 // next, the THREADED_RTS fields in RTSSummaryStats
948
949 #if defined(THREADED_RTS)
950 MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
951 MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
952 MR_STAT("sparks_converted", FMT_Word, sum->sparks.converted);
953 MR_STAT("sparks_overflowed", FMT_Word, sum->sparks.overflowed);
954 MR_STAT("sparks_dud", FMT_Word, sum->sparks.dud);
955 MR_STAT("sparks_gcd", FMT_Word, sum->sparks.gcd);
956 MR_STAT("sparks_fizzled", FMT_Word, sum->sparks.fizzled);
957 MR_STAT("work_balance", "f", sum->work_balance);
958
959 // next, globals (other than internal counters)
960 MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
961 MR_STAT("task_count", FMT_Word32, taskCount);
962 MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
963 MR_STAT("worker_count", FMT_Word32, workerCount);
964
965 // next, internal counters
966 #if defined(PROF_SPIN)
967 MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
968 MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
969 gc_alloc_block_sync.yield);
971 MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
972 MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
973 MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
974 MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
975 MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
976 MR_STAT("waitForGcThreads_yield", FMT_Word64,
977 waitForGcThreads_yield);
978 MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
979 MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
980 whitehole_lockClosure_spin);
981 MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
982 whitehole_lockClosure_yield);
983 MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
984 whitehole_executeMessage_spin);
985 MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
986 whitehole_threadPaused_spin);
987 MR_STAT("any_work", FMT_Word64,
988 stats.any_work);
989 MR_STAT("no_work", FMT_Word64,
990 stats.no_work);
991 MR_STAT("scav_find_work", FMT_Word64,
992 stats.scav_find_work);
993 #endif // PROF_SPIN
994 #endif // THREADED_RTS
995
996 // finally, per-generation stats. Named as, for example for generation 0,
997 // gen_0_collections
998 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
999 const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
1000 MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
1001 MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
1002 MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
1003 MR_STAT_GEN(g, "wall_seconds", "f",
1004 TimeToSecondsDbl(gc_sum->elapsed_ns));
1005 MR_STAT_GEN(g, "max_pause_seconds", "f",
1006 TimeToSecondsDbl(gc_sum->max_pause_ns));
1007 MR_STAT_GEN(g, "avg_pause_seconds", "f",
1008 TimeToSecondsDbl(gc_sum->avg_pause_ns));
1009 #if defined(THREADED_RTS) && defined(PROF_SPIN)
1010 MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
1011 MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
1012 #endif
1013 }
1014
1015 statsPrintf(" ]\n");
1016 }
1017
1018 static void report_one_line(const RTSSummaryStats * sum)
1019 {
1020 // We should do no calculation, other than unit changes and formatting, and
1021 // we should not use any data from outside of globals, sum and stats
1022 // here. See Note [RTS Stats Reporting]
1023
1024 /* print the long long separately to avoid bugginess on MinGW (2001-07-02,
1025 mingw-0.5) */
1026 statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
1027 "%" FMT_Word32 " GCs, "
1028 "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
1029 "(%" FMT_Word32 " samples), "
1030 "%" FMT_Word64 "M in use, "
1031 "%.3f INIT (%.3f elapsed), "
1032 "%.3f MUT (%.3f elapsed), "
1033 "%.3f GC (%.3f elapsed) :ghc>>\n",
1034 stats.allocated_bytes,
1035 stats.gcs,
1036 sum->average_bytes_used,
1037 stats.max_live_bytes,
1038 stats.major_gcs,
1039 stats.max_mem_in_use_bytes / (1024 * 1024),
1040 TimeToSecondsDbl(stats.init_cpu_ns),
1041 TimeToSecondsDbl(stats.init_elapsed_ns),
1042 TimeToSecondsDbl(stats.mutator_cpu_ns),
1043 TimeToSecondsDbl(stats.mutator_elapsed_ns),
1044 TimeToSecondsDbl(stats.gc_cpu_ns),
1045 TimeToSecondsDbl(stats.gc_elapsed_ns));
1046 }
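
// With illustrative (made-up) numbers, report_one_line above emits
// something like:
//
// <<ghc: 365381624 bytes, 310 GCs, 1032768/1867776 avg/max bytes residency
// (11 samples), 45M in use, 0.001 INIT (0.001 elapsed), 0.452 MUT (0.462
// elapsed), 0.211 GC (0.219 elapsed) :ghc>>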
1047
1048 void
1049 stat_exit (void)
1050 {
1051 RTSSummaryStats sum;
1052 uint32_t g;
1053
1054 init_RTSSummaryStats(&sum);
1055 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
1056 // First we tidy the times in stats, and populate the times in sum.
1057 // In particular, we adjust the gc_* time in stats to remove
1058 // profiling times.
1059 {
1060 Time now_cpu_ns, now_elapsed_ns;
1061 Time exit_gc_cpu = 0;
1062 Time exit_gc_elapsed = 0;
1063 Time prof_cpu = 0;
1064 Time prof_elapsed = 0;
1065
1066 getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
1067
1068 stats.cpu_ns = now_cpu_ns - start_init_cpu;
1069 stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
1070 /* avoid divide by zero if stats.cpu_ns is measured as 0.00
1071 seconds -- SDM */
1072 if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
1073 if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
1074
1075 prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
1076 prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
1077
1078 // heapCensus() is called by the GC, so RP and HC time are
1079 // included in the GC stats. We therefore subtract them to
1080 // obtain the actual GC cpu time.
1081 stats.gc_cpu_ns -= prof_cpu;
1082 stats.gc_elapsed_ns -= prof_elapsed;
1083
1084 #if defined(PROFILING)
1085 sum.rp_cpu_ns = RP_tot_time;
1086 sum.rp_elapsed_ns = RPe_tot_time;
1087 sum.hc_cpu_ns = HC_tot_time;
1088 sum.hc_elapsed_ns = HCe_tot_time;
1089 #endif // PROFILING
1090
1091 // We do a GC during the EXIT phase. We'll attribute the cost of
1092 // that to GC instead of EXIT, so carefully subtract it from the
1093 // EXIT time.
1094 exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
1095 exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
1096
1097 sum.exit_cpu_ns = end_exit_cpu
1098 - start_exit_cpu
1099 - exit_gc_cpu;
1100 sum.exit_elapsed_ns = end_exit_elapsed
1101 - start_exit_elapsed
1102 - exit_gc_elapsed;
1103
1104 stats.mutator_cpu_ns = start_exit_cpu
1105 - end_init_cpu
1106 - (stats.gc_cpu_ns - exit_gc_cpu)
1107 - prof_cpu;
1108 stats.mutator_elapsed_ns = start_exit_elapsed
1109 - end_init_elapsed
1110 - (stats.gc_elapsed_ns - exit_gc_elapsed)
1111 - prof_elapsed;
1112
1113 if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
1114
1115 // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
1116 // and subtracting, so the parts should add up to the total exactly.
1117 // Note that stats.elapsed_ns is captured a tiny bit later than
1118 // end_exit_elapsed, so we don't use it here.
1119 ASSERT(stats.init_elapsed_ns
1120 + stats.mutator_elapsed_ns
1121 + stats.gc_elapsed_ns
1122 + sum.exit_elapsed_ns
1123 == end_exit_elapsed - start_init_elapsed);
1124
1125 }
1126
1127 // REVIEWERS: it's not clear to me why the following isn't done in
1128 // stat_endGC of the last garbage collection.
1129
1130 // We account for the last garbage collection
1131 {
1132 uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
1133 stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
1134 stats.allocated_bytes = tot_alloc_bytes;
1135 if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
1136 statsPrintf("%9" FMT_Word " %9.9s %9.9s",
1137 (W_)stats.gc.allocated_bytes, "", "");
1138 statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
1139 }
1140 }
1141
1142 // We populate the remainder (non-time elements) of sum
1143 {
1144 #if defined(THREADED_RTS)
1145 uint32_t i;
1146 sum.bound_task_count = taskCount - workerCount;
1147
1148 for (i = 0; i < n_capabilities; i++) {
1149 sum.sparks.created += capabilities[i]->spark_stats.created;
1150 sum.sparks.dud += capabilities[i]->spark_stats.dud;
1151 sum.sparks.overflowed+=
1152 capabilities[i]->spark_stats.overflowed;
1153 sum.sparks.converted +=
1154 capabilities[i]->spark_stats.converted;
1155 sum.sparks.gcd += capabilities[i]->spark_stats.gcd;
1156 sum.sparks.fizzled += capabilities[i]->spark_stats.fizzled;
1157 }
1158
1159 sum.sparks_count = sum.sparks.created
1160 + sum.sparks.dud
1161 + sum.sparks.overflowed;
1162
1163 if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
1164 // See Note [Work Balance]
1165 sum.work_balance =
1166 (double)stats.cumulative_par_balanced_copied_bytes
1167 / (double)stats.par_copied_bytes;
1168 } else {
1169 sum.work_balance = 0;
1170 }
1171
1172
1173 #else // THREADED_RTS
1174 sum.gc_cpu_percent = stats.gc_cpu_ns
1175 / stats.cpu_ns;
1176 sum.gc_elapsed_percent = stats.gc_elapsed_ns
1177 / stats.elapsed_ns;
1178 #endif // THREADED_RTS
1179
1180 sum.fragmentation_bytes =
1181 (uint64_t)(peak_mblocks_allocated
1182 * BLOCKS_PER_MBLOCK
1183 * BLOCK_SIZE_W
1184 - hw_alloc_blocks * BLOCK_SIZE_W)
1185 * (uint64_t)sizeof(W_); // words -> bytes
1186
1187 sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
1188 stats.cumulative_live_bytes/stats.major_gcs;
1189
1190 sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
1191 (uint64_t)((double)stats.allocated_bytes
1192 / TimeToSecondsDbl(stats.mutator_cpu_ns));
1193
1194 // REVIEWERS: These two values previously did not include the exit times.
1195 sum.productivity_cpu_percent =
1196 TimeToSecondsDbl(stats.cpu_ns
1197 - stats.gc_cpu_ns
1198 - sum.rp_cpu_ns
1199 - sum.hc_cpu_ns
1200 - stats.init_cpu_ns
1201 - sum.exit_cpu_ns)
1202 / TimeToSecondsDbl(stats.cpu_ns);
1203
1204 sum.productivity_elapsed_percent =
1205 TimeToSecondsDbl(stats.elapsed_ns
1206 - stats.gc_elapsed_ns
1207 - sum.rp_elapsed_ns
1208 - sum.hc_elapsed_ns
1209 - stats.init_elapsed_ns
1210 - sum.exit_elapsed_ns)
1211 / TimeToSecondsDbl(stats.elapsed_ns);
1212
1213 for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
1214 const generation* gen = &generations[g];
1215 GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
1216 gen_stats->collections = gen->collections;
1217 gen_stats->par_collections = gen->par_collections;
1218 gen_stats->cpu_ns = GC_coll_cpu[g];
1219 gen_stats->elapsed_ns = GC_coll_elapsed[g];
1220 gen_stats->max_pause_ns = GC_coll_max_pause[g];
1221 gen_stats->avg_pause_ns = gen->collections == 0 ?
1222 0 : (GC_coll_elapsed[g] / gen->collections);
1223 #if defined(THREADED_RTS) && defined(PROF_SPIN)
1224 gen_stats->sync_spin = gen->sync.spin;
1225 gen_stats->sync_yield = gen->sync.yield;
1226 #endif // PROF_SPIN
1227 }
1228 }
1229
1230 // Now we generate the report
1231 if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
1232 report_summary(&sum);
1233 }
1234
1235 if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
1236 if (RtsFlags.MiscFlags.machineReadable) {
1237 report_machine_readable(&sum);
1238 }
1239 else {
1240 report_one_line(&sum);
1241 }
1242 }
1243
1244 free_RTSSummaryStats(&sum);
1245 statsFlush();
1246 statsClose();
1247 }
1248
1249 if (GC_coll_cpu) {
1250 stgFree(GC_coll_cpu);
1251 GC_coll_cpu = NULL;
1252 }
1253 if (GC_coll_elapsed) {
1254 stgFree(GC_coll_elapsed);
1255 GC_coll_elapsed = NULL;
1256 }
1257 if (GC_coll_max_pause) {
1258 stgFree(GC_coll_max_pause);
1259 GC_coll_max_pause = NULL;
1260 }
1261 }
1262
1263 /* Note [Work Balance]
1264 ----------------------
1265 Work balance is a measure of how evenly the work done during parallel garbage
1266 collection is spread across threads. To compute work balance we must take care
1267 to account for the number of GC threads changing between GCs. The statistics we
1268 track must have the number of GC threads "integrated out".
1269
1270 We accumulate two values from each garbage collection:
1271 * par_copied: is a measure of the total work done during parallel garbage
1272 collection
1273 * par_balanced_copied: is a measure of the balanced work done
1274 during parallel garbage collection.
1275
1276 par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more
1277 complicated:
1278
1279 For a given garbage collection:
1280 Let gc_copied := total copies during the gc
1281 gc_copied_i := copies by the ith thread during the gc
1282 num_gc_threads := the number of threads participating in the gc
1283 balance_limit := (gc_copied / num_gc_threads)
1284
1285 If we were to graph gc_copied_i, sorted from largest to smallest we would see
1286 something like:
1287
1288         |X
1289   ^     |X X
1290   |     |X X X        X: unbalanced copies
1291 copies  |-----------  Y: balanced copies by the busiest GC thread
1292         |Y Z Z Z      Z: other balanced copies
1293         |Y Z Z Z
1294         |Y Z Z Z Z
1295         |Y Z Z Z Z Z
1296         |===========
1297         |1 2 3 4 5 6
1298              i ->
1299
1300 where the --- line is at balance_limit. Balanced copies are those under the ---
1301 line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
1302 always equal balance_limit. Completely balanced gc has every thread copying
1303 balance_limit and a completely unbalanced gc has a single thread copying
1304 gc_copied.
1305
1306 One could define par_balance_copied as the areas of the Ys and Zs in the graph
1307 above, however we would like the ratio of (par_balance_copied / gc_copied) to
1308 range from 0 to 1, so that work_balance will be a nice percentage, also ranging
1309 from 0 to 1. We therefore define par_balanced_copied as:
1310
1311                                                          (  num_gc_threads  )
1312  {Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
1313    i                                                     (num_gc_threads - 1)
1314                           vvv                                    vvv
1315                            S                                      T
1316
1317 Where the S and T terms serve to remove the area of the Ys, and
1318 to normalize the result to lie between 0 and gc_copied.
1319
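A worked example with illustrative numbers: suppose num_gc_threads = 4 and the
per-thread copies are {70, 10, 10, 10}, so gc_copied = 100 and
balance_limit = 25. Then Sum[Min(gc_copied_i, balance_limit)] is
25 + 10 + 10 + 10 = 55, and par_balanced_copied = (55 - 25) * 4/3 = 40, giving
a work balance of 0.4. The extremes behave as intended: perfectly balanced
copies {25, 25, 25, 25} give (100 - 25) * 4/3 = 100 (balance 1), and fully
unbalanced copies {100, 0, 0, 0} give (25 - 25) * 4/3 = 0.
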
1320 Note that the implementation orders these operations differently to minimize
1321 error due to integer rounding.
1322
1323 Then cumulative work balance is computed as
1324 (cumulative_par_balanced_copied_bytes / par_copied_bytes)
1325
1326 Previously, cumulative work balance was computed as:
1327
1328     (cumulative_par_max_copied_bytes)
1329     (-------------------------------) - 1
1330     (       par_copied_bytes        )
1331     -------------------------------------
1332              (n_capabilities - 1)
1333
1334 This was less accurate than the current method, and invalid whenever a garbage
1335 collection had occurred with num_gc_threads /= n_capabilities; which can happen
1336 when setNumCapabilities is called, when -qn is passed as an RTS option, or when
1337 the number of gc threads is limited to the number of cores.
1338 See #13830
1339 */
1340
1341 /*
1342 Note [Internal Counter Stats]
1343 -----------------------------
1344 What do the counts at the end of a '+RTS -s --internal-counters' report mean?
1345 They are detailed below. Most of these counters are used by multiple threads
1346 with no attempt at synchronisation. This means that reported values may be
1347 lower than the true value and this becomes more likely and more severe as
1348 contention increases.
1349
1350 The first counters are for various SpinLock-like constructs in the RTS. See
1351 Spinlock.h for the definition of a SpinLock. We maintain up to two counters per
1352 SpinLock:
1353 * spin: The number of busy-spins over the length of the program.
1354 * yield: The number of times the SpinLock spun SPIN_COUNT times without success
1355 and called yieldThread().
1356 Not all of these are actual SpinLocks, see the details below.
1357
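As an illustrative sketch of the counting scheme (not the actual Spinlock.h
code; tryAcquire stands for a hypothetical CAS-based helper):

    for (;;) {
        for (i = 0; i < SPIN_COUNT; i++) {
            if (tryAcquire(lock)) goto acquired;   // got the lock
            lock->spin++;                          // one counted busy-spin
        }
        lock->yield++;    // spun SPIN_COUNT times without success
        yieldThread();
    }
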
1358 Actual SpinLocks:
1359 * gc_alloc_block:
1360 This SpinLock protects the block allocator and free list manager. See
1361 BlockAlloc.c.
1362 * gc_spin and mut_spin:
1363 These SpinLocks are used to herd gc worker threads during parallel garbage
1364 collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
1365 * gen[g].sync:
1366 These SpinLocks, one per generation, protect the generations[g] data
1367 structure during garbage collection.
1368
1369 waitForGcThreads:
1370 These counters are incremented while we wait for all threads to be ready
1371 for a parallel garbage collection. We yield more than we spin in this case.
1372
1373 In several places in the runtime we must take a lock on a closure. To do this,
1374 we replace its info table with stg_WHITEHOLE_info, spinning if it is already
1375 a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
1376 don't. We count these white-hole spins and include them in the SpinLocks table.
1377 If a particular loop does not yield, we put "n/a" in the table. They are named
1378 for the function that has the spinning loop except that several loops in the
1379 garbage collector accumulate into whitehole_gc.
1380 TODO: Should these counters be more or less granular?
1381
1382 white-hole spin counters:
1383 * whitehole_gc
1384 * whitehole_lockClosure
1385 * whitehole_executeMessage
1386 * whitehole_threadPaused
1387
1388
1389 We count the number of calls of several functions in the parallel garbage
1390 collector.
1391
1392 Parallel garbage collector counters:
1393 * any_work:
1394 A cheap function called whenever a gc_thread is ready for work. Does
1395 not do any work.
1396 * no_work:
1397 Incremented whenever any_work finds no work.
1398 * scav_find_work:
1399 Called to do work when any_work returns true.
1400
1401 */
1402
1403 /* -----------------------------------------------------------------------------
1404 stat_describe_gens
1405
1406 Produce some detailed info on the state of the generational GC.
1407 -------------------------------------------------------------------------- */
1408 void
1409 statDescribeGens(void)
1410 {
1411 uint32_t g, mut, lge, compacts, i;
1412 W_ gen_slop;
1413 W_ tot_live, tot_slop;
1414 W_ gen_live, gen_blocks;
1415 bdescr *bd;
1416 generation *gen;
1417
1418 debugBelch(
1419 "----------------------------------------------------------------------\n"
1420 "  Gen     Max  Mut-list  Blocks    Large  Compacts      Live    Slop\n"
1421 "       Blocks     Bytes          Objects                              \n"
1422 "----------------------------------------------------------------------\n");
1423
1424 tot_live = 0;
1425 tot_slop = 0;
1426
1427 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1428 gen = &generations[g];
1429
1430 for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
1431 lge++;
1432 }
1433
1434 for (bd = gen->compact_objects, compacts = 0; bd; bd = bd->link) {
1435 compacts++;
1436 }
1437
1438 gen_live = genLiveWords(gen);
1439 gen_blocks = genLiveBlocks(gen);
1440
1441 mut = 0;
1442 for (i = 0; i < n_capabilities; i++) {
1443 mut += countOccupied(capabilities[i]->mut_lists[g]);
1444
1445 // Add the pinned object block.
1446 bd = capabilities[i]->pinned_object_block;
1447 if (bd != NULL) {
1448 gen_live += bd->free - bd->start;
1449 gen_blocks += bd->blocks;
1450 }
1451
1452 gen_live += gcThreadLiveWords(i,g);
1453 gen_blocks += gcThreadLiveBlocks(i,g);
1454 }
1455
1456 debugBelch("%5d %7" FMT_Word " %9d", g, (W_)gen->max_blocks, mut);
1457
1458 gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
1459
1460 debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n",
1461 gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
1462 gen_slop*(W_)sizeof(W_));
1463 tot_live += gen_live;
1464 tot_slop += gen_slop;
1465 }
1466 debugBelch("----------------------------------------------------------------------\n");
1467 debugBelch("%51s%9" FMT_Word " %9" FMT_Word "\n",
1468 "",tot_live*(W_)sizeof(W_),tot_slop*(W_)sizeof(W_));
1469 debugBelch("----------------------------------------------------------------------\n");
1470 debugBelch("\n");
1471 }
1472
1473 /* -----------------------------------------------------------------------------
1474 Stats available via a programmatic interface, so e.g. GHCi can time
1475 each compilation and expression evaluation.
1476 -------------------------------------------------------------------------- */
1477
1478 uint64_t getAllocations( void )
1479 {
1480 return stats.allocated_bytes;
1481 }
1482
1483 int getRTSStatsEnabled( void )
1484 {
1485 return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
1486 }
1487
1488 void getRTSStats( RTSStats *s )
1489 {
1490 Time current_elapsed = 0;
1491 Time current_cpu = 0;
1492
1493 *s = stats;
1494
1495 getProcessTimes(&current_cpu, &current_elapsed);
1496 s->cpu_ns = current_cpu - end_init_cpu;
1497 s->elapsed_ns = current_elapsed - end_init_elapsed;
1498
1499 s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns;
1500 s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
1501 stats.gc_elapsed_ns;
1502 }
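
// An illustrative caller, e.g. from foreign C code (assumes the RTS is
// initialised; Haskell code normally goes through GHC.Stats.getRTSStats):
//
//     RTSStats s;
//     getRTSStats(&s);
//     printf("allocated %" FMT_Word64 " bytes; GC cpu %.3fs\n",
//            s.allocated_bytes, TimeToSecondsDbl(s.gc_cpu_ns));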
1503
1504 /* -----------------------------------------------------------------------------
1505 Dumping stuff in the stats file, or via the debug message interface
1506 -------------------------------------------------------------------------- */
1507
1508 void
1509 statsPrintf( char *s, ... )
1510 {
1511 FILE *sf = RtsFlags.GcFlags.statsFile;
1512 va_list ap;
1513
1514 va_start(ap,s);
1515 if (sf == NULL) {
1516 vdebugBelch(s,ap);
1517 } else {
1518 vfprintf(sf, s, ap);
1519 }
1520 va_end(ap);
1521 }
1522
1523 static void
1524 statsFlush( void )
1525 {
1526 FILE *sf = RtsFlags.GcFlags.statsFile;
1527 if (sf != NULL) {
1528 fflush(sf);
1529 }
1530 }
1531
1532 static void
1533 statsClose( void )
1534 {
1535 FILE *sf = RtsFlags.GcFlags.statsFile;
1536 if (sf != NULL) {
1537 fclose(sf);
1538 }
1539 }