Fix Work Balance computation in RTS stats
rts/Stats.c
1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 1998-2005
4 *
5 * Statistics and timing-related functions.
6 *
7 * ---------------------------------------------------------------------------*/
8
9 #include "PosixSource.h"
10 #include "Rts.h"
11
12 #include "RtsFlags.h"
13 #include "RtsUtils.h"
14 #include "Schedule.h"
15 #include "Stats.h"
16 #include "Profiling.h"
17 #include "GetTime.h"
18 #include "sm/Storage.h"
19 #include "sm/GC.h" // gc_alloc_block_sync, whitehole_spin
20 #include "sm/GCThread.h"
21 #include "sm/BlockAlloc.h"
22
23 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
24
25 static Time
26 start_init_cpu, start_init_elapsed,
27 end_init_cpu, end_init_elapsed,
28 start_exit_cpu, start_exit_elapsed,
29 start_exit_gc_elapsed, start_exit_gc_cpu,
30 end_exit_cpu, end_exit_elapsed;
31
32 #if defined(PROFILING)
33 static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time
34 static Time RPe_start_time = 0, RPe_tot_time = 0; // retainer prof elap time
35
36 static Time HC_start_time, HC_tot_time = 0; // heap census prof user time
37 static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
38 #endif
39
40 #if defined(PROFILING)
41 #define PROF_VAL(x) (x)
42 #else
43 #define PROF_VAL(x) 0
44 #endif
45
46 //
47 // All the stats!
48 //
49 // This is where we accumulate all the stats during execution, and it's also
50 // in a convenient form that we can copy over to a caller of getRTSStats().
51 //
52 static RTSStats stats;
53
54 static W_ GC_end_faults = 0;
55
56 static Time *GC_coll_cpu = NULL;
57 static Time *GC_coll_elapsed = NULL;
58 static Time *GC_coll_max_pause = NULL;
59
60 static void statsPrintf( char *s, ... ) GNUC3_ATTRIBUTE(format (PRINTF, 1, 2));
61 static void statsFlush( void );
62 static void statsClose( void );
63
64 /* -----------------------------------------------------------------------------
65 Current elapsed time
66 ------------------------------------------------------------------------- */
67
68 Time stat_getElapsedTime(void)
69 {
70 return getProcessElapsedTime() - start_init_elapsed;
71 }
72
73 /* ---------------------------------------------------------------------------
74 Measure the current MUT time, for profiling
75 ------------------------------------------------------------------------ */
76
77 double
78 mut_user_time_until( Time t )
79 {
80 return TimeToSecondsDbl(t - stats.gc_cpu_ns);
81 // heapCensus() time is included in stats.gc_cpu_ns, so we don't need
82 // to subtract it here.
83 }
84
85 double
86 mut_user_time( void )
87 {
88 Time cpu;
89 cpu = getProcessCPUTime();
90 return mut_user_time_until(cpu);
91 }
92
93 #if defined(PROFILING)
94 /*
95 mut_user_time_during_RP() returns the MUT time during retainer profiling.
96 The same holds for mut_user_time_during_HC().
97 */
98 static double
99 mut_user_time_during_RP( void )
100 {
101 return TimeToSecondsDbl(RP_start_time - stats.gc_cpu_ns - RP_tot_time);
102 }
103
104 #endif /* PROFILING */
105
106 /* ---------------------------------------------------------------------------
107 initStats0() has no dependencies; it can be called right at the beginning
108 ------------------------------------------------------------------------ */
109
110 void
111 initStats0(void)
112 {
113 start_init_cpu = 0;
114 start_init_elapsed = 0;
115 end_init_cpu = 0;
116 end_init_elapsed = 0;
117
118 start_exit_cpu = 0;
119 start_exit_elapsed = 0;
120 start_exit_gc_cpu = 0;
121 start_exit_gc_elapsed = 0;
122 end_exit_cpu = 0;
123 end_exit_elapsed = 0;
124
125 #if defined(PROFILING)
126 RP_start_time = 0;
127 RP_tot_time = 0;
128 RPe_start_time = 0;
129 RPe_tot_time = 0;
130
131 HC_start_time = 0;
132 HC_tot_time = 0;
133 HCe_start_time = 0;
134 HCe_tot_time = 0;
135 #endif
136
137 GC_end_faults = 0;
138
139 stats = (RTSStats) {
140 .gcs = 0,
141 .major_gcs = 0,
142 .allocated_bytes = 0,
143 .max_live_bytes = 0,
144 .max_large_objects_bytes = 0,
145 .max_compact_bytes = 0,
146 .max_slop_bytes = 0,
147 .max_mem_in_use_bytes = 0,
148 .cumulative_live_bytes = 0,
149 .copied_bytes = 0,
150 .par_copied_bytes = 0,
151 .cumulative_par_max_copied_bytes = 0,
152 .cumulative_par_balanced_copied_bytes = 0,
153 .mutator_cpu_ns = 0,
154 .mutator_elapsed_ns = 0,
155 .gc_cpu_ns = 0,
156 .gc_elapsed_ns = 0,
157 .cpu_ns = 0,
158 .elapsed_ns = 0,
159 .gc = {
160 .gen = 0,
161 .threads = 0,
162 .allocated_bytes = 0,
163 .live_bytes = 0,
164 .large_objects_bytes = 0,
165 .compact_bytes = 0,
166 .slop_bytes = 0,
167 .mem_in_use_bytes = 0,
168 .copied_bytes = 0,
169 .par_max_copied_bytes = 0,
170 .par_balanced_copied_bytes = 0,
171 .sync_elapsed_ns = 0,
172 .cpu_ns = 0,
173 .elapsed_ns = 0
174 }
175 };
176 }
177
178 /* ---------------------------------------------------------------------------
179 initStats1() can be called after setupRtsFlags()
180 ------------------------------------------------------------------------ */
181
182 void
183 initStats1 (void)
184 {
185 uint32_t i;
186
187 if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
188 statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n");
189 statsPrintf(" bytes bytes bytes user elap user elap\n");
190 }
191 GC_coll_cpu =
192 (Time *)stgMallocBytes(
193 sizeof(Time)*RtsFlags.GcFlags.generations,
194 "initStats");
195 GC_coll_elapsed =
196 (Time *)stgMallocBytes(
197 sizeof(Time)*RtsFlags.GcFlags.generations,
198 "initStats");
199 GC_coll_max_pause =
200 (Time *)stgMallocBytes(
201 sizeof(Time)*RtsFlags.GcFlags.generations,
202 "initStats");
203 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
204 GC_coll_cpu[i] = 0;
205 GC_coll_elapsed[i] = 0;
206 GC_coll_max_pause[i] = 0;
207 }
208 }
209
210 /* -----------------------------------------------------------------------------
211 Initialisation time...
212 -------------------------------------------------------------------------- */
213
214 void
215 stat_startInit(void)
216 {
217 getProcessTimes(&start_init_cpu, &start_init_elapsed);
218 }
219
220 void
221 stat_endInit(void)
222 {
223 getProcessTimes(&end_init_cpu, &end_init_elapsed);
224 }
225
226 /* -----------------------------------------------------------------------------
227 stat_startExit and stat_endExit
228
229 These two measure the time taken in shutdownHaskell().
230 -------------------------------------------------------------------------- */
231
232 void
233 stat_startExit(void)
234 {
235 getProcessTimes(&start_exit_cpu, &start_exit_elapsed);
236 start_exit_gc_elapsed = stats.gc_elapsed_ns;
237 start_exit_gc_cpu = stats.gc_cpu_ns;
238 }
239
240 void
241 stat_endExit(void)
242 {
243 getProcessTimes(&end_exit_cpu, &end_exit_elapsed);
244 }
245
246 void
247 stat_startGCSync (gc_thread *gct)
248 {
249 gct->gc_sync_start_elapsed = getProcessElapsedTime();
250 }
251
252 /* -----------------------------------------------------------------------------
253 Called at the beginning of each GC
254 -------------------------------------------------------------------------- */
255
256 void
257 stat_startGC (Capability *cap, gc_thread *gct)
258 {
259 if (RtsFlags.GcFlags.ringBell) {
260 debugBelch("\007");
261 }
262
263 getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
264
265 // Post EVENT_GC_START with the same timestamp as used for stats
266 // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
267 // Here, as opposed to other places, the event is emitted on the cap
268 // that initiates the GC and external tools expect it to have the same
269 // timestamp as used in +RTS -s calculations.
270 traceEventGcStartAtT(cap,
271 TimeToNS(gct->gc_start_elapsed - start_init_elapsed));
272
273 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
274 {
275 gct->gc_start_faults = getPageFaults();
276 }
277
278 updateNurseriesStats();
279 }
280
281 /* -----------------------------------------------------------------------------
282 Called at the end of each GC
283 -------------------------------------------------------------------------- */
284
285 void
286 stat_endGC (Capability *cap, gc_thread *gct,
287 W_ live, W_ copied, W_ slop, uint32_t gen,
288 uint32_t par_n_threads, W_ par_max_copied,
289 W_ par_balanced_copied)
290 {
291 // -------------------------------------------------
292 // Collect all the stats about this GC in stats.gc. We always do this since
293 // it's relatively cheap and we need allocated_bytes to catch heap
294 // overflows.
295
296 stats.gc.gen = gen;
297 stats.gc.threads = par_n_threads;
298
299 uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
300
301 // allocated since the last GC
302 stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
303
304 stats.gc.live_bytes = live * sizeof(W_);
305 stats.gc.large_objects_bytes = calcTotalLargeObjectsW() * sizeof(W_);
306 stats.gc.compact_bytes = calcTotalCompactW() * sizeof(W_);
307 stats.gc.slop_bytes = slop * sizeof(W_);
308 stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
309 stats.gc.copied_bytes = copied * sizeof(W_);
310 stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
311 stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);
312
313 // -------------------------------------------------
314 // Update the cumulative stats
315
316 stats.gcs++;
317 stats.allocated_bytes = tot_alloc_bytes;
318 stats.max_mem_in_use_bytes = peak_mblocks_allocated * MBLOCK_SIZE;
319
320 GC_coll_cpu[gen] += stats.gc.cpu_ns;
321 GC_coll_elapsed[gen] += stats.gc.elapsed_ns;
322 if (GC_coll_max_pause[gen] < stats.gc.elapsed_ns) {
323 GC_coll_max_pause[gen] = stats.gc.elapsed_ns;
324 }
325
326 stats.copied_bytes += stats.gc.copied_bytes;
327 if (par_n_threads > 1) {
328 stats.par_copied_bytes += stats.gc.copied_bytes;
329 stats.cumulative_par_max_copied_bytes +=
330 stats.gc.par_max_copied_bytes;
331 stats.cumulative_par_balanced_copied_bytes +=
332 stats.gc.par_balanced_copied_bytes;
333 }
334 stats.gc_cpu_ns += stats.gc.cpu_ns;
335 stats.gc_elapsed_ns += stats.gc.elapsed_ns;
336
337 if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
338 stats.major_gcs++;
339 if (stats.gc.live_bytes > stats.max_live_bytes) {
340 stats.max_live_bytes = stats.gc.live_bytes;
341 }
342 if (stats.gc.large_objects_bytes > stats.max_large_objects_bytes) {
343 stats.max_large_objects_bytes = stats.gc.large_objects_bytes;
344 }
345 if (stats.gc.compact_bytes > stats.max_compact_bytes) {
346 stats.max_compact_bytes = stats.gc.compact_bytes;
347 }
348 if (stats.gc.slop_bytes > stats.max_slop_bytes) {
349 stats.max_slop_bytes = stats.gc.slop_bytes;
350 }
351 stats.cumulative_live_bytes += stats.gc.live_bytes;
352 }
353
354 // -------------------------------------------------
355 // Do the more expensive bits only when stats are enabled.
356
357 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
358 rtsConfig.gcDoneHook != NULL ||
359 RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs up-to-date stats.gc_cpu_ns
360 {
361 // We only update the times when stats are explicitly enabled since
362 // getProcessTimes (which may require a system call) can be expensive on
363 // some platforms.
364 Time current_cpu, current_elapsed;
365 getProcessTimes(&current_cpu, &current_elapsed);
366 stats.cpu_ns = current_cpu - start_init_cpu;
367 stats.elapsed_ns = current_elapsed - start_init_elapsed;
368
369 stats.gc.sync_elapsed_ns =
370 gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
371 stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
372 stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
373
374 // -------------------------------------------------
375 // Emit events to the event log
376
377 // Has to be emitted while all caps are stopped for GC, but before GC_END.
378 // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
379 // for a detailed design rationale of the current setup
380 // of GC eventlog events.
381 traceEventGcGlobalSync(cap);
382
383 // Emitted before GC_END on all caps, which simplifies tools code.
384 traceEventGcStats(cap,
385 CAPSET_HEAP_DEFAULT,
386 stats.gc.gen,
387 stats.gc.copied_bytes,
388 stats.gc.slop_bytes,
389 /* current loss due to fragmentation */
390 (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
391 * BLOCK_SIZE,
392 par_n_threads,
393 stats.gc.par_max_copied_bytes,
394 stats.gc.copied_bytes,
395 stats.gc.par_balanced_copied_bytes);
396
397 // Post EVENT_GC_END with the same timestamp as used for stats
398 // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
399 // Here, as opposed to other places, the event is emitted on the cap
400 // that initiates the GC and external tools expect it to have the same
401 // timestamp as used in +RTS -s calculations.
402 traceEventGcEndAtT(cap, TimeToNS(stats.elapsed_ns));
403
404 if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
405 traceEventHeapLive(cap,
406 CAPSET_HEAP_DEFAULT,
407 stats.gc.live_bytes);
408 }
409
410 // -------------------------------------------------
411 // Print GC stats to stdout or a file (+RTS -S/-s)
412
413 if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
414 W_ faults = getPageFaults();
415
416 statsPrintf("%9" FMT_Word64 " %9" FMT_Word64 " %9" FMT_Word64,
417 stats.gc.allocated_bytes, stats.gc.copied_bytes,
418 stats.gc.live_bytes);
419
420 statsPrintf(" %6.3f %6.3f %8.3f %8.3f %4"
421 FMT_Word " %4" FMT_Word " (Gen: %2d)\n",
422 TimeToSecondsDbl(stats.gc.cpu_ns),
423 TimeToSecondsDbl(stats.gc.elapsed_ns),
424 TimeToSecondsDbl(stats.cpu_ns),
425 TimeToSecondsDbl(stats.elapsed_ns),
426 faults - gct->gc_start_faults,
427 gct->gc_start_faults - GC_end_faults,
428 gen);
429
430 GC_end_faults = faults;
431 statsFlush();
432 }
433
434
435 if (rtsConfig.gcDoneHook != NULL) {
436 rtsConfig.gcDoneHook(&stats.gc);
437 }
438
439 traceEventHeapSize(cap,
440 CAPSET_HEAP_DEFAULT,
441 mblocks_allocated * MBLOCK_SIZE);
442 }
443 }
444
445 /* -----------------------------------------------------------------------------
446 Called at the beginning of each Retainer Profiling run
447 -------------------------------------------------------------------------- */
448 #if defined(PROFILING)
449 void
450 stat_startRP(void)
451 {
452 Time user, elapsed;
453 getProcessTimes( &user, &elapsed );
454
455 RP_start_time = user;
456 RPe_start_time = elapsed;
457 }
458 #endif /* PROFILING */
459
460 /* -----------------------------------------------------------------------------
461 Called at the end of each Retainer Profiling run
462 -------------------------------------------------------------------------- */
463
464 #if defined(PROFILING)
465 void
466 stat_endRP(
467 uint32_t retainerGeneration,
468 #if defined(DEBUG_RETAINER)
469 uint32_t maxCStackSize,
470 int maxStackSize,
471 #endif
472 double averageNumVisit)
473 {
474 Time user, elapsed;
475 getProcessTimes( &user, &elapsed );
476
477 RP_tot_time += user - RP_start_time;
478 RPe_tot_time += elapsed - RPe_start_time;
479
480 fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
481 retainerGeneration, mut_user_time_during_RP());
482 #if defined(DEBUG_RETAINER)
483 fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
484 fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
485 #endif
486 fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
487 }
488 #endif /* PROFILING */
489
490 /* -----------------------------------------------------------------------------
491 Called at the beginning of each heap census
492 -------------------------------------------------------------------------- */
493 #if defined(PROFILING)
494 void
495 stat_startHeapCensus(void)
496 {
497 Time user, elapsed;
498 getProcessTimes( &user, &elapsed );
499
500 HC_start_time = user;
501 HCe_start_time = elapsed;
502 }
503 #endif /* PROFILING */
504
505 /* -----------------------------------------------------------------------------
506 Called at the end of each heap census
507 -------------------------------------------------------------------------- */
508 #if defined(PROFILING)
509 void
510 stat_endHeapCensus(void)
511 {
512 Time user, elapsed;
513 getProcessTimes( &user, &elapsed );
514
515 HC_tot_time += user - HC_start_time;
516 HCe_tot_time += elapsed - HCe_start_time;
517 }
518 #endif /* PROFILING */
519
520 /* -----------------------------------------------------------------------------
521 Called at the end of execution
522
523 NOTE: number of allocations is not entirely accurate: it doesn't
524 take into account the few bytes at the end of the heap that
525 were left unused when the heap-check failed.
526 -------------------------------------------------------------------------- */
527
528 #if defined(DEBUG)
529 #define TICK_VAR_INI(arity) \
530 StgInt SLOW_CALLS_##arity = 1; \
531 StgInt RIGHT_ARITY_##arity = 1; \
532 StgInt TAGGED_PTR_##arity = 0;
533
534 TICK_VAR_INI(1)
535 TICK_VAR_INI(2)
536
537 StgInt TOTAL_CALLS=1;
538 #endif
539
540 /* Report the value of a counter */
541 #define REPORT(counter) \
542 { \
543 showStgWord64(counter,temp,true/*commas*/); \
544 statsPrintf(" (" #counter ") : %s\n",temp); \
545 }
546
547 /* Report the value of a counter as a percentage of another counter */
548 #define REPORT_PCT(counter,countertot) \
549 statsPrintf(" (" #counter ") %% of (" #countertot ") : %.1f%%\n", \
550 counter*100.0/countertot)
551
552 #define TICK_PRINT(arity) \
553 REPORT(SLOW_CALLS_##arity); \
554 REPORT_PCT(RIGHT_ARITY_##arity,SLOW_CALLS_##arity); \
555 REPORT_PCT(TAGGED_PTR_##arity,RIGHT_ARITY_##arity); \
556 REPORT(RIGHT_ARITY_##arity); \
557 REPORT(TAGGED_PTR_##arity)
558
559 #define TICK_PRINT_TOT(arity) \
560 statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
561 SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
562
563 STATIC_INLINE Time get_init_cpu(void) {
564 return end_init_cpu - start_init_cpu;
565 }
566
567 STATIC_INLINE Time get_init_elapsed(void) {
568 return end_init_elapsed - start_init_elapsed;
569 }
570
571
572 void
573 stat_exit (void)
574 {
575 generation *gen;
576 Time gc_cpu = 0;
577 Time gc_elapsed = 0;
578 Time init_cpu = 0;
579 Time init_elapsed = 0;
580 Time mut_cpu = 0;
581 Time mut_elapsed = 0;
582 Time exit_cpu = 0;
583 Time exit_elapsed = 0;
584 Time exit_gc_cpu = 0;
585 Time exit_gc_elapsed = 0;
586
587 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
588
589 char temp[512];
590 Time tot_cpu;
591 Time tot_elapsed;
592 uint32_t g;
593
594 getProcessTimes( &tot_cpu, &tot_elapsed );
595 tot_cpu -= start_init_cpu;
596 tot_elapsed -= start_init_elapsed;
597
598 uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
599
600 // allocated since the last GC
601 stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
602 stats.allocated_bytes = tot_alloc_bytes;
603
604 /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
605 if (tot_cpu <= 0) tot_cpu = 1;
606 if (tot_elapsed <= 0) tot_elapsed = 1;
607
608 if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
609 statsPrintf("%9" FMT_Word " %9.9s %9.9s",
610 (W_)stats.gc.allocated_bytes, "", "");
611 statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
612 }
613
614 // heapCensus() is called by the GC, so RP and HC time are
615 // included in the GC stats. We therefore subtract them to
616 // obtain the actual GC cpu time.
617 gc_cpu = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time);
618 gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time);
619
620 init_cpu = get_init_cpu();
621 init_elapsed = get_init_elapsed();
622
623 // We do a GC during the EXIT phase. We'll attribute the cost of that
624 // to GC instead of EXIT, so carefully subtract it from the EXIT time.
625 exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
626 exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
627 exit_cpu = end_exit_cpu - start_exit_cpu - exit_gc_cpu;
628 exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed;
629
630 mut_elapsed = start_exit_elapsed - end_init_elapsed -
631 (gc_elapsed - exit_gc_elapsed);
632
633 mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu)
634 - PROF_VAL(RP_tot_time + HC_tot_time);
635 if (mut_cpu < 0) { mut_cpu = 0; }
636
637 // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and
638 // subtracting, so the parts should add up to the total exactly. Note
639 // that tot_elapsed is captured a tiny bit later than end_exit_elapsed,
640 // so we don't use it here.
641 ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed
642 == end_exit_elapsed - start_init_elapsed);
643
644
645 if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
646 showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
647 statsPrintf("%16s bytes allocated in the heap\n", temp);
648
649 showStgWord64(stats.copied_bytes, temp, true/*commas*/);
650 statsPrintf("%16s bytes copied during GC\n", temp);
651
652 if ( stats.major_gcs > 0 ) {
653 showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
654 statsPrintf("%16s bytes maximum residency (%" FMT_Word32
655 " sample(s))\n",
656 temp, stats.major_gcs);
657 }
658
659 showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
660 statsPrintf("%16s bytes maximum slop\n", temp);
661
662 statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
663 FMT_SizeT " MB lost due to fragmentation)\n\n",
664 (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)),
665 (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
666
667 /* Print garbage collections in each gen */
668 statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
669 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
670 gen = &generations[g];
671 statsPrintf(" Gen %2d %5d colls, %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
672 gen->no,
673 gen->collections,
674 gen->par_collections,
675 TimeToSecondsDbl(GC_coll_cpu[g]),
676 TimeToSecondsDbl(GC_coll_elapsed[g]),
677 gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections),
678 TimeToSecondsDbl(GC_coll_max_pause[g]));
679 }
680
681 #if defined(THREADED_RTS)
682 if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
683 // See Note [Work Balance]
684 statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
685 100 * (double)stats.cumulative_par_balanced_copied_bytes /
686 (double)stats.par_copied_bytes);
687 }
688 #endif
689 statsPrintf("\n");
690
691 #if defined(THREADED_RTS)
692 statsPrintf(" TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
693 taskCount, taskCount - workerCount,
694 peakWorkerCount, workerCount,
695 n_capabilities);
696
697 statsPrintf("\n");
698
699 {
700 uint32_t i;
701 SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
702 for (i = 0; i < n_capabilities; i++) {
703 sparks.created += capabilities[i]->spark_stats.created;
704 sparks.dud += capabilities[i]->spark_stats.dud;
705 sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
706 sparks.converted += capabilities[i]->spark_stats.converted;
707 sparks.gcd += capabilities[i]->spark_stats.gcd;
708 sparks.fizzled += capabilities[i]->spark_stats.fizzled;
709 }
710
711 statsPrintf(" SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
712 sparks.created + sparks.dud + sparks.overflowed,
713 sparks.converted, sparks.overflowed, sparks.dud,
714 sparks.gcd, sparks.fizzled);
715 }
716 #endif
717
718 statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
719 TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));
720
721 statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
722 TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
723 statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
724 TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
725
726 #if defined(PROFILING)
727 statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
728 TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
729 statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
730 TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
731 #endif
732 statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
733 TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
734 statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
735 TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
736 #if !defined(THREADED_RTS)
737 statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
738 TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
739 TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
740 #endif
741
742 if (mut_cpu == 0) {
743 showStgWord64(0, temp, true/*commas*/);
744 } else {
745 showStgWord64(
746 (StgWord64)((double)stats.allocated_bytes /
747 TimeToSecondsDbl(mut_cpu)),
748 temp, true/*commas*/);
749 }
750
751 statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
752
753 statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
754 TimeToSecondsDbl(tot_cpu - gc_cpu -
755 PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100
756 / TimeToSecondsDbl(tot_cpu),
757 TimeToSecondsDbl(tot_elapsed - gc_elapsed -
758 PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
759 / TimeToSecondsDbl(tot_elapsed));
760
761 #if defined(THREADED_RTS) && defined(PROF_SPIN)
762 {
763 uint32_t g;
764
765 statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
766 statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
767 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
768 statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
769 }
770 }
771 #endif
772 }
773
774 if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
775 char *fmt;
776 if (RtsFlags.MiscFlags.machineReadable) {
777 fmt =
778 " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n"
779 " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n"
780 " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n"
781 " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n"
782 " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n"
783 " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n"
784 " ,(\"init_cpu_seconds\", \"%.3f\")\n"
785 " ,(\"init_wall_seconds\", \"%.3f\")\n"
786 " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
787 " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
788 " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
789 " ,(\"GC_wall_seconds\", \"%.3f\")\n"
790 " ]\n";
791 }
792 else {
793 fmt =
794 "<<ghc: %" FMT_Word64 " bytes, "
795 "%" FMT_Word32 " GCs, "
796 "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), "
797 "%" FMT_Word64 "M in use, "
798 "%.3f INIT (%.3f elapsed), "
799 "%.3f MUT (%.3f elapsed), "
800 "%.3f GC (%.3f elapsed) :ghc>>\n";
801 }
802 /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
803 statsPrintf(fmt,
804 stats.allocated_bytes,
805 stats.gcs,
806 (uint64_t)
807 (stats.major_gcs == 0 ? 0 :
808 stats.cumulative_live_bytes/stats.major_gcs),
809 stats.max_live_bytes,
810 stats.major_gcs,
811 (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
812 TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
813 TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
814 TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
815 }
816
817 statsFlush();
818 statsClose();
819 }
820
821 if (GC_coll_cpu) {
822 stgFree(GC_coll_cpu);
823 GC_coll_cpu = NULL;
824 }
825 if (GC_coll_elapsed) {
826 stgFree(GC_coll_elapsed);
827 GC_coll_elapsed = NULL;
828 }
829 if (GC_coll_max_pause) {
830 stgFree(GC_coll_max_pause);
831 GC_coll_max_pause = NULL;
832 }
833 }
834
835 /* Note [Work Balance]
836 ----------------------
837 Work balance is a measure of how evenly the work done during parallel garbage
838 collection is spread across threads. To compute work balance we must take care
839 to account for the number of GC threads changing between GCs. The statistics we
840 track must have the number of GC threads "integrated out".
841
842 We accumulate two values from each garbage collection:
843 * par_copied: a measure of the total work done during parallel garbage
844   collection
845 * par_balanced_copied: a measure of the balanced work done during parallel
846   garbage collection.
847
848 par_copied is simple to compute, but par_balanced_copied is somewhat more
849 complicated:
850
851 For a given garbage collection:
852 Let gc_copied      := total copies during the gc
853     gc_copied_i    := copies by the ith thread during the gc
854     num_gc_threads := the number of threads participating in the gc
855     balance_limit  := (gc_copied / num_gc_threads)
856
857 If we were to graph gc_copied_i, sorted from largest to smallest, we would see
858 something like:
859
860         |X
861     ^   |X X
862     |   |X X X          X: unbalanced copies
863  copies |-----------    Y: balanced copies by the busiest GC thread
864         |Y Z Z          Z: other balanced copies
865         |Y Z Z Z
866         |Y Z Z Z Z
867         |Y Z Z Z Z Z
868         |===========
869         |1 2 3 4 5 6
870              i ->
871
872 where the --- line is at balance_limit. Balanced copies are those under the ---
873 line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
874 always equal balance_limit. A completely balanced gc has every thread copying
875 balance_limit, while a completely unbalanced gc has a single thread copying all
876 of gc_copied.
877
878 One could define par_balanced_copied as the area of the Ys and Zs in the graph
879 above; however, we would like the ratio of (par_balanced_copied / gc_copied) to
880 range from 0 to 1, so that work_balance will be a nice percentage, also ranging
881 from 0 to 1. We therefore define par_balanced_copied as:
882
883                                                          (  num_gc_threads  )
884 {Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
885    i                                                    (num_gc_threads - 1)
886                       vvv                                        vvv
887                        S                                          T
888
889 where the S and T terms serve to remove the area of the Ys and to normalize
890 the result to lie between 0 and gc_copied.
891
892 Note that the implementation orders these operations differently to minimize
893 error due to integer rounding.
894
895 Then cumulative work balance is computed as
896   (cumulative_par_balanced_copied_bytes / par_copied_bytes)
897
898 Previously, cumulative work balance was computed as:
899
900       (cumulative_par_max_copied_bytes)
901       (-------------------------------) - 1
902       (      par_copied_bytes         )
903       -------------------------------------
904                (n_capabilities - 1)
905
906 This was less accurate than the current method, and invalid whenever a garbage
907 collection had occurred with num_gc_threads /= n_capabilities, which can happen
908 when setNumCapabilities is called, when -qn is passed as an RTS option, or when
909 the number of gc threads is limited to the number of cores.
910 See #13830
911 */
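
/* The function below is a minimal sketch, in floating point for clarity, of
   the par_balanced_copied computation described in the Note above. It is NOT
   the RTS's actual implementation: the real code works on integer byte counts
   and, as the Note says, orders the operations differently to minimize
   rounding error. The function name is illustrative only. Worked examples for
   num_gc_threads = 2:
     gc_copied_i = {10, 0}  (serial)     =>  0   (work balance   0%)
     gc_copied_i = { 9, 1}  (unbalanced) =>  2   (work balance  20%)
     gc_copied_i = { 5, 5}  (perfect)    => 10   (work balance 100%)
*/
#if 0
static double
exampleParBalancedCopied (const double *gc_copied_i, uint32_t num_gc_threads)
{
    uint32_t i;
    double gc_copied = 0;
    double balance_limit, s;

    // Only parallel collections contribute to these stats (see stat_endGC),
    // so num_gc_threads > 1 is assumed here.
    for (i = 0; i < num_gc_threads; i++) {
        gc_copied += gc_copied_i[i];
    }

    // balance_limit: what each thread would copy in a perfectly balanced gc
    balance_limit = gc_copied / num_gc_threads;

    // S: sum the balanced copies, then remove the area of the Ys, which
    // always equals balance_limit.
    s = -balance_limit;
    for (i = 0; i < num_gc_threads; i++) {
        s += gc_copied_i[i] < balance_limit ? gc_copied_i[i] : balance_limit;
    }

    // T: rescale so the result ranges from 0 (serial) to gc_copied (perfect).
    return s * num_gc_threads / (num_gc_threads - 1);
}
#endif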
912
913 /* -----------------------------------------------------------------------------
914 statDescribeGens
915
916 Produce some detailed info on the state of the generational GC.
917 -------------------------------------------------------------------------- */
918 void
919 statDescribeGens(void)
920 {
921 uint32_t g, mut, lge, compacts, i;
922 W_ gen_slop;
923 W_ tot_live, tot_slop;
924 W_ gen_live, gen_blocks;
925 bdescr *bd;
926 generation *gen;
927
928 debugBelch(
929 "----------------------------------------------------------------------\n"
930 " Gen Max Mut-list Blocks Large Compacts Live Slop\n"
931 " Blocks Bytes Objects \n"
932 "----------------------------------------------------------------------\n");
933
934 tot_live = 0;
935 tot_slop = 0;
936
937 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
938 gen = &generations[g];
939
940 for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
941 lge++;
942 }
943
944 for (bd = gen->compact_objects, compacts = 0; bd; bd = bd->link) {
945 compacts++;
946 }
947
948 gen_live = genLiveWords(gen);
949 gen_blocks = genLiveBlocks(gen);
950
951 mut = 0;
952 for (i = 0; i < n_capabilities; i++) {
953 mut += countOccupied(capabilities[i]->mut_lists[g]);
954
955 // Add the pinned object block.
956 bd = capabilities[i]->pinned_object_block;
957 if (bd != NULL) {
958 gen_live += bd->free - bd->start;
959 gen_blocks += bd->blocks;
960 }
961
962 gen_live += gcThreadLiveWords(i,g);
963 gen_blocks += gcThreadLiveBlocks(i,g);
964 }
965
966 debugBelch("%5d %7" FMT_Word " %9d", g, (W_)gen->max_blocks, mut);
967
968 gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
969
970 debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n", gen_blocks, lge, compacts,
971 gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_));
972 tot_live += gen_live;
973 tot_slop += gen_slop;
974 }
975 debugBelch("----------------------------------------------------------------------\n");
976 debugBelch("%51s%9" FMT_Word " %9" FMT_Word "\n",
977 "",tot_live*sizeof(W_),tot_slop*sizeof(W_));
978 debugBelch("----------------------------------------------------------------------\n");
979 debugBelch("\n");
980 }
981
982 /* -----------------------------------------------------------------------------
983 Stats available via a programmatic interface, so that e.g. GHCi can time
984 each compilation and expression evaluation.
985 -------------------------------------------------------------------------- */
986
987 uint64_t getAllocations( void )
988 {
989 return stats.allocated_bytes;
990 }
991
992 int getRTSStatsEnabled( void )
993 {
994 return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
995 }
996
997 void getRTSStats( RTSStats *s )
998 {
999 Time current_elapsed = 0;
1000 Time current_cpu = 0;
1001
1002 *s = stats;
1003
1004 getProcessTimes(&current_cpu, &current_elapsed);
1005 s->cpu_ns = current_cpu - end_init_cpu;
1006 s->elapsed_ns = current_elapsed - end_init_elapsed;
1007
1008 s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns -
1009 PROF_VAL(RP_tot_time + HC_tot_time);
1010 s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
1011 stats.gc_elapsed_ns;
1012 }
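
/* For illustration only, a sketch of how a client linked against the RTS
   might use this interface to time a region of code. This is hypothetical
   example code, not part of the RTS API: the helper name is made up, and it
   assumes only getRTSStats() above, debugBelch() from RtsUtils.h, and the
   TimeToSecondsDbl() macro defined at the top of this file.
*/
#if 0
static void
timeWithRTSStats (void (*action)(void))
{
    RTSStats before, after;

    getRTSStats(&before);
    action();
    getRTSStats(&after);

    // mutator_cpu_ns and gc_cpu_ns are filled in by getRTSStats() above
    debugBelch("mutator: %.3fs cpu, GC: %.3fs cpu\n",
               TimeToSecondsDbl(after.mutator_cpu_ns - before.mutator_cpu_ns),
               TimeToSecondsDbl(after.gc_cpu_ns - before.gc_cpu_ns));
}
#endif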
1013
1014 /* -----------------------------------------------------------------------------
1015 Dumping stuff in the stats file, or via the debug message interface
1016 -------------------------------------------------------------------------- */
1017
1018 void
1019 statsPrintf( char *s, ... )
1020 {
1021 FILE *sf = RtsFlags.GcFlags.statsFile;
1022 va_list ap;
1023
1024 va_start(ap,s);
1025 if (sf == NULL) {
1026 vdebugBelch(s,ap);
1027 } else {
1028 vfprintf(sf, s, ap);
1029 }
1030 va_end(ap);
1031 }
1032
1033 static void
1034 statsFlush( void )
1035 {
1036 FILE *sf = RtsFlags.GcFlags.statsFile;
1037 if (sf != NULL) {
1038 fflush(sf);
1039 }
1040 }
1041
1042 static void
1043 statsClose( void )
1044 {
1045 FILE *sf = RtsFlags.GcFlags.statsFile;
1046 if (sf != NULL) {
1047 fclose(sf);
1048 }
1049 }