/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 1998-2005
 *
 * Statistics and timing-related functions.
 *
 * ---------------------------------------------------------------------------*/

#include "PosixSource.h"
#include "Rts.h"

#include "RtsFlags.h"
#include "RtsUtils.h"
#include "Schedule.h"
#include "Stats.h"
#include "Profiling.h"
#include "GetTime.h"
#include "sm/Storage.h"
#include "sm/GC.h" // gc_alloc_block_sync, whitehole_gc_spin
#include "sm/GCThread.h"
#include "sm/BlockAlloc.h"

#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
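
/* For example, with the usual TIME_RESOLUTION of 10^9 (Time counted in
 * nanoseconds, matching the *_ns field names used below),
 * TimeToSecondsDbl(1500000000) evaluates to 1.5. (Illustrative value only.) */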

static Time
    start_init_cpu, start_init_elapsed,
    end_init_cpu, end_init_elapsed,
    start_exit_cpu, start_exit_elapsed,
    start_exit_gc_elapsed, start_exit_gc_cpu,
    end_exit_cpu, end_exit_elapsed;

#if defined(PROFILING)
static Time RP_start_time = 0, RP_tot_time = 0;    // retainer prof user time
static Time RPe_start_time = 0, RPe_tot_time = 0;  // retainer prof elap time

static Time HC_start_time, HC_tot_time = 0;    // heap census prof user time
static Time HCe_start_time, HCe_tot_time = 0;  // heap census prof elap time
#endif

#if defined(PROFILING)
#define PROF_VAL(x)   (x)
#else
#define PROF_VAL(x)   0
#endif

//
// All the stats!
//
// This is where we accumulate all the stats during execution, and it's also
// in a convenient form that we can copy over to a caller of getRTSStats().
//
static RTSStats stats;

static W_ GC_end_faults = 0;

static Time *GC_coll_cpu = NULL;
static Time *GC_coll_elapsed = NULL;
static Time *GC_coll_max_pause = NULL;

static void statsPrintf( char *s, ... ) GNUC3_ATTRIBUTE(format (PRINTF, 1, 2));
static void statsFlush( void );
static void statsClose( void );

/* -----------------------------------------------------------------------------
   Current elapsed time
   ------------------------------------------------------------------------- */

Time stat_getElapsedTime(void)
{
    return getProcessElapsedTime() - start_init_elapsed;
}

/* ---------------------------------------------------------------------------
   Measure the current MUT time, for profiling
   ------------------------------------------------------------------------ */

double
mut_user_time_until( Time t )
{
    return TimeToSecondsDbl(t - stats.gc_cpu_ns);
    // heapCensus() time is included in stats.gc_cpu_ns, so we don't need
    // to subtract it here.
}

double
mut_user_time( void )
{
    Time cpu;
    cpu = getProcessCPUTime();
    return mut_user_time_until(cpu);
}

#if defined(PROFILING)
/*
  mut_user_time_during_RP() returns the MUT time up to the start of the
  current retainer profiling pass, i.e. excluding GC time and the time
  spent in earlier retainer profiling passes.
*/
static double
mut_user_time_during_RP( void )
{
    return TimeToSecondsDbl(RP_start_time - stats.gc_cpu_ns - RP_tot_time);
}

#endif /* PROFILING */

/* ---------------------------------------------------------------------------
   initStats0() has no dependencies, it can be called right at the beginning
   ------------------------------------------------------------------------ */

void
initStats0(void)
{
    start_init_cpu = 0;
    start_init_elapsed = 0;
    end_init_cpu = 0;
    end_init_elapsed = 0;

    start_exit_cpu = 0;
    start_exit_elapsed = 0;
    start_exit_gc_cpu = 0;
    start_exit_gc_elapsed = 0;
    end_exit_cpu = 0;
    end_exit_elapsed = 0;

#if defined(PROFILING)
    RP_start_time = 0;
    RP_tot_time = 0;
    RPe_start_time = 0;
    RPe_tot_time = 0;

    HC_start_time = 0;
    HC_tot_time = 0;
    HCe_start_time = 0;
    HCe_tot_time = 0;
#endif

    GC_end_faults = 0;

    stats = (RTSStats) {
        .gcs = 0,
        .major_gcs = 0,
        .allocated_bytes = 0,
        .max_live_bytes = 0,
        .max_large_objects_bytes = 0,
        .max_compact_bytes = 0,
        .max_slop_bytes = 0,
        .max_mem_in_use_bytes = 0,
        .cumulative_live_bytes = 0,
        .copied_bytes = 0,
        .par_copied_bytes = 0,
        .cumulative_par_max_copied_bytes = 0,
        .cumulative_par_balanced_copied_bytes = 0,
        .mutator_cpu_ns = 0,
        .mutator_elapsed_ns = 0,
        .gc_cpu_ns = 0,
        .gc_elapsed_ns = 0,
        .cpu_ns = 0,
        .elapsed_ns = 0,
        .gc = {
            .gen = 0,
            .threads = 0,
            .allocated_bytes = 0,
            .live_bytes = 0,
            .large_objects_bytes = 0,
            .compact_bytes = 0,
            .slop_bytes = 0,
            .mem_in_use_bytes = 0,
            .copied_bytes = 0,
            .par_max_copied_bytes = 0,
            .par_balanced_copied_bytes = 0,
            .sync_elapsed_ns = 0,
            .cpu_ns = 0,
            .elapsed_ns = 0
        }
    };
}

/* ---------------------------------------------------------------------------
   initStats1() can be called after setupRtsFlags()
   ------------------------------------------------------------------------ */

void
initStats1 (void)
{
    uint32_t i;

    if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
        statsPrintf("    Alloc    Copied     Live     GC     GC      TOT      TOT  Page Flts\n");
        statsPrintf("    bytes     bytes     bytes   user   elap     user     elap\n");
    }
    GC_coll_cpu =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    GC_coll_elapsed =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    GC_coll_max_pause =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
        GC_coll_cpu[i] = 0;
        GC_coll_elapsed[i] = 0;
        GC_coll_max_pause[i] = 0;
    }
}

/* -----------------------------------------------------------------------------
   Initialisation time...
   -------------------------------------------------------------------------- */

void
stat_startInit(void)
{
    getProcessTimes(&start_init_cpu, &start_init_elapsed);
}

void
stat_endInit(void)
{
    getProcessTimes(&end_init_cpu, &end_init_elapsed);
}

/* -----------------------------------------------------------------------------
   stat_startExit and stat_endExit

   These two measure the time taken in shutdownHaskell().
   -------------------------------------------------------------------------- */

void
stat_startExit(void)
{
    getProcessTimes(&start_exit_cpu, &start_exit_elapsed);
    start_exit_gc_elapsed = stats.gc_elapsed_ns;
    start_exit_gc_cpu = stats.gc_cpu_ns;
}

void
stat_endExit(void)
{
    getProcessTimes(&end_exit_cpu, &end_exit_elapsed);
}

void
stat_startGCSync (gc_thread *gct)
{
    gct->gc_sync_start_elapsed = getProcessElapsedTime();
}

/* -----------------------------------------------------------------------------
   Called at the beginning of each GC
   -------------------------------------------------------------------------- */

void
stat_startGC (Capability *cap, gc_thread *gct)
{
    if (RtsFlags.GcFlags.ringBell) {
        debugBelch("\007");
    }

    getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);

    // Post EVENT_GC_START with the same timestamp as used for stats
    // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
    // Here, as opposed to other places, the event is emitted on the cap
    // that initiates the GC and external tools expect it to have the same
    // timestamp as used in +RTS -s calculations.
    traceEventGcStartAtT(cap,
                         TimeToNS(gct->gc_start_elapsed - start_init_elapsed));

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
    {
        gct->gc_start_faults = getPageFaults();
    }

    updateNurseriesStats();
}

/* -----------------------------------------------------------------------------
   Called at the end of each GC
   -------------------------------------------------------------------------- */

void
stat_endGC (Capability *cap, gc_thread *gct,
            W_ live, W_ copied, W_ slop, uint32_t gen,
            uint32_t par_n_threads, W_ par_max_copied,
            W_ par_balanced_copied)
{
    // -------------------------------------------------
    // Collect all the stats about this GC in stats.gc. We always do this since
    // it's relatively cheap and we need allocated_bytes to catch heap
    // overflows.

    stats.gc.gen = gen;
    stats.gc.threads = par_n_threads;

    uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);

    // allocated since the last GC
    stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;

    stats.gc.live_bytes = live * sizeof(W_);
    stats.gc.large_objects_bytes = calcTotalLargeObjectsW() * sizeof(W_);
    stats.gc.compact_bytes = calcTotalCompactW() * sizeof(W_);
    stats.gc.slop_bytes = slop * sizeof(W_);
    stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
    stats.gc.copied_bytes = copied * sizeof(W_);
    stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
    stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);

    bool stats_enabled =
        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
        rtsConfig.gcDoneHook != NULL;

    if (stats_enabled
        || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs the GC cpu time
    {
        // We only update the times when stats are explicitly enabled since
        // getProcessTimes (e.g. requiring a system call) can be expensive on
        // some platforms.
        Time current_cpu, current_elapsed;
        getProcessTimes(&current_cpu, &current_elapsed);
        stats.cpu_ns = current_cpu - start_init_cpu;
        stats.elapsed_ns = current_elapsed - start_init_elapsed;

        stats.gc.sync_elapsed_ns =
            gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
        stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
        stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
    }
    // -------------------------------------------------
    // Update the cumulative stats

    stats.gcs++;
    stats.allocated_bytes = tot_alloc_bytes;
    stats.max_mem_in_use_bytes = peak_mblocks_allocated * MBLOCK_SIZE;

    GC_coll_cpu[gen] += stats.gc.cpu_ns;
    GC_coll_elapsed[gen] += stats.gc.elapsed_ns;
    if (GC_coll_max_pause[gen] < stats.gc.elapsed_ns) {
        GC_coll_max_pause[gen] = stats.gc.elapsed_ns;
    }

    stats.copied_bytes += stats.gc.copied_bytes;
    if (par_n_threads > 1) {
        stats.par_copied_bytes += stats.gc.copied_bytes;
        stats.cumulative_par_max_copied_bytes +=
            stats.gc.par_max_copied_bytes;
        stats.cumulative_par_balanced_copied_bytes +=
            stats.gc.par_balanced_copied_bytes;
    }
    stats.gc_cpu_ns += stats.gc.cpu_ns;
    stats.gc_elapsed_ns += stats.gc.elapsed_ns;

    if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
        stats.major_gcs++;
        if (stats.gc.live_bytes > stats.max_live_bytes) {
            stats.max_live_bytes = stats.gc.live_bytes;
        }
        if (stats.gc.large_objects_bytes > stats.max_large_objects_bytes) {
            stats.max_large_objects_bytes = stats.gc.large_objects_bytes;
        }
        if (stats.gc.compact_bytes > stats.max_compact_bytes) {
            stats.max_compact_bytes = stats.gc.compact_bytes;
        }
        if (stats.gc.slop_bytes > stats.max_slop_bytes) {
            stats.max_slop_bytes = stats.gc.slop_bytes;
        }
        stats.cumulative_live_bytes += stats.gc.live_bytes;
    }

    // -------------------------------------------------
    // Do the more expensive bits only when stats are enabled.

    if (stats_enabled)
    {
        // -------------------------------------------------
        // Emit events to the event log

        // Has to be emitted while all caps stopped for GC, but before GC_END.
        // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
        // for a detailed design rationale of the current setup
        // of GC eventlog events.
        traceEventGcGlobalSync(cap);

        // Emitted before GC_END on all caps, which simplifies tools code.
        traceEventGcStats(cap,
                          CAPSET_HEAP_DEFAULT,
                          stats.gc.gen,
                          stats.gc.copied_bytes,
                          stats.gc.slop_bytes,
                          /* current loss due to fragmentation */
                          (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
                              * BLOCK_SIZE,
                          par_n_threads,
                          stats.gc.par_max_copied_bytes,
                          stats.gc.copied_bytes,
                          stats.gc.par_balanced_copied_bytes);

        // Post EVENT_GC_END with the same timestamp as used for stats
        // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
        // Here, as opposed to other places, the event is emitted on the cap
        // that initiates the GC and external tools expect it to have the same
        // timestamp as used in +RTS -s calculations.
        traceEventGcEndAtT(cap, TimeToNS(stats.elapsed_ns));

        if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
            traceEventHeapLive(cap,
                               CAPSET_HEAP_DEFAULT,
                               stats.gc.live_bytes);
        }

        // -------------------------------------------------
        // Print GC stats to stdout or a file (+RTS -S/-s)

        if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
            W_ faults = getPageFaults();

            statsPrintf("%9" FMT_Word64 " %9" FMT_Word64 " %9" FMT_Word64,
                        stats.gc.allocated_bytes, stats.gc.copied_bytes,
                        stats.gc.live_bytes);

            statsPrintf(" %6.3f %6.3f %8.3f %8.3f %4"
                        FMT_Word " %4" FMT_Word " (Gen: %2d)\n",
                        TimeToSecondsDbl(stats.gc.cpu_ns),
                        TimeToSecondsDbl(stats.gc.elapsed_ns),
                        TimeToSecondsDbl(stats.cpu_ns),
                        TimeToSecondsDbl(stats.elapsed_ns),
                        faults - gct->gc_start_faults,
                        gct->gc_start_faults - GC_end_faults,
                        gen);

            GC_end_faults = faults;
            statsFlush();
        }

        if (rtsConfig.gcDoneHook != NULL) {
            rtsConfig.gcDoneHook(&stats.gc);
        }

        traceEventHeapSize(cap,
                           CAPSET_HEAP_DEFAULT,
                           mblocks_allocated * MBLOCK_SIZE);
    }
}

/* -----------------------------------------------------------------------------
   Called at the beginning of each Retainer Profiling
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_startRP(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    RP_start_time = user;
    RPe_start_time = elapsed;
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of each Retainer Profiling
   -------------------------------------------------------------------------- */

#if defined(PROFILING)
void
stat_endRP(
    uint32_t retainerGeneration,
#if defined(DEBUG_RETAINER)
    uint32_t maxCStackSize,
    int maxStackSize,
#endif
    double averageNumVisit)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    RP_tot_time += user - RP_start_time;
    RPe_tot_time += elapsed - RPe_start_time;

    fprintf(prof_file, "Retainer Profiling: %u, at %f seconds\n",
            retainerGeneration, mut_user_time_during_RP());
#if defined(DEBUG_RETAINER)
    fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
    fprintf(prof_file, "\tMax auxiliary stack size = %d\n", maxStackSize);
#endif
    fprintf(prof_file, "\tAverage number of visits per object = %f\n",
            averageNumVisit);
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the beginning of each heap census
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_startHeapCensus(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    HC_start_time = user;
    HCe_start_time = elapsed;
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of each heap census
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_endHeapCensus(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    HC_tot_time += user - HC_start_time;
    HCe_tot_time += elapsed - HCe_start_time;
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of execution

   NOTE: number of allocations is not entirely accurate: it doesn't
   take into account the few bytes at the end of the heap that
   were left unused when the heap-check failed.
   -------------------------------------------------------------------------- */

#if defined(DEBUG)
#define TICK_VAR_INI(arity) \
  StgInt SLOW_CALLS_##arity = 1; \
  StgInt RIGHT_ARITY_##arity = 1; \
  StgInt TAGGED_PTR_##arity = 0;

TICK_VAR_INI(1)
TICK_VAR_INI(2)

StgInt TOTAL_CALLS=1;
#endif

/* Report the value of a counter */
#define REPORT(counter) \
  { \
    showStgWord64(counter,temp,true/*commas*/); \
    statsPrintf("  (" #counter ")  : %s\n",temp); \
  }

/* Report the value of a counter as a percentage of another counter */
#define REPORT_PCT(counter,countertot) \
  statsPrintf("  (" #counter ") %% of (" #countertot ") : %.1f%%\n", \
              counter*100.0/countertot)

#define TICK_PRINT(arity) \
  REPORT(SLOW_CALLS_##arity); \
  REPORT_PCT(RIGHT_ARITY_##arity,SLOW_CALLS_##arity); \
  REPORT_PCT(TAGGED_PTR_##arity,RIGHT_ARITY_##arity); \
  REPORT(RIGHT_ARITY_##arity); \
  REPORT(TAGGED_PTR_##arity)

#define TICK_PRINT_TOT(arity) \
  statsPrintf("  (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
              SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
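
/* As a concrete (illustrative) example of the macros above:
 * REPORT(TOTAL_CALLS) expands to
 *
 *   { showStgWord64(TOTAL_CALLS, temp, true);
 *     statsPrintf("  (TOTAL_CALLS)  : %s\n", temp); }
 *
 * so TICK_PRINT(1) reports the SLOW_CALLS_1 / RIGHT_ARITY_1 / TAGGED_PTR_1
 * counters with comma grouping, and TICK_PRINT_TOT(1) reports SLOW_CALLS_1
 * as a percentage of TOTAL_CALLS, e.g. (invented sample output):
 *
 *   (SLOW_CALLS_1) % of (TOTAL_CALLS) : 12.3%
 */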

STATIC_INLINE Time get_init_cpu(void) {
    return end_init_cpu - start_init_cpu;
}

STATIC_INLINE Time get_init_elapsed(void) {
    return end_init_elapsed - start_init_elapsed;
}

void
stat_exit (void)
{
    generation *gen;
    Time gc_cpu = 0;
    Time gc_elapsed = 0;
    Time init_cpu = 0;
    Time init_elapsed = 0;
    Time mut_cpu = 0;
    Time mut_elapsed = 0;
    Time exit_cpu = 0;
    Time exit_elapsed = 0;
    Time exit_gc_cpu = 0;
    Time exit_gc_elapsed = 0;

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {

        char temp[512];
        Time tot_cpu;
        Time tot_elapsed;
        uint32_t g;

        getProcessTimes( &tot_cpu, &tot_elapsed );
        tot_cpu -= start_init_cpu;
        tot_elapsed -= start_init_elapsed;

        uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);

        // allocated since the last GC
        stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
        stats.allocated_bytes = tot_alloc_bytes;

        /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
        if (tot_cpu <= 0) tot_cpu = 1;
        if (tot_elapsed <= 0) tot_elapsed = 1;

        if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
            statsPrintf("%9" FMT_Word " %9.9s %9.9s",
                        (W_)stats.gc.allocated_bytes, "", "");
            statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
        }

        // heapCensus() is called by the GC, so RP and HC time are
        // included in the GC stats. We therefore subtract them to
        // obtain the actual GC cpu time.
        gc_cpu     = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time);
        gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time);

        init_cpu     = get_init_cpu();
        init_elapsed = get_init_elapsed();

        // We do a GC during the EXIT phase. We'll attribute the cost of that
        // to GC instead of EXIT, so carefully subtract it from the EXIT time.
        exit_gc_cpu     = stats.gc_cpu_ns - start_exit_gc_cpu;
        exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
        exit_cpu     = end_exit_cpu - start_exit_cpu - exit_gc_cpu;
        exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed;

        mut_elapsed = start_exit_elapsed - end_init_elapsed -
            (gc_elapsed - exit_gc_elapsed) -
            PROF_VAL(RPe_tot_time + HCe_tot_time);

        mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu)
            - PROF_VAL(RP_tot_time + HC_tot_time);
        if (mut_cpu < 0) { mut_cpu = 0; }

        // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and
        // subtracting, so the parts should add up to the total exactly. Note
        // that tot_elapsed is captured a tiny bit later than end_exit_elapsed,
        // so we don't use it here.
        ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed
               == end_exit_elapsed - start_init_elapsed);


        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
            showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
            statsPrintf("%16s bytes allocated in the heap\n", temp);

            showStgWord64(stats.copied_bytes, temp, true/*commas*/);
            statsPrintf("%16s bytes copied during GC\n", temp);

            if ( stats.major_gcs > 0 ) {
                showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
                statsPrintf("%16s bytes maximum residency (%" FMT_Word32
                            " sample(s))\n",
                            temp, stats.major_gcs);
            }

            showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
            statsPrintf("%16s bytes maximum slop\n", temp);

            statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
                        FMT_SizeT " MB lost due to fragmentation)\n\n",
                        (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) /
                            (1024 * 1024 / sizeof(W_)),
                        (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK
                                 * BLOCK_SIZE_W
                                 - hw_alloc_blocks * BLOCK_SIZE_W) /
                            (1024 * 1024 / sizeof(W_)));

            /* Print garbage collections in each gen */
            statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
            for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
                gen = &generations[g];
                statsPrintf("  Gen %2d     %5d colls, %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
                            gen->no,
                            gen->collections,
                            gen->par_collections,
                            TimeToSecondsDbl(GC_coll_cpu[g]),
                            TimeToSecondsDbl(GC_coll_elapsed[g]),
                            gen->collections == 0 ? 0 :
                                TimeToSecondsDbl(GC_coll_elapsed[g] /
                                                 gen->collections),
                            TimeToSecondsDbl(GC_coll_max_pause[g]));
            }

#if defined(THREADED_RTS)
            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
                // See Note [Work Balance]
                statsPrintf("\n  Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
                            100 * (double)stats.cumulative_par_balanced_copied_bytes /
                                  (double)stats.par_copied_bytes);
            }
#endif
            statsPrintf("\n");

#if defined(THREADED_RTS)
            statsPrintf("  TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
                        taskCount, taskCount - workerCount,
                        peakWorkerCount, workerCount,
                        n_capabilities);

            statsPrintf("\n");

            {
                uint32_t i;
                SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
                for (i = 0; i < n_capabilities; i++) {
                    sparks.created   += capabilities[i]->spark_stats.created;
                    sparks.dud       += capabilities[i]->spark_stats.dud;
                    sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
                    sparks.converted += capabilities[i]->spark_stats.converted;
                    sparks.gcd       += capabilities[i]->spark_stats.gcd;
                    sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
                }

                statsPrintf("  SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
                            sparks.created + sparks.dud + sparks.overflowed,
                            sparks.converted, sparks.overflowed, sparks.dud,
                            sparks.gcd, sparks.fizzled);
            }
#endif

            statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));

            statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
            statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));

#if defined(PROFILING)
            statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
            statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
#endif
            statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
            statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
                        TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
#if !defined(THREADED_RTS)
            statsPrintf("  %%GC     time  %5.1f%%  (%.1f%% elapsed)\n\n",
                        TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
                        TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
#endif

            if (mut_cpu == 0) {
                showStgWord64(0, temp, true/*commas*/);
            } else {
                showStgWord64(
                    (StgWord64)((double)stats.allocated_bytes /
                                TimeToSecondsDbl(mut_cpu)),
                    temp, true/*commas*/);
            }

            statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);

            statsPrintf("  Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
                        TimeToSecondsDbl(tot_cpu - gc_cpu -
                                         PROF_VAL(RP_tot_time + HC_tot_time) -
                                         init_cpu) * 100
                            / TimeToSecondsDbl(tot_cpu),
                        TimeToSecondsDbl(tot_elapsed - gc_elapsed -
                                         PROF_VAL(RPe_tot_time + HCe_tot_time) -
                                         init_elapsed) * 100
                            / TimeToSecondsDbl(tot_elapsed));

#if defined(THREADED_RTS) && defined(PROF_SPIN)
            {
                uint32_t g;

                statsPrintf("gc_alloc_block_sync: %" FMT_Word64 "\n",
                            gc_alloc_block_sync.spin);
                statsPrintf("whitehole_gc_spin: %" FMT_Word64 "\n",
                            whitehole_gc_spin);
                for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
                    statsPrintf("gen[%d].sync: %" FMT_Word64 "\n",
                                g, generations[g].sync.spin);
                }
            }
#endif
        }

        if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
            char *fmt;
            if (RtsFlags.MiscFlags.machineReadable) {
                fmt =
                    " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n"
                    " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n"
                    " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n"
                    " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n"
                    " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n"
                    " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n"
                    " ,(\"init_cpu_seconds\", \"%.3f\")\n"
                    " ,(\"init_wall_seconds\", \"%.3f\")\n"
                    " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
                    " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
                    " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
                    " ,(\"GC_wall_seconds\", \"%.3f\")\n"
                    " ]\n";
            }
            else {
                fmt =
                    "<<ghc: %" FMT_Word64 " bytes, "
                    "%" FMT_Word32 " GCs, "
                    "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), "
                    "%" FMT_Word64 "M in use, "
                    "%.3f INIT (%.3f elapsed), "
                    "%.3f MUT (%.3f elapsed), "
                    "%.3f GC (%.3f elapsed) :ghc>>\n";
            }
            /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
            statsPrintf(fmt,
                        stats.allocated_bytes,
                        stats.gcs,
                        (uint64_t)
                            (stats.major_gcs == 0 ? 0 :
                             stats.cumulative_live_bytes/stats.major_gcs),
                        stats.max_live_bytes,
                        stats.major_gcs,
                        (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE /
                                    (1024L * 1024L)),
                        TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
                        TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
                        TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
        }

        statsFlush();
        statsClose();
    }

    if (GC_coll_cpu) {
        stgFree(GC_coll_cpu);
        GC_coll_cpu = NULL;
    }
    if (GC_coll_elapsed) {
        stgFree(GC_coll_elapsed);
        GC_coll_elapsed = NULL;
    }
    if (GC_coll_max_pause) {
        stgFree(GC_coll_max_pause);
        GC_coll_max_pause = NULL;
    }
}

/* Note [Work Balance]
----------------------
Work balance is a measure of how evenly the work done during parallel garbage
collection is spread across threads. To compute work balance we must take care
to account for the number of GC threads changing between GCs. The statistics we
track must have the number of GC threads "integrated out".

We accumulate two values from each garbage collection:
* par_copied: is a measure of the total work done during parallel garbage
  collection
* par_balanced_copied: is a measure of the balanced work done
  during parallel garbage collection.

par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more
complicated:

For a given garbage collection:
Let gc_copied := total copies during the gc
    gc_copied_i := copies by the ith thread during the gc
    num_gc_threads := the number of threads participating in the gc
    balance_limit := (gc_copied / num_gc_threads)

If we were to graph gc_copied_i, sorted from largest to smallest we would see
something like:

       |X
  ^    |X X
  |    |X X X         X: unbalanced copies
copies |-----------   Y: balanced copies by the busiest GC thread
       |Y Z Z         Z: other balanced copies
       |Y Z Z Z
       |Y Z Z Z Z
       |Y Z Z Z Z Z
       |===========
       |1 2 3 4 5 6

           i ->

where the --- line is at balance_limit. Balanced copies are those under the ---
line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
always equal balance_limit. A completely balanced gc has every thread copying
balance_limit and a completely unbalanced gc has a single thread copying
gc_copied.

One could define par_balance_copied as the areas of the Ys and Zs in the graph
above, however we would like the ratio of (par_balance_copied / gc_copied) to
range from 0 to 1, so that work_balance will be a nice percentage, also ranging
from 0 to 1. We therefore define par_balanced_copied as:

                                                         ( num_gc_threads )
{Sum[Min(gc_copied_i, balance_limit)] - balance_limit} * (-----------------)
   i                                                    (num_gc_threads - 1)
\_____________________________________________________/  \________________/
                          S                                      T

Where the S term serves to remove the area of the Ys, and the T term
normalizes the result to lie between 0 and gc_copied.

Note that the implementation orders these operations differently to minimize
error due to integer rounding.

Then cumulative work balance is computed as
  (cumulative_par_balanced_copied_bytes / par_copied_bytes)

Previously, cumulative work balance was computed as:

  (cumulative_par_max_copied_bytes)
  (-------------------------------) - 1
  (       par_copied_bytes        )
  -------------------------------------
          (n_capabilities - 1)

This was less accurate than the current method, and invalid whenever a garbage
collection had occurred with num_gc_threads /= n_capabilities; which can happen
when setNumCapabilities is called, when -qn is passed as an RTS option, or when
the number of gc threads is limited to the number of cores.
See #13830
*/
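
/*
 * The helper below is a minimal illustrative sketch of the formula in
 * Note [Work Balance]. It is not the code the RTS actually runs: the real
 * computation happens during GC, with the operations ordered differently to
 * minimize integer-rounding error. The function name and the per-thread
 * copied-bytes array are hypothetical, so the sketch is kept out of the
 * build with #if 0.
 */
#if 0
static uint64_t
sketch_par_balanced_copied (const uint64_t *gc_copied_per_thread,
                            uint32_t num_gc_threads)
{
    if (num_gc_threads <= 1) return 0; // balance is meaningless for serial GC

    uint64_t gc_copied = 0;
    for (uint32_t i = 0; i < num_gc_threads; i++) {
        gc_copied += gc_copied_per_thread[i];
    }

    uint64_t balance_limit = gc_copied / num_gc_threads;

    // Sum each thread's copies clipped to balance_limit: the area of the
    // Ys and Zs in the graph above.
    uint64_t clipped_sum = 0;
    for (uint32_t i = 0; i < num_gc_threads; i++) {
        uint64_t c = gc_copied_per_thread[i];
        clipped_sum += (c < balance_limit) ? c : balance_limit;
    }

    // Subtract the Y area (the S term) and rescale (the T term), so the
    // result runs from 0 (one thread did all the copying) up to gc_copied
    // (every thread copied exactly balance_limit).
    return (clipped_sum - balance_limit)
         * num_gc_threads / (num_gc_threads - 1);
}
#endif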

/* -----------------------------------------------------------------------------
   stat_describe_gens

   Produce some detailed info on the state of the generational GC.
   -------------------------------------------------------------------------- */
void
statDescribeGens(void)
{
    uint32_t g, mut, lge, compacts, i;
    W_ gen_slop;
    W_ tot_live, tot_slop;
    W_ gen_live, gen_blocks;
    bdescr *bd;
    generation *gen;

    debugBelch(
"----------------------------------------------------------------------\n"
"  Gen     Max  Mut-list  Blocks    Large  Compacts      Live      Slop\n"
"       Blocks     Bytes          Objects                              \n"
"----------------------------------------------------------------------\n");

    tot_live = 0;
    tot_slop = 0;

    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        gen = &generations[g];

        for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
            lge++;
        }

        for (bd = gen->compact_objects, compacts = 0; bd; bd = bd->link) {
            compacts++;
        }

        gen_live   = genLiveWords(gen);
        gen_blocks = genLiveBlocks(gen);

        mut = 0;
        for (i = 0; i < n_capabilities; i++) {
            mut += countOccupied(capabilities[i]->mut_lists[g]);

            // Add the pinned object block.
            bd = capabilities[i]->pinned_object_block;
            if (bd != NULL) {
                gen_live   += bd->free - bd->start;
                gen_blocks += bd->blocks;
            }

            gen_live   += gcThreadLiveWords(i,g);
            gen_blocks += gcThreadLiveBlocks(i,g);
        }

        debugBelch("%5d %7" FMT_Word " %9d", g, (W_)gen->max_blocks, mut);

        gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;

        debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n",
                   gen_blocks, lge, compacts,
                   gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_));
        tot_live += gen_live;
        tot_slop += gen_slop;
    }
    debugBelch("----------------------------------------------------------------------\n");
    debugBelch("%51s%9" FMT_Word " %9" FMT_Word "\n",
               "",tot_live*sizeof(W_),tot_slop*sizeof(W_));
    debugBelch("----------------------------------------------------------------------\n");
    debugBelch("\n");
}

/* -----------------------------------------------------------------------------
   Stats available via a programmatic interface, so eg. GHCi can time
   each compilation and expression evaluation.
   -------------------------------------------------------------------------- */

uint64_t getAllocations( void )
{
    return stats.allocated_bytes;
}

int getRTSStatsEnabled( void )
{
    return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
}

void getRTSStats( RTSStats *s )
{
    Time current_elapsed = 0;
    Time current_cpu = 0;

    *s = stats;

    getProcessTimes(&current_cpu, &current_elapsed);
    s->cpu_ns = current_cpu - end_init_cpu;
    s->elapsed_ns = current_elapsed - end_init_elapsed;

    s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns;
    s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
        stats.gc_elapsed_ns;
}
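
/*
 * A hypothetical caller of the interface above, for illustration only (this
 * helper is not part of the RTS, hence #if 0): sample the counters around a
 * unit of work and report the mutator CPU time and number of GCs it cost.
 * Only the public RTSStats fields copied out by getRTSStats() are used.
 */
#if 0
static void
exampleTimeWork (void (*work)(void))
{
    RTSStats before, after;

    getRTSStats(&before);
    work();
    getRTSStats(&after);

    debugBelch("mutator CPU: %.3fs, GCs: %" FMT_Word32 "\n",
               TimeToSecondsDbl(after.mutator_cpu_ns - before.mutator_cpu_ns),
               after.gcs - before.gcs);
}
#endif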

/* -----------------------------------------------------------------------------
   Dumping stuff in the stats file, or via the debug message interface
   -------------------------------------------------------------------------- */

void
statsPrintf( char *s, ... )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    va_list ap;

    va_start(ap,s);
    if (sf == NULL) {
        vdebugBelch(s,ap);
    } else {
        vfprintf(sf, s, ap);
    }
    va_end(ap);
}

static void
statsFlush( void )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    if (sf != NULL) {
        fflush(sf);
    }
}

static void
statsClose( void )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    if (sf != NULL) {
        fclose(sf);
    }
}