rts: Add --internal-counters RTS flag and several counters
[ghc.git] / rts / sm / GC.c
1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team 1998-2008
4 *
5 * Generational garbage collector
6 *
7 * Documentation on the architecture of the Garbage Collector can be
8 * found in the online commentary:
9 *
10 * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
11 *
12 * ---------------------------------------------------------------------------*/
13
14 #include "PosixSource.h"
15 #include "Rts.h"
16 #include "HsFFI.h"
17
18 #include "GC.h"
19 #include "GCThread.h"
20 #include "GCTDecl.h" // NB. before RtsSignals.h which
21 // clobbers REG_R1 on arm/Linux
22 #include "Compact.h"
23 #include "Evac.h"
24 #include "Scav.h"
25 #include "GCUtils.h"
26 #include "MarkStack.h"
27 #include "MarkWeak.h"
28 #include "Sparks.h"
29 #include "Sweep.h"
30
31 #include "Arena.h"
32 #include "Storage.h"
33 #include "RtsUtils.h"
34 #include "Apply.h"
35 #include "Updates.h"
36 #include "Stats.h"
37 #include "Schedule.h"
38 #include "Sanity.h"
39 #include "BlockAlloc.h"
40 #include "ProfHeap.h"
41 #include "Weak.h"
42 #include "Prelude.h"
43 #include "RtsSignals.h"
44 #include "STM.h"
45 #include "Trace.h"
46 #include "RetainerProfile.h"
47 #include "LdvProfile.h"
48 #include "RaiseAsync.h"
49 #include "Stable.h"
50 #include "CheckUnload.h"
51 #include "CNF.h"
52 #include "RtsFlags.h"
53
54 #if defined(PROFILING)
55 #include "RetainerProfile.h"
56 #endif
57
58 #include <string.h> // for memset()
59 #include <unistd.h>
60
61 /* -----------------------------------------------------------------------------
62 Global variables
63 -------------------------------------------------------------------------- */
64
65 /* STATIC OBJECT LIST.
66 *
67 * During GC:
68 * We maintain a linked list of static objects that are still live.
69 * The requirements for this list are:
70 *
71 * - we need to scan the list while adding to it, in order to
72 * scavenge all the static objects (in the same way that
73 * breadth-first scavenging works for dynamic objects).
74 *
75 * - we need to be able to tell whether an object is already on
76 * the list, to break loops.
77 *
78 * Each static object has a "static link field", which we use for
79 * linking objects on to the list. We use a stack-type list, consing
80 * objects on the front as they are added (this means that the
81 * scavenge phase is depth-first, not breadth-first, but that
82 * shouldn't matter).
83 *
84 * A separate list is kept for objects that have been scavenged
85 * already - this is so that we can zero all the marks afterwards.
86 *
87 * An object is on the list if its static link field is non-zero; this
88 * means that we have to mark the end of the list with '1', not NULL.
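 * (The sentinel used for these lists in this file is
 * END_OF_STATIC_OBJECT_LIST; see init_gc_thread() below.)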
89 *
90 * Extra notes for generational GC:
91 *
92 * Each generation has a static object list associated with it. When
93 * collecting generations up to N, we treat the static object lists
94 * from generations > N as roots.
95 *
96 * We build up a static object list while collecting generations 0..N,
97 * which is then appended to the static object list of generation N+1.
98 */
99
100 /* N is the oldest generation being collected, where the generations
101 * are numbered starting at 0. A major GC (indicated by the major_gc
102 * flag) is when we're collecting all generations. We only attempt to
103 * deal with static objects and GC CAFs when doing a major GC.
104 */
105 uint32_t N;
106 bool major_gc;
107
108 /* Data used for allocation area sizing.
109 */
110 static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
111
112 /* Mut-list stats */
113 #if defined(DEBUG)
114 uint32_t mutlist_MUTVARS,
115 mutlist_MUTARRS,
116 mutlist_MVARS,
117 mutlist_TVAR,
118 mutlist_TVAR_WATCH_QUEUE,
119 mutlist_TREC_CHUNK,
120 mutlist_TREC_HEADER,
121 mutlist_ATOMIC_INVARIANT,
122 mutlist_INVARIANT_CHECK_QUEUE,
123 mutlist_OTHERS;
124 #endif
125
126 /* Thread-local data for each GC thread
127 */
128 gc_thread **gc_threads = NULL;
129
130 #if !defined(THREADED_RTS)
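// In the non-threaded RTS there is exactly one GC "thread"; its gc_thread
// structure is allocated statically here, with enough trailing space for
// the per-generation workspaces (gens[]) of up to 64 generations.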
131 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)];
132 #endif
133
134 // Number of threads running in *this* GC. Affects how many
135 // step->todos[] lists we have to look in to find work.
136 uint32_t n_gc_threads;
137
138 // For stats:
139 static long copied; // *words* copied & scavenged during this GC
140
141 #if defined(PROF_SPIN) && defined(THREADED_RTS)
142 // spin and yield counts for the quasi-SpinLock in waitForGcThreads
143 volatile StgWord64 waitForGcThreads_spin = 0;
144 volatile StgWord64 waitForGcThreads_yield = 0;
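// spin count while waiting on a WHITEHOLE-d closure during parallel GC
// (incremented in the evacuation code)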
145 volatile StgWord64 whitehole_gc_spin = 0;
146 #endif
147
148 bool work_stealing;
149
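// Which static-closure mark flag is current. The two flags are swapped on
// every major GC (see GarbageCollect below), so a static closure still
// carrying prev_static_flag has not yet been visited in this major GC.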
150 uint32_t static_flag = STATIC_FLAG_B;
151 uint32_t prev_static_flag = STATIC_FLAG_A;
152
153 DECLARE_GCT
154
155 /* -----------------------------------------------------------------------------
156 Static function declarations
157 -------------------------------------------------------------------------- */
158
159 static void mark_root (void *user, StgClosure **root);
160 static void prepare_collected_gen (generation *gen);
161 static void prepare_uncollected_gen (generation *gen);
162 static void init_gc_thread (gc_thread *t);
163 static void resize_generations (void);
164 static void resize_nursery (void);
165 static void start_gc_threads (void);
166 static void scavenge_until_all_done (void);
167 static StgWord inc_running (void);
168 static StgWord dec_running (void);
169 static void wakeup_gc_threads (uint32_t me, bool idle_cap[]);
170 static void shutdown_gc_threads (uint32_t me, bool idle_cap[]);
171 static void collect_gct_blocks (void);
172 static void collect_pinned_object_blocks (void);
173 static void heapOverflow (void);
174
175 #if defined(DEBUG)
176 static void gcCAFs (void);
177 #endif
178
179 /* -----------------------------------------------------------------------------
180 The mark stack.
181 -------------------------------------------------------------------------- */
182
183 bdescr *mark_stack_top_bd; // topmost block in the mark stack
184 bdescr *mark_stack_bd; // current block in the mark stack
185 StgPtr mark_sp; // pointer to the next unallocated mark stack entry
186
187 /* -----------------------------------------------------------------------------
188 GarbageCollect: the main entry point to the garbage collector.
189
190    The collect_gen parameter is obtained by calling calcNeeded().
191
192 Locks held: all capabilities are held throughout GarbageCollect().
193 -------------------------------------------------------------------------- */
194
195 void
196 GarbageCollect (uint32_t collect_gen,
197 bool do_heap_census,
198 uint32_t gc_type USED_IF_THREADS,
199 Capability *cap,
200 bool idle_cap[])
201 {
202 bdescr *bd;
203 generation *gen;
204 StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
205 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
206 any_work, no_work, scav_find_work;
207 #if defined(THREADED_RTS)
208 gc_thread *saved_gct;
209 #endif
210 uint32_t g, n;
211
212 // necessary if we stole a callee-saves register for gct:
213 #if defined(THREADED_RTS)
214 saved_gct = gct;
215 #endif
216
217 #if defined(PROFILING)
218 CostCentreStack *save_CCS[n_capabilities];
219 #endif
220
221 ACQUIRE_SM_LOCK;
222
223 #if defined(RTS_USER_SIGNALS)
224 if (RtsFlags.MiscFlags.install_signal_handlers) {
225 // block signals
226 blockUserSignals();
227 }
228 #endif
229
230 ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord));
231 // otherwise adjust the padding in gen_workspace.
232
233 // this is the main thread
234 SET_GCT(gc_threads[cap->no]);
235
236 // tell the stats department that we've started a GC
237 stat_startGC(cap, gct);
238
239 // lock the StablePtr table
240 stableLock();
241
242 #if defined(DEBUG)
243 mutlist_MUTVARS = 0;
244 mutlist_MUTARRS = 0;
245 mutlist_MVARS = 0;
246 mutlist_TVAR = 0;
247 mutlist_TVAR_WATCH_QUEUE = 0;
248 mutlist_TREC_CHUNK = 0;
249 mutlist_TREC_HEADER = 0;
250 mutlist_ATOMIC_INVARIANT = 0;
251 mutlist_INVARIANT_CHECK_QUEUE = 0;
252 mutlist_OTHERS = 0;
253 #endif
254
255 // attribute any costs to CCS_GC
256 #if defined(PROFILING)
257 for (n = 0; n < n_capabilities; n++) {
258 save_CCS[n] = capabilities[n]->r.rCCCS;
259 capabilities[n]->r.rCCCS = CCS_GC;
260 }
261 #endif
262
263 /* Figure out which generation to collect
264 */
265 N = collect_gen;
266 major_gc = (N == RtsFlags.GcFlags.generations-1);
267
268 if (major_gc) {
269 prev_static_flag = static_flag;
270 static_flag =
271 static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
272 }
273
274 #if defined(THREADED_RTS)
275 work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
276 N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
277 // It's not always a good idea to do load balancing in parallel
278 // GC. In particular, for a parallel program we don't want to
279 // lose locality by moving cached data into another CPU's cache
280 // (this effect can be quite significant).
281 //
282     // We could have a more complex way to determine whether to do
283 // work stealing or not, e.g. it might be a good idea to do it
284 // if the heap is big. For now, we just turn it on or off with
285 // a flag.
286 #endif
287
288 /* Start threads, so they can be spinning up while we finish initialisation.
289 */
290 start_gc_threads();
291
292 #if defined(THREADED_RTS)
293 /* How many threads will be participating in this GC?
294 * We don't try to parallelise minor GCs (unless the user asks for
295 * it with +RTS -gn0), or mark/compact/sweep GC.
296 */
297 if (gc_type == SYNC_GC_PAR) {
298 n_gc_threads = n_capabilities;
299 } else {
300 n_gc_threads = 1;
301 }
302 #else
303 n_gc_threads = 1;
304 #endif
305
306 debugTrace(DEBUG_gc, "GC (gen %d, using %d thread(s))",
307 N, n_gc_threads);
308
309 #if defined(DEBUG)
310 // check for memory leaks if DEBUG is on
311 memInventory(DEBUG_gc);
312 #endif
313
314 // do this *before* we start scavenging
315 collectFreshWeakPtrs();
316
317 // check sanity *before* GC
318 IF_DEBUG(sanity, checkSanity(false /* before GC */, major_gc));
319
320 // gather blocks allocated using allocatePinned() from each capability
321 // and put them on the g0->large_object list.
322 collect_pinned_object_blocks();
323
324 // Initialise all the generations that we're collecting.
325 for (g = 0; g <= N; g++) {
326 prepare_collected_gen(&generations[g]);
327 }
328 // Initialise all the generations that we're *not* collecting.
329 for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
330 prepare_uncollected_gen(&generations[g]);
331 }
332
333 // Prepare this gc_thread
334 init_gc_thread(gct);
335
336 /* Allocate a mark stack if we're doing a major collection.
337 */
338 if (major_gc && oldest_gen->mark) {
339 mark_stack_bd = allocBlock();
340 mark_stack_top_bd = mark_stack_bd;
341 mark_stack_bd->link = NULL;
342 mark_stack_bd->u.back = NULL;
343 mark_sp = mark_stack_bd->start;
344 } else {
345 mark_stack_bd = NULL;
346 mark_stack_top_bd = NULL;
347 mark_sp = NULL;
348 }
349
350 /* -----------------------------------------------------------------------
351 * follow all the roots that we know about:
352 */
353
354 // the main thread is running: this prevents any other threads from
355 // exiting prematurely, so we can start them now.
356 // NB. do this after the mutable lists have been saved above, otherwise
357 // the other GC threads will be writing into the old mutable lists.
358 inc_running();
359 wakeup_gc_threads(gct->thread_index, idle_cap);
360
361 traceEventGcWork(gct->cap);
362
363 // scavenge the capability-private mutable lists. This isn't part
364 // of markSomeCapabilities() because markSomeCapabilities() can only
365 // call back into the GC via mark_root() (due to the gct register
366 // variable).
367 if (n_gc_threads == 1) {
368 for (n = 0; n < n_capabilities; n++) {
369 #if defined(THREADED_RTS)
370 scavenge_capability_mut_Lists1(capabilities[n]);
371 #else
372 scavenge_capability_mut_lists(capabilities[n]);
373 #endif
374 }
375 } else {
376 scavenge_capability_mut_lists(gct->cap);
377 for (n = 0; n < n_capabilities; n++) {
378 if (idle_cap[n]) {
379 markCapability(mark_root, gct, capabilities[n],
380 true/*don't mark sparks*/);
381 scavenge_capability_mut_lists(capabilities[n]);
382 }
383 }
384 }
385
386 // follow roots from the CAF list (used by GHCi)
387 gct->evac_gen_no = 0;
388 markCAFs(mark_root, gct);
389
390 // follow all the roots that the application knows about.
391 gct->evac_gen_no = 0;
392 if (n_gc_threads == 1) {
393 for (n = 0; n < n_capabilities; n++) {
394 markCapability(mark_root, gct, capabilities[n],
395 true/*don't mark sparks*/);
396 }
397 } else {
398 markCapability(mark_root, gct, cap, true/*don't mark sparks*/);
399 }
400
401 markScheduler(mark_root, gct);
402
403 #if defined(RTS_USER_SIGNALS)
404 // mark the signal handlers (signals should be already blocked)
405 markSignalHandlers(mark_root, gct);
406 #endif
407
408 // Mark the weak pointer list, and prepare to detect dead weak pointers.
409 markWeakPtrList();
410 initWeakForGC();
411
412 // Mark the stable pointer table.
413 markStableTables(mark_root, gct);
414
415 /* -------------------------------------------------------------------------
416 * Repeatedly scavenge all the areas we know about until there's no
417 * more scavenging to be done.
418 */
419 for (;;)
420 {
421 scavenge_until_all_done();
422 // The other threads are now stopped. We might recurse back to
423 // here, but from now on this is the only thread.
424
425 // must be last... invariant is that everything is fully
426 // scavenged at this point.
427 if (traverseWeakPtrList()) { // returns true if evaced something
428 inc_running();
429 continue;
430 }
431
432 // If we get to here, there's really nothing left to do.
433 break;
434 }
435
436 shutdown_gc_threads(gct->thread_index, idle_cap);
437
438 // Now see which stable names are still alive.
439 gcStableTables();
440
441 #if defined(THREADED_RTS)
442 if (n_gc_threads == 1) {
443 for (n = 0; n < n_capabilities; n++) {
444 pruneSparkQueue(capabilities[n]);
445 }
446 } else {
447 for (n = 0; n < n_capabilities; n++) {
448 if (n == cap->no || idle_cap[n]) {
449 pruneSparkQueue(capabilities[n]);
450 }
451 }
452 }
453 #endif
454
455 #if defined(PROFILING)
456 // We call processHeapClosureForDead() on every closure destroyed during
457 // the current garbage collection, so we invoke LdvCensusForDead().
458 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
459 || RtsFlags.ProfFlags.bioSelector != NULL) {
460 RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
461 LdvCensusForDead(N);
462 ACQUIRE_SM_LOCK;
463 }
464 #endif
465
466 // NO MORE EVACUATION AFTER THIS POINT!
467
468 // Finally: compact or sweep the oldest generation.
469 if (major_gc && oldest_gen->mark) {
470 if (oldest_gen->compact)
471 compact(gct->scavenged_static_objects);
472 else
473 sweep(oldest_gen);
474 }
475
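    // Gather the per-thread counters accumulated during this GC; the totals
    // are passed to stat_endGC() below.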
476 copied = 0;
477 par_max_copied = 0;
478 par_balanced_copied = 0;
479 gc_spin_spin = 0;
480 gc_spin_yield = 0;
481 mut_spin_spin = 0;
482 mut_spin_yield = 0;
483 any_work = 0;
484 no_work = 0;
485 scav_find_work = 0;
486 {
487 uint32_t i;
488 uint64_t par_balanced_copied_acc = 0;
489 const gc_thread* thread;
490
491 for (i=0; i < n_gc_threads; i++) {
492 copied += gc_threads[i]->copied;
493 }
494 for (i=0; i < n_gc_threads; i++) {
495 thread = gc_threads[i];
496 if (n_gc_threads > 1) {
497 debugTrace(DEBUG_gc,"thread %d:", i);
498 debugTrace(DEBUG_gc," copied %ld",
499 thread->copied * sizeof(W_));
500 debugTrace(DEBUG_gc," scanned %ld",
501 thread->scanned * sizeof(W_));
502 debugTrace(DEBUG_gc," any_work %ld",
503 thread->any_work);
504 debugTrace(DEBUG_gc," no_work %ld",
505 thread->no_work);
506 debugTrace(DEBUG_gc," scav_find_work %ld",
507 thread->scav_find_work);
508
509 #if defined(THREADED_RTS) && defined(PROF_SPIN)
510 gc_spin_spin += thread->gc_spin.spin;
511 gc_spin_yield += thread->gc_spin.yield;
512 mut_spin_spin += thread->mut_spin.spin;
513 mut_spin_yield += thread->mut_spin.yield;
514 #endif
515
516 any_work += thread->any_work;
517 no_work += thread->no_work;
518 scav_find_work += thread->scav_find_work;
519
520 par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
521 par_balanced_copied_acc +=
522 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
523 }
524 }
525 if (n_gc_threads > 1) {
526 // See Note [Work Balance] for an explanation of this computation
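            // par_balanced_copied_acc is the sum over threads of
            //   min(n_gc_threads * thread_copied, copied),
            // so (acc - copied) / (n_gc_threads - 1), rounded to nearest by
            // the (n_gc_threads - 1) / 2 term, ranges from 0 (one thread
            // copied everything) up to copied (perfectly balanced copying).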
527 par_balanced_copied =
528 (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
529 (n_gc_threads - 1);
530 }
531 }
532
533 // Run through all the generations and tidy up.
534 // We're going to:
535 // - count the amount of "live" data (live_words, live_blocks)
536 // - count the amount of "copied" data in this GC (copied)
537 // - free from-space
538 // - make to-space the new from-space (set BF_EVACUATED on all blocks)
539 //
540 live_words = 0;
541 live_blocks = 0;
542
543 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
544
545 if (g == N) {
546 generations[g].collections++; // for stats
547 if (n_gc_threads > 1) generations[g].par_collections++;
548 }
549
550 // Count the mutable list as bytes "copied" for the purposes of
551 // stats. Every mutable list is copied during every GC.
552 if (g > 0) {
553 W_ mut_list_size = 0;
554 for (n = 0; n < n_capabilities; n++) {
555 mut_list_size += countOccupied(capabilities[n]->mut_lists[g]);
556 }
557 copied += mut_list_size;
558
559 debugTrace(DEBUG_gc,
560 "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d ATOMIC_INVARIANTs, %d INVARIANT_CHECK_QUEUEs, %d others)",
561 (unsigned long)(mut_list_size * sizeof(W_)),
562 mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS,
563 mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE,
564 mutlist_TREC_CHUNK, mutlist_TREC_HEADER,
565 mutlist_ATOMIC_INVARIANT,
566 mutlist_INVARIANT_CHECK_QUEUE,
567 mutlist_OTHERS);
568 }
569
570 bdescr *next, *prev;
571 gen = &generations[g];
572
573 // for generations we collected...
574 if (g <= N) {
575
576 /* free old memory and shift to-space into from-space for all
577 * the collected generations (except the allocation area). These
578              * freed blocks will probably be quickly recycled.
579 */
580 if (gen->mark)
581 {
582 // tack the new blocks on the end of the existing blocks
583 if (gen->old_blocks != NULL) {
584
585 prev = NULL;
586 for (bd = gen->old_blocks; bd != NULL; bd = next) {
587
588 next = bd->link;
589
590 if (!(bd->flags & BF_MARKED))
591 {
592 if (prev == NULL) {
593 gen->old_blocks = next;
594 } else {
595 prev->link = next;
596 }
597 freeGroup(bd);
598 gen->n_old_blocks--;
599 }
600 else
601 {
602 gen->n_words += bd->free - bd->start;
603
604 // NB. this step might not be compacted next
605 // time, so reset the BF_MARKED flags.
606 // They are set before GC if we're going to
607 // compact. (search for BF_MARKED above).
608 bd->flags &= ~BF_MARKED;
609
610 // between GCs, all blocks in the heap except
611 // for the nursery have the BF_EVACUATED flag set.
612 bd->flags |= BF_EVACUATED;
613
614 prev = bd;
615 }
616 }
617
618 if (prev != NULL) {
619 prev->link = gen->blocks;
620 gen->blocks = gen->old_blocks;
621 }
622 }
623 // add the new blocks to the block tally
624 gen->n_blocks += gen->n_old_blocks;
625 ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
626 ASSERT(countOccupied(gen->blocks) == gen->n_words);
627 }
628         else // not compacted
629 {
630 freeChain(gen->old_blocks);
631 }
632
633 gen->old_blocks = NULL;
634 gen->n_old_blocks = 0;
635
636 /* LARGE OBJECTS. The current live large objects are chained on
637 * scavenged_large, having been moved during garbage
638 * collection from large_objects. Any objects left on the
639 * large_objects list are therefore dead, so we free them here.
640 */
641 freeChain(gen->large_objects);
642 gen->large_objects = gen->scavenged_large_objects;
643 gen->n_large_blocks = gen->n_scavenged_large_blocks;
644 gen->n_large_words = countOccupied(gen->large_objects);
645 gen->n_new_large_words = 0;
646
647 /* COMPACT_NFDATA. The currently live compacts are chained
648          * to live_compact_objects, much like large objects. Any
649          * objects left on the compact_objects list are dead.
650          *
651          * We don't run a simple freeChain because we want to give the
652 * CNF module some chance to free memory that freeChain would
653 * not see (namely blocks appended to a CNF through a compactResize).
654 *
655 * See Note [Compact Normal Forms] for details.
656 */
657 for (bd = gen->compact_objects; bd; bd = next) {
658 next = bd->link;
659 compactFree(((StgCompactNFDataBlock*)bd->start)->owner);
660 }
661 gen->compact_objects = gen->live_compact_objects;
662 gen->n_compact_blocks = gen->n_live_compact_blocks;
663 }
664 else // for generations > N
665 {
666 /* For older generations, we need to append the
667 * scavenged_large_object list (i.e. large objects that have been
668 * promoted during this GC) to the large_object list for that step.
669 */
670 for (bd = gen->scavenged_large_objects; bd; bd = next) {
671 next = bd->link;
672 dbl_link_onto(bd, &gen->large_objects);
673 gen->n_large_words += bd->free - bd->start;
674 }
675
676 // And same for compacts
677 for (bd = gen->live_compact_objects; bd; bd = next) {
678 next = bd->link;
679 dbl_link_onto(bd, &gen->compact_objects);
680 }
681
682 // add the new blocks we promoted during this GC
683 gen->n_large_blocks += gen->n_scavenged_large_blocks;
684 gen->n_compact_blocks += gen->n_live_compact_blocks;
685 }
686
687 ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
688 ASSERT(countOccupied(gen->large_objects) == gen->n_large_words);
689       // We can't run the same assertions on compact objects because there
690       // is memory "the GC doesn't see" (directly), but which is still
691       // accounted for in gen->n_compact_blocks.
692
693 gen->scavenged_large_objects = NULL;
694 gen->n_scavenged_large_blocks = 0;
695 gen->live_compact_objects = NULL;
696 gen->n_live_compact_blocks = 0;
697
698 // Count "live" data
699 live_words += genLiveWords(gen);
700 live_blocks += genLiveBlocks(gen);
701
702 // add in the partial blocks in the gen_workspaces
703 {
704 uint32_t i;
705 for (i = 0; i < n_capabilities; i++) {
706 live_words += gcThreadLiveWords(i, gen->no);
707 live_blocks += gcThreadLiveBlocks(i, gen->no);
708 }
709 }
710 } // for all generations
711
712 // update the max size of older generations after a major GC
713 resize_generations();
714
715 // Free the mark stack.
716 if (mark_stack_top_bd != NULL) {
717 debugTrace(DEBUG_gc, "mark stack: %d blocks",
718 countBlocks(mark_stack_top_bd));
719 freeChain(mark_stack_top_bd);
720 }
721
722 // Free any bitmaps.
723 for (g = 0; g <= N; g++) {
724 gen = &generations[g];
725 if (gen->bitmap != NULL) {
726 freeGroup(gen->bitmap);
727 gen->bitmap = NULL;
728 }
729 }
730
731 resize_nursery();
732
733 resetNurseries();
734
735 // mark the garbage collected CAFs as dead
736 #if defined(DEBUG)
737 if (major_gc) { gcCAFs(); }
738 #endif
739
740 // Update the stable pointer hash table.
741 updateStableTables(major_gc);
742
743 // unlock the StablePtr table. Must be before scheduleFinalizers(),
744 // because a finalizer may call hs_free_fun_ptr() or
745 // hs_free_stable_ptr(), both of which access the StablePtr table.
746 stableUnlock();
747
748 // Must be after stableUnlock(), because it might free stable ptrs.
749 if (major_gc) {
750 checkUnload (gct->scavenged_static_objects);
751 }
752
753 #if defined(PROFILING)
754 // resetStaticObjectForRetainerProfiling() must be called before
755 // zeroing below.
756
757 // ToDo: fix the gct->scavenged_static_objects below
758 resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects);
759 #endif
760
761 // Start any pending finalizers. Must be after
762 // updateStableTables() and stableUnlock() (see #4221).
763 RELEASE_SM_LOCK;
764 scheduleFinalizers(cap, dead_weak_ptr_list);
765 ACQUIRE_SM_LOCK;
766
767 // check sanity after GC
768 // before resurrectThreads(), because that might overwrite some
769 // closures, which will cause problems with THREADED where we don't
770 // fill slop.
771 IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc));
772
773 // If a heap census is due, we need to do it before
774 // resurrectThreads(), for the same reason as checkSanity above:
775 // resurrectThreads() will overwrite some closures and leave slop
776 // behind.
777 if (do_heap_census) {
778 debugTrace(DEBUG_sched, "performing heap census");
779 RELEASE_SM_LOCK;
780 heapCensus(gct->gc_start_cpu);
781 ACQUIRE_SM_LOCK;
782 }
783
784 // send exceptions to any threads which were about to die
785 RELEASE_SM_LOCK;
786 resurrectThreads(resurrected_threads);
787 ACQUIRE_SM_LOCK;
788
789 if (major_gc) {
790 W_ need_prealloc, need_live, need, got;
791 uint32_t i;
792
793 need_live = 0;
794 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
795 need_live += genLiveBlocks(&generations[i]);
796 }
797 need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live);
798
799 need_prealloc = 0;
800 for (i = 0; i < n_nurseries; i++) {
801 need_prealloc += nurseries[i].n_blocks;
802 }
803 need_prealloc += RtsFlags.GcFlags.largeAllocLim;
804 need_prealloc += countAllocdBlocks(exec_block);
805 need_prealloc += arenaBlocks();
806 #if defined(PROFILING)
807 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
808 need_prealloc = retainerStackBlocks();
809 }
810 #endif
811
812 /* If the amount of data remains constant, next major GC we'll
813 * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc
814 * in order to reduce repeated deallocation and reallocation. #14702
815 */
816 need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live;
817
818 /* Also, if user set heap size, do not drop below it.
819 */
820 need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need);
821
822 /* But with a large nursery, the above estimate might exceed
823 * maxHeapSize. A large resident set size might make the OS
824 * kill this process, or swap unnecessarily. Therefore we
825 * ensure that our estimate does not exceed maxHeapSize.
826 */
827 if (RtsFlags.GcFlags.maxHeapSize != 0) {
828 need = stg_min(RtsFlags.GcFlags.maxHeapSize, need);
829 }
830
831 need = BLOCKS_TO_MBLOCKS(need);
832
833 got = mblocks_allocated;
834
835 if (got > need) {
836 returnMemoryToOS(got - need);
837 }
838 }
839
840 // extra GC trace info
841 IF_DEBUG(gc, statDescribeGens());
842
843 #if defined(DEBUG)
844 // symbol-table based profiling
845 /* heapCensus(to_blocks); */ /* ToDo */
846 #endif
847
848 // restore enclosing cost centre
849 #if defined(PROFILING)
850 for (n = 0; n < n_capabilities; n++) {
851 capabilities[n]->r.rCCCS = save_CCS[n];
852 }
853 #endif
854
855 #if defined(DEBUG)
856 // check for memory leaks if DEBUG is on
857 memInventory(DEBUG_gc);
858 #endif
859
860 // ok, GC over: tell the stats department what happened.
861 stat_endGC(cap, gct, live_words, copied,
862 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
863 N, n_gc_threads, par_max_copied, par_balanced_copied,
864 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
865 any_work, no_work, scav_find_work);
866
867 #if defined(RTS_USER_SIGNALS)
868 if (RtsFlags.MiscFlags.install_signal_handlers) {
869 // unblock signals again
870 unblockUserSignals();
871 }
872 #endif
873
874 RELEASE_SM_LOCK;
875
876 SET_GCT(saved_gct);
877 }
878
879 /* -----------------------------------------------------------------------------
880 Heap overflow is indicated by setting a flag that the caller of
881 GarbageCollect can check. (not ideal, TODO: better)
882 -------------------------------------------------------------------------- */
883
884 static void heapOverflow(void)
885 {
886 heap_overflow = true;
887 }
888
889 /* -----------------------------------------------------------------------------
890 Initialise the gc_thread structures.
891 -------------------------------------------------------------------------- */
892
893 static void
894 new_gc_thread (uint32_t n, gc_thread *t)
895 {
896 uint32_t g;
897 gen_workspace *ws;
898
899 t->cap = capabilities[n];
900
901 #if defined(THREADED_RTS)
902 t->id = 0;
903 initSpinLock(&t->gc_spin);
904 initSpinLock(&t->mut_spin);
905 ACQUIRE_SPIN_LOCK(&t->gc_spin);
906 ACQUIRE_SPIN_LOCK(&t->mut_spin);
907     t->wakeup = GC_THREAD_INACTIVE;  // starts inactive, so we can wait for the
908 // thread to start up, see wakeup_gc_threads
909 #endif
910
911 t->thread_index = n;
912 t->free_blocks = NULL;
913 t->gc_count = 0;
914
915 init_gc_thread(t);
916
917 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
918 {
919 ws = &t->gens[g];
920 ws->gen = &generations[g];
921 ASSERT(g == ws->gen->no);
922 ws->my_gct = t;
923
924 // We want to call
925 // alloc_todo_block(ws,0);
926 // but can't, because it uses gct which isn't set up at this point.
927 // Hence, allocate a block for todo_bd manually:
928 {
929 bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
930 // no lock, locks aren't initialised yet
931 initBdescr(bd, ws->gen, ws->gen->to);
932 bd->flags = BF_EVACUATED;
933 bd->u.scan = bd->free = bd->start;
934
935 ws->todo_bd = bd;
936 ws->todo_free = bd->free;
937 ws->todo_lim = bd->start + BLOCK_SIZE_W;
938 }
939
940 ws->todo_q = newWSDeque(128);
941 ws->todo_overflow = NULL;
942 ws->n_todo_overflow = 0;
943 ws->todo_large_objects = NULL;
944
945 ws->part_list = NULL;
946 ws->n_part_blocks = 0;
947 ws->n_part_words = 0;
948
949 ws->scavd_list = NULL;
950 ws->n_scavd_blocks = 0;
951 ws->n_scavd_words = 0;
952 }
953 }
954
955
956 void
957 initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
958 {
959 #if defined(THREADED_RTS)
960 uint32_t i;
961
962 if (from > 0) {
963 gc_threads = stgReallocBytes (gc_threads, to * sizeof(gc_thread*),
964 "initGcThreads");
965 } else {
966 gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
967 "initGcThreads");
968 }
969
970 for (i = from; i < to; i++) {
971 gc_threads[i] =
972 stgMallocBytes(sizeof(gc_thread) +
973 RtsFlags.GcFlags.generations * sizeof(gen_workspace),
974 "alloc_gc_threads");
975
976 new_gc_thread(i, gc_threads[i]);
977 }
978 #else
979 ASSERT(from == 0 && to == 1);
980 gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads");
981 gc_threads[0] = gct;
982 new_gc_thread(0,gc_threads[0]);
983 #endif
984 }
985
986 void
987 freeGcThreads (void)
988 {
989 uint32_t g;
990 if (gc_threads != NULL) {
991 #if defined(THREADED_RTS)
992 uint32_t i;
993 for (i = 0; i < n_capabilities; i++) {
994 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
995 {
996 freeWSDeque(gc_threads[i]->gens[g].todo_q);
997 }
998 stgFree (gc_threads[i]);
999 }
1000 stgFree (gc_threads);
1001 #else
1002 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
1003 {
1004 freeWSDeque(gc_threads[0]->gens[g].todo_q);
1005 }
1006 stgFree (gc_threads);
1007 #endif
1008 gc_threads = NULL;
1009 }
1010 }
1011
1012 /* ----------------------------------------------------------------------------
1013 Start GC threads
1014 ------------------------------------------------------------------------- */
1015
1016 static volatile StgWord gc_running_threads;
1017
1018 static StgWord
1019 inc_running (void)
1020 {
1021 StgWord new;
1022 new = atomic_inc(&gc_running_threads, 1);
1023 ASSERT(new <= n_gc_threads);
1024 return new;
1025 }
1026
1027 static StgWord
1028 dec_running (void)
1029 {
1030 ASSERT(gc_running_threads != 0);
1031 return atomic_dec(&gc_running_threads);
1032 }
1033
1034 static bool
1035 any_work (void)
1036 {
1037 int g;
1038 gen_workspace *ws;
1039
1040 gct->any_work++;
1041
1042 write_barrier();
1043
1044 // scavenge objects in compacted generation
1045 if (mark_stack_bd != NULL && !mark_stack_empty()) {
1046 return true;
1047 }
1048
1049 // Check for global work in any gen. We don't need to check for
1050 // local work, because we have already exited scavenge_loop(),
1051 // which means there is no local work for this thread.
1052 for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) {
1053 ws = &gct->gens[g];
1054 if (ws->todo_large_objects) return true;
1055 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1056 if (ws->todo_overflow) return true;
1057 }
1058
1059 #if defined(THREADED_RTS)
1060 if (work_stealing) {
1061 uint32_t n;
1062 // look for work to steal
1063 for (n = 0; n < n_gc_threads; n++) {
1064 if (n == gct->thread_index) continue;
1065 for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
1066 ws = &gc_threads[n]->gens[g];
1067 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1068 }
1069 }
1070 }
1071 #endif
1072
1073 gct->no_work++;
1074 #if defined(THREADED_RTS)
1075 yieldThread();
1076 #endif
1077
1078 return false;
1079 }
1080
1081 static void
1082 scavenge_until_all_done (void)
1083 {
1084 DEBUG_ONLY( uint32_t r );
1085
1086
1087 loop:
1088 #if defined(THREADED_RTS)
1089 if (n_gc_threads > 1) {
1090 scavenge_loop();
1091 } else {
1092 scavenge_loop1();
1093 }
1094 #else
1095 scavenge_loop();
1096 #endif
1097
1098 collect_gct_blocks();
1099
1100 // scavenge_loop() only exits when there's no work to do
1101
1102 #if defined(DEBUG)
1103 r = dec_running();
1104 #else
1105 dec_running();
1106 #endif
1107
1108 traceEventGcIdle(gct->cap);
1109
1110 debugTrace(DEBUG_gc, "%d GC threads still running", r);
1111
1112 while (gc_running_threads != 0) {
1113 // usleep(1);
1114 if (any_work()) {
1115 inc_running();
1116 traceEventGcWork(gct->cap);
1117 goto loop;
1118 }
1119 // any_work() does not remove the work from the queue, it
1120 // just checks for the presence of work. If we find any,
1121 // then we increment gc_running_threads and go back to
1122 // scavenge_loop() to perform any pending work.
1123 }
1124
1125 traceEventGcDone(gct->cap);
1126 }
1127
1128 #if defined(THREADED_RTS)
1129
1130 void
1131 gcWorkerThread (Capability *cap)
1132 {
1133 gc_thread *saved_gct;
1134
1135 // necessary if we stole a callee-saves register for gct:
1136 saved_gct = gct;
1137
1138 SET_GCT(gc_threads[cap->no]);
1139 gct->id = osThreadId();
1140
1141 // Wait until we're told to wake up
1142 RELEASE_SPIN_LOCK(&gct->mut_spin);
1143 // yieldThread();
1144 // Strangely, adding a yieldThread() here makes the CPU time
1145 // measurements more accurate on Linux, perhaps because it syncs
1146 // the CPU time across the multiple cores. Without this, CPU time
1147 // is heavily skewed towards GC rather than MUT.
1148 gct->wakeup = GC_THREAD_STANDING_BY;
1149 debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
1150 ACQUIRE_SPIN_LOCK(&gct->gc_spin);
1151
1152 init_gc_thread(gct);
1153
1154 traceEventGcWork(gct->cap);
1155
1156 // Every thread evacuates some roots.
1157 gct->evac_gen_no = 0;
1158 markCapability(mark_root, gct, cap, true/*prune sparks*/);
1159 scavenge_capability_mut_lists(cap);
1160
1161 scavenge_until_all_done();
1162
1163 #if defined(THREADED_RTS)
1164 // Now that the whole heap is marked, we discard any sparks that
1165 // were found to be unreachable. The main GC thread is currently
1166 // marking heap reachable via weak pointers, so it is
1167 // non-deterministic whether a spark will be retained if it is
1168 // only reachable via weak pointers. To fix this problem would
1169 // require another GC barrier, which is too high a price.
1170 pruneSparkQueue(cap);
1171 #endif
1172
1173 // Wait until we're told to continue
1174 RELEASE_SPIN_LOCK(&gct->gc_spin);
1175 gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
1176 debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
1177 gct->thread_index);
1178 ACQUIRE_SPIN_LOCK(&gct->mut_spin);
1179 debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
1180
1181 SET_GCT(saved_gct);
1182 }
1183
1184 #endif
1185
1186 #if defined(THREADED_RTS)
1187
1188 void
1189 waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1190 {
1191 const uint32_t n_threads = n_capabilities;
1192 const uint32_t me = cap->no;
1193 uint32_t i, j;
1194 bool retry = true;
1195 Time t0, t1, t2;
1196
1197 t0 = t1 = t2 = getProcessElapsedTime();
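    // t0: when we started waiting; t1: the last time the longGCSync
    // callback fired; t2: the current time.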
1198
1199 while(retry) {
1200 for (i=0; i < n_threads; i++) {
1201 if (i == me || idle_cap[i]) continue;
1202 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1203 prodCapability(capabilities[i], cap->running_task);
1204 }
1205 }
1206 for (j=0; j < 10; j++) {
1207 retry = false;
1208 for (i=0; i < n_threads; i++) {
1209 if (i == me || idle_cap[i]) continue;
1210 write_barrier();
1211 interruptCapability(capabilities[i]);
1212 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1213 retry = true;
1214 }
1215 }
1216 if (!retry) break;
1217 #if defined(PROF_SPIN)
1218 waitForGcThreads_yield++;
1219 #endif
1220 yieldThread();
1221 }
1222
1223 t2 = getProcessElapsedTime();
1224 if (RtsFlags.GcFlags.longGCSync != 0 &&
1225 t2 - t1 > RtsFlags.GcFlags.longGCSync) {
1226 /* call this every longGCSync of delay */
1227 rtsConfig.longGCSync(cap->no, t2 - t0);
1228 t1 = t2;
1229 }
1230 if (retry) {
1231 #if defined(PROF_SPIN)
1232 // This is a bit strange, we'll get more yields than spins.
1233 // I guess that means it's not a spin-lock at all, but these
1234 // numbers are still useful (I think).
1235 waitForGcThreads_spin++;
1236 #endif
1237 }
1238 }
1239
1240 if (RtsFlags.GcFlags.longGCSync != 0 &&
1241 t2 - t0 > RtsFlags.GcFlags.longGCSync) {
1242 rtsConfig.longGCSyncEnd(t2 - t0);
1243 }
1244 }
1245
1246 #endif // THREADED_RTS
1247
1248 static void
1249 start_gc_threads (void)
1250 {
1251 #if defined(THREADED_RTS)
1252 gc_running_threads = 0;
1253 #endif
1254 }
1255
1256 static void
1257 wakeup_gc_threads (uint32_t me USED_IF_THREADS,
1258 bool idle_cap[] USED_IF_THREADS)
1259 {
1260 #if defined(THREADED_RTS)
1261 uint32_t i;
1262
1263 if (n_gc_threads == 1) return;
1264
1265 for (i=0; i < n_gc_threads; i++) {
1266 if (i == me || idle_cap[i]) continue;
1267 inc_running();
1268 debugTrace(DEBUG_gc, "waking up gc thread %d", i);
1269 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY)
1270 barf("wakeup_gc_threads");
1271
1272 gc_threads[i]->wakeup = GC_THREAD_RUNNING;
1273 ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1274 RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1275 }
1276 #endif
1277 }
1278
1279 // After GC is complete, we must wait for all GC threads to enter the
1280 // standby state, otherwise they may still be executing inside
1281 // any_work(), and may even remain awake until the next GC starts.
1282 static void
1283 shutdown_gc_threads (uint32_t me USED_IF_THREADS,
1284 bool idle_cap[] USED_IF_THREADS)
1285 {
1286 #if defined(THREADED_RTS)
1287 uint32_t i;
1288
1289 if (n_gc_threads == 1) return;
1290
1291 for (i=0; i < n_gc_threads; i++) {
1292 if (i == me || idle_cap[i]) continue;
1293 while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) {
1294 busy_wait_nop();
1295 write_barrier();
1296 }
1297 }
1298 #endif
1299 }
1300
1301 #if defined(THREADED_RTS)
1302 void
1303 releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1304 {
1305 const uint32_t n_threads = n_capabilities;
1306 const uint32_t me = cap->no;
1307 uint32_t i;
1308 for (i=0; i < n_threads; i++) {
1309 if (i == me || idle_cap[i]) continue;
1310 if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
1311 barf("releaseGCThreads");
1312
1313 gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
1314 ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1315 RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1316 }
1317 }
1318 #endif
1319
1320 /* ----------------------------------------------------------------------------
1321 Initialise a generation that is to be collected
1322 ------------------------------------------------------------------------- */
1323
1324 static void
1325 prepare_collected_gen (generation *gen)
1326 {
1327 uint32_t i, g, n;
1328 gen_workspace *ws;
1329 bdescr *bd, *next;
1330
1331 // Throw away the current mutable list. Invariant: the mutable
1332 // list always has at least one block; this means we can avoid a
1333 // check for NULL in recordMutable().
1334 g = gen->no;
1335 if (g != 0) {
1336 for (i = 0; i < n_capabilities; i++) {
1337 freeChain(capabilities[i]->mut_lists[g]);
1338 capabilities[i]->mut_lists[g] =
1339 allocBlockOnNode(capNoToNumaNode(i));
1340 }
1341 }
1342
1343 gen = &generations[g];
1344 ASSERT(gen->no == g);
1345
1346 // we'll construct a new list of threads in this step
1347 // during GC, throw away the current list.
1348 gen->old_threads = gen->threads;
1349 gen->threads = END_TSO_QUEUE;
1350
1351 // deprecate the existing blocks
1352 gen->old_blocks = gen->blocks;
1353 gen->n_old_blocks = gen->n_blocks;
1354 gen->blocks = NULL;
1355 gen->n_blocks = 0;
1356 gen->n_words = 0;
1357 gen->live_estimate = 0;
1358
1359 // initialise the large object queues.
1360 ASSERT(gen->scavenged_large_objects == NULL);
1361 ASSERT(gen->n_scavenged_large_blocks == 0);
1362 ASSERT(gen->live_compact_objects == NULL);
1363 ASSERT(gen->n_live_compact_blocks == 0);
1364
1365 // grab all the partial blocks stashed in the gc_thread workspaces and
1366 // move them to the old_blocks list of this gen.
1367 for (n = 0; n < n_capabilities; n++) {
1368 ws = &gc_threads[n]->gens[gen->no];
1369
1370 for (bd = ws->part_list; bd != NULL; bd = next) {
1371 next = bd->link;
1372 bd->link = gen->old_blocks;
1373 gen->old_blocks = bd;
1374 gen->n_old_blocks += bd->blocks;
1375 }
1376 ws->part_list = NULL;
1377 ws->n_part_blocks = 0;
1378 ws->n_part_words = 0;
1379
1380 ASSERT(ws->scavd_list == NULL);
1381 ASSERT(ws->n_scavd_blocks == 0);
1382 ASSERT(ws->n_scavd_words == 0);
1383
1384 if (ws->todo_free != ws->todo_bd->start) {
1385 ws->todo_bd->free = ws->todo_free;
1386 ws->todo_bd->link = gen->old_blocks;
1387 gen->old_blocks = ws->todo_bd;
1388 gen->n_old_blocks += ws->todo_bd->blocks;
1389 alloc_todo_block(ws,0); // always has one block.
1390 }
1391 }
1392
1393 // mark the small objects as from-space
1394 for (bd = gen->old_blocks; bd; bd = bd->link) {
1395 bd->flags &= ~BF_EVACUATED;
1396 }
1397
1398 // mark the large objects as from-space
1399 for (bd = gen->large_objects; bd; bd = bd->link) {
1400 bd->flags &= ~BF_EVACUATED;
1401 }
1402
1403 // mark the compact objects as from-space
1404 for (bd = gen->compact_objects; bd; bd = bd->link) {
1405 bd->flags &= ~BF_EVACUATED;
1406 }
1407
1408 // for a compacted generation, we need to allocate the bitmap
1409 if (gen->mark) {
1410 StgWord bitmap_size; // in bytes
1411 bdescr *bitmap_bdescr;
1412 StgWord *bitmap;
1413
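        // One mark bit per word of the blocks being compacted, i.e.
        // BLOCK_SIZE / BITS_IN(W_) bytes of bitmap per block.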
1414 bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
1415
1416 if (bitmap_size > 0) {
1417 bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
1418 / BLOCK_SIZE);
1419 gen->bitmap = bitmap_bdescr;
1420 bitmap = bitmap_bdescr->start;
1421
1422 debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p",
1423 bitmap_size, bitmap);
1424
1425 // don't forget to fill it with zeros!
1426 memset(bitmap, 0, bitmap_size);
1427
1428 // For each block in this step, point to its bitmap from the
1429 // block descriptor.
1430 for (bd=gen->old_blocks; bd != NULL; bd = bd->link) {
1431 bd->u.bitmap = bitmap;
1432 bitmap += BLOCK_SIZE_W / BITS_IN(W_);
1433
1434 // Also at this point we set the BF_MARKED flag
1435 // for this block. The invariant is that
1436 // BF_MARKED is always unset, except during GC
1437 // when it is set on those blocks which will be
1438 // compacted.
1439 if (!(bd->flags & BF_FRAGMENTED)) {
1440 bd->flags |= BF_MARKED;
1441 }
1442
1443 // BF_SWEPT should be marked only for blocks that are being
1444 // collected in sweep()
1445 bd->flags &= ~BF_SWEPT;
1446 }
1447 }
1448 }
1449 }
1450
1451
1452 /* ----------------------------------------------------------------------------
1453 Save the mutable lists in saved_mut_lists
1454 ------------------------------------------------------------------------- */
1455
1456 static void
1457 stash_mut_list (Capability *cap, uint32_t gen_no)
1458 {
1459 cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
1460 cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
1461 }
1462
1463 /* ----------------------------------------------------------------------------
1464 Initialise a generation that is *not* to be collected
1465 ------------------------------------------------------------------------- */
1466
1467 static void
1468 prepare_uncollected_gen (generation *gen)
1469 {
1470 uint32_t i;
1471
1472
1473 ASSERT(gen->no > 0);
1474
1475 // save the current mutable lists for this generation, and
1476 // allocate a fresh block for each one. We'll traverse these
1477 // mutable lists as roots early on in the GC.
1478 for (i = 0; i < n_capabilities; i++) {
1479 stash_mut_list(capabilities[i], gen->no);
1480 }
1481
1482 ASSERT(gen->scavenged_large_objects == NULL);
1483 ASSERT(gen->n_scavenged_large_blocks == 0);
1484 }
1485
1486 /* -----------------------------------------------------------------------------
1487 Collect the completed blocks from a GC thread and attach them to
1488 the generation.
1489 -------------------------------------------------------------------------- */
1490
1491 static void
1492 collect_gct_blocks (void)
1493 {
1494 uint32_t g;
1495 gen_workspace *ws;
1496 bdescr *bd, *prev;
1497
1498 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1499 ws = &gct->gens[g];
1500
1501 // there may still be a block attached to ws->todo_bd;
1502 // leave it there to use next time.
1503
1504 if (ws->scavd_list != NULL) {
1505 ACQUIRE_SPIN_LOCK(&ws->gen->sync);
1506
1507 ASSERT(gct->scan_bd == NULL);
1508 ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
1509
1510 prev = NULL;
1511 for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
1512 prev = bd;
1513 }
1514 if (prev != NULL) {
1515 prev->link = ws->gen->blocks;
1516 ws->gen->blocks = ws->scavd_list;
1517 }
1518 ws->gen->n_blocks += ws->n_scavd_blocks;
1519 ws->gen->n_words += ws->n_scavd_words;
1520
1521 ws->scavd_list = NULL;
1522 ws->n_scavd_blocks = 0;
1523 ws->n_scavd_words = 0;
1524
1525 RELEASE_SPIN_LOCK(&ws->gen->sync);
1526 }
1527 }
1528 }
1529
1530 /* -----------------------------------------------------------------------------
1531 During mutation, any blocks that are filled by allocatePinned() are
1532 stashed on the local pinned_object_blocks list, to avoid needing to
1533 take a global lock. Here we collect those blocks from the
1534 cap->pinned_object_blocks lists and put them on the
1535    main g0->large_objects list.
1539 -------------------------------------------------------------------------- */
1540
1541 static void
1542 collect_pinned_object_blocks (void)
1543 {
1544 uint32_t n;
1545 bdescr *bd, *prev;
1546
1547 for (n = 0; n < n_capabilities; n++) {
1548 prev = NULL;
1549 for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1550 prev = bd;
1551 }
1552 if (prev != NULL) {
1553 prev->link = g0->large_objects;
1554 if (g0->large_objects != NULL) {
1555 g0->large_objects->u.back = prev;
1556 }
1557 g0->large_objects = capabilities[n]->pinned_object_blocks;
1558 capabilities[n]->pinned_object_blocks = 0;
1559 }
1560 }
1561 }
1562
1563 /* -----------------------------------------------------------------------------
1564 Initialise a gc_thread before GC
1565 -------------------------------------------------------------------------- */
1566
1567 static void
1568 init_gc_thread (gc_thread *t)
1569 {
1570 t->static_objects = END_OF_STATIC_OBJECT_LIST;
1571 t->scavenged_static_objects = END_OF_STATIC_OBJECT_LIST;
1572 t->scan_bd = NULL;
1573 t->mut_lists = t->cap->mut_lists;
1574 t->evac_gen_no = 0;
1575 t->failed_to_evac = false;
1576 t->eager_promotion = true;
1577 t->thunk_selector_depth = 0;
1578 t->copied = 0;
1579 t->scanned = 0;
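    // Work-search counters: any_work counts calls to any_work(), no_work the
    // calls that found nothing, and scav_find_work the work-search iterations
    // in the scavenging code; they are summed in GarbageCollect() and
    // reported with the --internal-counters RTS flag.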
1580 t->any_work = 0;
1581 t->no_work = 0;
1582 t->scav_find_work = 0;
1583 }
1584
1585 /* -----------------------------------------------------------------------------
1586 Function we pass to evacuate roots.
1587 -------------------------------------------------------------------------- */
1588
1589 static void
1590 mark_root(void *user USED_IF_THREADS, StgClosure **root)
1591 {
1592 // we stole a register for gct, but this function is called from
1593 // *outside* the GC where the register variable is not in effect,
1594 // so we need to save and restore it here. NB. only call
1595 // mark_root() from the main GC thread, otherwise gct will be
1596 // incorrect.
1597 #if defined(THREADED_RTS)
1598 gc_thread *saved_gct;
1599 saved_gct = gct;
1600 #endif
1601 SET_GCT(user);
1602
1603 evacuate(root);
1604
1605 SET_GCT(saved_gct);
1606 }
1607
1608 /* ----------------------------------------------------------------------------
1609 Reset the sizes of the older generations when we do a major
1610 collection.
1611
1612 CURRENT STRATEGY: make all generations except zero the same size.
1613 We have to stay within the maximum heap size, and leave a certain
1614 percentage of the maximum heap size available to allocate into.
1615 ------------------------------------------------------------------------- */
1616
1617 static void
1618 resize_generations (void)
1619 {
1620 uint32_t g;
1621
1622 if (major_gc && RtsFlags.GcFlags.generations > 1) {
1623 W_ live, size, min_alloc, words;
1624 const W_ max = RtsFlags.GcFlags.maxHeapSize;
1625 const W_ gens = RtsFlags.GcFlags.generations;
1626
1627 // live in the oldest generations
1628 if (oldest_gen->live_estimate != 0) {
1629 words = oldest_gen->live_estimate;
1630 } else {
1631 words = oldest_gen->n_words;
1632 }
1633 live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
1634 oldest_gen->n_large_blocks +
1635 oldest_gen->n_compact_blocks;
1636
1637 // default max size for all generations except zero
1638 size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
1639 RtsFlags.GcFlags.minOldGenSize);
1640
1641 if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
1642 if (max > 0) {
1643 RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
1644 } else {
1645 RtsFlags.GcFlags.heapSizeSuggestion = size;
1646 }
1647 }
1648
1649 // minimum size for generation zero
1650 min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
1651 RtsFlags.GcFlags.minAllocAreaSize
1652 * (W_)n_capabilities);
1653
1654 // Auto-enable compaction when the residency reaches a
1655 // certain percentage of the maximum heap size (default: 30%).
1656 if (RtsFlags.GcFlags.compact ||
1657 (max > 0 &&
1658 oldest_gen->n_blocks >
1659 (RtsFlags.GcFlags.compactThreshold * max) / 100)) {
1660 oldest_gen->mark = 1;
1661 oldest_gen->compact = 1;
1662 // debugBelch("compaction: on\n", live);
1663 } else {
1664 oldest_gen->mark = 0;
1665 oldest_gen->compact = 0;
1666 // debugBelch("compaction: off\n", live);
1667 }
1668
1669 if (RtsFlags.GcFlags.sweep) {
1670 oldest_gen->mark = 1;
1671 }
1672
1673 // if we're going to go over the maximum heap size, reduce the
1674 // size of the generations accordingly. The calculation is
1675 // different if compaction is turned on, because we don't need
1676 // to double the space required to collect the old generation.
1677 if (max != 0) {
1678
1679 // this test is necessary to ensure that the calculations
1680 // below don't have any negative results - we're working
1681 // with unsigned values here.
1682 if (max < min_alloc) {
1683 heapOverflow();
1684 }
1685
1686 if (oldest_gen->compact) {
1687 if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
1688 size = (max - min_alloc) / ((gens - 1) * 2 - 1);
1689 }
1690 } else {
1691 if ( (size * (gens - 1) * 2) + min_alloc > max ) {
1692 size = (max - min_alloc) / ((gens - 1) * 2);
1693 }
1694 }
1695
1696 if (size < live) {
1697 heapOverflow();
1698 }
1699 }
1700
1701 #if 0
1702 debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
1703 min_alloc, size, max);
1704 #endif
1705
1706 for (g = 0; g < gens; g++) {
1707 generations[g].max_blocks = size;
1708 }
1709 }
1710 }
1711
1712 /* -----------------------------------------------------------------------------
1713 Calculate the new size of the nursery, and resize it.
1714 -------------------------------------------------------------------------- */
1715
1716 static void
1717 resize_nursery (void)
1718 {
1719 const StgWord min_nursery =
1720 RtsFlags.GcFlags.minAllocAreaSize * (StgWord)n_capabilities;
1721
1722 if (RtsFlags.GcFlags.generations == 1)
1723 { // Two-space collector:
1724 W_ blocks;
1725
1726 /* set up a new nursery. Allocate a nursery size based on a
1727 * function of the amount of live data (by default a factor of 2)
1728 * Use the blocks from the old nursery if possible, freeing up any
1729 * left over blocks.
1730 *
1731 * If we get near the maximum heap size, then adjust our nursery
1732 * size accordingly. If the nursery is the same size as the live
1733 * data (L), then we need 3L bytes. We can reduce the size of the
1734 * nursery to bring the required memory down near 2L bytes.
1735 *
1736 * A normal 2-space collector would need 4L bytes to give the same
1737 * performance we get from 3L bytes, reducing to the same
1738 * performance at 2L bytes.
1739 */
1740 blocks = generations[0].n_blocks;
1741
1742 if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
1743 blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
1744 RtsFlags.GcFlags.maxHeapSize )
1745 {
1746 long adjusted_blocks; // signed on purpose
1747 int pc_free;
1748
1749 adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
1750
1751 debugTrace(DEBUG_gc, "near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld",
1752 RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks);
1753
1754 pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
1755             if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even be < 0 */
1756 {
1757 heapOverflow();
1758 }
1759 blocks = adjusted_blocks;
1760 }
1761 else
1762 {
1763 blocks *= RtsFlags.GcFlags.oldGenFactor;
1764 if (blocks < min_nursery)
1765 {
1766 blocks = min_nursery;
1767 }
1768 }
1769 resizeNurseries(blocks);
1770 }
1771 else // Generational collector
1772 {
1773 /*
1774 * If the user has given us a suggested heap size, adjust our
1775 * allocation area to make best use of the memory available.
1776 */
1777 if (RtsFlags.GcFlags.heapSizeSuggestion)
1778 {
1779 long blocks;
1780 StgWord needed;
1781
1782 calcNeeded(false, &needed); // approx blocks needed at next GC
1783
1784 /* Guess how much will be live in generation 0 step 0 next time.
1785 * A good approximation is obtained by finding the
1786 * percentage of g0 that was live at the last minor GC.
1787 *
1788 * We have an accurate figure for the amount of copied data in
1789 * 'copied', but we must convert this to a number of blocks, with
1790 * a small adjustment for estimated slop at the end of a block
1791 * (- 10 words).
1792 */
1793 if (N == 0)
1794 {
1795 g0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100)
1796 / countNurseryBlocks();
1797 }
1798
1799 /* Estimate a size for the allocation area based on the
1800 * information available. We might end up going slightly under
1801 * or over the suggested heap size, but we should be pretty
1802 * close on average.
1803 *
1804 * Formula: suggested - needed
1805 * ----------------------------
1806 * 1 + g0_pcnt_kept/100
1807 *
1808 * where 'needed' is the amount of memory needed at the next
1809 * collection for collecting all gens except g0.
1810 */
1811 blocks =
1812 (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
1813 (100 + (long)g0_pcnt_kept);
1814
1815 if (blocks < (long)min_nursery) {
1816 blocks = min_nursery;
1817 }
1818
1819 resizeNurseries((W_)blocks);
1820 }
1821 else
1822 {
1823 // we might have added extra blocks to the nursery, so
1824 // resize back to the original size again.
1825 resizeNurseriesFixed();
1826 }
1827 }
1828 }
1829
1830 /* -----------------------------------------------------------------------------
1831 Sanity code for CAF garbage collection.
1832
1833 With DEBUG turned on, we manage a CAF list in addition to the SRT
1834 mechanism. After GC, we run down the CAF list and blackhole any
1835 CAFs which have been garbage collected. This means we get an error
1836 whenever the program tries to enter a garbage collected CAF.
1837
1838 Any garbage collected CAFs are taken off the CAF list at the same
1839 time.
1840 -------------------------------------------------------------------------- */
1841
1842 #if defined(DEBUG)
1843
1844 static void gcCAFs(void)
1845 {
1846 StgIndStatic *p, *prev;
1847
1848 const StgInfoTable *info;
1849 uint32_t i;
1850
1851 i = 0;
1852 p = debug_caf_list;
1853 prev = NULL;
1854
1855 for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST;
1856 p = (StgIndStatic*)p->saved_info) {
1857
1858 info = get_itbl((StgClosure*)p);
1859 ASSERT(info->type == IND_STATIC);
1860
1861 if (p->static_link == NULL) {
1862 debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p);
1863 SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it
1864 if (prev == NULL) {
1865 debug_caf_list = (StgIndStatic*)p->saved_info;
1866 } else {
1867 prev->saved_info = p->saved_info;
1868 }
1869 } else {
1870 prev = p;
1871 i++;
1872 }
1873 }
1874
1875 debugTrace(DEBUG_gccafs, "%d CAFs live", i);
1876 }
1877 #endif