1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team 1998-2008
4 *
5 * Generational garbage collector
6 *
7 * Documentation on the architecture of the Garbage Collector can be
8 * found in the online commentary:
9 *
10 * https://gitlab.haskell.org/ghc/ghc/wikis/commentary/rts/storage/gc
11 *
12 * ---------------------------------------------------------------------------*/
13
14 #include "PosixSource.h"
15 #include "Rts.h"
16 #include "HsFFI.h"
17
18 #include "GC.h"
19 #include "GCThread.h"
20 #include "GCTDecl.h" // NB. before RtsSignals.h which
21 // clobbers REG_R1 on arm/Linux
22 #include "Compact.h"
23 #include "Evac.h"
24 #include "Scav.h"
25 #include "GCUtils.h"
26 #include "MarkStack.h"
27 #include "MarkWeak.h"
28 #include "Sparks.h"
29 #include "Sweep.h"
30
31 #include "Arena.h"
32 #include "Storage.h"
33 #include "RtsUtils.h"
34 #include "Apply.h"
35 #include "Updates.h"
36 #include "Stats.h"
37 #include "Schedule.h"
38 #include "Sanity.h"
39 #include "BlockAlloc.h"
40 #include "ProfHeap.h"
41 #include "Weak.h"
42 #include "Prelude.h"
43 #include "RtsSignals.h"
44 #include "STM.h"
45 #include "Trace.h"
46 #include "RetainerProfile.h"
47 #include "LdvProfile.h"
48 #include "RaiseAsync.h"
49 #include "StableName.h"
50 #include "StablePtr.h"
51 #include "CheckUnload.h"
52 #include "CNF.h"
53 #include "RtsFlags.h"
54 #include "NonMoving.h"
55
56 #if defined(PROFILING)
57 #include "RetainerProfile.h"
58 #endif
59
60 #include <string.h> // for memset()
61 #include <unistd.h>
62
63 /* -----------------------------------------------------------------------------
64 Global variables
65 -------------------------------------------------------------------------- */
66
67 /* STATIC OBJECT LIST.
68 *
69 * During GC:
70 * We maintain a linked list of static objects that are still live.
71 * The requirements for this list are:
72 *
73 * - we need to scan the list while adding to it, in order to
74 * scavenge all the static objects (in the same way that
75 * breadth-first scavenging works for dynamic objects).
76 *
77 * - we need to be able to tell whether an object is already on
78 * the list, to break loops.
79 *
80 * Each static object has a "static link field", which we use for
81 * linking objects on to the list. We use a stack-type list, consing
82 * objects on the front as they are added (this means that the
83 * scavenge phase is depth-first, not breadth-first, but that
84 * shouldn't matter).
85 *
86 * A separate list is kept for objects that have been scavenged
87 * already - this is so that we can zero all the marks afterwards.
88 *
89 * An object is on the list if its static link field is non-zero; this
90 * means that we have to mark the end of the list with '1', not NULL.
91 *
92 * Extra notes for generational GC:
93 *
94 * Each generation has a static object list associated with it. When
95 * collecting generations up to N, we treat the static object lists
96 * from generations > N as roots.
97 *
98 * We build up a static object list while collecting generations 0..N,
99 * which is then appended to the static object list of generation N+1.
100 *
101 * See also: Note [STATIC_LINK fields] in Storage.h.
102 */
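/* A minimal sketch of the consing discipline described above.  The layout
 * and names below (ExampleStatic, example_push_static) are illustrative
 * assumptions only, not the real STATIC_LINK machinery in Storage.h.
 */
#if 0
typedef struct ExampleStatic_ {
    struct ExampleStatic_ *static_link;       /* the "static link field"      */
    /* ... payload ... */
} ExampleStatic;

#define EXAMPLE_END_OF_LIST ((ExampleStatic *)1) /* end marker is 1, not NULL */

static ExampleStatic *example_static_list = EXAMPLE_END_OF_LIST;

static void example_push_static (ExampleStatic *obj)
{
    /* A zero link field means "not on any list", which also breaks loops. */
    if (obj->static_link == NULL) {
        obj->static_link = example_static_list;  /* cons onto the front */
        example_static_list = obj;
    }
}
#endif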
103
104 /* N is the oldest generation being collected, where the generations
105 * are numbered starting at 0. A major GC (indicated by the major_gc
106 * flag) is when we're collecting all generations. We only attempt to
107 * deal with static objects and GC CAFs when doing a major GC.
108 */
109 uint32_t N;
110 bool major_gc;
111
112 /* Data used for allocation area sizing.
113 */
114 static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
115
116 /* Mut-list stats */
117 #if defined(DEBUG)
118 uint32_t mutlist_MUTVARS,
119 mutlist_MUTARRS,
120 mutlist_MVARS,
121 mutlist_TVAR,
122 mutlist_TVAR_WATCH_QUEUE,
123 mutlist_TREC_CHUNK,
124 mutlist_TREC_HEADER,
125 mutlist_OTHERS;
126 #endif
127
128 /* Thread-local data for each GC thread
129 */
130 gc_thread **gc_threads = NULL;
131
132 #if !defined(THREADED_RTS)
 133 // Must be aligned to 64 bytes to meet the stated 64-byte alignment of gen_workspace
134 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
135 ATTRIBUTE_ALIGNED(64);
136 #endif
137
138 // Number of threads running in *this* GC. Affects how many
139 // step->todos[] lists we have to look in to find work.
140 uint32_t n_gc_threads;
141
142 // For stats:
143 static long copied; // *words* copied & scavenged during this GC
144
145 #if defined(PROF_SPIN) && defined(THREADED_RTS)
146 // spin and yield counts for the quasi-SpinLock in waitForGcThreads
147 volatile StgWord64 waitForGcThreads_spin = 0;
148 volatile StgWord64 waitForGcThreads_yield = 0;
149 volatile StgWord64 whitehole_gc_spin = 0;
150 #endif
151
152 bool work_stealing;
153
154 uint32_t static_flag = STATIC_FLAG_B;
155 uint32_t prev_static_flag = STATIC_FLAG_A;
156
157 DECLARE_GCT
158
159 /* -----------------------------------------------------------------------------
160 Static function declarations
161 -------------------------------------------------------------------------- */
162
163 static void mark_root (void *user, StgClosure **root);
164 static void prepare_collected_gen (generation *gen);
165 static void prepare_uncollected_gen (generation *gen);
166 static void init_gc_thread (gc_thread *t);
167 static void resize_nursery (void);
168 static void start_gc_threads (void);
169 static void scavenge_until_all_done (void);
170 static StgWord inc_running (void);
171 static StgWord dec_running (void);
172 static void wakeup_gc_threads (uint32_t me, bool idle_cap[]);
173 static void shutdown_gc_threads (uint32_t me, bool idle_cap[]);
174 static void collect_gct_blocks (void);
175 static void collect_pinned_object_blocks (void);
176 static void heapOverflow (void);
177
178 #if defined(DEBUG)
179 static void gcCAFs (void);
180 #endif
181
182 /* -----------------------------------------------------------------------------
183 The mark stack.
184 -------------------------------------------------------------------------- */
185
186 bdescr *mark_stack_top_bd; // topmost block in the mark stack
187 bdescr *mark_stack_bd; // current block in the mark stack
188 StgPtr mark_sp; // pointer to the next unallocated mark stack entry
189
190 /* -----------------------------------------------------------------------------
191 GarbageCollect: the main entry point to the garbage collector.
192
 193    The collect_gen parameter is obtained by calling calcNeeded().
194
195 Locks held: all capabilities are held throughout GarbageCollect().
196 -------------------------------------------------------------------------- */
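/* A minimal sketch of a call site (not the scheduler's actual logic), under
 * the assumption that calcNeeded() returns the oldest generation that needs
 * collecting; requestSync/releaseGCThreads handling is omitted.
 */
#if 0
static void example_trigger_gc (Capability *cap, bool force_major,
                                bool idle_cap[])
{
    uint32_t collect_gen = calcNeeded(force_major, NULL);
    GarbageCollect(collect_gen,
                   false        /* do_heap_census */,
                   SYNC_GC_SEQ  /* gc_type: single-threaded GC */,
                   cap,
                   idle_cap);
}
#endif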
197
198 void
199 GarbageCollect (uint32_t collect_gen,
200 bool do_heap_census,
201 uint32_t gc_type USED_IF_THREADS,
202 Capability *cap,
203 bool idle_cap[])
204 {
205 bdescr *bd;
206 generation *gen;
207 StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
208 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
209 any_work, no_work, scav_find_work;
210 #if defined(THREADED_RTS)
211 gc_thread *saved_gct;
212 #endif
213 uint32_t g, n;
214
215 // necessary if we stole a callee-saves register for gct:
216 #if defined(THREADED_RTS)
217 saved_gct = gct;
218 #endif
219
220 #if defined(PROFILING)
221 CostCentreStack *save_CCS[n_capabilities];
222 #endif
223
224 ACQUIRE_SM_LOCK;
225
226 #if defined(RTS_USER_SIGNALS)
227 if (RtsFlags.MiscFlags.install_signal_handlers) {
228 // block signals
229 blockUserSignals();
230 }
231 #endif
232
233 ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord));
234 // otherwise adjust the padding in gen_workspace.
235
236 // this is the main thread
237 SET_GCT(gc_threads[cap->no]);
238
239 // tell the stats department that we've started a GC
240 stat_startGC(cap, gct);
241
242 // Lock the StablePtr table. This prevents FFI calls manipulating
243 // the table from occurring during GC.
244 stablePtrLock();
245
246 #if defined(DEBUG)
247 mutlist_MUTVARS = 0;
248 mutlist_MUTARRS = 0;
249 mutlist_MVARS = 0;
250 mutlist_TVAR = 0;
251 mutlist_TVAR_WATCH_QUEUE = 0;
252 mutlist_TREC_CHUNK = 0;
253 mutlist_TREC_HEADER = 0;
254 mutlist_OTHERS = 0;
255 #endif
256
257 // attribute any costs to CCS_GC
258 #if defined(PROFILING)
259 for (n = 0; n < n_capabilities; n++) {
260 save_CCS[n] = capabilities[n]->r.rCCCS;
261 capabilities[n]->r.rCCCS = CCS_GC;
262 }
263 #endif
264
265 /* Figure out which generation to collect
266 */
267 N = collect_gen;
268 major_gc = (N == RtsFlags.GcFlags.generations-1);
269
270 /* N.B. The nonmoving collector works a bit differently. See
271 * Note [Static objects under the nonmoving collector].
272 */
273 if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
274 prev_static_flag = static_flag;
275 static_flag =
276 static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
277 }
278
279 #if defined(THREADED_RTS)
280 work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
281 N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
282 // It's not always a good idea to do load balancing in parallel
283 // GC. In particular, for a parallel program we don't want to
284 // lose locality by moving cached data into another CPU's cache
285 // (this effect can be quite significant).
286 //
287 // We could have a more complex way to determine whether to do
288 // work stealing or not, e.g. it might be a good idea to do it
289 // if the heap is big. For now, we just turn it on or off with
290 // a flag.
291 #endif
292
293 /* Start threads, so they can be spinning up while we finish initialisation.
294 */
295 start_gc_threads();
296
297 #if defined(THREADED_RTS)
298 /* How many threads will be participating in this GC?
299 * We don't try to parallelise minor GCs (unless the user asks for
300 * it with +RTS -gn0), or mark/compact/sweep GC.
301 */
302 if (gc_type == SYNC_GC_PAR) {
303 n_gc_threads = n_capabilities;
304 } else {
305 n_gc_threads = 1;
306 }
307 #else
308 n_gc_threads = 1;
309 #endif
310
311 debugTrace(DEBUG_gc, "GC (gen %d, using %d thread(s))",
312 N, n_gc_threads);
313
314 #if defined(DEBUG)
315 // check for memory leaks if DEBUG is on
316 memInventory(DEBUG_gc);
317 #endif
318
319 // do this *before* we start scavenging
320 collectFreshWeakPtrs();
321
322 // check sanity *before* GC
323 IF_DEBUG(sanity, checkSanity(false /* before GC */, major_gc));
324
325 // gather blocks allocated using allocatePinned() from each capability
 326     // and put them on the g0->large_objects list.
327 collect_pinned_object_blocks();
328
329 // Initialise all the generations that we're collecting.
330 for (g = 0; g <= N; g++) {
331 prepare_collected_gen(&generations[g]);
332 }
333 // Initialise all the generations that we're *not* collecting.
334 for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
335 prepare_uncollected_gen(&generations[g]);
336 }
337
338 // Prepare this gc_thread
339 init_gc_thread(gct);
340
341 /* Allocate a mark stack if we're doing a major collection.
342 */
343 if (major_gc && oldest_gen->mark) {
344 mark_stack_bd = allocBlock();
345 mark_stack_top_bd = mark_stack_bd;
346 mark_stack_bd->link = NULL;
347 mark_stack_bd->u.back = NULL;
348 mark_sp = mark_stack_bd->start;
349 } else {
350 mark_stack_bd = NULL;
351 mark_stack_top_bd = NULL;
352 mark_sp = NULL;
353 }
354
355 /* -----------------------------------------------------------------------
356 * follow all the roots that we know about:
357 */
358
359 // the main thread is running: this prevents any other threads from
360 // exiting prematurely, so we can start them now.
361 // NB. do this after the mutable lists have been saved above, otherwise
362 // the other GC threads will be writing into the old mutable lists.
363 inc_running();
364 wakeup_gc_threads(gct->thread_index, idle_cap);
365
366 traceEventGcWork(gct->cap);
367
368 // scavenge the capability-private mutable lists. This isn't part
369 // of markSomeCapabilities() because markSomeCapabilities() can only
370 // call back into the GC via mark_root() (due to the gct register
371 // variable).
372 if (n_gc_threads == 1) {
373 for (n = 0; n < n_capabilities; n++) {
374 #if defined(THREADED_RTS)
375 scavenge_capability_mut_Lists1(capabilities[n]);
376 #else
377 scavenge_capability_mut_lists(capabilities[n]);
378 #endif
379 }
380 } else {
381 scavenge_capability_mut_lists(gct->cap);
382 for (n = 0; n < n_capabilities; n++) {
383 if (idle_cap[n]) {
384 markCapability(mark_root, gct, capabilities[n],
385 true/*don't mark sparks*/);
386 scavenge_capability_mut_lists(capabilities[n]);
387 }
388 }
389 }
390
391 // follow roots from the CAF list (used by GHCi)
392 gct->evac_gen_no = 0;
393 markCAFs(mark_root, gct);
394
395 // follow all the roots that the application knows about.
396 gct->evac_gen_no = 0;
397 if (n_gc_threads == 1) {
398 for (n = 0; n < n_capabilities; n++) {
399 markCapability(mark_root, gct, capabilities[n],
400 true/*don't mark sparks*/);
401 }
402 } else {
403 markCapability(mark_root, gct, cap, true/*don't mark sparks*/);
404 }
405
406 markScheduler(mark_root, gct);
407
408 // Mark the weak pointer list, and prepare to detect dead weak pointers.
409 markWeakPtrList();
410 initWeakForGC();
411
412 // Mark the stable pointer table.
413 markStablePtrTable(mark_root, gct);
414
415 // Remember old stable name addresses.
416 rememberOldStableNameAddresses ();
417
418 /* -------------------------------------------------------------------------
419 * Repeatedly scavenge all the areas we know about until there's no
420 * more scavenging to be done.
421 */
422
423 StgWeak *dead_weak_ptr_list = NULL;
424 StgTSO *resurrected_threads = END_TSO_QUEUE;
425
426 for (;;)
427 {
428 scavenge_until_all_done();
429
430 // The other threads are now stopped. We might recurse back to
431 // here, but from now on this is the only thread.
432
433 // must be last... invariant is that everything is fully
434 // scavenged at this point.
435 if (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads)) { // returns true if evaced something
436 inc_running();
437 continue;
438 }
439
440 // If we get to here, there's really nothing left to do.
441 break;
442 }
443
444 shutdown_gc_threads(gct->thread_index, idle_cap);
445
446 // Now see which stable names are still alive.
447 gcStableNameTable();
448
449 #if defined(THREADED_RTS)
450 if (n_gc_threads == 1) {
451 for (n = 0; n < n_capabilities; n++) {
452 pruneSparkQueue(capabilities[n]);
453 }
454 } else {
455 for (n = 0; n < n_capabilities; n++) {
456 if (n == cap->no || idle_cap[n]) {
457 pruneSparkQueue(capabilities[n]);
458 }
459 }
460 }
461 #endif
462
463 #if defined(PROFILING)
464 // We call processHeapClosureForDead() on every closure destroyed during
465 // the current garbage collection, so we invoke LdvCensusForDead().
466 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
467 || RtsFlags.ProfFlags.bioSelector != NULL) {
468 RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
469 LdvCensusForDead(N);
470 ACQUIRE_SM_LOCK;
471 }
472 #endif
473
474 // NO MORE EVACUATION AFTER THIS POINT!
475
476 // Finally: compact or sweep the oldest generation.
477 if (major_gc && oldest_gen->mark) {
478 if (oldest_gen->compact)
479 compact(gct->scavenged_static_objects,
480 &dead_weak_ptr_list,
481 &resurrected_threads);
482 else
483 sweep(oldest_gen);
484 }
485
486 copied = 0;
487 par_max_copied = 0;
488 par_balanced_copied = 0;
489 gc_spin_spin = 0;
490 gc_spin_yield = 0;
491 mut_spin_spin = 0;
492 mut_spin_yield = 0;
493 any_work = 0;
494 no_work = 0;
495 scav_find_work = 0;
496 {
497 uint32_t i;
498 uint64_t par_balanced_copied_acc = 0;
499 const gc_thread* thread;
500
501 for (i=0; i < n_gc_threads; i++) {
502 copied += gc_threads[i]->copied;
503 }
504 for (i=0; i < n_gc_threads; i++) {
505 thread = gc_threads[i];
506 if (n_gc_threads > 1) {
507 debugTrace(DEBUG_gc,"thread %d:", i);
508 debugTrace(DEBUG_gc," copied %ld",
509 thread->copied * sizeof(W_));
510 debugTrace(DEBUG_gc," scanned %ld",
511 thread->scanned * sizeof(W_));
512 debugTrace(DEBUG_gc," any_work %ld",
513 thread->any_work);
514 debugTrace(DEBUG_gc," no_work %ld",
515 thread->no_work);
516 debugTrace(DEBUG_gc," scav_find_work %ld",
517 thread->scav_find_work);
518
519 #if defined(THREADED_RTS) && defined(PROF_SPIN)
520 gc_spin_spin += thread->gc_spin.spin;
521 gc_spin_yield += thread->gc_spin.yield;
522 mut_spin_spin += thread->mut_spin.spin;
523 mut_spin_yield += thread->mut_spin.yield;
524 #endif
525
526 any_work += thread->any_work;
527 no_work += thread->no_work;
528 scav_find_work += thread->scav_find_work;
529
530 par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
531 par_balanced_copied_acc +=
532 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
533 }
534 }
535 if (n_gc_threads > 1) {
536 // See Note [Work Balance] for an explanation of this computation
537 par_balanced_copied =
538 (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
539 (n_gc_threads - 1);
540 }
541 }
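    // A worked instance of the computation above, with illustrative numbers:
    // suppose n_gc_threads = 2 and the per-thread copied counts are 30 and 10
    // words.  Then copied = 40, par_balanced_copied_acc
    //   = min(2*30, 40) + min(2*10, 40) = 40 + 20 = 60,
    // and par_balanced_copied = (60 - 40 + 0) / 1 = 20.  Perfectly balanced
    // work (20 and 20) would give 40 (== copied), while fully serial work
    // (40 and 0) would give 0.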
542
543 // Run through all the generations and tidy up.
544 // We're going to:
545 // - count the amount of "live" data (live_words, live_blocks)
546 // - count the amount of "copied" data in this GC (copied)
547 // - free from-space
548 // - make to-space the new from-space (set BF_EVACUATED on all blocks)
549 //
550 live_words = 0;
551 live_blocks = 0;
552
553 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
554
555 if (g == N) {
556 generations[g].collections++; // for stats
557 if (n_gc_threads > 1) generations[g].par_collections++;
558 }
559
560 // Count the mutable list as bytes "copied" for the purposes of
561 // stats. Every mutable list is copied during every GC.
562 if (g > 0) {
563 W_ mut_list_size = 0;
564 for (n = 0; n < n_capabilities; n++) {
565 mut_list_size += countOccupied(capabilities[n]->mut_lists[g]);
566 }
567 copied += mut_list_size;
568
569 debugTrace(DEBUG_gc,
570 "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)",
571 (unsigned long)(mut_list_size * sizeof(W_)),
572 mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS,
573 mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE,
574 mutlist_TREC_CHUNK, mutlist_TREC_HEADER,
575 mutlist_OTHERS);
576 }
577
578 bdescr *next, *prev;
579 gen = &generations[g];
580
581 // for generations we collected...
582 if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
583
584 /* free old memory and shift to-space into from-space for all
585 * the collected generations (except the allocation area). These
 586          * freed blocks will probably be quickly recycled.
587 */
588 if (gen->mark)
589 {
590 // tack the new blocks on the end of the existing blocks
591 if (gen->old_blocks != NULL) {
592
593 prev = NULL;
594 for (bd = gen->old_blocks; bd != NULL; bd = next) {
595
596 next = bd->link;
597
598 if (!(bd->flags & BF_MARKED))
599 {
600 if (prev == NULL) {
601 gen->old_blocks = next;
602 } else {
603 prev->link = next;
604 }
605 freeGroup(bd);
606 gen->n_old_blocks--;
607 }
608 else
609 {
610 gen->n_words += bd->free - bd->start;
611
612 // NB. this step might not be compacted next
613 // time, so reset the BF_MARKED flags.
614 // They are set before GC if we're going to
615 // compact. (search for BF_MARKED above).
616 bd->flags &= ~BF_MARKED;
617
618 // between GCs, all blocks in the heap except
619 // for the nursery have the BF_EVACUATED flag set.
620 bd->flags |= BF_EVACUATED;
621
622 prev = bd;
623 }
624 }
625
626 if (prev != NULL) {
627 prev->link = gen->blocks;
628 gen->blocks = gen->old_blocks;
629 }
630 }
631 // add the new blocks to the block tally
632 gen->n_blocks += gen->n_old_blocks;
633 ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
634 ASSERT(countOccupied(gen->blocks) == gen->n_words);
635 }
 636       else // not compacted
637 {
638 freeChain(gen->old_blocks);
639 }
640
641 gen->old_blocks = NULL;
642 gen->n_old_blocks = 0;
643
644 /* LARGE OBJECTS. The current live large objects are chained on
645 * scavenged_large, having been moved during garbage
646 * collection from large_objects. Any objects left on the
647 * large_objects list are therefore dead, so we free them here.
648 */
649 freeChain(gen->large_objects);
650 gen->large_objects = gen->scavenged_large_objects;
651 gen->n_large_blocks = gen->n_scavenged_large_blocks;
652 gen->n_large_words = countOccupied(gen->large_objects);
653 gen->n_new_large_words = 0;
654
655 /* COMPACT_NFDATA. The currently live compacts are chained
656 * to live_compact_objects, quite like large objects. And
657 * objects left on the compact_objects list are dead.
658 *
 659        * We don't run a simple freeChain because we want to give the
660 * CNF module some chance to free memory that freeChain would
661 * not see (namely blocks appended to a CNF through a compactResize).
662 *
663 * See Note [Compact Normal Forms] for details.
664 */
665 for (bd = gen->compact_objects; bd; bd = next) {
666 next = bd->link;
667 compactFree(((StgCompactNFDataBlock*)bd->start)->owner);
668 }
669 gen->compact_objects = gen->live_compact_objects;
670 gen->n_compact_blocks = gen->n_live_compact_blocks;
671 }
672 else // for generations > N
673 {
674 /* For older generations, we need to append the
675 * scavenged_large_object list (i.e. large objects that have been
676 * promoted during this GC) to the large_object list for that step.
677 */
678 for (bd = gen->scavenged_large_objects; bd; bd = next) {
679 next = bd->link;
680 dbl_link_onto(bd, &gen->large_objects);
681 gen->n_large_words += bd->free - bd->start;
682 }
683
684 // And same for compacts
685 for (bd = gen->live_compact_objects; bd; bd = next) {
686 next = bd->link;
687 dbl_link_onto(bd, &gen->compact_objects);
688 }
689
690 // add the new blocks we promoted during this GC
691 gen->n_large_blocks += gen->n_scavenged_large_blocks;
692 gen->n_compact_blocks += gen->n_live_compact_blocks;
693 }
694
695 ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
696 ASSERT(countOccupied(gen->large_objects) == gen->n_large_words);
 697         // We can't run the same assertion on compact objects, because there
 698         // is memory "the GC doesn't see" (directly) which is nevertheless
 699         // accounted for in gen->n_compact_blocks.
700
701 gen->scavenged_large_objects = NULL;
702 gen->n_scavenged_large_blocks = 0;
703 gen->live_compact_objects = NULL;
704 gen->n_live_compact_blocks = 0;
705
706 // Count "live" data
707 live_words += genLiveWords(gen);
708 live_blocks += genLiveBlocks(gen);
709
710 // add in the partial blocks in the gen_workspaces
711 {
712 uint32_t i;
713 for (i = 0; i < n_capabilities; i++) {
714 live_words += gcThreadLiveWords(i, gen->no);
715 live_blocks += gcThreadLiveBlocks(i, gen->no);
716 }
717 }
718 } // for all generations
719
720 // Mark and sweep the oldest generation.
721 // N.B. This can only happen after we've moved
722 // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects.
723 ASSERT(oldest_gen->scavenged_large_objects == NULL);
724 if (RtsFlags.GcFlags.useNonmoving && major_gc) {
 725       // All threads in non-moving heap should be found to be alive, because
726 // threads in the non-moving generation's list should live in the
727 // non-moving heap, and we consider non-moving objects alive during
728 // preparation.
729 ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
730 // For weaks, remember that we evacuated all weaks to the non-moving heap
731 // in markWeakPtrList(), and then moved the weak_ptr_list list to
732 // old_weak_ptr_list. We then moved weaks with live keys to the
733 // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in
734 // old_weak_ptr_list to dead_weak_ptr_list. So at this point
735 // old_weak_ptr_list should be empty.
736 ASSERT(oldest_gen->old_weak_ptr_list == NULL);
737
738 // we may need to take the lock to allocate mark queue blocks
739 RELEASE_SM_LOCK;
740 // dead_weak_ptr_list contains weak pointers with dead keys. Those need to
 741       // be kept alive because we'll use them in scheduleFinalizers(). Similarly,
 742       // resurrected_threads are also going to be used in resurrectThreads(),
743 // so we need to mark those too.
744 // Note that in sequential case these lists will be appended with more
745 // weaks and threads found to be dead in mark.
746 #if !defined(THREADED_RTS)
747 // In the non-threaded runtime this is the only time we push to the
748 // upd_rem_set
749 nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
750 #endif
751 nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
752 ACQUIRE_SM_LOCK;
753 }
754
755 // Update the max size of older generations after a major GC:
756 // We can't resize here in the case of the concurrent collector since we
 757   // don't yet know how much live data we have. This will instead be done
758 // once we finish marking.
759 if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving)
760 resizeGenerations();
761
762 // Free the mark stack.
763 if (mark_stack_top_bd != NULL) {
764 debugTrace(DEBUG_gc, "mark stack: %d blocks",
765 countBlocks(mark_stack_top_bd));
766 freeChain(mark_stack_top_bd);
767 }
768
769 // Free any bitmaps.
770 for (g = 0; g <= N; g++) {
771 gen = &generations[g];
772 if (gen->bitmap != NULL) {
773 freeGroup(gen->bitmap);
774 gen->bitmap = NULL;
775 }
776 }
777
778 resize_nursery();
779
780 resetNurseries();
781
782 // mark the garbage collected CAFs as dead
783 #if defined(DEBUG)
784 if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); }
785 #endif
786
787 // Update the stable name hash table
788 updateStableNameTable(major_gc);
789
790 // unlock the StablePtr table. Must be before scheduleFinalizers(),
791 // because a finalizer may call hs_free_fun_ptr() or
792 // hs_free_stable_ptr(), both of which access the StablePtr table.
793 stablePtrUnlock();
794
795 // Must be after stablePtrUnlock(), because it might free stable ptrs.
796 if (major_gc) {
797 checkUnload (gct->scavenged_static_objects);
798 }
799
800 #if defined(PROFILING)
801 // resetStaticObjectForRetainerProfiling() must be called before
802 // zeroing below.
803
804 // ToDo: fix the gct->scavenged_static_objects below
805 resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects);
806 #endif
807
808 // Start any pending finalizers. Must be after
809 // updateStableTables() and stableUnlock() (see #4221).
810 RELEASE_SM_LOCK;
811 scheduleFinalizers(cap, dead_weak_ptr_list);
812 ACQUIRE_SM_LOCK;
813
814 // check sanity after GC
815 // before resurrectThreads(), because that might overwrite some
816 // closures, which will cause problems with THREADED where we don't
817 // fill slop. If we are using the nonmoving collector then we can't claim to
818 // be *after* the major GC; it's now running concurrently.
819 IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving));
820
821 // If a heap census is due, we need to do it before
822 // resurrectThreads(), for the same reason as checkSanity above:
823 // resurrectThreads() will overwrite some closures and leave slop
824 // behind.
825 if (do_heap_census) {
826 debugTrace(DEBUG_sched, "performing heap census");
827 RELEASE_SM_LOCK;
828 heapCensus(gct->gc_start_cpu);
829 ACQUIRE_SM_LOCK;
830 }
831
832 // send exceptions to any threads which were about to die
833 RELEASE_SM_LOCK;
834 resurrectThreads(resurrected_threads);
835 ACQUIRE_SM_LOCK;
836
837 if (major_gc) {
838 W_ need_prealloc, need_live, need, got;
839 uint32_t i;
840
841 need_live = 0;
842 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
843 need_live += genLiveBlocks(&generations[i]);
844 }
845 need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live);
846
847 need_prealloc = 0;
848 for (i = 0; i < n_nurseries; i++) {
849 need_prealloc += nurseries[i].n_blocks;
850 }
851 need_prealloc += RtsFlags.GcFlags.largeAllocLim;
852 need_prealloc += countAllocdBlocks(exec_block);
853 need_prealloc += arenaBlocks();
854 #if defined(PROFILING)
855 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
856 need_prealloc = retainerStackBlocks();
857 }
858 #endif
859
860 /* If the amount of data remains constant, next major GC we'll
861 * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc
862 * in order to reduce repeated deallocation and reallocation. #14702
863 */
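        /* A worked instance with illustrative numbers: with oldGenFactor
         * F = 2, need_live = 1000 blocks and need_prealloc = 200 blocks, the
         * next major GC is expected to need (2+1)*1000 + 200 = 3200 blocks,
         * and we keep (2+2)*1000 + 200 = 4200 blocks to leave some slack.
         */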
864 need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live;
865
866 /* Also, if user set heap size, do not drop below it.
867 */
868 need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need);
869
870 /* But with a large nursery, the above estimate might exceed
871 * maxHeapSize. A large resident set size might make the OS
872 * kill this process, or swap unnecessarily. Therefore we
873 * ensure that our estimate does not exceed maxHeapSize.
874 */
875 if (RtsFlags.GcFlags.maxHeapSize != 0) {
876 need = stg_min(RtsFlags.GcFlags.maxHeapSize, need);
877 }
878
879 need = BLOCKS_TO_MBLOCKS(need);
880
881 got = mblocks_allocated;
882
883 if (got > need) {
884 returnMemoryToOS(got - need);
885 }
886 }
887
888 // extra GC trace info
889 IF_DEBUG(gc, statDescribeGens());
890
891 #if defined(DEBUG)
892 // symbol-table based profiling
893 /* heapCensus(to_blocks); */ /* ToDo */
894 #endif
895
896 // restore enclosing cost centre
897 #if defined(PROFILING)
898 for (n = 0; n < n_capabilities; n++) {
899 capabilities[n]->r.rCCCS = save_CCS[n];
900 }
901 #endif
902
903 #if defined(DEBUG)
904 // check for memory leaks if DEBUG is on
905 memInventory(DEBUG_gc);
906 #endif
907
908 // ok, GC over: tell the stats department what happened.
909 stat_endGC(cap, gct, live_words, copied,
910 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
911 N, n_gc_threads, par_max_copied, par_balanced_copied,
912 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
913 any_work, no_work, scav_find_work);
914
915 #if defined(RTS_USER_SIGNALS)
916 if (RtsFlags.MiscFlags.install_signal_handlers) {
917 // unblock signals again
918 unblockUserSignals();
919 }
920 #endif
921
922 RELEASE_SM_LOCK;
923
924 SET_GCT(saved_gct);
925 }
926
927 /* -----------------------------------------------------------------------------
928 Heap overflow is indicated by setting a flag that the caller of
929 GarbageCollect can check. (not ideal, TODO: better)
930 -------------------------------------------------------------------------- */
931
932 static void heapOverflow(void)
933 {
934 heap_overflow = true;
935 }
936
937 /* -----------------------------------------------------------------------------
938 Initialise the gc_thread structures.
939 -------------------------------------------------------------------------- */
940
941 static void
942 new_gc_thread (uint32_t n, gc_thread *t)
943 {
944 uint32_t g;
945 gen_workspace *ws;
946
947 t->cap = capabilities[n];
948
949 #if defined(THREADED_RTS)
950 t->id = 0;
951 initSpinLock(&t->gc_spin);
952 initSpinLock(&t->mut_spin);
953 ACQUIRE_SPIN_LOCK(&t->gc_spin);
954 ACQUIRE_SPIN_LOCK(&t->mut_spin);
 955     t->wakeup = GC_THREAD_INACTIVE;  // starts inactive, so we can wait for the
956 // thread to start up, see wakeup_gc_threads
957 #endif
958
959 t->thread_index = n;
960 t->free_blocks = NULL;
961 t->gc_count = 0;
962
963 init_gc_thread(t);
964
965 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
966 {
967 ws = &t->gens[g];
968 ws->gen = &generations[g];
969 ASSERT(g == ws->gen->no);
970 ws->my_gct = t;
971
972 // We want to call
973 // alloc_todo_block(ws,0);
974 // but can't, because it uses gct which isn't set up at this point.
975 // Hence, allocate a block for todo_bd manually:
976 {
977 bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
978 // no lock, locks aren't initialised yet
979 initBdescr(bd, ws->gen, ws->gen->to);
980 bd->flags = BF_EVACUATED;
981 bd->u.scan = bd->free = bd->start;
982
983 ws->todo_bd = bd;
984 ws->todo_free = bd->free;
985 ws->todo_lim = bd->start + BLOCK_SIZE_W;
986 }
987
988 ws->todo_q = newWSDeque(128);
989 ws->todo_overflow = NULL;
990 ws->n_todo_overflow = 0;
991 ws->todo_large_objects = NULL;
992 ws->todo_seg = END_NONMOVING_TODO_LIST;
993
994 ws->part_list = NULL;
995 ws->n_part_blocks = 0;
996 ws->n_part_words = 0;
997
998 ws->scavd_list = NULL;
999 ws->n_scavd_blocks = 0;
1000 ws->n_scavd_words = 0;
1001 }
1002 }
1003
1004
1005 void
1006 initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
1007 {
1008 #if defined(THREADED_RTS)
1009 uint32_t i;
1010
1011 if (from > 0) {
1012 gc_threads = stgReallocBytes (gc_threads, to * sizeof(gc_thread*),
1013 "initGcThreads");
1014 } else {
1015 gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
1016 "initGcThreads");
1017 }
1018
1019 for (i = from; i < to; i++) {
1020 gc_threads[i] =
1021 stgMallocBytes(sizeof(gc_thread) +
1022 RtsFlags.GcFlags.generations * sizeof(gen_workspace),
1023 "alloc_gc_threads");
1024
1025 new_gc_thread(i, gc_threads[i]);
1026 }
1027 #else
1028 ASSERT(from == 0 && to == 1);
1029 gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads");
1030 gc_threads[0] = gct;
1031 new_gc_thread(0,gc_threads[0]);
1032 #endif
1033 }
1034
1035 void
1036 freeGcThreads (void)
1037 {
1038 uint32_t g;
1039 if (gc_threads != NULL) {
1040 #if defined(THREADED_RTS)
1041 uint32_t i;
1042 for (i = 0; i < n_capabilities; i++) {
1043 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
1044 {
1045 freeWSDeque(gc_threads[i]->gens[g].todo_q);
1046 }
1047 stgFree (gc_threads[i]);
1048 }
1049 stgFree (gc_threads);
1050 #else
1051 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
1052 {
1053 freeWSDeque(gc_threads[0]->gens[g].todo_q);
1054 }
1055 stgFree (gc_threads);
1056 #endif
1057 gc_threads = NULL;
1058 }
1059 }
1060
1061 /* ----------------------------------------------------------------------------
1062 Start GC threads
1063 ------------------------------------------------------------------------- */
1064
1065 static volatile StgWord gc_running_threads;
1066
1067 static StgWord
1068 inc_running (void)
1069 {
1070 StgWord new;
1071 new = atomic_inc(&gc_running_threads, 1);
1072 ASSERT(new <= n_gc_threads);
1073 return new;
1074 }
1075
1076 static StgWord
1077 dec_running (void)
1078 {
1079 ASSERT(gc_running_threads != 0);
1080 return atomic_dec(&gc_running_threads);
1081 }
1082
1083 static bool
1084 any_work (void)
1085 {
1086 int g;
1087 gen_workspace *ws;
1088
1089 gct->any_work++;
1090
1091 write_barrier();
1092
1093 // scavenge objects in compacted generation
1094 if (mark_stack_bd != NULL && !mark_stack_empty()) {
1095 return true;
1096 }
1097
1098 // Check for global work in any gen. We don't need to check for
1099 // local work, because we have already exited scavenge_loop(),
1100 // which means there is no local work for this thread.
1101 for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) {
1102 ws = &gct->gens[g];
1103 if (ws->todo_large_objects) return true;
1104 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1105 if (ws->todo_overflow) return true;
1106 }
1107
1108 #if defined(THREADED_RTS)
1109 if (work_stealing) {
1110 uint32_t n;
1111 // look for work to steal
1112 for (n = 0; n < n_gc_threads; n++) {
1113 if (n == gct->thread_index) continue;
1114 for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
1115 ws = &gc_threads[n]->gens[g];
1116 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1117 }
1118 }
1119 }
1120 #endif
1121
1122 gct->no_work++;
1123 #if defined(THREADED_RTS)
1124 yieldThread();
1125 #endif
1126
1127 return false;
1128 }
1129
1130 static void
1131 scavenge_until_all_done (void)
1132 {
1133 DEBUG_ONLY( uint32_t r );
1134
1135
1136 loop:
1137 #if defined(THREADED_RTS)
1138 if (n_gc_threads > 1) {
1139 scavenge_loop();
1140 } else {
1141 scavenge_loop1();
1142 }
1143 #else
1144 scavenge_loop();
1145 #endif
1146
1147 collect_gct_blocks();
1148
1149 // scavenge_loop() only exits when there's no work to do
1150
1151 #if defined(DEBUG)
1152 r = dec_running();
1153 #else
1154 dec_running();
1155 #endif
1156
1157 traceEventGcIdle(gct->cap);
1158
1159 debugTrace(DEBUG_gc, "%d GC threads still running", r);
1160
1161 while (gc_running_threads != 0) {
1162 // usleep(1);
1163 if (any_work()) {
1164 inc_running();
1165 traceEventGcWork(gct->cap);
1166 goto loop;
1167 }
1168 // any_work() does not remove the work from the queue, it
1169 // just checks for the presence of work. If we find any,
1170 // then we increment gc_running_threads and go back to
1171 // scavenge_loop() to perform any pending work.
1172 }
1173
1174 traceEventGcDone(gct->cap);
1175 }
1176
1177 #if defined(THREADED_RTS)
1178
1179 void
1180 gcWorkerThread (Capability *cap)
1181 {
1182 gc_thread *saved_gct;
1183
1184 // necessary if we stole a callee-saves register for gct:
1185 saved_gct = gct;
1186
1187 SET_GCT(gc_threads[cap->no]);
1188 gct->id = osThreadId();
1189
1190 // Wait until we're told to wake up
1191 RELEASE_SPIN_LOCK(&gct->mut_spin);
1192 // yieldThread();
1193 // Strangely, adding a yieldThread() here makes the CPU time
1194 // measurements more accurate on Linux, perhaps because it syncs
1195 // the CPU time across the multiple cores. Without this, CPU time
1196 // is heavily skewed towards GC rather than MUT.
1197 gct->wakeup = GC_THREAD_STANDING_BY;
1198 debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
1199 ACQUIRE_SPIN_LOCK(&gct->gc_spin);
1200
1201 init_gc_thread(gct);
1202
1203 traceEventGcWork(gct->cap);
1204
1205 // Every thread evacuates some roots.
1206 gct->evac_gen_no = 0;
1207 markCapability(mark_root, gct, cap, true/*prune sparks*/);
1208 scavenge_capability_mut_lists(cap);
1209
1210 scavenge_until_all_done();
1211
1212 #if defined(THREADED_RTS)
1213 // Now that the whole heap is marked, we discard any sparks that
1214 // were found to be unreachable. The main GC thread is currently
1215 // marking heap reachable via weak pointers, so it is
1216 // non-deterministic whether a spark will be retained if it is
1217 // only reachable via weak pointers. To fix this problem would
1218 // require another GC barrier, which is too high a price.
1219 pruneSparkQueue(cap);
1220 #endif
1221
1222 // Wait until we're told to continue
1223 RELEASE_SPIN_LOCK(&gct->gc_spin);
1224 gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
1225 debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
1226 gct->thread_index);
1227 ACQUIRE_SPIN_LOCK(&gct->mut_spin);
1228 debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
1229
1230 SET_GCT(saved_gct);
1231 }
1232
1233 #endif
1234
1235 #if defined(THREADED_RTS)
1236
1237 void
1238 waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1239 {
1240 const uint32_t n_threads = n_capabilities;
1241 const uint32_t me = cap->no;
1242 uint32_t i, j;
1243 bool retry = true;
1244 Time t0, t1, t2;
1245
1246 t0 = t1 = t2 = getProcessElapsedTime();
1247
1248 while(retry) {
1249 for (i=0; i < n_threads; i++) {
1250 if (i == me || idle_cap[i]) continue;
1251 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1252 prodCapability(capabilities[i], cap->running_task);
1253 }
1254 }
1255 for (j=0; j < 10; j++) {
1256 retry = false;
1257 for (i=0; i < n_threads; i++) {
1258 if (i == me || idle_cap[i]) continue;
1259 write_barrier();
1260 interruptCapability(capabilities[i]);
1261 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1262 retry = true;
1263 }
1264 }
1265 if (!retry) break;
1266 #if defined(PROF_SPIN)
1267 waitForGcThreads_yield++;
1268 #endif
1269 yieldThread();
1270 }
1271
1272 t2 = getProcessElapsedTime();
1273 if (RtsFlags.GcFlags.longGCSync != 0 &&
1274 t2 - t1 > RtsFlags.GcFlags.longGCSync) {
1275 /* call this every longGCSync of delay */
1276 rtsConfig.longGCSync(cap->no, t2 - t0);
1277 t1 = t2;
1278 }
1279 if (retry) {
1280 #if defined(PROF_SPIN)
1281 // This is a bit strange, we'll get more yields than spins.
1282 // I guess that means it's not a spin-lock at all, but these
1283 // numbers are still useful (I think).
1284 waitForGcThreads_spin++;
1285 #endif
1286 }
1287 }
1288
1289 if (RtsFlags.GcFlags.longGCSync != 0 &&
1290 t2 - t0 > RtsFlags.GcFlags.longGCSync) {
1291 rtsConfig.longGCSyncEnd(t2 - t0);
1292 }
1293 }
1294
1295 #endif // THREADED_RTS
1296
1297 static void
1298 start_gc_threads (void)
1299 {
1300 #if defined(THREADED_RTS)
1301 gc_running_threads = 0;
1302 #endif
1303 }
1304
1305 static void
1306 wakeup_gc_threads (uint32_t me USED_IF_THREADS,
1307 bool idle_cap[] USED_IF_THREADS)
1308 {
1309 #if defined(THREADED_RTS)
1310 uint32_t i;
1311
1312 if (n_gc_threads == 1) return;
1313
1314 for (i=0; i < n_gc_threads; i++) {
1315 if (i == me || idle_cap[i]) continue;
1316 inc_running();
1317 debugTrace(DEBUG_gc, "waking up gc thread %d", i);
1318 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY)
1319 barf("wakeup_gc_threads");
1320
1321 gc_threads[i]->wakeup = GC_THREAD_RUNNING;
1322 ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1323 RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1324 }
1325 #endif
1326 }
1327
1328 // After GC is complete, we must wait for all GC threads to enter the
1329 // standby state, otherwise they may still be executing inside
1330 // any_work(), and may even remain awake until the next GC starts.
1331 static void
1332 shutdown_gc_threads (uint32_t me USED_IF_THREADS,
1333 bool idle_cap[] USED_IF_THREADS)
1334 {
1335 #if defined(THREADED_RTS)
1336 uint32_t i;
1337
1338 if (n_gc_threads == 1) return;
1339
1340 for (i=0; i < n_gc_threads; i++) {
1341 if (i == me || idle_cap[i]) continue;
1342 while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) {
1343 busy_wait_nop();
1344 write_barrier();
1345 }
1346 }
1347 #endif
1348 }
1349
1350 #if defined(THREADED_RTS)
1351 void
1352 releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1353 {
1354 const uint32_t n_threads = n_capabilities;
1355 const uint32_t me = cap->no;
1356 uint32_t i;
1357 for (i=0; i < n_threads; i++) {
1358 if (i == me || idle_cap[i]) continue;
1359 if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
1360 barf("releaseGCThreads");
1361
1362 gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
1363 ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1364 RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1365 }
1366 }
1367 #endif
1368
1369 /* ----------------------------------------------------------------------------
 1370    Save the mutable lists in saved_mut_lists, where they will be scavenged
 1371    during GC.
1372 ------------------------------------------------------------------------- */
1373
1374 static void
1375 stash_mut_list (Capability *cap, uint32_t gen_no)
1376 {
1377 cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
1378 cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
1379 }
1380
1381 /* ----------------------------------------------------------------------------
1382 Initialise a generation that is to be collected
1383 ------------------------------------------------------------------------- */
1384
1385 static void
1386 prepare_collected_gen (generation *gen)
1387 {
1388 uint32_t i, g, n;
1389 gen_workspace *ws;
1390 bdescr *bd, *next;
1391
1392 g = gen->no;
1393
1394 if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) {
1395 // Nonmoving heap's mutable list is always a root.
1396 for (i = 0; i < n_capabilities; i++) {
1397 stash_mut_list(capabilities[i], g);
1398 }
1399 } else if (g != 0) {
1400 // Otherwise throw away the current mutable list. Invariant: the
1401 // mutable list always has at least one block; this means we can avoid
1402 // a check for NULL in recordMutable().
1403 for (i = 0; i < n_capabilities; i++) {
1404 freeChain(capabilities[i]->mut_lists[g]);
1405 capabilities[i]->mut_lists[g] =
1406 allocBlockOnNode(capNoToNumaNode(i));
1407 }
1408 }
1409
1410 gen = &generations[g];
1411 ASSERT(gen->no == g);
1412
1413 // we'll construct a new list of threads in this step
1414 // during GC, throw away the current list.
1415 gen->old_threads = gen->threads;
1416 gen->threads = END_TSO_QUEUE;
1417
1418 // deprecate the existing blocks (except in the case of the nonmoving
1419 // collector since these will be preserved in nonmovingCollect for the
1420 // concurrent GC).
1421 if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) {
1422 gen->old_blocks = gen->blocks;
1423 gen->n_old_blocks = gen->n_blocks;
1424 gen->blocks = NULL;
1425 gen->n_blocks = 0;
1426 gen->n_words = 0;
1427 gen->live_estimate = 0;
1428 }
1429
1430 // initialise the large object queues.
1431 ASSERT(gen->scavenged_large_objects == NULL);
1432 ASSERT(gen->n_scavenged_large_blocks == 0);
1433 ASSERT(gen->live_compact_objects == NULL);
1434 ASSERT(gen->n_live_compact_blocks == 0);
1435
1436 // grab all the partial blocks stashed in the gc_thread workspaces and
1437 // move them to the old_blocks list of this gen.
1438 for (n = 0; n < n_capabilities; n++) {
1439 ws = &gc_threads[n]->gens[gen->no];
1440
1441 for (bd = ws->part_list; bd != NULL; bd = next) {
1442 next = bd->link;
1443 bd->link = gen->old_blocks;
1444 gen->old_blocks = bd;
1445 gen->n_old_blocks += bd->blocks;
1446 }
1447 ws->part_list = NULL;
1448 ws->n_part_blocks = 0;
1449 ws->n_part_words = 0;
1450
1451 ASSERT(ws->scavd_list == NULL);
1452 ASSERT(ws->n_scavd_blocks == 0);
1453 ASSERT(ws->n_scavd_words == 0);
1454
1455 if (ws->todo_free != ws->todo_bd->start) {
1456 ws->todo_bd->free = ws->todo_free;
1457 ws->todo_bd->link = gen->old_blocks;
1458 gen->old_blocks = ws->todo_bd;
1459 gen->n_old_blocks += ws->todo_bd->blocks;
1460 alloc_todo_block(ws,0); // always has one block.
1461 }
1462 }
1463
1464 // mark the small objects as from-space
1465 for (bd = gen->old_blocks; bd; bd = bd->link) {
1466 bd->flags &= ~BF_EVACUATED;
1467 }
1468
1469 // mark the large objects as from-space
1470 for (bd = gen->large_objects; bd; bd = bd->link) {
1471 bd->flags &= ~BF_EVACUATED;
1472 }
1473
1474 // mark the compact objects as from-space
1475 for (bd = gen->compact_objects; bd; bd = bd->link) {
1476 bd->flags &= ~BF_EVACUATED;
1477 }
1478
1479 // for a compacted generation, we need to allocate the bitmap
1480 if (gen->mark) {
1481 StgWord bitmap_size; // in bytes
1482 bdescr *bitmap_bdescr;
1483 StgWord *bitmap;
1484
1485 bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
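        // A worked instance with illustrative numbers: one mark bit per word,
        // so each block needs BLOCK_SIZE / BITS_IN(W_) bytes of bitmap; with
        // 4096-byte blocks on a 64-bit machine that is 4096 / 64 = 64 bytes
        // per old block (512 words per block, 512 bits = 64 bytes).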
1486
1487 if (bitmap_size > 0) {
1488 bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
1489 / BLOCK_SIZE);
1490 gen->bitmap = bitmap_bdescr;
1491 bitmap = bitmap_bdescr->start;
1492
1493 debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p",
1494 bitmap_size, bitmap);
1495
1496 // don't forget to fill it with zeros!
1497 memset(bitmap, 0, bitmap_size);
1498
1499 // For each block in this step, point to its bitmap from the
1500 // block descriptor.
1501 for (bd=gen->old_blocks; bd != NULL; bd = bd->link) {
1502 bd->u.bitmap = bitmap;
1503 bitmap += BLOCK_SIZE_W / BITS_IN(W_);
1504
1505 // Also at this point we set the BF_MARKED flag
1506 // for this block. The invariant is that
1507 // BF_MARKED is always unset, except during GC
1508 // when it is set on those blocks which will be
1509 // compacted.
1510 if (!(bd->flags & BF_FRAGMENTED)) {
1511 bd->flags |= BF_MARKED;
1512 }
1513
1514 // BF_SWEPT should be marked only for blocks that are being
1515 // collected in sweep()
1516 bd->flags &= ~BF_SWEPT;
1517 }
1518 }
1519 }
1520 }
1521
1522 /* ----------------------------------------------------------------------------
1523 Initialise a generation that is *not* to be collected
1524 ------------------------------------------------------------------------- */
1525
1526 static void
1527 prepare_uncollected_gen (generation *gen)
1528 {
1529 uint32_t i;
1530
1531
1532 ASSERT(gen->no > 0);
1533
1534 // save the current mutable lists for this generation, and
1535 // allocate a fresh block for each one. We'll traverse these
1536 // mutable lists as roots early on in the GC.
1537 for (i = 0; i < n_capabilities; i++) {
1538 stash_mut_list(capabilities[i], gen->no);
1539 }
1540
1541 ASSERT(gen->scavenged_large_objects == NULL);
1542 ASSERT(gen->n_scavenged_large_blocks == 0);
1543 }
1544
1545 /* -----------------------------------------------------------------------------
1546 Collect the completed blocks from a GC thread and attach them to
1547 the generation.
1548 -------------------------------------------------------------------------- */
1549
1550 static void
1551 collect_gct_blocks (void)
1552 {
1553 uint32_t g;
1554 gen_workspace *ws;
1555 bdescr *bd, *prev;
1556
1557 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1558 ws = &gct->gens[g];
1559
1560 // there may still be a block attached to ws->todo_bd;
1561 // leave it there to use next time.
1562
1563 if (ws->scavd_list != NULL) {
1564 ACQUIRE_SPIN_LOCK(&ws->gen->sync);
1565
1566 ASSERT(gct->scan_bd == NULL);
1567 ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
1568
1569 prev = NULL;
1570 for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
1571 prev = bd;
1572 }
1573 if (prev != NULL) {
1574 prev->link = ws->gen->blocks;
1575 ws->gen->blocks = ws->scavd_list;
1576 }
1577 ws->gen->n_blocks += ws->n_scavd_blocks;
1578 ws->gen->n_words += ws->n_scavd_words;
1579
1580 ws->scavd_list = NULL;
1581 ws->n_scavd_blocks = 0;
1582 ws->n_scavd_words = 0;
1583
1584 RELEASE_SPIN_LOCK(&ws->gen->sync);
1585 }
1586 }
1587 }
1588
1589 /* -----------------------------------------------------------------------------
1590 During mutation, any blocks that are filled by allocatePinned() are stashed
1591 on the local pinned_object_blocks list, to avoid needing to take a global
1592 lock. Here we collect those blocks from the cap->pinned_object_blocks lists
 1593    and put them on the g0->large_objects or oldest_gen->large_objects list.
1594
1595 How to decide which list to put them on?
1596
1597 - When non-moving heap is enabled and this is a major GC, we put them on
1598 oldest_gen. This is because after preparation we really want no
1599 old-to-young references, and we want to be able to reset mut_lists. For
1600 this we need to promote every potentially live object to the oldest gen.
1601
1602 - Otherwise we put them on g0.
1603 -------------------------------------------------------------------------- */
1604
1605 static void
1606 collect_pinned_object_blocks (void)
1607 {
1608 generation *gen;
1609 const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving;
1610 if (use_nonmoving && major_gc) {
1611 gen = oldest_gen;
1612 } else {
1613 gen = g0;
1614 }
1615
1616 for (uint32_t n = 0; n < n_capabilities; n++) {
1617 bdescr *last = NULL;
1618 if (use_nonmoving && gen == oldest_gen) {
1619 // Mark objects as belonging to the nonmoving heap
1620 for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1621 bd->flags |= BF_NONMOVING;
1622 bd->gen = oldest_gen;
1623 bd->gen_no = oldest_gen->no;
1624 oldest_gen->n_large_words += bd->free - bd->start;
1625 oldest_gen->n_large_blocks += bd->blocks;
1626 last = bd;
1627 }
1628 } else {
1629 for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1630 last = bd;
1631 }
1632 }
1633
1634 if (last != NULL) {
1635 last->link = gen->large_objects;
1636 if (gen->large_objects != NULL) {
1637 gen->large_objects->u.back = last;
1638 }
1639 gen->large_objects = capabilities[n]->pinned_object_blocks;
1640 capabilities[n]->pinned_object_blocks = NULL;
1641 }
1642 }
1643 }
1644
1645 /* -----------------------------------------------------------------------------
1646 Initialise a gc_thread before GC
1647 -------------------------------------------------------------------------- */
1648
1649 static void
1650 init_gc_thread (gc_thread *t)
1651 {
1652 t->static_objects = END_OF_STATIC_OBJECT_LIST;
1653 t->scavenged_static_objects = END_OF_STATIC_OBJECT_LIST;
1654 t->scan_bd = NULL;
1655 t->mut_lists = t->cap->mut_lists;
1656 t->evac_gen_no = 0;
1657 t->failed_to_evac = false;
1658 t->eager_promotion = true;
1659 t->thunk_selector_depth = 0;
1660 t->copied = 0;
1661 t->scanned = 0;
1662 t->any_work = 0;
1663 t->no_work = 0;
1664 t->scav_find_work = 0;
1665 }
1666
1667 /* -----------------------------------------------------------------------------
1668 Function we pass to evacuate roots.
1669 -------------------------------------------------------------------------- */
1670
1671 static void
1672 mark_root(void *user USED_IF_THREADS, StgClosure **root)
1673 {
1674 // we stole a register for gct, but this function is called from
1675 // *outside* the GC where the register variable is not in effect,
1676 // so we need to save and restore it here. NB. only call
1677 // mark_root() from the main GC thread, otherwise gct will be
1678 // incorrect.
1679 #if defined(THREADED_RTS)
1680 gc_thread *saved_gct;
1681 saved_gct = gct;
1682 #endif
1683 SET_GCT(user);
1684
1685 evacuate(root);
1686
1687 SET_GCT(saved_gct);
1688 }
1689
1690 /* ----------------------------------------------------------------------------
1691 Reset the sizes of the older generations when we do a major
1692 collection.
1693
1694 CURRENT STRATEGY: make all generations except zero the same size.
1695 We have to stay within the maximum heap size, and leave a certain
1696 percentage of the maximum heap size available to allocate into.
1697 ------------------------------------------------------------------------- */
1698
1699 void
1700 resizeGenerations (void)
1701 {
1702 uint32_t g;
1703 W_ live, size, min_alloc, words;
1704 const W_ max = RtsFlags.GcFlags.maxHeapSize;
1705 const W_ gens = RtsFlags.GcFlags.generations;
1706
1707 // live in the oldest generations
1708 if (oldest_gen->live_estimate != 0) {
1709 words = oldest_gen->live_estimate;
1710 } else {
1711 words = oldest_gen->n_words;
1712 }
1713 live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
1714 oldest_gen->n_large_blocks +
1715 oldest_gen->n_compact_blocks;
1716
1717 // default max size for all generations except zero
1718 size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
1719 RtsFlags.GcFlags.minOldGenSize);
1720
1721 if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
1722 if (max > 0) {
1723 RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
1724 } else {
1725 RtsFlags.GcFlags.heapSizeSuggestion = size;
1726 }
1727 }
1728
1729 // minimum size for generation zero
1730 min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
1731 RtsFlags.GcFlags.minAllocAreaSize
1732 * (W_)n_capabilities);
1733
1734 // Auto-enable compaction when the residency reaches a
1735 // certain percentage of the maximum heap size (default: 30%).
1736 // Except when non-moving GC is enabled.
1737 if (!RtsFlags.GcFlags.useNonmoving &&
1738 (RtsFlags.GcFlags.compact ||
1739 (max > 0 &&
1740 oldest_gen->n_blocks >
1741 (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
1742 oldest_gen->mark = 1;
1743 oldest_gen->compact = 1;
1744 // debugBelch("compaction: on\n", live);
1745 } else {
1746 oldest_gen->mark = 0;
1747 oldest_gen->compact = 0;
1748 // debugBelch("compaction: off\n", live);
1749 }
1750
1751 if (RtsFlags.GcFlags.sweep) {
1752 oldest_gen->mark = 1;
1753 }
1754
1755 // if we're going to go over the maximum heap size, reduce the
1756 // size of the generations accordingly. The calculation is
1757 // different if compaction is turned on, because we don't need
1758 // to double the space required to collect the old generation.
1759 if (max != 0) {
1760
1761 // this test is necessary to ensure that the calculations
1762 // below don't have any negative results - we're working
1763 // with unsigned values here.
1764 if (max < min_alloc) {
1765 heapOverflow();
1766 }
1767
1768 if (oldest_gen->compact) {
1769 if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
1770 size = (max - min_alloc) / ((gens - 1) * 2 - 1);
1771 }
1772 } else {
1773 if ( (size * (gens - 1) * 2) + min_alloc > max ) {
1774 size = (max - min_alloc) / ((gens - 1) * 2);
1775 }
1776 }
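        /* A worked instance with illustrative numbers: with max = 100000
         * blocks, min_alloc = 4000 and gens = 2, the copying case allows
         * size <= (100000 - 4000) / 2 = 48000 blocks (the old generation may
         * need double space while being collected), while the compacting case
         * allows size <= (100000 - 4000) / 1 = 96000 blocks.
         */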
1777
1778 if (size < live) {
1779 heapOverflow();
1780 }
1781 }
1782
1783 #if 0
1784 debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
1785 min_alloc, size, max);
1786 debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n",
1787 oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks);
1788 debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks);
1789 #endif
1790
1791 for (g = 0; g < gens; g++) {
1792 generations[g].max_blocks = size;
1793 }
1794 }
1795
1796 /* -----------------------------------------------------------------------------
1797 Calculate the new size of the nursery, and resize it.
1798 -------------------------------------------------------------------------- */
1799
1800 static void
1801 resize_nursery (void)
1802 {
1803 const StgWord min_nursery =
1804 RtsFlags.GcFlags.minAllocAreaSize * (StgWord)n_capabilities;
1805
1806 if (RtsFlags.GcFlags.generations == 1)
1807 { // Two-space collector:
1808 W_ blocks;
1809
1810 /* set up a new nursery. Allocate a nursery size based on a
1811 * function of the amount of live data (by default a factor of 2)
1812 * Use the blocks from the old nursery if possible, freeing up any
1813 * left over blocks.
1814 *
1815 * If we get near the maximum heap size, then adjust our nursery
1816 * size accordingly. If the nursery is the same size as the live
1817 * data (L), then we need 3L bytes. We can reduce the size of the
1818 * nursery to bring the required memory down near 2L bytes.
1819 *
1820 * A normal 2-space collector would need 4L bytes to give the same
1821 * performance we get from 3L bytes, reducing to the same
1822 * performance at 2L bytes.
1823 */
1824 blocks = generations[0].n_blocks;
1825
1826 if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
1827 blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
1828 RtsFlags.GcFlags.maxHeapSize )
1829 {
1830 long adjusted_blocks; // signed on purpose
1831 int pc_free;
1832
1833 adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
1834
1835 debugTrace(DEBUG_gc, "near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld",
1836 RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks);
1837
1838 pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
1839 if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even * be < 0 */
1840 {
1841 heapOverflow();
1842 }
1843 blocks = adjusted_blocks;
1844 }
1845 else
1846 {
1847 blocks *= RtsFlags.GcFlags.oldGenFactor;
1848 if (blocks < min_nursery)
1849 {
1850 blocks = min_nursery;
1851 }
1852 }
1853 resizeNurseries(blocks);
1854 }
1855 else // Generational collector
1856 {
1857 /*
1858 * If the user has given us a suggested heap size, adjust our
1859 * allocation area to make best use of the memory available.
1860 */
1861 if (RtsFlags.GcFlags.heapSizeSuggestion)
1862 {
1863 long blocks;
1864 StgWord needed;
1865
1866 calcNeeded(false, &needed); // approx blocks needed at next GC
1867
1868 /* Guess how much will be live in generation 0 step 0 next time.
1869 * A good approximation is obtained by finding the
1870 * percentage of g0 that was live at the last minor GC.
1871 *
1872 * We have an accurate figure for the amount of copied data in
1873 * 'copied', but we must convert this to a number of blocks, with
1874 * a small adjustment for estimated slop at the end of a block
1875 * (- 10 words).
1876 */
1877 if (N == 0)
1878 {
1879 g0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100)
1880 / countNurseryBlocks();
1881 }
1882
1883 /* Estimate a size for the allocation area based on the
1884 * information available. We might end up going slightly under
1885 * or over the suggested heap size, but we should be pretty
1886 * close on average.
1887 *
1888 * Formula: suggested - needed
1889 * ----------------------------
1890 * 1 + g0_pcnt_kept/100
1891 *
1892 * where 'needed' is the amount of memory needed at the next
1893 * collection for collecting all gens except g0.
1894 */
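            /* A worked instance with illustrative numbers: with a suggested
             * heap of 100000 blocks, needed = 20000 blocks and
             * g0_pcnt_kept = 25, we get
             *   blocks = (100000 - 20000) * 100 / 125 = 64000.
             * At the next GC roughly 25% of those 64000 blocks survive
             * (16000), so the total comes to about
             * 64000 + 16000 + 20000 = 100000 blocks, close to the suggestion.
             */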
1895 blocks =
1896 (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
1897 (100 + (long)g0_pcnt_kept);
1898
1899 if (blocks < (long)min_nursery) {
1900 blocks = min_nursery;
1901 }
1902
1903 resizeNurseries((W_)blocks);
1904 }
1905 else
1906 {
1907 // we might have added extra blocks to the nursery, so
1908 // resize back to the original size again.
1909 resizeNurseriesFixed();
1910 }
1911 }
1912 }
1913
1914 /* -----------------------------------------------------------------------------
1915 Sanity code for CAF garbage collection.
1916
1917 With DEBUG turned on, we manage a CAF list in addition to the SRT
1918 mechanism. After GC, we run down the CAF list and make any
1919 CAFs which have been garbage collected GCD_CAF. This means we get an error
1920 whenever the program tries to enter a garbage collected CAF.
1921
1922 Any garbage collected CAFs are taken off the CAF list at the same
1923 time.
1924 -------------------------------------------------------------------------- */
1925
1926 #if defined(DEBUG)
1927
1928 void gcCAFs(void)
1929 {
1930 uint32_t i = 0;
1931 StgIndStatic *prev = NULL;
1932
1933 for (StgIndStatic *p = debug_caf_list;
1934 p != (StgIndStatic*) END_OF_CAF_LIST;
1935 p = (StgIndStatic*) p->saved_info)
1936 {
1937 const StgInfoTable *info = get_itbl((StgClosure*)p);
1938 ASSERT(info->type == IND_STATIC);
1939
1940 // See Note [STATIC_LINK fields] in Storage.h
1941 // This condition identifies CAFs that have just been GC'd and
 1942         // don't have static_link==3 (which would mean they should be ignored).
1943 if ((((StgWord)(p->static_link)&STATIC_BITS) | prev_static_flag) != 3) {
1944 debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p);
1945 SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it
1946 if (prev == NULL) {
1947 debug_caf_list = (StgIndStatic*)p->saved_info;
1948 } else {
1949 prev->saved_info = p->saved_info;
1950 }
1951 } else {
1952 prev = p;
1953 i++;
1954 }
1955 }
1956
1957 debugTrace(DEBUG_gccafs, "%d CAFs live", i);
1958 }
1959 #endif
1960
1961
1962 /* -----------------------------------------------------------------------------
1963 The GC can leave some work for the mutator to do before the next
1964 GC, provided the work can be safely overlapped with mutation. This
1965 can help reduce the GC pause time.
1966
1967 The mutator can call doIdleGCWork() any time it likes, but
1968 preferably when it is idle. It's safe for multiple capabilities to
1969 call doIdleGCWork().
1970
1971 When 'all' is
1972 * false: doIdleGCWork() should only take a short, bounded, amount
1973 of time.
1974 * true: doIdleGCWork() will complete all the outstanding GC work.
1975
1976 The return value is
1977 * true if there's more to do (only if 'all' is false).
1978 * false otherwise.
1979 -------------------------------------------------------------------------- */
1980
1981 bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
1982 {
1983 return runSomeFinalizers(all);
1984 }
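
/* A minimal sketch of how a caller might drive the idle work described
 * above.  The helper example_have_idle_time() is an assumption for
 * illustration only; the RTS's real callers live in the scheduler.
 */
#if 0
static bool example_have_idle_time (void);

static void example_idle_loop (Capability *cap)
{
    /* do short, bounded chunks of work while we remain idle */
    while (example_have_idle_time() && doIdleGCWork(cap, false /* all */)) {
        /* doIdleGCWork() returned true, so there is more work left */
    }
}

static void example_shutdown_flush (Capability *cap)
{
    /* at shutdown, finish all outstanding work in one call */
    doIdleGCWork(cap, true /* all */);
}
#endif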