1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team 1998-2008
4 *
5 * Generational garbage collector
6 *
7 * Documentation on the architecture of the Garbage Collector can be
8 * found in the online commentary:
9 *
10 * https://gitlab.haskell.org/ghc/ghc/wikis/commentary/rts/storage/gc
11 *
12 * ---------------------------------------------------------------------------*/
13
14 #include "PosixSource.h"
15 #include "Rts.h"
16 #include "HsFFI.h"
17
18 #include "GC.h"
19 #include "GCThread.h"
20 #include "GCTDecl.h" // NB. before RtsSignals.h which
21 // clobbers REG_R1 on arm/Linux
22 #include "Compact.h"
23 #include "Evac.h"
24 #include "Scav.h"
25 #include "GCUtils.h"
26 #include "MarkStack.h"
27 #include "MarkWeak.h"
28 #include "Sparks.h"
29 #include "Sweep.h"
30
31 #include "Arena.h"
32 #include "Storage.h"
33 #include "RtsUtils.h"
34 #include "Apply.h"
35 #include "Updates.h"
36 #include "Stats.h"
37 #include "Schedule.h"
38 #include "Sanity.h"
39 #include "BlockAlloc.h"
40 #include "ProfHeap.h"
41 #include "Weak.h"
42 #include "Prelude.h"
43 #include "RtsSignals.h"
44 #include "STM.h"
45 #include "Trace.h"
46 #include "RetainerProfile.h"
47 #include "LdvProfile.h"
48 #include "RaiseAsync.h"
49 #include "StableName.h"
50 #include "StablePtr.h"
51 #include "CheckUnload.h"
52 #include "CNF.h"
53 #include "RtsFlags.h"
54
55 #include <string.h> // for memset()
56 #include <unistd.h>
57
58 /* -----------------------------------------------------------------------------
59 Global variables
60 -------------------------------------------------------------------------- */
61
62 /* STATIC OBJECT LIST.
63 *
64 * During GC:
65 * We maintain a linked list of static objects that are still live.
66 * The requirements for this list are:
67 *
68 * - we need to scan the list while adding to it, in order to
69 * scavenge all the static objects (in the same way that
70 * breadth-first scavenging works for dynamic objects).
71 *
72 * - we need to be able to tell whether an object is already on
73 * the list, to break loops.
74 *
75 * Each static object has a "static link field", which we use for
76 * linking objects on to the list. We use a stack-type list, consing
77 * objects on the front as they are added (this means that the
78 * scavenge phase is depth-first, not breadth-first, but that
79 * shouldn't matter).
80 *
81 * A separate list is kept for objects that have been scavenged
82 * already - this is so that we can zero all the marks afterwards.
83 *
84 * An object is on the list if its static link field is non-zero; this
85 * means that we have to mark the end of the list with '1', not NULL.
86 *
87 * Extra notes for generational GC:
88 *
89 * Each generation has a static object list associated with it. When
90 * collecting generations up to N, we treat the static object lists
91 * from generations > N as roots.
92 *
93 * We build up a static object list while collecting generations 0..N,
94 * which is then appended to the static object list of generation N+1.
95 *
96 * See also: Note [STATIC_LINK fields] in Storage.h.
97 */
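/* Illustration (a sketch of the simplified story above, not code the
 * collector relies on): because the end of the list is marked with '1'
 * rather than NULL, the membership test for a static closure p reduces to
 * a non-zero check on its link field, roughly
 *
 *     bool on_list = (p->static_link != NULL);  // the '1' terminator keeps this true
 *
 * The real test also involves the per-GC flag bits kept in the low bits of
 * the field; see Note [STATIC_LINK fields] in Storage.h and static_flag below.
 */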
98
99 /* N is the oldest generation being collected, where the generations
100 * are numbered starting at 0. A major GC (indicated by the major_gc
101 * flag) is when we're collecting all generations. We only attempt to
102 * deal with static objects and GC CAFs when doing a major GC.
103 */
104 uint32_t N;
105 bool major_gc;
106
107 /* Data used for allocation area sizing.
108 */
109 static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
110
111 /* Mut-list stats */
112 #if defined(DEBUG)
113 uint32_t mutlist_MUTVARS,
114 mutlist_MUTARRS,
115 mutlist_MVARS,
116 mutlist_TVAR,
117 mutlist_TVAR_WATCH_QUEUE,
118 mutlist_TREC_CHUNK,
119 mutlist_TREC_HEADER,
120 mutlist_OTHERS;
121 #endif
122
123 /* Thread-local data for each GC thread
124 */
125 gc_thread **gc_threads = NULL;
126
127 #if !defined(THREADED_RTS)
 128 // Must be aligned to 64 bytes to meet the stated 64-byte alignment of gen_workspace
129 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
130 ATTRIBUTE_ALIGNED(64);
131 #endif
132
133 // Number of threads running in *this* GC. Affects how many
134 // step->todos[] lists we have to look in to find work.
135 uint32_t n_gc_threads;
136
137 // For stats:
138 static long copied; // *words* copied & scavenged during this GC
139
140 #if defined(PROF_SPIN) && defined(THREADED_RTS)
141 // spin and yield counts for the quasi-SpinLock in waitForGcThreads
142 volatile StgWord64 waitForGcThreads_spin = 0;
143 volatile StgWord64 waitForGcThreads_yield = 0;
144 volatile StgWord64 whitehole_gc_spin = 0;
145 #endif
146
147 bool work_stealing;
148
149 uint32_t static_flag = STATIC_FLAG_B;
150 uint32_t prev_static_flag = STATIC_FLAG_A;
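/* On every major GC these two flags swap roles (see GarbageCollect() below):
 *
 *     prev_static_flag = static_flag;
 *     static_flag = (static_flag == STATIC_FLAG_A) ? STATIC_FLAG_B
 *                                                  : STATIC_FLAG_A;
 *
 * so the interpretation of "reached in the previous major GC" flips without
 * having to walk every static object and reset its static_link bits.  (A
 * sketch mirroring the code below; see Note [STATIC_LINK fields] in Storage.h.)
 */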
151
152 DECLARE_GCT
153
154 /* -----------------------------------------------------------------------------
155 Static function declarations
156 -------------------------------------------------------------------------- */
157
158 static void mark_root (void *user, StgClosure **root);
159 static void prepare_collected_gen (generation *gen);
160 static void prepare_uncollected_gen (generation *gen);
161 static void init_gc_thread (gc_thread *t);
162 static void resize_generations (void);
163 static void resize_nursery (void);
164 static void start_gc_threads (void);
165 static void scavenge_until_all_done (void);
166 static StgWord inc_running (void);
167 static StgWord dec_running (void);
168 static void wakeup_gc_threads (uint32_t me, bool idle_cap[]);
169 static void shutdown_gc_threads (uint32_t me, bool idle_cap[]);
170 static void collect_gct_blocks (void);
171 static void collect_pinned_object_blocks (void);
172 static void heapOverflow (void);
173
174 #if defined(DEBUG)
175 static void gcCAFs (void);
176 #endif
177
178 /* -----------------------------------------------------------------------------
179 The mark stack.
180 -------------------------------------------------------------------------- */
181
182 bdescr *mark_stack_top_bd; // topmost block in the mark stack
183 bdescr *mark_stack_bd; // current block in the mark stack
184 StgPtr mark_sp; // pointer to the next unallocated mark stack entry
185
186 /* -----------------------------------------------------------------------------
187 GarbageCollect: the main entry point to the garbage collector.
188
 189    The collect_gen parameter is obtained by calling calcNeeded().
190
191 Locks held: all capabilities are held throughout GarbageCollect().
192 -------------------------------------------------------------------------- */
193
194 void
195 GarbageCollect (uint32_t collect_gen,
196 bool do_heap_census,
197 uint32_t gc_type USED_IF_THREADS,
198 Capability *cap,
199 bool idle_cap[])
200 {
201 bdescr *bd;
202 generation *gen;
203 StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
204 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
205 any_work, no_work, scav_find_work;
206 #if defined(THREADED_RTS)
207 gc_thread *saved_gct;
208 #endif
209 uint32_t g, n;
210
211 // necessary if we stole a callee-saves register for gct:
212 #if defined(THREADED_RTS)
213 saved_gct = gct;
214 #endif
215
216 #if defined(PROFILING)
217 CostCentreStack *save_CCS[n_capabilities];
218 #endif
219
220 ACQUIRE_SM_LOCK;
221
222 #if defined(RTS_USER_SIGNALS)
223 if (RtsFlags.MiscFlags.install_signal_handlers) {
224 // block signals
225 blockUserSignals();
226 }
227 #endif
228
229 ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord));
230 // otherwise adjust the padding in gen_workspace.
231
232 // this is the main thread
233 SET_GCT(gc_threads[cap->no]);
234
235 // tell the stats department that we've started a GC
236 stat_startGC(cap, gct);
237
238 // Lock the StablePtr table. This prevents FFI calls manipulating
239 // the table from occurring during GC.
240 stablePtrLock();
241
242 #if defined(DEBUG)
243 mutlist_MUTVARS = 0;
244 mutlist_MUTARRS = 0;
245 mutlist_MVARS = 0;
246 mutlist_TVAR = 0;
247 mutlist_TVAR_WATCH_QUEUE = 0;
248 mutlist_TREC_CHUNK = 0;
249 mutlist_TREC_HEADER = 0;
250 mutlist_OTHERS = 0;
251 #endif
252
253 // attribute any costs to CCS_GC
254 #if defined(PROFILING)
255 for (n = 0; n < n_capabilities; n++) {
256 save_CCS[n] = capabilities[n]->r.rCCCS;
257 capabilities[n]->r.rCCCS = CCS_GC;
258 }
259 #endif
260
261 /* Figure out which generation to collect
262 */
263 N = collect_gen;
264 major_gc = (N == RtsFlags.GcFlags.generations-1);
265
266 if (major_gc) {
267 prev_static_flag = static_flag;
268 static_flag =
269 static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
270 }
271
272 #if defined(THREADED_RTS)
273 work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
274 N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
275 // It's not always a good idea to do load balancing in parallel
276 // GC. In particular, for a parallel program we don't want to
277 // lose locality by moving cached data into another CPU's cache
278 // (this effect can be quite significant).
279 //
280 // We could have a more complex way to determine whether to do
281 // work stealing or not, e.g. it might be a good idea to do it
282 // if the heap is big. For now, we just turn it on or off with
283 // a flag.
284 #endif
285
286 /* Start threads, so they can be spinning up while we finish initialisation.
287 */
288 start_gc_threads();
289
290 #if defined(THREADED_RTS)
291 /* How many threads will be participating in this GC?
292 * We don't try to parallelise minor GCs (unless the user asks for
 293      * it with +RTS -qg0), or mark/compact/sweep GC.
294 */
295 if (gc_type == SYNC_GC_PAR) {
296 n_gc_threads = n_capabilities;
297 } else {
298 n_gc_threads = 1;
299 }
300 #else
301 n_gc_threads = 1;
302 #endif
303
304 debugTrace(DEBUG_gc, "GC (gen %d, using %d thread(s))",
305 N, n_gc_threads);
306
307 #if defined(DEBUG)
308 // check for memory leaks if DEBUG is on
309 memInventory(DEBUG_gc);
310 #endif
311
312 // do this *before* we start scavenging
313 collectFreshWeakPtrs();
314
315 // check sanity *before* GC
316 IF_DEBUG(sanity, checkSanity(false /* before GC */, major_gc));
317
318 // gather blocks allocated using allocatePinned() from each capability
319 // and put them on the g0->large_object list.
320 collect_pinned_object_blocks();
321
322 // Initialise all the generations that we're collecting.
323 for (g = 0; g <= N; g++) {
324 prepare_collected_gen(&generations[g]);
325 }
326 // Initialise all the generations that we're *not* collecting.
327 for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
328 prepare_uncollected_gen(&generations[g]);
329 }
330
331 // Prepare this gc_thread
332 init_gc_thread(gct);
333
334 /* Allocate a mark stack if we're doing a major collection.
335 */
336 if (major_gc && oldest_gen->mark) {
337 mark_stack_bd = allocBlock();
338 mark_stack_top_bd = mark_stack_bd;
339 mark_stack_bd->link = NULL;
340 mark_stack_bd->u.back = NULL;
341 mark_sp = mark_stack_bd->start;
342 } else {
343 mark_stack_bd = NULL;
344 mark_stack_top_bd = NULL;
345 mark_sp = NULL;
346 }
347
348 /* -----------------------------------------------------------------------
349 * follow all the roots that we know about:
350 */
351
352 // the main thread is running: this prevents any other threads from
353 // exiting prematurely, so we can start them now.
354 // NB. do this after the mutable lists have been saved above, otherwise
355 // the other GC threads will be writing into the old mutable lists.
356 inc_running();
357 wakeup_gc_threads(gct->thread_index, idle_cap);
358
359 traceEventGcWork(gct->cap);
360
361 // scavenge the capability-private mutable lists. This isn't part
362 // of markSomeCapabilities() because markSomeCapabilities() can only
363 // call back into the GC via mark_root() (due to the gct register
364 // variable).
365 if (n_gc_threads == 1) {
366 for (n = 0; n < n_capabilities; n++) {
367 #if defined(THREADED_RTS)
368 scavenge_capability_mut_Lists1(capabilities[n]);
369 #else
370 scavenge_capability_mut_lists(capabilities[n]);
371 #endif
372 }
373 } else {
374 scavenge_capability_mut_lists(gct->cap);
375 for (n = 0; n < n_capabilities; n++) {
376 if (idle_cap[n]) {
377 markCapability(mark_root, gct, capabilities[n],
378 true/*don't mark sparks*/);
379 scavenge_capability_mut_lists(capabilities[n]);
380 }
381 }
382 }
383
384 // follow roots from the CAF list (used by GHCi)
385 gct->evac_gen_no = 0;
386 markCAFs(mark_root, gct);
387
388 // follow all the roots that the application knows about.
389 gct->evac_gen_no = 0;
390 if (n_gc_threads == 1) {
391 for (n = 0; n < n_capabilities; n++) {
392 markCapability(mark_root, gct, capabilities[n],
393 true/*don't mark sparks*/);
394 }
395 } else {
396 markCapability(mark_root, gct, cap, true/*don't mark sparks*/);
397 }
398
399 markScheduler(mark_root, gct);
400
401 // Mark the weak pointer list, and prepare to detect dead weak pointers.
402 markWeakPtrList();
403 initWeakForGC();
404
405 // Mark the stable pointer table.
406 markStablePtrTable(mark_root, gct);
407
408 // Remember old stable name addresses.
409 rememberOldStableNameAddresses ();
410
411 /* -------------------------------------------------------------------------
412 * Repeatedly scavenge all the areas we know about until there's no
413 * more scavenging to be done.
414 */
415
416 StgWeak *dead_weak_ptr_list = NULL;
417 StgTSO *resurrected_threads = END_TSO_QUEUE;
418
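  // This loop iterates to a fixed point: each traverseWeakPtrList() pass may
  // find weak pointers whose keys turned out to be reachable and evacuate
  // their values and finalizers, which creates fresh scavenging work, so we
  // keep going until a pass evacuates nothing.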
419 for (;;)
420 {
421 scavenge_until_all_done();
422
423 // The other threads are now stopped. We might recurse back to
424 // here, but from now on this is the only thread.
425
426 // must be last... invariant is that everything is fully
427 // scavenged at this point.
428 if (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads)) { // returns true if evaced something
429 inc_running();
430 continue;
431 }
432
433 // If we get to here, there's really nothing left to do.
434 break;
435 }
436
437 shutdown_gc_threads(gct->thread_index, idle_cap);
438
439 // Now see which stable names are still alive.
440 gcStableNameTable();
441
442 #if defined(THREADED_RTS)
443 if (n_gc_threads == 1) {
444 for (n = 0; n < n_capabilities; n++) {
445 pruneSparkQueue(capabilities[n]);
446 }
447 } else {
448 for (n = 0; n < n_capabilities; n++) {
449 if (n == cap->no || idle_cap[n]) {
450 pruneSparkQueue(capabilities[n]);
451 }
452 }
453 }
454 #endif
455
456 #if defined(PROFILING)
457 // We call processHeapClosureForDead() on every closure destroyed during
458 // the current garbage collection, so we invoke LdvCensusForDead().
459 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
460 || RtsFlags.ProfFlags.bioSelector != NULL) {
461 RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
462 LdvCensusForDead(N);
463 ACQUIRE_SM_LOCK;
464 }
465 #endif
466
467 // NO MORE EVACUATION AFTER THIS POINT!
468
469 // Finally: compact or sweep the oldest generation.
470 if (major_gc && oldest_gen->mark) {
471 if (oldest_gen->compact)
472 compact(gct->scavenged_static_objects,
473 &dead_weak_ptr_list,
474 &resurrected_threads);
475 else
476 sweep(oldest_gen);
477 }
478
479 copied = 0;
480 par_max_copied = 0;
481 par_balanced_copied = 0;
482 gc_spin_spin = 0;
483 gc_spin_yield = 0;
484 mut_spin_spin = 0;
485 mut_spin_yield = 0;
486 any_work = 0;
487 no_work = 0;
488 scav_find_work = 0;
489 {
490 uint32_t i;
491 uint64_t par_balanced_copied_acc = 0;
492 const gc_thread* thread;
493
494 for (i=0; i < n_gc_threads; i++) {
495 copied += gc_threads[i]->copied;
496 }
497 for (i=0; i < n_gc_threads; i++) {
498 thread = gc_threads[i];
499 if (n_gc_threads > 1) {
500 debugTrace(DEBUG_gc,"thread %d:", i);
501 debugTrace(DEBUG_gc," copied %ld",
502 thread->copied * sizeof(W_));
503 debugTrace(DEBUG_gc," scanned %ld",
504 thread->scanned * sizeof(W_));
505 debugTrace(DEBUG_gc," any_work %ld",
506 thread->any_work);
507 debugTrace(DEBUG_gc," no_work %ld",
508 thread->no_work);
509 debugTrace(DEBUG_gc," scav_find_work %ld",
510 thread->scav_find_work);
511
512 #if defined(THREADED_RTS) && defined(PROF_SPIN)
513 gc_spin_spin += thread->gc_spin.spin;
514 gc_spin_yield += thread->gc_spin.yield;
515 mut_spin_spin += thread->mut_spin.spin;
516 mut_spin_yield += thread->mut_spin.yield;
517 #endif
518
519 any_work += thread->any_work;
520 no_work += thread->no_work;
521 scav_find_work += thread->scav_find_work;
522
523 par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
524 par_balanced_copied_acc +=
525 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
526 }
527 }
528 if (n_gc_threads > 1) {
529 // See Note [Work Balance] for an explanation of this computation
530 par_balanced_copied =
531 (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
532 (n_gc_threads - 1);
533 }
534 }
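  // Worked example of the computation above (illustrative numbers only):
  // with n_gc_threads = 2 and per-thread copied counts of 60 and 40 words,
  // copied = 100 and
  //
  //     par_balanced_copied_acc = min(2*60,100) + min(2*40,100) = 180
  //     par_balanced_copied     = (180 - 100 + 0) / (2 - 1)     = 80
  //
  // A perfect 50/50 split yields 'copied' itself; all work on one thread
  // yields 0.  See Note [Work Balance] for the full derivation.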
535
536 // Run through all the generations and tidy up.
537 // We're going to:
538 // - count the amount of "live" data (live_words, live_blocks)
539 // - count the amount of "copied" data in this GC (copied)
540 // - free from-space
541 // - make to-space the new from-space (set BF_EVACUATED on all blocks)
542 //
543 live_words = 0;
544 live_blocks = 0;
545
546 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
547
548 if (g == N) {
549 generations[g].collections++; // for stats
550 if (n_gc_threads > 1) generations[g].par_collections++;
551 }
552
553 // Count the mutable list as bytes "copied" for the purposes of
554 // stats. Every mutable list is copied during every GC.
555 if (g > 0) {
556 W_ mut_list_size = 0;
557 for (n = 0; n < n_capabilities; n++) {
558 mut_list_size += countOccupied(capabilities[n]->mut_lists[g]);
559 }
560 copied += mut_list_size;
561
562 debugTrace(DEBUG_gc,
563 "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)",
564 (unsigned long)(mut_list_size * sizeof(W_)),
565 mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS,
566 mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE,
567 mutlist_TREC_CHUNK, mutlist_TREC_HEADER,
568 mutlist_OTHERS);
569 }
570
571 bdescr *next, *prev;
572 gen = &generations[g];
573
574 // for generations we collected...
575 if (g <= N) {
576
577 /* free old memory and shift to-space into from-space for all
578 * the collected generations (except the allocation area). These
 579        * freed blocks will probably be quickly recycled.
580 */
581 if (gen->mark)
582 {
583 // tack the new blocks on the end of the existing blocks
584 if (gen->old_blocks != NULL) {
585
586 prev = NULL;
587 for (bd = gen->old_blocks; bd != NULL; bd = next) {
588
589 next = bd->link;
590
591 if (!(bd->flags & BF_MARKED))
592 {
593 if (prev == NULL) {
594 gen->old_blocks = next;
595 } else {
596 prev->link = next;
597 }
598 freeGroup(bd);
599 gen->n_old_blocks--;
600 }
601 else
602 {
603 gen->n_words += bd->free - bd->start;
604
605 // NB. this step might not be compacted next
606 // time, so reset the BF_MARKED flags.
607 // They are set before GC if we're going to
608 // compact. (search for BF_MARKED above).
609 bd->flags &= ~BF_MARKED;
610
611 // between GCs, all blocks in the heap except
612 // for the nursery have the BF_EVACUATED flag set.
613 bd->flags |= BF_EVACUATED;
614
615 prev = bd;
616 }
617 }
618
619 if (prev != NULL) {
620 prev->link = gen->blocks;
621 gen->blocks = gen->old_blocks;
622 }
623 }
624 // add the new blocks to the block tally
625 gen->n_blocks += gen->n_old_blocks;
626 ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
627 ASSERT(countOccupied(gen->blocks) == gen->n_words);
628 }
 629        else // not compacted
630 {
631 freeChain(gen->old_blocks);
632 }
633
634 gen->old_blocks = NULL;
635 gen->n_old_blocks = 0;
636
637 /* LARGE OBJECTS. The current live large objects are chained on
638 * scavenged_large, having been moved during garbage
639 * collection from large_objects. Any objects left on the
640 * large_objects list are therefore dead, so we free them here.
641 */
642 freeChain(gen->large_objects);
643 gen->large_objects = gen->scavenged_large_objects;
644 gen->n_large_blocks = gen->n_scavenged_large_blocks;
645 gen->n_large_words = countOccupied(gen->large_objects);
646 gen->n_new_large_words = 0;
647
648 /* COMPACT_NFDATA. The currently live compacts are chained
649 * to live_compact_objects, quite like large objects. And
650 * objects left on the compact_objects list are dead.
651 *
 652        * We don't run a simple freeChain because we want to give the
653 * CNF module some chance to free memory that freeChain would
654 * not see (namely blocks appended to a CNF through a compactResize).
655 *
656 * See Note [Compact Normal Forms] for details.
657 */
658 for (bd = gen->compact_objects; bd; bd = next) {
659 next = bd->link;
660 compactFree(((StgCompactNFDataBlock*)bd->start)->owner);
661 }
662 gen->compact_objects = gen->live_compact_objects;
663 gen->n_compact_blocks = gen->n_live_compact_blocks;
664 }
665 else // for generations > N
666 {
667 /* For older generations, we need to append the
668 * scavenged_large_object list (i.e. large objects that have been
669 * promoted during this GC) to the large_object list for that step.
670 */
671 for (bd = gen->scavenged_large_objects; bd; bd = next) {
672 next = bd->link;
673 dbl_link_onto(bd, &gen->large_objects);
674 gen->n_large_words += bd->free - bd->start;
675 }
676
677 // And same for compacts
678 for (bd = gen->live_compact_objects; bd; bd = next) {
679 next = bd->link;
680 dbl_link_onto(bd, &gen->compact_objects);
681 }
682
683 // add the new blocks we promoted during this GC
684 gen->n_large_blocks += gen->n_scavenged_large_blocks;
685 gen->n_compact_blocks += gen->n_live_compact_blocks;
686 }
687
688 ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
689 ASSERT(countOccupied(gen->large_objects) == gen->n_large_words);
 690        // We can't run the same assertions on compact objects because there
 691        // is memory "the GC doesn't see" (directly), but which is still
 692        // accounted for in gen->n_compact_blocks
693
694 gen->scavenged_large_objects = NULL;
695 gen->n_scavenged_large_blocks = 0;
696 gen->live_compact_objects = NULL;
697 gen->n_live_compact_blocks = 0;
698
699 // Count "live" data
700 live_words += genLiveWords(gen);
701 live_blocks += genLiveBlocks(gen);
702
703 // add in the partial blocks in the gen_workspaces
704 {
705 uint32_t i;
706 for (i = 0; i < n_capabilities; i++) {
707 live_words += gcThreadLiveWords(i, gen->no);
708 live_blocks += gcThreadLiveBlocks(i, gen->no);
709 }
710 }
711 } // for all generations
712
713 // update the max size of older generations after a major GC
714 resize_generations();
715
716 // Free the mark stack.
717 if (mark_stack_top_bd != NULL) {
718 debugTrace(DEBUG_gc, "mark stack: %d blocks",
719 countBlocks(mark_stack_top_bd));
720 freeChain(mark_stack_top_bd);
721 }
722
723 // Free any bitmaps.
724 for (g = 0; g <= N; g++) {
725 gen = &generations[g];
726 if (gen->bitmap != NULL) {
727 freeGroup(gen->bitmap);
728 gen->bitmap = NULL;
729 }
730 }
731
732 resize_nursery();
733
734 resetNurseries();
735
736 // mark the garbage collected CAFs as dead
737 #if defined(DEBUG)
738 if (major_gc) { gcCAFs(); }
739 #endif
740
741 // Update the stable name hash table
742 updateStableNameTable(major_gc);
743
744 // unlock the StablePtr table. Must be before scheduleFinalizers(),
745 // because a finalizer may call hs_free_fun_ptr() or
746 // hs_free_stable_ptr(), both of which access the StablePtr table.
747 stablePtrUnlock();
748
749 // Must be after stablePtrUnlock(), because it might free stable ptrs.
750 if (major_gc) {
751 checkUnload (gct->scavenged_static_objects);
752 }
753
754 #if defined(PROFILING)
755 // resetStaticObjectForProfiling() must be called before
756 // zeroing below.
757
758 // ToDo: fix the gct->scavenged_static_objects below
759 resetStaticObjectForProfiling(gct->scavenged_static_objects);
760 #endif
761
762 // Start any pending finalizers. Must be after
763 // updateStableTables() and stableUnlock() (see #4221).
764 RELEASE_SM_LOCK;
765 scheduleFinalizers(cap, dead_weak_ptr_list);
766 ACQUIRE_SM_LOCK;
767
768 // check sanity after GC
769 // before resurrectThreads(), because that might overwrite some
770 // closures, which will cause problems with THREADED where we don't
771 // fill slop.
772 IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc));
773
774 // If a heap census is due, we need to do it before
775 // resurrectThreads(), for the same reason as checkSanity above:
776 // resurrectThreads() will overwrite some closures and leave slop
777 // behind.
778 if (do_heap_census) {
779 debugTrace(DEBUG_sched, "performing heap census");
780 RELEASE_SM_LOCK;
781 heapCensus(gct->gc_start_cpu);
782 ACQUIRE_SM_LOCK;
783 }
784
785 // send exceptions to any threads which were about to die
786 RELEASE_SM_LOCK;
787 resurrectThreads(resurrected_threads);
788 ACQUIRE_SM_LOCK;
789
790 if (major_gc) {
791 W_ need_prealloc, need_live, need, got;
792 uint32_t i;
793
794 need_live = 0;
795 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
796 need_live += genLiveBlocks(&generations[i]);
797 }
798 need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live);
799
800 need_prealloc = 0;
801 for (i = 0; i < n_nurseries; i++) {
802 need_prealloc += nurseries[i].n_blocks;
803 }
804 need_prealloc += RtsFlags.GcFlags.largeAllocLim;
805 need_prealloc += countAllocdBlocks(exec_block);
806 need_prealloc += arenaBlocks();
807 #if defined(PROFILING)
808 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
809 need_prealloc = retainerStackBlocks();
810 }
811 #endif
812
813 /* If the amount of data remains constant, next major GC we'll
814 * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc
815 * in order to reduce repeated deallocation and reallocation. #14702
816 */
817 need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live;
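      /* Example with illustrative numbers: with oldGenFactor F = 2, 1000
       * live blocks and 200 preallocated blocks we keep 200 + (2+2)*1000 =
       * 4200 blocks' worth of memory, rather than shrinking to the ~3200
       * we expect to need and having to grow it back before the next
       * major GC.
       */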
818
819 /* Also, if user set heap size, do not drop below it.
820 */
821 need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need);
822
823 /* But with a large nursery, the above estimate might exceed
824 * maxHeapSize. A large resident set size might make the OS
825 * kill this process, or swap unnecessarily. Therefore we
826 * ensure that our estimate does not exceed maxHeapSize.
827 */
828 if (RtsFlags.GcFlags.maxHeapSize != 0) {
829 need = stg_min(RtsFlags.GcFlags.maxHeapSize, need);
830 }
831
832 need = BLOCKS_TO_MBLOCKS(need);
833
834 got = mblocks_allocated;
835
836 if (got > need) {
837 returnMemoryToOS(got - need);
838 }
839 }
840
841 // extra GC trace info
842 IF_DEBUG(gc, statDescribeGens());
843
844 #if defined(DEBUG)
845 // symbol-table based profiling
846 /* heapCensus(to_blocks); */ /* ToDo */
847 #endif
848
849 // restore enclosing cost centre
850 #if defined(PROFILING)
851 for (n = 0; n < n_capabilities; n++) {
852 capabilities[n]->r.rCCCS = save_CCS[n];
853 }
854 #endif
855
856 #if defined(DEBUG)
857 // check for memory leaks if DEBUG is on
858 memInventory(DEBUG_gc);
859 #endif
860
861 // ok, GC over: tell the stats department what happened.
862 stat_endGC(cap, gct, live_words, copied,
863 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
864 N, n_gc_threads, par_max_copied, par_balanced_copied,
865 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
866 any_work, no_work, scav_find_work);
867
868 #if defined(RTS_USER_SIGNALS)
869 if (RtsFlags.MiscFlags.install_signal_handlers) {
870 // unblock signals again
871 unblockUserSignals();
872 }
873 #endif
874
875 RELEASE_SM_LOCK;
876
877 SET_GCT(saved_gct);
878 }
879
880 /* -----------------------------------------------------------------------------
881 Heap overflow is indicated by setting a flag that the caller of
882 GarbageCollect can check. (not ideal, TODO: better)
883 -------------------------------------------------------------------------- */
884
885 static void heapOverflow(void)
886 {
887 heap_overflow = true;
888 }
889
890 /* -----------------------------------------------------------------------------
891 Initialise the gc_thread structures.
892 -------------------------------------------------------------------------- */
893
894 static void
895 new_gc_thread (uint32_t n, gc_thread *t)
896 {
897 uint32_t g;
898 gen_workspace *ws;
899
900 t->cap = capabilities[n];
901
902 #if defined(THREADED_RTS)
903 t->id = 0;
904 initSpinLock(&t->gc_spin);
905 initSpinLock(&t->mut_spin);
906 ACQUIRE_SPIN_LOCK(&t->gc_spin);
907 ACQUIRE_SPIN_LOCK(&t->mut_spin);
 908     t->wakeup = GC_THREAD_INACTIVE;  // starts INACTIVE, so we can wait for the
 909                                      // thread to start up; see wakeup_gc_threads
910 #endif
911
912 t->thread_index = n;
913 t->free_blocks = NULL;
914 t->gc_count = 0;
915
916 init_gc_thread(t);
917
918 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
919 {
920 ws = &t->gens[g];
921 ws->gen = &generations[g];
922 ASSERT(g == ws->gen->no);
923 ws->my_gct = t;
924
925 // We want to call
926 // alloc_todo_block(ws,0);
927 // but can't, because it uses gct which isn't set up at this point.
928 // Hence, allocate a block for todo_bd manually:
929 {
930 bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
931 // no lock, locks aren't initialised yet
932 initBdescr(bd, ws->gen, ws->gen->to);
933 bd->flags = BF_EVACUATED;
934 bd->u.scan = bd->free = bd->start;
935
936 ws->todo_bd = bd;
937 ws->todo_free = bd->free;
938 ws->todo_lim = bd->start + BLOCK_SIZE_W;
939 }
940
941 ws->todo_q = newWSDeque(128);
942 ws->todo_overflow = NULL;
943 ws->n_todo_overflow = 0;
944 ws->todo_large_objects = NULL;
945
946 ws->part_list = NULL;
947 ws->n_part_blocks = 0;
948 ws->n_part_words = 0;
949
950 ws->scavd_list = NULL;
951 ws->n_scavd_blocks = 0;
952 ws->n_scavd_words = 0;
953 }
954 }
955
956
957 void
958 initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
959 {
960 #if defined(THREADED_RTS)
961 uint32_t i;
962
963 if (from > 0) {
964 gc_threads = stgReallocBytes (gc_threads, to * sizeof(gc_thread*),
965 "initGcThreads");
966 } else {
967 gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
968 "initGcThreads");
969 }
970
971 for (i = from; i < to; i++) {
972 gc_threads[i] =
973 stgMallocBytes(sizeof(gc_thread) +
974 RtsFlags.GcFlags.generations * sizeof(gen_workspace),
975 "alloc_gc_threads");
976
977 new_gc_thread(i, gc_threads[i]);
978 }
979 #else
980 ASSERT(from == 0 && to == 1);
981 gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads");
982 gc_threads[0] = gct;
983 new_gc_thread(0,gc_threads[0]);
984 #endif
985 }
986
987 void
988 freeGcThreads (void)
989 {
990 uint32_t g;
991 if (gc_threads != NULL) {
992 #if defined(THREADED_RTS)
993 uint32_t i;
994 for (i = 0; i < n_capabilities; i++) {
995 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
996 {
997 freeWSDeque(gc_threads[i]->gens[g].todo_q);
998 }
999 stgFree (gc_threads[i]);
1000 }
1001 stgFree (gc_threads);
1002 #else
1003 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
1004 {
1005 freeWSDeque(gc_threads[0]->gens[g].todo_q);
1006 }
1007 stgFree (gc_threads);
1008 #endif
1009 gc_threads = NULL;
1010 }
1011 }
1012
1013 /* ----------------------------------------------------------------------------
1014 Start GC threads
1015 ------------------------------------------------------------------------- */
1016
1017 static volatile StgWord gc_running_threads;
1018
1019 static StgWord
1020 inc_running (void)
1021 {
1022 StgWord new;
1023 new = atomic_inc(&gc_running_threads, 1);
1024 ASSERT(new <= n_gc_threads);
1025 return new;
1026 }
1027
1028 static StgWord
1029 dec_running (void)
1030 {
1031 ASSERT(gc_running_threads != 0);
1032 return atomic_dec(&gc_running_threads);
1033 }
1034
1035 static bool
1036 any_work (void)
1037 {
1038 int g;
1039 gen_workspace *ws;
1040
1041 gct->any_work++;
1042
1043 write_barrier();
1044
1045 // scavenge objects in compacted generation
1046 if (mark_stack_bd != NULL && !mark_stack_empty()) {
1047 return true;
1048 }
1049
1050 // Check for global work in any gen. We don't need to check for
1051 // local work, because we have already exited scavenge_loop(),
1052 // which means there is no local work for this thread.
1053 for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) {
1054 ws = &gct->gens[g];
1055 if (ws->todo_large_objects) return true;
1056 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1057 if (ws->todo_overflow) return true;
1058 }
1059
1060 #if defined(THREADED_RTS)
1061 if (work_stealing) {
1062 uint32_t n;
1063 // look for work to steal
1064 for (n = 0; n < n_gc_threads; n++) {
1065 if (n == gct->thread_index) continue;
1066 for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
1067 ws = &gc_threads[n]->gens[g];
1068 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1069 }
1070 }
1071 }
1072 #endif
1073
1074 gct->no_work++;
1075 #if defined(THREADED_RTS)
1076 yieldThread();
1077 #endif
1078
1079 return false;
1080 }
1081
1082 static void
1083 scavenge_until_all_done (void)
1084 {
1085 DEBUG_ONLY( uint32_t r );
1086
1087
1088 loop:
1089 #if defined(THREADED_RTS)
1090 if (n_gc_threads > 1) {
1091 scavenge_loop();
1092 } else {
1093 scavenge_loop1();
1094 }
1095 #else
1096 scavenge_loop();
1097 #endif
1098
1099 collect_gct_blocks();
1100
1101 // scavenge_loop() only exits when there's no work to do
1102
1103 // This atomic decrement also serves as a full barrier to ensure that any
1104 // writes we made during scavenging are visible to other threads.
1105 #if defined(DEBUG)
1106 r = dec_running();
1107 #else
1108 dec_running();
1109 #endif
1110
1111 traceEventGcIdle(gct->cap);
1112
1113 debugTrace(DEBUG_gc, "%d GC threads still running", r);
1114
1115 while (gc_running_threads != 0) {
1116 // usleep(1);
1117 if (any_work()) {
1118 inc_running();
1119 traceEventGcWork(gct->cap);
1120 goto loop;
1121 }
1122 // any_work() does not remove the work from the queue, it
1123 // just checks for the presence of work. If we find any,
1124 // then we increment gc_running_threads and go back to
1125 // scavenge_loop() to perform any pending work.
1126 }
1127
1128 traceEventGcDone(gct->cap);
1129 }
1130
1131 #if defined(THREADED_RTS)
1132
1133 void
1134 gcWorkerThread (Capability *cap)
1135 {
1136 gc_thread *saved_gct;
1137
1138 // necessary if we stole a callee-saves register for gct:
1139 saved_gct = gct;
1140
1141 SET_GCT(gc_threads[cap->no]);
1142 gct->id = osThreadId();
1143
1144 // Wait until we're told to wake up
1145 RELEASE_SPIN_LOCK(&gct->mut_spin);
1146 // yieldThread();
1147 // Strangely, adding a yieldThread() here makes the CPU time
1148 // measurements more accurate on Linux, perhaps because it syncs
1149 // the CPU time across the multiple cores. Without this, CPU time
1150 // is heavily skewed towards GC rather than MUT.
1151 gct->wakeup = GC_THREAD_STANDING_BY;
1152 debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
1153 ACQUIRE_SPIN_LOCK(&gct->gc_spin);
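    // Spin-lock handshake with the main GC thread: wakeup_gc_threads()
    // releases gc_spin to start us, and we hold it for the duration of our
    // GC work; the main thread holds mut_spin while GC is in progress, and
    // releaseGCThreads() releases it to let us return to the mutator.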
1154
1155 init_gc_thread(gct);
1156
1157 traceEventGcWork(gct->cap);
1158
1159 // Every thread evacuates some roots.
1160 gct->evac_gen_no = 0;
1161 markCapability(mark_root, gct, cap, true/*prune sparks*/);
1162 scavenge_capability_mut_lists(cap);
1163
1164 scavenge_until_all_done();
1165
1166 #if defined(THREADED_RTS)
1167 // Now that the whole heap is marked, we discard any sparks that
1168 // were found to be unreachable. The main GC thread is currently
1169 // marking heap reachable via weak pointers, so it is
1170 // non-deterministic whether a spark will be retained if it is
1171 // only reachable via weak pointers. To fix this problem would
1172 // require another GC barrier, which is too high a price.
1173 pruneSparkQueue(cap);
1174 #endif
1175
1176 // Wait until we're told to continue
1177 RELEASE_SPIN_LOCK(&gct->gc_spin);
1178 gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
1179 debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
1180 gct->thread_index);
1181 ACQUIRE_SPIN_LOCK(&gct->mut_spin);
1182 debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
1183
1184 SET_GCT(saved_gct);
1185 }
1186
1187 #endif
1188
1189 #if defined(THREADED_RTS)
1190
1191 void
1192 waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1193 {
1194 const uint32_t n_threads = n_capabilities;
1195 const uint32_t me = cap->no;
1196 uint32_t i, j;
1197 bool retry = true;
1198 Time t0, t1, t2;
1199
1200 t0 = t1 = t2 = getProcessElapsedTime();
1201
1202 while(retry) {
1203 for (i=0; i < n_threads; i++) {
1204 if (i == me || idle_cap[i]) continue;
1205 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1206 prodCapability(capabilities[i], cap->running_task);
1207 }
1208 }
1209 for (j=0; j < 10; j++) {
1210 retry = false;
1211 for (i=0; i < n_threads; i++) {
1212 if (i == me || idle_cap[i]) continue;
1213 write_barrier();
1214 interruptCapability(capabilities[i]);
1215 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1216 retry = true;
1217 }
1218 }
1219 if (!retry) break;
1220 #if defined(PROF_SPIN)
1221 waitForGcThreads_yield++;
1222 #endif
1223 yieldThread();
1224 }
1225
1226 t2 = getProcessElapsedTime();
1227 if (RtsFlags.GcFlags.longGCSync != 0 &&
1228 t2 - t1 > RtsFlags.GcFlags.longGCSync) {
1229 /* call this every longGCSync of delay */
1230 rtsConfig.longGCSync(cap->no, t2 - t0);
1231 t1 = t2;
1232 }
1233 if (retry) {
1234 #if defined(PROF_SPIN)
1235 // This is a bit strange, we'll get more yields than spins.
1236 // I guess that means it's not a spin-lock at all, but these
1237 // numbers are still useful (I think).
1238 waitForGcThreads_spin++;
1239 #endif
1240 }
1241 }
1242
1243 if (RtsFlags.GcFlags.longGCSync != 0 &&
1244 t2 - t0 > RtsFlags.GcFlags.longGCSync) {
1245 rtsConfig.longGCSyncEnd(t2 - t0);
1246 }
1247 }
1248
1249 #endif // THREADED_RTS
1250
1251 static void
1252 start_gc_threads (void)
1253 {
1254 #if defined(THREADED_RTS)
1255 gc_running_threads = 0;
1256 #endif
1257 }
1258
1259 static void
1260 wakeup_gc_threads (uint32_t me USED_IF_THREADS,
1261 bool idle_cap[] USED_IF_THREADS)
1262 {
1263 #if defined(THREADED_RTS)
1264 uint32_t i;
1265
1266 if (n_gc_threads == 1) return;
1267
1268 for (i=0; i < n_gc_threads; i++) {
1269 if (i == me || idle_cap[i]) continue;
1270 inc_running();
1271 debugTrace(DEBUG_gc, "waking up gc thread %d", i);
1272 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY)
1273 barf("wakeup_gc_threads");
1274
1275 gc_threads[i]->wakeup = GC_THREAD_RUNNING;
1276 ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1277 RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1278 }
1279 #endif
1280 }
1281
1282 // After GC is complete, we must wait for all GC threads to enter the
1283 // standby state, otherwise they may still be executing inside
1284 // any_work(), and may even remain awake until the next GC starts.
1285 static void
1286 shutdown_gc_threads (uint32_t me USED_IF_THREADS,
1287 bool idle_cap[] USED_IF_THREADS)
1288 {
1289 #if defined(THREADED_RTS)
1290 uint32_t i;
1291
1292 if (n_gc_threads == 1) return;
1293
1294 for (i=0; i < n_gc_threads; i++) {
1295 if (i == me || idle_cap[i]) continue;
1296 while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) {
1297 busy_wait_nop();
1298 write_barrier();
1299 }
1300 }
1301 #endif
1302 }
1303
1304 #if defined(THREADED_RTS)
1305 void
1306 releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1307 {
1308 const uint32_t n_threads = n_capabilities;
1309 const uint32_t me = cap->no;
1310 uint32_t i;
1311 for (i=0; i < n_threads; i++) {
1312 if (i == me || idle_cap[i]) continue;
1313 if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
1314 barf("releaseGCThreads");
1315
1316 gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
1317 ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1318 RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1319 }
1320 }
1321 #endif
1322
1323 /* ----------------------------------------------------------------------------
1324 Initialise a generation that is to be collected
1325 ------------------------------------------------------------------------- */
1326
1327 static void
1328 prepare_collected_gen (generation *gen)
1329 {
1330 uint32_t i, g, n;
1331 gen_workspace *ws;
1332 bdescr *bd, *next;
1333
1334 // Throw away the current mutable list. Invariant: the mutable
1335 // list always has at least one block; this means we can avoid a
1336 // check for NULL in recordMutable().
1337 g = gen->no;
1338 if (g != 0) {
1339 for (i = 0; i < n_capabilities; i++) {
1340 freeChain(capabilities[i]->mut_lists[g]);
1341 capabilities[i]->mut_lists[g] =
1342 allocBlockOnNode(capNoToNumaNode(i));
1343 }
1344 }
1345
1346 gen = &generations[g];
1347 ASSERT(gen->no == g);
1348
1349 // we'll construct a new list of threads in this step
1350 // during GC, throw away the current list.
1351 gen->old_threads = gen->threads;
1352 gen->threads = END_TSO_QUEUE;
1353
1354 // deprecate the existing blocks
1355 gen->old_blocks = gen->blocks;
1356 gen->n_old_blocks = gen->n_blocks;
1357 gen->blocks = NULL;
1358 gen->n_blocks = 0;
1359 gen->n_words = 0;
1360 gen->live_estimate = 0;
1361
1362 // initialise the large object queues.
1363 ASSERT(gen->scavenged_large_objects == NULL);
1364 ASSERT(gen->n_scavenged_large_blocks == 0);
1365 ASSERT(gen->live_compact_objects == NULL);
1366 ASSERT(gen->n_live_compact_blocks == 0);
1367
1368 // grab all the partial blocks stashed in the gc_thread workspaces and
1369 // move them to the old_blocks list of this gen.
1370 for (n = 0; n < n_capabilities; n++) {
1371 ws = &gc_threads[n]->gens[gen->no];
1372
1373 for (bd = ws->part_list; bd != NULL; bd = next) {
1374 next = bd->link;
1375 bd->link = gen->old_blocks;
1376 gen->old_blocks = bd;
1377 gen->n_old_blocks += bd->blocks;
1378 }
1379 ws->part_list = NULL;
1380 ws->n_part_blocks = 0;
1381 ws->n_part_words = 0;
1382
1383 ASSERT(ws->scavd_list == NULL);
1384 ASSERT(ws->n_scavd_blocks == 0);
1385 ASSERT(ws->n_scavd_words == 0);
1386
1387 if (ws->todo_free != ws->todo_bd->start) {
1388 ws->todo_bd->free = ws->todo_free;
1389 ws->todo_bd->link = gen->old_blocks;
1390 gen->old_blocks = ws->todo_bd;
1391 gen->n_old_blocks += ws->todo_bd->blocks;
1392 alloc_todo_block(ws,0); // always has one block.
1393 }
1394 }
1395
1396 // mark the small objects as from-space
1397 for (bd = gen->old_blocks; bd; bd = bd->link) {
1398 bd->flags &= ~BF_EVACUATED;
1399 }
1400
1401 // mark the large objects as from-space
1402 for (bd = gen->large_objects; bd; bd = bd->link) {
1403 bd->flags &= ~BF_EVACUATED;
1404 }
1405
1406 // mark the compact objects as from-space
1407 for (bd = gen->compact_objects; bd; bd = bd->link) {
1408 bd->flags &= ~BF_EVACUATED;
1409 }
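    // (Clearing BF_EVACUATED is what makes these blocks "from-space":
    // evacuate() only copies objects out of blocks lacking the flag, and
    // the blocks that survive get it set again at the end of GC.)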
1410
1411 // for a compacted generation, we need to allocate the bitmap
1412 if (gen->mark) {
1413 StgWord bitmap_size; // in bytes
1414 bdescr *bitmap_bdescr;
1415 StgWord *bitmap;
1416
1417 bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
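        // i.e. one mark bit per word of old heap: BLOCK_SIZE / BITS_IN(W_)
        // bytes of bitmap per block (with the default 4K blocks, 64 bytes
        // per block on a 64-bit platform, matching the per-block stride
        // used below).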
1418
1419 if (bitmap_size > 0) {
1420 bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
1421 / BLOCK_SIZE);
1422 gen->bitmap = bitmap_bdescr;
1423 bitmap = bitmap_bdescr->start;
1424
1425 debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p",
1426 bitmap_size, bitmap);
1427
1428 // don't forget to fill it with zeros!
1429 memset(bitmap, 0, bitmap_size);
1430
1431 // For each block in this step, point to its bitmap from the
1432 // block descriptor.
1433 for (bd=gen->old_blocks; bd != NULL; bd = bd->link) {
1434 bd->u.bitmap = bitmap;
1435 bitmap += BLOCK_SIZE_W / BITS_IN(W_);
1436
1437 // Also at this point we set the BF_MARKED flag
1438 // for this block. The invariant is that
1439 // BF_MARKED is always unset, except during GC
1440 // when it is set on those blocks which will be
1441 // compacted.
1442 if (!(bd->flags & BF_FRAGMENTED)) {
1443 bd->flags |= BF_MARKED;
1444 }
1445
1446 // BF_SWEPT should be marked only for blocks that are being
1447 // collected in sweep()
1448 bd->flags &= ~BF_SWEPT;
1449 }
1450 }
1451 }
1452 }
1453
1454
1455 /* ----------------------------------------------------------------------------
1456 Save the mutable lists in saved_mut_lists
1457 ------------------------------------------------------------------------- */
1458
1459 static void
1460 stash_mut_list (Capability *cap, uint32_t gen_no)
1461 {
1462 cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
1463 cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
1464 }
1465
1466 /* ----------------------------------------------------------------------------
1467 Initialise a generation that is *not* to be collected
1468 ------------------------------------------------------------------------- */
1469
1470 static void
1471 prepare_uncollected_gen (generation *gen)
1472 {
1473 uint32_t i;
1474
1475
1476 ASSERT(gen->no > 0);
1477
1478 // save the current mutable lists for this generation, and
1479 // allocate a fresh block for each one. We'll traverse these
1480 // mutable lists as roots early on in the GC.
1481 for (i = 0; i < n_capabilities; i++) {
1482 stash_mut_list(capabilities[i], gen->no);
1483 }
1484
1485 ASSERT(gen->scavenged_large_objects == NULL);
1486 ASSERT(gen->n_scavenged_large_blocks == 0);
1487 }
1488
1489 /* -----------------------------------------------------------------------------
1490 Collect the completed blocks from a GC thread and attach them to
1491 the generation.
1492 -------------------------------------------------------------------------- */
1493
1494 static void
1495 collect_gct_blocks (void)
1496 {
1497 uint32_t g;
1498 gen_workspace *ws;
1499 bdescr *bd, *prev;
1500
1501 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1502 ws = &gct->gens[g];
1503
1504 // there may still be a block attached to ws->todo_bd;
1505 // leave it there to use next time.
1506
1507 if (ws->scavd_list != NULL) {
1508 ACQUIRE_SPIN_LOCK(&ws->gen->sync);
1509
1510 ASSERT(gct->scan_bd == NULL);
1511 ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
1512
1513 prev = NULL;
1514 for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
1515 prev = bd;
1516 }
1517 if (prev != NULL) {
1518 prev->link = ws->gen->blocks;
1519 ws->gen->blocks = ws->scavd_list;
1520 }
1521 ws->gen->n_blocks += ws->n_scavd_blocks;
1522 ws->gen->n_words += ws->n_scavd_words;
1523
1524 ws->scavd_list = NULL;
1525 ws->n_scavd_blocks = 0;
1526 ws->n_scavd_words = 0;
1527
1528 RELEASE_SPIN_LOCK(&ws->gen->sync);
1529 }
1530 }
1531 }
1532
1533 /* -----------------------------------------------------------------------------
1534 During mutation, any blocks that are filled by allocatePinned() are
1535 stashed on the local pinned_object_blocks list, to avoid needing to
1536 take a global lock. Here we collect those blocks from the
1537 cap->pinned_object_blocks lists and put them on the
1538 main g0->large_object list.
1539 -------------------------------------------------------------------------- */
1540
1541 static void
1542 collect_pinned_object_blocks (void)
1543 {
1544 uint32_t n;
1545 bdescr *bd, *prev;
1546
1547 for (n = 0; n < n_capabilities; n++) {
1548 prev = NULL;
1549 for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1550 prev = bd;
1551 }
1552 if (prev != NULL) {
1553 prev->link = g0->large_objects;
1554 if (g0->large_objects != NULL) {
1555 g0->large_objects->u.back = prev;
1556 }
1557 g0->large_objects = capabilities[n]->pinned_object_blocks;
1558 capabilities[n]->pinned_object_blocks = 0;
1559 }
1560 }
1561 }
1562
1563 /* -----------------------------------------------------------------------------
1564 Initialise a gc_thread before GC
1565 -------------------------------------------------------------------------- */
1566
1567 static void
1568 init_gc_thread (gc_thread *t)
1569 {
1570 t->static_objects = END_OF_STATIC_OBJECT_LIST;
1571 t->scavenged_static_objects = END_OF_STATIC_OBJECT_LIST;
1572 t->scan_bd = NULL;
1573 t->mut_lists = t->cap->mut_lists;
1574 t->evac_gen_no = 0;
1575 t->failed_to_evac = false;
1576 t->eager_promotion = true;
1577 t->thunk_selector_depth = 0;
1578 t->copied = 0;
1579 t->scanned = 0;
1580 t->any_work = 0;
1581 t->no_work = 0;
1582 t->scav_find_work = 0;
1583 }
1584
1585 /* -----------------------------------------------------------------------------
1586 Function we pass to evacuate roots.
1587 -------------------------------------------------------------------------- */
1588
1589 static void
1590 mark_root(void *user USED_IF_THREADS, StgClosure **root)
1591 {
1592 // we stole a register for gct, but this function is called from
1593 // *outside* the GC where the register variable is not in effect,
1594 // so we need to save and restore it here. NB. only call
1595 // mark_root() from the main GC thread, otherwise gct will be
1596 // incorrect.
1597 #if defined(THREADED_RTS)
1598 gc_thread *saved_gct;
1599 saved_gct = gct;
1600 #endif
1601 SET_GCT(user);
1602
1603 evacuate(root);
1604
1605 SET_GCT(saved_gct);
1606 }
1607
1608 /* ----------------------------------------------------------------------------
1609 Reset the sizes of the older generations when we do a major
1610 collection.
1611
1612 CURRENT STRATEGY: make all generations except zero the same size.
1613 We have to stay within the maximum heap size, and leave a certain
1614 percentage of the maximum heap size available to allocate into.
1615 ------------------------------------------------------------------------- */
1616
1617 static void
1618 resize_generations (void)
1619 {
1620 uint32_t g;
1621
1622 if (major_gc && RtsFlags.GcFlags.generations > 1) {
1623 W_ live, size, min_alloc, words;
1624 const W_ max = RtsFlags.GcFlags.maxHeapSize;
1625 const W_ gens = RtsFlags.GcFlags.generations;
1626
1627 // live in the oldest generations
1628 if (oldest_gen->live_estimate != 0) {
1629 words = oldest_gen->live_estimate;
1630 } else {
1631 words = oldest_gen->n_words;
1632 }
1633 live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
1634 oldest_gen->n_large_blocks +
1635 oldest_gen->n_compact_blocks;
1636
1637 // default max size for all generations except zero
1638 size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
1639 RtsFlags.GcFlags.minOldGenSize);
1640
1641 if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
1642 if (max > 0) {
1643 RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
1644 } else {
1645 RtsFlags.GcFlags.heapSizeSuggestion = size;
1646 }
1647 }
1648
1649 // minimum size for generation zero
1650 min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
1651 RtsFlags.GcFlags.minAllocAreaSize
1652 * (W_)n_capabilities);
1653
1654 // Auto-enable compaction when the residency reaches a
1655 // certain percentage of the maximum heap size (default: 30%).
1656 if (RtsFlags.GcFlags.compact ||
1657 (max > 0 &&
1658 oldest_gen->n_blocks >
1659 (RtsFlags.GcFlags.compactThreshold * max) / 100)) {
1660 oldest_gen->mark = 1;
1661 oldest_gen->compact = 1;
1662 // debugBelch("compaction: on\n", live);
1663 } else {
1664 oldest_gen->mark = 0;
1665 oldest_gen->compact = 0;
1666 // debugBelch("compaction: off\n", live);
1667 }
1668
1669 if (RtsFlags.GcFlags.sweep) {
1670 oldest_gen->mark = 1;
1671 }
1672
1673 // if we're going to go over the maximum heap size, reduce the
1674 // size of the generations accordingly. The calculation is
1675 // different if compaction is turned on, because we don't need
1676 // to double the space required to collect the old generation.
1677 if (max != 0) {
1678
1679 // this test is necessary to ensure that the calculations
1680 // below don't have any negative results - we're working
1681 // with unsigned values here.
1682 if (max < min_alloc) {
1683 heapOverflow();
1684 }
1685
1686 if (oldest_gen->compact) {
1687 if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
1688 size = (max - min_alloc) / ((gens - 1) * 2 - 1);
1689 }
1690 } else {
1691 if ( (size * (gens - 1) * 2) + min_alloc > max ) {
1692 size = (max - min_alloc) / ((gens - 1) * 2);
1693 }
1694 }
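        /* Where these formulas come from (a sketch): g0 gets min_alloc and
         * each of the (gens-1) older generations gets roughly 'size' blocks.
         * Copying collection needs about twice that per generation
         * (from-space plus to-space), hence size*(gens-1)*2 + min_alloc.
         * With compaction the oldest generation is collected in place and
         * needs only ~size, giving size*((gens-1)*2 - 1) + min_alloc; each
         * branch solves for 'size' when that budget would exceed 'max'.
         */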
1695
1696 if (size < live) {
1697 heapOverflow();
1698 }
1699 }
1700
1701 #if 0
1702 debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
1703 min_alloc, size, max);
1704 #endif
1705
1706 for (g = 0; g < gens; g++) {
1707 generations[g].max_blocks = size;
1708 }
1709 }
1710 }
1711
1712 /* -----------------------------------------------------------------------------
1713 Calculate the new size of the nursery, and resize it.
1714 -------------------------------------------------------------------------- */
1715
1716 static void
1717 resize_nursery (void)
1718 {
1719 const StgWord min_nursery =
1720 RtsFlags.GcFlags.minAllocAreaSize * (StgWord)n_capabilities;
1721
1722 if (RtsFlags.GcFlags.generations == 1)
1723 { // Two-space collector:
1724 W_ blocks;
1725
1726 /* set up a new nursery. Allocate a nursery size based on a
1727 * function of the amount of live data (by default a factor of 2)
1728 * Use the blocks from the old nursery if possible, freeing up any
1729 * left over blocks.
1730 *
1731 * If we get near the maximum heap size, then adjust our nursery
1732 * size accordingly. If the nursery is the same size as the live
1733 * data (L), then we need 3L bytes. We can reduce the size of the
1734 * nursery to bring the required memory down near 2L bytes.
1735 *
1736 * A normal 2-space collector would need 4L bytes to give the same
1737 * performance we get from 3L bytes, reducing to the same
1738 * performance at 2L bytes.
1739 */
1740 blocks = generations[0].n_blocks;
1741
1742 if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
1743 blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
1744 RtsFlags.GcFlags.maxHeapSize )
1745 {
1746 long adjusted_blocks; // signed on purpose
1747 int pc_free;
1748
1749 adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
1750
1751 debugTrace(DEBUG_gc, "near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld",
1752 RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks);
1753
1754 pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
 1755             if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even be < 0 */
1756 {
1757 heapOverflow();
1758 }
1759 blocks = adjusted_blocks;
1760 }
1761 else
1762 {
1763 blocks *= RtsFlags.GcFlags.oldGenFactor;
1764 if (blocks < min_nursery)
1765 {
1766 blocks = min_nursery;
1767 }
1768 }
1769 resizeNurseries(blocks);
1770 }
1771 else // Generational collector
1772 {
1773 /*
1774 * If the user has given us a suggested heap size, adjust our
1775 * allocation area to make best use of the memory available.
1776 */
1777 if (RtsFlags.GcFlags.heapSizeSuggestion)
1778 {
1779 long blocks;
1780 StgWord needed;
1781
1782 calcNeeded(false, &needed); // approx blocks needed at next GC
1783
1784 /* Guess how much will be live in generation 0 step 0 next time.
1785 * A good approximation is obtained by finding the
1786 * percentage of g0 that was live at the last minor GC.
1787 *
1788 * We have an accurate figure for the amount of copied data in
1789 * 'copied', but we must convert this to a number of blocks, with
1790 * a small adjustment for estimated slop at the end of a block
1791 * (- 10 words).
1792 */
1793 if (N == 0)
1794 {
1795 g0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100)
1796 / countNurseryBlocks();
1797 }
1798
1799 /* Estimate a size for the allocation area based on the
1800 * information available. We might end up going slightly under
1801 * or over the suggested heap size, but we should be pretty
1802 * close on average.
1803 *
1804 * Formula: suggested - needed
1805 * ----------------------------
1806 * 1 + g0_pcnt_kept/100
1807 *
1808 * where 'needed' is the amount of memory needed at the next
1809 * collection for collecting all gens except g0.
1810 */
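            /* For example (illustrative numbers): with a suggestion of
             * 10000 blocks, needed = 2000 and g0_pcnt_kept = 30, we get
             * (10000 - 2000) * 100 / 130 ~= 6150 blocks, so the ~30% of
             * the allocation area expected to survive still fits within
             * the suggested heap size at the next GC.
             */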
1811 blocks =
1812 (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
1813 (100 + (long)g0_pcnt_kept);
1814
1815 if (blocks < (long)min_nursery) {
1816 blocks = min_nursery;
1817 }
1818
1819 resizeNurseries((W_)blocks);
1820 }
1821 else
1822 {
1823 // we might have added extra blocks to the nursery, so
1824 // resize back to the original size again.
1825 resizeNurseriesFixed();
1826 }
1827 }
1828 }
1829
1830 /* -----------------------------------------------------------------------------
1831 Sanity code for CAF garbage collection.
1832
1833 With DEBUG turned on, we manage a CAF list in addition to the SRT
1834 mechanism. After GC, we run down the CAF list and make any
1835 CAFs which have been garbage collected GCD_CAF. This means we get an error
1836 whenever the program tries to enter a garbage collected CAF.
1837
1838 Any garbage collected CAFs are taken off the CAF list at the same
1839 time.
1840 -------------------------------------------------------------------------- */
1841
1842 #if defined(DEBUG)
1843
1844 static void gcCAFs(void)
1845 {
1846 StgIndStatic *p, *prev;
1847
1848 const StgInfoTable *info;
1849 uint32_t i;
1850
1851 i = 0;
1852 p = debug_caf_list;
1853 prev = NULL;
1854
1855 for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST;
1856 p = (StgIndStatic*)p->saved_info) {
1857
1858 info = get_itbl((StgClosure*)p);
1859 ASSERT(info->type == IND_STATIC);
1860
1861 // See Note [STATIC_LINK fields] in Storage.h
 1862         // This condition identifies CAFs that have just been GC'd,
 1863         // excluding those with static_link==3, which should be ignored.
1864 if ((((StgWord)(p->static_link)&STATIC_BITS) | prev_static_flag) != 3) {
1865 debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p);
1866 SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it
1867 if (prev == NULL) {
1868 debug_caf_list = (StgIndStatic*)p->saved_info;
1869 } else {
1870 prev->saved_info = p->saved_info;
1871 }
1872 } else {
1873 prev = p;
1874 i++;
1875 }
1876 }
1877
1878 debugTrace(DEBUG_gccafs, "%d CAFs live", i);
1879 }
1880 #endif
1881
1882
1883 /* -----------------------------------------------------------------------------
1884 The GC can leave some work for the mutator to do before the next
1885 GC, provided the work can be safely overlapped with mutation. This
1886 can help reduce the GC pause time.
1887
1888 The mutator can call doIdleGCWork() any time it likes, but
1889 preferably when it is idle. It's safe for multiple capabilities to
1890 call doIdleGCWork().
1891
1892 When 'all' is
1893 * false: doIdleGCWork() should only take a short, bounded, amount
1894 of time.
1895 * true: doIdleGCWork() will complete all the outstanding GC work.
1896
1897 The return value is
1898 * true if there's more to do (only if 'all' is false).
1899 * false otherwise.
1900 -------------------------------------------------------------------------- */
1901
1902 bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
1903 {
1904 return runSomeFinalizers(all);
1905 }