1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team 1998-2008
4 *
5 * Generational garbage collector
6 *
7 * Documentation on the architecture of the Garbage Collector can be
8 * found in the online commentary:
9 *
10 * https://gitlab.haskell.org/ghc/ghc/wikis/commentary/rts/storage/gc
11 *
12 * ---------------------------------------------------------------------------*/
13
14 #include "PosixSource.h"
15 #include "Rts.h"
16 #include "HsFFI.h"
17
18 #include "GC.h"
19 #include "GCThread.h"
20 #include "GCTDecl.h" // NB. before RtsSignals.h which
21 // clobbers REG_R1 on arm/Linux
22 #include "Compact.h"
23 #include "Evac.h"
24 #include "Scav.h"
25 #include "GCUtils.h"
26 #include "MarkStack.h"
27 #include "MarkWeak.h"
28 #include "Sparks.h"
29 #include "Sweep.h"
30
31 #include "Arena.h"
32 #include "Storage.h"
33 #include "RtsUtils.h"
34 #include "Apply.h"
35 #include "Updates.h"
36 #include "Stats.h"
37 #include "Schedule.h"
38 #include "Sanity.h"
39 #include "BlockAlloc.h"
40 #include "ProfHeap.h"
41 #include "Weak.h"
42 #include "Prelude.h"
43 #include "RtsSignals.h"
44 #include "STM.h"
45 #include "Trace.h"
46 #include "RetainerProfile.h"
47 #include "LdvProfile.h"
48 #include "RaiseAsync.h"
49 #include "StableName.h"
50 #include "StablePtr.h"
51 #include "CheckUnload.h"
52 #include "CNF.h"
53 #include "RtsFlags.h"
54 #include "NonMoving.h"
55
56 #include <string.h> // for memset()
57 #include <unistd.h>
58
59 /* -----------------------------------------------------------------------------
60 Global variables
61 -------------------------------------------------------------------------- */
62
63 /* STATIC OBJECT LIST.
64 *
65 * During GC:
66 * We maintain a linked list of static objects that are still live.
67 * The requirements for this list are:
68 *
69 * - we need to scan the list while adding to it, in order to
70 * scavenge all the static objects (in the same way that
71 * breadth-first scavenging works for dynamic objects).
72 *
73 * - we need to be able to tell whether an object is already on
74 * the list, to break loops.
75 *
76 * Each static object has a "static link field", which we use for
77 * linking objects on to the list. We use a stack-type list, consing
78 * objects on the front as they are added (this means that the
79 * scavenge phase is depth-first, not breadth-first, but that
80 * shouldn't matter).
81 *
82 * A separate list is kept for objects that have been scavenged
83 * already - this is so that we can zero all the marks afterwards.
84 *
85 * An object is on the list if its static link field is non-zero; this
86 * means that we have to mark the end of the list with '1', not NULL.
87 *
88 * Extra notes for generational GC:
89 *
90 * Each generation has a static object list associated with it. When
91 * collecting generations up to N, we treat the static object lists
92 * from generations > N as roots.
93 *
94 * We build up a static object list while collecting generations 0..N,
95 * which is then appended to the static object list of generation N+1.
96 *
97 * See also: Note [STATIC_LINK fields] in Storage.h.
98 */
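/* Editor's note: the following is an illustrative sketch, not part of the
 * RTS. It shows the "stack-type" static object list described above using a
 * simplified, hypothetical node type (DemoStatic, demo_push_static, etc.);
 * a non-zero end marker of 1 plays the role described above, so that a zero
 * static link still means "not on any list". Guarded by #if 0 so it is never
 * compiled.
 */
#if 0
typedef struct DemoStatic_ {
    struct DemoStatic_ *static_link;   /* 0 = not on any list */
    int payload;
} DemoStatic;

#define DEMO_END_OF_LIST ((DemoStatic *)1)   /* non-zero end-of-list marker */

static DemoStatic *demo_static_list = DEMO_END_OF_LIST;

static void demo_push_static (DemoStatic *obj)
{
    if (obj->static_link != NULL) return;  /* already on a list; break loops */
    obj->static_link = demo_static_list;   /* cons onto the front */
    demo_static_list = obj;
}

static void demo_scavenge_statics (void)
{
    /* Pop from the front until the list is empty.  Because new objects are
     * consed onto the front, work added during scavenging is still seen. */
    while (demo_static_list != DEMO_END_OF_LIST) {
        DemoStatic *p = demo_static_list;
        demo_static_list = p->static_link;
        /* ... scavenge the object pointed to by p ...
         * (the real GC also moves p onto a "scavenged" list so that the
         * marks can be cleared afterwards) */
    }
}
#endif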
99
100 /* N is the oldest generation being collected, where the generations
101 * are numbered starting at 0. A major GC (indicated by the major_gc
102 * flag) is when we're collecting all generations. We only attempt to
103 * deal with static objects and GC CAFs when doing a major GC.
104 */
105 uint32_t N;
106 bool major_gc;
107 bool deadlock_detect_gc;
108
109 /* Data used for allocation area sizing.
110 */
111 static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
112
113 /* Mut-list stats */
114 #if defined(DEBUG)
115 uint32_t mutlist_MUTVARS,
116 mutlist_MUTARRS,
117 mutlist_MVARS,
118 mutlist_TVAR,
119 mutlist_TVAR_WATCH_QUEUE,
120 mutlist_TREC_CHUNK,
121 mutlist_TREC_HEADER,
122 mutlist_OTHERS;
123 #endif
124
125 /* Thread-local data for each GC thread
126 */
127 gc_thread **gc_threads = NULL;
128
129 #if !defined(THREADED_RTS)
130 // Must be aligned to 64 bytes to meet the stated 64-byte alignment of gen_workspace
131 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
132 ATTRIBUTE_ALIGNED(64);
133 #endif
134
135 // Number of threads running in *this* GC. Affects how many
136 // step->todos[] lists we have to look in to find work.
137 uint32_t n_gc_threads;
138
139 // For stats:
140 static long copied; // *words* copied & scavenged during this GC
141
142 #if defined(PROF_SPIN) && defined(THREADED_RTS)
143 // spin and yield counts for the quasi-SpinLock in waitForGcThreads
144 volatile StgWord64 waitForGcThreads_spin = 0;
145 volatile StgWord64 waitForGcThreads_yield = 0;
146 volatile StgWord64 whitehole_gc_spin = 0;
147 #endif
148
149 bool work_stealing;
150
151 uint32_t static_flag = STATIC_FLAG_B;
152 uint32_t prev_static_flag = STATIC_FLAG_A;
153
154 DECLARE_GCT
155
156 /* -----------------------------------------------------------------------------
157 Static function declarations
158 -------------------------------------------------------------------------- */
159
160 static void mark_root (void *user, StgClosure **root);
161 static void prepare_collected_gen (generation *gen);
162 static void prepare_uncollected_gen (generation *gen);
163 static void init_gc_thread (gc_thread *t);
164 static void resize_nursery (void);
165 static void start_gc_threads (void);
166 static void scavenge_until_all_done (void);
167 static StgWord inc_running (void);
168 static StgWord dec_running (void);
169 static void wakeup_gc_threads (uint32_t me, bool idle_cap[]);
170 static void shutdown_gc_threads (uint32_t me, bool idle_cap[]);
171 static void collect_gct_blocks (void);
172 static void collect_pinned_object_blocks (void);
173 static void heapOverflow (void);
174
175 #if defined(DEBUG)
176 static void gcCAFs (void);
177 #endif
178
179 /* -----------------------------------------------------------------------------
180 The mark stack.
181 -------------------------------------------------------------------------- */
182
183 bdescr *mark_stack_top_bd; // topmost block in the mark stack
184 bdescr *mark_stack_bd; // current block in the mark stack
185 StgPtr mark_sp; // pointer to the next unallocated mark stack entry
186
187 /* -----------------------------------------------------------------------------
188 GarbageCollect: the main entry point to the garbage collector.
189
190 The collect_gen parameter is obtained by calling calcNeeded().
191
192 Locks held: all capabilities are held throughout GarbageCollect().
193 -------------------------------------------------------------------------- */
194
195 void
196 GarbageCollect (uint32_t collect_gen,
197 const bool do_heap_census,
198 const bool deadlock_detect,
199 uint32_t gc_type USED_IF_THREADS,
200 Capability *cap,
201 bool idle_cap[])
202 {
203 bdescr *bd;
204 generation *gen;
205 StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
206 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
207 any_work, no_work, scav_find_work;
208 #if defined(THREADED_RTS)
209 gc_thread *saved_gct;
210 #endif
211 uint32_t g, n;
212
213 // necessary if we stole a callee-saves register for gct:
214 #if defined(THREADED_RTS)
215 saved_gct = gct;
216 #endif
217
218 #if defined(PROFILING)
219 CostCentreStack *save_CCS[n_capabilities];
220 #endif
221
222 ACQUIRE_SM_LOCK;
223
224 #if defined(RTS_USER_SIGNALS)
225 if (RtsFlags.MiscFlags.install_signal_handlers) {
226 // block signals
227 blockUserSignals();
228 }
229 #endif
230
231 ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord));
232 // otherwise adjust the padding in gen_workspace.
233
234 // this is the main thread
235 SET_GCT(gc_threads[cap->no]);
236
237 // tell the stats department that we've started a GC
238 stat_startGC(cap, gct);
239
240 // Lock the StablePtr table. This prevents FFI calls manipulating
241 // the table from occurring during GC.
242 stablePtrLock();
243
244 #if defined(DEBUG)
245 mutlist_MUTVARS = 0;
246 mutlist_MUTARRS = 0;
247 mutlist_MVARS = 0;
248 mutlist_TVAR = 0;
249 mutlist_TVAR_WATCH_QUEUE = 0;
250 mutlist_TREC_CHUNK = 0;
251 mutlist_TREC_HEADER = 0;
252 mutlist_OTHERS = 0;
253 #endif
254
255 // attribute any costs to CCS_GC
256 #if defined(PROFILING)
257 for (n = 0; n < n_capabilities; n++) {
258 save_CCS[n] = capabilities[n]->r.rCCCS;
259 capabilities[n]->r.rCCCS = CCS_GC;
260 }
261 #endif
262
263 /* Figure out which generation to collect
264 */
265 N = collect_gen;
266 major_gc = (N == RtsFlags.GcFlags.generations-1);
267
268 /* See Note [Deadlock detection under nonmoving collector]. */
269 deadlock_detect_gc = deadlock_detect;
270
271 #if defined(THREADED_RTS)
272 if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) {
273 /* If there is already a concurrent major collection running then
274 * there is no benefit to starting another.
275 * TODO: Catch heap-size runaway.
276 */
277 N--;
278 collect_gen--;
279 major_gc = false;
280 }
281 #endif
282
283 /* N.B. The nonmoving collector works a bit differently. See
284 * Note [Static objects under the nonmoving collector].
285 */
286 if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
287 prev_static_flag = static_flag;
288 static_flag =
289 static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
290 }
291
292 #if defined(THREADED_RTS)
293 work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
294 N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
295 // It's not always a good idea to do load balancing in parallel
296 // GC. In particular, for a parallel program we don't want to
297 // lose locality by moving cached data into another CPU's cache
298 // (this effect can be quite significant).
299 //
300 // We could have a more complex way to determine whether to do
301 // work stealing or not, e.g. it might be a good idea to do it
302 // if the heap is big. For now, we just turn it on or off with
303 // a flag.
304 #endif
305
306 /* Start threads, so they can be spinning up while we finish initialisation.
307 */
308 start_gc_threads();
309
310 #if defined(THREADED_RTS)
311 /* How many threads will be participating in this GC?
312 * We don't try to parallelise minor GCs (unless the user asks for
313 * it with +RTS -qg0), or mark/compact/sweep GC.
314 */
315 if (gc_type == SYNC_GC_PAR) {
316 n_gc_threads = n_capabilities;
317 } else {
318 n_gc_threads = 1;
319 }
320 #else
321 n_gc_threads = 1;
322 #endif
323
324 debugTrace(DEBUG_gc, "GC (gen %d, using %d thread(s))",
325 N, n_gc_threads);
326
327 #if defined(DEBUG)
328 // check for memory leaks if DEBUG is on
329 memInventory(DEBUG_gc);
330 #endif
331
332 // do this *before* we start scavenging
333 collectFreshWeakPtrs();
334
335 // check sanity *before* GC
336 IF_DEBUG(sanity, checkSanity(false /* before GC */, major_gc));
337
338 // gather blocks allocated using allocatePinned() from each capability
339 // and put them on the g0->large_object list.
340 collect_pinned_object_blocks();
341
342 // Initialise all the generations that we're collecting.
343 for (g = 0; g <= N; g++) {
344 prepare_collected_gen(&generations[g]);
345 }
346 // Initialise all the generations that we're *not* collecting.
347 for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
348 prepare_uncollected_gen(&generations[g]);
349 }
350
351 // Prepare this gc_thread
352 init_gc_thread(gct);
353
354 /* Allocate a mark stack if we're doing a major collection.
355 */
356 if (major_gc && oldest_gen->mark) {
357 mark_stack_bd = allocBlock();
358 mark_stack_top_bd = mark_stack_bd;
359 mark_stack_bd->link = NULL;
360 mark_stack_bd->u.back = NULL;
361 mark_sp = mark_stack_bd->start;
362 } else {
363 mark_stack_bd = NULL;
364 mark_stack_top_bd = NULL;
365 mark_sp = NULL;
366 }
367
368 /* -----------------------------------------------------------------------
369 * follow all the roots that we know about:
370 */
371
372 // the main thread is running: this prevents any other threads from
373 // exiting prematurely, so we can start them now.
374 // NB. do this after the mutable lists have been saved above, otherwise
375 // the other GC threads will be writing into the old mutable lists.
376 inc_running();
377 wakeup_gc_threads(gct->thread_index, idle_cap);
378
379 traceEventGcWork(gct->cap);
380
381 // scavenge the capability-private mutable lists. This isn't part
382 // of markSomeCapabilities() because markSomeCapabilities() can only
383 // call back into the GC via mark_root() (due to the gct register
384 // variable).
385 if (n_gc_threads == 1) {
386 for (n = 0; n < n_capabilities; n++) {
387 #if defined(THREADED_RTS)
388 scavenge_capability_mut_Lists1(capabilities[n]);
389 #else
390 scavenge_capability_mut_lists(capabilities[n]);
391 #endif
392 }
393 } else {
394 scavenge_capability_mut_lists(gct->cap);
395 for (n = 0; n < n_capabilities; n++) {
396 if (idle_cap[n]) {
397 markCapability(mark_root, gct, capabilities[n],
398 true/*don't mark sparks*/);
399 scavenge_capability_mut_lists(capabilities[n]);
400 }
401 }
402 }
403
404 // follow roots from the CAF list (used by GHCi)
405 gct->evac_gen_no = 0;
406 markCAFs(mark_root, gct);
407
408 // follow all the roots that the application knows about.
409 gct->evac_gen_no = 0;
410 if (n_gc_threads == 1) {
411 for (n = 0; n < n_capabilities; n++) {
412 markCapability(mark_root, gct, capabilities[n],
413 true/*don't mark sparks*/);
414 }
415 } else {
416 markCapability(mark_root, gct, cap, true/*don't mark sparks*/);
417 }
418
419 markScheduler(mark_root, gct);
420
421 // Mark the weak pointer list, and prepare to detect dead weak pointers.
422 markWeakPtrList();
423 initWeakForGC();
424
425 // Mark the stable pointer table.
426 markStablePtrTable(mark_root, gct);
427
428 // Remember old stable name addresses.
429 rememberOldStableNameAddresses ();
430
431 /* -------------------------------------------------------------------------
432 * Repeatedly scavenge all the areas we know about until there's no
433 * more scavenging to be done.
434 */
435
436 StgWeak *dead_weak_ptr_list = NULL;
437 StgTSO *resurrected_threads = END_TSO_QUEUE;
438
439 for (;;)
440 {
441 scavenge_until_all_done();
442
443 // The other threads are now stopped. We might recurse back to
444 // here, but from now on this is the only thread.
445
446 // must be last... invariant is that everything is fully
447 // scavenged at this point.
448 if (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads)) { // returns true if evaced something
449 inc_running();
450 continue;
451 }
452
453 // If we get to here, there's really nothing left to do.
454 break;
455 }
456
457 shutdown_gc_threads(gct->thread_index, idle_cap);
458
459 // Now see which stable names are still alive.
460 gcStableNameTable();
461
462 #if defined(THREADED_RTS)
463 if (n_gc_threads == 1) {
464 for (n = 0; n < n_capabilities; n++) {
465 pruneSparkQueue(capabilities[n]);
466 }
467 } else {
468 for (n = 0; n < n_capabilities; n++) {
469 if (n == cap->no || idle_cap[n]) {
470 pruneSparkQueue(capabilities[n]);
471 }
472 }
473 }
474 #endif
475
476 #if defined(PROFILING)
477 // We call processHeapClosureForDead() on every closure destroyed during
478 // the current garbage collection, so we invoke LdvCensusForDead().
479 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
480 || RtsFlags.ProfFlags.bioSelector != NULL) {
481 RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
482 LdvCensusForDead(N);
483 ACQUIRE_SM_LOCK;
484 }
485 #endif
486
487 // NO MORE EVACUATION AFTER THIS POINT!
488
489 // Finally: compact or sweep the oldest generation.
490 if (major_gc && oldest_gen->mark) {
491 if (oldest_gen->compact)
492 compact(gct->scavenged_static_objects,
493 &dead_weak_ptr_list,
494 &resurrected_threads);
495 else
496 sweep(oldest_gen);
497 }
498
499 copied = 0;
500 par_max_copied = 0;
501 par_balanced_copied = 0;
502 gc_spin_spin = 0;
503 gc_spin_yield = 0;
504 mut_spin_spin = 0;
505 mut_spin_yield = 0;
506 any_work = 0;
507 no_work = 0;
508 scav_find_work = 0;
509 {
510 uint32_t i;
511 uint64_t par_balanced_copied_acc = 0;
512 const gc_thread* thread;
513
514 for (i=0; i < n_gc_threads; i++) {
515 copied += gc_threads[i]->copied;
516 }
517 for (i=0; i < n_gc_threads; i++) {
518 thread = gc_threads[i];
519 if (n_gc_threads > 1) {
520 debugTrace(DEBUG_gc,"thread %d:", i);
521 debugTrace(DEBUG_gc," copied %ld",
522 thread->copied * sizeof(W_));
523 debugTrace(DEBUG_gc," scanned %ld",
524 thread->scanned * sizeof(W_));
525 debugTrace(DEBUG_gc," any_work %ld",
526 thread->any_work);
527 debugTrace(DEBUG_gc," no_work %ld",
528 thread->no_work);
529 debugTrace(DEBUG_gc," scav_find_work %ld",
530 thread->scav_find_work);
531
532 #if defined(THREADED_RTS) && defined(PROF_SPIN)
533 gc_spin_spin += thread->gc_spin.spin;
534 gc_spin_yield += thread->gc_spin.yield;
535 mut_spin_spin += thread->mut_spin.spin;
536 mut_spin_yield += thread->mut_spin.yield;
537 #endif
538
539 any_work += thread->any_work;
540 no_work += thread->no_work;
541 scav_find_work += thread->scav_find_work;
542
543 par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
544 par_balanced_copied_acc +=
545 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
546 }
547 }
548 if (n_gc_threads > 1) {
549 // See Note [Work Balance] for an explanation of this computation
550 par_balanced_copied =
551 (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
552 (n_gc_threads - 1);
553 }
554 }
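/* Editor's note: a minimal, self-contained sketch (not RTS code) of the
 * rounded-average computation above, using made-up per-thread copy counts;
 * demo_par_balanced_copied and its arguments are illustrative names only.
 */
#if 0
#include <stdint.h>

static uint64_t demo_par_balanced_copied (const uint64_t *copied_per_thread,
                                          uint32_t n_threads)
{
    uint64_t total = 0, acc = 0;
    uint32_t i;

    for (i = 0; i < n_threads; i++) total += copied_per_thread[i];
    for (i = 0; i < n_threads; i++) {
        uint64_t scaled = (uint64_t)n_threads * copied_per_thread[i];
        acc += (scaled < total) ? scaled : total;   /* stg_min(...) above */
    }
    if (n_threads <= 1) return 0;
    /* the same rounded division as in GarbageCollect() above */
    return (acc - total + (n_threads - 1) / 2) / (n_threads - 1);
}

/* Perfectly balanced work, e.g. {100,100,100,100}, yields 400 (== copied),
 * while completely unbalanced work, e.g. {400,0,0,0}, yields 0. */
#endif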
555
556 // Run through all the generations and tidy up.
557 // We're going to:
558 // - count the amount of "live" data (live_words, live_blocks)
559 // - count the amount of "copied" data in this GC (copied)
560 // - free from-space
561 // - make to-space the new from-space (set BF_EVACUATED on all blocks)
562 //
563 live_words = 0;
564 live_blocks = 0;
565
566 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
567
568 if (g == N) {
569 generations[g].collections++; // for stats
570 if (n_gc_threads > 1) generations[g].par_collections++;
571 }
572
573 // Count the mutable list as bytes "copied" for the purposes of
574 // stats. Every mutable list is copied during every GC.
575 if (g > 0) {
576 W_ mut_list_size = 0;
577 for (n = 0; n < n_capabilities; n++) {
578 mut_list_size += countOccupied(capabilities[n]->mut_lists[g]);
579 }
580 copied += mut_list_size;
581
582 debugTrace(DEBUG_gc,
583 "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)",
584 (unsigned long)(mut_list_size * sizeof(W_)),
585 mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS,
586 mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE,
587 mutlist_TREC_CHUNK, mutlist_TREC_HEADER,
588 mutlist_OTHERS);
589 }
590
591 bdescr *next, *prev;
592 gen = &generations[g];
593
594 // for generations we collected...
595 if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
596
597 /* free old memory and shift to-space into from-space for all
598 * the collected generations (except the allocation area). These
599 * freed blocks will probably be quickly recycled.
600 */
601 if (gen->mark)
602 {
603 // tack the new blocks on the end of the existing blocks
604 if (gen->old_blocks != NULL) {
605
606 prev = NULL;
607 for (bd = gen->old_blocks; bd != NULL; bd = next) {
608
609 next = bd->link;
610
611 if (!(bd->flags & BF_MARKED))
612 {
613 if (prev == NULL) {
614 gen->old_blocks = next;
615 } else {
616 prev->link = next;
617 }
618 freeGroup(bd);
619 gen->n_old_blocks--;
620 }
621 else
622 {
623 gen->n_words += bd->free - bd->start;
624
625 // NB. this step might not be compacted next
626 // time, so reset the BF_MARKED flags.
627 // They are set before GC if we're going to
628 // compact. (search for BF_MARKED above).
629 bd->flags &= ~BF_MARKED;
630
631 // between GCs, all blocks in the heap except
632 // for the nursery have the BF_EVACUATED flag set.
633 bd->flags |= BF_EVACUATED;
634
635 prev = bd;
636 }
637 }
638
639 if (prev != NULL) {
640 prev->link = gen->blocks;
641 gen->blocks = gen->old_blocks;
642 }
643 }
644 // add the new blocks to the block tally
645 gen->n_blocks += gen->n_old_blocks;
646 ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
647 ASSERT(countOccupied(gen->blocks) == gen->n_words);
648 }
649 else // not compacted
650 {
651 freeChain(gen->old_blocks);
652 }
653
654 gen->old_blocks = NULL;
655 gen->n_old_blocks = 0;
656
657 /* LARGE OBJECTS. The current live large objects are chained on
658 * scavenged_large, having been moved during garbage
659 * collection from large_objects. Any objects left on the
660 * large_objects list are therefore dead, so we free them here.
661 */
662 freeChain(gen->large_objects);
663 gen->large_objects = gen->scavenged_large_objects;
664 gen->n_large_blocks = gen->n_scavenged_large_blocks;
665 gen->n_large_words = countOccupied(gen->large_objects);
666 gen->n_new_large_words = 0;
667
668 /* COMPACT_NFDATA. The currently live compacts are chained
669 * to live_compact_objects, quite like large objects. And
670 * objects left on the compact_objects list are dead.
671 *
672 * We don't run a simple freeChain because we want to give the
673 * CNF module some chance to free memory that freeChain would
674 * not see (namely blocks appended to a CNF through a compactResize).
675 *
676 * See Note [Compact Normal Forms] for details.
677 */
678 for (bd = gen->compact_objects; bd; bd = next) {
679 next = bd->link;
680 compactFree(((StgCompactNFDataBlock*)bd->start)->owner);
681 }
682 gen->compact_objects = gen->live_compact_objects;
683 gen->n_compact_blocks = gen->n_live_compact_blocks;
684 }
685 else // for generations > N
686 {
687 /* For older generations, we need to append the
688 * scavenged_large_object list (i.e. large objects that have been
689 * promoted during this GC) to the large_object list for that step.
690 */
691 for (bd = gen->scavenged_large_objects; bd; bd = next) {
692 next = bd->link;
693 dbl_link_onto(bd, &gen->large_objects);
694 gen->n_large_words += bd->free - bd->start;
695 }
696
697 // And same for compacts
698 for (bd = gen->live_compact_objects; bd; bd = next) {
699 next = bd->link;
700 dbl_link_onto(bd, &gen->compact_objects);
701 }
702
703 // add the new blocks we promoted during this GC
704 gen->n_large_blocks += gen->n_scavenged_large_blocks;
705 gen->n_compact_blocks += gen->n_live_compact_blocks;
706 }
707
708 ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
709 ASSERT(countOccupied(gen->large_objects) == gen->n_large_words);
710 // We can't run the same assertion on compact objects because there
711 // is memory "the GC doesn't see" (directly), but which is still
712 // accounted for in gen->n_compact_blocks.
713
714 gen->scavenged_large_objects = NULL;
715 gen->n_scavenged_large_blocks = 0;
716 gen->live_compact_objects = NULL;
717 gen->n_live_compact_blocks = 0;
718
719 // Count "live" data
720 live_words += genLiveWords(gen);
721 live_blocks += genLiveBlocks(gen);
722
723 // add in the partial blocks in the gen_workspaces
724 {
725 uint32_t i;
726 for (i = 0; i < n_capabilities; i++) {
727 live_words += gcThreadLiveWords(i, gen->no);
728 live_blocks += gcThreadLiveBlocks(i, gen->no);
729 }
730 }
731 } // for all generations
732
733 // Flush the update remembered set. See Note [Eager update remembered set
734 // flushing] in NonMovingMark.c
735 if (RtsFlags.GcFlags.useNonmoving) {
736 RELEASE_SM_LOCK;
737 nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
738 ACQUIRE_SM_LOCK;
739 }
740
741 // Mark and sweep the oldest generation.
742 // N.B. This can only happen after we've moved
743 // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects.
744 ASSERT(oldest_gen->scavenged_large_objects == NULL);
745 if (RtsFlags.GcFlags.useNonmoving && major_gc) {
746 // All threads in non-moving heap should be found to be alive, because
747 // threads in the non-moving generation's list should live in the
748 // non-moving heap, and we consider non-moving objects alive during
749 // preparation.
750 ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
751 // For weaks, remember that we evacuated all weaks to the non-moving heap
752 // in markWeakPtrList(), and then moved the weak_ptr_list list to
753 // old_weak_ptr_list. We then moved weaks with live keys to the
754 // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in
755 // old_weak_ptr_list to dead_weak_ptr_list. So at this point
756 // old_weak_ptr_list should be empty.
757 ASSERT(oldest_gen->old_weak_ptr_list == NULL);
758
759 // we may need to take the lock to allocate mark queue blocks
760 RELEASE_SM_LOCK;
761 // dead_weak_ptr_list contains weak pointers with dead keys. Those need to
762 // be kept alive because we'll use them in scheduleFinalizers(). Similarly
763 // resurrected_threads are also going to be used in resurrectThreads()
764 // so we need to mark those too.
765 // Note that in sequential case these lists will be appended with more
766 // weaks and threads found to be dead in mark.
767 #if !defined(THREADED_RTS)
768 // In the non-threaded runtime this is the only time we push to the
769 // upd_rem_set
770 nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
771 #endif
772 nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
773 ACQUIRE_SM_LOCK;
774 }
775
776 // Update the max size of older generations after a major GC:
777 // We can't resize here in the case of the concurrent collector since we
778 // don't yet know how much live data we have. This will be instead done
779 // once we finish marking.
780 if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving)
781 resizeGenerations();
782
783 // Free the mark stack.
784 if (mark_stack_top_bd != NULL) {
785 debugTrace(DEBUG_gc, "mark stack: %d blocks",
786 countBlocks(mark_stack_top_bd));
787 freeChain(mark_stack_top_bd);
788 }
789
790 // Free any bitmaps.
791 for (g = 0; g <= N; g++) {
792 gen = &generations[g];
793 if (gen->bitmap != NULL) {
794 freeGroup(gen->bitmap);
795 gen->bitmap = NULL;
796 }
797 }
798
799 resize_nursery();
800
801 resetNurseries();
802
803 // mark the garbage collected CAFs as dead
804 #if defined(DEBUG)
805 if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); }
806 #endif
807
808 // Update the stable name hash table
809 updateStableNameTable(major_gc);
810
811 // unlock the StablePtr table. Must be before scheduleFinalizers(),
812 // because a finalizer may call hs_free_fun_ptr() or
813 // hs_free_stable_ptr(), both of which access the StablePtr table.
814 stablePtrUnlock();
815
816 // Must be after stablePtrUnlock(), because it might free stable ptrs.
817 if (major_gc) {
818 checkUnload (gct->scavenged_static_objects);
819 }
820
821 #if defined(PROFILING)
822 // resetStaticObjectForProfiling() must be called before
823 // zeroing below.
824
825 // ToDo: fix the gct->scavenged_static_objects below
826 resetStaticObjectForProfiling(gct->scavenged_static_objects);
827 #endif
828
829 // Start any pending finalizers. Must be after
830 // updateStableNameTable() and stablePtrUnlock() (see #4221).
831 RELEASE_SM_LOCK;
832 scheduleFinalizers(cap, dead_weak_ptr_list);
833 ACQUIRE_SM_LOCK;
834
835 // check sanity after GC
836 // before resurrectThreads(), because that might overwrite some
837 // closures, which will cause problems with THREADED where we don't
838 // fill slop. If we are using the nonmoving collector then we can't claim to
839 // be *after* the major GC; it's now running concurrently.
840 IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving));
841
842 // If a heap census is due, we need to do it before
843 // resurrectThreads(), for the same reason as checkSanity above:
844 // resurrectThreads() will overwrite some closures and leave slop
845 // behind.
846 if (do_heap_census) {
847 debugTrace(DEBUG_sched, "performing heap census");
848 RELEASE_SM_LOCK;
849 heapCensus(gct->gc_start_cpu);
850 ACQUIRE_SM_LOCK;
851 }
852
853 // send exceptions to any threads which were about to die
854 RELEASE_SM_LOCK;
855 resurrectThreads(resurrected_threads);
856 ACQUIRE_SM_LOCK;
857
858 if (major_gc) {
859 W_ need_prealloc, need_live, need, got;
860 uint32_t i;
861
862 need_live = 0;
863 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
864 need_live += genLiveBlocks(&generations[i]);
865 }
866 need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live);
867
868 need_prealloc = 0;
869 for (i = 0; i < n_nurseries; i++) {
870 need_prealloc += nurseries[i].n_blocks;
871 }
872 need_prealloc += RtsFlags.GcFlags.largeAllocLim;
873 need_prealloc += countAllocdBlocks(exec_block);
874 need_prealloc += arenaBlocks();
875 #if defined(PROFILING)
876 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
877 need_prealloc += retainerStackBlocks();
878 }
879 #endif
880
881 /* If the amount of data remains constant, next major GC we'll
882 * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc
883 * in order to reduce repeated deallocation and reallocation. #14702
884 */
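/* Editor's note, with illustrative numbers not taken from the source: with
 * the default oldGenFactor F = 2, need_live = 1000 blocks and
 * need_prealloc = 200 blocks, the next major GC would need roughly
 * (F+1)*live + prealloc = 3200 blocks, and we keep
 * (F+2)*live + prealloc = 4200 blocks to avoid repeatedly releasing and
 * re-requesting memory from the OS.
 */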
885 need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live;
886
887 /* Also, if user set heap size, do not drop below it.
888 */
889 need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need);
890
891 /* But with a large nursery, the above estimate might exceed
892 * maxHeapSize. A large resident set size might make the OS
893 * kill this process, or swap unnecessarily. Therefore we
894 * ensure that our estimate does not exceed maxHeapSize.
895 */
896 if (RtsFlags.GcFlags.maxHeapSize != 0) {
897 need = stg_min(RtsFlags.GcFlags.maxHeapSize, need);
898 }
899
900 need = BLOCKS_TO_MBLOCKS(need);
901
902 got = mblocks_allocated;
903
904 if (got > need) {
905 returnMemoryToOS(got - need);
906 }
907 }
908
909 // extra GC trace info
910 IF_DEBUG(gc, statDescribeGens());
911
912 #if defined(DEBUG)
913 // symbol-table based profiling
914 /* heapCensus(to_blocks); */ /* ToDo */
915 #endif
916
917 // restore enclosing cost centre
918 #if defined(PROFILING)
919 for (n = 0; n < n_capabilities; n++) {
920 capabilities[n]->r.rCCCS = save_CCS[n];
921 }
922 #endif
923
924 #if defined(DEBUG)
925 // check for memory leaks if DEBUG is on
926 memInventory(DEBUG_gc);
927 #endif
928
929 // ok, GC over: tell the stats department what happened.
930 stat_endGC(cap, gct, live_words, copied,
931 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
932 N, n_gc_threads, par_max_copied, par_balanced_copied,
933 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
934 any_work, no_work, scav_find_work);
935
936 #if defined(RTS_USER_SIGNALS)
937 if (RtsFlags.MiscFlags.install_signal_handlers) {
938 // unblock signals again
939 unblockUserSignals();
940 }
941 #endif
942
943 RELEASE_SM_LOCK;
944
945 SET_GCT(saved_gct);
946 }
947
948 /* -----------------------------------------------------------------------------
949 Heap overflow is indicated by setting a flag that the caller of
950 GarbageCollect can check. (not ideal, TODO: better)
951 -------------------------------------------------------------------------- */
952
953 static void heapOverflow(void)
954 {
955 heap_overflow = true;
956 }
957
958 /* -----------------------------------------------------------------------------
959 Initialise the gc_thread structures.
960 -------------------------------------------------------------------------- */
961
962 static void
963 new_gc_thread (uint32_t n, gc_thread *t)
964 {
965 uint32_t g;
966 gen_workspace *ws;
967
968 t->cap = capabilities[n];
969
970 #if defined(THREADED_RTS)
971 t->id = 0;
972 initSpinLock(&t->gc_spin);
973 initSpinLock(&t->mut_spin);
974 ACQUIRE_SPIN_LOCK(&t->gc_spin);
975 ACQUIRE_SPIN_LOCK(&t->mut_spin);
976 t->wakeup = GC_THREAD_INACTIVE; // starts INACTIVE, so we can wait for the
977 // thread to start up, see wakeup_gc_threads
978 #endif
979
980 t->thread_index = n;
981 t->free_blocks = NULL;
982 t->gc_count = 0;
983
984 init_gc_thread(t);
985
986 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
987 {
988 ws = &t->gens[g];
989 ws->gen = &generations[g];
990 ASSERT(g == ws->gen->no);
991 ws->my_gct = t;
992
993 // We want to call
994 // alloc_todo_block(ws,0);
995 // but can't, because it uses gct which isn't set up at this point.
996 // Hence, allocate a block for todo_bd manually:
997 {
998 bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
999 // no lock, locks aren't initialised yet
1000 initBdescr(bd, ws->gen, ws->gen->to);
1001 bd->flags = BF_EVACUATED;
1002 bd->u.scan = bd->free = bd->start;
1003
1004 ws->todo_bd = bd;
1005 ws->todo_free = bd->free;
1006 ws->todo_lim = bd->start + BLOCK_SIZE_W;
1007 }
1008
1009 ws->todo_q = newWSDeque(128);
1010 ws->todo_overflow = NULL;
1011 ws->n_todo_overflow = 0;
1012 ws->todo_large_objects = NULL;
1013 ws->todo_seg = END_NONMOVING_TODO_LIST;
1014
1015 ws->part_list = NULL;
1016 ws->n_part_blocks = 0;
1017 ws->n_part_words = 0;
1018
1019 ws->scavd_list = NULL;
1020 ws->n_scavd_blocks = 0;
1021 ws->n_scavd_words = 0;
1022 }
1023 }
1024
1025
1026 void
1027 initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
1028 {
1029 #if defined(THREADED_RTS)
1030 uint32_t i;
1031
1032 if (from > 0) {
1033 gc_threads = stgReallocBytes (gc_threads, to * sizeof(gc_thread*),
1034 "initGcThreads");
1035 } else {
1036 gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
1037 "initGcThreads");
1038 }
1039
1040 for (i = from; i < to; i++) {
1041 gc_threads[i] =
1042 stgMallocBytes(sizeof(gc_thread) +
1043 RtsFlags.GcFlags.generations * sizeof(gen_workspace),
1044 "alloc_gc_threads");
1045
1046 new_gc_thread(i, gc_threads[i]);
1047 }
1048 #else
1049 ASSERT(from == 0 && to == 1);
1050 gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads");
1051 gc_threads[0] = gct;
1052 new_gc_thread(0,gc_threads[0]);
1053 #endif
1054 }
1055
1056 void
1057 freeGcThreads (void)
1058 {
1059 uint32_t g;
1060 if (gc_threads != NULL) {
1061 #if defined(THREADED_RTS)
1062 uint32_t i;
1063 for (i = 0; i < n_capabilities; i++) {
1064 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
1065 {
1066 freeWSDeque(gc_threads[i]->gens[g].todo_q);
1067 }
1068 stgFree (gc_threads[i]);
1069 }
1070 stgFree (gc_threads);
1071 #else
1072 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
1073 {
1074 freeWSDeque(gc_threads[0]->gens[g].todo_q);
1075 }
1076 stgFree (gc_threads);
1077 #endif
1078 gc_threads = NULL;
1079 }
1080 }
1081
1082 /* ----------------------------------------------------------------------------
1083 Start GC threads
1084 ------------------------------------------------------------------------- */
1085
1086 static volatile StgWord gc_running_threads;
1087
1088 static StgWord
1089 inc_running (void)
1090 {
1091 StgWord new;
1092 new = atomic_inc(&gc_running_threads, 1);
1093 ASSERT(new <= n_gc_threads);
1094 return new;
1095 }
1096
1097 static StgWord
1098 dec_running (void)
1099 {
1100 ASSERT(gc_running_threads != 0);
1101 return atomic_dec(&gc_running_threads);
1102 }
1103
1104 static bool
1105 any_work (void)
1106 {
1107 int g;
1108 gen_workspace *ws;
1109
1110 gct->any_work++;
1111
1112 write_barrier();
1113
1114 // scavenge objects in compacted generation
1115 if (mark_stack_bd != NULL && !mark_stack_empty()) {
1116 return true;
1117 }
1118
1119 // Check for global work in any gen. We don't need to check for
1120 // local work, because we have already exited scavenge_loop(),
1121 // which means there is no local work for this thread.
1122 for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) {
1123 ws = &gct->gens[g];
1124 if (ws->todo_large_objects) return true;
1125 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1126 if (ws->todo_overflow) return true;
1127 }
1128
1129 #if defined(THREADED_RTS)
1130 if (work_stealing) {
1131 uint32_t n;
1132 // look for work to steal
1133 for (n = 0; n < n_gc_threads; n++) {
1134 if (n == gct->thread_index) continue;
1135 for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
1136 ws = &gc_threads[n]->gens[g];
1137 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1138 }
1139 }
1140 }
1141 #endif
1142
1143 gct->no_work++;
1144 #if defined(THREADED_RTS)
1145 yieldThread();
1146 #endif
1147
1148 return false;
1149 }
1150
1151 static void
1152 scavenge_until_all_done (void)
1153 {
1154 DEBUG_ONLY( uint32_t r );
1155
1156
1157 loop:
1158 #if defined(THREADED_RTS)
1159 if (n_gc_threads > 1) {
1160 scavenge_loop();
1161 } else {
1162 scavenge_loop1();
1163 }
1164 #else
1165 scavenge_loop();
1166 #endif
1167
1168 collect_gct_blocks();
1169
1170 // scavenge_loop() only exits when there's no work to do
1171
1172 // This atomic decrement also serves as a full barrier to ensure that any
1173 // writes we made during scavenging are visible to other threads.
1174 #if defined(DEBUG)
1175 r = dec_running();
1176 #else
1177 dec_running();
1178 #endif
1179
1180 traceEventGcIdle(gct->cap);
1181
1182 debugTrace(DEBUG_gc, "%d GC threads still running", r);
1183
1184 while (gc_running_threads != 0) {
1185 // usleep(1);
1186 if (any_work()) {
1187 inc_running();
1188 traceEventGcWork(gct->cap);
1189 goto loop;
1190 }
1191 // any_work() does not remove the work from the queue, it
1192 // just checks for the presence of work. If we find any,
1193 // then we increment gc_running_threads and go back to
1194 // scavenge_loop() to perform any pending work.
1195 }
1196
1197 traceEventGcDone(gct->cap);
1198 }
1199
1200 #if defined(THREADED_RTS)
1201
1202 void
1203 gcWorkerThread (Capability *cap)
1204 {
1205 gc_thread *saved_gct;
1206
1207 // necessary if we stole a callee-saves register for gct:
1208 saved_gct = gct;
1209
1210 SET_GCT(gc_threads[cap->no]);
1211 gct->id = osThreadId();
1212
1213 // Wait until we're told to wake up
1214 RELEASE_SPIN_LOCK(&gct->mut_spin);
1215 // yieldThread();
1216 // Strangely, adding a yieldThread() here makes the CPU time
1217 // measurements more accurate on Linux, perhaps because it syncs
1218 // the CPU time across the multiple cores. Without this, CPU time
1219 // is heavily skewed towards GC rather than MUT.
1220 gct->wakeup = GC_THREAD_STANDING_BY;
1221 debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
1222 ACQUIRE_SPIN_LOCK(&gct->gc_spin);
1223
1224 init_gc_thread(gct);
1225
1226 traceEventGcWork(gct->cap);
1227
1228 // Every thread evacuates some roots.
1229 gct->evac_gen_no = 0;
1230 markCapability(mark_root, gct, cap, true/*prune sparks*/);
1231 scavenge_capability_mut_lists(cap);
1232
1233 scavenge_until_all_done();
1234
1235 #if defined(THREADED_RTS)
1236 // Now that the whole heap is marked, we discard any sparks that
1237 // were found to be unreachable. The main GC thread is currently
1238 // marking heap reachable via weak pointers, so it is
1239 // non-deterministic whether a spark will be retained if it is
1240 // only reachable via weak pointers. To fix this problem would
1241 // require another GC barrier, which is too high a price.
1242 pruneSparkQueue(cap);
1243 #endif
1244
1245 // Wait until we're told to continue
1246 RELEASE_SPIN_LOCK(&gct->gc_spin);
1247 gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
1248 debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
1249 gct->thread_index);
1250 ACQUIRE_SPIN_LOCK(&gct->mut_spin);
1251 debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
1252
1253 SET_GCT(saved_gct);
1254 }
1255
1256 #endif
1257
1258 #if defined(THREADED_RTS)
1259
1260 void
1261 waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1262 {
1263 const uint32_t n_threads = n_capabilities;
1264 const uint32_t me = cap->no;
1265 uint32_t i, j;
1266 bool retry = true;
1267 Time t0, t1, t2;
1268
1269 t0 = t1 = t2 = getProcessElapsedTime();
1270
1271 while(retry) {
1272 for (i=0; i < n_threads; i++) {
1273 if (i == me || idle_cap[i]) continue;
1274 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1275 prodCapability(capabilities[i], cap->running_task);
1276 }
1277 }
1278 for (j=0; j < 10; j++) {
1279 retry = false;
1280 for (i=0; i < n_threads; i++) {
1281 if (i == me || idle_cap[i]) continue;
1282 write_barrier();
1283 interruptCapability(capabilities[i]);
1284 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1285 retry = true;
1286 }
1287 }
1288 if (!retry) break;
1289 #if defined(PROF_SPIN)
1290 waitForGcThreads_yield++;
1291 #endif
1292 yieldThread();
1293 }
1294
1295 t2 = getProcessElapsedTime();
1296 if (RtsFlags.GcFlags.longGCSync != 0 &&
1297 t2 - t1 > RtsFlags.GcFlags.longGCSync) {
1298 /* call this every longGCSync of delay */
1299 rtsConfig.longGCSync(cap->no, t2 - t0);
1300 t1 = t2;
1301 }
1302 if (retry) {
1303 #if defined(PROF_SPIN)
1304 // This is a bit strange, we'll get more yields than spins.
1305 // I guess that means it's not a spin-lock at all, but these
1306 // numbers are still useful (I think).
1307 waitForGcThreads_spin++;
1308 #endif
1309 }
1310 }
1311
1312 if (RtsFlags.GcFlags.longGCSync != 0 &&
1313 t2 - t0 > RtsFlags.GcFlags.longGCSync) {
1314 rtsConfig.longGCSyncEnd(t2 - t0);
1315 }
1316 }
1317
1318 #endif // THREADED_RTS
1319
1320 static void
1321 start_gc_threads (void)
1322 {
1323 #if defined(THREADED_RTS)
1324 gc_running_threads = 0;
1325 #endif
1326 }
1327
1328 static void
1329 wakeup_gc_threads (uint32_t me USED_IF_THREADS,
1330 bool idle_cap[] USED_IF_THREADS)
1331 {
1332 #if defined(THREADED_RTS)
1333 uint32_t i;
1334
1335 if (n_gc_threads == 1) return;
1336
1337 for (i=0; i < n_gc_threads; i++) {
1338 if (i == me || idle_cap[i]) continue;
1339 inc_running();
1340 debugTrace(DEBUG_gc, "waking up gc thread %d", i);
1341 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY)
1342 barf("wakeup_gc_threads");
1343
1344 gc_threads[i]->wakeup = GC_THREAD_RUNNING;
1345 ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1346 RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1347 }
1348 #endif
1349 }
1350
1351 // After GC is complete, we must wait for all GC threads to enter the
1352 // standby state, otherwise they may still be executing inside
1353 // any_work(), and may even remain awake until the next GC starts.
1354 static void
1355 shutdown_gc_threads (uint32_t me USED_IF_THREADS,
1356 bool idle_cap[] USED_IF_THREADS)
1357 {
1358 #if defined(THREADED_RTS)
1359 uint32_t i;
1360
1361 if (n_gc_threads == 1) return;
1362
1363 for (i=0; i < n_gc_threads; i++) {
1364 if (i == me || idle_cap[i]) continue;
1365 while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) {
1366 busy_wait_nop();
1367 write_barrier();
1368 }
1369 }
1370 #endif
1371 }
1372
1373 #if defined(THREADED_RTS)
1374 void
1375 releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1376 {
1377 const uint32_t n_threads = n_capabilities;
1378 const uint32_t me = cap->no;
1379 uint32_t i;
1380 for (i=0; i < n_threads; i++) {
1381 if (i == me || idle_cap[i]) continue;
1382 if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
1383 barf("releaseGCThreads");
1384
1385 gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
1386 ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1387 RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1388 }
1389 }
1390 #endif
1391
1392 /* ----------------------------------------------------------------------------
1393 Save the mutable lists in saved_mut_lists where it will be scavenged
1394 during GC
1395 ------------------------------------------------------------------------- */
1396
1397 static void
1398 stash_mut_list (Capability *cap, uint32_t gen_no)
1399 {
1400 cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
1401 cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
1402 }
1403
1404 /* ----------------------------------------------------------------------------
1405 Initialise a generation that is to be collected
1406 ------------------------------------------------------------------------- */
1407
1408 static void
1409 prepare_collected_gen (generation *gen)
1410 {
1411 uint32_t i, g, n;
1412 gen_workspace *ws;
1413 bdescr *bd, *next;
1414
1415 g = gen->no;
1416
1417 if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) {
1418 // Nonmoving heap's mutable list is always a root.
1419 for (i = 0; i < n_capabilities; i++) {
1420 stash_mut_list(capabilities[i], g);
1421 }
1422 } else if (g != 0) {
1423 // Otherwise throw away the current mutable list. Invariant: the
1424 // mutable list always has at least one block; this means we can avoid
1425 // a check for NULL in recordMutable().
1426 for (i = 0; i < n_capabilities; i++) {
1427 freeChain(capabilities[i]->mut_lists[g]);
1428 capabilities[i]->mut_lists[g] =
1429 allocBlockOnNode(capNoToNumaNode(i));
1430 }
1431 }
1432
1433 gen = &generations[g];
1434 ASSERT(gen->no == g);
1435
1436 // we'll construct a new list of threads in this step
1437 // during GC, throw away the current list.
1438 gen->old_threads = gen->threads;
1439 gen->threads = END_TSO_QUEUE;
1440
1441 // deprecate the existing blocks (except in the case of the nonmoving
1442 // collector since these will be preserved in nonmovingCollect for the
1443 // concurrent GC).
1444 if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) {
1445 gen->old_blocks = gen->blocks;
1446 gen->n_old_blocks = gen->n_blocks;
1447 gen->blocks = NULL;
1448 gen->n_blocks = 0;
1449 gen->n_words = 0;
1450 gen->live_estimate = 0;
1451 }
1452
1453 // initialise the large object queues.
1454 ASSERT(gen->scavenged_large_objects == NULL);
1455 ASSERT(gen->n_scavenged_large_blocks == 0);
1456 ASSERT(gen->live_compact_objects == NULL);
1457 ASSERT(gen->n_live_compact_blocks == 0);
1458
1459 // grab all the partial blocks stashed in the gc_thread workspaces and
1460 // move them to the old_blocks list of this gen.
1461 for (n = 0; n < n_capabilities; n++) {
1462 ws = &gc_threads[n]->gens[gen->no];
1463
1464 for (bd = ws->part_list; bd != NULL; bd = next) {
1465 next = bd->link;
1466 bd->link = gen->old_blocks;
1467 gen->old_blocks = bd;
1468 gen->n_old_blocks += bd->blocks;
1469 }
1470 ws->part_list = NULL;
1471 ws->n_part_blocks = 0;
1472 ws->n_part_words = 0;
1473
1474 ASSERT(ws->scavd_list == NULL);
1475 ASSERT(ws->n_scavd_blocks == 0);
1476 ASSERT(ws->n_scavd_words == 0);
1477
1478 if (ws->todo_free != ws->todo_bd->start) {
1479 ws->todo_bd->free = ws->todo_free;
1480 ws->todo_bd->link = gen->old_blocks;
1481 gen->old_blocks = ws->todo_bd;
1482 gen->n_old_blocks += ws->todo_bd->blocks;
1483 alloc_todo_block(ws,0); // always has one block.
1484 }
1485 }
1486
1487 // mark the small objects as from-space
1488 for (bd = gen->old_blocks; bd; bd = bd->link) {
1489 bd->flags &= ~BF_EVACUATED;
1490 }
1491
1492 // mark the large objects as from-space
1493 for (bd = gen->large_objects; bd; bd = bd->link) {
1494 bd->flags &= ~BF_EVACUATED;
1495 }
1496
1497 // mark the compact objects as from-space
1498 for (bd = gen->compact_objects; bd; bd = bd->link) {
1499 bd->flags &= ~BF_EVACUATED;
1500 }
1501
1502 // for a compacted generation, we need to allocate the bitmap
1503 if (gen->mark) {
1504 StgWord bitmap_size; // in bytes
1505 bdescr *bitmap_bdescr;
1506 StgWord *bitmap;
1507
1508 bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
1509
1510 if (bitmap_size > 0) {
1511 bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
1512 / BLOCK_SIZE);
1513 gen->bitmap = bitmap_bdescr;
1514 bitmap = bitmap_bdescr->start;
1515
1516 debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p",
1517 bitmap_size, bitmap);
1518
1519 // don't forget to fill it with zeros!
1520 memset(bitmap, 0, bitmap_size);
1521
1522 // For each block in this step, point to its bitmap from the
1523 // block descriptor.
1524 for (bd=gen->old_blocks; bd != NULL; bd = bd->link) {
1525 bd->u.bitmap = bitmap;
1526 bitmap += BLOCK_SIZE_W / BITS_IN(W_);
1527
1528 // Also at this point we set the BF_MARKED flag
1529 // for this block. The invariant is that
1530 // BF_MARKED is always unset, except during GC
1531 // when it is set on those blocks which will be
1532 // compacted.
1533 if (!(bd->flags & BF_FRAGMENTED)) {
1534 bd->flags |= BF_MARKED;
1535 }
1536
1537 // BF_SWEPT should be marked only for blocks that are being
1538 // collected in sweep()
1539 bd->flags &= ~BF_SWEPT;
1540 }
1541 }
1542 }
1543 }
1544
1545 /* ----------------------------------------------------------------------------
1546 Initialise a generation that is *not* to be collected
1547 ------------------------------------------------------------------------- */
1548
1549 static void
1550 prepare_uncollected_gen (generation *gen)
1551 {
1552 uint32_t i;
1553
1554
1555 ASSERT(gen->no > 0);
1556
1557 // save the current mutable lists for this generation, and
1558 // allocate a fresh block for each one. We'll traverse these
1559 // mutable lists as roots early on in the GC.
1560 for (i = 0; i < n_capabilities; i++) {
1561 stash_mut_list(capabilities[i], gen->no);
1562 }
1563
1564 ASSERT(gen->scavenged_large_objects == NULL);
1565 ASSERT(gen->n_scavenged_large_blocks == 0);
1566 }
1567
1568 /* -----------------------------------------------------------------------------
1569 Collect the completed blocks from a GC thread and attach them to
1570 the generation.
1571 -------------------------------------------------------------------------- */
1572
1573 static void
1574 collect_gct_blocks (void)
1575 {
1576 uint32_t g;
1577 gen_workspace *ws;
1578 bdescr *bd, *prev;
1579
1580 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1581 ws = &gct->gens[g];
1582
1583 // there may still be a block attached to ws->todo_bd;
1584 // leave it there to use next time.
1585
1586 if (ws->scavd_list != NULL) {
1587 ACQUIRE_SPIN_LOCK(&ws->gen->sync);
1588
1589 ASSERT(gct->scan_bd == NULL);
1590 ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
1591
1592 prev = NULL;
1593 for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
1594 prev = bd;
1595 }
1596 if (prev != NULL) {
1597 prev->link = ws->gen->blocks;
1598 ws->gen->blocks = ws->scavd_list;
1599 }
1600 ws->gen->n_blocks += ws->n_scavd_blocks;
1601 ws->gen->n_words += ws->n_scavd_words;
1602
1603 ws->scavd_list = NULL;
1604 ws->n_scavd_blocks = 0;
1605 ws->n_scavd_words = 0;
1606
1607 RELEASE_SPIN_LOCK(&ws->gen->sync);
1608 }
1609 }
1610 }
1611
1612 /* -----------------------------------------------------------------------------
1613 During mutation, any blocks that are filled by allocatePinned() are stashed
1614 on the local pinned_object_blocks list, to avoid needing to take a global
1615 lock. Here we collect those blocks from the cap->pinned_object_blocks lists
1616 and put them on the g0->large_object or oldest_gen->large_objects.
1617
1618 How to decide which list to put them on?
1619
1620 - When non-moving heap is enabled and this is a major GC, we put them on
1621 oldest_gen. This is because after preparation we really want no
1622 old-to-young references, and we want to be able to reset mut_lists. For
1623 this we need to promote every potentially live object to the oldest gen.
1624
1625 - Otherwise we put them on g0.
1626 -------------------------------------------------------------------------- */
1627
1628 static void
1629 collect_pinned_object_blocks (void)
1630 {
1631 generation *gen;
1632 const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving;
1633 if (use_nonmoving && major_gc) {
1634 gen = oldest_gen;
1635 } else {
1636 gen = g0;
1637 }
1638
1639 for (uint32_t n = 0; n < n_capabilities; n++) {
1640 bdescr *last = NULL;
1641 if (use_nonmoving && gen == oldest_gen) {
1642 // Mark objects as belonging to the nonmoving heap
1643 for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1644 bd->flags |= BF_NONMOVING;
1645 bd->gen = oldest_gen;
1646 bd->gen_no = oldest_gen->no;
1647 oldest_gen->n_large_words += bd->free - bd->start;
1648 oldest_gen->n_large_blocks += bd->blocks;
1649 last = bd;
1650 }
1651 } else {
1652 for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1653 last = bd;
1654 }
1655 }
1656
1657 if (last != NULL) {
1658 last->link = gen->large_objects;
1659 if (gen->large_objects != NULL) {
1660 gen->large_objects->u.back = last;
1661 }
1662 gen->large_objects = capabilities[n]->pinned_object_blocks;
1663 capabilities[n]->pinned_object_blocks = NULL;
1664 }
1665 }
1666 }
1667
1668 /* -----------------------------------------------------------------------------
1669 Initialise a gc_thread before GC
1670 -------------------------------------------------------------------------- */
1671
1672 static void
1673 init_gc_thread (gc_thread *t)
1674 {
1675 t->static_objects = END_OF_STATIC_OBJECT_LIST;
1676 t->scavenged_static_objects = END_OF_STATIC_OBJECT_LIST;
1677 t->scan_bd = NULL;
1678 t->mut_lists = t->cap->mut_lists;
1679 t->evac_gen_no = 0;
1680 t->failed_to_evac = false;
1681 t->eager_promotion = true;
1682 t->thunk_selector_depth = 0;
1683 t->copied = 0;
1684 t->scanned = 0;
1685 t->any_work = 0;
1686 t->no_work = 0;
1687 t->scav_find_work = 0;
1688 }
1689
1690 /* -----------------------------------------------------------------------------
1691 Function we pass to evacuate roots.
1692 -------------------------------------------------------------------------- */
1693
1694 static void
1695 mark_root(void *user USED_IF_THREADS, StgClosure **root)
1696 {
1697 // we stole a register for gct, but this function is called from
1698 // *outside* the GC where the register variable is not in effect,
1699 // so we need to save and restore it here. NB. only call
1700 // mark_root() from the main GC thread, otherwise gct will be
1701 // incorrect.
1702 #if defined(THREADED_RTS)
1703 gc_thread *saved_gct;
1704 saved_gct = gct;
1705 #endif
1706 SET_GCT(user);
1707
1708 evacuate(root);
1709
1710 SET_GCT(saved_gct);
1711 }
1712
1713 /* ----------------------------------------------------------------------------
1714 Reset the sizes of the older generations when we do a major
1715 collection.
1716
1717 CURRENT STRATEGY: make all generations except zero the same size.
1718 We have to stay within the maximum heap size, and leave a certain
1719 percentage of the maximum heap size available to allocate into.
1720 ------------------------------------------------------------------------- */
1721
1722 void
1723 resizeGenerations (void)
1724 {
1725 uint32_t g;
1726 W_ live, size, min_alloc, words;
1727 const W_ max = RtsFlags.GcFlags.maxHeapSize;
1728 const W_ gens = RtsFlags.GcFlags.generations;
1729
1730 // live in the oldest generations
1731 if (oldest_gen->live_estimate != 0) {
1732 words = oldest_gen->live_estimate;
1733 } else {
1734 words = oldest_gen->n_words;
1735 }
1736 live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
1737 oldest_gen->n_large_blocks +
1738 oldest_gen->n_compact_blocks;
1739
1740 // default max size for all generations except zero
1741 size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
1742 RtsFlags.GcFlags.minOldGenSize);
1743
1744 if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
1745 if (max > 0) {
1746 RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
1747 } else {
1748 RtsFlags.GcFlags.heapSizeSuggestion = size;
1749 }
1750 }
1751
1752 // minimum size for generation zero
1753 min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
1754 RtsFlags.GcFlags.minAllocAreaSize
1755 * (W_)n_capabilities);
1756
1757 // Auto-enable compaction when the residency reaches a
1758 // certain percentage of the maximum heap size (default: 30%).
1759 // Except when non-moving GC is enabled.
1760 if (!RtsFlags.GcFlags.useNonmoving &&
1761 (RtsFlags.GcFlags.compact ||
1762 (max > 0 &&
1763 oldest_gen->n_blocks >
1764 (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
1765 oldest_gen->mark = 1;
1766 oldest_gen->compact = 1;
1767 // debugBelch("compaction: on\n", live);
1768 } else {
1769 oldest_gen->mark = 0;
1770 oldest_gen->compact = 0;
1771 // debugBelch("compaction: off\n", live);
1772 }
1773
1774 if (RtsFlags.GcFlags.sweep) {
1775 oldest_gen->mark = 1;
1776 }
1777
1778 // if we're going to go over the maximum heap size, reduce the
1779 // size of the generations accordingly. The calculation is
1780 // different if compaction is turned on, because we don't need
1781 // to double the space required to collect the old generation.
1782 if (max != 0) {
1783
1784 // this test is necessary to ensure that the calculations
1785 // below don't have any negative results - we're working
1786 // with unsigned values here.
1787 if (max < min_alloc) {
1788 heapOverflow();
1789 }
1790
1791 if (oldest_gen->compact) {
1792 if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
1793 size = (max - min_alloc) / ((gens - 1) * 2 - 1);
1794 }
1795 } else {
1796 if ( (size * (gens - 1) * 2) + min_alloc > max ) {
1797 size = (max - min_alloc) / ((gens - 1) * 2);
1798 }
1799 }
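/* Editor's note, with illustrative numbers not taken from the source: in the
 * non-compacting case the budget is size * (gens-1) * 2 + min_alloc, the
 * factor of 2 covering from-space plus to-space for each older generation.
 * With gens = 2, max = 1000 blocks and min_alloc = 200 blocks, size is
 * capped at (1000 - 200) / 2 = 400 blocks.
 */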
1800
1801 if (size < live) {
1802 heapOverflow();
1803 }
1804 }
1805
1806 #if 0
1807 debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
1808 min_alloc, size, max);
1809 debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n",
1810 oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks);
1811 debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks);
1812 #endif
1813
1814 for (g = 0; g < gens; g++) {
1815 generations[g].max_blocks = size;
1816 }
1817 }
1818
1819 /* -----------------------------------------------------------------------------
1820 Calculate the new size of the nursery, and resize it.
1821 -------------------------------------------------------------------------- */
1822
1823 static void
1824 resize_nursery (void)
1825 {
1826 const StgWord min_nursery =
1827 RtsFlags.GcFlags.minAllocAreaSize * (StgWord)n_capabilities;
1828
1829 if (RtsFlags.GcFlags.generations == 1)
1830 { // Two-space collector:
1831 W_ blocks;
1832
1833 /* set up a new nursery. Allocate a nursery size based on a
1834 * function of the amount of live data (by default a factor of 2)
1835 * Use the blocks from the old nursery if possible, freeing up any
1836 * left over blocks.
1837 *
1838 * If we get near the maximum heap size, then adjust our nursery
1839 * size accordingly. If the nursery is the same size as the live
1840 * data (L), then we need 3L bytes. We can reduce the size of the
1841 * nursery to bring the required memory down near 2L bytes.
1842 *
1843 * A normal 2-space collector would need 4L bytes to give the same
1844 * performance we get from 3L bytes, reducing to the same
1845 * performance at 2L bytes.
1846 */
1847 blocks = generations[0].n_blocks;
1848
1849 if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
1850 blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
1851 RtsFlags.GcFlags.maxHeapSize )
1852 {
1853 long adjusted_blocks; // signed on purpose
1854 int pc_free;
1855
1856 adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
1857
1858 debugTrace(DEBUG_gc, "near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld",
1859 RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks);
1860
1861 pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
1862 if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even be < 0 */
1863 {
1864 heapOverflow();
1865 }
1866 blocks = adjusted_blocks;
1867 }
1868 else
1869 {
1870 blocks *= RtsFlags.GcFlags.oldGenFactor;
1871 if (blocks < min_nursery)
1872 {
1873 blocks = min_nursery;
1874 }
1875 }
1876 resizeNurseries(blocks);
1877 }
1878 else // Generational collector
1879 {
1880 /*
1881 * If the user has given us a suggested heap size, adjust our
1882 * allocation area to make best use of the memory available.
1883 */
1884 if (RtsFlags.GcFlags.heapSizeSuggestion)
1885 {
1886 long blocks;
1887 StgWord needed;
1888
1889 calcNeeded(false, &needed); // approx blocks needed at next GC
1890
1891 /* Guess how much will be live in generation 0 step 0 next time.
1892 * A good approximation is obtained by finding the
1893 * percentage of g0 that was live at the last minor GC.
1894 *
1895 * We have an accurate figure for the amount of copied data in
1896 * 'copied', but we must convert this to a number of blocks, with
1897 * a small adjustment for estimated slop at the end of a block
1898 * (- 10 words).
1899 */
1900 if (N == 0)
1901 {
1902 g0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100)
1903 / countNurseryBlocks();
1904 }
1905
1906 /* Estimate a size for the allocation area based on the
1907 * information available. We might end up going slightly under
1908 * or over the suggested heap size, but we should be pretty
1909 * close on average.
1910 *
1911 * Formula: suggested - needed
1912 * ----------------------------
1913 * 1 + g0_pcnt_kept/100
1914 *
1915 * where 'needed' is the amount of memory needed at the next
1916 * collection for collecting all gens except g0.
1917 */
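/* Editor's note, with illustrative numbers not taken from the source: with
 * heapSizeSuggestion = 10000 blocks, needed = 2000 blocks and
 * g0_pcnt_kept = 25, the formula gives
 * ((10000 - 2000) * 100) / (100 + 25) = 6400 blocks for the allocation area.
 */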
1918 blocks =
1919 (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
1920 (100 + (long)g0_pcnt_kept);
1921
1922 if (blocks < (long)min_nursery) {
1923 blocks = min_nursery;
1924 }
1925
1926 resizeNurseries((W_)blocks);
1927 }
1928 else
1929 {
1930 // we might have added extra blocks to the nursery, so
1931 // resize back to the original size again.
1932 resizeNurseriesFixed();
1933 }
1934 }
1935 }
1936
1937 /* -----------------------------------------------------------------------------
1938 Sanity code for CAF garbage collection.
1939
1940 With DEBUG turned on, we manage a CAF list in addition to the SRT
1941 mechanism. After GC, we run down the CAF list and make any
1942 CAFs which have been garbage collected GCD_CAF. This means we get an error
1943 whenever the program tries to enter a garbage collected CAF.
1944
1945 Any garbage collected CAFs are taken off the CAF list at the same
1946 time.
1947 -------------------------------------------------------------------------- */
1948
1949 #if defined(DEBUG)
1950
1951 void gcCAFs(void)
1952 {
1953 uint32_t i = 0;
1954 StgIndStatic *prev = NULL;
1955
1956 for (StgIndStatic *p = debug_caf_list;
1957 p != (StgIndStatic*) END_OF_CAF_LIST;
1958 p = (StgIndStatic*) p->saved_info)
1959 {
1960 const StgInfoTable *info = get_itbl((StgClosure*)p);
1961 ASSERT(info->type == IND_STATIC);
1962
1963 // See Note [STATIC_LINK fields] in Storage.h
1964 // This condition identifies CAFs that have just been GC'd and
1965 // don't have static_link==3, which would mean they should be ignored.
1966 if ((((StgWord)(p->static_link)&STATIC_BITS) | prev_static_flag) != 3) {
1967 debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p);
1968 SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it
1969 if (prev == NULL) {
1970 debug_caf_list = (StgIndStatic*)p->saved_info;
1971 } else {
1972 prev->saved_info = p->saved_info;
1973 }
1974 } else {
1975 prev = p;
1976 i++;
1977 }
1978 }
1979
1980 debugTrace(DEBUG_gccafs, "%d CAFs live", i);
1981 }
1982 #endif
1983
1984
1985 /* -----------------------------------------------------------------------------
1986 The GC can leave some work for the mutator to do before the next
1987 GC, provided the work can be safely overlapped with mutation. This
1988 can help reduce the GC pause time.
1989
1990 The mutator can call doIdleGCWork() any time it likes, but
1991 preferably when it is idle. It's safe for multiple capabilities to
1992 call doIdleGCWork().
1993
1994 When 'all' is
1995 * false: doIdleGCWork() should only take a short, bounded, amount
1996 of time.
1997 * true: doIdleGCWork() will complete all the outstanding GC work.
1998
1999 The return value is
2000 * true if there's more to do (only if 'all' is false).
2001 * false otherwise.
2002 -------------------------------------------------------------------------- */
2003
2004 bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
2005 {
2006 return runSomeFinalizers(all);
2007 }
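/* Editor's note: a minimal usage sketch, not part of the RTS.  It assumes the
 * caller already has a Capability pointer; doIdleGCWork() and Capability are
 * real names from this file, while demo_drain_idle_gc_work and the loop
 * structure are illustrative only.  Guarded by #if 0 so it is never compiled.
 */
#if 0
static void demo_drain_idle_gc_work (Capability *cap)
{
    /* Take short, bounded slices of outstanding GC work while idle; stop as
     * soon as doIdleGCWork() reports that nothing is left. */
    while (doIdleGCWork(cap, false /* all */)) {
        /* a real caller might yield here or check for new mutator work */
    }
}
#endif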