rts: Align the_gc_thread to 64 bytes
[ghc.git] / rts / sm / GC.c
1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team 1998-2008
4 *
5 * Generational garbage collector
6 *
7 * Documentation on the architecture of the Garbage Collector can be
8 * found in the online commentary:
9 *
10 * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
11 *
12 * ---------------------------------------------------------------------------*/
13
14 #include "PosixSource.h"
15 #include "Rts.h"
16 #include "HsFFI.h"
17
18 #include "GC.h"
19 #include "GCThread.h"
20 #include "GCTDecl.h" // NB. before RtsSignals.h which
21 // clobbers REG_R1 on arm/Linux
22 #include "Compact.h"
23 #include "Evac.h"
24 #include "Scav.h"
25 #include "GCUtils.h"
26 #include "MarkStack.h"
27 #include "MarkWeak.h"
28 #include "Sparks.h"
29 #include "Sweep.h"
30
31 #include "Arena.h"
32 #include "Storage.h"
33 #include "RtsUtils.h"
34 #include "Apply.h"
35 #include "Updates.h"
36 #include "Stats.h"
37 #include "Schedule.h"
38 #include "Sanity.h"
39 #include "BlockAlloc.h"
40 #include "ProfHeap.h"
41 #include "Weak.h"
42 #include "Prelude.h"
43 #include "RtsSignals.h"
44 #include "STM.h"
45 #include "Trace.h"
46 #include "RetainerProfile.h"
47 #include "LdvProfile.h"
48 #include "RaiseAsync.h"
49 #include "Stable.h"
50 #include "CheckUnload.h"
51 #include "CNF.h"
52 #include "RtsFlags.h"
53
54 #if defined(PROFILING)
55 #include "RetainerProfile.h"
56 #endif
57
58 #include <string.h> // for memset()
59 #include <unistd.h>
60
61 /* -----------------------------------------------------------------------------
62 Global variables
63 -------------------------------------------------------------------------- */
64
65 /* STATIC OBJECT LIST.
66 *
67 * During GC:
68 * We maintain a linked list of static objects that are still live.
69 * The requirements for this list are:
70 *
71 * - we need to scan the list while adding to it, in order to
72 * scavenge all the static objects (in the same way that
73 * breadth-first scavenging works for dynamic objects).
74 *
75 * - we need to be able to tell whether an object is already on
76 * the list, to break loops.
77 *
78 * Each static object has a "static link field", which we use for
79 * linking objects on to the list. We use a stack-type list, consing
80 * objects on the front as they are added (this means that the
81 * scavenge phase is depth-first, not breadth-first, but that
82 * shouldn't matter).
83 *
84 * A separate list is kept for objects that have been scavenged
85 * already - this is so that we can zero all the marks afterwards.
86 *
87 * An object is on the list if its static link field is non-zero; this
88 * means that we have to mark the end of the list with '1', not NULL.
89 *
90 * Extra notes for generational GC:
91 *
92 * Each generation has a static object list associated with it. When
93 * collecting generations up to N, we treat the static object lists
94 * from generations > N as roots.
95 *
96 * We build up a static object list while collecting generations 0..N,
97 * which is then appended to the static object list of generation N+1.
98 *
99 * See also: Note [STATIC_LINK fields] in Storage.h.
100 */
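/* A hedged illustration (simplified; the real logic, including the
 * STATIC_FLAG bits, lives in Evac.c/Scav.c) of the consing idiom the notes
 * above describe:
 *
 *     if (*STATIC_LINK(info, q) == NULL) {          // not on any list yet
 *         *STATIC_LINK(info, q) = gct->static_objects;
 *         gct->static_objects = (StgClosure *)q;    // cons onto the front
 *     }
 *
 * Because membership is "static link field is non-zero", the list is
 * terminated by a non-NULL sentinel (END_OF_STATIC_OBJECT_LIST) rather
 * than by NULL.
 */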
101
102 /* N is the oldest generation being collected, where the generations
103 * are numbered starting at 0. A major GC (indicated by the major_gc
104 * flag) is when we're collecting all generations. We only attempt to
105 * deal with static objects and GC CAFs when doing a major GC.
106 */
107 uint32_t N;
108 bool major_gc;
109
110 /* Data used for allocation area sizing.
111 */
112 static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
113
114 /* Mut-list stats */
115 #if defined(DEBUG)
116 uint32_t mutlist_MUTVARS,
117 mutlist_MUTARRS,
118 mutlist_MVARS,
119 mutlist_TVAR,
120 mutlist_TVAR_WATCH_QUEUE,
121 mutlist_TREC_CHUNK,
122 mutlist_TREC_HEADER,
123 mutlist_OTHERS;
124 #endif
125
126 /* Thread-local data for each GC thread
127 */
128 gc_thread **gc_threads = NULL;
129
130 #if !defined(THREADED_RTS)
131 // Must be aligned to 64 bytes to meet the stated 64-byte alignment of gen_workspace
132 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
133 ATTRIBUTE_ALIGNED(64);
134 #endif
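// Illustration only (relies on the 64-byte alignment of gen_workspace noted
// above): the workspaces laid out after the gc_thread header inherit their
// alignment from the start of this raw buffer, so without the
// ATTRIBUTE_ALIGNED(64) the compiler could assume an alignment the buffer
// does not actually have.  A sketch of the invariant being preserved:
//
//     gc_thread *t = (gc_thread *)the_gc_thread;
//     ASSERT(((StgWord)t & 63) == 0);    // hypothetical check, not in the RTS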
135
136 // Number of threads running in *this* GC. Affects how many
137 // step->todos[] lists we have to look in to find work.
138 uint32_t n_gc_threads;
139
140 // For stats:
141 static long copied; // *words* copied & scavenged during this GC
142
143 #if defined(PROF_SPIN) && defined(THREADED_RTS)
144 // spin and yield counts for the quasi-SpinLock in waitForGcThreads
145 volatile StgWord64 waitForGcThreads_spin = 0;
146 volatile StgWord64 waitForGcThreads_yield = 0;
147 volatile StgWord64 whitehole_gc_spin = 0;
148 #endif
149
150 bool work_stealing;
151
152 uint32_t static_flag = STATIC_FLAG_B;
153 uint32_t prev_static_flag = STATIC_FLAG_A;
154
155 DECLARE_GCT
156
157 /* -----------------------------------------------------------------------------
158 Static function declarations
159 -------------------------------------------------------------------------- */
160
161 static void mark_root (void *user, StgClosure **root);
162 static void prepare_collected_gen (generation *gen);
163 static void prepare_uncollected_gen (generation *gen);
164 static void init_gc_thread (gc_thread *t);
165 static void resize_generations (void);
166 static void resize_nursery (void);
167 static void start_gc_threads (void);
168 static void scavenge_until_all_done (void);
169 static StgWord inc_running (void);
170 static StgWord dec_running (void);
171 static void wakeup_gc_threads (uint32_t me, bool idle_cap[]);
172 static void shutdown_gc_threads (uint32_t me, bool idle_cap[]);
173 static void collect_gct_blocks (void);
174 static void collect_pinned_object_blocks (void);
175 static void heapOverflow (void);
176
177 #if defined(DEBUG)
178 static void gcCAFs (void);
179 #endif
180
181 /* -----------------------------------------------------------------------------
182 The mark stack.
183 -------------------------------------------------------------------------- */
184
185 bdescr *mark_stack_top_bd; // topmost block in the mark stack
186 bdescr *mark_stack_bd; // current block in the mark stack
187 StgPtr mark_sp; // pointer to the next unallocated mark stack entry
188
189 /* -----------------------------------------------------------------------------
190 GarbageCollect: the main entry point to the garbage collector.
191
192 The collect_gen parameter is obtained by calling calcNeeded().
193
194 Locks held: all capabilities are held throughout GarbageCollect().
195 -------------------------------------------------------------------------- */
196
197 void
198 GarbageCollect (uint32_t collect_gen,
199 bool do_heap_census,
200 uint32_t gc_type USED_IF_THREADS,
201 Capability *cap,
202 bool idle_cap[])
203 {
204 bdescr *bd;
205 generation *gen;
206 StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
207 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
208 any_work, no_work, scav_find_work;
209 #if defined(THREADED_RTS)
210 gc_thread *saved_gct;
211 #endif
212 uint32_t g, n;
213
214 // necessary if we stole a callee-saves register for gct:
215 #if defined(THREADED_RTS)
216 saved_gct = gct;
217 #endif
218
219 #if defined(PROFILING)
220 CostCentreStack *save_CCS[n_capabilities];
221 #endif
222
223 ACQUIRE_SM_LOCK;
224
225 #if defined(RTS_USER_SIGNALS)
226 if (RtsFlags.MiscFlags.install_signal_handlers) {
227 // block signals
228 blockUserSignals();
229 }
230 #endif
231
232 ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord));
233 // otherwise adjust the padding in gen_workspace.
234
235 // this is the main thread
236 SET_GCT(gc_threads[cap->no]);
237
238 // tell the stats department that we've started a GC
239 stat_startGC(cap, gct);
240
241 // lock the StablePtr table
242 stableLock();
243
244 #if defined(DEBUG)
245 mutlist_MUTVARS = 0;
246 mutlist_MUTARRS = 0;
247 mutlist_MVARS = 0;
248 mutlist_TVAR = 0;
249 mutlist_TVAR_WATCH_QUEUE = 0;
250 mutlist_TREC_CHUNK = 0;
251 mutlist_TREC_HEADER = 0;
252 mutlist_OTHERS = 0;
253 #endif
254
255 // attribute any costs to CCS_GC
256 #if defined(PROFILING)
257 for (n = 0; n < n_capabilities; n++) {
258 save_CCS[n] = capabilities[n]->r.rCCCS;
259 capabilities[n]->r.rCCCS = CCS_GC;
260 }
261 #endif
262
263 /* Figure out which generation to collect
264 */
265 N = collect_gen;
266 major_gc = (N == RtsFlags.GcFlags.generations-1);
267
268 if (major_gc) {
269 prev_static_flag = static_flag;
270 static_flag =
271 static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
272 }
273
274 #if defined(THREADED_RTS)
275 work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
276 N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
277 // It's not always a good idea to do load balancing in parallel
278 // GC. In particular, for a parallel program we don't want to
279 // lose locality by moving cached data into another CPU's cache
280 // (this effect can be quite significant).
281 //
282 * We could have a more complex way to determine whether to do
283 // work stealing or not, e.g. it might be a good idea to do it
284 // if the heap is big. For now, we just turn it on or off with
285 // a flag.
286 #endif
287
288 /* Start threads, so they can be spinning up while we finish initialisation.
289 */
290 start_gc_threads();
291
292 #if defined(THREADED_RTS)
293 /* How many threads will be participating in this GC?
294 * We don't try to parallelise minor GCs (unless the user asks for
295 * it with +RTS -gn0), or mark/compact/sweep GC.
296 */
297 if (gc_type == SYNC_GC_PAR) {
298 n_gc_threads = n_capabilities;
299 } else {
300 n_gc_threads = 1;
301 }
302 #else
303 n_gc_threads = 1;
304 #endif
305
306 debugTrace(DEBUG_gc, "GC (gen %d, using %d thread(s))",
307 N, n_gc_threads);
308
309 #if defined(DEBUG)
310 // check for memory leaks if DEBUG is on
311 memInventory(DEBUG_gc);
312 #endif
313
314 // do this *before* we start scavenging
315 collectFreshWeakPtrs();
316
317 // check sanity *before* GC
318 IF_DEBUG(sanity, checkSanity(false /* before GC */, major_gc));
319
320 // gather blocks allocated using allocatePinned() from each capability
321 // and put them on the g0->large_object list.
322 collect_pinned_object_blocks();
323
324 // Initialise all the generations that we're collecting.
325 for (g = 0; g <= N; g++) {
326 prepare_collected_gen(&generations[g]);
327 }
328 // Initialise all the generations that we're *not* collecting.
329 for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
330 prepare_uncollected_gen(&generations[g]);
331 }
332
333 // Prepare this gc_thread
334 init_gc_thread(gct);
335
336 /* Allocate a mark stack if we're doing a major collection.
337 */
338 if (major_gc && oldest_gen->mark) {
339 mark_stack_bd = allocBlock();
340 mark_stack_top_bd = mark_stack_bd;
341 mark_stack_bd->link = NULL;
342 mark_stack_bd->u.back = NULL;
343 mark_sp = mark_stack_bd->start;
344 } else {
345 mark_stack_bd = NULL;
346 mark_stack_top_bd = NULL;
347 mark_sp = NULL;
348 }
349
350 /* -----------------------------------------------------------------------
351 * follow all the roots that we know about:
352 */
353
354 // the main thread is running: this prevents any other threads from
355 // exiting prematurely, so we can start them now.
356 // NB. do this after the mutable lists have been saved above, otherwise
357 // the other GC threads will be writing into the old mutable lists.
358 inc_running();
359 wakeup_gc_threads(gct->thread_index, idle_cap);
360
361 traceEventGcWork(gct->cap);
362
363 // scavenge the capability-private mutable lists. This isn't part
364 // of markCapability() because markCapability() can only
365 // call back into the GC via mark_root() (due to the gct register
366 // variable).
367 if (n_gc_threads == 1) {
368 for (n = 0; n < n_capabilities; n++) {
369 #if defined(THREADED_RTS)
370 scavenge_capability_mut_Lists1(capabilities[n]);
371 #else
372 scavenge_capability_mut_lists(capabilities[n]);
373 #endif
374 }
375 } else {
376 scavenge_capability_mut_lists(gct->cap);
377 for (n = 0; n < n_capabilities; n++) {
378 if (idle_cap[n]) {
379 markCapability(mark_root, gct, capabilities[n],
380 true/*don't mark sparks*/);
381 scavenge_capability_mut_lists(capabilities[n]);
382 }
383 }
384 }
385
386 // follow roots from the CAF list (used by GHCi)
387 gct->evac_gen_no = 0;
388 markCAFs(mark_root, gct);
389
390 // follow all the roots that the application knows about.
391 gct->evac_gen_no = 0;
392 if (n_gc_threads == 1) {
393 for (n = 0; n < n_capabilities; n++) {
394 markCapability(mark_root, gct, capabilities[n],
395 true/*don't mark sparks*/);
396 }
397 } else {
398 markCapability(mark_root, gct, cap, true/*don't mark sparks*/);
399 }
400
401 markScheduler(mark_root, gct);
402
403 // Mark the weak pointer list, and prepare to detect dead weak pointers.
404 markWeakPtrList();
405 initWeakForGC();
406
407 // Mark the stable pointer table.
408 markStableTables(mark_root, gct);
409
410 /* -------------------------------------------------------------------------
411 * Repeatedly scavenge all the areas we know about until there's no
412 * more scavenging to be done.
413 */
414 for (;;)
415 {
416 scavenge_until_all_done();
417 // The other threads are now stopped. We might recurse back to
418 // here, but from now on this is the only thread.
419
420 // must be last... invariant is that everything is fully
421 // scavenged at this point.
422 if (traverseWeakPtrList()) { // returns true if evaced something
423 inc_running();
424 continue;
425 }
426
427 // If we get to here, there's really nothing left to do.
428 break;
429 }
430
431 shutdown_gc_threads(gct->thread_index, idle_cap);
432
433 // Now see which stable names are still alive.
434 gcStableTables();
435
436 #if defined(THREADED_RTS)
437 if (n_gc_threads == 1) {
438 for (n = 0; n < n_capabilities; n++) {
439 pruneSparkQueue(capabilities[n]);
440 }
441 } else {
442 for (n = 0; n < n_capabilities; n++) {
443 if (n == cap->no || idle_cap[n]) {
444 pruneSparkQueue(capabilities[n]);
445 }
446 }
447 }
448 #endif
449
450 #if defined(PROFILING)
451 // We call processHeapClosureForDead() on every closure destroyed during
452 // the current garbage collection, so we invoke LdvCensusForDead().
453 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
454 || RtsFlags.ProfFlags.bioSelector != NULL) {
455 RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
456 LdvCensusForDead(N);
457 ACQUIRE_SM_LOCK;
458 }
459 #endif
460
461 // NO MORE EVACUATION AFTER THIS POINT!
462
463 // Finally: compact or sweep the oldest generation.
464 if (major_gc && oldest_gen->mark) {
465 if (oldest_gen->compact)
466 compact(gct->scavenged_static_objects);
467 else
468 sweep(oldest_gen);
469 }
470
471 copied = 0;
472 par_max_copied = 0;
473 par_balanced_copied = 0;
474 gc_spin_spin = 0;
475 gc_spin_yield = 0;
476 mut_spin_spin = 0;
477 mut_spin_yield = 0;
478 any_work = 0;
479 no_work = 0;
480 scav_find_work = 0;
481 {
482 uint32_t i;
483 uint64_t par_balanced_copied_acc = 0;
484 const gc_thread* thread;
485
486 for (i=0; i < n_gc_threads; i++) {
487 copied += gc_threads[i]->copied;
488 }
489 for (i=0; i < n_gc_threads; i++) {
490 thread = gc_threads[i];
491 if (n_gc_threads > 1) {
492 debugTrace(DEBUG_gc,"thread %d:", i);
493 debugTrace(DEBUG_gc," copied %ld",
494 thread->copied * sizeof(W_));
495 debugTrace(DEBUG_gc," scanned %ld",
496 thread->scanned * sizeof(W_));
497 debugTrace(DEBUG_gc," any_work %ld",
498 thread->any_work);
499 debugTrace(DEBUG_gc," no_work %ld",
500 thread->no_work);
501 debugTrace(DEBUG_gc," scav_find_work %ld",
502 thread->scav_find_work);
503
504 #if defined(THREADED_RTS) && defined(PROF_SPIN)
505 gc_spin_spin += thread->gc_spin.spin;
506 gc_spin_yield += thread->gc_spin.yield;
507 mut_spin_spin += thread->mut_spin.spin;
508 mut_spin_yield += thread->mut_spin.yield;
509 #endif
510
511 any_work += thread->any_work;
512 no_work += thread->no_work;
513 scav_find_work += thread->scav_find_work;
514
515 par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
516 par_balanced_copied_acc +=
517 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
518 }
519 }
520 if (n_gc_threads > 1) {
521 // See Note [Work Balance] for an explanation of this computation
522 par_balanced_copied =
523 (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
524 (n_gc_threads - 1);
525 }
526 }
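// A worked example of the balance metric above (illustration only): with
// n_gc_threads = 2 and per-thread copied counts (3,1), copied = 4 and
// par_balanced_copied_acc = min(2*3,4) + min(2*1,4) = 6, giving
// par_balanced_copied = (6 - 4 + 0) / 1 = 2.  A perfectly balanced (2,2)
// split yields 4 (== copied), a fully serial (4,0) split yields 0, so the
// metric ranges from 0 up to 'copied'.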
527
528 // Run through all the generations and tidy up.
529 // We're going to:
530 // - count the amount of "live" data (live_words, live_blocks)
531 // - count the amount of "copied" data in this GC (copied)
532 // - free from-space
533 // - make to-space the new from-space (set BF_EVACUATED on all blocks)
534 //
535 live_words = 0;
536 live_blocks = 0;
537
538 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
539
540 if (g == N) {
541 generations[g].collections++; // for stats
542 if (n_gc_threads > 1) generations[g].par_collections++;
543 }
544
545 // Count the mutable list as bytes "copied" for the purposes of
546 // stats. Every mutable list is copied during every GC.
547 if (g > 0) {
548 W_ mut_list_size = 0;
549 for (n = 0; n < n_capabilities; n++) {
550 mut_list_size += countOccupied(capabilities[n]->mut_lists[g]);
551 }
552 copied += mut_list_size;
553
554 debugTrace(DEBUG_gc,
555 "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)",
556 (unsigned long)(mut_list_size * sizeof(W_)),
557 mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS,
558 mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE,
559 mutlist_TREC_CHUNK, mutlist_TREC_HEADER,
560 mutlist_OTHERS);
561 }
562
563 bdescr *next, *prev;
564 gen = &generations[g];
565
566 // for generations we collected...
567 if (g <= N) {
568
569 /* free old memory and shift to-space into from-space for all
570 * the collected generations (except the allocation area). These
571 * freed blocks will probaby be quickly recycled.
572 */
573 if (gen->mark)
574 {
575 // tack the new blocks on the end of the existing blocks
576 if (gen->old_blocks != NULL) {
577
578 prev = NULL;
579 for (bd = gen->old_blocks; bd != NULL; bd = next) {
580
581 next = bd->link;
582
583 if (!(bd->flags & BF_MARKED))
584 {
585 if (prev == NULL) {
586 gen->old_blocks = next;
587 } else {
588 prev->link = next;
589 }
590 freeGroup(bd);
591 gen->n_old_blocks--;
592 }
593 else
594 {
595 gen->n_words += bd->free - bd->start;
596
597 // NB. this step might not be compacted next
598 // time, so reset the BF_MARKED flags.
599 // They are set before GC if we're going to
600 // compact. (search for BF_MARKED above).
601 bd->flags &= ~BF_MARKED;
602
603 // between GCs, all blocks in the heap except
604 // for the nursery have the BF_EVACUATED flag set.
605 bd->flags |= BF_EVACUATED;
606
607 prev = bd;
608 }
609 }
610
611 if (prev != NULL) {
612 prev->link = gen->blocks;
613 gen->blocks = gen->old_blocks;
614 }
615 }
616 // add the new blocks to the block tally
617 gen->n_blocks += gen->n_old_blocks;
618 ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
619 ASSERT(countOccupied(gen->blocks) == gen->n_words);
620 }
621 else // not copacted
622 {
623 freeChain(gen->old_blocks);
624 }
625
626 gen->old_blocks = NULL;
627 gen->n_old_blocks = 0;
628
629 /* LARGE OBJECTS. The current live large objects are chained on
630 * scavenged_large, having been moved during garbage
631 * collection from large_objects. Any objects left on the
632 * large_objects list are therefore dead, so we free them here.
633 */
634 freeChain(gen->large_objects);
635 gen->large_objects = gen->scavenged_large_objects;
636 gen->n_large_blocks = gen->n_scavenged_large_blocks;
637 gen->n_large_words = countOccupied(gen->large_objects);
638 gen->n_new_large_words = 0;
639
640 /* COMPACT_NFDATA. The currently live compacts are chained
641 * to live_compact_objects, quite like large objects. And
642 * objects left on the compact_objects list are dead.
643 *
644 * We don't run a simple freeChain because want to give the
645 * CNF module some chance to free memory that freeChain would
646 * not see (namely blocks appended to a CNF through a compactResize).
647 *
648 * See Note [Compact Normal Forms] for details.
649 */
650 for (bd = gen->compact_objects; bd; bd = next) {
651 next = bd->link;
652 compactFree(((StgCompactNFDataBlock*)bd->start)->owner);
653 }
654 gen->compact_objects = gen->live_compact_objects;
655 gen->n_compact_blocks = gen->n_live_compact_blocks;
656 }
657 else // for generations > N
658 {
659 /* For older generations, we need to append the
660 * scavenged_large_object list (i.e. large objects that have been
661 * promoted during this GC) to the large_object list for that step.
662 */
663 for (bd = gen->scavenged_large_objects; bd; bd = next) {
664 next = bd->link;
665 dbl_link_onto(bd, &gen->large_objects);
666 gen->n_large_words += bd->free - bd->start;
667 }
668
669 // And same for compacts
670 for (bd = gen->live_compact_objects; bd; bd = next) {
671 next = bd->link;
672 dbl_link_onto(bd, &gen->compact_objects);
673 }
674
675 // add the new blocks we promoted during this GC
676 gen->n_large_blocks += gen->n_scavenged_large_blocks;
677 gen->n_compact_blocks += gen->n_live_compact_blocks;
678 }
679
680 ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
681 ASSERT(countOccupied(gen->large_objects) == gen->n_large_words);
682 // We can run the same assertion on compact objects because there
683 // is memory "the GC doesn't see" (directly), but which is still
684 // accounted in gen->n_compact_blocks
685
686 gen->scavenged_large_objects = NULL;
687 gen->n_scavenged_large_blocks = 0;
688 gen->live_compact_objects = NULL;
689 gen->n_live_compact_blocks = 0;
690
691 // Count "live" data
692 live_words += genLiveWords(gen);
693 live_blocks += genLiveBlocks(gen);
694
695 // add in the partial blocks in the gen_workspaces
696 {
697 uint32_t i;
698 for (i = 0; i < n_capabilities; i++) {
699 live_words += gcThreadLiveWords(i, gen->no);
700 live_blocks += gcThreadLiveBlocks(i, gen->no);
701 }
702 }
703 } // for all generations
704
705 // update the max size of older generations after a major GC
706 resize_generations();
707
708 // Free the mark stack.
709 if (mark_stack_top_bd != NULL) {
710 debugTrace(DEBUG_gc, "mark stack: %d blocks",
711 countBlocks(mark_stack_top_bd));
712 freeChain(mark_stack_top_bd);
713 }
714
715 // Free any bitmaps.
716 for (g = 0; g <= N; g++) {
717 gen = &generations[g];
718 if (gen->bitmap != NULL) {
719 freeGroup(gen->bitmap);
720 gen->bitmap = NULL;
721 }
722 }
723
724 resize_nursery();
725
726 resetNurseries();
727
728 // mark the garbage collected CAFs as dead
729 #if defined(DEBUG)
730 if (major_gc) { gcCAFs(); }
731 #endif
732
733 // Update the stable pointer hash table.
734 updateStableTables(major_gc);
735
736 // unlock the StablePtr table. Must be before scheduleFinalizers(),
737 // because a finalizer may call hs_free_fun_ptr() or
738 // hs_free_stable_ptr(), both of which access the StablePtr table.
739 stableUnlock();
740
741 // Must be after stableUnlock(), because it might free stable ptrs.
742 if (major_gc) {
743 checkUnload (gct->scavenged_static_objects);
744 }
745
746 #if defined(PROFILING)
747 // resetStaticObjectForRetainerProfiling() must be called before
748 // zeroing below.
749
750 // ToDo: fix the gct->scavenged_static_objects below
751 resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects);
752 #endif
753
754 // Start any pending finalizers. Must be after
755 // updateStableTables() and stableUnlock() (see #4221).
756 RELEASE_SM_LOCK;
757 scheduleFinalizers(cap, dead_weak_ptr_list);
758 ACQUIRE_SM_LOCK;
759
760 // check sanity after GC
761 // before resurrectThreads(), because that might overwrite some
762 // closures, which will cause problems with THREADED where we don't
763 // fill slop.
764 IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc));
765
766 // If a heap census is due, we need to do it before
767 // resurrectThreads(), for the same reason as checkSanity above:
768 // resurrectThreads() will overwrite some closures and leave slop
769 // behind.
770 if (do_heap_census) {
771 debugTrace(DEBUG_sched, "performing heap census");
772 RELEASE_SM_LOCK;
773 heapCensus(gct->gc_start_cpu);
774 ACQUIRE_SM_LOCK;
775 }
776
777 // send exceptions to any threads which were about to die
778 RELEASE_SM_LOCK;
779 resurrectThreads(resurrected_threads);
780 ACQUIRE_SM_LOCK;
781
782 if (major_gc) {
783 W_ need_prealloc, need_live, need, got;
784 uint32_t i;
785
786 need_live = 0;
787 for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
788 need_live += genLiveBlocks(&generations[i]);
789 }
790 need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live);
791
792 need_prealloc = 0;
793 for (i = 0; i < n_nurseries; i++) {
794 need_prealloc += nurseries[i].n_blocks;
795 }
796 need_prealloc += RtsFlags.GcFlags.largeAllocLim;
797 need_prealloc += countAllocdBlocks(exec_block);
798 need_prealloc += arenaBlocks();
799 #if defined(PROFILING)
800 if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
801 need_prealloc += retainerStackBlocks();
802 }
803 #endif
804
805 /* If the amount of data remains constant, next major GC we'll
806 * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc
807 * in order to reduce repeated deallocation and reallocation. #14702
808 */
809 need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live;
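// For illustration: with the default oldGenFactor (F = 2) this keeps
// need_prealloc + 4 * need_live blocks, before the heapSizeSuggestion floor
// and maxHeapSize ceiling below are applied.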
810
811 /* Also, if user set heap size, do not drop below it.
812 */
813 need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need);
814
815 /* But with a large nursery, the above estimate might exceed
816 * maxHeapSize. A large resident set size might make the OS
817 * kill this process, or swap unnecessarily. Therefore we
818 * ensure that our estimate does not exceed maxHeapSize.
819 */
820 if (RtsFlags.GcFlags.maxHeapSize != 0) {
821 need = stg_min(RtsFlags.GcFlags.maxHeapSize, need);
822 }
823
824 need = BLOCKS_TO_MBLOCKS(need);
825
826 got = mblocks_allocated;
827
828 if (got > need) {
829 returnMemoryToOS(got - need);
830 }
831 }
832
833 // extra GC trace info
834 IF_DEBUG(gc, statDescribeGens());
835
836 #if defined(DEBUG)
837 // symbol-table based profiling
838 /* heapCensus(to_blocks); */ /* ToDo */
839 #endif
840
841 // restore enclosing cost centre
842 #if defined(PROFILING)
843 for (n = 0; n < n_capabilities; n++) {
844 capabilities[n]->r.rCCCS = save_CCS[n];
845 }
846 #endif
847
848 #if defined(DEBUG)
849 // check for memory leaks if DEBUG is on
850 memInventory(DEBUG_gc);
851 #endif
852
853 // ok, GC over: tell the stats department what happened.
854 stat_endGC(cap, gct, live_words, copied,
855 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
856 N, n_gc_threads, par_max_copied, par_balanced_copied,
857 gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
858 any_work, no_work, scav_find_work);
859
860 #if defined(RTS_USER_SIGNALS)
861 if (RtsFlags.MiscFlags.install_signal_handlers) {
862 // unblock signals again
863 unblockUserSignals();
864 }
865 #endif
866
867 RELEASE_SM_LOCK;
868
869 SET_GCT(saved_gct);
870 }
871
872 /* -----------------------------------------------------------------------------
873 Heap overflow is indicated by setting a flag that the caller of
874 GarbageCollect can check. (not ideal, TODO: better)
875 -------------------------------------------------------------------------- */
876
877 static void heapOverflow(void)
878 {
879 heap_overflow = true;
880 }
881
882 /* -----------------------------------------------------------------------------
883 Initialise the gc_thread structures.
884 -------------------------------------------------------------------------- */
885
886 static void
887 new_gc_thread (uint32_t n, gc_thread *t)
888 {
889 uint32_t g;
890 gen_workspace *ws;
891
892 t->cap = capabilities[n];
893
894 #if defined(THREADED_RTS)
895 t->id = 0;
896 initSpinLock(&t->gc_spin);
897 initSpinLock(&t->mut_spin);
898 ACQUIRE_SPIN_LOCK(&t->gc_spin);
899 ACQUIRE_SPIN_LOCK(&t->mut_spin);
900 t->wakeup = GC_THREAD_INACTIVE; // starts inactive, so we can wait for the
901 // thread to start up, see wakeup_gc_threads
902 #endif
903
904 t->thread_index = n;
905 t->free_blocks = NULL;
906 t->gc_count = 0;
907
908 init_gc_thread(t);
909
910 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
911 {
912 ws = &t->gens[g];
913 ws->gen = &generations[g];
914 ASSERT(g == ws->gen->no);
915 ws->my_gct = t;
916
917 // We want to call
918 // alloc_todo_block(ws,0);
919 // but can't, because it uses gct which isn't set up at this point.
920 // Hence, allocate a block for todo_bd manually:
921 {
922 bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
923 // no lock, locks aren't initialised yet
924 initBdescr(bd, ws->gen, ws->gen->to);
925 bd->flags = BF_EVACUATED;
926 bd->u.scan = bd->free = bd->start;
927
928 ws->todo_bd = bd;
929 ws->todo_free = bd->free;
930 ws->todo_lim = bd->start + BLOCK_SIZE_W;
931 }
932
933 ws->todo_q = newWSDeque(128);
934 ws->todo_overflow = NULL;
935 ws->n_todo_overflow = 0;
936 ws->todo_large_objects = NULL;
937
938 ws->part_list = NULL;
939 ws->n_part_blocks = 0;
940 ws->n_part_words = 0;
941
942 ws->scavd_list = NULL;
943 ws->n_scavd_blocks = 0;
944 ws->n_scavd_words = 0;
945 }
946 }
947
948
949 void
950 initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
951 {
952 #if defined(THREADED_RTS)
953 uint32_t i;
954
955 if (from > 0) {
956 gc_threads = stgReallocBytes (gc_threads, to * sizeof(gc_thread*),
957 "initGcThreads");
958 } else {
959 gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
960 "initGcThreads");
961 }
962
963 for (i = from; i < to; i++) {
964 gc_threads[i] =
965 stgMallocBytes(sizeof(gc_thread) +
966 RtsFlags.GcFlags.generations * sizeof(gen_workspace),
967 "alloc_gc_threads");
968
969 new_gc_thread(i, gc_threads[i]);
970 }
971 #else
972 ASSERT(from == 0 && to == 1);
973 gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads");
974 gc_threads[0] = gct;
975 new_gc_thread(0,gc_threads[0]);
976 #endif
977 }
978
979 void
980 freeGcThreads (void)
981 {
982 uint32_t g;
983 if (gc_threads != NULL) {
984 #if defined(THREADED_RTS)
985 uint32_t i;
986 for (i = 0; i < n_capabilities; i++) {
987 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
988 {
989 freeWSDeque(gc_threads[i]->gens[g].todo_q);
990 }
991 stgFree (gc_threads[i]);
992 }
993 stgFree (gc_threads);
994 #else
995 for (g = 0; g < RtsFlags.GcFlags.generations; g++)
996 {
997 freeWSDeque(gc_threads[0]->gens[g].todo_q);
998 }
999 stgFree (gc_threads);
1000 #endif
1001 gc_threads = NULL;
1002 }
1003 }
1004
1005 /* ----------------------------------------------------------------------------
1006 Start GC threads
1007 ------------------------------------------------------------------------- */
1008
1009 static volatile StgWord gc_running_threads;
1010
1011 static StgWord
1012 inc_running (void)
1013 {
1014 StgWord new;
1015 new = atomic_inc(&gc_running_threads, 1);
1016 ASSERT(new <= n_gc_threads);
1017 return new;
1018 }
1019
1020 static StgWord
1021 dec_running (void)
1022 {
1023 ASSERT(gc_running_threads != 0);
1024 return atomic_dec(&gc_running_threads);
1025 }
1026
1027 static bool
1028 any_work (void)
1029 {
1030 int g;
1031 gen_workspace *ws;
1032
1033 gct->any_work++;
1034
1035 write_barrier();
1036
1037 // scavenge objects in compacted generation
1038 if (mark_stack_bd != NULL && !mark_stack_empty()) {
1039 return true;
1040 }
1041
1042 // Check for global work in any gen. We don't need to check for
1043 // local work, because we have already exited scavenge_loop(),
1044 // which means there is no local work for this thread.
1045 for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) {
1046 ws = &gct->gens[g];
1047 if (ws->todo_large_objects) return true;
1048 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1049 if (ws->todo_overflow) return true;
1050 }
1051
1052 #if defined(THREADED_RTS)
1053 if (work_stealing) {
1054 uint32_t n;
1055 // look for work to steal
1056 for (n = 0; n < n_gc_threads; n++) {
1057 if (n == gct->thread_index) continue;
1058 for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
1059 ws = &gc_threads[n]->gens[g];
1060 if (!looksEmptyWSDeque(ws->todo_q)) return true;
1061 }
1062 }
1063 }
1064 #endif
1065
1066 gct->no_work++;
1067 #if defined(THREADED_RTS)
1068 yieldThread();
1069 #endif
1070
1071 return false;
1072 }
1073
1074 static void
1075 scavenge_until_all_done (void)
1076 {
1077 DEBUG_ONLY( uint32_t r );
1078
1079
1080 loop:
1081 #if defined(THREADED_RTS)
1082 if (n_gc_threads > 1) {
1083 scavenge_loop();
1084 } else {
1085 scavenge_loop1();
1086 }
1087 #else
1088 scavenge_loop();
1089 #endif
1090
1091 collect_gct_blocks();
1092
1093 // scavenge_loop() only exits when there's no work to do
1094
1095 #if defined(DEBUG)
1096 r = dec_running();
1097 #else
1098 dec_running();
1099 #endif
1100
1101 traceEventGcIdle(gct->cap);
1102
1103 debugTrace(DEBUG_gc, "%d GC threads still running", r);
1104
1105 while (gc_running_threads != 0) {
1106 // usleep(1);
1107 if (any_work()) {
1108 inc_running();
1109 traceEventGcWork(gct->cap);
1110 goto loop;
1111 }
1112 // any_work() does not remove the work from the queue, it
1113 // just checks for the presence of work. If we find any,
1114 // then we increment gc_running_threads and go back to
1115 // scavenge_loop() to perform any pending work.
1116 }
1117
1118 traceEventGcDone(gct->cap);
1119 }
1120
1121 #if defined(THREADED_RTS)
1122
1123 void
1124 gcWorkerThread (Capability *cap)
1125 {
1126 gc_thread *saved_gct;
1127
1128 // necessary if we stole a callee-saves register for gct:
1129 saved_gct = gct;
1130
1131 SET_GCT(gc_threads[cap->no]);
1132 gct->id = osThreadId();
1133
1134 // Wait until we're told to wake up
1135 RELEASE_SPIN_LOCK(&gct->mut_spin);
1136 // yieldThread();
1137 // Strangely, adding a yieldThread() here makes the CPU time
1138 // measurements more accurate on Linux, perhaps because it syncs
1139 // the CPU time across the multiple cores. Without this, CPU time
1140 // is heavily skewed towards GC rather than MUT.
1141 gct->wakeup = GC_THREAD_STANDING_BY;
1142 debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
1143 ACQUIRE_SPIN_LOCK(&gct->gc_spin);
1144
1145 init_gc_thread(gct);
1146
1147 traceEventGcWork(gct->cap);
1148
1149 // Every thread evacuates some roots.
1150 gct->evac_gen_no = 0;
1151 markCapability(mark_root, gct, cap, true/*prune sparks*/);
1152 scavenge_capability_mut_lists(cap);
1153
1154 scavenge_until_all_done();
1155
1156 #if defined(THREADED_RTS)
1157 // Now that the whole heap is marked, we discard any sparks that
1158 // were found to be unreachable. The main GC thread is currently
1159 // marking heap reachable via weak pointers, so it is
1160 // non-deterministic whether a spark will be retained if it is
1161 // only reachable via weak pointers. To fix this problem would
1162 // require another GC barrier, which is too high a price.
1163 pruneSparkQueue(cap);
1164 #endif
1165
1166 // Wait until we're told to continue
1167 RELEASE_SPIN_LOCK(&gct->gc_spin);
1168 gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
1169 debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
1170 gct->thread_index);
1171 ACQUIRE_SPIN_LOCK(&gct->mut_spin);
1172 debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
1173
1174 SET_GCT(saved_gct);
1175 }
1176
1177 #endif
1178
1179 #if defined(THREADED_RTS)
1180
1181 void
1182 waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1183 {
1184 const uint32_t n_threads = n_capabilities;
1185 const uint32_t me = cap->no;
1186 uint32_t i, j;
1187 bool retry = true;
1188 Time t0, t1, t2;
1189
1190 t0 = t1 = t2 = getProcessElapsedTime();
1191
1192 while(retry) {
1193 for (i=0; i < n_threads; i++) {
1194 if (i == me || idle_cap[i]) continue;
1195 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1196 prodCapability(capabilities[i], cap->running_task);
1197 }
1198 }
1199 for (j=0; j < 10; j++) {
1200 retry = false;
1201 for (i=0; i < n_threads; i++) {
1202 if (i == me || idle_cap[i]) continue;
1203 write_barrier();
1204 interruptCapability(capabilities[i]);
1205 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1206 retry = true;
1207 }
1208 }
1209 if (!retry) break;
1210 #if defined(PROF_SPIN)
1211 waitForGcThreads_yield++;
1212 #endif
1213 yieldThread();
1214 }
1215
1216 t2 = getProcessElapsedTime();
1217 if (RtsFlags.GcFlags.longGCSync != 0 &&
1218 t2 - t1 > RtsFlags.GcFlags.longGCSync) {
1219 /* call this every longGCSync of delay */
1220 rtsConfig.longGCSync(cap->no, t2 - t0);
1221 t1 = t2;
1222 }
1223 if (retry) {
1224 #if defined(PROF_SPIN)
1225 // This is a bit strange, we'll get more yields than spins.
1226 // I guess that means it's not a spin-lock at all, but these
1227 // numbers are still useful (I think).
1228 waitForGcThreads_spin++;
1229 #endif
1230 }
1231 }
1232
1233 if (RtsFlags.GcFlags.longGCSync != 0 &&
1234 t2 - t0 > RtsFlags.GcFlags.longGCSync) {
1235 rtsConfig.longGCSyncEnd(t2 - t0);
1236 }
1237 }
1238
1239 #endif // THREADED_RTS
1240
1241 static void
1242 start_gc_threads (void)
1243 {
1244 #if defined(THREADED_RTS)
1245 gc_running_threads = 0;
1246 #endif
1247 }
1248
1249 static void
1250 wakeup_gc_threads (uint32_t me USED_IF_THREADS,
1251 bool idle_cap[] USED_IF_THREADS)
1252 {
1253 #if defined(THREADED_RTS)
1254 uint32_t i;
1255
1256 if (n_gc_threads == 1) return;
1257
1258 for (i=0; i < n_gc_threads; i++) {
1259 if (i == me || idle_cap[i]) continue;
1260 inc_running();
1261 debugTrace(DEBUG_gc, "waking up gc thread %d", i);
1262 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY)
1263 barf("wakeup_gc_threads");
1264
1265 gc_threads[i]->wakeup = GC_THREAD_RUNNING;
1266 ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1267 RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1268 }
1269 #endif
1270 }
1271
1272 // After GC is complete, we must wait for all GC threads to enter the
1273 // standby state, otherwise they may still be executing inside
1274 // any_work(), and may even remain awake until the next GC starts.
1275 static void
1276 shutdown_gc_threads (uint32_t me USED_IF_THREADS,
1277 bool idle_cap[] USED_IF_THREADS)
1278 {
1279 #if defined(THREADED_RTS)
1280 uint32_t i;
1281
1282 if (n_gc_threads == 1) return;
1283
1284 for (i=0; i < n_gc_threads; i++) {
1285 if (i == me || idle_cap[i]) continue;
1286 while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) {
1287 busy_wait_nop();
1288 write_barrier();
1289 }
1290 }
1291 #endif
1292 }
1293
1294 #if defined(THREADED_RTS)
1295 void
1296 releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
1297 {
1298 const uint32_t n_threads = n_capabilities;
1299 const uint32_t me = cap->no;
1300 uint32_t i;
1301 for (i=0; i < n_threads; i++) {
1302 if (i == me || idle_cap[i]) continue;
1303 if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
1304 barf("releaseGCThreads");
1305
1306 gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
1307 ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1308 RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1309 }
1310 }
1311 #endif
1312
1313 /* ----------------------------------------------------------------------------
1314 Initialise a generation that is to be collected
1315 ------------------------------------------------------------------------- */
1316
1317 static void
1318 prepare_collected_gen (generation *gen)
1319 {
1320 uint32_t i, g, n;
1321 gen_workspace *ws;
1322 bdescr *bd, *next;
1323
1324 // Throw away the current mutable list. Invariant: the mutable
1325 // list always has at least one block; this means we can avoid a
1326 // check for NULL in recordMutable().
1327 g = gen->no;
1328 if (g != 0) {
1329 for (i = 0; i < n_capabilities; i++) {
1330 freeChain(capabilities[i]->mut_lists[g]);
1331 capabilities[i]->mut_lists[g] =
1332 allocBlockOnNode(capNoToNumaNode(i));
1333 }
1334 }
1335
1336 gen = &generations[g];
1337 ASSERT(gen->no == g);
1338
1339 // we'll construct a new list of threads in this step
1340 // during GC, throw away the current list.
1341 gen->old_threads = gen->threads;
1342 gen->threads = END_TSO_QUEUE;
1343
1344 // deprecate the existing blocks
1345 gen->old_blocks = gen->blocks;
1346 gen->n_old_blocks = gen->n_blocks;
1347 gen->blocks = NULL;
1348 gen->n_blocks = 0;
1349 gen->n_words = 0;
1350 gen->live_estimate = 0;
1351
1352 // initialise the large object queues.
1353 ASSERT(gen->scavenged_large_objects == NULL);
1354 ASSERT(gen->n_scavenged_large_blocks == 0);
1355 ASSERT(gen->live_compact_objects == NULL);
1356 ASSERT(gen->n_live_compact_blocks == 0);
1357
1358 // grab all the partial blocks stashed in the gc_thread workspaces and
1359 // move them to the old_blocks list of this gen.
1360 for (n = 0; n < n_capabilities; n++) {
1361 ws = &gc_threads[n]->gens[gen->no];
1362
1363 for (bd = ws->part_list; bd != NULL; bd = next) {
1364 next = bd->link;
1365 bd->link = gen->old_blocks;
1366 gen->old_blocks = bd;
1367 gen->n_old_blocks += bd->blocks;
1368 }
1369 ws->part_list = NULL;
1370 ws->n_part_blocks = 0;
1371 ws->n_part_words = 0;
1372
1373 ASSERT(ws->scavd_list == NULL);
1374 ASSERT(ws->n_scavd_blocks == 0);
1375 ASSERT(ws->n_scavd_words == 0);
1376
1377 if (ws->todo_free != ws->todo_bd->start) {
1378 ws->todo_bd->free = ws->todo_free;
1379 ws->todo_bd->link = gen->old_blocks;
1380 gen->old_blocks = ws->todo_bd;
1381 gen->n_old_blocks += ws->todo_bd->blocks;
1382 alloc_todo_block(ws,0); // always has one block.
1383 }
1384 }
1385
1386 // mark the small objects as from-space
1387 for (bd = gen->old_blocks; bd; bd = bd->link) {
1388 bd->flags &= ~BF_EVACUATED;
1389 }
1390
1391 // mark the large objects as from-space
1392 for (bd = gen->large_objects; bd; bd = bd->link) {
1393 bd->flags &= ~BF_EVACUATED;
1394 }
1395
1396 // mark the compact objects as from-space
1397 for (bd = gen->compact_objects; bd; bd = bd->link) {
1398 bd->flags &= ~BF_EVACUATED;
1399 }
1400
1401 // for a compacted generation, we need to allocate the bitmap
1402 if (gen->mark) {
1403 StgWord bitmap_size; // in bytes
1404 bdescr *bitmap_bdescr;
1405 StgWord *bitmap;
1406
1407 bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
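// For illustration: one mark bit per heap word means each block needs
// BLOCK_SIZE / BITS_IN(W_) bytes of bitmap -- with the default 4k blocks
// that is 64 bytes per block on a 64-bit platform.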
1408
1409 if (bitmap_size > 0) {
1410 bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
1411 / BLOCK_SIZE);
1412 gen->bitmap = bitmap_bdescr;
1413 bitmap = bitmap_bdescr->start;
1414
1415 debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p",
1416 bitmap_size, bitmap);
1417
1418 // don't forget to fill it with zeros!
1419 memset(bitmap, 0, bitmap_size);
1420
1421 // For each block in this step, point to its bitmap from the
1422 // block descriptor.
1423 for (bd=gen->old_blocks; bd != NULL; bd = bd->link) {
1424 bd->u.bitmap = bitmap;
1425 bitmap += BLOCK_SIZE_W / BITS_IN(W_);
1426
1427 // Also at this point we set the BF_MARKED flag
1428 // for this block. The invariant is that
1429 // BF_MARKED is always unset, except during GC
1430 // when it is set on those blocks which will be
1431 // compacted.
1432 if (!(bd->flags & BF_FRAGMENTED)) {
1433 bd->flags |= BF_MARKED;
1434 }
1435
1436 // BF_SWEPT should be marked only for blocks that are being
1437 // collected in sweep()
1438 bd->flags &= ~BF_SWEPT;
1439 }
1440 }
1441 }
1442 }
1443
1444
1445 /* ----------------------------------------------------------------------------
1446 Save the mutable lists in saved_mut_lists
1447 ------------------------------------------------------------------------- */
1448
1449 static void
1450 stash_mut_list (Capability *cap, uint32_t gen_no)
1451 {
1452 cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
1453 cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
1454 }
1455
1456 /* ----------------------------------------------------------------------------
1457 Initialise a generation that is *not* to be collected
1458 ------------------------------------------------------------------------- */
1459
1460 static void
1461 prepare_uncollected_gen (generation *gen)
1462 {
1463 uint32_t i;
1464
1465
1466 ASSERT(gen->no > 0);
1467
1468 // save the current mutable lists for this generation, and
1469 // allocate a fresh block for each one. We'll traverse these
1470 // mutable lists as roots early on in the GC.
1471 for (i = 0; i < n_capabilities; i++) {
1472 stash_mut_list(capabilities[i], gen->no);
1473 }
1474
1475 ASSERT(gen->scavenged_large_objects == NULL);
1476 ASSERT(gen->n_scavenged_large_blocks == 0);
1477 }
1478
1479 /* -----------------------------------------------------------------------------
1480 Collect the completed blocks from a GC thread and attach them to
1481 the generation.
1482 -------------------------------------------------------------------------- */
1483
1484 static void
1485 collect_gct_blocks (void)
1486 {
1487 uint32_t g;
1488 gen_workspace *ws;
1489 bdescr *bd, *prev;
1490
1491 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1492 ws = &gct->gens[g];
1493
1494 // there may still be a block attached to ws->todo_bd;
1495 // leave it there to use next time.
1496
1497 if (ws->scavd_list != NULL) {
1498 ACQUIRE_SPIN_LOCK(&ws->gen->sync);
1499
1500 ASSERT(gct->scan_bd == NULL);
1501 ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
1502
1503 prev = NULL;
1504 for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
1505 prev = bd;
1506 }
1507 if (prev != NULL) {
1508 prev->link = ws->gen->blocks;
1509 ws->gen->blocks = ws->scavd_list;
1510 }
1511 ws->gen->n_blocks += ws->n_scavd_blocks;
1512 ws->gen->n_words += ws->n_scavd_words;
1513
1514 ws->scavd_list = NULL;
1515 ws->n_scavd_blocks = 0;
1516 ws->n_scavd_words = 0;
1517
1518 RELEASE_SPIN_LOCK(&ws->gen->sync);
1519 }
1520 }
1521 }
1522
1523 /* -----------------------------------------------------------------------------
1524 During mutation, any blocks that are filled by allocatePinned() are
1525 stashed on the local pinned_object_blocks list, to avoid needing to
1526 take a global lock. Here we collect those blocks from the
1527 cap->pinned_object_blocks lists and put them on the
1528 main g0->large_object list.
1529 -------------------------------------------------------------------------- */
1530
1531 static void
1532 collect_pinned_object_blocks (void)
1533 {
1534 uint32_t n;
1535 bdescr *bd, *prev;
1536
1537 for (n = 0; n < n_capabilities; n++) {
1538 prev = NULL;
1539 for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
1540 prev = bd;
1541 }
1542 if (prev != NULL) {
1543 prev->link = g0->large_objects;
1544 if (g0->large_objects != NULL) {
1545 g0->large_objects->u.back = prev;
1546 }
1547 g0->large_objects = capabilities[n]->pinned_object_blocks;
1548 capabilities[n]->pinned_object_blocks = 0;
1549 }
1550 }
1551 }
1552
1553 /* -----------------------------------------------------------------------------
1554 Initialise a gc_thread before GC
1555 -------------------------------------------------------------------------- */
1556
1557 static void
1558 init_gc_thread (gc_thread *t)
1559 {
1560 t->static_objects = END_OF_STATIC_OBJECT_LIST;
1561 t->scavenged_static_objects = END_OF_STATIC_OBJECT_LIST;
1562 t->scan_bd = NULL;
1563 t->mut_lists = t->cap->mut_lists;
1564 t->evac_gen_no = 0;
1565 t->failed_to_evac = false;
1566 t->eager_promotion = true;
1567 t->thunk_selector_depth = 0;
1568 t->copied = 0;
1569 t->scanned = 0;
1570 t->any_work = 0;
1571 t->no_work = 0;
1572 t->scav_find_work = 0;
1573 }
1574
1575 /* -----------------------------------------------------------------------------
1576 Function we pass to evacuate roots.
1577 -------------------------------------------------------------------------- */
1578
1579 static void
1580 mark_root(void *user USED_IF_THREADS, StgClosure **root)
1581 {
1582 // we stole a register for gct, but this function is called from
1583 // *outside* the GC where the register variable is not in effect,
1584 // so we need to save and restore it here. NB. only call
1585 // mark_root() from the main GC thread, otherwise gct will be
1586 // incorrect.
1587 #if defined(THREADED_RTS)
1588 gc_thread *saved_gct;
1589 saved_gct = gct;
1590 #endif
1591 SET_GCT(user);
1592
1593 evacuate(root);
1594
1595 SET_GCT(saved_gct);
1596 }
1597
1598 /* ----------------------------------------------------------------------------
1599 Reset the sizes of the older generations when we do a major
1600 collection.
1601
1602 CURRENT STRATEGY: make all generations except zero the same size.
1603 We have to stay within the maximum heap size, and leave a certain
1604 percentage of the maximum heap size available to allocate into.
1605 ------------------------------------------------------------------------- */
1606
1607 static void
1608 resize_generations (void)
1609 {
1610 uint32_t g;
1611
1612 if (major_gc && RtsFlags.GcFlags.generations > 1) {
1613 W_ live, size, min_alloc, words;
1614 const W_ max = RtsFlags.GcFlags.maxHeapSize;
1615 const W_ gens = RtsFlags.GcFlags.generations;
1616
1617 // live in the oldest generations
1618 if (oldest_gen->live_estimate != 0) {
1619 words = oldest_gen->live_estimate;
1620 } else {
1621 words = oldest_gen->n_words;
1622 }
1623 live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
1624 oldest_gen->n_large_blocks +
1625 oldest_gen->n_compact_blocks;
1626
1627 // default max size for all generations except zero
1628 size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
1629 RtsFlags.GcFlags.minOldGenSize);
1630
1631 if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
1632 if (max > 0) {
1633 RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
1634 } else {
1635 RtsFlags.GcFlags.heapSizeSuggestion = size;
1636 }
1637 }
1638
1639 // minimum size for generation zero
1640 min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
1641 RtsFlags.GcFlags.minAllocAreaSize
1642 * (W_)n_capabilities);
1643
1644 // Auto-enable compaction when the residency reaches a
1645 // certain percentage of the maximum heap size (default: 30%).
1646 if (RtsFlags.GcFlags.compact ||
1647 (max > 0 &&
1648 oldest_gen->n_blocks >
1649 (RtsFlags.GcFlags.compactThreshold * max) / 100)) {
1650 oldest_gen->mark = 1;
1651 oldest_gen->compact = 1;
1652 // debugBelch("compaction: on\n", live);
1653 } else {
1654 oldest_gen->mark = 0;
1655 oldest_gen->compact = 0;
1656 // debugBelch("compaction: off\n", live);
1657 }
1658
1659 if (RtsFlags.GcFlags.sweep) {
1660 oldest_gen->mark = 1;
1661 }
1662
1663 // if we're going to go over the maximum heap size, reduce the
1664 // size of the generations accordingly. The calculation is
1665 // different if compaction is turned on, because we don't need
1666 // to double the space required to collect the old generation.
1667 if (max != 0) {
1668
1669 // this test is necessary to ensure that the calculations
1670 // below don't have any negative results - we're working
1671 // with unsigned values here.
1672 if (max < min_alloc) {
1673 heapOverflow();
1674 }
1675
1676 if (oldest_gen->compact) {
1677 if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
1678 size = (max - min_alloc) / ((gens - 1) * 2 - 1);
1679 }
1680 } else {
1681 if ( (size * (gens - 1) * 2) + min_alloc > max ) {
1682 size = (max - min_alloc) / ((gens - 1) * 2);
1683 }
1684 }
1685
1686 if (size < live) {
1687 heapOverflow();
1688 }
1689 }
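// Illustration only: with gens = 2, oldGenFactor = 2, live = 1000 blocks and
// min_alloc = 500 blocks, the uncompacted bound requires
// size*2 + min_alloc <= max, so max = 4500 leaves size at 2000 (= live*F),
// while any smaller max shrinks size towards 'live'.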
1690
1691 #if 0
1692 debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
1693 min_alloc, size, max);
1694 #endif
1695
1696 for (g = 0; g < gens; g++) {
1697 generations[g].max_blocks = size;
1698 }
1699 }
1700 }
1701
1702 /* -----------------------------------------------------------------------------
1703 Calculate the new size of the nursery, and resize it.
1704 -------------------------------------------------------------------------- */
1705
1706 static void
1707 resize_nursery (void)
1708 {
1709 const StgWord min_nursery =
1710 RtsFlags.GcFlags.minAllocAreaSize * (StgWord)n_capabilities;
1711
1712 if (RtsFlags.GcFlags.generations == 1)
1713 { // Two-space collector:
1714 W_ blocks;
1715
1716 /* set up a new nursery. Allocate a nursery size based on a
1717 * function of the amount of live data (by default a factor of 2)
1718 * Use the blocks from the old nursery if possible, freeing up any
1719 * left over blocks.
1720 *
1721 * If we get near the maximum heap size, then adjust our nursery
1722 * size accordingly. If the nursery is the same size as the live
1723 * data (L), then we need 3L bytes. We can reduce the size of the
1724 * nursery to bring the required memory down near 2L bytes.
1725 *
1726 * A normal 2-space collector would need 4L bytes to give the same
1727 * performance we get from 3L bytes, reducing to the same
1728 * performance at 2L bytes.
1729 */
1730 blocks = generations[0].n_blocks;
1731
1732 if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
1733 blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
1734 RtsFlags.GcFlags.maxHeapSize )
1735 {
1736 long adjusted_blocks; // signed on purpose
1737 int pc_free;
1738
1739 adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
1740
1741 debugTrace(DEBUG_gc, "near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld",
1742 RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks);
1743
1744 pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
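// Illustration only: with maxHeapSize = 10000 blocks and 3000 blocks of live
// data in g0, adjusted_blocks = 10000 - 2*3000 = 4000 and pc_free = 40, well
// above pcFreeHeap, so the nursery is resized to 4000 blocks rather than
// overflowing the heap.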
1745 if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even * be < 0 */
1746 {
1747 heapOverflow();
1748 }
1749 blocks = adjusted_blocks;
1750 }
1751 else
1752 {
1753 blocks *= RtsFlags.GcFlags.oldGenFactor;
1754 if (blocks < min_nursery)
1755 {
1756 blocks = min_nursery;
1757 }
1758 }
1759 resizeNurseries(blocks);
1760 }
1761 else // Generational collector
1762 {
1763 /*
1764 * If the user has given us a suggested heap size, adjust our
1765 * allocation area to make best use of the memory available.
1766 */
1767 if (RtsFlags.GcFlags.heapSizeSuggestion)
1768 {
1769 long blocks;
1770 StgWord needed;
1771
1772 calcNeeded(false, &needed); // approx blocks needed at next GC
1773
1774 /* Guess how much will be live in generation 0 step 0 next time.
1775 * A good approximation is obtained by finding the
1776 * percentage of g0 that was live at the last minor GC.
1777 *
1778 * We have an accurate figure for the amount of copied data in
1779 * 'copied', but we must convert this to a number of blocks, with
1780 * a small adjustment for estimated slop at the end of a block
1781 * (- 10 words).
1782 */
1783 if (N == 0)
1784 {
1785 g0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100)
1786 / countNurseryBlocks();
1787 }
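// For illustration: if 'copied' amounts to roughly 300 blocks' worth of
// words and the nursery is 1000 blocks, g0_pcnt_kept comes out at about 30,
// i.e. ~30% of g0 survived the last minor GC.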
1788
1789 /* Estimate a size for the allocation area based on the
1790 * information available. We might end up going slightly under
1791 * or over the suggested heap size, but we should be pretty
1792 * close on average.
1793 *
1794 * Formula: suggested - needed
1795 * ----------------------------
1796 * 1 + g0_pcnt_kept/100
1797 *
1798 * where 'needed' is the amount of memory needed at the next
1799 * collection for collecting all gens except g0.
1800 */
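/* A worked example (illustration only): with a heapSizeSuggestion of
 * 100000 blocks, needed = 20000 and g0_pcnt_kept = 30, the formula gives
 * (100000 - 20000) * 100 / 130 ~= 61538 blocks for the allocation area.
 */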
1801 blocks =
1802 (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
1803 (100 + (long)g0_pcnt_kept);
1804
1805 if (blocks < (long)min_nursery) {
1806 blocks = min_nursery;
1807 }
1808
1809 resizeNurseries((W_)blocks);
1810 }
1811 else
1812 {
1813 // we might have added extra blocks to the nursery, so
1814 // resize back to the original size again.
1815 resizeNurseriesFixed();
1816 }
1817 }
1818 }
1819
1820 /* -----------------------------------------------------------------------------
1821 Sanity code for CAF garbage collection.
1822
1823 With DEBUG turned on, we manage a CAF list in addition to the SRT
1824 mechanism. After GC, we run down the CAF list and make any
1825 CAFs which have been garbage collected GCD_CAF. This means we get an error
1826 whenever the program tries to enter a garbage collected CAF.
1827
1828 Any garbage collected CAFs are taken off the CAF list at the same
1829 time.
1830 -------------------------------------------------------------------------- */
1831
1832 #if defined(DEBUG)
1833
1834 static void gcCAFs(void)
1835 {
1836 StgIndStatic *p, *prev;
1837
1838 const StgInfoTable *info;
1839 uint32_t i;
1840
1841 i = 0;
1842 p = debug_caf_list;
1843 prev = NULL;
1844
1845 for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST;
1846 p = (StgIndStatic*)p->saved_info) {
1847
1848 info = get_itbl((StgClosure*)p);
1849 ASSERT(info->type == IND_STATIC);
1850
1851 if (p->static_link == NULL) {
1852 debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p);
1853 SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it
1854 if (prev == NULL) {
1855 debug_caf_list = (StgIndStatic*)p->saved_info;
1856 } else {
1857 prev->saved_info = p->saved_info;
1858 }
1859 } else {
1860 prev = p;
1861 i++;
1862 }
1863 }
1864
1865 debugTrace(DEBUG_gccafs, "%d CAFs live", i);
1866 }
1867 #endif
1868
1869
1870 /* -----------------------------------------------------------------------------
1871 The GC can leave some work for the mutator to do before the next
1872 GC, provided the work can be safely overlapped with mutation. This
1873 can help reduce the GC pause time.
1874
1875 The mutator can call doIdleGCWork() any time it likes, but
1876 preferably when it is idle. It's safe for multiple capabilities to
1877 call doIdleGCWork().
1878
1879 When 'all' is
1880 * false: doIdleGCWork() should only take a short, bounded, amount
1881 of time.
1882 * true: doIdleGCWork() will complete all the outstanding GC work.
1883
1884 The return value is
1885 * true if there's more to do (only if 'all' is false).
1886 * false otherwise.
1887 -------------------------------------------------------------------------- */
1888
1889 bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
1890 {
1891 return runSomeFinalizers(all);
1892 }
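/* A hedged usage sketch (not code from the scheduler): an idle capability
 * could drain the pending work in bounded slices, e.g.
 *
 *     while (doIdleGCWork(cap, false)) {
 *         if (capabilityHasWork(cap)) break;   // hypothetical back-off check
 *     }
 *
 * whereas shutdown-style callers pass all = true to finish everything at
 * once.
 */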