1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 1998-2012
4 *
5 * Storage manager front end
6 *
7 * Documentation on the architecture of the Storage Manager can be
8 * found in the online commentary:
9 *
10 * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage
11 *
12 * ---------------------------------------------------------------------------*/
13
14 #include "PosixSource.h"
15 #include "Rts.h"
16
17 #include "Storage.h"
18 #include "GCThread.h"
19 #include "RtsUtils.h"
20 #include "Stats.h"
21 #include "BlockAlloc.h"
22 #include "Weak.h"
23 #include "Sanity.h"
24 #include "Arena.h"
25 #include "Capability.h"
26 #include "Schedule.h"
27 #include "RetainerProfile.h" // for counting memory blocks (memInventory)
28 #include "OSMem.h"
29 #include "Trace.h"
30 #include "GC.h"
31 #include "Evac.h"
32 #if defined(ios_HOST_OS)
33 #include "Hash.h"
34 #endif
35
36 #include <string.h>
37
38 #include "ffi.h"
39
40 /*
41 * All these globals require sm_mutex to access in THREADED_RTS mode.
42 */
43 StgIndStatic *dyn_caf_list = NULL;
44 StgIndStatic *debug_caf_list = NULL;
45 StgIndStatic *revertible_caf_list = NULL;
46 bool keepCAFs;
47
48 W_ large_alloc_lim; /* GC if n_large_blocks in any nursery
49 * reaches this. */
50
51 bdescr *exec_block;
52
53 generation *generations = NULL; /* all the generations */
54 generation *g0 = NULL; /* generation 0, for convenience */
55 generation *oldest_gen = NULL; /* oldest generation, for convenience */
56
57 /*
58 * Array of nurseries, size == n_capabilities
59 *
60 * nursery[i] belongs to NUMA node (i % n_numa_nodes)
61 * This is chosen to be the same convention as capabilities[i], so
62 * that when not using nursery chunks (+RTS -n), we just map
63 * capabilities to nurseries 1:1.
64 */
65 nursery *nurseries = NULL;
66 uint32_t n_nurseries;
67
68 /*
69 * When we are using nursery chunks, we need a separate next_nursery
70 * pointer for each NUMA node.
71 */
72 volatile StgWord next_nursery[MAX_NUMA_NODES];
73
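/* Editorial illustration (not from the original source): with the
 * conventions above, capabilities, nurseries and NUMA nodes line up as
 * follows for, say, n_numa_nodes == 2 and four capabilities:
 *
 *   cap 0 -> nurseries[0] (node 0)     next_nursery[0] starts at 0
 *   cap 1 -> nurseries[1] (node 1)     next_nursery[1] starts at 1
 *   cap 2 -> nurseries[2] (node 0)
 *   cap 3 -> nurseries[3] (node 1)
 *
 * Each take from next_nursery[node] then advances it by n_numa_nodes, so a
 * node only ever hands out nursery indices that live on that node.
 */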
74 #if defined(THREADED_RTS)
75 /*
76 * Storage manager mutex: protects all the above state from
77 * simultaneous access by two STG threads.
78 */
79 Mutex sm_mutex;
80 #endif
81
82 static void allocNurseries (uint32_t from, uint32_t to);
83 static void assignNurseriesToCapabilities (uint32_t from, uint32_t to);
84
85 static void
86 initGeneration (generation *gen, int g)
87 {
88 gen->no = g;
89 gen->collections = 0;
90 gen->par_collections = 0;
91 gen->failed_promotions = 0;
92 gen->max_blocks = 0;
93 gen->blocks = NULL;
94 gen->n_blocks = 0;
95 gen->n_words = 0;
96 gen->live_estimate = 0;
97 gen->old_blocks = NULL;
98 gen->n_old_blocks = 0;
99 gen->large_objects = NULL;
100 gen->n_large_blocks = 0;
101 gen->n_large_words = 0;
102 gen->n_new_large_words = 0;
103 gen->compact_objects = NULL;
104 gen->n_compact_blocks = 0;
105 gen->compact_blocks_in_import = NULL;
106 gen->n_compact_blocks_in_import = 0;
107 gen->scavenged_large_objects = NULL;
108 gen->n_scavenged_large_blocks = 0;
109 gen->live_compact_objects = NULL;
110 gen->n_live_compact_blocks = 0;
113 gen->mark = 0;
114 gen->compact = 0;
115 gen->bitmap = NULL;
116 #if defined(THREADED_RTS)
117 initSpinLock(&gen->sync);
118 #endif
119 gen->threads = END_TSO_QUEUE;
120 gen->old_threads = END_TSO_QUEUE;
121 gen->weak_ptr_list = NULL;
122 gen->old_weak_ptr_list = NULL;
123 }
124
125 void
126 initStorage (void)
127 {
128 uint32_t g, n;
129
130 if (generations != NULL) {
131 // multi-init protection
132 return;
133 }
134
135 initMBlocks();
136
137 /* Sanity check to make sure the LOOKS_LIKE_ macros appear to be
138 * doing something reasonable.
139 */
140 /* We use the NOT_NULL variant or gcc warns that the test is always true */
141 ASSERT(LOOKS_LIKE_INFO_PTR_NOT_NULL((StgWord)&stg_BLOCKING_QUEUE_CLEAN_info));
142 ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
143 ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
144
145 initBlockAllocator();
146
147 #if defined(THREADED_RTS)
148 initMutex(&sm_mutex);
149 #endif
150
151 ACQUIRE_SM_LOCK;
152
153 /* allocate generation info array */
154 generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
155 * sizeof(struct generation_),
156 "initStorage: gens");
157
158 /* Initialise all generations */
159 for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
160 initGeneration(&generations[g], g);
161 }
162
163 /* A couple of convenience pointers */
164 g0 = &generations[0];
165 oldest_gen = &generations[RtsFlags.GcFlags.generations-1];
166
167 /* Set up the destination pointers in each younger gen. step */
168 for (g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
169 generations[g].to = &generations[g+1];
170 }
171 oldest_gen->to = oldest_gen;
172
173 /* The oldest generation has one step. */
174 if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) {
175 if (RtsFlags.GcFlags.generations == 1) {
176 errorBelch("WARNING: compact/sweep is incompatible with -G1; disabled");
177 } else {
178 oldest_gen->mark = 1;
179 if (RtsFlags.GcFlags.compact)
180 oldest_gen->compact = 1;
181 }
182 }
183
184 generations[0].max_blocks = 0;
185
186 dyn_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
187 debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
188 revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
189
190 if (RtsFlags.GcFlags.largeAllocLim > 0) {
191 large_alloc_lim = RtsFlags.GcFlags.largeAllocLim * BLOCK_SIZE_W;
192 } else {
193 large_alloc_lim = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE_W;
194 }
195
196 exec_block = NULL;
197
198 #if defined(THREADED_RTS)
199 initSpinLock(&gc_alloc_block_sync);
200 #endif
201 N = 0;
202
203 for (n = 0; n < n_numa_nodes; n++) {
204 next_nursery[n] = n;
205 }
206 storageAddCapabilities(0, n_capabilities);
207
208 IF_DEBUG(gc, statDescribeGens());
209
210 RELEASE_SM_LOCK;
211
212 traceEventHeapInfo(CAPSET_HEAP_DEFAULT,
213 RtsFlags.GcFlags.generations,
214 RtsFlags.GcFlags.maxHeapSize * BLOCK_SIZE,
215 RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE,
216 MBLOCK_SIZE,
217 BLOCK_SIZE);
218 }
219
220 void storageAddCapabilities (uint32_t from, uint32_t to)
221 {
222 uint32_t n, g, i, new_n_nurseries;
223
224 if (RtsFlags.GcFlags.nurseryChunkSize == 0) {
225 new_n_nurseries = to;
226 } else {
227 memcount total_alloc = to * RtsFlags.GcFlags.minAllocAreaSize;
228 new_n_nurseries =
229 stg_max(to, total_alloc / RtsFlags.GcFlags.nurseryChunkSize);
230 }
231
232 if (from > 0) {
233 nurseries = stgReallocBytes(nurseries,
234 new_n_nurseries * sizeof(struct nursery_),
235 "storageAddCapabilities");
236 } else {
237 nurseries = stgMallocBytes(new_n_nurseries * sizeof(struct nursery_),
238 "storageAddCapabilities");
239 }
240
241 // we've moved the nurseries, so we have to update the rNursery
242 // pointers from the Capabilities.
243 for (i = 0; i < to; i++) {
244 capabilities[i]->r.rNursery = &nurseries[i];
245 }
246
247 /* The allocation area. Policy: keep the allocation area
248 * small to begin with, even if we have a large suggested heap
249 * size. Reason: we're going to do a major collection first, and we
250 * don't want it to be a big one. This vague idea is borne out by
251 * rigorous experimental evidence.
252 */
253 allocNurseries(n_nurseries, new_n_nurseries);
254 n_nurseries = new_n_nurseries;
255
256 /*
257 * Assign each of the new capabilities a nursery. Remember to start from
258 * next_nursery, because we may have already consumed some of the earlier
259 * nurseries.
260 */
261 assignNurseriesToCapabilities(from,to);
262
263 // allocate a block for each mut list
264 for (n = from; n < to; n++) {
265 for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
266 capabilities[n]->mut_lists[g] =
267 allocBlockOnNode(capNoToNumaNode(n));
268 }
269 }
270
271 #if defined(THREADED_RTS) && defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 0)
272 newThreadLocalKey(&gctKey);
273 #endif
274
275 initGcThreads(from, to);
276 }
277
278
279 void
280 exitStorage (void)
281 {
282 updateNurseriesStats();
283 stat_exit();
284 }
285
286 void
287 freeStorage (bool free_heap)
288 {
289 stgFree(generations);
290 if (free_heap) freeAllMBlocks();
291 #if defined(THREADED_RTS)
292 closeMutex(&sm_mutex);
293 #endif
294 stgFree(nurseries);
295 #if defined(THREADED_RTS) && defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 0)
296 freeThreadLocalKey(&gctKey);
297 #endif
298 freeGcThreads();
299 }
300
301 /* -----------------------------------------------------------------------------
302 Note [CAF management].
303
304 The entry code for every CAF does the following:
305
306    - calls newCAF, which builds a CAF_BLACKHOLE on the heap and atomically
307 updates the CAF with IND_STATIC pointing to the CAF_BLACKHOLE
308
309    - if newCAF returns zero, it re-enters the CAF (see Note [atomic
310 CAF entry])
311
312 - pushes an update frame pointing to the CAF_BLACKHOLE
313
314 Why do we build a BLACKHOLE in the heap rather than just updating
315 the thunk directly? It's so that we only need one kind of update
316 frame - otherwise we'd need a static version of the update frame
317 too, and various other parts of the RTS that deal with update
318 frames would also need special cases for static update frames.
319
320    newCAF() does the following:
321
322 - atomically locks the CAF (see [atomic CAF entry])
323
324 - it builds a CAF_BLACKHOLE on the heap
325
326 - it updates the CAF with an IND_STATIC pointing to the
327 CAF_BLACKHOLE, atomically.
328
329 - it puts the CAF on the oldest generation's mutable list.
330 This is so that we treat the CAF as a root when collecting
331 younger generations.
332
333 - links the CAF onto the CAF list (see below)
334
335 ------------------
336 Note [atomic CAF entry]
337
338    With THREADED_RTS, newCAF() is required to be atomic (see
339 #5558). This is because if two threads happened to enter the same
340 CAF simultaneously, they would create two distinct CAF_BLACKHOLEs,
341 and so the normal threadPaused() machinery for detecting duplicate
342 evaluation will not detect this. Hence in lockCAF() below, we
343 atomically lock the CAF with WHITEHOLE before updating it with
344 IND_STATIC, and return zero if another thread locked the CAF first.
345 In the event that we lost the race, CAF entry code will re-enter
346 the CAF and block on the other thread's CAF_BLACKHOLE.
347
348 ------------------
349 Note [GHCi CAFs]
350
351    For GHCi, we have additional requirements when dealing with CAFs:
352
353 - we must *retain* all dynamically-loaded CAFs ever entered,
354 just in case we need them again.
355 - we must be able to *revert* CAFs that have been evaluated, to
356 their pre-evaluated form.
357
358 To do this, we use an additional CAF list. When newCaf() is
359 called on a dynamically-loaded CAF, we add it to the CAF list
360 instead of the old-generation mutable list, and save away its
361 old info pointer (in caf->saved_info) for later reversion.
362
363 To revert all the CAFs, we traverse the CAF list and reset the
364 info pointer to caf->saved_info, then throw away the CAF list.
365 (see GC.c:revertCAFs()).
366
367 -- SDM 29/1/01
368
369 -------------------------------------------------------------------------- */
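/* Editorial sketch (not part of this file): the reversion walk described
 * above amounts to roughly the loop below; the authoritative version is
 * GC.c:revertCAFs(), and for clarity this ignores the STATIC_FLAG_LIST tag
 * bit that the list links actually carry.
 *
 *   StgIndStatic *c;
 *   for (c = revertible_caf_list;
 *        c != (StgIndStatic *)END_OF_CAF_LIST;
 *        c = (StgIndStatic *)c->static_link) {
 *       SET_INFO((StgClosure *)c, c->saved_info);  // back to the unevaluated CAF
 *       c->saved_info = NULL;
 *   }
 *   revertible_caf_list = (StgIndStatic *)END_OF_CAF_LIST;
 */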
370
371 STATIC_INLINE StgInd *
372 lockCAF (StgRegTable *reg, StgIndStatic *caf)
373 {
374 const StgInfoTable *orig_info;
375 Capability *cap = regTableToCapability(reg);
376 StgInd *bh;
377
378 orig_info = caf->header.info;
379
380 #if defined(THREADED_RTS)
381 const StgInfoTable *cur_info;
382
383 if (orig_info == &stg_IND_STATIC_info ||
384 orig_info == &stg_WHITEHOLE_info) {
385 // already claimed by another thread; re-enter the CAF
386 return NULL;
387 }
388
389 cur_info = (const StgInfoTable *)
390 cas((StgVolatilePtr)&caf->header.info,
391 (StgWord)orig_info,
392 (StgWord)&stg_WHITEHOLE_info);
393
394 if (cur_info != orig_info) {
395 // already claimed by another thread; re-enter the CAF
396 return NULL;
397 }
398
399 // successfully claimed by us; overwrite with IND_STATIC
400 #endif
401
402 // For the benefit of revertCAFs(), save the original info pointer
403 caf->saved_info = orig_info;
404
405 // Allocate the blackhole indirection closure
406 bh = (StgInd *)allocate(cap, sizeofW(*bh));
407 SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
408 bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
409
410 caf->indirectee = (StgClosure *)bh;
411 write_barrier();
412 SET_INFO((StgClosure*)caf,&stg_IND_STATIC_info);
413
414 return bh;
415 }
416
417 StgInd *
418 newCAF(StgRegTable *reg, StgIndStatic *caf)
419 {
420 StgInd *bh;
421
422 bh = lockCAF(reg, caf);
423 if (!bh) return NULL;
424
425 if(keepCAFs)
426 {
427 // Note [dyn_caf_list]
428 // If we are in GHCi _and_ we are using dynamic libraries,
429 // then we can't redirect newCAF calls to newRetainedCAF (see below),
430 // so we make newCAF behave almost like newRetainedCAF.
431 // The dynamic libraries might be used by both the interpreted
432 // program and GHCi itself, so they must not be reverted.
433 // This also means that in GHCi with dynamic libraries, CAFs are not
434 // garbage collected. If this turns out to be a problem, we could
435 // do another hack here and do an address range test on caf to figure
436 // out whether it is from a dynamic library.
437
438 ACQUIRE_SM_LOCK; // dyn_caf_list is global, locked by sm_mutex
439 caf->static_link = (StgClosure*)dyn_caf_list;
440 dyn_caf_list = (StgIndStatic*)((StgWord)caf | STATIC_FLAG_LIST);
441 RELEASE_SM_LOCK;
442 }
443 else
444 {
445 // Put this CAF on the mutable list for the old generation.
446 if (oldest_gen->no != 0) {
447 recordMutableCap((StgClosure*)caf,
448 regTableToCapability(reg), oldest_gen->no);
449 }
450
451 #if defined(DEBUG)
452 // In the DEBUG rts, we keep track of live CAFs by chaining them
453 // onto a list debug_caf_list. This is so that we can tell if we
454 // ever enter a GC'd CAF, and emit a suitable barf().
455 //
456 // The saved_info field of the CAF is used as the link field for
457         // debug_caf_list, because this field is only used by newRetainedCAF
458 // for revertible CAFs, and we don't put those on the
459 // debug_caf_list.
460 ACQUIRE_SM_LOCK; // debug_caf_list is global, locked by sm_mutex
461 ((StgIndStatic *)caf)->saved_info = (const StgInfoTable*)debug_caf_list;
462 debug_caf_list = (StgIndStatic*)caf;
463 RELEASE_SM_LOCK;
464 #endif
465 }
466
467 return bh;
468 }
469
470 // External API for setting the keepCAFs flag. See #3900.
471 void
472 setKeepCAFs (void)
473 {
474 keepCAFs = 1;
475 }
476
477 // An alternate version of newCAF which is used for dynamically loaded
478 // object code in GHCi. In this case we want to retain *all* CAFs in
479 // the object code, because they might be demanded at any time from an
480 // expression evaluated on the command line.
481 // Also, GHCi might want to revert CAFs, so we add these to the
482 // revertible_caf_list.
483 //
484 // The linker hackily arranges that references to newCAF from dynamic
485 // code end up pointing to newRetainedCAF.
486 //
487 StgInd* newRetainedCAF (StgRegTable *reg, StgIndStatic *caf)
488 {
489 StgInd *bh;
490
491 bh = lockCAF(reg, caf);
492 if (!bh) return NULL;
493
494 ACQUIRE_SM_LOCK;
495
496 caf->static_link = (StgClosure*)revertible_caf_list;
497 revertible_caf_list = (StgIndStatic*)((StgWord)caf | STATIC_FLAG_LIST);
498
499 RELEASE_SM_LOCK;
500
501 return bh;
502 }
503
504 // If we are using loadObj/unloadObj in the linker, then we want to
505 //
506 // - retain all CAFs in statically linked code (keepCAFs == true),
507 // because we might link a new object that uses any of these CAFs.
508 //
509 // - GC CAFs in dynamically-linked code, so that we can detect when
510 // a dynamically-linked object is unloadable.
511 //
512 // So for this case, we set keepCAFs to true, and link newCAF to newGCdCAF
513 // for dynamically-linked code.
514 //
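// Editorial sketch of the policy described above, as hypothetical
// linker-side pseudo-code (the real decision is made in the linker, and
// 'bind_newCAF_to' is an invented name, not an RTS function):
//
//   if (using_loadObj_unloadObj) {
//       setKeepCAFs();                      // retain CAFs in static code
//       bind_newCAF_to(newGCdCAF);          // dynamic code: CAFs can be GC'd
//   } else if (ghci_with_dynamic_libs) {
//       bind_newCAF_to(newRetainedCAF);     // retain and allow reversion
//   }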
515 StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf)
516 {
517 StgInd *bh;
518
519 bh = lockCAF(reg, caf);
520 if (!bh) return NULL;
521
522 // Put this CAF on the mutable list for the old generation.
523 if (oldest_gen->no != 0) {
524 recordMutableCap((StgClosure*)caf,
525 regTableToCapability(reg), oldest_gen->no);
526 }
527
528 return bh;
529 }
530
531 /* -----------------------------------------------------------------------------
532 Nursery management.
533 -------------------------------------------------------------------------- */
534
535 static bdescr *
536 allocNursery (uint32_t node, bdescr *tail, W_ blocks)
537 {
538 bdescr *bd = NULL;
539 W_ i, n;
540
541 // We allocate the nursery as a single contiguous block and then
542 // divide it into single blocks manually. This way we guarantee
543 // that the nursery blocks are adjacent, so that the processor's
544 // automatic prefetching works across nursery blocks. This is a
545 // tiny optimisation (~0.5%), but it's free.
546
547 while (blocks > 0) {
548 n = stg_min(BLOCKS_PER_MBLOCK, blocks);
549 // allocLargeChunk will prefer large chunks, but will pick up
550 // small chunks if there are any available. We must allow
551 // single blocks here to avoid fragmentation (#7257)
552 bd = allocLargeChunkOnNode(node, 1, n);
553 n = bd->blocks;
554 blocks -= n;
555
556 for (i = 0; i < n; i++) {
557 initBdescr(&bd[i], g0, g0);
558
559 bd[i].blocks = 1;
560 bd[i].flags = 0;
561
562 if (i > 0) {
563 bd[i].u.back = &bd[i-1];
564 } else {
565 bd[i].u.back = NULL;
566 }
567
568 if (i+1 < n) {
569 bd[i].link = &bd[i+1];
570 } else {
571 bd[i].link = tail;
572 if (tail != NULL) {
573 tail->u.back = &bd[i];
574 }
575 }
576
577 bd[i].free = bd[i].start;
578 }
579
580 tail = &bd[0];
581 }
582
583 return &bd[0];
584 }
585
586 STATIC_INLINE void
587 assignNurseryToCapability (Capability *cap, uint32_t n)
588 {
589 ASSERT(n < n_nurseries);
590 cap->r.rNursery = &nurseries[n];
591 cap->r.rCurrentNursery = nurseries[n].blocks;
592 newNurseryBlock(nurseries[n].blocks);
593 cap->r.rCurrentAlloc = NULL;
594 ASSERT(cap->r.rCurrentNursery->node == cap->node);
595 }
596
597 /*
598  * Give each Capability a nursery from the pool. No need to do atomic increments
599  * here; everything must be stopped to call this function.
600 */
601 static void
602 assignNurseriesToCapabilities (uint32_t from, uint32_t to)
603 {
604 uint32_t i, node;
605
606 for (i = from; i < to; i++) {
607 node = capabilities[i]->node;
608 assignNurseryToCapability(capabilities[i], next_nursery[node]);
609 next_nursery[node] += n_numa_nodes;
610 }
611 }
612
613 static void
614 allocNurseries (uint32_t from, uint32_t to)
615 {
616 uint32_t i;
617 memcount n_blocks;
618
619 if (RtsFlags.GcFlags.nurseryChunkSize) {
620 n_blocks = RtsFlags.GcFlags.nurseryChunkSize;
621 } else {
622 n_blocks = RtsFlags.GcFlags.minAllocAreaSize;
623 }
624
625 for (i = from; i < to; i++) {
626 nurseries[i].blocks = allocNursery(capNoToNumaNode(i), NULL, n_blocks);
627 nurseries[i].n_blocks = n_blocks;
628 }
629 }
630
631 void
632 resetNurseries (void)
633 {
634 uint32_t n;
635
636 for (n = 0; n < n_numa_nodes; n++) {
637 next_nursery[n] = n;
638 }
639 assignNurseriesToCapabilities(0, n_capabilities);
640
641 #if defined(DEBUG)
642 bdescr *bd;
643 for (n = 0; n < n_nurseries; n++) {
644 for (bd = nurseries[n].blocks; bd; bd = bd->link) {
645 ASSERT(bd->gen_no == 0);
646 ASSERT(bd->gen == g0);
647 ASSERT(bd->node == capNoToNumaNode(n));
648 IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE));
649 }
650 }
651 #endif
652 }
653
654 W_
655 countNurseryBlocks (void)
656 {
657 uint32_t i;
658 W_ blocks = 0;
659
660 for (i = 0; i < n_nurseries; i++) {
661 blocks += nurseries[i].n_blocks;
662 }
663 return blocks;
664 }
665
666 //
667 // Resize each of the nurseries to the specified size.
668 //
669 static void
670 resizeNurseriesEach (W_ blocks)
671 {
672 uint32_t i, node;
673 bdescr *bd;
674 W_ nursery_blocks;
675 nursery *nursery;
676
677 for (i = 0; i < n_nurseries; i++) {
678 nursery = &nurseries[i];
679 nursery_blocks = nursery->n_blocks;
680 if (nursery_blocks == blocks) continue;
681
682 node = capNoToNumaNode(i);
683 if (nursery_blocks < blocks) {
684             debugTrace(DEBUG_gc, "increasing size of nursery to %" FMT_Word " blocks",
685                        blocks);
686 nursery->blocks = allocNursery(node, nursery->blocks,
687 blocks-nursery_blocks);
688 }
689 else
690 {
691 bdescr *next_bd;
692
693             debugTrace(DEBUG_gc, "decreasing size of nursery to %" FMT_Word " blocks",
694                        blocks);
695
696 bd = nursery->blocks;
697 while (nursery_blocks > blocks) {
698 next_bd = bd->link;
699 next_bd->u.back = NULL;
700 nursery_blocks -= bd->blocks; // might be a large block
701 freeGroup(bd);
702 bd = next_bd;
703 }
704 nursery->blocks = bd;
705 // might have gone just under, by freeing a large block, so make
706 // up the difference.
707 if (nursery_blocks < blocks) {
708 nursery->blocks = allocNursery(node, nursery->blocks,
709 blocks-nursery_blocks);
710 }
711 }
712 nursery->n_blocks = blocks;
713 ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks);
714 }
715 }
716
717 void
718 resizeNurseriesFixed (void)
719 {
720 uint32_t blocks;
721
722 if (RtsFlags.GcFlags.nurseryChunkSize) {
723 blocks = RtsFlags.GcFlags.nurseryChunkSize;
724 } else {
725 blocks = RtsFlags.GcFlags.minAllocAreaSize;
726 }
727
728 resizeNurseriesEach(blocks);
729 }
730
731 //
732 // Resize the nurseries to the total specified size.
733 //
734 void
735 resizeNurseries (W_ blocks)
736 {
737 // If there are multiple nurseries, then we just divide the number
738 // of available blocks between them.
739 resizeNurseriesEach(blocks / n_nurseries);
740 }
741
742 bool
743 getNewNursery (Capability *cap)
744 {
745 StgWord i;
746 uint32_t node = cap->node;
747 uint32_t n;
748
749 for(;;) {
750 i = next_nursery[node];
751 if (i < n_nurseries) {
752 if (cas(&next_nursery[node], i, i+n_numa_nodes) == i) {
753 assignNurseryToCapability(cap, i);
754 return true;
755 }
756 } else if (n_numa_nodes > 1) {
757 // Try to find an unused nursery chunk on other nodes. We'll get
758 // remote memory, but the rationale is that avoiding GC is better
759 // than avoiding remote memory access.
760 bool lost = false;
761 for (n = 0; n < n_numa_nodes; n++) {
762 if (n == node) continue;
763 i = next_nursery[n];
764 if (i < n_nurseries) {
765 if (cas(&next_nursery[n], i, i+n_numa_nodes) == i) {
766 assignNurseryToCapability(cap, i);
767 return true;
768 } else {
769 lost = true; /* lost a race */
770 }
771 }
772 }
773 if (!lost) return false;
774 } else {
775 return false;
776 }
777 }
778 }
779
780 /* -----------------------------------------------------------------------------
781 move_STACK is called to update the TSO structure after it has been
782 moved from one place to another.
783 -------------------------------------------------------------------------- */
784
785 void
786 move_STACK (StgStack *src, StgStack *dest)
787 {
788 ptrdiff_t diff;
789
790 // relocate the stack pointer...
791 diff = (StgPtr)dest - (StgPtr)src; // In *words*
792 dest->sp = (StgPtr)dest->sp + diff;
793 }
794
795 STATIC_INLINE void
796 accountAllocation(Capability *cap, W_ n)
797 {
798 TICK_ALLOC_HEAP_NOCTR(WDS(n));
799 CCS_ALLOC(cap->r.rCCCS,n);
800 if (cap->r.rCurrentTSO != NULL) {
801 // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_)
802 ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit),
803 (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit))
804 - n*sizeof(W_)));
805 }
806
807 }
808
809 /* -----------------------------------------------------------------------------
810 StgPtr allocate (Capability *cap, W_ n)
811
812 Allocates an area of memory n *words* large, from the nursery of
813 the supplied Capability, or from the global block pool if the area
814 requested is larger than LARGE_OBJECT_THRESHOLD. Memory is not
815 allocated from the current nursery block, so as not to interfere
816 with Hp/HpLim.
817
818 The address of the allocated memory is returned. allocate() never
819 fails; if it returns, the returned value is a valid address. If
820 the nursery is already full, then another block is allocated from
821 the global block pool. If we need to get memory from the OS and
822 that operation fails, then the whole process will be killed.
823 -------------------------------------------------------------------------- */
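/* Editorial usage sketch (not from the original source; 'v' is a
 * hypothetical payload): a typical RTS-internal caller allocates and then
 * initialises the header and payload before the object becomes reachable,
 * e.g. building a MUT_VAR:
 *
 *   StgMutVar *mv = (StgMutVar *)allocate(cap, sizeofW(StgMutVar));
 *   SET_HDR(mv, &stg_MUT_VAR_DIRTY_info, CCS_SYSTEM);
 *   mv->var = v;
 *
 * allocate() hands back uninitialised words; writing a valid info pointer
 * is the caller's job.
 */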
824
825 /*
826  * Allocate n words of heap memory; terminating
827 * on heap overflow
828 */
829 StgPtr
830 allocate (Capability *cap, W_ n)
831 {
832 StgPtr p = allocateMightFail(cap, n);
833 if (p == NULL) {
834 reportHeapOverflow();
835 // heapOverflow() doesn't exit (see #2592), but we aren't
836 // in a position to do a clean shutdown here: we
837 // either have to allocate the memory or exit now.
838 // Allocating the memory would be bad, because the user
839 // has requested that we not exceed maxHeapSize, so we
840 // just exit.
841 stg_exit(EXIT_HEAPOVERFLOW);
842 }
843 return p;
844 }
845
846 /*
847  * Allocate n words of heap memory; returning NULL
848 * on heap overflow
849 */
850 StgPtr
851 allocateMightFail (Capability *cap, W_ n)
852 {
853 bdescr *bd;
854 StgPtr p;
855
856 if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
857 // The largest number of words such that
858 // the computation of req_blocks will not overflow.
859 W_ max_words = (HS_WORD_MAX & ~(BLOCK_SIZE-1)) / sizeof(W_);
860 W_ req_blocks;
861
862 if (n > max_words)
863 req_blocks = HS_WORD_MAX; // signal overflow below
864 else
865 req_blocks = (W_)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
866
867 // Attempting to allocate an object larger than maxHeapSize
868 // should definitely be disallowed. (bug #1791)
869 if ((RtsFlags.GcFlags.maxHeapSize > 0 &&
870 req_blocks >= RtsFlags.GcFlags.maxHeapSize) ||
871 req_blocks >= HS_INT32_MAX) // avoid overflow when
872 // calling allocGroup() below
873 {
874 return NULL;
875 }
876
877 // Only credit allocation after we've passed the size check above
878 accountAllocation(cap, n);
879
880 ACQUIRE_SM_LOCK
881 bd = allocGroupOnNode(cap->node,req_blocks);
882 dbl_link_onto(bd, &g0->large_objects);
883 g0->n_large_blocks += bd->blocks; // might be larger than req_blocks
884 g0->n_new_large_words += n;
885 RELEASE_SM_LOCK;
886 initBdescr(bd, g0, g0);
887 bd->flags = BF_LARGE;
888 bd->free = bd->start + n;
889 cap->total_allocated += n;
890 return bd->start;
891 }
892
893 /* small allocation (<LARGE_OBJECT_THRESHOLD) */
894
895 accountAllocation(cap, n);
896 bd = cap->r.rCurrentAlloc;
897 if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
898
899 if (bd) finishedNurseryBlock(cap,bd);
900
901 // The CurrentAlloc block is full, we need to find another
902 // one. First, we try taking the next block from the
903 // nursery:
904 bd = cap->r.rCurrentNursery->link;
905
906 if (bd == NULL) {
907 // The nursery is empty: allocate a fresh block (we can't
908 // fail here).
909 ACQUIRE_SM_LOCK;
910 bd = allocBlockOnNode(cap->node);
911 cap->r.rNursery->n_blocks++;
912 RELEASE_SM_LOCK;
913 initBdescr(bd, g0, g0);
914 bd->flags = 0;
915 // If we had to allocate a new block, then we'll GC
916 // pretty quickly now, because MAYBE_GC() will
917 // notice that CurrentNursery->link is NULL.
918 } else {
919 newNurseryBlock(bd);
920 // we have a block in the nursery: take it and put
921 // it at the *front* of the nursery list, and use it
922 // to allocate() from.
923 //
924 // Previously the nursery looked like this:
925 //
926 // CurrentNursery
927 // /
928 // +-+ +-+
929 // nursery -> ... |A| -> |B| -> ...
930 // +-+ +-+
931 //
932 // After doing this, it looks like this:
933 //
934 // CurrentNursery
935 // /
936 // +-+ +-+
937 // nursery -> |B| -> ... -> |A| -> ...
938 // +-+ +-+
939 // |
940 // CurrentAlloc
941 //
942 // The point is to get the block out of the way of the
943 // advancing CurrentNursery pointer, while keeping it
944 // on the nursery list so we don't lose track of it.
945 cap->r.rCurrentNursery->link = bd->link;
946 if (bd->link != NULL) {
947 bd->link->u.back = cap->r.rCurrentNursery;
948 }
949 }
950 dbl_link_onto(bd, &cap->r.rNursery->blocks);
951 cap->r.rCurrentAlloc = bd;
952 IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
953 }
954 p = bd->free;
955 bd->free += n;
956
957 IF_DEBUG(sanity, ASSERT(*((StgWord8*)p) == 0xaa));
958 return p;
959 }
960
961 /* ---------------------------------------------------------------------------
962 Allocate a fixed/pinned object.
963
964 We allocate small pinned objects into a single block, allocating a
965 new block when the current one overflows. The block is chained
966 onto the large_object_list of generation 0.
967
968 NOTE: The GC can't in general handle pinned objects. This
969 interface is only safe to use for ByteArrays, which have no
970 pointers and don't require scavenging. It works because the
971 block's descriptor has the BF_LARGE flag set, so the block is
972 treated as a large object and chained onto various lists, rather
973 than the individual objects being copied. However, when it comes
974 to scavenge the block, the GC will only scavenge the first object.
975 The reason is that the GC can't linearly scan a block of pinned
976 objects at the moment (doing so would require using the
977 mostly-copying techniques). But since we're restricting ourselves
978 to pinned ByteArrays, not scavenging is ok.
979
980 This function is called by newPinnedByteArray# which immediately
981 fills the allocated memory with a MutableByteArray#. Note that
982 this returns NULL on heap overflow.
983 ------------------------------------------------------------------------- */
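/* Editorial usage sketch (a simplified C rendering of what the
 * newPinnedByteArray# primop in PrimOps.cmm does; 'n_bytes' is
 * hypothetical):
 *
 *   W_ words = sizeofW(StgArrBytes)
 *            + (n_bytes + sizeof(W_) - 1) / sizeof(W_);
 *   StgArrBytes *arr = (StgArrBytes *)allocatePinned(cap, words);
 *   if (arr == NULL) return NULL;           // heap overflow, caller handles
 *   SET_HDR(arr, &stg_ARR_WORDS_info, CCS_SYSTEM);
 *   arr->bytes = n_bytes;
 */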
984
985 StgPtr
986 allocatePinned (Capability *cap, W_ n)
987 {
988 StgPtr p;
989 bdescr *bd;
990
991 // If the request is for a large object, then allocate()
992 // will give us a pinned object anyway.
993 if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
994 p = allocateMightFail(cap, n);
995 if (p == NULL) {
996 return NULL;
997 } else {
998 Bdescr(p)->flags |= BF_PINNED;
999 return p;
1000 }
1001 }
1002
1003 accountAllocation(cap, n);
1004 bd = cap->pinned_object_block;
1005
1006 // If we don't have a block of pinned objects yet, or the current
1007 // one isn't large enough to hold the new object, get a new one.
1008 if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
1009
1010 // stash the old block on cap->pinned_object_blocks. On the
1011 // next GC cycle these objects will be moved to
1012 // g0->large_objects.
1013 if (bd != NULL) {
1014 // add it to the allocation stats when the block is full
1015 finishedNurseryBlock(cap, bd);
1016 dbl_link_onto(bd, &cap->pinned_object_blocks);
1017 }
1018
1019 // We need to find another block. We could just allocate one,
1020 // but that means taking a global lock and we really want to
1021 // avoid that (benchmarks that allocate a lot of pinned
1022 // objects scale really badly if we do this).
1023 //
1024 // So first, we try taking the next block from the nursery, in
1025 // the same way as allocate().
1026 bd = cap->r.rCurrentNursery->link;
1027 if (bd == NULL) {
1028 // The nursery is empty: allocate a fresh block (we can't fail
1029 // here).
1030 ACQUIRE_SM_LOCK;
1031 bd = allocBlockOnNode(cap->node);
1032 RELEASE_SM_LOCK;
1033 initBdescr(bd, g0, g0);
1034 } else {
1035 newNurseryBlock(bd);
1036 // we have a block in the nursery: steal it
1037 cap->r.rCurrentNursery->link = bd->link;
1038 if (bd->link != NULL) {
1039 bd->link->u.back = cap->r.rCurrentNursery;
1040 }
1041 cap->r.rNursery->n_blocks -= bd->blocks;
1042 }
1043
1044 cap->pinned_object_block = bd;
1045 bd->flags = BF_PINNED | BF_LARGE | BF_EVACUATED;
1046
1047 // The pinned_object_block remains attached to the capability
1048 // until it is full, even if a GC occurs. We want this
1049 // behaviour because otherwise the unallocated portion of the
1050 // block would be forever slop, and under certain workloads
1051 // (allocating a few ByteStrings per GC) we accumulate a lot
1052 // of slop.
1053 //
1054 // So, the pinned_object_block is initially marked
1055 // BF_EVACUATED so the GC won't touch it. When it is full,
1056 // we place it on the large_objects list, and at the start of
1057 // the next GC the BF_EVACUATED flag will be cleared, and the
1058 // block will be promoted as usual (if anything in it is
1059 // live).
1060 }
1061
1062 p = bd->free;
1063 bd->free += n;
1064 return p;
1065 }
1066
1067 /* -----------------------------------------------------------------------------
1068 Write Barriers
1069 -------------------------------------------------------------------------- */
1070
1071 /*
1072 This is the write barrier for MUT_VARs, a.k.a. IORefs. A
1073 MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
1074 is. When written to, a MUT_VAR_CLEAN turns into a MUT_VAR_DIRTY
1075 and is put on the mutable list.
1076 */
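/* Editorial sketch: conceptually a mutator write to an IORef pairs the
 * store with this barrier (the real code is the writeMutVar# primop in
 * PrimOps.cmm; 'mv' and 'new_value' are hypothetical):
 *
 *   if (mv->header.info == &stg_MUT_VAR_CLEAN_info) {
 *       dirty_MUT_VAR(&cap->r, (StgClosure *)mv);  // CLEAN -> DIRTY + mut list
 *   }
 *   mv->var = new_value;
 */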
1077 void
1078 dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
1079 {
1080 Capability *cap = regTableToCapability(reg);
1081 if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
1082 p->header.info = &stg_MUT_VAR_DIRTY_info;
1083 recordClosureMutated(cap,p);
1084 }
1085 }
1086
1087 void
1088 dirty_TVAR(Capability *cap, StgTVar *p)
1089 {
1090 if (p->header.info == &stg_TVAR_CLEAN_info) {
1091 p->header.info = &stg_TVAR_DIRTY_info;
1092 recordClosureMutated(cap,(StgClosure*)p);
1093 }
1094 }
1095
1096 // Setting a TSO's link field with a write barrier.
1097 // It is *not* necessary to call this function when
1098 // * setting the link field to END_TSO_QUEUE
1099 // * setting the link field of the currently running TSO, as it
1100 // will already be dirty.
1101 void
1102 setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
1103 {
1104 if (tso->dirty == 0) {
1105 tso->dirty = 1;
1106 recordClosureMutated(cap,(StgClosure*)tso);
1107 }
1108 tso->_link = target;
1109 }
1110
1111 void
1112 setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
1113 {
1114 if (tso->dirty == 0) {
1115 tso->dirty = 1;
1116 recordClosureMutated(cap,(StgClosure*)tso);
1117 }
1118 tso->block_info.prev = target;
1119 }
1120
1121 void
1122 dirty_TSO (Capability *cap, StgTSO *tso)
1123 {
1124 if (tso->dirty == 0) {
1125 tso->dirty = 1;
1126 recordClosureMutated(cap,(StgClosure*)tso);
1127 }
1128 }
1129
1130 void
1131 dirty_STACK (Capability *cap, StgStack *stack)
1132 {
1133 if (stack->dirty == 0) {
1134 stack->dirty = 1;
1135 recordClosureMutated(cap,(StgClosure*)stack);
1136 }
1137 }
1138
1139 /*
1140   This is the write barrier for MVARs. An MVAR_CLEAN object is not
1141   on the mutable list; an MVAR_DIRTY is. When written to, an
1142   MVAR_CLEAN turns into an MVAR_DIRTY and is put on the mutable list.
1143   The check for MVAR_CLEAN is inlined at the call site for speed;
1144   this really does make a difference on concurrency-heavy benchmarks
1145   such as Chameneos and cheap-concurrency.
1146 */
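/* Editorial sketch of the inlined call-site check mentioned above (the real
 * callers are Cmm primops such as takeMVar#/putMVar#; this is a simplified
 * C rendering and 'mvar' is hypothetical):
 *
 *   if (mvar->header.info == &stg_MVAR_CLEAN_info) {
 *       dirty_MVAR(&cap->r, (StgClosure *)mvar);   // put it on the mut list
 *   }
 *   SET_INFO((StgClosure *)mvar, &stg_MVAR_DIRTY_info);
 */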
1147 void
1148 dirty_MVAR(StgRegTable *reg, StgClosure *p)
1149 {
1150 recordClosureMutated(regTableToCapability(reg),p);
1151 }
1152
1153 /* -----------------------------------------------------------------------------
1154 * Stats and stuff
1155 * -------------------------------------------------------------------------- */
1156
1157 /* -----------------------------------------------------------------------------
1158  * Note [allocation accounting]
1159 *
1160  * - When cap->r.rCurrentNursery moves to a new block in the nursery,
1161 * we add the size of the used portion of the previous block to
1162 * cap->total_allocated. (see finishedNurseryBlock())
1163 *
1164 * - When we start a GC, the allocated portion of CurrentNursery and
1165 * CurrentAlloc are added to cap->total_allocated. (see
1166 * updateNurseriesStats())
1167 *
1168 * -------------------------------------------------------------------------- */
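/* Editorial sketch of the bookkeeping referred to above: finishedNurseryBlock()
 * (an inline in Storage.h) conceptually does
 *
 *   cap->total_allocated += bd->free - bd->start;   // used words of the block
 *
 * so total_allocated counts words handed out, regardless of whether they
 * survive the next GC. (Sketch only; the header has the real definition.)
 */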
1169
1170 //
1171 // Calculate the total allocated memory since the start of the
1172 // program. Also emits events reporting the per-cap allocation
1173 // totals.
1174 //
1175 StgWord
1176 calcTotalAllocated (void)
1177 {
1178 W_ tot_alloc = 0;
1179 W_ n;
1180
1181 for (n = 0; n < n_capabilities; n++) {
1182 tot_alloc += capabilities[n]->total_allocated;
1183
1184 traceEventHeapAllocated(capabilities[n],
1185 CAPSET_HEAP_DEFAULT,
1186 capabilities[n]->total_allocated * sizeof(W_));
1187 }
1188
1189 return tot_alloc;
1190 }
1191
1192 //
1193 // Update the per-cap total_allocated numbers with an approximation of
1194 // the amount of memory used in each cap's nursery.
1195 //
1196 void
1197 updateNurseriesStats (void)
1198 {
1199 uint32_t i;
1200 bdescr *bd;
1201
1202 for (i = 0; i < n_capabilities; i++) {
1203 // The current nursery block and the current allocate block have not
1204 // yet been accounted for in cap->total_allocated, so we add them here.
1205 bd = capabilities[i]->r.rCurrentNursery;
1206 if (bd) finishedNurseryBlock(capabilities[i], bd);
1207 bd = capabilities[i]->r.rCurrentAlloc;
1208 if (bd) finishedNurseryBlock(capabilities[i], bd);
1209 }
1210 }
1211
1212 W_ countOccupied (bdescr *bd)
1213 {
1214 W_ words;
1215
1216 words = 0;
1217 for (; bd != NULL; bd = bd->link) {
1218 ASSERT(bd->free <= bd->start + bd->blocks * BLOCK_SIZE_W);
1219 words += bd->free - bd->start;
1220 }
1221 return words;
1222 }
1223
1224 W_ genLiveWords (generation *gen)
1225 {
1226 return gen->n_words + gen->n_large_words +
1227 gen->n_compact_blocks * BLOCK_SIZE_W;
1228 }
1229
1230 W_ genLiveBlocks (generation *gen)
1231 {
1232 return gen->n_blocks + gen->n_large_blocks + gen->n_compact_blocks;
1233 }
1234
1235 W_ gcThreadLiveWords (uint32_t i, uint32_t g)
1236 {
1237 W_ a, b, c;
1238
1239 a = countOccupied(gc_threads[i]->gens[g].todo_bd);
1240 b = gc_threads[i]->gens[g].n_part_words;
1241 c = gc_threads[i]->gens[g].n_scavd_words;
1242
1243 // debugBelch("cap %d, g%d, %ld %ld %ld\n", i, g, a, b, c);
1244 return a + b + c;
1245 }
1246
1247 W_ gcThreadLiveBlocks (uint32_t i, uint32_t g)
1248 {
1249 W_ blocks;
1250
1251 blocks = countBlocks(gc_threads[i]->gens[g].todo_bd);
1252 blocks += gc_threads[i]->gens[g].n_part_blocks;
1253 blocks += gc_threads[i]->gens[g].n_scavd_blocks;
1254
1255 return blocks;
1256 }
1257
1258 /* Determine which generation will be collected next, and approximate
1259 * the maximum amount of memory that will be required to do the GC,
1260 * taking into account data that will be copied, and the space needed
1261 * to store bitmaps and the mark stack. Note: blocks_needed does not
1262 * include the blocks in the nursery.
1263 *
1264  * Assume: all data currently live will remain live. Generations
1265 * that will be collected next time will therefore need twice as many
1266 * blocks since all the data will be copied.
1267 */
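/* Editorial worked example (made-up numbers): suppose gen 1 has
 * n_blocks = 1000, n_large_blocks = 50, n_compact_blocks = 0 and
 * max_blocks = 800. Then blocks = 1050 > max_blocks, so gen 1 will be
 * collected. A copying gen (mark == 0) contributes
 * 1050 + 1000 = 2050 blocks to *blocks_needed; with mark && !compact it
 * contributes 1050 + 1000/BITS_IN(W_) + 1000/100 + 1000; with compaction
 * only the bitmap and mark-stack terms are added on top of the 1050.
 */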
1268 extern W_
1269 calcNeeded (bool force_major, memcount *blocks_needed)
1270 {
1271 W_ needed = 0, blocks;
1272 uint32_t g, N;
1273 generation *gen;
1274
1275 if (force_major) {
1276 N = RtsFlags.GcFlags.generations - 1;
1277 } else {
1278 N = 0;
1279 }
1280
1281 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1282 gen = &generations[g];
1283
1284 blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?)
1285 + gen->n_large_blocks
1286 + gen->n_compact_blocks;
1287
1288 // we need at least this much space
1289 needed += blocks;
1290
1291 // are we collecting this gen?
1292 if (g == 0 || // always collect gen 0
1293 blocks > gen->max_blocks)
1294 {
1295 N = stg_max(N,g);
1296
1297 // we will collect this gen next time
1298 if (gen->mark) {
1299 // bitmap:
1300 needed += gen->n_blocks / BITS_IN(W_);
1301 // mark stack:
1302 needed += gen->n_blocks / 100;
1303 }
1304 if (gen->compact) {
1305 continue; // no additional space needed for compaction
1306 } else {
1307 needed += gen->n_blocks;
1308 }
1309 }
1310 }
1311
1312 if (blocks_needed != NULL) {
1313 *blocks_needed = needed;
1314 }
1315 return N;
1316 }
1317
1318 StgWord calcTotalLargeObjectsW (void)
1319 {
1320 uint32_t g;
1321 StgWord totalW = 0;
1322
1323 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1324 totalW += generations[g].n_large_words;
1325 }
1326 return totalW;
1327 }
1328
1329 StgWord calcTotalCompactW (void)
1330 {
1331 uint32_t g;
1332 StgWord totalW = 0;
1333
1334 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1335 totalW += generations[g].n_compact_blocks * BLOCK_SIZE_W;
1336 }
1337 return totalW;
1338 }
1339
1340 /* ----------------------------------------------------------------------------
1341 Executable memory
1342
1343 Executable memory must be managed separately from non-executable
1344 memory. Most OSs these days require you to jump through hoops to
1345 dynamically allocate executable memory, due to various security
1346 measures.
1347
1348 Here we provide a small memory allocator for executable memory.
1349 Memory is managed with a page granularity; we allocate linearly
1350 in the page, and when the page is emptied (all objects on the page
1351 are free) we free the page again, not forgetting to make it
1352 non-executable.
1353
1354 TODO: The inability to handle objects bigger than BLOCK_SIZE_W means that
1355 the linker cannot use allocateExec for loading object code files
1356 on Windows. Once allocateExec can handle larger objects, the linker
1357 should be modified to use allocateExec instead of VirtualAlloc.
1358 ------------------------------------------------------------------------- */
1359
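/* Editorial usage sketch (hypothetical 'code'/'codeSize'; error handling
 * omitted): an adjustor is allocated, written through the writable alias,
 * flushed, and eventually freed via the executable address:
 *
 *   AdjustorExecutable exec;
 *   AdjustorWritable   writ = allocateExec(codeSize, &exec);
 *   memcpy(writ, code, codeSize);       // write the machine code
 *   flushExec(codeSize, exec);          // make the icache see it
 *   // ... call through 'exec' ...
 *   freeExec(exec);                     // pass the executable address back
 */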
1360 #if (defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)) && defined(ios_HOST_OS)
1361 #include <libkern/OSCacheControl.h>
1362 #endif
1363
1364 #if defined(__clang__)
1365 /* clang defines __clear_cache as a builtin on some platforms.
1366  * For example on armv7-linux-androideabi. Its type differs
1367  * slightly from gcc's.
1368 */
1369 extern void __clear_cache(void * begin, void * end);
1370 #elif defined(__GNUC__)
1371 /* __clear_cache is a libgcc function.
1372 * It existed before __builtin___clear_cache was introduced.
1373 * See Trac #8562.
1374 */
1375 extern void __clear_cache(char * begin, char * end);
1376 #endif /* __GNUC__ */
1377
1378 /* On ARM and other platforms, we need to flush the cache after
1379 writing code into memory, so the processor reliably sees it. */
1380 void flushExec (W_ len, AdjustorExecutable exec_addr)
1381 {
1382 #if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
1383 /* x86 doesn't need to do anything, so just suppress some warnings. */
1384 (void)len;
1385 (void)exec_addr;
1386 #elif (defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)) && defined(ios_HOST_OS)
1387 /* On iOS we need to use the special 'sys_icache_invalidate' call. */
1388 sys_icache_invalidate(exec_addr, len);
1389 #elif defined(__clang__)
1390 unsigned char* begin = (unsigned char*)exec_addr;
1391 unsigned char* end = begin + len;
1392 # if __has_builtin(__builtin___clear_cache)
1393 __builtin___clear_cache((void*)begin, (void*)end);
1394 # else
1395 __clear_cache((void*)begin, (void*)end);
1396 # endif
1397 #elif defined(__GNUC__)
1398 /* For all other platforms, fall back to a libgcc builtin. */
1399 unsigned char* begin = (unsigned char*)exec_addr;
1400 unsigned char* end = begin + len;
1401 /* __builtin___clear_cache is supported since GNU C 4.3.6.
1402      * We pick 4.4 to simplify the condition a bit.
1403 */
1404 # if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
1405 __builtin___clear_cache((void*)begin, (void*)end);
1406 # else
1407 __clear_cache((void*)begin, (void*)end);
1408 # endif
1409 #else
1410 #error Missing support to flush the instruction cache
1411 #endif
1412 }
1413
1414 #if defined(linux_HOST_OS)
1415
1416 // On Linux we need to use libffi for allocating executable memory,
1417 // because it knows how to work around the restrictions put in place
1418 // by SELinux.
1419
1420 AdjustorWritable allocateExec (W_ bytes, AdjustorExecutable *exec_ret)
1421 {
1422 void **ret, **exec;
1423 ACQUIRE_SM_LOCK;
1424 ret = ffi_closure_alloc (sizeof(void *) + (size_t)bytes, (void**)&exec);
1425 RELEASE_SM_LOCK;
1426 if (ret == NULL) return ret;
1427 *ret = ret; // save the address of the writable mapping, for freeExec().
1428 *exec_ret = exec + 1;
1429 return (ret + 1);
1430 }
1431
1432 // freeExec gets passed the executable address, not the writable address.
1433 void freeExec (AdjustorExecutable addr)
1434 {
1435 AdjustorWritable writable;
1436 writable = *((void**)addr - 1);
1437 ACQUIRE_SM_LOCK;
1438 ffi_closure_free (writable);
1439 RELEASE_SM_LOCK
1440 }
1441
1442 #elif defined(ios_HOST_OS)
1443
1444 static HashTable* allocatedExecs;
1445
1446 AdjustorWritable allocateExec(W_ bytes, AdjustorExecutable *exec_ret)
1447 {
1448 AdjustorWritable writ;
1449 ffi_closure* cl;
1450 if (bytes != sizeof(ffi_closure)) {
1451 barf("allocateExec: for ffi_closure only");
1452 }
1453 ACQUIRE_SM_LOCK;
1454 cl = writ = ffi_closure_alloc((size_t)bytes, exec_ret);
1455 if (cl != NULL) {
1456 if (allocatedExecs == NULL) {
1457 allocatedExecs = allocHashTable();
1458 }
1459 insertHashTable(allocatedExecs, (StgWord)*exec_ret, writ);
1460 }
1461 RELEASE_SM_LOCK;
1462 return writ;
1463 }
1464
1465 AdjustorWritable execToWritable(AdjustorExecutable exec)
1466 {
1467 AdjustorWritable writ;
1468 ACQUIRE_SM_LOCK;
1469 if (allocatedExecs == NULL ||
1470 (writ = lookupHashTable(allocatedExecs, (StgWord)exec)) == NULL) {
1471 RELEASE_SM_LOCK;
1472 barf("execToWritable: not found");
1473 }
1474 RELEASE_SM_LOCK;
1475 return writ;
1476 }
1477
1478 void freeExec(AdjustorExecutable exec)
1479 {
1480 AdjustorWritable writ;
1481 ffi_closure* cl;
1482 cl = writ = execToWritable(exec);
1483 ACQUIRE_SM_LOCK;
1484 removeHashTable(allocatedExecs, (StgWord)exec, writ);
1485 ffi_closure_free(cl);
1486 RELEASE_SM_LOCK
1487 }
1488
1489 #else
1490
1491 AdjustorWritable allocateExec (W_ bytes, AdjustorExecutable *exec_ret)
1492 {
1493 void *ret;
1494 W_ n;
1495
1496 ACQUIRE_SM_LOCK;
1497
1498 // round up to words.
1499 n = (bytes + sizeof(W_) + 1) / sizeof(W_);
1500
1501 if (n+1 > BLOCK_SIZE_W) {
1502 barf("allocateExec: can't handle large objects");
1503 }
1504
1505 if (exec_block == NULL ||
1506 exec_block->free + n + 1 > exec_block->start + BLOCK_SIZE_W) {
1507 bdescr *bd;
1508 W_ pagesize = getPageSize();
1509 bd = allocGroup(stg_max(1, pagesize / BLOCK_SIZE));
1510 debugTrace(DEBUG_gc, "allocate exec block %p", bd->start);
1511 bd->gen_no = 0;
1512 bd->flags = BF_EXEC;
1513 bd->link = exec_block;
1514 if (exec_block != NULL) {
1515 exec_block->u.back = bd;
1516 }
1517 bd->u.back = NULL;
1518 setExecutable(bd->start, bd->blocks * BLOCK_SIZE, true);
1519 exec_block = bd;
1520 }
1521 *(exec_block->free) = n; // store the size of this chunk
1522 exec_block->gen_no += n; // gen_no stores the number of words allocated
1523 ret = exec_block->free + 1;
1524 exec_block->free += n + 1;
1525
1526 RELEASE_SM_LOCK
1527 *exec_ret = ret;
1528 return ret;
1529 }
1530
1531 void freeExec (void *addr)
1532 {
1533 StgPtr p = (StgPtr)addr - 1;
1534 bdescr *bd = Bdescr((StgPtr)p);
1535
1536 if ((bd->flags & BF_EXEC) == 0) {
1537 barf("freeExec: not executable");
1538 }
1539
1540 if (*(StgPtr)p == 0) {
1541 barf("freeExec: already free?");
1542 }
1543
1544 ACQUIRE_SM_LOCK;
1545
1546 bd->gen_no -= *(StgPtr)p;
1547 *(StgPtr)p = 0;
1548
1549 if (bd->gen_no == 0) {
1550 // Free the block if it is empty, but not if it is the block at
1551 // the head of the queue.
1552 if (bd != exec_block) {
1553 debugTrace(DEBUG_gc, "free exec block %p", bd->start);
1554 dbl_link_remove(bd, &exec_block);
1555 setExecutable(bd->start, bd->blocks * BLOCK_SIZE, false);
1556 freeGroup(bd);
1557 } else {
1558 bd->free = bd->start;
1559 }
1560 }
1561
1562 RELEASE_SM_LOCK
1563 }
1564
1565 #endif /* switch(HOST_OS) */
1566
1567 #if defined(DEBUG)
1568
1569 // handy function for use in gdb, because Bdescr() is inlined.
1570 extern bdescr *_bdescr (StgPtr p);
1571
1572 bdescr *
1573 _bdescr (StgPtr p)
1574 {
1575 return Bdescr(p);
1576 }
1577
1578 #endif