cpp: Use #pragma once instead of #ifndef guards
[ghc.git] / rts / sm / HeapAlloc.h
1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The University of Glasgow 2006-2008
4 *
5 * The HEAP_ALLOCED() test.
6 *
7 * ---------------------------------------------------------------------------*/
8
9 #pragma once
10
11 #include "BeginPrivate.h"
12
13 /* -----------------------------------------------------------------------------
14 The HEAP_ALLOCED() test.
15
16 HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
17 It needs to be FAST.
18
19 See wiki commentary at
20 http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
21
22 Implementation of HEAP_ALLOCED
23 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24
25 Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
26 can just use a table to record which megablocks in the address
27 space belong to the heap. On a 32-bit machine, with 1Mb
28 megablocks, using 8 bits for each entry in the table, the table
29 requires 4k. Lookups during GC will be fast, because the table
30 will be quickly cached (indeed, performance measurements showed no
31 measurable difference between doing the table lookup and using a
32 constant comparison).
33
34 On 64-bit machines, we have two possibilities. One is to request
35 a single chunk of address space that we deem "large enough"
36 (currently 1TB or the ulimit size, whichever is smaller, although this could
37 easily be extended to, say 16TB or more). Memory from that chunk is GC
38 memory, everything else is not. This case is tricky in that it requires
39 support from the OS to allocate address space without allocating memory (in
40 practice, all modern OSes do this). It's also tricky in that it is the only
41 case where a successful HEAP_ALLOCED(p) check can trigger a segfault when
42 accessing p (and for debugging purposes, it will).
43
44 Alternatively, the older implementation caches one 12-bit block map
45 that describes 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is
46 called for an address that is not in the cache, it calls
47 slowIsHeapAlloced (see MBlock.c) which will find the block map for
48 the 4GB block in question.
49 -------------------------------------------------------------------------- */
50
51 #ifdef USE_LARGE_ADDRESS_SPACE
52
53 struct mblock_address_range {
54 W_ begin, end;
55 W_ padding[6]; // ensure nothing else inhabits this cache line
56 } ATTRIBUTE_ALIGNED(64);
57 extern struct mblock_address_range mblock_address_space;
58
59 # define HEAP_ALLOCED(p) ((W_)(p) >= mblock_address_space.begin && \
60 (W_)(p) < (mblock_address_space.end))
61 # define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
62
63 #elif SIZEOF_VOID_P == 4
64 extern StgWord8 mblock_map[];
65
66 /* On a 32-bit machine a 4KB table is always sufficient */
67 # define MBLOCK_MAP_SIZE 4096
68 # define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
69 # define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
70 # define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
71
72 /* -----------------------------------------------------------------------------
73 HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).
74
75 Here are some cache layout options:
76
77 [1]
78 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
79 mblock size = 20 bits
80 entries = 8192 13 bits
81 line size = 0 bits (1 bit of value)
82 tag size = 15 bits
83 = 48 bits
84
85 [2]
86 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
87 mblock size = 20 bits
88 entries = 16384 14 bits
89 line size = 2 bits (4 bits of value)
90 tag size = 12 bits
91 = 48 bits
92
93 [3]
94 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
95 mblock size = 20 bits
96 entries = 8192 13 bits
97 line size = 1 bits (2 bits of value)
98 tag size = 14 bits
99 = 48 bits
100
101 [4]
102 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
103 mblock size = 20 bits
104 entries = 1024 10 bits
105 line size = 4 bits (16 bits of value)
106 tag size = 14 bits
107 = 48 bits
108
109 [5]
110 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
111 mblock size = 20 bits
112 entries = 512 9 bits
113 line size = 5 bits (32 bits of value)
114 tag size = 14 bits
115 = 48 bits
116
117 We actually use none of the above. After much experimentation it was
118 found that optimising the lookup is the most important factor,
119 followed by reducing the number of misses. To that end, we use a
120 variant of [1] in which each cache entry is ((mblock << 1) + value)
121 where value is 0 for non-heap and 1 for heap. The cache entries can
122 be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
123 1 bit for the value. The cache can be as big as we like, but
124 currently we use 8k entries, giving us 8GB capacity.
125
126 ---------------------------------------------------------------------------- */
127
128 #elif SIZEOF_VOID_P == 8
129
130 #define MBC_LINE_BITS 0
131 #define MBC_TAG_BITS 15
132
133 #if x86_64_HOST_ARCH
134 // 32bits are enough for 'entry' as modern amd64 boxes have
135 // only 48bit sized virtual addres.
136 typedef StgWord32 MbcCacheLine;
137 #else
138 // 32bits is not enough here as some arches (like ia64) use
139 // upper address bits to distinct memory areas.
140 typedef StgWord64 MbcCacheLine;
141 #endif
142
143 typedef StgWord8 MBlockMapLine;
144
145 #define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
146
147 #define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
148 #define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
149 #define MBC_ENTRIES (1<<MBC_SHIFT)
150
151 extern MbcCacheLine mblock_cache[];
152
153 #define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
154
155 #define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
156
157 typedef struct {
158 StgWord32 addrHigh32;
159 MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
160 } MBlockMap;
161
162 extern W_ mpc_misses;
163
164 StgBool HEAP_ALLOCED_miss(StgWord mblock, const void *p);
165
166 INLINE_HEADER
167 StgBool HEAP_ALLOCED(const void *p)
168 {
169 StgWord mblock;
170 uint32_t entry_no;
171 MbcCacheLine entry, value;
172
173 mblock = (StgWord)p >> MBLOCK_SHIFT;
174 entry_no = mblock & (MBC_ENTRIES-1);
175 entry = mblock_cache[entry_no];
176 value = entry ^ (mblock << 1);
177 // this formulation coaxes gcc into prioritising the value==1
178 // case, which we expect to be the most common.
179 // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
180 if (value == 1) {
181 return 1;
182 } else if (value == 0) {
183 return 0;
184 } else {
185 // putting the rest out of line turned out to be a slight
186 // performance improvement:
187 return HEAP_ALLOCED_miss(mblock,p);
188 }
189 }
190
191 // In the parallel GC, the cache itself is safe to *read*, and can be
192 // updated atomically, but we need to place a lock around operations
193 // that touch the MBlock map.
194 INLINE_HEADER
195 StgBool HEAP_ALLOCED_GC(void *p)
196 {
197 StgWord mblock;
198 uint32_t entry_no;
199 MbcCacheLine entry, value;
200 StgBool b;
201
202 mblock = (StgWord)p >> MBLOCK_SHIFT;
203 entry_no = mblock & (MBC_ENTRIES-1);
204 entry = mblock_cache[entry_no];
205 value = entry ^ (mblock << 1);
206 if (value == 1) {
207 return 1;
208 } else if (value == 0) {
209 return 0;
210 } else {
211 // putting the rest out of line turned out to be a slight
212 // performance improvement:
213 ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
214 b = HEAP_ALLOCED_miss(mblock,p);
215 RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
216 return b;
217 }
218 }
219
220 #else
221 # error HEAP_ALLOCED not defined
222 #endif
223
224 #include "EndPrivate.h"