/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2008
 *
 * The HEAP_ALLOCED() test.
 *
 * ---------------------------------------------------------------------------*/

#ifndef SM_HEAP_ALLOC_H
#define SM_HEAP_ALLOC_H

#include "BeginPrivate.h"

/* -----------------------------------------------------------------------------
   The HEAP_ALLOCED() test.

   HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
   It needs to be FAST.

   See wiki commentary at
   http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced

   Implementation of HEAP_ALLOCED
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   Since the heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
   can just use a table to record which megablocks in the address
   space belong to the heap. On a 32-bit machine, with 1MB megablocks
   and 8 bits for each entry in the table, the table requires 4KB.
   Lookups during GC will be fast, because the table will be quickly
   cached (indeed, performance measurements showed no measurable
   difference between doing the table lookup and using a constant
   comparison).
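
   (Worked arithmetic, for concreteness: 2^32 bytes of address space /
   2^20 bytes per megablock = 4096 megablocks, so at one byte per
   entry the table occupies exactly 4096 bytes.)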

   On 64-bit machines, we have two possibilities. One is to request
   a single chunk of address space that we deem "large enough"
   (currently 1TB or the ulimit size, whichever is smaller, although this
   could easily be extended to, say, 16TB or more). Memory from that
   chunk is GC memory, everything else is not. This case is tricky in
   that it requires support from the OS to allocate address space
   without allocating memory (in practice, all modern OSes do this).
   It's also tricky in that it is the only case where a successful
   HEAP_ALLOCED(p) check can trigger a segfault when accessing p
   (and for debugging purposes, it will).
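
   (A sketch of that reservation idea, not the RTS's actual code: on
   Linux one might reserve address space without committing memory via

       void *base = mmap(NULL, len, PROT_NONE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
                         -1, 0);

   and later commit individual megablocks with mprotect() before use.
   The names here are illustrative; see OSMem.c for what the RTS
   really does.)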

   Alternatively, the older implementation caches one 12-bit block map
   that describes 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is
   called for an address that is not in the cache, it calls
   slowIsHeapAlloced (see MBlock.c) which will find the block map for
   the 4GB block in question.
   -------------------------------------------------------------------------- */

#ifdef USE_LARGE_ADDRESS_SPACE

struct mblock_address_range {
    W_ begin, end;
    W_ padding[6];  // ensure nothing else inhabits this cache line
} ATTRIBUTE_ALIGNED(64);
extern struct mblock_address_range mblock_address_space;

# define HEAP_ALLOCED(p)        ((W_)(p) >= mblock_address_space.begin && \
                                 (W_)(p) < (mblock_address_space.end))
# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)

#elif SIZEOF_VOID_P == 4
extern StgWord8 mblock_map[];

/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE        4096
# define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
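
/* For example (illustrative numbers only): with MBLOCK_SHIFT == 20, a
 * pointer p == (void*)0x12345678 gives MBLOCK_MAP_ENTRY(p) == 0x123,
 * so HEAP_ALLOCED(p) is just the byte read mblock_map[0x123]. */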

/* -----------------------------------------------------------------------------
   HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).

   Here are some cache layout options:

   [1]
   16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
    mblock size =          20 bits
    entries     =     8192 13 bits
    line size   =             0 bits (1 bit of value)
    tag size    =          15 bits
                = 48 bits

   [2]
   32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
    mblock size =          20 bits
    entries     =    16384 14 bits
    line size   =             2 bits (4 bits of value)
    tag size    =          12 bits
                = 48 bits

   [3]
   16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
    mblock size =          20 bits
    entries     =     8192 13 bits
    line size   =             1 bits (2 bits of value)
    tag size    =          14 bits
                = 48 bits

   [4]
   4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
    mblock size =          20 bits
    entries     =     1024 10 bits
    line size   =             4 bits (16 bits of value)
    tag size    =          14 bits
                = 48 bits

   [5]
   4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
    mblock size =          20 bits
    entries     =      512  9 bits
    line size   =             5 bits (32 bits of value)
    tag size    =          14 bits
                = 48 bits

   We actually use none of the above. After much experimentation it was
   found that optimising the lookup is the most important factor,
   followed by reducing the number of misses. To that end, we use a
   variant of [1] in which each cache entry is ((mblock << 1) + value)
   where value is 0 for non-heap and 1 for heap. The cache entries can
   be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
   1 bit for the value. The cache can be as big as we like, but
   currently we use 8k entries, giving us 8GB capacity.
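
   (Worked example of the chosen encoding, illustrative numbers only:
   for mblock 0x1234567 the cache slot is 0x1234567 & 8191 == 0x567,
   and a heap mblock stores entry == (0x1234567 << 1) | 1 == 0x2468acf.
   A lookup computes entry ^ (mblock << 1); a result of 1 means heap,
   0 means non-heap, and anything >= 2 means the slot holds a different
   mblock's tag, i.e. a cache miss.)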

   ---------------------------------------------------------------------------- */

#elif SIZEOF_VOID_P == 8

#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15

#if x86_64_HOST_ARCH
// 32 bits are enough for 'entry', as modern amd64 boxes have
// only 48-bit virtual addresses.
typedef StgWord32 MbcCacheLine;
#else
// 32 bits are not enough here, as some arches (like ia64) use the
// upper address bits to distinguish memory areas.
typedef StgWord64 MbcCacheLine;
#endif

typedef StgWord8 MBlockMapLine;

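// Index of a megablock's line within its 4GB block map: keep the low
// 32 bits of the address (the map's addrHigh32 field selects which 4GB
// region), then drop the offset bits below a map line.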
#define MBLOCK_MAP_LINE(p)  (((StgWord)(p) & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
#define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES    (1<<MBC_SHIFT)
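
// With MBLOCK_SHIFT == 20, MBC_LINE_BITS == 0 and MBC_TAG_BITS == 15,
// MBC_SHIFT == 13, so MBC_ENTRIES == 8192: the "8k entries, 8GB
// capacity" configuration described above.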

extern MbcCacheLine mblock_cache[];

#define MBC_LINE(p)  ((StgWord)(p) >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))

typedef struct {
    StgWord32 addrHigh32;
    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
} MBlockMap;

extern W_ mpc_misses;

StgBool HEAP_ALLOCED_miss(StgWord mblock, const void *p);

INLINE_HEADER
StgBool HEAP_ALLOCED(const void *p)
{
    StgWord mblock;
    uint32_t entry_no;
    MbcCacheLine entry, value;

    mblock   = (StgWord)p >> MBLOCK_SHIFT;
    entry_no = mblock & (MBC_ENTRIES-1);
    entry    = mblock_cache[entry_no];
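    // If the slot holds this mblock's tag, entry == ((mblock << 1) | v)
    // and the XOR below leaves just v (0 or 1); any other result means
    // the slot describes a different mblock, i.e. a cache miss.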
    value = entry ^ (mblock << 1);
    // this formulation coaxes gcc into prioritising the value==1
    // case, which we expect to be the most common.
    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
    if (value == 1) {
        return 1;
    } else if (value == 0) {
        return 0;
    } else {
        // putting the rest out of line turned out to be a slight
        // performance improvement:
        return HEAP_ALLOCED_miss(mblock, p);
    }
}

// In the parallel GC, the cache itself is safe to *read*, and can be
// updated atomically, but we need to place a lock around operations
// that touch the MBlock map.
INLINE_HEADER
StgBool HEAP_ALLOCED_GC(void *p)
{
    StgWord mblock;
    uint32_t entry_no;
    MbcCacheLine entry, value;
    StgBool b;

    mblock   = (StgWord)p >> MBLOCK_SHIFT;
    entry_no = mblock & (MBC_ENTRIES-1);
    entry    = mblock_cache[entry_no];
    value    = entry ^ (mblock << 1);
    if (value == 1) {
        return 1;
    } else if (value == 0) {
        return 0;
    } else {
        // putting the rest out of line turned out to be a slight
        // performance improvement:
        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
        b = HEAP_ALLOCED_miss(mblock, p);
        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
        return b;
    }
}
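
/* Usage sketch (hypothetical caller, not part of this file): the GC
 * tests a closure pointer against the heap before following it, e.g.
 *
 *     if (HEAP_ALLOCED_GC(q)) { evacuate((StgClosure **)&q); }
 *
 * Pointers to static objects fail the test and are handled separately. */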

#else
# error HEAP_ALLOCED not defined
#endif

#include "EndPrivate.h"

#endif /* SM_HEAP_ALLOC_H */