/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2008
 *
 * The HEAP_ALLOCED() test.
 *
 * ---------------------------------------------------------------------------*/

#ifndef SM_HEAP_ALLOC_H
#define SM_HEAP_ALLOC_H

#include "BeginPrivate.h"

/* -----------------------------------------------------------------------------
   The HEAP_ALLOCED() test.

   HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
   It needs to be FAST.

   See wiki commentary at
     http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced

   Implementation of HEAP_ALLOCED
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   Since the heap is allocated in megablock-sized chunks (MBLOCK_SIZE),
   we can just use a table to record which megablocks in the address
   space belong to the heap.  On a 32-bit machine, with 1MB megablocks
   and 8 bits for each entry in the table, the table requires only
   4KB.  Lookups during GC will be fast, because the table will be
   quickly cached (indeed, performance measurements showed no
   measurable difference between doing the table lookup and using a
   constant comparison).

   On 64-bit machines, we have two possibilities.  One is to request
   a single chunk of address space that we deem "large enough"
   (currently 1TB, which could easily be extended to, say, 16TB or
   more).  Memory from that chunk is GC memory, everything else is
   not.  This case is tricky in that it requires support from the OS
   to allocate address space without allocating memory (in practice,
   all modern OSes do this).  It's also tricky in that it is the only
   case where a successful HEAP_ALLOCED(p) check can trigger a
   segfault when accessing p (and for debugging purposes, it will).

   Alternatively, the older implementation caches one block map at a
   time, covering 4096 megablocks, i.e. 4GB of address space.  If
   HEAP_ALLOCED is called for an address that is not in the cache, it
   calls HEAP_ALLOCED_miss (see MBlock.c), which will find the block
   map for the 4GB block in question.
   -------------------------------------------------------------------------- */
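
/* A worked example of the 32-bit arithmetic above (a sketch, assuming
   MBLOCK_SHIFT == 20, i.e. 1MB megablocks): a 32-bit address space
   contains 2^32 / 2^20 = 4096 megablocks, so at one byte per
   megablock the whole map is 4096 bytes, and the test is a single
   indexed load:

       StgWord8 mblock_map[4096];   // 4KB, hot in cache during GC
       // HEAP_ALLOCED(p) == mblock_map[(StgWord)(p) >> MBLOCK_SHIFT]

   which matches the macros defined in the 32-bit branch below. */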

#ifdef USE_LARGE_ADDRESS_SPACE

extern W_ mblock_address_space_begin;
#if aarch64_HOST_ARCH
# define MBLOCK_SPACE_SIZE ((StgWord)1 << 38) /* 1/4 TB */
#else
# define MBLOCK_SPACE_SIZE ((StgWord)1 << 40) /* 1 TB */
#endif

# define HEAP_ALLOCED(p)    ((W_)(p) >= mblock_address_space_begin && \
                             (W_)(p) < (mblock_address_space_begin + \
                                        MBLOCK_SPACE_SIZE))
# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
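
/* Illustration only, with a made-up base address: if the OS returned
   mblock_address_space_begin == 0x4000000000, the heap is exactly the
   range [0x4000000000, 0x4000000000 + MBLOCK_SPACE_SIZE), and
   HEAP_ALLOCED(p) reduces to two unsigned comparisons, with no map or
   cache lookup at all. */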

#elif SIZEOF_VOID_P == 4
extern StgWord8 mblock_map[];

/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE     4096
# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p)     mblock_map[MBLOCK_MAP_ENTRY(p)]
# define HEAP_ALLOCED_GC(p)  HEAP_ALLOCED(p)

/* -----------------------------------------------------------------------------
   HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).

   Here are some cache layout options:

   [1]
   16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
     mblock size =           20 bits
     entries     = 8192      13 bits
     line size   =            0 bits (1 bit of value)
     tag size    =           15 bits
                            = 48 bits

   [2]
   32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
     mblock size =           20 bits
     entries     = 16384     14 bits
     line size   =            2 bits (4 bits of value)
     tag size    =           12 bits
                            = 48 bits

   [3]
   16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
     mblock size =           20 bits
     entries     = 8192      13 bits
     line size   =            1 bit  (2 bits of value)
     tag size    =           14 bits
                            = 48 bits

   [4]
   4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
     mblock size =           20 bits
     entries     = 1024      10 bits
     line size   =            4 bits (16 bits of value)
     tag size    =           14 bits
                            = 48 bits

   [5]
   4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
     mblock size =           20 bits
     entries     = 512        9 bits
     line size   =            5 bits (32 bits of value)
     tag size    =           14 bits
                            = 48 bits

   We actually use none of the above.  After much experimentation it
   was found that optimising the lookup is the most important factor,
   followed by reducing the number of misses.  To that end, we use a
   variant of [1] in which each cache entry is ((mblock << 1) + value),
   where value is 0 for non-heap and 1 for heap.  The cache entries
   can be 32 bits, since the mblock number is 48 - 20 = 28 bits, and
   we need 1 bit for the value.  The cache can be as big as we like,
   but currently we use 8k entries, giving us 8GB capacity.

   ---------------------------------------------------------------------------- */
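
/* A worked decode of the chosen scheme (illustrative): with
   MBLOCK_SHIFT == 20, a pointer p yields mblock = (StgWord)p >> 20,
   and the cache slot at mblock & (MBC_ENTRIES-1) holds
   ((some_mblock << 1) + value).  The lookup below computes

       value = mblock_cache[entry_no] ^ (mblock << 1);
       // value == 1 : slot holds (mblock << 1) | 1  => p is in the heap
       // value == 0 : slot holds (mblock << 1) | 0  => p is not
       // otherwise  : the slot caches a *different* mblock => consult
       //              the block map via HEAP_ALLOCED_miss()

   so a single XOR both verifies the tag and extracts the heap bit,
   which is what makes the hit path so cheap. */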

#elif SIZEOF_VOID_P == 8

#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15

#if x86_64_HOST_ARCH
// 32 bits are enough for 'entry', since current amd64 machines use
// only 48-bit virtual addresses.
typedef StgWord32 MbcCacheLine;
#else
// 32 bits are not enough here, since some architectures (such as
// ia64) use the upper address bits to distinguish memory areas.
typedef StgWord64 MbcCacheLine;
#endif

typedef StgWord8 MBlockMapLine;

#define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
#define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES    (1<<MBC_SHIFT)

extern MbcCacheLine mblock_cache[];

#define MBC_LINE(p)    ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))

typedef struct {
    StgWord32 addrHigh32;
    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
} MBlockMap;
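
/* How an address decomposes onto this structure (a sketch based on the
   macros above, assuming MBLOCK_SHIFT == 20 and MBC_LINE_BITS == 0):

       StgWord32 hi   = (StgWord)p >> 32;    // must equal addrHigh32
       nat       line = MBLOCK_MAP_LINE(p);  // 0 .. MBLOCK_MAP_ENTRIES-1

   i.e. each MBlockMap describes one aligned 4GB window of the address
   space: 4096 one-byte lines covering 4096 megablocks. */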

extern W_ mpc_misses;

StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
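
/* Conceptual sketch only -- the real HEAP_ALLOCED_miss lives in
   MBlock.c, and this simplified outline is an assumption about its
   shape, not the actual code: find the MBlockMap whose addrHigh32
   matches p's upper 32 bits, read the line for p, and refill
   mblock_cache so that the next lookup for this mblock hits:

       // StgBool b = <p is described as heap by its MBlockMap>;
       // mblock_cache[mblock & (MBC_ENTRIES-1)] =
       //     (MbcCacheLine)((mblock << 1) + b);
       // return b;
*/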

INLINE_HEADER
StgBool HEAP_ALLOCED(void *p)
{
    StgWord mblock;
    nat entry_no;
    MbcCacheLine entry, value;

    mblock   = (StgWord)p >> MBLOCK_SHIFT;
    entry_no = mblock & (MBC_ENTRIES-1);
    entry    = mblock_cache[entry_no];
    value    = entry ^ (mblock << 1);
    // This formulation coaxes gcc into prioritising the value==1
    // case, which we expect to be the most common.
    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
    if (value == 1) {
        return 1;
    } else if (value == 0) {
        return 0;
    } else {
        // Putting the rest out of line turned out to be a slight
        // performance improvement:
        return HEAP_ALLOCED_miss(mblock,p);
    }
}
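
// Usage sketch (not part of this header): during GC, the evacuation
// code uses this test to separate static closures from heap closures,
// along the lines of
//
//     if (!HEAP_ALLOCED(q)) {
//         // q points to static data; handle it specially
//     } else {
//         // q is a heap pointer; evacuate/scavenge as usual
//     }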

// In the parallel GC, the cache itself is safe to *read*, and can be
// updated atomically, but we need to place a lock around operations
// that touch the MBlock map.
INLINE_HEADER
StgBool HEAP_ALLOCED_GC(void *p)
{
    StgWord mblock;
    nat entry_no;
    MbcCacheLine entry, value;
    StgBool b;

    mblock   = (StgWord)p >> MBLOCK_SHIFT;
    entry_no = mblock & (MBC_ENTRIES-1);
    entry    = mblock_cache[entry_no];
    value    = entry ^ (mblock << 1);
    if (value == 1) {
        return 1;
    } else if (value == 0) {
        return 0;
    } else {
        // Putting the rest out of line turned out to be a slight
        // performance improvement:
        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
        b = HEAP_ALLOCED_miss(mblock,p);
        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
        return b;
    }
}

#else
# error HEAP_ALLOCED not defined
#endif

#include "EndPrivate.h"

#endif /* SM_HEAP_ALLOC_H */