/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2000-2012
 *
 * RTS Object Linker
 *
 * ---------------------------------------------------------------------------*/
8
9 #include "Rts.h"
10 #include "sm/OSMem.h"
11 #include "linker/M32Alloc.h"
12 #include "LinkerInternals.h"
13
14 #include <inttypes.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <stdio.h>
18
/*

Note [Compile Time Trickery]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This file implements two versions of each of the `m32_*` functions. At the top
of the file there is the real implementation (compiled in when
`RTS_LINKER_USE_MMAP` is true) and a dummy implementation that exists only to
satisfy the compiler and which should never be called. If any of these dummy
implementations is called, the program will abort.

The rationale for this is to allow the calling code to be written without
resorting to C pre-processor (CPP) `#if` hackery. Because the value of
`RTS_LINKER_USE_MMAP` is known at compile time, code like:

    if (RTS_LINKER_USE_MMAP)
        m32_allocator_init();

will compile to a call to `m32_allocator_init` if `RTS_LINKER_USE_MMAP` is
true and will be optimised away to nothing if it is false. However, regardless
of the value of `RTS_LINKER_USE_MMAP`, the compiler will still check the call
for syntax and correct function parameter types.

*/

#if RTS_LINKER_USE_MMAP == 1

/*

Note [M32 Allocator]
~~~~~~~~~~~~~~~~~~~~

A memory allocator that allocates only pages in the 32-bit range (lower 2GB).
This is useful on 64-bit platforms to ensure that addresses of allocated
objects can be referenced with a 32-bit relative offset.

Initially, the linker used `mmap` to allocate a page per object. Hence it
wasted a lot of space for small objects (see #9314). With this allocator, we
try to fill pages as much as we can for small objects.

How does it work?
-----------------

For small objects, a Word64 counter is added at the beginning of the page they
are stored in. It indicates the number of objects that are still alive in the
page. When the counter drops to zero, the page is freed. The counter is
atomically decremented, hence the deallocation is thread-safe.

During the allocation phase, the allocator keeps track of some pages that are
not totally filled: the number of pages in the "filling" list is configurable
with M32_MAX_PAGES. Allocation consists of finding space in one of these
pages or starting a new one, then incrementing the page's counter. If none of
the pages in the "filling" list has enough free space, the most filled one is
flushed (see below) and a new one is allocated.

The allocator holds a reference on pages in the "filling" list: the counter in
these pages is 1+n where n is the current number of objects allocated in the
page. Hence allocated objects can be freed while the allocator is using
(filling) the page. Flushing a page consists of decrementing its counter and
removing it from the "filling" list. By extension, flushing the allocator
consists of flushing all the pages in the "filling" list. Don't forget to
flush the allocator at the end of the allocation phase in order to avoid space
leaks!

Large objects are objects that are larger than a page (minus the bytes required
for the counter and the optional padding). These objects are allocated into
their own set of pages. We can differentiate large and small objects from
their address: large objects are aligned on page size while small objects never
are (because of the space reserved for the page's object counter).

For large objects, the remaining space at the end of the last page is left
unused by the allocator. It can be used with care as it will be freed with the
associated large object. The GHC linker uses this feature/hack, hence changing
the implementation of the M32 allocator must be done with care (i.e. do not
try to improve the allocator to avoid wasting this space without modifying the
linker code accordingly).

Object allocation is *not* thread-safe (however, it could easily be made so
with a lock in the allocator structure). Object deallocation is thread-safe.

*/
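
/*
 * An illustrative sketch of the layouts described in the note above (sizes
 * are not to scale; the padding depends on the requested alignment):
 *
 *   Small-object page:
 *
 *     base_addr
 *     v
 *     +----------------+---------+----------+-----+----------+------------+
 *     | Word64 counter | padding | object 1 | ... | object n | free space |
 *     +----------------+---------+----------+-----+----------+------------+
 *                                ^
 *                                base_addr
 *                                  + ROUND_UP(M32_REFCOUNT_BYTES, alignment)
 *
 *   Large object (page-aligned, no counter; the tail of the last page is
 *   unused by the allocator but is freed together with the object):
 *
 *     +------------+------------+------------+
 *     |   page 1   |   page 2   |   page 3   |
 *     +------------+------------+------------+
 *     <-------- object --------><-- unused -->
 */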

#define ROUND_UP(x,size) (((x) + (size) - 1) & ~((size) - 1))
#define ROUND_DOWN(x,size) ((x) & ~((size) - 1))
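// For example (note that `size` must be a power of two):
//   ROUND_UP(13, 8)   == 16
//   ROUND_UP(16, 8)   == 16
//   ROUND_DOWN(13, 8) == 8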

/****************************************************************************
 * M32 ALLOCATOR (see Note [M32 Allocator])
 ***************************************************************************/

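// M32_MAX_PAGES is the capacity of the "filling" list; M32_REFCOUNT_BYTES is
// the space reserved at the start of each page for the Word64 object counter.
// See Note [M32 Allocator].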
#define M32_MAX_PAGES 32
#define M32_REFCOUNT_BYTES 8

/**
 * An allocated page being filled by the allocator
 */
struct m32_alloc_t {
   void * base_addr;     // Page address
   size_t current_size;  // Number of bytes already reserved
};

/**
 * Allocator
 *
 * Currently an allocator is just a set of pages being filled. The maximum
 * number of pages can be configured with M32_MAX_PAGES.
 */
typedef struct m32_allocator_t {
   struct m32_alloc_t pages[M32_MAX_PAGES];
} m32_allocator;

// We use a global memory allocator
static struct m32_allocator_t alloc;

/**
 * Wrapper for `munmap` that handles error cases.
 * This is the real implementation. There is another dummy implementation below.
 * See the note titled "Compile Time Trickery" at the top of this file.
 */
static void
munmapForLinker (void * addr, size_t size)
{
   int r = munmap(addr,size);
   if (r == -1) {
      // Should we abort here?
      sysErrorBelch("munmap");
   }
}

/**
 * Initialize the allocator structure
 * This is the real implementation. There is another dummy implementation below.
 * See the note titled "Compile Time Trickery" at the top of this file.
 */
void
m32_allocator_init(void)
{
   memset(&alloc, 0, sizeof(struct m32_allocator_t));
   // Preallocate the initial M32_MAX_PAGES to ensure that they don't
   // fragment the memory.
   size_t pgsz = getPageSize();
   char* bigchunk = mmapForLinker(pgsz * M32_MAX_PAGES,MAP_ANONYMOUS,-1,0);
   if (bigchunk == NULL) {
      // There is little we can do without the initial pages.
      barf("m32_allocator_init: failed to map initial pages");
   }
   int i;
   for (i=0; i<M32_MAX_PAGES; i++) {
      alloc.pages[i].base_addr = bigchunk + i*pgsz;
      *((uintptr_t*)alloc.pages[i].base_addr) = 1;
      alloc.pages[i].current_size = M32_REFCOUNT_BYTES;
   }
}

/**
 * Atomically decrement the object counter on the given page and release the
 * page if necessary. The given address must be the *base address* of the page.
 *
 * You shouldn't have to use this function directly. Use `m32_free` instead.
 */
static void
m32_free_internal(void * addr) {
   uintptr_t c = __sync_sub_and_fetch((uintptr_t*)addr, 1);
   if (c == 0) {
      munmapForLinker(addr, getPageSize());
   }
}

/**
 * Release the allocator's reference to pages on the "filling" list. This
 * should be called when it is believed that no more allocations will be needed
 * from the allocator to ensure that empty pages waiting to be filled aren't
 * unnecessarily held.
 *
 * This is the real implementation. There is another dummy implementation below.
 * See the note titled "Compile Time Trickery" at the top of this file.
 */
void
m32_allocator_flush(void) {
   int i;
   for (i=0; i<M32_MAX_PAGES; i++) {
      void * addr = __sync_fetch_and_and(&alloc.pages[i].base_addr, 0x0);
      if (addr != 0) {
         m32_free_internal(addr);
      }
   }
}

// Return true if an object of the given size/alignment gets its own
// dedicated set of pages
#define m32_is_large_object(size,alignment) \
   ((size) >= getPageSize() - ROUND_UP(M32_REFCOUNT_BYTES,alignment))

// Return true if the address belongs to a large object (large objects are
// page-aligned while small objects never are)
#define m32_is_large_object_addr(addr) \
   ((uintptr_t) (addr) % getPageSize() == 0)
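
// For example, with 4096-byte pages and 16-byte alignment, any object of
// 4096 - ROUND_UP(8,16) = 4080 bytes or more is "large" and gets its own
// page-aligned mapping; anything smaller shares a page and therefore never
// starts at a page boundary.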

/**
 * Free the memory associated with an object.
 *
 * If the object is "small", the object counter of the page it is allocated in
 * is decremented and the page is not freed until all of its objects are freed.
 *
 * This is the real implementation. There is another dummy implementation below.
 * See the note titled "Compile Time Trickery" at the top of this file.
 */
void
m32_free(void *addr, size_t size)
{
   uintptr_t m = (uintptr_t) addr % getPageSize();

   if (m == 0) {
      // large object
      munmapForLinker(addr,roundUpToPage(size));
   }
   else {
      // small object
      void * page_addr = (void*)((uintptr_t)addr - m);
      m32_free_internal(page_addr);
   }
}

/**
 * Allocate `size` bytes of memory with the given alignment.
 *
 * This is the real implementation. There is another dummy implementation below.
 * See the note titled "Compile Time Trickery" at the top of this file.
 */
void *
m32_alloc(size_t size, size_t alignment)
{
   size_t pgsz = getPageSize();

   if (m32_is_large_object(size,alignment)) {
      // large object: it gets its own set of pages
      return mmapForLinker(size,MAP_ANONYMOUS,-1,0);
   }

   // small object
   // Try to find a page that can contain it
   int empty = -1;
   int most_filled = -1;
   int i;
   for (i=0; i<M32_MAX_PAGES; i++) {
      // empty slot: remember the first one we see
      if (alloc.pages[i].base_addr == 0) {
         empty = empty == -1 ? i : empty;
         continue;
      }
      // If the page is referenced only by the allocator, we can reuse it.
      // If we don't then we'll be left with a bunch of pages that have a
      // few bytes left to allocate and we don't get to use or free them
      // until we use up all the "filling" pages. This will unnecessarily
      // allocate new pages and fragment the address space.
      if (*((uintptr_t*)(alloc.pages[i].base_addr)) == 1) {
         alloc.pages[i].current_size = M32_REFCOUNT_BYTES;
      }
      // Can this page contain the object?
      size_t alsize = ROUND_UP(alloc.pages[i].current_size, alignment);
      if (size <= pgsz - alsize) {
         void * addr = (char*)alloc.pages[i].base_addr + alsize;
         alloc.pages[i].current_size = alsize + size;
         // increment the counter atomically
         __sync_fetch_and_add((uintptr_t*)alloc.pages[i].base_addr, 1);
         return addr;
      }
      // Is this the most filled page so far?
      if (most_filled == -1
       || alloc.pages[most_filled].current_size < alloc.pages[i].current_size)
      {
         most_filled = i;
      }
   }

   // If we haven't found an empty page, flush the most filled one
   if (empty == -1) {
      m32_free_internal(alloc.pages[most_filled].base_addr);
      alloc.pages[most_filled].base_addr    = 0;
      alloc.pages[most_filled].current_size = 0;
      empty = most_filled;
   }

   // Allocate a new page
   void * addr = mmapForLinker(pgsz,MAP_ANONYMOUS,-1,0);
   if (addr == NULL) {
      return NULL;
   }
   alloc.pages[empty].base_addr    = addr;
   // Add M32_REFCOUNT_BYTES bytes for the counter + padding
   alloc.pages[empty].current_size =
      size+ROUND_UP(M32_REFCOUNT_BYTES,alignment);
   // Initialize the counter:
   // 1 for the allocator + 1 for the returned allocated memory
   *((uintptr_t*)addr) = 2;
   return (char*)addr + ROUND_UP(M32_REFCOUNT_BYTES,alignment);
}
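
/*
 * A minimal sketch of the allocator's lifecycle (the variable names and
 * sizes are hypothetical; see Note [M32 Allocator] for why the flush is
 * required):
 *
 *    m32_allocator_init();
 *
 *    // Allocation phase: small objects share pages, large ones get their
 *    // own page-aligned mappings.
 *    void *meta = m32_alloc(64, 8);
 *    void *code = m32_alloc(5000, 16);
 *
 *    // End of the allocation phase: drop the allocator's own references
 *    // to partially filled pages to avoid space leaks.
 *    m32_allocator_flush();
 *
 *    // Objects can be freed at any point afterwards (thread-safely); a
 *    // page is unmapped once its counter drops to zero.
 *    m32_free(meta, 64);
 *    m32_free(code, 5000);
 */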

#elif RTS_LINKER_USE_MMAP == 0

// The following implementations of these functions should never be called. If
// they are, there is a bug at the call site.
// See the note titled "Compile Time Trickery" at the top of this file.

void
m32_allocator_init(void)
{
   barf("%s: RTS_LINKER_USE_MMAP is %d", __func__, RTS_LINKER_USE_MMAP);
}

void
m32_allocator_flush(void)
{
   barf("%s: RTS_LINKER_USE_MMAP is %d", __func__, RTS_LINKER_USE_MMAP);
}

void
m32_free(void *addr STG_UNUSED, size_t size STG_UNUSED)
{
   barf("%s: RTS_LINKER_USE_MMAP is %d", __func__, RTS_LINKER_USE_MMAP);
}

void *
m32_alloc(size_t size STG_UNUSED, size_t alignment STG_UNUSED)
{
   barf("%s: RTS_LINKER_USE_MMAP is %d", __func__, RTS_LINKER_USE_MMAP);
}

#else

#error RTS_LINKER_USE_MMAP should be either `0` or `1`.

#endif