/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2007
 *
 * OS-specific memory management
 *
 * ---------------------------------------------------------------------------*/

// This is non-posix compliant.
// #include "PosixSource.h"

#include "Rts.h"

#include "RtsUtils.h"
#include "sm/OSMem.h"
#include "sm/HeapAlloc.h"

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif

#include <errno.h>

#if darwin_HOST_OS || ios_HOST_OS
#include <mach/mach.h>
#include <mach/vm_map.h>
#include <sys/sysctl.h>
#endif

static void *next_request = 0;

void osMemInit(void)
{
    next_request = (void *)RtsFlags.GcFlags.heapBase;
}

/* -----------------------------------------------------------------------------
   The mmap() method

   On Unix-like systems, we use mmap() to allocate our memory. We
   want memory in chunks of MBLOCK_SIZE, and aligned on an MBLOCK_SIZE
   boundary. The mmap() interface doesn't give us this level of
   control, so we have to use some heuristics.

   In the general case, if we want a block of n megablocks, then we
   allocate n+1 and trim off the slop from either side (using
   munmap()) to get an aligned chunk of size n. However, the next
   time we'll try to allocate directly after the previously allocated
   chunk, on the grounds that this is aligned and likely to be free.
   If it turns out that we were wrong, we have to munmap() and try
   again using the general method.

   Note on posix_memalign(): this interface is available on recent
   systems and appears to provide exactly what we want. However, it
   turns out not to be as good as our mmap() implementation, because
   it wastes extra space (using double the address space, in a test on
   x86_64/Linux). The problem seems to be that posix_memalign()
   returns memory that can be free()'d, so the library must store
   extra information along with the allocated block, thus messing up
   the alignment. Hence, we don't use posix_memalign() for now.

   -------------------------------------------------------------------------- */

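/* For instance (illustrative numbers, not from the original source): with
   1 MB megablocks, any 5 MB mapping the kernel hands us must contain a
   1 MB-aligned 4 MB subrange, so asking for n+1 megablocks always lets us
   carve out n aligned ones. A concrete trace of the arithmetic appears
   after gen_map_mblocks() below. */
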
/*
   A wrapper around mmap(), to abstract away from OS differences in
   the mmap() interface.

   It supports the following operations:
   - reserve: find a new chunk of available address space, and make it so
     that we own it (no other library will get it), but don't actually
     allocate memory for it; the addr is a hint for where to place the
     memory (and most of the time the OS happily ignores it!)
   - commit: given a chunk of address space that we know we own, make sure
     there is some memory backing it; the addr is not a hint, it must point
     into previously reserved address space, or bad things happen
   - reserve&commit: do both at the same time

   The naming is chosen from the Win32 API (VirtualAlloc), which does the
   same thing and has done so forever, while support for this in Unix systems
   has only been added recently and is hidden in the posix portability mess.
   It is confusing because to get the reserve behavior we need MAP_NORESERVE
   (which tells the kernel not to allocate backing space), but heh...
*/
enum
{
    MEM_RESERVE = 1,
    MEM_COMMIT = 2,
    MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
};

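/* A sketch of the intended call pattern (it mirrors what the
   USE_LARGE_ADDRESS_SPACE code at the bottom of this file does; the
   sizes here are made up for illustration):

     // grab a large span of address space, backed by nothing
     void *base = my_mmap(hint, (W_)1024*1024*MBLOCK_SIZE, MEM_RESERVE);
     // later, back one megablock of it with real memory
     my_mmap(base, MBLOCK_SIZE, MEM_COMMIT);

   MEM_RESERVE_AND_COMMIT is the ordinary "just give me usable memory"
   case used by osGetMBlocks(). */
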
/* Returns NULL on failure; errno set */
static void *
my_mmap (void *addr, W_ size, int operation)
{
    void *ret;

#if darwin_HOST_OS
    // Without MAP_FIXED, Apple's mmap ignores addr.
    // With MAP_FIXED, it overwrites already mapped regions, which is
    // dangerous. mmap(0, ... MAP_FIXED ...) is worst of all: it unmaps
    // the program text and replaces it with zeroes, causing instant death.
    // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
    // Let's just use the underlying Mach Microkernel calls directly,
    // they're much nicer.

    kern_return_t err = 0;
    ret = addr;

    if (operation & MEM_RESERVE)
    {
        if (addr) // try to allocate at address
            err = vm_allocate(mach_task_self(), (vm_address_t*) &ret,
                              size, FALSE);
        if (!addr || err) // try to allocate anywhere
            err = vm_allocate(mach_task_self(), (vm_address_t*) &ret,
                              size, TRUE);
    }

    if (err) {
        // don't know what the error codes mean exactly, assume it's
        // not our problem though.
        errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
                   size);
        stg_exit(EXIT_FAILURE);
    }

    if (operation & MEM_COMMIT) {
        vm_protect(mach_task_self(), (vm_address_t)ret, size, FALSE,
                   VM_PROT_READ|VM_PROT_WRITE);
    }

#else

    int prot, flags;
    if (operation & MEM_COMMIT)
        prot = PROT_READ | PROT_WRITE;
    else
        prot = PROT_NONE;
    if (operation == MEM_RESERVE)
# if defined(MAP_NORESERVE)
        flags = MAP_NORESERVE;
# else
#  ifdef USE_LARGE_ADDRESS_SPACE
#   error USE_LARGE_ADDRESS_SPACE needs MAP_NORESERVE
#  endif
        // barf rather than belch: continuing here would use 'flags'
        // uninitialized.
        barf("my_mmap(,,MEM_RESERVE) not supported on this platform");
# endif
    else if (operation == MEM_COMMIT)
        flags = MAP_FIXED;
    else
        flags = 0;

#if hpux_HOST_OS
    ret = mmap(addr, size, prot, flags | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#elif linux_HOST_OS
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
    if (ret == (void *)-1 && errno == EPERM) {
        // Linux may return EPERM if it tried to give us
        // a chunk of address space below mmap_min_addr;
        // see Trac #7500.
        if (addr != 0 && (operation & MEM_RESERVE)) {
            // Try again with no hint address.
            // It's not clear that this can ever actually help,
            // but since our alternative is to abort, we may as well try.
            ret = mmap(0, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
        }
        if (ret == (void *)-1 && errno == EPERM) {
            // Linux is not willing to give us any mapping,
            // so treat this as an out-of-memory condition
            // (really out of virtual address space).
            errno = ENOMEM;
        }
    }
#else
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
#endif
#endif

    if (ret == (void *)-1) {
        return NULL;
    }

    return ret;
}

/* Variant of my_mmap which aborts in the case of an error */
static void *
my_mmap_or_barf (void *addr, W_ size, int operation)
{
    void *ret = my_mmap(addr, size, operation);

    if (ret == NULL) {
        if (errno == ENOMEM ||
            (errno == EINVAL && sizeof(void*) == 4 && size >= 0xc0000000)) {
            // If we request more than 3Gig, then we get EINVAL
            // instead of ENOMEM (at least on Linux).
            errorBelch("out of memory (requested %" FMT_Word " bytes)", size);
            stg_exit(EXIT_HEAPOVERFLOW);
        } else {
            barf("getMBlock: mmap: %s", strerror(errno));
        }
    }

    return ret;
}

// Implements the general case: allocate a chunk of memory of 'size'
// mblocks.

static void *
gen_map_mblocks (W_ size)
{
    int slop;
    StgWord8 *ret;

    // Try to map a larger block, and take the aligned portion from
    // it (unmap the rest).
    size += MBLOCK_SIZE;
    ret = my_mmap_or_barf(0, size, MEM_RESERVE_AND_COMMIT);

    // unmap the slop bits around the chunk we allocated
    slop = (W_)ret & MBLOCK_MASK;

    if (munmap((void*)ret, MBLOCK_SIZE - slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }
    if (slop > 0 && munmap((void*)(ret+size-slop), slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }

    // ToDo: if we happened to get an aligned block, then don't
    // unmap the excess, just use it. For this to work, you
    // need to keep in mind the following:
    //     * Calling my_mmap() with an 'addr' arg pointing to
    //       already my_mmap()ed space is OK and won't fail.
    //     * If my_mmap() can't satisfy the request at the
    //       given 'next_request' address in getMBlocks(), then
    //       you must unmap the extra mblock mmap()ed here (or
    //       simply satisfy yourself that the slop introduced
    //       isn't worth salvaging).
    //

    // next time, try after the block we just got.
    ret += MBLOCK_SIZE - slop;
    return ret;
}

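/* Illustrative trace of gen_map_mblocks() (hypothetical addresses,
   assuming MBLOCK_SIZE = 0x100000 and MBLOCK_MASK = 0xfffff):

     gen_map_mblocks(0x400000)         // caller wants 4 megablocks
       size = 0x500000                 // over-allocate by one megablock
       ret  = 0x2ff380000              // say mmap() returns this
       slop = ret & MBLOCK_MASK        //   = 0x80000
       munmap(0x2ff380000, 0x80000)    // front: MBLOCK_SIZE - slop bytes
       munmap(0x2ff800000, 0x80000)    // back: at ret+size-slop, slop bytes
       return 0x2ff400000              // aligned chunk of 4 megablocks
*/
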
void *
osGetMBlocks(uint32_t n)
{
    void *ret;
    W_ size = MBLOCK_SIZE * (W_)n;

    if (next_request == 0) {
        // use gen_map_mblocks the first time.
        ret = gen_map_mblocks(size);
    } else {
        ret = my_mmap_or_barf(next_request, size, MEM_RESERVE_AND_COMMIT);

        if (((W_)ret & MBLOCK_MASK) != 0) {
            // misaligned block!
#if 0 // defined(DEBUG)
            errorBelch("warning: getMBlock: misaligned block %p returned "
                       "when allocating %d megablock(s) at %p",
                       ret, n, next_request);
#endif

            // unmap this block...
            if (munmap(ret, size) == -1) {
                barf("getMBlock: munmap failed");
            }
            // and do it the hard way
            ret = gen_map_mblocks(size);
        }
    }
    // Next time, we'll try to allocate right after the block we just got.
    // ToDo: check that we haven't already grabbed the memory at next_request
    next_request = (char *)ret + size;

    return ret;
}

void osFreeMBlocks(void *addr, uint32_t n)
{
    munmap(addr, n * MBLOCK_SIZE);
}

void osReleaseFreeMemory(void) {
    /* Nothing to do on POSIX */
}

void osFreeAllMBlocks(void)
{
    void *mblock;
    void *state;

    for (mblock = getFirstMBlock(&state);
         mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        munmap(mblock, MBLOCK_SIZE);
    }
}

size_t getPageSize (void)
{
    static size_t pageSize = 0;

    if (pageSize == 0) {
        long ret;
        ret = sysconf(_SC_PAGESIZE);
        if (ret == -1) {
            barf("getPageSize: cannot get page size");
        }
        pageSize = ret;
    }

    return pageSize;
}

/* Returns 0 if physical memory size cannot be identified */
StgWord64 getPhysicalMemorySize (void)
{
    static StgWord64 physMemSize = 0;
    if (!physMemSize) {
#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
        /* So, darwin doesn't support _SC_PHYS_PAGES, but it does
           support getting the raw memory size in bytes through
           sysctlbyname(hw.memsize); */
        size_t len = sizeof(physMemSize);
        int ret = -1;

        /* Note hw.memsize is in bytes, so no need to multiply by page size. */
        ret = sysctlbyname("hw.memsize", &physMemSize, &len, NULL, 0);
        if (ret == -1) {
            physMemSize = 0;
            return 0;
        }
#else
        /* We'll politely assume we have a system supporting _SC_PHYS_PAGES
         * otherwise. */
        W_ pageSize = getPageSize();
        long ret = sysconf(_SC_PHYS_PAGES);
        if (ret == -1) {
#if defined(DEBUG)
            errorBelch("warning: getPhysicalMemorySize: cannot get "
                       "physical memory size");
#endif
            return 0;
        }
        physMemSize = ret * pageSize;
#endif /* darwin_HOST_OS */
    }
    return physMemSize;
}

void setExecutable (void *p, W_ len, rtsBool exec)
{
    StgWord pageSize = getPageSize();

    /* malloced memory isn't executable by default on OpenBSD */
    StgWord mask = ~(pageSize - 1);
    StgWord startOfFirstPage = ((StgWord)p          ) & mask;
    StgWord startOfLastPage  = ((StgWord)p + len - 1) & mask;
    StgWord size             = startOfLastPage - startOfFirstPage + pageSize;
    if (mprotect((void*)startOfFirstPage, (size_t)size,
                 (exec ? PROT_EXEC : 0) | PROT_READ | PROT_WRITE) != 0) {
        barf("setExecutable: failed to protect 0x%p\n", p);
    }
}

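/* Worked example of the page rounding in setExecutable() (hypothetical
   values, assuming a 4 KB page size): for p = 0x40108 and len = 0x2000,
   mask is ~0xfff, so startOfFirstPage = 0x40000 and startOfLastPage =
   0x42000; size = 0x42000 - 0x40000 + 0x1000 = 0x3000, i.e. mprotect()
   covers exactly the three pages that the [p, p+len) range touches. */
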
#ifdef USE_LARGE_ADDRESS_SPACE

static void *
osTryReserveHeapMemory (W_ len, void *hint)
{
    void *base, *top;
    void *start, *end;

    /* We try to allocate len + MBLOCK_SIZE,
       because we need memory which is MBLOCK_SIZE aligned,
       and then we discard what we don't need */

    base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
    if (base == NULL)
        return NULL;

    top = (void*)((W_)base + len + MBLOCK_SIZE);

    if (((W_)base & MBLOCK_MASK) != 0) {
        start = MBLOCK_ROUND_UP(base);
        end = MBLOCK_ROUND_DOWN(top);
        ASSERT(((W_)end - (W_)start) == len);

        if (munmap(base, (W_)start-(W_)base) < 0) {
            sysErrorBelch("unable to release slop before heap");
        }
        if (munmap(end, (W_)top-(W_)end) < 0) {
            sysErrorBelch("unable to release slop after heap");
        }
    } else {
        start = base;
        // base happened to be aligned already, so release the extra
        // megablock we over-reserved at the top; otherwise that address
        // space is leaked.
        if (munmap((void*)((W_)base + len), MBLOCK_SIZE) < 0) {
            sysErrorBelch("unable to release slop after heap");
        }
    }

    return start;
}

void *osReserveHeapMemory(W_ *len)
{
    int attempt;
    void *at;

    /* We want to ensure the heap starts at least 8 GB inside the address
       space, to make sure that any dynamically loaded code will be close
       enough to the original code so that short relocations will work. This
       is in particular important on Darwin/Mach-O, because object files not
       compiled as shared libraries are position independent but cannot be
       loaded above 4GB.

       We do so with a hint to mmap, and we verify the OS satisfied our
       hint. We loop, shifting our hint by 1 BLOCK_SIZE every time, in case
       there is already something allocated there.

       Some systems impose resource limits restricting the amount of memory
       we can request (see, e.g. #10877). If mmap fails we halve our
       allocation request and try again. If our request size gets absurdly
       small we simply give up.
    */

    attempt = 0;
    while (1) {
        if (*len < MBLOCK_SIZE) {
            // Give up if the system won't even give us a single megablock
            // worth of heap
            barf("osReserveHeapMemory: Failed to allocate heap storage");
        }

        void *hint = (void*)((W_)8 * (1 << 30) + attempt * BLOCK_SIZE);
        at = osTryReserveHeapMemory(*len, hint);
        if (at == NULL) {
            // This means that mmap failed which we take to mean that we asked
            // for too much memory. This can happen due to POSIX resource
            // limits. In this case we reduce our allocation request by a
            // factor of two and try again.
            *len /= 2;
        } else if ((W_)at >= ((W_)8 * (1 << 30))) {
            // Success! We were given a block of memory starting above the 8 GB
            // mark, which is what we were looking for.
            break;
        } else {
            // We got addressing space but it wasn't above the 8GB mark.
            // Shift the hint and try again.
            if (munmap(at, *len) < 0) {
                sysErrorBelch("unable to release reserved heap");
            }
            attempt++;
        }
    }

    return at;
}

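/* Illustration of the fallback behaviour (a hypothetical run, not from the
   original source): if the caller asks for *len = 1 TB and the process is
   under a strict address-space limit such as RLIMIT_AS, the reservation
   attempts proceed as 1 TB, 512 GB, 256 GB, ... until one succeeds; only
   if *len ever drops below a single megablock do we give up and barf. */
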
void osCommitMemory(void *at, W_ size)
{
    void *r = my_mmap(at, size, MEM_COMMIT);
    if (r == NULL) {
        // my_mmap returns NULL on failure; committing must not fail
        // silently, or we would crash later on the unbacked pages.
        barf("osCommitMemory: unable to commit %" FMT_Word " bytes of memory",
             size);
    }
}

void osDecommitMemory(void *at, W_ size)
{
    int r;

    // First make the memory inaccessible (so that we get a segfault
    // at the next attempt to touch it)
    // We only do this in DEBUG because it forces the OS to remove
    // all MMU entries for this page range, and there is no reason
    // to do so unless there is memory pressure
#ifdef DEBUG
    r = mprotect(at, size, PROT_NONE);
    if (r < 0)
        sysErrorBelch("unable to make released memory inaccessible");
#endif

#ifdef MADV_FREE
    // Try MADV_FREE first, as FreeBSD has both flags and its
    // MADV_DONTNEED merely swaps the memory out rather than freeing it
    r = madvise(at, size, MADV_FREE);
#else
    r = madvise(at, size, MADV_DONTNEED);
#endif
    if (r < 0)
        sysErrorBelch("unable to decommit memory");
}

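// Note (general madvise semantics, stated as background rather than taken
// from this source): MADV_FREE lets the kernel reclaim the pages lazily,
// so the process's resident set size may not shrink until the system comes
// under memory pressure, whereas Linux's MADV_DONTNEED discards anonymous
// pages immediately.
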
void osReleaseHeapMemory(void)
{
    int r;

    r = munmap((void*)mblock_address_space.begin,
               mblock_address_space.end - mblock_address_space.begin);
    if (r < 0)
        sysErrorBelch("unable to release address space");
}

#endif