/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2007
 *
 * OS-specific memory management
 *
 * ---------------------------------------------------------------------------*/

// This is non-posix compliant.
// #include "PosixSource.h"

#include "Rts.h"

#include "RtsUtils.h"
#include "sm/OSMem.h"
#include "sm/HeapAlloc.h"

#if defined(HAVE_UNISTD_H)
#include <unistd.h>
#endif
#if defined(HAVE_SYS_TYPES_H)
#include <sys/types.h>
#endif
#if defined(HAVE_SYS_MMAN_H)
#include <sys/mman.h>
#endif
#if defined(HAVE_STRING_H)
#include <string.h>
#endif
#if defined(HAVE_FCNTL_H)
#include <fcntl.h>
#endif
#if defined(HAVE_NUMA_H)
#include <numa.h>
#endif
#if defined(HAVE_NUMAIF_H)
#include <numaif.h>
#endif
#if defined(HAVE_SYS_RESOURCE_H) && defined(HAVE_SYS_TIME_H)
#include <sys/time.h>
#include <sys/resource.h>
#endif

#include <errno.h>

#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
#include <mach/mach.h>
#include <mach/vm_map.h>
#include <sys/sysctl.h>
#endif

static void *next_request = 0;

void osMemInit(void)
{
    next_request = (void *)RtsFlags.GcFlags.heapBase;
}

/* -----------------------------------------------------------------------------
   The mmap() method

   On Unix-like systems, we use mmap() to allocate our memory. We
   want memory in chunks of MBLOCK_SIZE, and aligned on an MBLOCK_SIZE
   boundary. The mmap() interface doesn't give us this level of
   control, so we have to use some heuristics.

   In the general case, if we want a block of n megablocks, then we
   allocate n+1 and trim off the slop from either side (using
   munmap()) to get an aligned chunk of size n. However, the next
   time we'll try to allocate directly after the previously allocated
   chunk, on the grounds that this is aligned and likely to be free.
   If it turns out that we were wrong, we have to munmap() and try
   again using the general method.

   Note on posix_memalign(): this interface is available on recent
   systems and appears to provide exactly what we want. However, it
   turns out not to be as good as our mmap() implementation, because
   it wastes extra space (using double the address space, in a test on
   x86_64/Linux). The problem seems to be that posix_memalign()
   returns memory that can be free()'d, so the library must store
   extra information along with the allocated block, thus messing up
   the alignment. Hence, we don't use posix_memalign() for now.

   -------------------------------------------------------------------------- */
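
/* A worked example of the trimming described above (illustrative numbers,
   assuming the usual 1 MB megablock, i.e. MBLOCK_SIZE = 0x100000): to get
   one aligned megablock we mmap() two. If mmap() returns 0x2345000, then
   slop = 0x2345000 & MBLOCK_MASK = 0x45000; we munmap() the leading
   0x100000 - 0x45000 = 0xbb000 bytes and the trailing 0x45000 bytes,
   leaving exactly the aligned megablock at 0x2400000. This is what
   gen_map_mblocks() below implements. */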

/*
   A wrapper around mmap(), to abstract away from OS differences in
   the mmap() interface.

   It supports the following operations:
   - reserve: find a new chunk of available address space, and make it so
     that we own it (no other library will get it), but don't actually
     allocate memory for it
     the addr is a hint for where to place the memory (which the OS
     happily ignores most of the time!)
   - commit: given a chunk of address space that we know we own, make sure
     there is some memory backing it
     the addr is not a hint, it must point into previously reserved
     address space, or bad things happen
   - reserve&commit: do both at the same time

   The naming is chosen from the Win32 API (VirtualAlloc) which does the
   same thing and has done so forever, while support for this in Unix systems
   has only been added recently and is hidden in the posix portability mess.
   It is confusing because to get the reserve behavior we need MAP_NORESERVE
   (which tells the kernel not to allocate backing space), but heh...
*/
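
/* A minimal usage sketch of these operations (hypothetical sizes, for
   illustration only; error handling omitted):

       // reserve 1 GB of address space, with no memory backing it yet
       void *base = my_mmap(NULL, (W_)1 << 30, MEM_RESERVE);
       // later, back the first megablock of it with real memory
       my_mmap(base, MBLOCK_SIZE, MEM_COMMIT);

   This reserve-then-commit pattern is what the USE_LARGE_ADDRESS_SPACE
   code at the bottom of this file does, via osReserveHeapMemory() and
   osCommitMemory(). */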
enum
{
    MEM_RESERVE = 1,
    MEM_COMMIT = 2,
    MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
};

/* Returns NULL on failure; errno set */
static void *
my_mmap (void *addr, W_ size, int operation)
{
    void *ret;

#if defined(darwin_HOST_OS)
    // Without MAP_FIXED, Apple's mmap ignores addr.
    // With MAP_FIXED, it overwrites already mapped regions, which
    // is dangerous. mmap(0, ... MAP_FIXED ...) is worst of all: it
    // unmaps the program text and replaces it with zeroes, causing
    // instant death.
    // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
    // Let's just use the underlying Mach Microkernel calls directly,
    // they're much nicer.

    kern_return_t err = 0;
    ret = addr;

    if(operation & MEM_RESERVE)
    {
        if(addr) // try to allocate at address
            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
                              size, false);
        if(!addr || err) // try to allocate anywhere
            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
                              size, true);
    }

    if(err) {
        // don't know what the error codes mean exactly, assume it's
        // not our problem though.
        errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
                   size);
        stg_exit(EXIT_FAILURE);
    }

    if(operation & MEM_COMMIT) {
        vm_protect(mach_task_self(), (vm_address_t)ret, size, false,
                   VM_PROT_READ|VM_PROT_WRITE);
    }

#else

    int prot, flags;
    if (operation & MEM_COMMIT)
        prot = PROT_READ | PROT_WRITE;
    else
        prot = PROT_NONE;
    if (operation == MEM_RESERVE)
# if defined(MAP_NORESERVE)
        flags = MAP_NORESERVE;
# else
#  if defined(USE_LARGE_ADDRESS_SPACE)
#   error USE_LARGE_ADDRESS_SPACE needs MAP_NORESERVE
#  endif
        errorBelch("my_mmap(,,MEM_RESERVE) not supported on this platform");
# endif
    else if (operation == MEM_COMMIT)
        flags = MAP_FIXED;
    else
        flags = 0;

#if defined(hpux_HOST_OS)
    ret = mmap(addr, size, prot, flags | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#elif defined(linux_HOST_OS)
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
    if (ret == (void *)-1 && errno == EPERM) {
        // Linux may return EPERM if it tried to give us
        // a chunk of address space below mmap_min_addr;
        // see Trac #7500.
        if (addr != 0 && (operation & MEM_RESERVE)) {
            // Try again with no hint address.
            // It's not clear that this can ever actually help,
            // but since our alternative is to abort, we may as well try.
            ret = mmap(0, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
        }
        if (ret == (void *)-1 && errno == EPERM) {
            // Linux is not willing to give us any mapping,
            // so treat this as an out-of-memory condition
            // (really out of virtual address space).
            errno = ENOMEM;
        }
    }

    if (ret != (void *)-1) {
        if (operation & MEM_COMMIT) {
            madvise(ret, size, MADV_WILLNEED);
#if defined(MADV_DODUMP)
            madvise(ret, size, MADV_DODUMP);
#endif
        } else {
            madvise(ret, size, MADV_DONTNEED);
#if defined(MADV_DONTDUMP)
            madvise(ret, size, MADV_DONTDUMP);
#endif
        }
    }

#else
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
#endif
#endif

    if (ret == (void *)-1) {
        return NULL;
    }

    return ret;
}

/* Variant of my_mmap which aborts in the case of an error */
static void *
my_mmap_or_barf (void *addr, W_ size, int operation)
{
    void *ret = my_mmap(addr, size, operation);

    if (ret == NULL) {
        if (errno == ENOMEM ||
            (errno == EINVAL && sizeof(void*)==4 && size >= 0xc0000000)) {
            // If we request more than 3Gig, then we get EINVAL
            // instead of ENOMEM (at least on Linux).
            errorBelch("out of memory (requested %" FMT_Word " bytes)", size);
            stg_exit(EXIT_HEAPOVERFLOW);
        } else {
            barf("getMBlock: mmap: %s", strerror(errno));
        }
    }

    return ret;
}

// Implements the general case: allocate a chunk of memory of 'size'
// bytes (a whole number of megablocks), aligned to MBLOCK_SIZE.

static void *
gen_map_mblocks (W_ size)
{
    int slop;
    StgWord8 *ret;

    // Try to map a larger block, and take the aligned portion from
    // it (unmap the rest).
    size += MBLOCK_SIZE;
    ret = my_mmap_or_barf(0, size, MEM_RESERVE_AND_COMMIT);

    // unmap the slop bits around the chunk we allocated
    slop = (W_)ret & MBLOCK_MASK;

    if (munmap((void*)ret, MBLOCK_SIZE - slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }
    if (slop > 0 && munmap((void*)(ret+size-slop), slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }

    // ToDo: if we happened to get an aligned block, then don't
    // unmap the excess, just use it. For this to work, you
    // need to keep in mind the following:
    //  * Calling my_mmap() with an 'addr' arg pointing to
    //    already my_mmap()ed space is OK and won't fail.
    //  * If my_mmap() can't satisfy the request at the
    //    given 'next_request' address in getMBlocks(), you
    //    must unmap the extra mblock mmap()ed here (or simply
    //    satisfy yourself that the slop introduced isn't worth
    //    salvaging).
    //

    // next time, try after the block we just got.
    ret += MBLOCK_SIZE - slop;
    return ret;
}

void *
osGetMBlocks(uint32_t n)
{
    void *ret;
    W_ size = MBLOCK_SIZE * (W_)n;

    if (next_request == 0) {
        // use gen_map_mblocks the first time.
        ret = gen_map_mblocks(size);
    } else {
        ret = my_mmap_or_barf(next_request, size, MEM_RESERVE_AND_COMMIT);

        if (((W_)ret & MBLOCK_MASK) != 0) {
            // misaligned block!
#if 0 // defined(DEBUG)
            errorBelch("warning: getMBlock: misaligned block %p returned "
                       "when allocating %d megablock(s) at %p",
                       ret, n, next_request);
#endif

            // unmap this block...
            if (munmap(ret, size) == -1) {
                barf("getMBlock: munmap failed");
            }
            // and do it the hard way
            ret = gen_map_mblocks(size);
        }
    }

    // Next time, we'll try to allocate right after the block we just got.
    // ToDo: check that we haven't already grabbed the memory at next_request
    next_request = (char *)ret + size;

    return ret;
}

void osBindMBlocksToNode(
    void *addr STG_UNUSED,
    StgWord size STG_UNUSED,
    uint32_t node STG_UNUSED)
{
#if HAVE_LIBNUMA
    int ret;
    StgWord mask = 0;
    mask |= 1 << node;
    if (RtsFlags.GcFlags.numa) {
        ret = mbind(addr, (unsigned long)size,
                    MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT);
        // paranoia: MPOL_BIND guarantees memory on the correct node;
        // MPOL_MF_STRICT will tell us if it didn't work. We might want to
        // relax these in due course, but I want to be sure it's doing what we
        // want first.
        if (ret != 0) {
            sysErrorBelch("mbind");
            stg_exit(EXIT_FAILURE);
        }
    }
#endif
}


void osFreeMBlocks(void *addr, uint32_t n)
{
    munmap(addr, n * MBLOCK_SIZE);
}

void osReleaseFreeMemory(void) {
    /* Nothing to do on POSIX */
}

void osFreeAllMBlocks(void)
{
    void *mblock;
    void *state;

    for (mblock = getFirstMBlock(&state);
         mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        munmap(mblock, MBLOCK_SIZE);
    }
}

size_t getPageSize (void)
{
    static size_t pageSize = 0;

    if (pageSize == 0) {
        long ret;
        ret = sysconf(_SC_PAGESIZE);
        if (ret == -1) {
            barf("getPageSize: cannot get page size");
        }
        pageSize = ret;
    }

    return pageSize;
}

/* Returns 0 if physical memory size cannot be identified */
StgWord64 getPhysicalMemorySize (void)
{
    static StgWord64 physMemSize = 0;
    if (!physMemSize) {
#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
        /* So, darwin doesn't support _SC_PHYS_PAGES, but it does
           support getting the raw memory size in bytes through
           sysctlbyname(hw.memsize); */
        size_t len = sizeof(physMemSize);
        int ret = -1;

        /* Note hw.memsize is in bytes, so no need to multiply by page size. */
        ret = sysctlbyname("hw.memsize", &physMemSize, &len, NULL, 0);
        if (ret == -1) {
            physMemSize = 0;
            return 0;
        }
#else
        /* We'll politely assume we have a system supporting _SC_PHYS_PAGES
         * otherwise. */
        W_ pageSize = getPageSize();
        long ret = sysconf(_SC_PHYS_PAGES);
        if (ret == -1) {
#if defined(DEBUG)
            errorBelch("warning: getPhysicalMemorySize: cannot get "
                       "physical memory size");
#endif
            return 0;
        }
        physMemSize = ret * pageSize;
#endif /* darwin_HOST_OS */
    }
    return physMemSize;
}

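/* A worked example of the page rounding done in setExecutable() below
   (hypothetical values, assuming a 4096-byte page): for p = 0x10ff0 and
   len = 0x30, mask = ~0xfff, so startOfFirstPage = 0x10000 and
   startOfLastPage = 0x11000 (the last byte, 0x1101f, lies on that page).
   size is then 0x2000, so mprotect() covers both pages touched by the
   range. */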
void setExecutable (void *p, W_ len, bool exec)
{
    StgWord pageSize = getPageSize();

    /* malloced memory isn't executable by default on OpenBSD */
    StgWord mask             = ~(pageSize - 1);
    StgWord startOfFirstPage = ((StgWord)p          ) & mask;
    StgWord startOfLastPage  = ((StgWord)p + len - 1) & mask;
    StgWord size             = startOfLastPage - startOfFirstPage + pageSize;
    if (mprotect((void*)startOfFirstPage, (size_t)size,
                 (exec ? PROT_EXEC : 0) | PROT_READ | PROT_WRITE) != 0) {
        barf("setExecutable: failed to protect 0x%p\n", p);
    }
}

#if defined(USE_LARGE_ADDRESS_SPACE)

static void *
osTryReserveHeapMemory (W_ len, void *hint)
{
    void *base, *top;
    void *start, *end;

    /* We try to allocate len + MBLOCK_SIZE,
       because we need memory which is MBLOCK_SIZE aligned,
       and then we discard what we don't need */

    base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
    if (base == NULL)
        return NULL;

    top = (void*)((W_)base + len + MBLOCK_SIZE);

    if (((W_)base & MBLOCK_MASK) != 0) {
        start = MBLOCK_ROUND_UP(base);
        end = MBLOCK_ROUND_DOWN(top);
        ASSERT(((W_)end - (W_)start) == len);

        if (munmap(base, (W_)start-(W_)base) < 0) {
            sysErrorBelch("unable to release slop before heap");
        }
        if (munmap(end, (W_)top-(W_)end) < 0) {
            sysErrorBelch("unable to release slop after heap");
        }
    } else {
        start = base;
    }

    return start;
}

void *osReserveHeapMemory(void *startAddressPtr, W_ *len)
{
    int attempt;
    void *at;

    /* We want to ensure the heap starts at least 8 GB inside the address
       space, since we want to reserve the address space below that address
       for code. Specifically, we need to make sure that any dynamically
       loaded code will be close enough to the original code so that short
       relocations will work.
       This is particularly important on Darwin/Mach-O, because object files
       not compiled as shared libraries are position independent but cannot
       be loaded above 4GB.

       We do so with a hint to the mmap, and we verify the OS satisfied our
       hint. We loop, shifting our hint by 1 BLOCK_SIZE every time, in case
       there is already something allocated there.

       Some systems impose resource limits restricting the amount of memory we
       can request (see, e.g. #10877). If mmap fails we shrink our allocation
       request (currently by an eighth each time; see below) and try again. If
       our request size gets absurdly small we simply give up.

    */
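
    /* An illustrative shrink progression (hypothetical numbers): with a
       511 GB RLIMIT_AS and a 512 GB request, the getrlimit() check below
       first clamps *len to 511 GB; if mmap still fails, each retry asks
       for 7/8 of the previous request, i.e. roughly 447 GB, 391 GB,
       342 GB, ..., until a request succeeds or *len drops below
       MBLOCK_SIZE. */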

    W_ minimumAddress = (W_)8 * (1 << 30);
    // We don't use minimumAddress (0x200000000) as default because we know
    // it can clash with third-party libraries. See ticket #12573.
    W_ startAddress = 0x4200000000;
    if (startAddressPtr) {
        startAddress = (W_)startAddressPtr;
    }
    if (startAddress < minimumAddress) {
        errorBelch(
            "Provided heap start address %p is lower than minimum address %p",
            (void*)startAddress, (void*)minimumAddress);
    }

#if defined(HAVE_SYS_RESOURCE_H) && defined(HAVE_SYS_TIME_H)
    struct rlimit limit;
    if (!getrlimit(RLIMIT_AS, &limit)
        && limit.rlim_cur > 0
        && *len > limit.rlim_cur) {
        *len = limit.rlim_cur;
    }
#endif

    attempt = 0;
    while (1) {
        if (*len < MBLOCK_SIZE) {
            // Give up if the system won't even give us one megablock's
            // worth of heap.
            barf("osReserveHeapMemory: Failed to allocate heap storage");
        }

        void *hint = (void*)(startAddress + attempt * BLOCK_SIZE);
        at = osTryReserveHeapMemory(*len, hint);
        if (at == NULL) {
            // This means that mmap failed which we take to mean that we asked
            // for too much memory. This can happen due to POSIX resource
            // limits. In this case we reduce our allocation request by a
            // fraction of the current size and try again.
            //
            // Note that previously we would instead decrease the request size
            // by a factor of two; however, this meant that significant
            // amounts of memory would be wasted (e.g. imagine a machine with
            // 512GB of physical memory but a 511GB ulimit). See #14492.
            *len -= *len / 8;
        } else if ((W_)at >= minimumAddress) {
            // Success! We were given a block of memory starting above the 8 GB
            // mark, which is what we were looking for.
            break;
        } else {
            // We got addressing space but it wasn't above the 8GB mark.
            // Try again.
            if (munmap(at, *len) < 0) {
                sysErrorBelch("unable to release reserved heap");
            }
        }
        attempt++;
    }

    return at;
}

void osCommitMemory(void *at, W_ size)
{
    void *r = my_mmap(at, size, MEM_COMMIT);
    if (r == NULL) {
        barf("Unable to commit %" FMT_Word " bytes of memory", size);
    }
}

void osDecommitMemory(void *at, W_ size)
{
    int r;

    // First make the memory inaccessible (so that we get a segfault
    // at the next attempt to touch it).
    // We only do this in DEBUG because it forces the OS to remove
    // all MMU entries for this page range, and there is no reason
    // to do so unless there is memory pressure.
#if defined(DEBUG)
    r = mprotect(at, size, PROT_NONE);
    if(r < 0)
        sysErrorBelch("unable to make released memory inaccessible");
#endif

#if defined(MADV_FREE)
    // Try MADV_FREE first: FreeBSD has both flags, but there MADV_DONTNEED
    // just swaps the memory out. Linux >= 4.5 has both DONTNEED and FREE;
    // either will work, as they both allow the system to free anonymous
    // pages. It is important that we try both methods, as the kernel we
    // were built on may differ from the kernel we are now running on.
    r = madvise(at, size, MADV_FREE);
    if(r < 0) {
        if (errno == EINVAL) {
            // Perhaps the system doesn't support MADV_FREE; fall through
            // and try MADV_DONTNEED.
        } else {
            sysErrorBelch("unable to decommit memory");
        }
    } else {
        return;
    }
#endif

    r = madvise(at, size, MADV_DONTNEED);
    if(r < 0)
        sysErrorBelch("unable to decommit memory");
}

void osReleaseHeapMemory(void)
{
    int r;

    r = munmap((void*)mblock_address_space.begin,
               mblock_address_space.end - mblock_address_space.begin);
    if(r < 0)
        sysErrorBelch("unable to release address space");
}

#endif

bool osBuiltWithNumaSupport(void)
{
#if HAVE_LIBNUMA
    return true;
#else
    return false;
#endif
}

bool osNumaAvailable(void)
{
#if HAVE_LIBNUMA
    return (numa_available() != -1);
#else
    return false;
#endif
}

uint32_t osNumaNodes(void)
{
#if HAVE_LIBNUMA
    return numa_num_configured_nodes();
#else
    return 1;
#endif
}

uint64_t osNumaMask(void)
{
#if HAVE_LIBNUMA
    struct bitmask *mask;
    mask = numa_get_mems_allowed();
    if (osNumaNodes() > sizeof(StgWord)*8) {
        barf("osNumaMask: too many NUMA nodes (%d)", osNumaNodes());
    }
    uint64_t r = mask->maskp[0];
    numa_bitmask_free(mask);
    return r;
#else
    return 1;
#endif
}