/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2007
 *
 * OS-specific memory management
 *
 * ---------------------------------------------------------------------------*/

// This file uses non-POSIX functionality, so we deliberately do not
// #include "PosixSource.h" here.
// #include "PosixSource.h"

#include "Rts.h"

#include "RtsUtils.h"
#include "sm/OSMem.h"
#include "sm/HeapAlloc.h"

#if defined(HAVE_UNISTD_H)
#include <unistd.h>
#endif
#if defined(HAVE_SYS_TYPES_H)
#include <sys/types.h>
#endif
#if defined(HAVE_SYS_MMAN_H)
#include <sys/mman.h>
#endif
#if defined(HAVE_STRING_H)
#include <string.h>
#endif
#if defined(HAVE_FCNTL_H)
#include <fcntl.h>
#endif
#if defined(HAVE_NUMA_H)
#include <numa.h>
#endif
#if defined(HAVE_NUMAIF_H)
#include <numaif.h>
#endif
#if defined(HAVE_SYS_RESOURCE_H) && defined(HAVE_SYS_TIME_H)
#include <sys/time.h>
#include <sys/resource.h>
#endif

#include <errno.h>

#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
#include <mach/mach.h>
#include <mach/vm_map.h>
#include <sys/sysctl.h>
#endif

#ifndef MAP_FAILED
# define MAP_FAILED ((void *)-1)
#endif

#if defined(hpux_HOST_OS)
# ifndef MAP_ANON
#  define MAP_ANON MAP_ANONYMOUS
# endif
#endif

#if !defined(darwin_HOST_OS)
# undef RESERVE_FLAGS
# if defined(MAP_GUARD)
#  define RESERVE_FLAGS MAP_GUARD /* FreeBSD */
# elif defined(MAP_NORESERVE)
   /* Parenthesised, with no trailing semicolon: the macro is used in
      expressions like 'flags = RESERVE_FLAGS;'. */
#  define RESERVE_FLAGS (MAP_NORESERVE | MAP_ANON | MAP_PRIVATE)
# else
#  if defined(USE_LARGE_ADDRESS_SPACE)
#   error USE_LARGE_ADDRESS_SPACE needs MAP_NORESERVE or MAP_GUARD
#  endif
# endif
#endif

static void *next_request = 0;

void osMemInit(void)
{
    next_request = (void *)RtsFlags.GcFlags.heapBase;
}

/* -----------------------------------------------------------------------------
   The mmap() method

   On Unix-like systems, we use mmap() to allocate our memory.  We
   want memory in chunks of MBLOCK_SIZE, and aligned on an MBLOCK_SIZE
   boundary.  The mmap() interface doesn't give us this level of
   control, so we have to use some heuristics.

   In the general case, if we want a block of n megablocks, then we
   allocate n+1 and trim off the slop from either side (using
   munmap()) to get an aligned chunk of size n.  However, the next
   time we'll try to allocate directly after the previously allocated
   chunk, on the grounds that this is aligned and likely to be free.
   If it turns out that we were wrong, we have to munmap() and try
   again using the general method.

   Note on posix_memalign(): this interface is available on recent
   systems and appears to provide exactly what we want.  However, it
   turns out not to be as good as our mmap() implementation, because
   it wastes extra space (using double the address space, in a test on
   x86_64/Linux).  The problem seems to be that posix_memalign()
   returns memory that can be free()'d, so the library must store
   extra information along with the allocated block, thus messing up
   the alignment.  Hence, we don't use posix_memalign() for now.

   -------------------------------------------------------------------------- */
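
/* A worked example of the trimming arithmetic (illustrative addresses,
   assuming MBLOCK_SIZE = 1 MiB = 0x100000): to get 4 aligned megablocks
   we mmap() 5 MiB.  If mmap() returns ret = 0x10080000, then
   slop = ret & MBLOCK_MASK = 0x80000, so we munmap() the leading
   MBLOCK_SIZE - slop = 0x80000 bytes at 0x10080000 and the trailing
   slop = 0x80000 bytes at 0x10500000, leaving exactly 4 MiB aligned
   at 0x10100000. */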

/*
   A wrapper around mmap(), to abstract away from OS differences in
   the mmap() interface.

   It supports the following operations:
   - reserve: find a new chunk of available address space, and make it so
     that we own it (no other library will get it), but don't actually
     allocate memory for it.  Here 'addr' is a hint for where to place
     the memory, which the OS is free to ignore (and usually does).
   - commit: given a chunk of address space that we know we own, make sure
     there is some memory backing it.  Here 'addr' is not a hint: it must
     point into previously reserved address space, or bad things happen.
   - reserve&commit: do both at the same time

   The naming is chosen from the Win32 API (VirtualAlloc), which does the
   same thing and has done so forever, while support for this in Unix systems
   has only been added recently and is hidden in the POSIX portability mess.
   The naming there is confusing: the Linux manpage says mmap must be passed
   MAP_NORESERVE to get reservation-only behaviour, i.e. to *reserve* we
   pass MAP_*NO*RESERVE, which merely tells the kernel not to allocate
   backing space up front.
*/
enum
{
    MEM_RESERVE = 1,
    MEM_COMMIT = 2,
    MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
};
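
/* A minimal usage sketch (mirroring osReserveHeapMemory() and
   osCommitMemory() below): reserve a large range once, then commit
   pieces of it on demand.  Addresses here are illustrative only:

       void *base = my_mmap((void*)0x4200000000, len, MEM_RESERVE);
       ...
       my_mmap(base, MBLOCK_SIZE, MEM_COMMIT);   // back the first mblock

   MEM_COMMIT must only ever be applied inside a previously reserved
   range, since it mmap()s with MAP_FIXED. */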

#if defined(linux_HOST_OS)
static void *
linux_retry_mmap(int operation, W_ size, void *ret, void *addr, int prot, int flags)
{
    if (addr != 0 && (operation & MEM_RESERVE)) {
        // Try again with no hint address.
        // It's not clear that this can ever actually help,
        // but since our alternative is to abort, we may as well try.
        ret = mmap(0, size, prot, flags, -1, 0);
    }
    if (ret == MAP_FAILED && errno == EPERM) {
        // Linux is not willing to give us any mapping,
        // so treat this as an out-of-memory condition
        // (really out of virtual address space).
        errno = ENOMEM;
    }
    return ret;
}
#endif /* defined(linux_HOST_OS) */

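/* Advise the kernel about the pages we have just mmap()ed.  For committed
   memory, MADV_WILLNEED asks the kernel to map the pages in up front
   rather than taking a fault for each chunk, and MADV_DODUMP arranges for
   them to be included in core dumps; for reserve-only memory,
   MADV_DONTNEED/MADV_DONTDUMP do the reverse.  Return values are ignored:
   madvise() is only advisory. */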
static void
post_mmap_madvise(int operation, W_ size, void *ret)
{
#if defined(MADV_WILLNEED)
    if (operation & MEM_COMMIT) {
        madvise(ret, size, MADV_WILLNEED);
# if defined(MADV_DODUMP)
        madvise(ret, size, MADV_DODUMP);
# endif
    } else {
        madvise(ret, size, MADV_DONTNEED);
# if defined(MADV_DONTDUMP)
        madvise(ret, size, MADV_DONTDUMP);
# endif
    }
#endif
}

/* Returns NULL on failure; errno set */
static void *
my_mmap (void *addr, W_ size, int operation)
{
    void *ret;

#if defined(darwin_HOST_OS)
    // Without MAP_FIXED, Apple's mmap ignores addr.
    // With MAP_FIXED, it overwrites already mapped regions, which is
    // dangerous.  mmap(0, ... MAP_FIXED ...) is worst of all: it unmaps
    // the program text and replaces it with zeroes, causing instant death.
    // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
    // Let's just use the underlying Mach microkernel calls directly;
    // they're much nicer.

    kern_return_t err = 0;
    ret = addr;

    if(operation & MEM_RESERVE)
    {
        if(addr) // try to allocate at address
            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
                              size, false);
        if(!addr || err) // try to allocate anywhere
            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
                              size, true);
    }

    if(err) {
        // don't know what the error codes mean exactly, assume it's
        // not our problem though.
        errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
                   size);
        stg_exit(EXIT_FAILURE);
    }

    if(operation & MEM_COMMIT) {
        vm_protect(mach_task_self(), (vm_address_t)ret, size, false,
                   VM_PROT_READ|VM_PROT_WRITE);
    }

#else /* defined(darwin_HOST_OS) */

    int prot, flags;
    if (operation & MEM_COMMIT) {
        prot = PROT_READ | PROT_WRITE;
    } else {
        prot = PROT_NONE;
    }

    if (operation == MEM_RESERVE) {
# if defined(RESERVE_FLAGS)
        flags = RESERVE_FLAGS;
# else
        // Without suitable mmap flags we cannot reserve without committing,
        // and 'flags' would be left uninitialized, so fail hard here.
        barf("my_mmap(,,MEM_RESERVE) not supported on this platform");
# endif
    } else if (operation == MEM_COMMIT) {
        flags = MAP_FIXED | MAP_ANON | MAP_PRIVATE;
    } else {
        flags = MAP_ANON | MAP_PRIVATE;
    }

    ret = mmap(addr, size, prot, flags, -1, 0);
# if defined(linux_HOST_OS)
    if (ret == MAP_FAILED && errno == EPERM) {
        // Linux may return EPERM if it tried to give us
        // a chunk of address space below mmap_min_addr;
        // see #7500.
        ret = linux_retry_mmap(operation, size, ret, addr, prot, flags);
    }
# endif
    if (ret == MAP_FAILED) {
        return NULL;
    }
#endif /* defined(darwin_HOST_OS) */

    // Map in committed pages rather than take a fault for each chunk.
    // Also arrange to include them in core-dump files.
    post_mmap_madvise(operation, size, ret);

    return ret;
}

/* Variant of my_mmap which aborts in the case of an error */
static void *
my_mmap_or_barf (void *addr, W_ size, int operation)
{
    void *ret = my_mmap(addr, size, operation);

    if (ret == NULL) {
        if (errno == ENOMEM ||
            (errno == EINVAL && sizeof(void*)==4 && size >= 0xc0000000)) {
            // If we request more than 3Gig, then we get EINVAL
            // instead of ENOMEM (at least on Linux).
            errorBelch("out of memory (requested %" FMT_Word " bytes)", size);
            stg_exit(EXIT_HEAPOVERFLOW);
        } else {
            barf("getMBlock: mmap: %s", strerror(errno));
        }
    }

    return ret;
}

// Implements the general case: allocate a chunk of memory of 'size'
// mblocks.

static void *
gen_map_mblocks (W_ size)
{
    W_ slop;
    StgWord8 *ret;

    // Try to map a larger block, and take the aligned portion from
    // it (unmap the rest).
    size += MBLOCK_SIZE;
    ret = my_mmap_or_barf(0, size, MEM_RESERVE_AND_COMMIT);

    // unmap the slop bits around the chunk we allocated
    slop = (W_)ret & MBLOCK_MASK;

    if (munmap((void*)ret, MBLOCK_SIZE - slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }
    if (slop > 0 && munmap((void*)(ret+size-slop), slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }

    // ToDo: if we happen to get an aligned block, we could avoid
    // unmapping the excess and use it instead.  For that to work,
    // bear in mind that:
    //   * calling my_mmap() with an 'addr' arg pointing to already
    //     my_mmap()ed space is OK and won't fail;
    //   * if my_mmap() can't satisfy the request at the given
    //     'next_request' address in getMBlocks(), the extra mblock
    //     mmap()ed here must be unmapped (or we must accept that the
    //     slop introduced isn't worth salvaging).

    // next time, try after the block we just got.
    ret += MBLOCK_SIZE - slop;
    return ret;
}

void *
osGetMBlocks(uint32_t n)
{
    void *ret;
    W_ size = MBLOCK_SIZE * (W_)n;

    if (next_request == 0) {
        // use gen_map_mblocks the first time.
        ret = gen_map_mblocks(size);
    } else {
        ret = my_mmap_or_barf(next_request, size, MEM_RESERVE_AND_COMMIT);

        if (((W_)ret & MBLOCK_MASK) != 0) {
            // misaligned block!
#if 0 // defined(DEBUG)
            errorBelch("warning: getMBlock: misaligned block %p returned "
                       "when allocating %d megablock(s) at %p",
                       ret, n, next_request);
#endif

            // unmap this block...
            if (munmap(ret, size) == -1) {
                barf("getMBlock: munmap failed");
            }
            // and do it the hard way
            ret = gen_map_mblocks(size);
        }
    }

    // Next time, we'll try to allocate right after the block we just got.
    // ToDo: check that we haven't already grabbed the memory at next_request
    next_request = (char *)ret + size;

    return ret;
}
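
/* An illustrative call sequence (addresses made up, assuming
   MBLOCK_SIZE = 1 MiB): the first request for one mblock goes through
   gen_map_mblocks(), which might hand back an aligned chunk at
   0x4200100000 and set next_request to 0x4200200000.  A later request
   for two mblocks then mmap()s directly at that hint; if the OS grants
   it, the result is already aligned and no trimming is needed. */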

void osBindMBlocksToNode(
    void *addr STG_UNUSED,
    StgWord size STG_UNUSED,
    uint32_t node STG_UNUSED)
{
#if HAVE_LIBNUMA
    int ret;
    StgWord mask = 0;
    mask |= (StgWord)1 << node; // widen before shifting: node may exceed 31
    if (RtsFlags.GcFlags.numa) {
        ret = mbind(addr, (unsigned long)size,
                    MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT);
        // paranoia: MPOL_BIND guarantees memory on the correct node;
        // MPOL_MF_STRICT will tell us if it didn't work.  We might want to
        // relax these in due course, but I want to be sure it's doing what
        // we want first.
        if (ret != 0) {
            sysErrorBelch("mbind");
            stg_exit(EXIT_FAILURE);
        }
    }
#endif
}


void osFreeMBlocks(void *addr, uint32_t n)
{
    // widen 'n' before multiplying, as in osGetMBlocks, to avoid
    // 32-bit overflow for large frees
    munmap(addr, (W_)n * MBLOCK_SIZE);
}

void osReleaseFreeMemory(void) {
    /* Nothing to do on POSIX */
}

void osFreeAllMBlocks(void)
{
    void *mblock;
    void *state;

    for (mblock = getFirstMBlock(&state);
         mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        munmap(mblock, MBLOCK_SIZE);
    }
}

size_t getPageSize (void)
{
    static size_t pageSize = 0;

    if (pageSize == 0) {
        long ret;
        ret = sysconf(_SC_PAGESIZE);
        if (ret == -1) {
            barf("getPageSize: cannot get page size");
        }
        pageSize = ret;
    }

    return pageSize;
}

/* Returns 0 if physical memory size cannot be identified */
StgWord64 getPhysicalMemorySize (void)
{
    static StgWord64 physMemSize = 0;
    if (!physMemSize) {
#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
        /* Darwin doesn't support _SC_PHYS_PAGES, but it does support
           querying the raw memory size in bytes through
           sysctlbyname("hw.memsize"). */
        size_t len = sizeof(physMemSize);
        int ret = -1;

        /* Note hw.memsize is in bytes, so no need to multiply by page size. */
        ret = sysctlbyname("hw.memsize", &physMemSize, &len, NULL, 0);
        if (ret == -1) {
            physMemSize = 0;
            return 0;
        }
#else
        /* We'll politely assume we have a system supporting _SC_PHYS_PAGES
         * otherwise. */
        W_ pageSize = getPageSize();
        long ret = sysconf(_SC_PHYS_PAGES);
        if (ret == -1) {
#if defined(DEBUG)
            errorBelch("warning: getPhysicalMemorySize: cannot get "
                       "physical memory size");
#endif
            return 0;
        }
        // widen before multiplying to avoid overflow on 32-bit platforms
        physMemSize = (StgWord64)ret * pageSize;
#endif /* darwin_HOST_OS */
    }
    return physMemSize;
}

void setExecutable (void *p, W_ len, bool exec)
{
    StgWord pageSize = getPageSize();

    /* malloced memory isn't executable by default on OpenBSD */
    StgWord mask = ~(pageSize - 1);
    StgWord startOfFirstPage = ((StgWord)p          ) & mask;
    StgWord startOfLastPage  = ((StgWord)p + len - 1) & mask;
    StgWord size = startOfLastPage - startOfFirstPage + pageSize;
    if (mprotect((void*)startOfFirstPage, (size_t)size,
                 (exec ? PROT_EXEC : 0) | PROT_READ | PROT_WRITE) != 0) {
        barf("setExecutable: failed to protect 0x%p\n", p);
    }
}

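/* Example of the page rounding above (assuming a 4 KiB page size): for
   p = 0x100ff0 and len = 0x20 the range straddles a page boundary, so
   startOfFirstPage = 0x100000, startOfLastPage = 0x101000, and
   size = 0x2000; both pages get their protection changed. */
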
#if defined(USE_LARGE_ADDRESS_SPACE)

static void *
osTryReserveHeapMemory (W_ len, void *hint)
{
    void *base, *top;
    void *start, *end;

    ASSERT((len & ~MBLOCK_MASK) == len);

    /* We try to allocate len + MBLOCK_SIZE,
       because we need memory which is MBLOCK_SIZE aligned,
       and then we discard what we don't need */

    base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
    if (base == NULL)
        return NULL;

    top = (void*)((W_)base + len + MBLOCK_SIZE);

    if (((W_)base & MBLOCK_MASK) != 0) {
        start = MBLOCK_ROUND_UP(base);
        end = MBLOCK_ROUND_DOWN(top);
        ASSERT(((W_)end - (W_)start) == len);

        if (munmap(base, (W_)start-(W_)base) < 0) {
            sysErrorBelch("unable to release slop before heap");
        }
        if (munmap(end, (W_)top-(W_)end) < 0) {
            sysErrorBelch("unable to release slop after heap");
        }
    } else {
        start = base;
    }

    return start;
}

void *osReserveHeapMemory(void *startAddressPtr, W_ *len)
{
    int attempt;
    void *at;

    /* We want to ensure the heap starts at least 8 GB inside the address
       space, since we want to reserve the address space below that address
       for code.  Specifically, we need to make sure that any dynamically
       loaded code will be close enough to the original code that short
       relocations will work.  This is in particular important on
       Darwin/Mach-O, because object files not compiled as shared libraries
       are position independent but cannot be loaded above 4GB.

       We do so by passing a hint to mmap, and we verify that the OS
       satisfied our hint.  We loop, shifting our hint by 1 BLOCK_SIZE
       every time, in case there is already something allocated there.

       Some systems impose resource limits restricting the amount of memory
       we can request (see, e.g. #10877).  If mmap fails we shrink our
       allocation request and try again.  If our request size gets absurdly
       small we simply give up. */

    W_ minimumAddress = (W_)8 * (1 << 30);
    // We don't use minimumAddress (0x200000000) as the default because we
    // know it can clash with third-party libraries.  See ticket #12573.
    W_ startAddress = 0x4200000000;
    if (startAddressPtr) {
        startAddress = (W_)startAddressPtr;
    }
    if (startAddress < minimumAddress) {
        errorBelch(
            "Provided heap start address %p is lower than minimum address %p",
            (void*)startAddress, (void*)minimumAddress);
    }

#if defined(HAVE_SYS_RESOURCE_H) && defined(HAVE_SYS_TIME_H)
    struct rlimit limit;
    /* rlim_t is signed on some platforms, including FreeBSD; cast
     * explicitly to avoid a sign-compare error, and cast to W_ rather
     * than unsigned so the limit isn't truncated on LP64 platforms */
    if (!getrlimit(RLIMIT_AS, &limit)
        && limit.rlim_cur > 0
        && *len > (W_) limit.rlim_cur) {
        *len = (W_) limit.rlim_cur;
    }
#endif

    attempt = 0;
    while (1) {
        *len &= ~MBLOCK_MASK;

        if (*len < MBLOCK_SIZE) {
            // Give up if the system won't give us even a single
            // megablock's worth of heap.
            barf("osReserveHeapMemory: Failed to allocate heap storage");
        }

        void *hint = (void*)(startAddress + attempt * BLOCK_SIZE);
        at = osTryReserveHeapMemory(*len, hint);
        if (at == NULL) {
            // This means that mmap failed, which we take to mean that we
            // asked for too much memory.  This can happen due to POSIX
            // resource limits.  In this case we reduce our allocation
            // request by a fraction of the current size and try again.
            //
            // Note that we used to decrease the request size by a factor
            // of two instead; however, this meant that significant amounts
            // of memory would be wasted (e.g. imagine a machine with 512GB
            // of physical memory but a 511GB ulimit).  See #14492.  With
            // the current scheme a 512GB request shrinks to 448GB, then
            // 392GB, and so on.
            *len -= *len / 8;
        } else if ((W_)at >= minimumAddress) {
            // Success! We were given a block of memory starting above the
            // 8 GB mark, which is what we were looking for.
            break;
        } else {
            // We got addressing space but it wasn't above the 8GB mark.
            // Try again.
            if (munmap(at, *len) < 0) {
                sysErrorBelch("unable to release reserved heap");
            }
        }
        attempt++;
    }

    return at;
}

void osCommitMemory(void *at, W_ size)
{
    void *r = my_mmap(at, size, MEM_COMMIT);
    if (r == NULL) {
        barf("Unable to commit %" FMT_Word " bytes of memory", size);
    }
}

void osDecommitMemory(void *at, W_ size)
{
    int r;

    // First make the memory inaccessible (so that we get a segfault
    // at the next attempt to touch it).
    // We only do this in DEBUG because it forces the OS to remove
    // all MMU entries for this page range, and there is no reason
    // to do so unless there is memory pressure.
#if defined(DEBUG)
    r = mprotect(at, size, PROT_NONE);
    if(r < 0)
        sysErrorBelch("unable to make released memory inaccessible");
#endif

#if defined(MADV_FREE)
    // Try MADV_FREE first: FreeBSD has both MADV_FREE and MADV_DONTNEED,
    // but its MADV_DONTNEED merely swaps the memory out.  Linux >= 4.5
    // has both DONTNEED and FREE; either will work, as they both allow
    // the system to free anonymous pages.
    // It is important that we try both methods, as the kernel we were
    // built on may differ from the kernel we are now running on.
    r = madvise(at, size, MADV_FREE);
    if(r < 0) {
        if (errno == EINVAL) {
            // Perhaps the system doesn't support MADV_FREE; fall through
            // and try MADV_DONTNEED.
        } else {
            sysErrorBelch("unable to decommit memory");
        }
    } else {
        return;
    }
#endif

    r = madvise(at, size, MADV_DONTNEED);
    if(r < 0)
        sysErrorBelch("unable to decommit memory");
}

void osReleaseHeapMemory(void)
{
    int r;

    r = munmap((void*)mblock_address_space.begin,
               mblock_address_space.end - mblock_address_space.begin);
    if(r < 0)
        sysErrorBelch("unable to release address space");
}

#endif

bool osBuiltWithNumaSupport(void)
{
#if HAVE_LIBNUMA
    return true;
#else
    return false;
#endif
}

bool osNumaAvailable(void)
{
#if HAVE_LIBNUMA
    return (numa_available() != -1);
#else
    return false;
#endif
}

uint32_t osNumaNodes(void)
{
#if HAVE_LIBNUMA
    return numa_num_configured_nodes();
#else
    return 1;
#endif
}

uint64_t osNumaMask(void)
{
#if HAVE_LIBNUMA
    struct bitmask *mask;
    mask = numa_get_mems_allowed();
    if (osNumaNodes() > sizeof(StgWord)*8) {
        barf("osNumaMask: too many NUMA nodes (%u)", osNumaNodes());
    }
    uint64_t r = mask->maskp[0];
    numa_bitmask_free(mask);
    return r;
#else
    return 1;
#endif
}
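
/* The returned mask is a bitmap over node numbers, matching the mask
   built in osBindMBlocksToNode() above: bit i set means NUMA node i may
   be used.  For example, on a two-node machine with both nodes allowed,
   osNumaMask() would return 0x3. */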