/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2007
 *
 * OS-specific memory management
 *
 * ---------------------------------------------------------------------------*/

// This is non-POSIX compliant.
// #include "PosixSource.h"

#include "Rts.h"

#include "RtsUtils.h"
#include "sm/OSMem.h"
#include "sm/HeapAlloc.h"

#if defined(HAVE_UNISTD_H)
#include <unistd.h>
#endif
#if defined(HAVE_SYS_TYPES_H)
#include <sys/types.h>
#endif
#if defined(HAVE_SYS_MMAN_H)
#include <sys/mman.h>
#endif
#if defined(HAVE_STRING_H)
#include <string.h>
#endif
#if defined(HAVE_FCNTL_H)
#include <fcntl.h>
#endif
#if defined(HAVE_NUMA_H)
#include <numa.h>
#endif
#if defined(HAVE_NUMAIF_H)
#include <numaif.h>
#endif
#if defined(HAVE_SYS_RESOURCE_H) && defined(HAVE_SYS_TIME_H)
#include <sys/time.h>
#include <sys/resource.h>
#endif

#include <errno.h>

#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
#include <mach/mach.h>
#include <mach/vm_map.h>
#include <sys/sysctl.h>
#endif

static void *next_request = 0;

void osMemInit(void)
{
    next_request = (void *)RtsFlags.GcFlags.heapBase;
}

/* -----------------------------------------------------------------------------
   The mmap() method

   On Unix-like systems, we use mmap() to allocate our memory. We
   want memory in chunks of MBLOCK_SIZE, and aligned on an MBLOCK_SIZE
   boundary. The mmap() interface doesn't give us this level of
   control, so we have to use some heuristics.

   In the general case, if we want a block of n megablocks, then we
   allocate n+1 and trim off the slop from either side (using
   munmap()) to get an aligned chunk of size n. However, the next
   time we'll try to allocate directly after the previously allocated
   chunk, on the grounds that this is aligned and likely to be free.
   If it turns out that we were wrong, we have to munmap() and try
   again using the general method.

   Note on posix_memalign(): this interface is available on recent
   systems and appears to provide exactly what we want. However, it
   turns out not to be as good as our mmap() implementation, because
   it wastes extra space (using double the address space, in a test on
   x86_64/Linux). The problem seems to be that posix_memalign()
   returns memory that can be free()'d, so the library must store
   extra information along with the allocated block, thus messing up
   the alignment. Hence, we don't use posix_memalign() for now.

   -------------------------------------------------------------------------- */
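
/* Illustrative example (not from the original source): assuming
   MBLOCK_SIZE = 1 MiB (so MBLOCK_MASK = 0xFFFFF), a request for n = 4
   megablocks maps 5 MiB. If mmap returns ret = 0x2503c000, then
   slop = ret & MBLOCK_MASK = 0x3c000, and we munmap the head
   (MBLOCK_SIZE - slop = 0xc4000 bytes at ret) and the tail (slop bytes
   at ret + 5 MiB - slop = 0x25500000), leaving the aligned 4 MiB chunk
   [0x25100000, 0x25500000). */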

/*
   A wrapper around mmap(), to abstract away from OS differences in
   the mmap() interface.

   It supports the following operations:
   - reserve: find a new chunk of available address space, and make it so
     that we own it (no other library will get it), but don't actually
     allocate memory for it
     the addr is a hint for where to place the memory (a hint which,
     most of the time, the OS happily ignores!)
   - commit: given a chunk of address space that we know we own, make sure
     there is some memory backing it
     the addr is not a hint, it must point into previously reserved
     address space, or bad things happen
   - reserve&commit: do both at the same time

   The naming is taken from the Win32 API (VirtualAlloc), which has offered
   this distinction forever, whereas support for it on Unix systems was added
   only recently and is hidden in the POSIX portability mess. On Linux, the
   manpage says mmap must be passed MAP_NORESERVE to get reservation-only
   behavior, which is confusing: the flag that reserves address space is the
   one telling the kernel *not* to reserve backing store.
*/
enum
{
    MEM_RESERVE = 1,
    MEM_COMMIT = 2,
    MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
};
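
/* Illustrative call sequence (a sketch, not code taken from elsewhere in
   the RTS): address space is claimed up front and backed lazily, e.g.

       void *at = my_mmap(NULL, len, MEM_RESERVE);    // claim a range
       ...
       my_mmap(at, MBLOCK_SIZE, MEM_COMMIT);          // back part of it later

   The commit must target addresses inside the previously reserved range;
   committing elsewhere is undefined (see the warning above). */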

/* Returns NULL on failure; errno set */
static void *
my_mmap (void *addr, W_ size, int operation)
{
    void *ret;

#if defined(darwin_HOST_OS)
    // Without MAP_FIXED, Apple's mmap ignores addr.
    // With MAP_FIXED, it overwrites already mapped regions, which is
    // never what we want.
    // mmap(0, ... MAP_FIXED ...) is worst of all: It unmaps the program text
    // and replaces it with zeroes, causing instant death.
    // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
    // Let's just use the underlying Mach Microkernel calls directly,
    // they're much nicer.

    kern_return_t err = 0;
    ret = addr;

    if(operation & MEM_RESERVE)
    {
        if(addr) // try to allocate at address
            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
                              size, false);
        if(!addr || err) // try to allocate anywhere
            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
                              size, true);
    }

    if(err) {
        // We don't know what the error codes mean exactly; assume it's
        // not our problem, though.
        errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
                   size);
        stg_exit(EXIT_FAILURE);
    }

    if(operation & MEM_COMMIT) {
        vm_protect(mach_task_self(), (vm_address_t)ret, size, false,
                   VM_PROT_READ|VM_PROT_WRITE);
    }

#else

    int prot, flags;
    if (operation & MEM_COMMIT)
        prot = PROT_READ | PROT_WRITE;
    else
        prot = PROT_NONE;
    if (operation == MEM_RESERVE)
# if defined(MAP_GUARD)
        // Provided by FreeBSD
        flags = MAP_GUARD;
# elif defined(MAP_NORESERVE)
        flags = MAP_NORESERVE;
# else
#  if defined(USE_LARGE_ADDRESS_SPACE)
#   error USE_LARGE_ADDRESS_SPACE needs MAP_NORESERVE
#  endif
        errorBelch("my_mmap(,,MEM_RESERVE) not supported on this platform");
# endif
    else if (operation == MEM_COMMIT)
        flags = MAP_FIXED;
    else
        flags = 0;

#if defined(hpux_HOST_OS)
    ret = mmap(addr, size, prot, flags | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#elif defined(linux_HOST_OS)
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
    if (ret == (void *)-1 && errno == EPERM) {
        // Linux may return EPERM if it tried to give us
        // a chunk of address space below mmap_min_addr.
        // See Trac #7500.
        if (addr != 0 && (operation & MEM_RESERVE)) {
            // Try again with no hint address.
            // It's not clear that this can ever actually help,
            // but since our alternative is to abort, we may as well try.
            ret = mmap(0, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
        }
        if (ret == (void *)-1 && errno == EPERM) {
            // Linux is not willing to give us any mapping,
            // so treat this as an out-of-memory condition
            // (really out of virtual address space).
            errno = ENOMEM;
        }
    }

    if (ret != (void *)-1) {
        if (operation & MEM_COMMIT) {
            madvise(ret, size, MADV_WILLNEED);
#if defined(MADV_DODUMP)
            madvise(ret, size, MADV_DODUMP);
#endif
        } else {
            madvise(ret, size, MADV_DONTNEED);
#if defined(MADV_DONTDUMP)
            madvise(ret, size, MADV_DONTDUMP);
#endif
        }
    }

#else
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
#endif
#endif

    if (ret == (void *)-1) {
        return NULL;
    }

    return ret;
}

/* Variant of my_mmap which aborts in the case of an error */
static void *
my_mmap_or_barf (void *addr, W_ size, int operation)
{
    void *ret = my_mmap(addr, size, operation);

    if (ret == NULL) {
        if (errno == ENOMEM ||
            (errno == EINVAL && sizeof(void*)==4 && size >= 0xc0000000)) {
            // If we request more than 3Gig, then we get EINVAL
            // instead of ENOMEM (at least on Linux).
            errorBelch("out of memory (requested %" FMT_Word " bytes)", size);
            stg_exit(EXIT_HEAPOVERFLOW);
        } else {
            barf("getMBlock: mmap: %s", strerror(errno));
        }
    }

    return ret;
}

// Implements the general case: allocate a chunk of memory of 'size'
// mblocks.

static void *
gen_map_mblocks (W_ size)
{
    int slop;
    StgWord8 *ret;

    // Try to map a larger block, and take the aligned portion from
    // it (unmap the rest).
    size += MBLOCK_SIZE;
    ret = my_mmap_or_barf(0, size, MEM_RESERVE_AND_COMMIT);

    // unmap the slop bits around the chunk we allocated
    slop = (W_)ret & MBLOCK_MASK;

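    // Layout of the over-sized mapping (illustrative diagram, not from the
    // original source; when slop == 0 the "head" below is a whole aligned
    // mblock, which we still unmap rather than salvage -- see the ToDo
    // further down):
    //
    //   ret        ret + (MBLOCK_SIZE - slop)        ret + size - slop
    //    |<- head ->|<---- n aligned megablocks ---->|<---- tail ---->|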
    if (munmap((void*)ret, MBLOCK_SIZE - slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }
    if (slop > 0 && munmap((void*)(ret+size-slop), slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }

    // ToDo: if we happened to get an aligned block, then don't
    // unmap the excess, just use it. For this to work, you
    // need to keep in mind the following:
    //     * Calling my_mmap() with an 'addr' arg pointing to
    //       already my_mmap()ed space is OK and won't fail.
    //     * If my_mmap() can't satisfy the request at the
    //       given 'next_request' address in getMBlocks(), you
    //       must unmap the extra mblock mmap()ed here (or simply
    //       satisfy yourself that the slop introduced isn't worth
    //       salvaging).
    //

    // next time, try after the block we just got.
    ret += MBLOCK_SIZE - slop;
    return ret;
}

void *
osGetMBlocks(uint32_t n)
{
    void *ret;
    W_ size = MBLOCK_SIZE * (W_)n;

    if (next_request == 0) {
        // use gen_map_mblocks the first time.
        ret = gen_map_mblocks(size);
    } else {
        ret = my_mmap_or_barf(next_request, size, MEM_RESERVE_AND_COMMIT);

        if (((W_)ret & MBLOCK_MASK) != 0) {
            // misaligned block!
#if 0 // defined(DEBUG)
            errorBelch("warning: getMBlock: misaligned block %p returned "
                       "when allocating %d megablock(s) at %p",
                       ret, n, next_request);
#endif

            // unmap this block...
            if (munmap(ret, size) == -1) {
                barf("getMBlock: munmap failed");
            }
            // and do it the hard way
            ret = gen_map_mblocks(size);
        }
    }

    // Next time, we'll try to allocate right after the block we just got.
    // ToDo: check that we haven't already grabbed the memory at next_request
    next_request = (char *)ret + size;

    return ret;
}

void osBindMBlocksToNode(
    void *addr STG_UNUSED,
    StgWord size STG_UNUSED,
    uint32_t node STG_UNUSED)
{
#if HAVE_LIBNUMA
    int ret;
    StgWord mask = 0;
    mask |= (StgWord)1 << node;  // widen before shifting so node >= 31 works
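    // For example (illustrative), node == 2 gives mask == 0b100, naming
    // node 2 as the only node mbind() may place these pages on.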
    if (RtsFlags.GcFlags.numa) {
        ret = mbind(addr, (unsigned long)size,
                    MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT);
        // paranoia: MPOL_BIND guarantees memory on the correct node;
        // MPOL_MF_STRICT will tell us if it didn't work. We might want to
        // relax these in due course, but I want to be sure it's doing what we
        // want first.
        if (ret != 0) {
            sysErrorBelch("mbind");
            stg_exit(EXIT_FAILURE);
        }
    }
#endif
}


void osFreeMBlocks(void *addr, uint32_t n)
{
    munmap(addr, n * MBLOCK_SIZE);
}

void osReleaseFreeMemory(void) {
    /* Nothing to do on POSIX */
}

void osFreeAllMBlocks(void)
{
    void *mblock;
    void *state;

    for (mblock = getFirstMBlock(&state);
         mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        munmap(mblock, MBLOCK_SIZE);
    }
}

size_t getPageSize (void)
{
    static size_t pageSize = 0;

    if (pageSize == 0) {
        long ret;
        ret = sysconf(_SC_PAGESIZE);
        if (ret == -1) {
            barf("getPageSize: cannot get page size");
        }
        pageSize = ret;
    }

    return pageSize;
}

/* Returns 0 if physical memory size cannot be identified */
StgWord64 getPhysicalMemorySize (void)
{
    static StgWord64 physMemSize = 0;
    if (!physMemSize) {
#if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
        /* Darwin doesn't support _SC_PHYS_PAGES, but it does support
           getting the raw memory size in bytes through
           sysctlbyname("hw.memsize"); */
        size_t len = sizeof(physMemSize);
        int ret = -1;

        /* Note hw.memsize is in bytes, so no need to multiply by page size. */
        ret = sysctlbyname("hw.memsize", &physMemSize, &len, NULL, 0);
        if (ret == -1) {
            physMemSize = 0;
            return 0;
        }
#else
        /* We'll politely assume we have a system supporting _SC_PHYS_PAGES
         * otherwise. */
        W_ pageSize = getPageSize();
        long ret = sysconf(_SC_PHYS_PAGES);
        if (ret == -1) {
#if defined(DEBUG)
            errorBelch("warning: getPhysicalMemorySize: cannot get "
                       "physical memory size");
#endif
            return 0;
        }
        physMemSize = ret * pageSize;
#endif /* darwin_HOST_OS */
    }
    return physMemSize;
}

void setExecutable (void *p, W_ len, bool exec)
{
    StgWord pageSize = getPageSize();

    /* malloced memory isn't executable by default on OpenBSD */
    StgWord mask             = ~(pageSize - 1);
    StgWord startOfFirstPage = ((StgWord)p          ) & mask;
    StgWord startOfLastPage  = ((StgWord)p + len - 1) & mask;
    StgWord size             = startOfLastPage - startOfFirstPage + pageSize;
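    /* Worked example (illustrative): with pageSize = 0x1000, p = 0x402513
       and len = 0x100, mask = ~0xFFF, so startOfFirstPage and
       startOfLastPage both come out as 0x402000 and size = 0x1000: we
       mprotect exactly the one page containing the range. */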
    if (mprotect((void*)startOfFirstPage, (size_t)size,
                 (exec ? PROT_EXEC : 0) | PROT_READ | PROT_WRITE) != 0) {
        barf("setExecutable: failed to protect 0x%p\n", p);
    }
}

#if defined(USE_LARGE_ADDRESS_SPACE)

static void *
osTryReserveHeapMemory (W_ len, void *hint)
{
    void *base, *top;
    void *start, *end;

    /* We try to allocate len + MBLOCK_SIZE,
       because we need memory which is MBLOCK_SIZE aligned,
       and then we discard what we don't need */
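
    /* Illustrative numbers (assuming MBLOCK_SIZE = 1 MiB): asking for
       len = 8 MiB maps 9 MiB; if mmap returns base = 0x4200040000, we keep
       [0x4200100000, 0x4200900000) and munmap the slop on either side. */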

    base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
    if (base == NULL)
        return NULL;

    top = (void*)((W_)base + len + MBLOCK_SIZE);

    if (((W_)base & MBLOCK_MASK) != 0) {
        start = MBLOCK_ROUND_UP(base);
        end   = MBLOCK_ROUND_DOWN(top);
        ASSERT(((W_)end - (W_)start) == len);

        if (munmap(base, (W_)start-(W_)base) < 0) {
            sysErrorBelch("unable to release slop before heap");
        }
        if (munmap(end, (W_)top-(W_)end) < 0) {
            sysErrorBelch("unable to release slop after heap");
        }
    } else {
        start = base;
    }

    return start;
}

void *osReserveHeapMemory(void *startAddressPtr, W_ *len)
{
    int attempt;
    void *at;

    /* We want to ensure the heap starts at least 8 GB inside the address
       space, since we want to reserve the address space below that address
       for code. Specifically, we need to make sure that any dynamically
       loaded code will be close enough to the original code for short
       relocations to work. This is in particular important on Darwin/Mach-O,
       because object files not compiled as shared libraries are position
       independent but cannot be loaded above 4GB.

       We do so by passing a hint to mmap, and we verify that the OS
       satisfied our hint. We loop, shifting our hint by one BLOCK_SIZE
       every time, in case there is already something allocated there.

       Some systems impose resource limits restricting the amount of memory
       we can request (see, e.g., #10877). If mmap fails we reduce our
       allocation request and try again. If our request size gets absurdly
       small we simply give up. */

    W_ minimumAddress = (W_)8 * (1 << 30);
    // We don't use minimumAddress (0x200000000) as the default because we
    // know it can clash with third-party libraries. See ticket #12573.
    W_ startAddress = 0x4200000000;
    if (startAddressPtr) {
        startAddress = (W_)startAddressPtr;
    }
    if (startAddress < minimumAddress) {
        errorBelch(
            "Provided heap start address %p is lower than minimum address %p",
            (void*)startAddress, (void*)minimumAddress);
    }

#if defined(HAVE_SYS_RESOURCE_H) && defined(HAVE_SYS_TIME_H)
    struct rlimit limit;
    if (!getrlimit(RLIMIT_AS, &limit)
        && limit.rlim_cur > 0
        && *len > limit.rlim_cur) {
        *len = limit.rlim_cur;
    }
#endif

    attempt = 0;
    while (1) {
        if (*len < MBLOCK_SIZE) {
            // Give up if the system won't even give us one megablock
            // (MBLOCK_SIZE bytes) of heap.
            barf("osReserveHeapMemory: Failed to allocate heap storage");
        }

        void *hint = (void*)(startAddress + attempt * BLOCK_SIZE);
        at = osTryReserveHeapMemory(*len, hint);
        if (at == NULL) {
            // This means that mmap failed, which we take to mean that we
            // asked for too much memory. This can happen due to POSIX
            // resource limits. In this case we reduce our allocation
            // request by a fraction of the current size and try again.
            //
            // Note that previously we would instead halve the request size;
            // however, that meant that significant amounts of memory could
            // be wasted (e.g. imagine a machine with 512GB of physical
            // memory but a 511GB ulimit). See #14492.
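            // Worked example (illustrative): with a 512GB request and a
            // 511GB ulimit, the first retry asks for 512 - 512/8 = 448GB
            // and succeeds, whereas halving would have fallen to 256GB.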
            *len -= *len / 8;
        } else if ((W_)at >= minimumAddress) {
            // Success! We were given a block of memory starting above the
            // 8 GB mark, which is what we were looking for.
            break;
        } else {
            // We got address space but it wasn't above the 8GB mark.
            // Try again.
            if (munmap(at, *len) < 0) {
                sysErrorBelch("unable to release reserved heap");
            }
        }
        attempt++;
    }

    return at;
}

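/* Illustrative lifecycle under USE_LARGE_ADDRESS_SPACE (a sketch of how
   these functions compose, not code from the RTS): the heap's address
   space is reserved once with osReserveHeapMemory(), pieces are backed on
   demand with osCommitMemory(), returned to the OS with osDecommitMemory(),
   and the whole reservation is dropped by osReleaseHeapMemory(). */
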
void osCommitMemory(void *at, W_ size)
{
    void *r = my_mmap(at, size, MEM_COMMIT);
    if (r == NULL) {
        barf("Unable to commit %" FMT_Word " bytes of memory", size);
    }
}

void osDecommitMemory(void *at, W_ size)
{
    int r;

    // First make the memory inaccessible (so that we get a segfault
    // on the next attempt to touch it).
    // We only do this in DEBUG because it forces the OS to remove
    // all MMU entries for this page range, and there is no reason
    // to do so unless there is memory pressure.
#if defined(DEBUG)
    r = mprotect(at, size, PROT_NONE);
    if(r < 0)
        sysErrorBelch("unable to make released memory inaccessible");
#endif

#if defined(MADV_FREE)
    // Try MADV_FREE first: FreeBSD has both flags, and there MADV_DONTNEED
    // just swaps memory out. Linux >= 4.5 has both DONTNEED and FREE; either
    // will work, as they both allow the system to free anonymous pages.
    // It is important that we try both methods, as the kernel we were
    // built on may differ from the kernel we are now running on.
    r = madvise(at, size, MADV_FREE);
    if(r < 0) {
        if (errno == EINVAL) {
            // Perhaps the system doesn't support MADV_FREE; fall-through and
            // try MADV_DONTNEED.
        } else {
            sysErrorBelch("unable to decommit memory");
        }
    } else {
        return;
    }
#endif

    r = madvise(at, size, MADV_DONTNEED);
    if(r < 0)
        sysErrorBelch("unable to decommit memory");
}

void osReleaseHeapMemory(void)
{
    int r;

    r = munmap((void*)mblock_address_space.begin,
               mblock_address_space.end - mblock_address_space.begin);
    if(r < 0)
        sysErrorBelch("unable to release address space");
}

#endif

bool osBuiltWithNumaSupport(void)
{
#if HAVE_LIBNUMA
    return true;
#else
    return false;
#endif
}

bool osNumaAvailable(void)
{
#if HAVE_LIBNUMA
    return (numa_available() != -1);
#else
    return false;
#endif
}

uint32_t osNumaNodes(void)
{
#if HAVE_LIBNUMA
    return numa_num_configured_nodes();
#else
    return 1;
#endif
}

uint64_t osNumaMask(void)
{
#if HAVE_LIBNUMA
    struct bitmask *mask;
    mask = numa_get_mems_allowed();
    if (osNumaNodes() > sizeof(StgWord)*8) {
        barf("osNumaMask: too many NUMA nodes (%d)", osNumaNodes());
    }
    uint64_t r = mask->maskp[0];
    numa_bitmask_free(mask);
    return r;
#else
    return 1;
#endif
}