NUMA support
[ghc.git] / rts / posix / OSMem.c
/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2007
 *
 * OS-specific memory management
 *
 * ---------------------------------------------------------------------------*/

// This is non-POSIX-compliant.
// #include "PosixSource.h"

#include "Rts.h"

#include "RtsUtils.h"
#include "sm/OSMem.h"
#include "sm/HeapAlloc.h"

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_NUMA_H
#include <numa.h>
#endif
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif

#include <errno.h>

#if darwin_HOST_OS || ios_HOST_OS
#include <mach/mach.h>
#include <mach/vm_map.h>
#include <sys/sysctl.h>
#endif

static void *next_request = 0;

void osMemInit(void)
{
    next_request = (void *)RtsFlags.GcFlags.heapBase;
}


/* -----------------------------------------------------------------------------
   The mmap() method

   On Unix-like systems, we use mmap() to allocate our memory.  We
   want memory in chunks of MBLOCK_SIZE, aligned on an MBLOCK_SIZE
   boundary.  The mmap() interface doesn't give us this level of
   control, so we have to use some heuristics.

   In the general case, if we want a block of n megablocks, then we
   allocate n+1 and trim off the slop from either side (using
   munmap()) to get an aligned chunk of size n.  The next time,
   however, we try to allocate directly after the previously allocated
   chunk, on the grounds that it is aligned and likely to be free.
   If it turns out that we were wrong, we have to munmap() and try
   again using the general method.

   Note on posix_memalign(): this interface is available on recent
   systems and appears to provide exactly what we want.  However, it
   turns out not to be as good as our mmap() implementation, because
   it wastes extra space (using double the address space, in a test on
   x86_64/Linux).  The problem seems to be that posix_memalign()
   returns memory that can be free()'d, so the library must store
   extra information along with the allocated block, thus messing up
   the alignment.  Hence, we don't use posix_memalign() for now.

   -------------------------------------------------------------------------- */
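
/* A minimal sketch (kept out of the build with #if 0) of the
   over-allocate-and-trim scheme described above.  It assumes
   MBLOCK_SIZE is a power of two with MBLOCK_MASK == MBLOCK_SIZE-1,
   as in the RTS; the real logic lives in gen_map_mblocks() below. */
#if 0
static void *sketch_aligned_alloc (W_ n_mblocks)
{
    W_ size = n_mblocks * MBLOCK_SIZE;
    // Over-allocate by one megablock so an aligned chunk of 'size'
    // bytes is guaranteed to exist somewhere inside the mapping.
    StgWord8 *base = mmap(NULL, size + MBLOCK_SIZE,
                          PROT_READ | PROT_WRITE,
                          MAP_ANON | MAP_PRIVATE, -1, 0);
    if (base == MAP_FAILED) return NULL;

    W_ slop = (W_)base & MBLOCK_MASK;      // how misaligned 'base' is
    StgWord8 *start = base + (slop ? MBLOCK_SIZE - slop : 0);
    StgWord8 *end   = start + size;
    StgWord8 *top   = base + size + MBLOCK_SIZE;

    if (start != base) munmap(base, start - base);  // trim the front slop
    if (top != end)    munmap(end, top - end);      // trim the back slop
    return start;
}
#endif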

/*
   A wrapper around mmap(), to abstract away from OS differences in
   the mmap() interface.

   It supports the following operations:
   - reserve: find a new chunk of available address space, and make it so
     that we own it (no other library will get it), but don't actually
     allocate memory for it
     the addr is a hint for where to place the memory (and most
     of the time the OS happily ignores it!)
   - commit: given a chunk of address space that we know we own, make sure
     there is some memory backing it
     the addr is not a hint, it must point into previously reserved
     address space, or bad things happen
   - reserve&commit: do both at the same time

   The naming is taken from the Win32 API (VirtualAlloc), which has done the
   same thing forever, while support for this on Unix systems was only added
   recently and is hidden in the POSIX portability mess.
   It is confusing because to get the reserve behaviour we need MAP_NORESERVE
   (which tells the kernel not to allocate backing space), but heh...
*/
enum
{
    MEM_RESERVE = 1,
    MEM_COMMIT = 2,
    MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
};

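/* Illustrative only (#if 0): how the two-step protocol composes.  First
   reserve a chunk of address space, then commit just the prefix you
   actually need; the rest stays reserved but unbacked.  The function
   name and sizes here are hypothetical. */
#if 0
static void sketch_reserve_then_commit (void)
{
    W_ len = 64 * MBLOCK_SIZE;                       // arbitrary example size
    void *space = my_mmap(NULL, len, MEM_RESERVE);   // address space only
    if (space == NULL) barf("reserve failed: %s", strerror(errno));
    // Back the first megablock with real memory; 'space' points into
    // the reservation, so this is a valid commit.
    if (my_mmap(space, MBLOCK_SIZE, MEM_COMMIT) == NULL)
        barf("commit failed: %s", strerror(errno));
}
#endif
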
/* Returns NULL on failure; errno set */
static void *
my_mmap (void *addr, W_ size, int operation)
{
    void *ret;

#if darwin_HOST_OS
    // Without MAP_FIXED, Apple's mmap ignores addr.
    // With MAP_FIXED, it overwrites already mapped regions, which is
    // dangerous.  mmap(0, ... MAP_FIXED ...) is worst of all: it unmaps
    // the program text and replaces it with zeroes, causing instant death.
    // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
    // Let's just use the underlying Mach microkernel calls directly;
    // they're much nicer.

    kern_return_t err = 0;
    ret = addr;

    if(operation & MEM_RESERVE)
    {
        if(addr) // try to allocate at address
            err = vm_allocate(mach_task_self(), (vm_address_t*) &ret,
                              size, FALSE);
        if(!addr || err) // try to allocate anywhere
            err = vm_allocate(mach_task_self(), (vm_address_t*) &ret,
                              size, TRUE);
    }

    if(err) {
        // don't know what the error codes mean exactly, assume it's
        // not our problem though.
        errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
                   size);
        stg_exit(EXIT_FAILURE);
    }

    if(operation & MEM_COMMIT) {
        vm_protect(mach_task_self(), (vm_address_t)ret, size, FALSE,
                   VM_PROT_READ|VM_PROT_WRITE);
    }

#else

    int prot, flags;
    if (operation & MEM_COMMIT)
        prot = PROT_READ | PROT_WRITE;
    else
        prot = PROT_NONE;
    if (operation == MEM_RESERVE)
# if defined(MAP_NORESERVE)
        flags = MAP_NORESERVE;
# else
#  ifdef USE_LARGE_ADDRESS_SPACE
#  error USE_LARGE_ADDRESS_SPACE needs MAP_NORESERVE
#  endif
        errorBelch("my_mmap(,,MEM_RESERVE) not supported on this platform");
# endif
    else if (operation == MEM_COMMIT)
        flags = MAP_FIXED;
    else
        flags = 0;

#if hpux_HOST_OS
    ret = mmap(addr, size, prot, flags | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#elif linux_HOST_OS
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
    if (ret == (void *)-1 && errno == EPERM) {
        // Linux may return EPERM if it tried to give us
        // a chunk of address space below mmap_min_addr;
        // see Trac #7500.
        if (addr != 0 && (operation & MEM_RESERVE)) {
            // Try again with no hint address.
            // It's not clear that this can ever actually help,
            // but since our alternative is to abort, we may as well try.
            ret = mmap(0, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
        }
        if (ret == (void *)-1 && errno == EPERM) {
            // Linux is not willing to give us any mapping,
            // so treat this as an out-of-memory condition
            // (really out of virtual address space).
            errno = ENOMEM;
        }
    }
#else
    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
#endif
#endif

    if (ret == (void *)-1) {
        return NULL;
    }

    return ret;
}

/* Variant of my_mmap which aborts in the case of an error */
static void *
my_mmap_or_barf (void *addr, W_ size, int operation)
{
    void *ret = my_mmap(addr, size, operation);

    if (ret == NULL) {
        if (errno == ENOMEM ||
            (errno == EINVAL && sizeof(void*)==4 && size >= 0xc0000000)) {
            // If we request more than 3Gig, then we get EINVAL
            // instead of ENOMEM (at least on Linux).
            errorBelch("out of memory (requested %" FMT_Word " bytes)", size);
            stg_exit(EXIT_HEAPOVERFLOW);
        } else {
            barf("getMBlock: mmap: %s", strerror(errno));
        }
    }

    return ret;
}

// Implements the general case: allocate a chunk of memory of 'size'
// mblocks.

static void *
gen_map_mblocks (W_ size)
{
    W_ slop;   // W_ rather than int: it holds an address-derived value
    StgWord8 *ret;

    // Try to map a larger block, and take the aligned portion from
    // it (unmap the rest).
    size += MBLOCK_SIZE;
    ret = my_mmap_or_barf(0, size, MEM_RESERVE_AND_COMMIT);

    // unmap the slop bits around the chunk we allocated
    slop = (W_)ret & MBLOCK_MASK;

    if (munmap((void*)ret, MBLOCK_SIZE - slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }
    if (slop > 0 && munmap((void*)(ret+size-slop), slop) == -1) {
        barf("gen_map_mblocks: munmap failed");
    }

    // ToDo: if we happened to get an aligned block, then don't
    // unmap the excess, just use it.  For this to work, you
    // need to keep in mind the following:
    //  * Calling my_mmap() with an 'addr' arg pointing to
    //    already my_mmap()ed space is OK and won't fail.
    //  * If my_mmap() can't satisfy the request at the
    //    given 'next_request' address in getMBlocks(), you
    //    must unmap the extra mblock mmap()ed here (or simply
    //    satisfy yourself that the slop introduced isn't worth
    //    salvaging).
    //

    // next time, try after the block we just got.
    ret += MBLOCK_SIZE - slop;
    return ret;
}
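
/* Worked example of the slop arithmetic above (illustrative numbers,
   assuming MBLOCK_SIZE = 1MB = 0x100000):

     request       : 2 mblocks -> over-allocate size = 0x300000 (3MB)
     mmap returns  : ret  = 0x2a40000, slop = ret & MBLOCK_MASK = 0x40000
     front trim    : munmap(0x2a40000, 0x100000 - 0x40000 = 0xc0000)
     aligned chunk : 0x2b00000 .. 0x2d00000 (exactly 2 aligned mblocks)
     back trim     : munmap(ret+size-slop = 0x2d00000, 0x40000)

   If mmap happens to return an aligned address (slop == 0), the whole
   extra megablock is trimmed from the front instead. */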

void *
osGetMBlocks(uint32_t n)
{
    void *ret;
    W_ size = MBLOCK_SIZE * (W_)n;

    if (next_request == 0) {
        // use gen_map_mblocks the first time.
        ret = gen_map_mblocks(size);
    } else {
        ret = my_mmap_or_barf(next_request, size, MEM_RESERVE_AND_COMMIT);

        if (((W_)ret & MBLOCK_MASK) != 0) {
            // misaligned block!
#if 0 // defined(DEBUG)
            errorBelch("warning: getMBlock: misaligned block %p returned "
                       "when allocating %d megablock(s) at %p",
                       ret, n, next_request);
#endif

            // unmap this block...
            if (munmap(ret, size) == -1) {
                barf("getMBlock: munmap failed");
            }
            // and do it the hard way
            ret = gen_map_mblocks(size);
        }
    }

    // Next time, we'll try to allocate right after the block we just got.
    // ToDo: check that we haven't already grabbed the memory at next_request
    next_request = (char *)ret + size;

    return ret;
}

void osBindMBlocksToNode(
    void *addr STG_UNUSED,
    StgWord size STG_UNUSED,
    uint32_t node STG_UNUSED)
{
#ifdef HAVE_NUMAIF_H
    int ret;
    StgWord mask = 0;
    mask |= (StgWord)1 << node;   // StgWord-width shift: node may exceed 31
    if (RtsFlags.GcFlags.numa) {
        ret = mbind(addr, (unsigned long)size,
                    MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT);
        // paranoia: MPOL_BIND guarantees memory on the correct node;
        // MPOL_MF_STRICT will tell us if it didn't work.  We might want to
        // relax these in due course, but I want to be sure it's doing what we
        // want first.
        if (ret != 0) {
            sysErrorBelch("mbind");
            stg_exit(EXIT_FAILURE);
        }
    }
#endif
}
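
/* The node mask is a bitmap with bit i set when NUMA node i is allowed.
   Worked example (illustrative): binding to node 2 gives
   mask = (StgWord)1 << 2 = 0b100, so mbind() with MPOL_BIND restricts the
   pages of [addr, addr+size) to node 2, and MPOL_MF_STRICT makes it fail
   loudly rather than silently falling back to another node. */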

void osFreeMBlocks(void *addr, uint32_t n)
{
    munmap(addr, n * MBLOCK_SIZE);
}

void osReleaseFreeMemory(void) {
    /* Nothing to do on POSIX */
}

void osFreeAllMBlocks(void)
{
    void *mblock;
    void *state;

    for (mblock = getFirstMBlock(&state);
         mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        munmap(mblock, MBLOCK_SIZE);
    }
}

size_t getPageSize (void)
{
    static size_t pageSize = 0;

    if (pageSize == 0) {
        long ret;
        ret = sysconf(_SC_PAGESIZE);
        if (ret == -1) {
            barf("getPageSize: cannot get page size");
        }
        pageSize = ret;
    }

    return pageSize;
}

365
366 /* Returns 0 if physical memory size cannot be identified */
367 StgWord64 getPhysicalMemorySize (void)
368 {
369 static StgWord64 physMemSize = 0;
370 if (!physMemSize) {
371 #if defined(darwin_HOST_OS) || defined(ios_HOST_OS)
372 /* So, darwin doesn't support _SC_PHYS_PAGES, but it does
373 support getting the raw memory size in bytes through
374 sysctlbyname(hw.memsize); */
375 size_t len = sizeof(physMemSize);
376 int ret = -1;
377
378 /* Note hw.memsize is in bytes, so no need to multiply by page size. */
379 ret = sysctlbyname("hw.memsize", &physMemSize, &len, NULL, 0);
380 if (ret == -1) {
381 physMemSize = 0;
382 return 0;
383 }
384 #else
385 /* We'll politely assume we have a system supporting _SC_PHYS_PAGES
386 * otherwise. */
387 W_ pageSize = getPageSize();
388 long ret = sysconf(_SC_PHYS_PAGES);
389 if (ret == -1) {
390 #if defined(DEBUG)
391 errorBelch("warning: getPhysicalMemorySize: cannot get "
392 "physical memory size");
393 #endif
394 return 0;
395 }
396 physMemSize = ret * pageSize;
397 #endif /* darwin_HOST_OS */
398 }
399 return physMemSize;
400 }
401
void setExecutable (void *p, W_ len, rtsBool exec)
{
    StgWord pageSize = getPageSize();

    /* malloced memory isn't executable by default on OpenBSD */
    StgWord mask             = ~(pageSize - 1);
    StgWord startOfFirstPage = ((StgWord)p          ) & mask;
    StgWord startOfLastPage  = ((StgWord)p + len - 1) & mask;
    StgWord size             = startOfLastPage - startOfFirstPage + pageSize;
    if (mprotect((void*)startOfFirstPage, (size_t)size,
                 (exec ? PROT_EXEC : 0) | PROT_READ | PROT_WRITE) != 0) {
        barf("setExecutable: failed to protect 0x%p\n", p);
    }
}
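
/* Worked example of the page rounding above (illustrative, pageSize = 0x1000):

     p = 0x5014, len = 0x2000
     mask             = ~0xfff
     startOfFirstPage = 0x5014 & mask = 0x5000
     startOfLastPage  = 0x7013 & mask = 0x7000
     size             = 0x7000 - 0x5000 + 0x1000 = 0x3000 (three whole pages)

   so mprotect() covers every page that [p, p+len) touches. */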

#ifdef USE_LARGE_ADDRESS_SPACE

static void *
osTryReserveHeapMemory (W_ len, void *hint)
{
    void *base, *top;
    void *start, *end;

    /* We try to allocate len + MBLOCK_SIZE,
       because we need memory which is MBLOCK_SIZE aligned,
       and then we discard what we don't need */

    base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
    if (base == NULL)
        return NULL;

    top = (void*)((W_)base + len + MBLOCK_SIZE);

    if (((W_)base & MBLOCK_MASK) != 0) {
        start = MBLOCK_ROUND_UP(base);
        end = MBLOCK_ROUND_DOWN(top);
        ASSERT(((W_)end - (W_)start) == len);

        if (munmap(base, (W_)start-(W_)base) < 0) {
            sysErrorBelch("unable to release slop before heap");
        }
        if (munmap(end, (W_)top-(W_)end) < 0) {
            sysErrorBelch("unable to release slop after heap");
        }
    } else {
        start = base;
    }

    return start;
}

void *osReserveHeapMemory(W_ *len)
{
    int attempt;
    void *at;

    /* We want to ensure the heap starts at least 8 GB inside the address
       space, to make sure that any dynamically loaded code will be close
       enough to the original code for short relocations to work.  This is
       particularly important on Darwin/Mach-O, because object files not
       compiled as shared libraries are position independent but cannot be
       loaded above 4GB.

       We do so with a hint to mmap, and we verify that the OS satisfied our
       hint.  We loop, shifting our hint by 1 BLOCK_SIZE every time, in case
       there is already something allocated there.

       Some systems impose resource limits restricting the amount of memory
       we can request (see, e.g. #10877).  If mmap fails we halve our
       allocation request and try again.  If our request size gets absurdly
       small we simply give up.
    */

    attempt = 0;
    while (1) {
        if (*len < MBLOCK_SIZE) {
            // Give up if the system won't even give us one megablock's
            // worth of heap.
            barf("osReserveHeapMemory: Failed to allocate heap storage");
        }

        void *hint = (void*)((W_)8 * (1 << 30) + attempt * BLOCK_SIZE);
        at = osTryReserveHeapMemory(*len, hint);
        if (at == NULL) {
            // This means that mmap failed, which we take to mean that we
            // asked for too much memory.  This can happen due to POSIX
            // resource limits.  In this case we reduce our allocation
            // request by a factor of two and try again.
            *len /= 2;
        } else if ((W_)at >= ((W_)8 * (1 << 30))) {
            // Success! We were given a block of memory starting above the
            // 8 GB mark, which is what we were looking for.
            break;
        } else {
            // We got address space but it wasn't above the 8GB mark.
            // Try again.
            if (munmap(at, *len) < 0) {
                sysErrorBelch("unable to release reserved heap");
            }
        }
        // Shift the hint for the next attempt, as described in the
        // comment above; otherwise we would retry with the same hint.
        attempt++;
    }

    return at;
}

void osCommitMemory(void *at, W_ size)
{
    my_mmap(at, size, MEM_COMMIT);
}

void osDecommitMemory(void *at, W_ size)
{
    int r;

    // First make the memory inaccessible (so that we get a segfault
    // at the next attempt to touch it).
    // We only do this in DEBUG because it forces the OS to remove
    // all MMU entries for this page range, and there is no reason
    // to do so unless there is memory pressure.
#ifdef DEBUG
    r = mprotect(at, size, PROT_NONE);
    if(r < 0)
        sysErrorBelch("unable to make released memory inaccessible");
#endif

#ifdef MADV_FREE
    // Try MADV_FREE first; FreeBSD has both, and there MADV_DONTNEED
    // just swaps the memory out.
    r = madvise(at, size, MADV_FREE);
#else
    r = madvise(at, size, MADV_DONTNEED);
#endif
    if(r < 0)
        sysErrorBelch("unable to decommit memory");
}
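
/* Illustrative only (#if 0): the commit/decommit life cycle as the block
   allocator drives it.  'slab' and the sizes here are hypothetical. */
#if 0
static void sketch_commit_cycle (void *slab)
{
    osCommitMemory(slab, MBLOCK_SIZE);    // back one mblock with real pages
    // ... use the memory ...
    osDecommitMemory(slab, MBLOCK_SIZE);  // return the pages to the OS,
                                          // keeping the address range reserved
    osCommitMemory(slab, MBLOCK_SIZE);    // commit again before reuse
}
#endif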

void osReleaseHeapMemory(void)
{
    int r;

    r = munmap((void*)mblock_address_space.begin,
               mblock_address_space.end - mblock_address_space.begin);
    if(r < 0)
        sysErrorBelch("unable to release address space");
}

rtsBool osNumaAvailable(void)
{
#ifdef HAVE_NUMA_H
    return (numa_available() != -1);
#else
    return rtsFalse;
#endif
}

uint32_t osNumaNodes(void)
{
#ifdef HAVE_NUMA_H
    return numa_num_configured_nodes();
#else
    return 1;
#endif
}

StgWord osNumaMask(void)
{
#ifdef HAVE_NUMA_H
    struct bitmask *mask;
    mask = numa_get_mems_allowed();
    if (mask->size > sizeof(StgWord)*8) {
        barf("Too many NUMA nodes");
    }
    return mask->maskp[0];
#else
    return 1;
#endif
}
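
/* Worked example (illustrative): on a machine where nodes 0 and 2 are
   allowed, numa_get_mems_allowed() yields a bitmap with bits 0 and 2 set,
   so the returned mask is 0b101 = 5.  The non-NUMA fallback of 1 (= 0b1)
   means "only node 0", matching osNumaNodes() returning 1. */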

#endif