includes/stg/SMP.h
/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code assumes
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure() below)
 * and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);

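/*
 * Illustrative sketch (not part of this header): one way xchg() can be used
 * to build a simple test-and-set spin lock.  The names spin_lock() and
 * spin_unlock() are hypothetical and exist only for exposition; the real
 * closure-locking code lives in the RTS proper.
 *
 *   static void spin_lock(StgPtr lock)      // *lock: 0 = free, 1 = held
 *   {
 *       while (xchg(lock, 1) != 0) {        // atomically claim the lock
 *           busy_wait_nop();                // spin politely until it frees up
 *       }
 *   }
 *
 *   static void spin_unlock(StgPtr lock)
 *   {
 *       write_barrier();                    // order prior stores before...
 *       *lock = 0;                          // ...the releasing store
 *   }
 */
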
/*
 * Compare-and-swap.  Atomically does this:
 *
 *   cas(p,o,n) {
 *       r = *p;
 *       if (r == o) { *p = n };
 *       return r;
 *   }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);

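/*
 * Illustrative sketch (not part of this header): the usual retry loop around
 * cas().  atomic_max() is a hypothetical helper that atomically raises *p to
 * at least 'val'.
 *
 *   static StgWord atomic_max(StgVolatilePtr p, StgWord val)
 *   {
 *       StgWord old;
 *       do {
 *           old = *p;
 *           if (old >= val) return old;     // already big enough, nothing to do
 *       } while (cas(p, old, val) != old);  // lost a race: reread and retry
 *       return val;
 *   }
 */
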
/*
 * Atomic addition by the provided quantity
 *
 *   atomic_inc(p, n) {
 *       return ((*p) += n);
 *   }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);


/*
 * Atomic decrement
 *
 *   atomic_dec(p) {
 *       return --(*p);
 *   }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

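/*
 * Illustrative sketch (not part of this header): atomic_inc() and
 * atomic_dec() driving a simple reference count.  The type and function
 * names are hypothetical and exist only for exposition.
 *
 *   typedef struct { volatile StgWord refs; } Shared;  // refs starts at 1
 *
 *   static void shared_retain(Shared *s)
 *   {
 *       atomic_inc(&s->refs, 1);            // one more owner
 *   }
 *
 *   static void shared_release(Shared *s)
 *   {
 *       if (atomic_dec(&s->refs) == 0) {    // we were the last owner
 *           stgFree(s);                     // assumed allocator; any matching
 *       }                                   // free routine would do
 *   }
 */
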
/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);

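/*
 * Illustrative sketch (not part of this header): the publish/consume pattern
 * these barriers exist for.  'payload' and 'ready' are hypothetical shared
 * StgWords, both initially zero; use() is likewise hypothetical.
 *
 *   // Producer:
 *   payload = 42;               // 1. write the data
 *   write_barrier();            // 2. data must be visible before the flag
 *   ready = 1;                  // 3. publish
 *
 *   // Consumer:
 *   while (VOLATILE_LOAD(&ready) == 0) {
 *       busy_wait_nop();        // spin until the flag appears
 *   }
 *   load_load_barrier();        // flag read must precede the data read
 *   use(payload);               // now guaranteed to observe 42
 *
 * Without the write_barrier() the store to 'ready' may overtake the store to
 * 'payload'; without the load_load_barrier() the consumer may read a stale
 * 'payload' even after seeing 'ready' set.
 */
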
/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord result;
#if defined(NOSMP)
    result = *p;
    *p = w;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        // NB: the xchg instruction is implicitly locked, so we do not
        // need a lock prefix here.
        "xchg %1,%0"
        :"+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ (
# if aix_HOST_OS
        /* IBM's assembler doesn't seem to support local labels, so we use
         * explicit relative numeric offsets to work around this limitation.
         */
        " lwarx %0, 0, %2\n"
        " stwcx. %1, 0, %2\n"
        " bne- $-8"
# else // aix_HOST_OS
        "1: lwarx %0, 0, %2\n"
        " stwcx. %1, 0, %2\n"
        " bne- 1b"
# endif
        :"=&r" (result)
        :"r" (w), "r" (p)
    );
#elif powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ (
        "1: ldarx %0, 0, %2\n"
        " stdcx. %1, 0, %2\n"
        " bne- 1b"
        :"=&r" (result)
        :"r" (w), "r" (p)
    );
#elif sparc_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        "swap %1,%0"
        : "+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    __asm__ __volatile__ ("swp %0, %1, [%2]"
        : "=&r" (result)
        : "r" (w), "r" (p) : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    // The swp instruction used in the pre-ARMv6 code above is deprecated
    // in ARMv6 and later.  ARM Ltd. *highly* recommends using the
    // ldrex/strex instruction pair for the same purpose; see the chapter
    // "Synchronization and semaphores" in the ARM Architecture Reference
    // Manual.
    StgWord tmp;
    __asm__ __volatile__ (
        "1: ldrex %0, [%3]\n"
        " strex %1, %2, [%3]\n"
        " teq %1, #1\n"
        " beq 1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        " dmb\n"
#endif
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#elif aarch64_HOST_ARCH
    StgWord tmp;
    __asm__ __volatile__ (
        "1: ldaxr %0, [%3]\n"
        " stlxr %w1, %2, [%3]\n"
        " cbnz %w1, 1b\n"
        " dmb sy\n"
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#else
#error xchg() unimplemented on this architecture
#endif
    return result;
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
#if defined(NOSMP)
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ (
        "lock\ncmpxchg %3,%1"
        :"=a"(o), "+m" (*(volatile unsigned int *)p)
        :"0" (o), "r" (n));
    return o;
#elif powerpc_HOST_ARCH
    StgWord result;
    __asm__ __volatile__ (
# if aix_HOST_OS
        /* IBM's assembler doesn't seem to support local labels, so we use
         * explicit relative numeric offsets to work around this limitation.
         */
        " lwarx %0, 0, %3\n"
        " cmpw %0, %1\n"
        " bne $+12\n"
        " stwcx. %2, 0, %3\n"
        " bne- $-16\n"
# else // aix_HOST_OS
        "1: lwarx %0, 0, %3\n"
        " cmpw %0, %1\n"
        " bne 2f\n"
        " stwcx. %2, 0, %3\n"
        " bne- 1b\n"
        "2:"
# endif // !aix_HOST_OS
        :"=&r" (result)
        :"r" (o), "r" (n), "r" (p)
        :"cc", "memory"
    );
    return result;
#elif powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    StgWord result;
    __asm__ __volatile__ (
        "1: ldarx %0, 0, %3\n"
        " cmpd %0, %1\n"
        " bne 2f\n"
        " stdcx. %2, 0, %3\n"
        " bne- 1b\n"
        "2:"
        :"=&r" (result)
        :"r" (o), "r" (n), "r" (p)
        :"cc", "memory"
    );
    return result;
#elif sparc_HOST_ARCH
    __asm__ __volatile__ (
        "cas [%1], %2, %0"
        : "+r" (n)
        : "r" (p), "r" (o)
        : "memory"
    );
    return n;
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord r;
    arm_atomic_spin_lock();
    r = *p;
    if (r == o) { *p = n; }
    arm_atomic_spin_unlock();
    return r;
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord result,tmp;

    __asm__ __volatile__(
        "1: ldrex %1, [%2]\n"
        " mov %0, #0\n"
        " teq %1, %3\n"
        " it eq\n"
        " strexeq %0, %4, [%2]\n"
        " teq %0, #1\n"
        " it eq\n"
        " beq 1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        " dmb\n"
#endif
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc","memory");

    return result;
#elif aarch64_HOST_ARCH
    // 'tmp' (%w0) only holds the store-exclusive status flag; the operand
    // numbering is kept consistent with the ARM version above.
    StgWord result,tmp;

    __asm__ __volatile__(
        "1: ldxr %1, [%2]\n"
        " mov %w0, #0\n"
        " cmp %1, %3\n"
        " b.ne 2f\n"
        " stxr %w0, %4, [%2]\n"
        " cbnz %w0, 1b\n"
        "2: dmb sy\n"
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc","memory");

    return result;
#else
#error cas() unimplemented on this architecture
#endif
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = incr;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
        "+r" (r), "+m" (*p):
    );
    return r + incr;
#else
    StgWord old, new_;
    do {
        old = *p;
        new_ = old + incr;
    } while (cas(p, old, new_) != old);
    return new_;
#endif
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = (StgWord)-1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
        "+r" (r), "+m" (*p):
    );
    return r-1;
#else
    StgWord old, new_;
    do {
        old = *p;
        new_ = old - 1;
    } while (cas(p, old, new_) != old);
    return new_;
#endif
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread.  In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif (arm_HOST_ARCH) || aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))

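/*
 * Illustrative sketch (not part of this header): without VOLATILE_LOAD the
 * compiler is free to hoist the load out of the loop and spin forever on a
 * stale value.  'flag' is a hypothetical StgWord written by another thread.
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {   // forces a fresh load each trip
 *       busy_wait_nop();                  // be kind to the sibling hw thread
 *   }
 *   load_load_barrier();                  // order later loads after the flag
 */
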
/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier     () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */