includes/stg/SMP.h
1 /* ----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 2005-2011
4 *
5 * Macros for multi-CPU support
6 *
7 * Do not #include this file directly: #include "Rts.h" instead.
8 *
9 * To understand the structure of the RTS headers, see the wiki:
10 * http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
11 *
12 * -------------------------------------------------------------------------- */
13
14 #ifndef SMP_H
15 #define SMP_H
16
17 #if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
18 void arm_atomic_spin_lock(void);
19 void arm_atomic_spin_unlock(void);
20 #endif
21
22 #if defined(THREADED_RTS)
23
24 /* ----------------------------------------------------------------------------
25 Atomic operations
26 ------------------------------------------------------------------------- */
27
28 #if !IN_STG_CODE || IN_STGCRUN
29 // We only want the barriers, e.g. write_barrier(), declared in .hc
30 // files. Defining the other inline functions here causes type
31 // mismatch errors from gcc, because the generated C code is assuming
32 // that there are no prototypes in scope.
33
34 /*
35 * The atomic exchange operation: xchg(p,w) exchanges the value
36 * pointed to by p with the value w, returning the old value.
37 *
38 * Used for locking closures during updates (see lockClosure() below)
39 * and the MVar primops.
40 */
41 EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
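/*
 * Editor's illustrative sketch, not part of the original header: a
 * minimal test-and-set spin lock built on xchg().  The helper names
 * spin_lock/spin_unlock and the lock word are assumptions made for
 * this example only.
 *
 *   void spin_lock(StgPtr lock)
 *   {
 *       // xchg() atomically stores 1 and returns the previous value;
 *       // non-zero means another CPU already holds the lock.
 *       while (xchg(lock, 1) != 0) {
 *           busy_wait_nop();     // spin hint, declared below
 *       }
 *   }
 *
 *   void spin_unlock(StgPtr lock)
 *   {
 *       write_barrier();         // publish protected stores first
 *       *lock = 0;
 *   }
 */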
42
43 /*
44 * Compare-and-swap. Atomically does this:
45 *
46 * cas(p,o,n) {
47 * r = *p;
48 * if (r == o) { *p = n };
49 * return r;
50 * }
51 */
52 EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
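/*
 * Editor's illustrative sketch, not part of the original header: the
 * standard cas() retry loop.  The hypothetical helper atomically
 * replaces *p with the maximum of its current value and "val"; the
 * name atomic_max is an assumption for this example only.
 *
 *   StgWord atomic_max(StgVolatilePtr p, StgWord val)
 *   {
 *       StgWord old;
 *       do {
 *           old = *p;
 *           if (old >= val) { return old; }    // already large enough
 *       } while (cas(p, old, val) != old);     // retry if *p changed meanwhile
 *       return val;
 *   }
 */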
53
54 /*
55 * Atomic addition by the provided quantity
56 *
57 * atomic_inc(p, n) {
58 * return ((*p) += n);
59 * }
60 */
61 EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);
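/*
 * Editor's illustrative sketch, not part of the original header:
 * atomic_inc() returns the *new* value, so a shared counter can be
 * bumped and read in one step.  The counter name is an assumption for
 * this example only.
 *
 *   static volatile StgWord n_tasks = 0;
 *
 *   StgWord register_task(void)
 *   {
 *       return atomic_inc(&n_tasks, 1);   // value after this increment
 *   }
 */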
62
63
64 /*
65 * Atomic decrement
66 *
67 * atomic_dec(p) {
68 * return --(*p);
69 * }
70 */
71 EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);
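/*
 * Editor's illustrative sketch, not part of the original header:
 * because atomic_dec() also returns the new value, the usual
 * reference-count release pattern is a one-liner.  release_ref() and
 * free_thing() are assumed names for this example only.
 *
 *   void release_ref(StgVolatilePtr refcount, void *thing)
 *   {
 *       if (atomic_dec(refcount) == 0) {
 *           free_thing(thing);    // last reference has been dropped
 *       }
 *   }
 */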
72
73 /*
74 * Busy-wait nop: this is a hint to the CPU that we are currently in a
75 * busy-wait loop waiting for another CPU to change something. On a
76 * hyperthreaded CPU it should yield to another thread, for example.
77 */
78 EXTERN_INLINE void busy_wait_nop(void);
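/*
 * Editor's illustrative sketch, not part of the original header:
 * busy_wait_nop() belongs in the body of a spin loop, for example while
 * polling a flag that another CPU will set.  The flag name is an
 * assumption for this example only; VOLATILE_LOAD and load_load_barrier
 * are defined later in this file.
 *
 *   void wait_for_flag(StgVolatilePtr flag)
 *   {
 *       while (VOLATILE_LOAD(flag) == 0) {
 *           busy_wait_nop();      // let the core/sibling thread do other work
 *       }
 *       load_load_barrier();      // order later reads after seeing the flag
 *   }
 */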
79
80 #endif // !IN_STG_CODE
81
82 /*
83 * Various kinds of memory barrier.
84 * write_barrier: prevents future stores occurring before preceding stores.
85 * store_load_barrier: prevents future loads occurring before preceding stores.
86 * load_load_barrier: prevents future loads occurring before earlier loads.
87 *
88 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
89 * http://gee.cs.oswego.edu/dl/jmm/cookbook.html
90 *
91 * To check whether you got these right, try the test in
92 * testsuite/tests/rts/testwsdeque.c
93 * This tests the work-stealing deque implementation, which relies on
94 * properly working store_load and load_load memory barriers.
95 */
96 EXTERN_INLINE void write_barrier(void);
97 EXTERN_INLINE void store_load_barrier(void);
98 EXTERN_INLINE void load_load_barrier(void);
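/*
 * Editor's illustrative sketch, not part of the original header: the
 * message-passing pattern these barriers support.  The names "payload"
 * and "ready" are assumptions for this example only.
 *
 *   // writer (CPU A)
 *   payload = compute();
 *   write_barrier();          // make payload visible before the flag
 *   ready = 1;
 *
 *   // reader (CPU B)
 *   while (VOLATILE_LOAD(&ready) == 0) { busy_wait_nop(); }
 *   load_load_barrier();      // read the flag before reading the payload
 *   use(payload);
 *
 * store_load_barrier() is the most expensive of the three; it is needed
 * only when a store must become globally visible before a later load,
 * as in the work-stealing deque test mentioned above.
 */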
99
100 /* ----------------------------------------------------------------------------
101 Implementations
102 ------------------------------------------------------------------------- */
103
104 #if !IN_STG_CODE || IN_STGCRUN
105
106 EXTERN_INLINE StgWord
107 xchg(StgPtr p, StgWord w)
108 {
109 StgWord result;
110 #if defined(NOSMP)
111 result = *p;
112 *p = w;
113 #elif i386_HOST_ARCH || x86_64_HOST_ARCH
114 result = w;
115 __asm__ __volatile__ (
116 // NB: the xchg instruction is implicitly locked, so we do not
117 // need a lock prefix here.
118 "xchg %1,%0"
119 :"+r" (result), "+m" (*p)
120 : /* no input-only operands */
121 );
122 #elif powerpc_HOST_ARCH
123 __asm__ __volatile__ (
124 "1: lwarx %0, 0, %2\n"
125 " stwcx. %1, 0, %2\n"
126 " bne- 1b"
127 :"=&r" (result)
128 :"r" (w), "r" (p)
129 );
130 #elif powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
131 __asm__ __volatile__ (
132 "1: ldarx %0, 0, %2\n"
133 " stdcx. %1, 0, %2\n"
134 " bne- 1b"
135 :"=&r" (result)
136 :"r" (w), "r" (p)
137 );
138 #elif sparc_HOST_ARCH
139 result = w;
140 __asm__ __volatile__ (
141 "swap %1,%0"
142 : "+r" (result), "+m" (*p)
143 : /* no input-only operands */
144 );
145 #elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
146 __asm__ __volatile__ ("swp %0, %1, [%2]"
147 : "=&r" (result)
148 : "r" (w), "r" (p) : "memory");
149 #elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
150 // The swp instruction used in the pre-ARMv6 code above is deprecated
151 // in ARMv6 and later. ARM Ltd. strongly recommends using the
152 // ldrex/strex instruction pair instead; see the chapter
153 // "Synchronization and semaphores" in the ARM Architecture
154 // Reference Manual.
155 StgWord tmp;
156 __asm__ __volatile__ (
157 "1: ldrex %0, [%3]\n"
158 " strex %1, %2, [%3]\n"
159 " teq %1, #1\n"
160 " beq 1b\n"
161 #if !defined(arm_HOST_ARCH_PRE_ARMv7)
162 " dmb\n"
163 #endif
164 : "=&r" (result), "=&r" (tmp)
165 : "r" (w), "r" (p)
166 : "memory"
167 );
168 #elif aarch64_HOST_ARCH
169 StgWord tmp;
170 __asm__ __volatile__ (
171 "1: ldaxr %0, [%3]\n"
172 " stlxr %w1, %2, [%3]\n"
173 " cbnz %w1, 1b\n"
174 " dmb sy\n"
175 : "=&r" (result), "=&r" (tmp)
176 : "r" (w), "r" (p)
177 : "memory"
178 );
179 #else
180 #error xchg() unimplemented on this architecture
181 #endif
182 return result;
183 }
184
185 /*
186 * CMPXCHG - the single-word atomic compare-and-exchange instruction. Used
187 * in the STM implementation.
188 */
189 EXTERN_INLINE StgWord
190 cas(StgVolatilePtr p, StgWord o, StgWord n)
191 {
192 #if defined(NOSMP)
193 StgWord result;
194 result = *p;
195 if (result == o) {
196 *p = n;
197 }
198 return result;
199 #elif i386_HOST_ARCH || x86_64_HOST_ARCH
200 __asm__ __volatile__ (
201 "lock\ncmpxchg %3,%1"
202 :"=a"(o), "+m" (*(volatile unsigned int *)p)
203 :"0" (o), "r" (n));
204 return o;
205 #elif powerpc_HOST_ARCH
206 StgWord result;
207 __asm__ __volatile__ (
208 "1: lwarx %0, 0, %3\n"
209 " cmpw %0, %1\n"
210 " bne 2f\n"
211 " stwcx. %2, 0, %3\n"
212 " bne- 1b\n"
213 "2:"
214 :"=&r" (result)
215 :"r" (o), "r" (n), "r" (p)
216 :"cc", "memory"
217 );
218 return result;
219 #elif powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
220 StgWord result;
221 __asm__ __volatile__ (
222 "1: ldarx %0, 0, %3\n"
223 " cmpd %0, %1\n"
224 " bne 2f\n"
225 " stdcx. %2, 0, %3\n"
226 " bne- 1b\n"
227 "2:"
228 :"=&r" (result)
229 :"r" (o), "r" (n), "r" (p)
230 :"cc", "memory"
231 );
232 return result;
233 #elif sparc_HOST_ARCH
234 __asm__ __volatile__ (
235 "cas [%1], %2, %0"
236 : "+r" (n)
237 : "r" (p), "r" (o)
238 : "memory"
239 );
240 return n;
241 #elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
242 StgWord r;
243 arm_atomic_spin_lock();
244 r = *p;
245 if (r == o) { *p = n; }
246 arm_atomic_spin_unlock();
247 return r;
248 #elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
249 StgWord result,tmp;
250
251 __asm__ __volatile__(
252 "1: ldrex %1, [%2]\n"
253 " mov %0, #0\n"
254 " teq %1, %3\n"
255 " it eq\n"
256 " strexeq %0, %4, [%2]\n"
257 " teq %0, #1\n"
258 " it eq\n"
259 " beq 1b\n"
260 #if !defined(arm_HOST_ARCH_PRE_ARMv7)
261 " dmb\n"
262 #endif
263 : "=&r"(tmp), "=&r"(result)
264 : "r"(p), "r"(o), "r"(n)
265 : "cc","memory");
266
267 return result;
268 #elif aarch64_HOST_ARCH
269 // tmp only receives the store-exclusive status flag here; it is
270 // kept for operand numbering consistent with the ARM case above.
271 StgWord result,tmp;
272
273 __asm__ __volatile__(
274 "1: ldxr %1, [%2]\n"
275 " mov %w0, #0\n"
276 " cmp %1, %3\n"
277 " b.ne 2f\n"
278 " stxr %w0, %4, [%2]\n"
279 " cbnz %w0, 1b\n"
280 "2: dmb sy\n"
281 : "=&r"(tmp), "=&r"(result)
282 : "r"(p), "r"(o), "r"(n)
283 : "cc","memory");
284
285 return result;
286 #else
287 #error cas() unimplemented on this architecture
288 #endif
289 }
290
291 // RRN: Generalized to arbitrary increments to enable fetch-and-add in
292 // Haskell code (fetchAddIntArray#).
293 EXTERN_INLINE StgWord
294 atomic_inc(StgVolatilePtr p, StgWord incr)
295 {
296 #if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
297 StgWord r;
298 r = incr;
299 __asm__ __volatile__ (
300 "lock\nxadd %0,%1":
301 "+r" (r), "+m" (*p):
302 );
303 return r + incr;
304 #else
305 StgWord old, new_;
306 do {
307 old = *p;
308 new_ = old + incr;
309 } while (cas(p, old, new_) != old);
310 return new_;
311 #endif
312 }
313
314 EXTERN_INLINE StgWord
315 atomic_dec(StgVolatilePtr p)
316 {
317 #if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
318 StgWord r;
319 r = (StgWord)-1;
320 __asm__ __volatile__ (
321 "lock\nxadd %0,%1":
322 "+r" (r), "+m" (*p):
323 );
324 return r-1;
325 #else
326 StgWord old, new_;
327 do {
328 old = *p;
329 new_ = old - 1;
330 } while (cas(p, old, new_) != old);
331 return new_;
332 #endif
333 }
334
335 /*
336 * Some architectures have a way to tell the CPU that we're in a
337 * busy-wait loop, and the processor should look for something else to
338 * do (such as run another hardware thread).
339 */
340 EXTERN_INLINE void
341 busy_wait_nop(void)
342 {
343 #if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
344 // On Intel, the busy-wait-nop instruction is called "pause",
345 // which is actually represented as a nop with the rep prefix.
346 // On processors before the P4 this behaves as a nop; on P4 and
347 // later it might do something clever like yield to another
348 // hyperthread. In any case, Intel recommends putting one
349 // of these in a spin lock loop.
350 __asm__ __volatile__ ("rep; nop");
351 #else
352 // nothing
353 #endif
354 }
355
356 #endif // !IN_STG_CODE
357
358 /*
359 * We need to tell both the compiler AND the CPU about the barriers.
360 * It's no good preventing the CPU from reordering the operations if
361 * the compiler has already done so - hence the "memory" restriction
362 * on each of the barriers below.
363 */
364 EXTERN_INLINE void
365 write_barrier(void) {
366 #if defined(NOSMP)
367 return;
368 #elif i386_HOST_ARCH || x86_64_HOST_ARCH
369 __asm__ __volatile__ ("" : : : "memory");
370 #elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
371 __asm__ __volatile__ ("lwsync" : : : "memory");
372 #elif sparc_HOST_ARCH
373 /* Sparc in TSO mode does not require store/store barriers. */
374 __asm__ __volatile__ ("" : : : "memory");
375 #elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv7)
376 __asm__ __volatile__ ("" : : : "memory");
377 #elif (arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)) || aarch64_HOST_ARCH
378 __asm__ __volatile__ ("dmb st" : : : "memory");
379 #else
380 #error memory barriers unimplemented on this architecture
381 #endif
382 }
383
384 EXTERN_INLINE void
385 store_load_barrier(void) {
386 #if defined(NOSMP)
387 return;
388 #elif i386_HOST_ARCH
389 __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
390 #elif x86_64_HOST_ARCH
391 __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
392 #elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
393 __asm__ __volatile__ ("sync" : : : "memory");
394 #elif sparc_HOST_ARCH
395 __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
396 #elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv7)
397 // TODO FIXME: This case probably isn't totally correct. Just because we
398 // use a pre-ARMv7 toolchain (e.g. to target an old Android device) doesn't
399 // mean the binary won't run on a newer ARMv7 system, in which case it
400 // needs a proper barrier. So we should rethink this.
401 // - Reid
402 __asm__ __volatile__ ("" : : : "memory");
403 #elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
404 __asm__ __volatile__ ("dmb" : : : "memory");
405 #elif aarch64_HOST_ARCH
406 __asm__ __volatile__ ("dmb sy" : : : "memory");
407 #else
408 #error memory barriers unimplemented on this architecture
409 #endif
410 }
411
412 EXTERN_INLINE void
413 load_load_barrier(void) {
414 #if defined(NOSMP)
415 return;
416 #elif i386_HOST_ARCH
417 __asm__ __volatile__ ("" : : : "memory");
418 #elif x86_64_HOST_ARCH
419 __asm__ __volatile__ ("" : : : "memory");
420 #elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
421 __asm__ __volatile__ ("lwsync" : : : "memory");
422 #elif sparc_HOST_ARCH
423 /* Sparc in TSO mode does not require load/load barriers. */
424 __asm__ __volatile__ ("" : : : "memory");
425 #elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv7)
426 __asm__ __volatile__ ("" : : : "memory");
427 #elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
428 __asm__ __volatile__ ("dmb" : : : "memory");
429 #elif aarch64_HOST_ARCH
430 __asm__ __volatile__ ("dmb sy" : : : "memory");
431 #else
432 #error memory barriers unimplemented on this architecture
433 #endif
434 }
435
436 // Load a pointer from a memory location that might be being modified
437 // concurrently. This prevents the compiler from optimising away
438 // multiple loads of the memory location, as it might otherwise do in
439 // a busy wait loop for example.
440 #define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
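/*
 * Editor's illustrative sketch, not part of the original header:
 * without VOLATILE_LOAD the compiler may hoist the load out of the loop
 * and never observe an update made by another CPU.  "slot" is an
 * assumed variable name for this example only.
 *
 *   StgClosure *c;
 *   while ((c = (StgClosure *)VOLATILE_LOAD(&slot)) == NULL) {
 *       busy_wait_nop();    // forces a fresh read of "slot" each iteration
 *   }
 */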
441
442 /* ---------------------------------------------------------------------- */
443 #else /* !THREADED_RTS */
444
445 EXTERN_INLINE void write_barrier(void);
446 EXTERN_INLINE void store_load_barrier(void);
447 EXTERN_INLINE void load_load_barrier(void);
448 EXTERN_INLINE void write_barrier () {} /* nothing */
449 EXTERN_INLINE void store_load_barrier() {} /* nothing */
450 EXTERN_INLINE void load_load_barrier () {} /* nothing */
451
452 #if !IN_STG_CODE || IN_STGCRUN
453 INLINE_HEADER StgWord
454 xchg(StgPtr p, StgWord w)
455 {
456 StgWord old = *p;
457 *p = w;
458 return old;
459 }
460
461 EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
462 EXTERN_INLINE StgWord
463 cas(StgVolatilePtr p, StgWord o, StgWord n)
464 {
465 StgWord result;
466 result = *p;
467 if (result == o) {
468 *p = n;
469 }
470 return result;
471 }
472
473 EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
474 EXTERN_INLINE StgWord
475 atomic_inc(StgVolatilePtr p, StgWord incr)
476 {
477 return ((*p) += incr);
478 }
479
480
481 INLINE_HEADER StgWord
482 atomic_dec(StgVolatilePtr p)
483 {
484 return --(*p);
485 }
486 #endif
487
488 #define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))
489
490 #endif /* !THREADED_RTS */
491
492 #endif /* SMP_H */