/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure())
 * and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);

/*
 * Compare-and-swap.  Atomically does this:
 *
 * cas(p,o,n) {
 *     r = *p;
 *     if (r == o) { *p = n; }
 *     return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
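
/*
 * A minimal usage sketch (illustrative only, not part of this header's
 * API; the function and variable names are hypothetical): the usual
 * retry loop for a lock-free update built on cas().
 *
 *   static void example_add_one(StgVolatilePtr counter)
 *   {
 *       StgWord old, new_;
 *       do {
 *           old  = *counter;      // read the current value
 *           new_ = old + 1;       // compute the desired value
 *       } while (cas(counter, old, new_) != old);   // retry if another CPU raced us
 *   }
 */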

/*
 * Atomic addition by the provided quantity
 *
 * atomic_inc(p, n) {
 *     return ((*p) += n);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);


/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *     return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);
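
/*
 * Illustrative sketch (hypothetical names and lock-word convention, not
 * RTS code): xchg() and busy_wait_nop() are the building blocks of a
 * simple test-and-set spin lock.  Here 0 means unlocked, 1 means locked.
 *
 *   static void example_spin_lock(StgVolatilePtr lock)
 *   {
 *       while (xchg((StgPtr)lock, 1) != 0) {
 *           busy_wait_nop();     // spin politely until the owner releases
 *       }
 *   }
 *
 *   static void example_spin_unlock(StgVolatilePtr lock)
 *   {
 *       write_barrier();         // make the critical section's stores visible first
 *       *lock = 0;
 *   }
 */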

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
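
/*
 * Illustrative pairing (hypothetical variables "data" and "flag"; a
 * sketch, not RTS code): a producer publishes a payload and then a
 * flag, and a consumer reads them back in the opposite order.
 *
 *   // producer (thread A)
 *   data = 42;
 *   write_barrier();        // store/store: data must be visible before flag
 *   flag = 1;
 *
 *   // consumer (thread B)
 *   while (VOLATILE_LOAD(&flag) == 0) { busy_wait_nop(); }
 *   load_load_barrier();    // load/load: do not read data before flag
 *   r = data;               // now guaranteed to observe 42
 *
 * store_load_barrier() is the most expensive of the three; it is needed
 * when a store must become globally visible before a subsequent load,
 * as in the work-stealing deque test referenced above.
 */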

/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord result;
#if defined(NOSMP)
    result = *p;
    *p = w;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        // NB: the xchg instruction is implicitly locked, so we do not
        // need a lock prefix here.
        "xchg %1,%0"
        :"+r" (result), "+m" (*p)
        : /* no input-only operands */
      );
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    result = __sync_lock_test_and_set(p, w);
#elif sparc_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        "swap %1,%0"
        : "+r" (result), "+m" (*p)
        : /* no input-only operands */
      );
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    __asm__ __volatile__ ("swp %0, %1, [%2]"
                         : "=&r" (result)
                         : "r" (w), "r" (p) : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    // The swp instruction used in the pre-ARMv6 code above is
    // deprecated in ARMv6 and later.  ARM Ltd. strongly recommends
    // using the ldrex/strex instruction pair instead; see the chapter
    // "Synchronization and semaphores" in the ARM Architecture
    // Reference Manual.
    StgWord tmp;
    __asm__ __volatile__ (
        "1:    ldrex  %0, [%3]\n"
        "      strex  %1, %2, [%3]\n"
        "      teq    %1, #1\n"
        "      beq    1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "      dmb\n"
#endif
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#elif aarch64_HOST_ARCH
    StgWord tmp;
    __asm__ __volatile__ (
        "1:    ldaxr  %0, [%3]\n"
        "      stlxr  %w1, %2, [%3]\n"
        "      cbnz   %w1, 1b\n"
        "      dmb sy\n"
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#else
#error xchg() unimplemented on this architecture
#endif
    return result;
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
#if defined(NOSMP)
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ (
        "lock\ncmpxchg %3,%1"
        :"=a"(o), "+m" (*(volatile unsigned int *)p)
        :"0" (o), "r" (n));
    return o;
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    return __sync_val_compare_and_swap(p, o, n);
#elif sparc_HOST_ARCH
    __asm__ __volatile__ (
        "cas [%1], %2, %0"
        : "+r" (n)
        : "r" (p), "r" (o)
        : "memory"
    );
    return n;
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord r;
    arm_atomic_spin_lock();
    r = *p;
    if (r == o) { *p = n; }
    arm_atomic_spin_unlock();
    return r;
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord result,tmp;

    __asm__ __volatile__(
        "1:    ldrex    %1, [%2]\n"
        "      mov      %0, #0\n"
        "      teq      %1, %3\n"
        "      it       eq\n"
        "      strexeq  %0, %4, [%2]\n"
        "      teq      %0, #1\n"
        "      it       eq\n"
        "      beq      1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "      dmb\n"
#endif
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc","memory");

    return result;
#elif aarch64_HOST_ARCH
    // tmp is used as the status register for the store-exclusive;
    // result holds the value loaded from p.
    StgWord result,tmp;

    __asm__ __volatile__(
        "1:    ldxr   %1, [%2]\n"
        "      mov    %w0, #0\n"
        "      cmp    %1, %3\n"
        "      b.ne   2f\n"
        "      stxr   %w0, %4, [%2]\n"
        "      cbnz   %w0, 1b\n"
        "2:    dmb sy\n"
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc","memory");

    return result;
#else
#error cas() unimplemented on this architecture
#endif
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
// PT: add-and-fetch, returns new value
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = incr;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
            "+r" (r), "+m" (*p):
    );
    return r + incr;
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    return __sync_add_and_fetch(p, incr);
#else
    StgWord old, new_;
    do {
        old = *p;
        new_ = old + incr;
    } while (cas(p, old, new_) != old);
    return new_;
#endif
}
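
/*
 * Note that atomic_inc() returns the *new* value (add-and-fetch).  A
 * caller that needs fetch-and-add semantics can recover the old value
 * by subtracting the increment again, e.g. (hypothetical names):
 *
 *   StgWord old = atomic_inc(p, n) - n;   // value of *p before the addition
 */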

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = (StgWord)-1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
            "+r" (r), "+m" (*p):
    );
    return r-1;
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    return __sync_sub_and_fetch(p, (StgWord) 1);
#else
    StgWord old, new_;
    do {
        old = *p;
        new_ = old - 1;
    } while (cas(p, old, new_) != old);
    return new_;
#endif
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread.  In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif (arm_HOST_ARCH) || aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
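
/*
 * Illustrative sketch (hypothetical flag variable): polling a word that
 * another CPU will eventually change.  Without VOLATILE_LOAD the
 * compiler could hoist the load out of the loop and spin forever.
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {
 *       busy_wait_nop();
 *   }
 *   load_load_barrier();   // order later loads after the flag was seen
 */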

/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */