includes/stg/SMP.h
/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure() below)
 * and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
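
/*
 * Illustrative sketch (not part of the RTS API; `lock` is a hypothetical
 * word-sized cell, 0 = free, 1 = held): a minimal test-and-set spin lock
 * built from xchg() and busy_wait_nop(), in the spirit of the closure
 * locking mentioned above.
 *
 *   static void spin_lock(StgWord *lock)
 *   {
 *       while (xchg(lock, 1) != 0) {   // atomically store 1, get old value
 *           busy_wait_nop();           // spin politely until it was free
 *       }
 *   }
 *
 *   static void spin_unlock(StgWord *lock)
 *   {
 *       write_barrier();               // critical-section stores first...
 *       *(StgVolatilePtr)lock = 0;     // ...then publish the free state
 *   }
 */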

/*
 * Compare-and-swap.  Atomically does this:
 *
 * cas(p,o,n) {
 *     r = *p;
 *     if (r == o) { *p = n };
 *     return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
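
/*
 * Illustrative sketch (hypothetical helper, not part of this header): the
 * standard retry loop for building an arbitrary atomic read-modify-write
 * on top of cas().  cas() returns the value it observed, so the update
 * took effect exactly when that observation equals the expected old value.
 * The generic atomic_inc() below uses the same shape.
 *
 *   static StgWord atomic_or(StgVolatilePtr p, StgWord mask)
 *   {
 *       StgWord old, new;
 *       do {
 *           old = *p;                        // snapshot current value
 *           new = old | mask;                // compute desired value
 *       } while (cas(p, old, new) != old);   // retry if another CPU raced us
 *       return new;
 *   }
 */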

/*
 * Atomic addition by the provided quantity
 *
 * atomic_inc(p, n) {
 *     return ((*p) += n);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);


/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *     return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);
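
/*
 * Illustrative sketch (hypothetical reference count, not part of the RTS):
 * both atomic_inc() and atomic_dec() return the *new* value, so the thread
 * that gets 0 back from atomic_dec() knows it released the last reference.
 *
 *   atomic_inc(&refcount, 1);            // take a reference
 *   ...
 *   if (atomic_dec(&refcount) == 0) {
 *       free_the_object();               // we were the last owner
 *   }
 */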

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
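
/*
 * Illustrative sketch (hypothetical shared words `payload` and `flag`):
 * the message-passing pattern these barriers exist for.  The producer's
 * write_barrier() keeps the payload store ahead of the flag store, and the
 * consumer's load_load_barrier() keeps the flag load ahead of the payload
 * load, so a consumer that sees flag == 1 also sees payload == 42.
 *
 *   // producer
 *   payload = 42;
 *   write_barrier();                 // payload store before flag store
 *   flag = 1;
 *
 *   // consumer
 *   while (VOLATILE_LOAD(&flag) == 0) { busy_wait_nop(); }
 *   load_load_barrier();             // flag load before payload load
 *   result = payload;                // observes 42
 */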

/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord result;
#if defined(NOSMP)
    result = *p;
    *p = w;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        // NB: the xchg instruction is implicitly locked, so we do not
        // need a lock prefix here.
        "xchg %1,%0"
        :"+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ (
        "1: lwarx %0, 0, %2\n"
        "   stwcx. %1, 0, %2\n"
        "   bne- 1b"
        :"=&r" (result)
        :"r" (w), "r" (p)
    );
#elif sparc_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        "swap %1,%0"
        : "+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    __asm__ __volatile__ ("swp %0, %1, [%2]"
                          : "=&r" (result)
                          : "r" (w), "r" (p) : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    // The swp instruction used in the pre-ARMv6 code above is deprecated
    // in ARMv6 and later.  ARM Ltd. strongly recommends using the
    // ldrex/strex instruction pair instead; see the "Synchronization and
    // semaphores" chapter of the ARM Architecture Reference Manual.
    StgWord tmp;
    __asm__ __volatile__ (
        "1: ldrex %0, [%3]\n"
        "   strex %1, %2, [%3]\n"
        "   teq %1, #1\n"
        "   beq 1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "   dmb\n"
#endif
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#elif aarch64_HOST_ARCH
    // tmp receives the store-exclusive status; the loop retries until the
    // exclusive store succeeds, leaving the old value in result.
    StgWord tmp;
    __asm__ __volatile__ (
        "1: ldaxr %0, [%3]\n"
        "   stlxr %w1, %2, [%3]\n"
        "   cbnz %w1, 1b\n"
        "   dmb sy\n"
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#else
#error xchg() unimplemented on this architecture
#endif
    return result;
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
#if defined(NOSMP)
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ (
        "lock\ncmpxchg %3,%1"
        :"=a"(o), "+m" (*(volatile unsigned int *)p)
        :"0" (o), "r" (n));
    return o;
#elif powerpc_HOST_ARCH
    StgWord result;
    __asm__ __volatile__ (
        "1: lwarx %0, 0, %3\n"
        "   cmpw %0, %1\n"
        "   bne 2f\n"
        "   stwcx. %2, 0, %3\n"
        "   bne- 1b\n"
        "2:"
        :"=&r" (result)
        :"r" (o), "r" (n), "r" (p)
        :"cc", "memory"
    );
    return result;
#elif sparc_HOST_ARCH
    __asm__ __volatile__ (
        "cas [%1], %2, %0"
        : "+r" (n)
        : "r" (p), "r" (o)
        : "memory"
    );
    return n;
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord r;
    arm_atomic_spin_lock();
    r = *p;
    if (r == o) { *p = n; }
    arm_atomic_spin_unlock();
    return r;
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord result, tmp;

    __asm__ __volatile__(
        "1: ldrex %1, [%2]\n"
        "   mov %0, #0\n"
        "   teq %1, %3\n"
        "   it eq\n"
        "   strexeq %0, %4, [%2]\n"
        "   teq %0, #1\n"
        "   it eq\n"
        "   beq 1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "   dmb\n"
#endif
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc", "memory");

    return result;
#elif aarch64_HOST_ARCH
    // tmp holds the store-exclusive status, mirroring the ARM
    // implementation above.
    StgWord result, tmp;

    __asm__ __volatile__(
        "1: ldxr %1, [%2]\n"
        "   mov %w0, #0\n"
        "   cmp %1, %3\n"
        "   b.ne 2f\n"
        "   stxr %w0, %4, [%2]\n"
        "   cbnz %w0, 1b\n"
        "2: dmb sy\n"
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc", "memory");

    return result;
#else
#error cas() unimplemented on this architecture
#endif
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = incr;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
        "+r" (r), "+m" (*p):
    );
    return r + incr;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old + incr;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = (StgWord)-1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
        "+r" (r), "+m" (*p):
    );
    return r-1;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old - 1;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread.  In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("" : : : "memory");
#elif (arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)) || aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
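
// Illustrative sketch (hypothetical flag set by another thread): without
// VOLATILE_LOAD the compiler may read `flag` once, hoist the load out of
// the loop, and spin forever on the stale copy.
//
//     while (VOLATILE_LOAD(&flag) == 0) {
//         busy_wait_nop();
//     }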

/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */