/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if defined(THREADED_RTS)

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure() below)
 * and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);

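/*
 * Illustrative sketch only (spin_lock/spin_unlock are hypothetical names,
 * not part of this header): the typical use of xchg() is a test-and-set
 * spin lock, spinning with busy_wait_nop() (declared below).  A real lock
 * would also need the appropriate acquire/release ordering; this only
 * shows the shape of the xchg pattern.
 *
 *   static void spin_lock(StgWord *lock)      // 0 = unlocked, 1 = locked
 *   {
 *       while (xchg(lock, 1) != 0) {          // xchg returns the old value
 *           busy_wait_nop();                  // hint to the CPU while spinning
 *       }
 *   }
 *
 *   static void spin_unlock(StgWord *lock)
 *   {
 *       write_barrier();                      // earlier stores become visible first
 *       *lock = 0;
 *   }
 */
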
/*
 * Compare-and-swap.  Atomically does this:
 *
 * cas(p,o,n) {
 *    r = *p;
 *    if (r == o) { *p = n };
 *    return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);

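/*
 * Illustrative sketch only (atomic_max is a hypothetical helper, not part
 * of the RTS): cas() is normally wrapped in a retry loop that re-reads the
 * word and retries if another CPU changed it in the meantime.  Here the
 * loop atomically replaces *p with the maximum of *p and w.
 *
 *   static StgWord atomic_max(StgVolatilePtr p, StgWord w)
 *   {
 *       StgWord old;
 *       do {
 *           old = *p;
 *           if (old >= w) return old;         // nothing to do
 *       } while (cas(p, old, w) != old);      // lost a race: retry
 *       return w;
 *   }
 */
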
/*
 * Atomic increment
 *
 * atomic_inc(p) {
 *    return ++(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p);

/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *    return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

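/*
 * Illustrative sketch only (release_ref and free_object are hypothetical
 * names): because atomic_inc()/atomic_dec() return the *new* value, they
 * map directly onto a shared reference count.
 *
 *   static void release_ref(StgVolatilePtr refcount)
 *   {
 *       if (atomic_dec(refcount) == 0) {
 *           free_object(refcount);    // we dropped the last reference
 *       }
 *   }
 */
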
/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before preceding loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);

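/*
 * Illustrative sketch only of the publish/consume pattern these barriers
 * exist for.  'data' and 'ready' are hypothetical shared StgWords, both
 * initially 0; VOLATILE_LOAD is defined at the end of this file.
 *
 *   // producer
 *   data = 42;
 *   write_barrier();          // store to data becomes visible before ready
 *   ready = 1;
 *
 *   // consumer
 *   while (VOLATILE_LOAD(&ready) == 0) { busy_wait_nop(); }
 *   load_load_barrier();      // read ready before reading data
 *   r = data;                 // guaranteed to observe 42
 */
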
/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord result;
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        // NB: the xchg instruction is implicitly locked, so we do not
        // need a lock prefix here.
          "xchg %1,%0"
          :"+r" (result), "+m" (*p)
          : /* no input-only operands */
        );
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ (
        "1:     lwarx     %0, 0, %2\n"
        "       stwcx.    %1, 0, %2\n"
        "       bne-      1b"
        :"=&r" (result)
        :"r" (w), "r" (p)
    );
#elif sparc_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        "swap %1,%0"
        : "+r" (result), "+m" (*p)
        : /* no input-only operands */
      );
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    __asm__ __volatile__ ("swp %0, %1, [%2]"
                         : "=&r" (result)
                         : "r" (w), "r" (p) : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    // The swp instruction used in the pre-ARMv6 code above is deprecated
    // in ARMv6 and later; ARM, Ltd. *highly* recommends using the
    // ldrex/strex instruction pair for the same purpose.  See the chapter
    // "Synchronization and semaphores" in the ARM Architecture Reference
    // Manual.
    StgWord tmp;
    __asm__ __volatile__ (
        "1:    ldrex  %0, [%3]\n"
        "      strex  %1, %2, [%3]\n"
        "      teq    %1, #1\n"
        "      beq    1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "      dmb\n"
#endif
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#elif !defined(WITHSMP)
    result = *p;
    *p = w;
#else
#error xchg() unimplemented on this architecture
#endif
    return result;
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ (
          "lock\ncmpxchg %3,%1"
          :"=a"(o), "=m" (*(volatile unsigned int *)p)
          :"0" (o), "r" (n));
    return o;
#elif powerpc_HOST_ARCH
    StgWord result;
    __asm__ __volatile__ (
        "1:     lwarx     %0, 0, %3\n"
        "       cmpw      %0, %1\n"
        "       bne       2f\n"
        "       stwcx.    %2, 0, %3\n"
        "       bne-      1b\n"
        "2:"
        :"=&r" (result)
        :"r" (o), "r" (n), "r" (p)
        :"cc", "memory"
    );
    return result;
#elif sparc_HOST_ARCH
    __asm__ __volatile__ (
        "cas [%1], %2, %0"
        : "+r" (n)
        : "r" (p), "r" (o)
        : "memory"
    );
    return n;
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord r;
    arm_atomic_spin_lock();
    r = *p;
    if (r == o) { *p = n; }
    arm_atomic_spin_unlock();
    return r;
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord result, tmp;

    __asm__ __volatile__(
        "1:     ldrex    %1, [%2]\n"
        "       mov      %0, #0\n"
        "       teq      %1, %3\n"
        "       it       eq\n"
        "       strexeq  %0, %4, [%2]\n"
        "       teq      %0, #1\n"
        "       it       eq\n"
        "       beq      1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "       dmb\n"
#endif
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc", "memory");

    return result;
#elif !defined(WITHSMP)
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
#else
#error cas() unimplemented on this architecture
#endif
}

EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = 1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
            "+r" (r), "+m" (*p):
    );
    return r+1;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old + 1;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = (StgWord)-1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
            "+r" (r), "+m" (*p):
    );
    return r-1;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old - 1;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

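/*
 * Illustrative sketch only: the non-x86 branches above show the general
 * recipe, namely that any single-word atomic read-modify-write can be
 * built from cas() with a retry loop.  A hypothetical atomic add (not
 * part of this header) would have the same shape:
 *
 *   static StgWord atomic_add(StgVolatilePtr p, StgWord incr)
 *   {
 *       StgWord old, new;
 *       do {
 *           old = *p;
 *           new = old + incr;
 *       } while (cas(p, old, new) != old);    // retry if *p changed under us
 *       return new;
 *   }
 */
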
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb  st" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy-wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))

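/*
 * Illustrative sketch only ('flag' is a hypothetical shared location set
 * to non-zero by another CPU): without the volatile cast the compiler
 * could hoist the load out of the loop and spin forever on a stale value.
 *
 *   while (VOLATILE_LOAD(flag) == 0) {
 *       busy_wait_nop();
 *   }
 *   load_load_barrier();   // order later reads after seeing the flag
 */
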
/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

#define write_barrier()      /* nothing */
#define store_load_barrier() /* nothing */
#define load_load_barrier()  /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

INLINE_HEADER StgWord
atomic_inc(StgVolatilePtr p)
{
    return ++(*p);
}

INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */