implement ARMv6/7 specific xchg function
ghc.git: includes/stg/SMP.h
/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) || \
    defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || defined(__ARM_ARCH_5__) || \
    defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
#define PRE_ARMv6
#endif

#if defined(THREADED_RTS)

#if arm_HOST_ARCH && defined(PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure() below)
 * and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);

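/*
 * Illustrative sketch (not part of this header): closure locking in the
 * style of lockClosure() can be built on xchg() by repeatedly swapping a
 * sentinel info pointer into the closure header until the value read back
 * is not the sentinel, roughly:
 *
 *   do {
 *       info = xchg((StgPtr)&c->header.info, (StgWord)&stg_WHITEHOLE_info);
 *   } while (info == (StgWord)&stg_WHITEHOLE_info);
 *
 * Here c is a StgClosure*; the real definition (with spin counts and
 * thread yielding) lives in the RTS closure-locking headers, not here.
 */
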
/*
 * Compare-and-swap. Atomically does this:
 *
 * cas(p,o,n) {
 *    r = *p;
 *    if (r == o) { *p = n };
 *    return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);

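/*
 * Illustrative sketch (not part of this header): the usual way to use
 * cas() is a read-modify-write retry loop, the same pattern the generic
 * atomic_inc()/atomic_dec() fallbacks below use:
 *
 *   do {
 *       old = *p;
 *       new = f(old);          // f() is a placeholder for the update
 *   } while (cas(p, old, new) != old);
 */
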
/*
 * Atomic increment
 *
 * atomic_inc(p) {
 *    return ++(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p);

/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *    return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);

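/*
 * Illustrative sketch (not part of this header): following the JSR-133
 * cookbook referenced above, a writer that publishes an object issues a
 * store/store barrier between initialising the object and making it
 * reachable, and a reader can pair that with a load/load barrier:
 *
 *   obj->field = x;            // writer initialises the object
 *   write_barrier();
 *   shared = obj;              // ... then publishes it
 *
 *   q = shared;                // reader picks up the pointer
 *   load_load_barrier();
 *   use(q->field);             // ... then reads the fields
 *
 * "obj", "shared", "q" and use() are placeholders, not RTS names.
 */
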
/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord result;
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        // NB: the xchg instruction is implicitly locked, so we do not
        // need a lock prefix here.
        "xchg %1,%0"
        :"+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ (
        "1:     lwarx     %0, 0, %2\n"
        "       stwcx.    %1, 0, %2\n"
        "       bne-      1b"
        :"=&r" (result)
        :"r" (w), "r" (p)
    );
#elif sparc_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        "swap %1,%0"
        : "+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif arm_HOST_ARCH && defined(PRE_ARMv6)
    __asm__ __volatile__ ("swp %0, %1, [%2]"
        : "=&r" (result)
        : "r" (w), "r" (p) : "memory");
#elif arm_HOST_ARCH && !defined(PRE_ARMv6)
    // The swp instruction used in the PRE_ARMv6 code above is deprecated
    // in ARMv6 and later; ARM Ltd. strongly recommends using the
    // ldrex/strex instruction pair instead.  See the "Synchronization and
    // semaphores" chapter of the ARM Architecture Reference Manual.
    StgWord tmp;
    __asm__ __volatile__ (
        "1:     ldrex     %0, [%3]\n"
        "       strex     %1, %2, [%3]\n"
        "       teq       %1, #1\n"
        "       beq       1b\n"
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#elif !defined(WITHSMP)
    // Non-SMP fallback: no atomicity required.
    result = *p;
    *p = w;
#else
#error xchg() unimplemented on this architecture
#endif
    return result;
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ (
        "lock\ncmpxchg %3,%1"
        :"=a"(o), "=m" (*(volatile unsigned int *)p)
        :"0" (o), "r" (n));
    return o;
#elif powerpc_HOST_ARCH
    StgWord result;
    __asm__ __volatile__ (
        "1:     lwarx     %0, 0, %3\n"
        "       cmpw      %0, %1\n"
        "       bne       2f\n"
        "       stwcx.    %2, 0, %3\n"
        "       bne-      1b\n"
        "2:"
        :"=&r" (result)
        :"r" (o), "r" (n), "r" (p)
        :"cc", "memory"
    );
    return result;
#elif sparc_HOST_ARCH
    __asm__ __volatile__ (
        "cas [%1], %2, %0"
        : "+r" (n)
        : "r" (p), "r" (o)
        : "memory"
    );
    return n;
#elif arm_HOST_ARCH
#if defined(PRE_ARMv6)
    StgWord r;
    arm_atomic_spin_lock();
    r = *p;
    if (r == o) { *p = n; }
    arm_atomic_spin_unlock();
    return r;
#else
    StgWord result,tmp;

    __asm__ __volatile__(
        "1:     ldrex     %1, [%2]\n"
        "       mov       %0, #0\n"
        "       teq       %1, %3\n"
        "       strexeq   %0, %4, [%2]\n"
        "       teq       %0, #1\n"
        "       beq       1b\n"
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc","memory");

    return result;
#endif
#elif !defined(WITHSMP)
    // Non-SMP fallback: no atomicity required.
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
#else
#error cas() unimplemented on this architecture
#endif
}

EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = 1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
        "+r" (r), "+m" (*p):
    );
    return r+1;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old + 1;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = (StgWord)-1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
        "+r" (r), "+m" (*p):
    );
    return r-1;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old - 1;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // "rep; nop" encodes the x86 PAUSE hint for spin-wait loops.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))

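/*
 * Illustrative sketch (not part of this header): VOLATILE_LOAD() is what
 * keeps a busy-wait loop honest, and busy_wait_nop() is the hint to issue
 * inside it, e.g.
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {
 *       busy_wait_nop();
 *   }
 *
 * where "flag" is a placeholder for whatever word another CPU will update.
 */
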
/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

#define write_barrier()      /* nothing */
#define store_load_barrier() /* nothing */
#define load_load_barrier()  /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

INLINE_HEADER StgWord
atomic_inc(StgVolatilePtr p)
{
    return ++(*p);
}

INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */