/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if defined(THREADED_RTS)

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure() in
 * SMPClosureOps.h) and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
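
/*
 * Illustrative sketch (not part of the RTS API): xchg() together with
 * busy_wait_nop() is enough to build a minimal test-and-set spinlock.
 * The names spin_lock/spin_unlock below are hypothetical.
 *
 *   static void spin_lock(StgPtr l)
 *   {
 *       while (xchg(l, 1) != 0) {   // old value non-zero: lock already held
 *           busy_wait_nop();
 *       }
 *   }
 *
 *   static void spin_unlock(StgPtr l)
 *   {
 *       write_barrier();            // order critical-section stores first
 *       *((StgVolatilePtr)l) = 0;
 *   }
 */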

/*
 * Compare-and-swap.  Atomically does this:
 *
 * cas(p,o,n) {
 *     r = *p;
 *     if (r == o) { *p = n };
 *     return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
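
/*
 * Illustrative sketch (not part of the RTS API): the usual retry loop
 * built on cas(), here atomically raising *p to at least w.  The helper
 * name atomic_max is hypothetical.
 *
 *   static StgWord atomic_max(StgVolatilePtr p, StgWord w)
 *   {
 *       StgWord old;
 *       do {
 *           old = *p;
 *           if (old >= w) return old;    // nothing to do
 *       } while (cas(p, old, w) != old); // retry if another CPU raced us
 *       return w;
 *   }
 */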

/*
 * Atomic addition by the provided quantity
 *
 * atomic_inc(p, n) {
 *     return ((*p) += n);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);


/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *     return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);
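
/*
 * Illustrative sketch (not part of the RTS API): atomic_inc()/atomic_dec()
 * return the new value, which is exactly what a simple reference count
 * needs.  The helpers ref_acquire/ref_release and free_thing below are
 * hypothetical.
 *
 *   static void ref_acquire(StgVolatilePtr refcount)
 *   {
 *       atomic_inc(refcount, 1);
 *   }
 *
 *   static void ref_release(StgVolatilePtr refcount, void (*free_thing)(void))
 *   {
 *       if (atomic_dec(refcount) == 0) {
 *           free_thing();             // last reference is gone
 *       }
 *   }
 */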

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 * http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
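
/*
 * Illustrative sketch (not part of the RTS API): the classic publication
 * pattern these barriers support.  A producer initialises an object and
 * then publishes a pointer to it; a consumer reads the pointer and only
 * then the fields.  The names msg, msg_box, Msg and use() below are
 * hypothetical.
 *
 *   // producer
 *   msg->payload = 42;          // fill in the object ...
 *   write_barrier();            // ... before making it reachable
 *   msg_box = (StgWord)msg;     // publish
 *
 *   // consumer
 *   Msg *m = (Msg *)VOLATILE_LOAD(&msg_box);
 *   if (m != NULL) {
 *       load_load_barrier();    // read the pointer before its payload
 *       use(m->payload);
 *   }
 */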

/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord result;
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        // NB: the xchg instruction is implicitly locked, so we do not
        // need a lock prefix here.
        "xchg %1,%0"
        :"+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ (
        "1:     lwarx     %0, 0, %2\n"
        "       stwcx.    %1, 0, %2\n"
        "       bne-      1b"
        :"=&r" (result)
        :"r" (w), "r" (p)
    );
#elif sparc_HOST_ARCH
    result = w;
    __asm__ __volatile__ (
        "swap %1,%0"
        : "+r" (result), "+m" (*p)
        : /* no input-only operands */
    );
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    __asm__ __volatile__ ("swp %0, %1, [%2]"
                          : "=&r" (result)
                          : "r" (w), "r" (p) : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    // The swp instruction used in the pre-ARMv6 code above is deprecated
    // in ARMv6 and later.  ARM Ltd. strongly recommends using the
    // ldrex/strex instruction pair instead; see the chapter
    // "Synchronization and semaphores" in the ARM Architecture Reference
    // Manual.
    StgWord tmp;
    __asm__ __volatile__ (
        "1:     ldrex  %0, [%3]\n"
        "       strex  %1, %2, [%3]\n"
        "       teq    %1, #1\n"
        "       beq    1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "       dmb\n"
#endif
        : "=&r" (result), "=&r" (tmp)
        : "r" (w), "r" (p)
        : "memory"
    );
#elif !defined(WITHSMP)
    result = *p;
    *p = w;
#else
#error xchg() unimplemented on this architecture
#endif
    return result;
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ (
        "lock\ncmpxchg %3,%1"
        :"=a"(o), "+m" (*(volatile unsigned int *)p)
        :"0" (o), "r" (n));
    return o;
#elif powerpc_HOST_ARCH
    StgWord result;
    __asm__ __volatile__ (
        "1:     lwarx     %0, 0, %3\n"
        "       cmpw      %0, %1\n"
        "       bne       2f\n"
        "       stwcx.    %2, 0, %3\n"
        "       bne-      1b\n"
        "2:"
        :"=&r" (result)
        :"r" (o), "r" (n), "r" (p)
        :"cc", "memory"
    );
    return result;
#elif sparc_HOST_ARCH
    __asm__ __volatile__ (
        "cas [%1], %2, %0"
        : "+r" (n)
        : "r" (p), "r" (o)
        : "memory"
    );
    return n;
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord r;
    arm_atomic_spin_lock();
    r = *p;
    if (r == o) { *p = n; }
    arm_atomic_spin_unlock();
    return r;
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv6)
    StgWord result, tmp;

    __asm__ __volatile__(
        "1:     ldrex   %1, [%2]\n"
        "       mov     %0, #0\n"
        "       teq     %1, %3\n"
        "       it      eq\n"
        "       strexeq %0, %4, [%2]\n"
        "       teq     %0, #1\n"
        "       it      eq\n"
        "       beq     1b\n"
#if !defined(arm_HOST_ARCH_PRE_ARMv7)
        "       dmb\n"
#endif
        : "=&r"(tmp), "=&r"(result)
        : "r"(p), "r"(o), "r"(n)
        : "cc", "memory");

    return result;
#elif !defined(WITHSMP)
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
#else
#error cas() unimplemented on this architecture
#endif
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = incr;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
            "+r" (r), "+m" (*p):
    );
    // xadd leaves the old value of *p in r, so the new value is r + incr.
    return r + incr;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old + incr;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    StgWord r;
    r = (StgWord)-1;
    __asm__ __volatile__ (
        "lock\nxadd %0,%1":
            "+r" (r), "+m" (*p):
    );
    return r-1;
#else
    StgWord old, new;
    do {
        old = *p;
        new = old - 1;
    } while (cas(p, old, new) != old);
    return new;
#endif
}

EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // "rep; nop" encodes the PAUSE instruction, a spin-loop hint to the CPU.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb st" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH && !defined(arm_HOST_ARCH_PRE_ARMv7)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif !defined(WITHSMP)
    return;
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
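
/*
 * Illustrative sketch (not part of the RTS API): without VOLATILE_LOAD
 * (or an explicitly volatile pointer) the compiler may hoist the load out
 * of the loop and spin forever on a stale value.  'flag' below is a
 * hypothetical shared word set by another thread.
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {
 *       busy_wait_nop();          // tell the CPU we are spinning
 *   }
 *   load_load_barrier();          // order later reads after seeing the flag
 */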

/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier     () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */