cpp: Use #pragma once instead of #ifndef guards
includes/stg/SMP.h (ghc.git)
/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#pragma once

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure()
 * in includes/rts/storage/SMPClosureOps.h) and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
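/*
 * Illustrative sketch (not part of this header): one way xchg() can be
 * used to take a simple test-and-set lock.  The lock word, its 0/1
 * encoding (0 = unlocked, 1 = locked) and the spin_lock_acquire() name
 * are hypothetical; the RTS's real closure locking lives in
 * includes/rts/storage/SMPClosureOps.h.
 *
 *   static void spin_lock_acquire(StgWord *lock)
 *   {
 *       // xchg() returns the previous value, so seeing 0 means this
 *       // thread is the one that flipped the word from 0 to 1.
 *       while (xchg(lock, 1) != 0) { }
 *   }
 */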

/*
 * Compare-and-swap.  Atomically does this:
 *
 * cas(p,o,n) {
 *    r = *p;
 *    if (r == o) { *p = n };
 *    return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
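/*
 * Illustrative sketch (not part of this header): the usual retry loop for
 * building a read-modify-write operation out of cas().  The
 * atomic_store_max() name is hypothetical.
 *
 *   static void atomic_store_max(StgVolatilePtr p, StgWord n)
 *   {
 *       StgWord old;
 *       do {
 *           old = *p;
 *           if (old >= n) return;          // already large enough
 *       } while (cas(p, old, n) != old);   // retry if another thread raced us
 *   }
 */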

/*
 * Atomic addition by the provided quantity
 *
 * atomic_inc(p, n) {
 *    return ((*p) += n);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);
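/*
 * Illustrative sketch (not part of this header): atomic_inc() returns the
 * new value (add-and-fetch), so the value seen before the addition can be
 * recovered by subtracting the increment again.  The counter variable is
 * hypothetical.
 *
 *   StgWord new_val = atomic_inc(&counter, 5);   // add-and-fetch
 *   StgWord old_val = new_val - 5;               // what fetch-and-add would return
 */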


/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *    return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);
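/*
 * Illustrative sketch (not part of this header): busy_wait_nop() is meant
 * to sit inside the spin loop itself.  A common refinement of the
 * xchg()-based lock sketched above is test-and-test-and-set: spin on a
 * plain read and only retry the atomic exchange once the lock word looks
 * free.  The spin_lock_acquire2() name and lock-word encoding are
 * hypothetical.
 *
 *   static void spin_lock_acquire2(StgWord *lock)
 *   {
 *       while (xchg(lock, 1) != 0) {
 *           while (*(StgVolatilePtr)lock != 0) {
 *               busy_wait_nop();   // tell the CPU we are just spinning
 *           }
 *       }
 *   }
 */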

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 * http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 * testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
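/*
 * Illustrative sketch (not part of this header): the publication pattern
 * these barriers support.  A writer fully initialises an object before
 * publishing a pointer to it; a reader orders the load of the pointer
 * before the loads of its contents.  All names here (obj, shared, Obj,
 * use) are hypothetical; VOLATILE_LOAD is defined further down in this
 * file.
 *
 *   // writer: publish obj
 *   obj->field = 42;
 *   write_barrier();          // contents visible no later than the pointer
 *   shared = (StgWord)obj;
 *
 *   // reader: consume obj
 *   StgWord q = VOLATILE_LOAD(&shared);
 *   load_load_barrier();      // read the pointer before reading through it
 *   if (q != 0) { use(((Obj *)q)->field); }
 */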

/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

/*
 * Exchange the value pointed to by p with w and return the former.  This
 * function is used to acquire a lock.  An acquire memory barrier is sufficient
 * for a lock operation because the corresponding unlock operation issues a
 * store-store barrier (write_barrier()) immediately before releasing the lock.
 */
EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    // When porting GHC to a new platform check that
    // __sync_lock_test_and_set() actually stores w in *p.
    // Use the test rts/atomicxchg to verify that the correct value is stored.
    // From the gcc manual:
    // (https://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Atomic-Builtins.html)
    //   This built-in function, as described by Intel, is not
    //   a traditional test-and-set operation, but rather an atomic
    //   exchange operation.
    //   [...]
    //   Many targets have only minimal support for such locks,
    //   and do not support a full exchange operation. In this case,
    //   a target may support reduced functionality here by which the
    //   only valid value to store is the immediate constant 1. The
    //   exact value actually stored in *ptr is implementation defined.
    return __sync_lock_test_and_set(p, w);
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    return __sync_val_compare_and_swap(p, o, n);
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
// PT: add-and-fetch, returns new value
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return __sync_add_and_fetch(p, incr);
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
    return __sync_sub_and_fetch(p, (StgWord) 1);
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread.  In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif (arm_HOST_ARCH) || aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
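/*
 * Illustrative sketch (not part of this header): a busy-wait loop that
 * re-reads a shared flag on every iteration.  Without VOLATILE_LOAD the
 * compiler could hoist the load out of the loop and spin forever.  The
 * flag variable is hypothetical.
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {
 *       busy_wait_nop();
 *   }
 */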

/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */