RTS SMP: Use compiler built-ins on all platforms.
includes/stg/SMP.h
/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
 *
 * -------------------------------------------------------------------------- */

#ifndef SMP_H
#define SMP_H

#if arm_HOST_ARCH && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files.  Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure()
 * in includes/rts/storage/SMPClosureOps.h) and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);

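/*
 * Illustrative sketch (not part of this header) of how xchg() can be used
 * to build a simple spin lock, assuming a hypothetical StgWord lock word
 * where 0 means "unlocked" and 1 means "locked":
 *
 *   // acquire: xchg() has acquire semantics (see its implementation below)
 *   while (xchg(&lock, 1) != 0) { busy_wait_nop(); }
 *   ... critical section ...
 *   // release: order preceding stores before the unlocking store
 *   write_barrier();
 *   lock = 0;
 *
 * lockClosure() in SMPClosureOps.h follows a similar pattern, using the
 * closure's info pointer as the lock word.
 */
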
/*
 * Compare-and-swap.  Atomically does this:
 *
 *   cas(p,o,n) {
 *       r = *p;
 *       if (r == o) { *p = n };
 *       return r;
 *   }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);

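/*
 * Illustrative sketch (not part of this header), assuming a hypothetical
 * shared counter "ctr": cas() returns the value it actually read, so a
 * lock-free update retries until that value matches the one we expected:
 *
 *   StgWord old, seen;
 *   do {
 *       old  = ctr;
 *       seen = cas(&ctr, old, old + 1);
 *   } while (seen != old);   // another CPU got in first; retry
 */
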
/*
 * Atomic addition by the provided quantity
 *
 *   atomic_inc(p, n) {
 *       return ((*p) += n);
 *   }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);

/*
 * Atomic decrement
 *
 *   atomic_dec(p) {
 *       return --(*p);
 *   }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

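/*
 * Illustrative sketch (not part of this header): both operations return the
 * *new* value, which makes reference counting straightforward. "refs" and
 * free_it() are hypothetical:
 *
 *   atomic_inc(&refs, 1);            // take a reference
 *   ...
 *   if (atomic_dec(&refs) == 0) {    // drop it; we were the last holder
 *       free_it();
 *   }
 */
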
/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something.  On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

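/*
 * Illustrative sketch (not part of this header) of a busy-wait loop,
 * assuming a hypothetical flag written by another CPU (VOLATILE_LOAD is
 * defined later in this file):
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {
 *       busy_wait_nop();
 *   }
 *   load_load_barrier();   // before reading data published with the flag
 */
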
#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);

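/*
 * Illustrative sketch (not part of this header) of how the barriers pair up,
 * assuming a hypothetical producer publishing "data" behind a "ready" flag:
 *
 *   producer:                      consumer:
 *     data = 42;                     while (VOLATILE_LOAD(&ready) == 0) {}
 *     write_barrier();               load_load_barrier();
 *     ready = 1;                     ... read data; sees 42 ...
 *
 * write_barrier() keeps the store to data before the store to ready;
 * load_load_barrier() keeps the load of data after the load of ready.
 */
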
/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

/*
 * Exchange the value pointed to by p with w and return the former. This
 * function is used to acquire a lock. An acquire memory barrier is
 * sufficient for a lock operation because the corresponding unlock
 * operation issues a store-store barrier (write_barrier()) immediately
 * before releasing the lock.
 */
EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    // When porting GHC to a new platform check that
    // __sync_lock_test_and_set() actually stores w in *p.
    // Use the test rts/atomicxchg to verify that the correct value is stored.
    // From the gcc manual:
    //  (https://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Atomic-Builtins.html)
    //   This built-in function, as described by Intel, is not
    //   a traditional test-and-set operation, but rather an atomic
    //   exchange operation.
    //   [...]
    //   Many targets have only minimal support for such locks,
    //   and do not support a full exchange operation. In this case,
    //   a target may support reduced functionality here by which the
    //   only valid value to store is the immediate constant 1. The
    //   exact value actually stored in *ptr is implementation defined.
    return __sync_lock_test_and_set(p, w);
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    return __sync_val_compare_and_swap(p, o, n);
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
// PT: add-and-fetch, returns new value
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return __sync_add_and_fetch(p, incr);
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
    return __sync_sub_and_fetch(p, (StgWord) 1);
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread.  In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH || x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif (arm_HOST_ARCH) || aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("sync" : : : "memory");
#elif sparc_HOST_ARCH
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

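/*
 * Illustrative sketch (not part of this header) of where a store-load
 * barrier is needed: a Dekker-style handshake, as in the JSR-133 cookbook
 * referenced above (flag0, flag1, and enter() are hypothetical; both flags
 * start at 0):
 *
 *   CPU 0:                            CPU 1:
 *     flag0 = 1;                        flag1 = 1;
 *     store_load_barrier();             store_load_barrier();
 *     if (VOLATILE_LOAD(&flag1) == 0)   if (VOLATILE_LOAD(&flag0) == 0)
 *         { enter(); }                      { enter(); }
 *
 * Without the barrier, each CPU's store may still be sitting in its store
 * buffer when the other CPU loads the flag, so both may read 0 and enter;
 * write_barrier() and load_load_barrier() alone do not rule this out.
 */
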
EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif i386_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif x86_64_HOST_ARCH
    __asm__ __volatile__ ("" : : : "memory");
#elif powerpc_HOST_ARCH || powerpc64_HOST_ARCH || powerpc64le_HOST_ARCH
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif sparc_HOST_ARCH
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif arm_HOST_ARCH
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif aarch64_HOST_ARCH
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently.  This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy-wait loop, for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))

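/*
 * Illustrative sketch (not part of this header) of why this matters, with a
 * hypothetical flag: a plain
 *
 *   while (flag == 0) { busy_wait_nop(); }
 *
 * may be compiled as if it were "if (flag == 0) { for (;;) busy_wait_nop(); }",
 * because the compiler may assume flag does not change, whereas
 *
 *   while (VOLATILE_LOAD(&flag) == 0) { busy_wait_nop(); }
 *
 * forces a fresh load of flag on every iteration.
 */
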
/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier() {}      /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier() {}  /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}

INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */

#endif /* SMP_H */