/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   https://gitlab.haskell.org/ghc/ghc/wikis/commentary/source-tree/includes
 *
 * -------------------------------------------------------------------------- */

#pragma once

#if defined(arm_HOST_ARCH) && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files. Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure()
 * in includes/rts/storage/SMPClosureOps.h) and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
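
/*
 * A minimal illustrative sketch (not part of the RTS API): xchg() can be used
 * to take a spin lock, in the spirit of lockClosure(). The name
 * example_spin_lock is hypothetical.
 *
 *   void example_spin_lock(StgVolatilePtr lock)
 *   {
 *       while (xchg((StgPtr)lock, 1) != 0) {  // atomically store 1, read the old value
 *           busy_wait_nop();                  // spin until the lock was free (0)
 *       }
 *   }
 *
 * The matching unlock issues a write_barrier() and then stores 0, as described
 * for the implementation of xchg() below.
 */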

/*
 * Compare-and-swap. Atomically does this:
 *
 * cas(p,o,n) {
 *     r = *p;
 *     if (r == o) { *p = n };
 *     return r;
 * }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
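
/*
 * A minimal illustrative sketch (not part of the RTS API): cas() is typically
 * used in a retry loop that recomputes the desired value until no other core
 * has raced with us. The helper name example_atomic_or is hypothetical.
 *
 *   StgWord example_atomic_or(StgVolatilePtr p, StgWord bits)
 *   {
 *       StgWord old, seen;
 *       do {
 *           old  = *p;                       // read the current value
 *           seen = cas(p, old, old | bits);  // try to install the new value
 *       } while (seen != old);               // another core changed *p; retry
 *       return old;
 *   }
 */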

/*
 * Atomic addition by the provided quantity
 *
 * atomic_inc(p, n) {
 *     return ((*p) += n);
 * }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);
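
/*
 * A minimal illustrative note (not part of the RTS API): atomic_inc() returns
 * the *new* value (add-and-fetch). Callers wanting fetch-and-add semantics,
 * as fetchAddIntArray# does, can recover the old value by subtracting the
 * increment again:
 *
 *   StgWord old = atomic_inc(p, n) - n;  // value of *p before the addition
 */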


/*
 * Atomic decrement
 *
 * atomic_dec(p) {
 *     return --(*p);
 * }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something. On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 *   http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
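
/*
 * A minimal illustrative sketch (hypothetical shared StgWords payload and
 * ready, both initially zero) of how these barriers pair up in the
 * message-passing idiom used throughout the RTS; see Note [Heap memory
 * barriers] below.
 *
 *   // producer: make the payload visible before the flag that guards it
 *   payload = 42;
 *   write_barrier();      // all preceding stores complete before the next store
 *   ready = 1;
 *
 *   // consumer: read the flag before the data it guards
 *   while (VOLATILE_LOAD(&ready) == 0) { busy_wait_nop(); }
 *   load_load_barrier();  // the flag load completes before the payload load
 *   ... use payload ...
 */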

/*
 * Note [Heap memory barriers]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Machines with weak memory ordering semantics have consequences for how
 * closures are observed and mutated. For example, consider a thunk that needs
 * to be updated to an indirection. In order for the indirection to be safe for
 * concurrent observers to enter, said observers must read the indirection's
 * info table before they read the indirectee. Furthermore, the indirectee must
 * be set before the info table pointer. This ensures that if the observer sees
 * an IND info table then the indirectee is valid.
 *
 * When a closure is updated with an indirection, both its info table and its
 * indirectee must be written. With weak memory ordering, these two writes can
 * be arbitrarily reordered, and perhaps even interleaved with other threads'
 * reads and writes (in the absence of memory barrier instructions). Consider
 * this example of a bad reordering:
 *
 * - An updater writes to a closure's info table (INFO_TYPE is now IND).
 * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND.
 * - A concurrent observer reads the closure's indirectee and enters it.
 * - An updater writes the closure's indirectee.
 *
 * Here the update to the indirectee comes too late and the concurrent observer
 * has jumped off into the abyss. Speculative execution can also cause us
 * issues; consider:
 *
 * - an observer is about to case on a value in a closure's info table.
 * - the observer speculatively reads one or more of the closure's fields.
 * - an updater writes to the closure's info table.
 * - the observer takes a branch based on the new info table value, but with the
 *   old closure fields!
 * - the updater writes to the closure's other fields, but it's too late.
 *
 * Because of these effects, reads and writes to a closure's info table must be
 * ordered carefully with respect to reads and writes to the closure's other
 * fields, and memory barriers must be placed to ensure that reads and writes
 * occur in program order. Specifically, updates to an already existing closure
 * must follow this pattern (sketched below):
 *
 * - Update the closure's (non-info table) fields.
 * - Write barrier.
 * - Update the closure's info table.
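 *
 * As an illustrative C-level sketch (variable names hypothetical; the real
 * update code lives in rts/Updates.h and the code generator):
 *
 *     ((StgInd *)p)->indirectee = v;     // write the payload field first
 *     write_barrier();                   // make the field visible before ...
 *     SET_INFO(p, &stg_BLACKHOLE_info);  // ... the info table is overwritten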
 *
 * Observing the fields of an updateable closure (e.g. a THUNK) must follow
 * this pattern (sketched below):
 *
 * - Read the closure's info pointer.
 * - Read barrier.
 * - Read the closure's (non-info table) fields.
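 *
 * As an illustrative sketch (variable names hypothetical), the read side is
 * the mirror image of the update above:
 *
 *     const StgInfoTable *info = GET_INFO(p);       // read the info pointer first
 *     load_load_barrier();                          // order it before ...
 *     StgClosure *ind = ((StgInd *)p)->indirectee;  // ... reading the fields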
 *
 * We must also take care when we expose a newly-allocated closure to other cores
 * by writing a pointer to it to some shared data structure (e.g. an MVar#, a
 * Message, or MutVar#). Specifically, we need to ensure that all writes
 * constructing the closure are visible *before* the write exposing the new
 * closure is made visible (sketched below):
 *
 * - Allocate memory for the closure
 * - Write the closure's info pointer and fields (ordering between these doesn't
 *   matter since the closure isn't yet visible to anyone else).
 * - Write barrier
 * - Make closure visible to other cores
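 *
 * As an illustrative sketch (all names hypothetical; the real code for the
 * cases enumerated below lives in the primops and the code generator):
 *
 *     StgClosure *c = (StgClosure *)allocate(cap, size_in_words);
 *     SET_HDR(c, info_ptr, ccs);  // initialise the header and fields freely;
 *     c->payload[0] = x;          // nothing else can see c yet
 *     write_barrier();            // flush the initialisation ...
 *     shared->field = c;          // ... before publishing the pointer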
 *
 * Note that thread stacks are inherently thread-local and consequently allocating an
 * object and introducing a reference to it on our stack needs no barrier.
 *
 * There are several ways in which the mutator may make a newly-allocated
 * closure visible to other cores:
 *
 * - Eager blackholing a THUNK:
 *   This is protected by an explicit write barrier in the eager blackholing
 *   code produced by the codegen. See GHC.StgToCmm.Bind.emitBlackHoleCode.
 *
 * - Lazy blackholing a THUNK:
 *   This is protected by an explicit write barrier in the thread suspension
 *   code. See ThreadPaused.c:threadPaused.
 *
 * - Updating a BLACKHOLE:
 *   This case is protected by explicit write barriers in the update frame
 *   entry code (see rts/Updates.h).
 *
 * - Blocking on an MVar# (e.g. takeMVar#):
 *   In this case the appropriate MVar primops (e.g. stg_takeMVarzh) include
 *   explicit memory barriers to ensure that the newly-allocated
 *   MVAR_TSO_QUEUE is visible to other cores.
 *
 * - Write to an MVar# (e.g. putMVar#):
 *   This is protected by the full barrier implied by the CAS in putMVar#.
 *
 * - Write to a TVar#:
 *   This is protected by the full barrier implied by the CAS in STM.c:lock_stm.
 *
 * - Write to an Array#, ArrayArray#, or SmallArray#:
 *   This case is protected by an explicit write barrier in the code produced
 *   for this primop by the codegen. See GHC.StgToCmm.Prim.doWritePtrArrayOp and
 *   GHC.StgToCmm.Prim.doWriteSmallPtrArrayOp. Relevant issue: #12469.
 *
 * - Write to MutVar# via writeMutVar#:
 *   This case is protected by an explicit write barrier in the code produced
 *   for this primop by the codegen.
 *
 * - Write to MutVar# via atomicModifyMutVar# or casMutVar#:
 *   This is protected by the full barrier implied by the cmpxchg operations
 *   in these primops.
 *
 * - Sending a Message to another capability:
 *   This is protected by the acquisition and release of the target capability's
 *   lock in Messages.c:sendMessage.
 *
 * Finally, we must ensure that we flush all cores' store buffers before
 * entering and leaving GC, since stacks may be read by other cores. This
 * happens as a side-effect of taking and releasing mutexes (which imply
 * acquire and release barriers, respectively).
 *
 * N.B. recordClosureMutated places a reference to the mutated object on
 * the capability-local mut_list. Consequently this does not require any memory
 * barrier.
 *
 * During parallel GC we need to be careful during evacuation: before replacing
 * a closure with a forwarding pointer we must issue a write barrier to ensure
 * that the copy we made in to-space is visible to other cores.
 *
 * However, we can be a bit lax when *reading* during GC. Specifically, the GC
 * can only make a very limited set of changes to existing closures:
 *
 * - it can replace a closure's info table with stg_WHITEHOLE.
 * - it can replace a previously-whitehole'd closure's info table with a
 *   forwarding pointer
 * - it can replace a previously-whitehole'd closure's info table with a
 *   valid info table pointer (done in eval_thunk_selector)
 * - it can update the value of a pointer field after evacuating it
 *
 * This is quite nice since we don't need to worry about an interleaving
 * of writes producing an invalid state: a closure's fields remain valid after
 * an update of its info table pointer and vice-versa.
 *
 * After a round of parallel scavenging we must also ensure that any writes the
 * GC thread workers made are visible to the main GC thread. This is ensured by
 * the full barrier implied by the atomic decrement in
 * GC.c:scavenge_until_all_done.
 *
 * The work-stealing queue (WSDeque) also requires barriers; these are
 * documented in WSDeque.c.
 *
 */

/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

/*
 * Exchange the value pointed to by p with w and return the former. This
 * function is used to acquire a lock. An acquire memory barrier is sufficient
 * for a lock operation because the corresponding unlock operation issues a
 * store-store barrier (write_barrier()) immediately before releasing the lock.
 */
EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    // When porting GHC to a new platform check that
    // __sync_lock_test_and_set() actually stores w in *p.
    // Use test rts/atomicxchg to verify that the correct value is stored.
    // From the gcc manual:
    // (https://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Atomic-Builtins.html)
    //     This built-in function, as described by Intel, is not
    //     a traditional test-and-set operation, but rather an atomic
    //     exchange operation.
    //     [...]
    //     Many targets have only minimal support for such locks,
    //     and do not support a full exchange operation. In this case,
    //     a target may support reduced functionality here by which the
    //     only valid value to store is the immediate constant 1. The
    //     exact value actually stored in *ptr is implementation defined.
    return __sync_lock_test_and_set(p, w);
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction. Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    return __sync_val_compare_and_swap(p, o, n);
}

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
// PT: add-and-fetch, returns new value
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return __sync_add_and_fetch(p, incr);
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
    return __sync_sub_and_fetch(p, (StgWord) 1);
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread. In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
    || defined(powerpc64le_HOST_ARCH)
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif defined(sparc_HOST_ARCH)
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif defined(i386_HOST_ARCH)
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
    || defined(powerpc64le_HOST_ARCH)
    __asm__ __volatile__ ("sync" : : : "memory");
#elif defined(sparc_HOST_ARCH)
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif defined(arm_HOST_ARCH)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif defined(aarch64_HOST_ARCH)
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif defined(i386_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
    || defined(powerpc64le_HOST_ARCH)
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif defined(sparc_HOST_ARCH)
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(arm_HOST_ARCH)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif defined(aarch64_HOST_ARCH)
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently. This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy wait loop for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
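
/*
 * A minimal illustrative sketch (hypothetical flag variable): without
 * VOLATILE_LOAD the compiler could hoist the load out of the loop and spin
 * forever on a stale value.
 *
 *   while (VOLATILE_LOAD(&flag) == 0) {
 *       busy_wait_nop();   // re-reads flag from memory on every iteration
 *   }
 */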

/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */