/* ----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 2005-2011
 *
 * Macros for multi-CPU support
 *
 * Do not #include this file directly: #include "Rts.h" instead.
 *
 * To understand the structure of the RTS headers, see the wiki:
 *   https://gitlab.haskell.org/ghc/ghc/wikis/commentary/source-tree/includes
 *
 * -------------------------------------------------------------------------- */

#pragma once

#if defined(arm_HOST_ARCH) && defined(arm_HOST_ARCH_PRE_ARMv6)
void arm_atomic_spin_lock(void);
void arm_atomic_spin_unlock(void);
#endif

#if defined(THREADED_RTS)

/* ----------------------------------------------------------------------------
   Atomic operations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN
// We only want the barriers, e.g. write_barrier(), declared in .hc
// files. Defining the other inline functions here causes type
// mismatch errors from gcc, because the generated C code is assuming
// that there are no prototypes in scope.

/*
 * The atomic exchange operation: xchg(p,w) exchanges the value
 * pointed to by p with the value w, returning the old value.
 *
 * Used for locking closures during updates (see lockClosure()
 * in includes/rts/storage/SMPClosureOps.h) and the MVar primops.
 */
EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);

/*
 * Compare-and-swap. Atomically does this:
 *
 *   cas(p,o,n) {
 *       r = *p;
 *       if (r == o) { *p = n; }
 *       return r;
 *   }
 */
EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n);

/*
 * Atomic addition by the provided quantity
 *
 *   atomic_inc(p, n) {
 *       return ((*p) += n);
 *   }
 */
EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);


/*
 * Atomic decrement
 *
 *   atomic_dec(p) {
 *       return --(*p);
 *   }
 */
EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);

/*
 * Busy-wait nop: this is a hint to the CPU that we are currently in a
 * busy-wait loop waiting for another CPU to change something. On a
 * hyperthreaded CPU it should yield to another thread, for example.
 */
EXTERN_INLINE void busy_wait_nop(void);

#endif // !IN_STG_CODE

/*
 * Various kinds of memory barrier.
 *  write_barrier: prevents future stores occurring before preceding stores.
 *  store_load_barrier: prevents future loads occurring before preceding stores.
 *  load_load_barrier: prevents future loads occurring before earlier loads.
 *
 * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
 * http://gee.cs.oswego.edu/dl/jmm/cookbook.html
 *
 * To check whether you got these right, try the test in
 *   testsuite/tests/rts/testwsdeque.c
 * This tests the work-stealing deque implementation, which relies on
 * properly working store_load and load_load memory barriers.
 */
EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);

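/*
 * A classic illustration of why store_load_barrier() matters (a sketch only;
 * x, y and the two threads below are hypothetical, both words initially zero):
 *
 *     // Thread A               // Thread B
 *     x = 1;                    y = 1;
 *     store_load_barrier();     store_load_barrier();
 *     if (y == 0) { ... }       if (x == 0) { ... }
 *
 * Without the barriers the stores may linger in a store buffer past the loads,
 * and both threads can observe the other's flag as still zero. This is the
 * kind of property the work-stealing deque test above depends on.
 */
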
/*
 * Note [Heap memory barriers]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Machines with weak memory ordering semantics have consequences for how
 * closures are observed and mutated. For example, consider a thunk that needs
 * to be updated to an indirection. In order for the indirection to be safe for
 * concurrent observers to enter, said observers must read the indirection's
 * info table before they read the indirectee. Furthermore, the indirectee must
 * be set before the info table pointer. This ensures that if the observer sees
 * an IND info table then the indirectee is valid.
 *
 * When a closure is updated with an indirection, both its info table and its
 * indirectee must be written. With weak memory ordering, these two writes can
 * be arbitrarily reordered, and perhaps even interleaved with other threads'
 * reads and writes (in the absence of memory barrier instructions). Consider
 * this example of a bad reordering:
 *
 * - An updater writes to a closure's info table (INFO_TYPE is now IND).
 * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND.
 * - A concurrent observer reads the closure's indirectee and enters it.
 * - An updater writes the closure's indirectee.
 *
 * Here the update to the indirectee comes too late and the concurrent observer
 * has jumped off into the abyss. Speculative execution can also cause us
 * issues, consider:
 *
 * - An observer is about to case on a value in the closure's info table.
 * - The observer speculatively reads one or more of the closure's fields.
 * - An updater writes to the closure's info table.
 * - The observer takes a branch based on the new info table value, but with the
 *   old closure fields!
 * - The updater writes to the closure's other fields, but it's too late.
 *
 * Because of these effects, reads and writes to a closure's info table must be
 * ordered carefully with respect to reads and writes to the closure's other
 * fields, and memory barriers must be placed to ensure that reads and writes
 * occur in program order. Specifically, updates to an already existing closure
 * must follow this pattern:
 *
 * - Update the closure's (non-info table) fields.
 * - Write barrier.
 * - Update the closure's info table.
 *
 * Observing the fields of an updateable closure (e.g. a THUNK) must follow
 * this pattern:
 *
 * - Read the closure's info pointer.
 * - Read barrier.
 * - Read the closure's (non-info table) fields.
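 *
 * As a concrete sketch (illustrative only, not the RTS's actual update code;
 * p and v are illustrative variables, and the usual closure macros are used
 * loosely):
 *
 *     // updater: write the payload before the info pointer
 *     ((StgInd *)p)->indirectee = v;
 *     write_barrier();
 *     SET_INFO((StgClosure *)p, &stg_IND_info);
 *
 *     // observer: read the info pointer before the payload
 *     const StgInfoTable *info = GET_INFO((StgClosure *)p);
 *     load_load_barrier();
 *     StgClosure *ind = ((StgInd *)p)->indirectee;  // valid if info says IND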
 *
 * We must also take care when we expose a newly-allocated closure to other cores
 * by writing a pointer to it to some shared data structure (e.g. an MVar#, a
 * Message, or MutVar#). Specifically, we need to ensure that all writes
 * constructing the closure are visible *before* the write exposing the new
 * closure is made visible (see the sketch below):
 *
 * - Allocate memory for the closure
 * - Write the closure's info pointer and fields (ordering between these writes
 *   doesn't matter since the closure isn't yet visible to anyone else).
 * - Write barrier
 * - Make closure visible to other cores
 *
 * Note that thread stacks are inherently thread-local and consequently allocating an
 * object and introducing a reference to it to our stack needs no barrier.
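 *
 * A sketch of this publication pattern (illustrative only; "shared_slot" is a
 * hypothetical location that other cores can already see, e.g. an MVar# field
 * or a MutVar#):
 *
 *     StgClosure *c = ...allocate and initialise header and fields...;
 *     write_barrier();     // all initialising writes become visible first
 *     *shared_slot = c;    // only now expose the closure to other cores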
 *
 * There are several ways in which the mutator may make a newly-allocated
 * closure visible to other cores:
 *
 * - Eager blackholing a THUNK:
 *   This is protected by an explicit write barrier in the eager blackholing
 *   code produced by the codegen. See GHC.StgToCmm.Bind.emitBlackHoleCode.
 *
 * - Lazy blackholing a THUNK:
 *   This is protected by an explicit write barrier in the thread suspension
 *   code. See ThreadPaused.c:threadPaused.
 *
 * - Updating a BLACKHOLE:
 *   This case is protected by explicit write barriers in the update frame
 *   entry code (see rts/Updates.h).
 *
 * - Blocking on an MVar# (e.g. takeMVar#):
 *   In this case the appropriate MVar primops (e.g. stg_takeMVarzh) include
 *   explicit memory barriers to ensure that the newly-allocated
 *   MVAR_TSO_QUEUE is visible to other cores.
 *
 * - Write to an MVar# (e.g. putMVar#):
 *   This is protected by the full barrier implied by the CAS in putMVar#.
 *
 * - Write to a TVar#:
 *   This is protected by the full barrier implied by the CAS in STM.c:lock_stm.
 *
 * - Write to an Array#, ArrayArray#, or SmallArray#:
 *   This case is protected by an explicit write barrier in the code produced
 *   for this primop by the codegen. See GHC.StgToCmm.Prim.doWritePtrArrayOp and
 *   GHC.StgToCmm.Prim.doWriteSmallPtrArrayOp. Relevant issue: #12469.
 *
 * - Write to a MutVar# via writeMutVar#:
 *   This case is protected by an explicit write barrier in the code produced
 *   for this primop by the codegen.
 *
 * - Write to a MutVar# via atomicModifyMutVar# or casMutVar#:
 *   This is protected by the full barrier implied by the cmpxchg operations
 *   in these primops.
 *
 * - Sending a Message to another capability:
 *   This is protected by the acquisition and release of the target capability's
 *   lock in Messages.c:sendMessage.
 *
 * Finally, we must ensure that we flush all cores' store buffers before
 * entering and leaving GC, since stacks may be read by other cores. This
 * happens as a side-effect of taking and releasing mutexes (which implies
 * acquire and release barriers, respectively).
 *
 * N.B. recordClosureMutated places a reference to the mutated object on
 * the capability-local mut_list. Consequently this does not require any memory
 * barrier.
 *
 * During parallel GC we need to be careful during evacuation: before replacing
 * a closure with a forwarding pointer we must issue a write barrier to ensure
 * that the copy we made in to-space is visible to other cores.
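 *
 * Schematically (a sketch of the idea, not the actual code in Evac.c):
 *
 *     to = ...allocate space in to-space...;
 *     memcpy(to, from, size * sizeof(StgWord));      // make the copy
 *     write_barrier();                               // copy visible before the
 *     ((StgClosure *)from)->header.info =            // forwarding pointer is
 *         (const StgInfoTable *)MK_FORWARDING_PTR(to);   // installed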
 *
 * However, we can be a bit lax when *reading* during GC. Specifically, the GC
 * can only make a very limited set of changes to existing closures:
 *
 *  - it can replace a closure's info table with stg_WHITEHOLE.
 *  - it can replace a previously-whitehole'd closure's info table with a
 *    forwarding pointer
 *  - it can replace a previously-whitehole'd closure's info table with a
 *    valid info table pointer (done in eval_thunk_selector)
 *  - it can update the value of a pointer field after evacuating it
 *
 * This is quite nice since we don't need to worry about an interleaving
 * of writes producing an invalid state: a closure's fields remain valid after
 * an update of its info table pointer and vice-versa.
 *
 * After a round of parallel scavenging we must also ensure that any writes the
 * GC thread workers made are visible to the main GC thread. This is ensured by
 * the full barrier implied by the atomic decrement in
 * GC.c:scavenge_until_all_done.
 *
 * The work-stealing queue (WSDeque) also requires barriers; these are
 * documented in WSDeque.c.
 *
 */

/* ----------------------------------------------------------------------------
   Implementations
   ------------------------------------------------------------------------- */

#if !IN_STG_CODE || IN_STGCRUN

/*
 * Exchange the value pointed to by p with w and return the former. This
 * function is used to acquire a lock. An acquire memory barrier is sufficient
 * for a lock operation because the corresponding unlock operation issues a
 * store-store barrier (write_barrier()) immediately before releasing the lock.
 */
EXTERN_INLINE StgWord
xchg(StgPtr p, StgWord w)
{
    // When porting GHC to a new platform check that
    // __sync_lock_test_and_set() actually stores w in *p.
    // Use the test rts/atomicxchg to verify that the correct value is stored.
    // From the gcc manual:
    // (https://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Atomic-Builtins.html)
    //   This built-in function, as described by Intel, is not
    //   a traditional test-and-set operation, but rather an atomic
    //   exchange operation.
    //   [...]
    //   Many targets have only minimal support for such locks,
    //   and do not support a full exchange operation. In this case,
    //   a target may support reduced functionality here by which the
    //   only valid value to store is the immediate constant 1. The
    //   exact value actually stored in *ptr is implementation defined.
    return __sync_lock_test_and_set(p, w);
}

/*
 * CMPXCHG - the single-word atomic compare-and-exchange instruction. Used
 * in the STM implementation.
 */
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    return __sync_val_compare_and_swap(p, o, n);
}

EXTERN_INLINE StgWord8
cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n)
{
    return __sync_val_compare_and_swap(p, o, n);
}
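
/*
 * A typical use of cas() is a retry loop that atomically transforms a word;
 * e.g. a sketch of an atomic increment built on cas() alone ("counter" is
 * just an illustrative variable, not an RTS field):
 *
 *     StgWord old, seen;
 *     do {
 *         old  = *counter;
 *         seen = cas(counter, old, old + 1);
 *     } while (seen != old);
 */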

// RRN: Generalized to arbitrary increments to enable fetch-and-add in
// Haskell code (fetchAddIntArray#).
// PT: add-and-fetch, returns new value
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return __sync_add_and_fetch(p, incr);
}

EXTERN_INLINE StgWord
atomic_dec(StgVolatilePtr p)
{
    return __sync_sub_and_fetch(p, (StgWord) 1);
}

/*
 * Some architectures have a way to tell the CPU that we're in a
 * busy-wait loop, and the processor should look for something else to
 * do (such as run another hardware thread).
 */
EXTERN_INLINE void
busy_wait_nop(void)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    // On Intel, the busy-wait-nop instruction is called "pause",
    // which is actually represented as a nop with the rep prefix.
    // On processors before the P4 this behaves as a nop; on P4 and
    // later it might do something clever like yield to another
    // hyperthread. In any case, Intel recommends putting one
    // of these in a spin lock loop.
    __asm__ __volatile__ ("rep; nop");
#else
    // nothing
#endif
}
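
/*
 * Taken together, xchg(), busy_wait_nop() and write_barrier() give the usual
 * spin-lock pattern (a sketch of the idea only; see lockClosure() in
 * includes/rts/storage/SMPClosureOps.h for a real example):
 *
 *     // acquire: xchg()'s acquire barrier suffices, because the release path
 *     // below issues a write_barrier() before clearing the lock word
 *     while (xchg(&lock, 1) != 0) {
 *         busy_wait_nop();
 *     }
 *
 *     // ... critical section ...
 *
 *     // release
 *     write_barrier();
 *     lock = 0;
 */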

#endif // !IN_STG_CODE

/*
 * We need to tell both the compiler AND the CPU about the barriers.
 * It's no good preventing the CPU from reordering the operations if
 * the compiler has already done so - hence the "memory" restriction
 * on each of the barriers below.
 */
EXTERN_INLINE void
write_barrier(void) {
#if defined(NOSMP)
    return;
#elif defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
    || defined(powerpc64le_HOST_ARCH)
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif defined(s390x_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(sparc_HOST_ARCH)
    /* Sparc in TSO mode does not require store/store barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
    __asm__ __volatile__ ("dmb st" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
store_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif defined(i386_HOST_ARCH)
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
#elif defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
    || defined(powerpc64le_HOST_ARCH)
    __asm__ __volatile__ ("sync" : : : "memory");
#elif defined(s390x_HOST_ARCH)
    __asm__ __volatile__ ("bcr 14,0" : : : "memory");
#elif defined(sparc_HOST_ARCH)
    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
#elif defined(arm_HOST_ARCH)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif defined(aarch64_HOST_ARCH)
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

EXTERN_INLINE void
load_load_barrier(void) {
#if defined(NOSMP)
    return;
#elif defined(i386_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(x86_64_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
    || defined(powerpc64le_HOST_ARCH)
    __asm__ __volatile__ ("lwsync" : : : "memory");
#elif defined(s390x_HOST_ARCH)
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(sparc_HOST_ARCH)
    /* Sparc in TSO mode does not require load/load barriers. */
    __asm__ __volatile__ ("" : : : "memory");
#elif defined(arm_HOST_ARCH)
    __asm__ __volatile__ ("dmb" : : : "memory");
#elif defined(aarch64_HOST_ARCH)
    __asm__ __volatile__ ("dmb sy" : : : "memory");
#else
#error memory barriers unimplemented on this architecture
#endif
}

// Load a pointer from a memory location that might be being modified
// concurrently. This prevents the compiler from optimising away
// multiple loads of the memory location, as it might otherwise do in
// a busy-wait loop, for example.
#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
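
// For example, a loop that waits for another core to set a flag might be
// written as follows (illustrative only; "flag" is a hypothetical StgWord):
//
//     while (VOLATILE_LOAD(&flag) == 0) {
//         busy_wait_nop();
//     }
//
// Without VOLATILE_LOAD the compiler could hoist the load out of the loop
// and spin forever on a stale value.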

/* ---------------------------------------------------------------------- */
#else /* !THREADED_RTS */

EXTERN_INLINE void write_barrier(void);
EXTERN_INLINE void store_load_barrier(void);
EXTERN_INLINE void load_load_barrier(void);
EXTERN_INLINE void write_barrier     () {} /* nothing */
EXTERN_INLINE void store_load_barrier() {} /* nothing */
EXTERN_INLINE void load_load_barrier () {} /* nothing */

#if !IN_STG_CODE || IN_STGCRUN
INLINE_HEADER StgWord
xchg(StgPtr p, StgWord w)
{
    StgWord old = *p;
    *p = w;
    return old;
}

EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
EXTERN_INLINE StgWord
cas(StgVolatilePtr p, StgWord o, StgWord n)
{
    StgWord result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n);
EXTERN_INLINE StgWord8
cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n)
{
    StgWord8 result;
    result = *p;
    if (result == o) {
        *p = n;
    }
    return result;
}

EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
EXTERN_INLINE StgWord
atomic_inc(StgVolatilePtr p, StgWord incr)
{
    return ((*p) += incr);
}


INLINE_HEADER StgWord
atomic_dec(StgVolatilePtr p)
{
    return --(*p);
}
#endif

#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))

#endif /* !THREADED_RTS */