652ba13b4fbc5e985015764a37ac81b139a5c7e1
[ghc.git] / rts / win32 / OSThreads.c
1 /* ---------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 2001-2005
4 *
5 * Accessing OS threads functionality in a (mostly) OS-independent
6 * manner.
7 *
8 * --------------------------------------------------------------------------*/
9
10 #include "Rts.h"
11 #include <windows.h>
12 #include "sm/OSMem.h"
13 #if defined(THREADED_RTS)
14 #include "RtsUtils.h"
15
16 /* For reasons not yet clear, the entire contents of process.h is protected
17 * by __STRICT_ANSI__ not being defined.
18 */
19 #undef __STRICT_ANSI__
20 #include <process.h>
21
22
23 /* Processor group info cache. */
24 static uint8_t* cpuGroupCache = NULL;
25 /* Processor group cumulative summary cache. */
26 static uint32_t* cpuGroupCumulativeCache = NULL;
27 /* Processor group dist cache. */
28 static uint8_t* cpuGroupDistCache = NULL;
29
30 /* Win32 threads and synchronisation objects */
31
32 /* A Condition is represented by a Win32 Event object;
33 * a Mutex by a Mutex kernel object.
34 *
35 * ToDo: go through the defn and usage of these to
36 * make sure the semantics match up with that of
37 * the (assumed) pthreads behaviour. This is really
38 * just a first pass at getting something compilable.
39 */
40
41 void
42 initCondition( Condition* pCond )
43 {
44 HANDLE h = CreateEvent(NULL,
45 FALSE, /* auto reset */
46 FALSE, /* initially not signalled */
47 NULL); /* unnamed => process-local. */
48
49 if ( h == NULL ) {
50 sysErrorBelch("initCondition: unable to create");
51 stg_exit(EXIT_FAILURE);
52 }
53 *pCond = h;
54 return;
55 }
56
57 void
58 closeCondition( Condition* pCond )
59 {
60 if ( CloseHandle(*pCond) == 0 ) {
61 sysErrorBelch("closeCondition: failed to close");
62 }
63 return;
64 }
65
// Wake all threads currently blocked on the condition's event.
// NOTE(review): PulseEvent is documented by Microsoft as unreliable --
// a waiter momentarily taken out of the wait state (e.g. by a kernel
// APC) misses the pulse entirely, so wakeups can be lost.  Confirm
// whether callers tolerate missed broadcasts.  The return value of
// PulseEvent is deliberately ignored here.
bool
broadcastCondition ( Condition* pCond )
{
    PulseEvent(*pCond);
    return true;
}
72
73 bool
74 signalCondition ( Condition* pCond )
75 {
76 if (SetEvent(*pCond) == 0) {
77 sysErrorBelch("SetEvent");
78 stg_exit(EXIT_FAILURE);
79 }
80 return true;
81 }
82
83 bool
84 waitCondition ( Condition* pCond, Mutex* pMut )
85 {
86 RELEASE_LOCK(pMut);
87 WaitForSingleObject(*pCond, INFINITE);
88 /* Hmm..use WaitForMultipleObjects() ? */
89 ACQUIRE_LOCK(pMut);
90 return true;
91 }
92
void
yieldThread()
{
    /* Offer the remainder of this time slice to another thread that
       is ready to run on the current processor. */
    SwitchToThread();
}
99
// Terminate the calling OS thread with exit code 0.  ExitThread never
// returns; the barf exists only to satisfy the compiler's control-flow
// analysis.
void
shutdownThread()
{
    ExitThread(0);
    barf("ExitThread() returned"); // avoid gcc warning
}
106
107 int
108 createOSThread (OSThreadId* pId, char *name STG_UNUSED,
109 OSThreadProc *startProc, void *param)
110 {
111 HANDLE h;
112 h = CreateThread ( NULL, /* default security attributes */
113 0,
114 (LPTHREAD_START_ROUTINE)startProc,
115 param,
116 0,
117 pId);
118
119 if (h == 0) {
120 return 1;
121 } else {
122 // This handle leaks if we don't close it here. Perhaps we
123 // should try to keep it around to avoid needing OpenThread()
124 // later.
125 CloseHandle(h);
126 return 0;
127 }
128 }
129
130 OSThreadId
131 osThreadId()
132 {
133 return GetCurrentThreadId();
134 }
135
136 bool
137 osThreadIsAlive(OSThreadId id)
138 {
139 DWORD exit_code;
140 HANDLE hdl;
141 if (!(hdl = OpenThread(THREAD_QUERY_INFORMATION,FALSE,id))) {
142 sysErrorBelch("osThreadIsAlive: OpenThread");
143 stg_exit(EXIT_FAILURE);
144 }
145 if (!GetExitCodeThread(hdl, &exit_code)) {
146 sysErrorBelch("osThreadIsAlive: GetExitCodeThread");
147 stg_exit(EXIT_FAILURE);
148 }
149 CloseHandle(hdl);
150 return (exit_code == STILL_ACTIVE);
151 }
152
153 #ifdef USE_CRITICAL_SECTIONS
154 void
155 initMutex (Mutex* pMut)
156 {
157 InitializeCriticalSectionAndSpinCount(pMut,4000);
158 }
159 void
160 closeMutex (Mutex* pMut)
161 {
162 DeleteCriticalSection(pMut);
163 }
164 #else
165 void
166 initMutex (Mutex* pMut)
167 {
168 HANDLE h = CreateMutex ( NULL, /* default sec. attributes */
169 TRUE, /* not owned => initially signalled */
170 NULL
171 );
172 *pMut = h;
173 return;
174 }
175 void
176 closeMutex (Mutex* pMut)
177 {
178 CloseHandle(*pMut);
179 }
180 #endif
181
182 void
183 newThreadLocalKey (ThreadLocalKey *key)
184 {
185 DWORD r;
186 r = TlsAlloc();
187 if (r == TLS_OUT_OF_INDEXES) {
188 barf("newThreadLocalKey: out of keys");
189 }
190 *key = r;
191 }
192
// Read the calling thread's value for the TLS slot *key.  NULL is a
// legitimate stored value, so NULL alone does not indicate failure.
void *
getThreadLocalVar (ThreadLocalKey *key)
{
    void *r;
    r = TlsGetValue(*key);
#ifdef DEBUG
    // r is allowed to be NULL - it can mean that either there was an
    // error or the stored value is in fact NULL.
    // NOTE(review): this check relies on TlsGetValue setting the last
    // error to ERROR_SUCCESS on success (as MSDN documents), so a
    // stale error code from an earlier API call should not trip it --
    // confirm against the TlsGetValue contract.
    if (GetLastError() != NO_ERROR) {
        sysErrorBelch("getThreadLocalVar");
        stg_exit(EXIT_FAILURE);
    }
#endif
    return r;
}
208
209 void
210 setThreadLocalVar (ThreadLocalKey *key, void *value)
211 {
212 BOOL b;
213 b = TlsSetValue(*key, value);
214 if (!b) {
215 sysErrorBelch("setThreadLocalVar");
216 stg_exit(EXIT_FAILURE);
217 }
218 }
219
220 void
221 freeThreadLocalKey (ThreadLocalKey *key)
222 {
223 BOOL r;
224 r = TlsFree(*key);
225 if (r == 0) {
226 DWORD dw = GetLastError();
227 barf("freeThreadLocalKey failed: %lu", dw);
228 }
229 }
230
231
// Thread entry point used by forkOS_createThread: acquire a
// capability, run the Haskell IO action behind the stable pointer,
// then release the capability before the thread exits.
static unsigned __stdcall
forkOS_createThreadWrapper ( void * entry )
{
    Capability *cap;
    cap = rts_lock();
    rts_evalStableIO(&cap, (HsStablePtr) entry, NULL);
    rts_unlock(cap);
    return 0;
}
241
242 int
243 forkOS_createThread ( HsStablePtr entry )
244 {
245 unsigned long pId;
246 return (_beginthreadex ( NULL, /* default security attributes */
247 0,
248 forkOS_createThreadWrapper,
249 (void*)entry,
250 0,
251 (unsigned*)&pId) == 0);
252 }
253
254 #if x86_64_HOST_ARCH
255 /* We still support Windows Vista, so we can't depend on it
256 and must manually resolve these. */
257 typedef DWORD(WINAPI *GetItemCountProc)(WORD);
258 typedef DWORD(WINAPI *GetGroupCountProc)(void);
259 typedef BOOL(WINAPI *SetThreadGroupAffinityProc)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
260 #ifndef ALL_PROCESSOR_GROUPS
261 #define ALL_PROCESSOR_GROUPS 0xffff
262 #endif
263 #endif
264
265 void freeThreadingResources (void)
266 {
267 if (cpuGroupCache)
268 {
269 free(cpuGroupCache);
270 cpuGroupCache = NULL;
271 }
272
273 if (cpuGroupCumulativeCache)
274 {
275 free(cpuGroupCumulativeCache);
276 cpuGroupCumulativeCache = NULL;
277 }
278
279 if (cpuGroupDistCache)
280 {
281 free(cpuGroupDistCache);
282 cpuGroupDistCache = NULL;
283 }
284 }
285
286 /* Processor groups are not guaranteed to be uniformly distributed
287 nor guaranteed to be filled before a next group is needed.
288 The OS will assign processors to groups based on physical proximity
289 and will never partially assign cores from one physical cpu to more
290 than one group. If one has two 48 core CPUs then you'd end up with
291 two groups of 48 logical cpus. Now add a 3rd CPU with 10 cores and
292 the group it is assigned to depends where the socket is on the board.
293
294 So we need to make a map of where the CPUs reside and how the groups are filled.
295 Since groups are created at boot time by the kernel, we can cache this information.
296
297 NOTE: This code does not support hot-swapping cores as it's caching the information.
298 If you activate a new core you have to restart the program. This builds a
299 simple lookup array for cpu -> group indexes. This gives O(1) lookup against
300 O(n) space. But n is < 256 so we'll only use 256 bytes of extra memory. */
301
302 static uint8_t
303 getNumberOfProcessorsGroups (void)
304 {
305 /* Group count cache. */
306 static uint8_t n_groups = 0;
307
308
309 #if x86_64_HOST_ARCH
310 if (!n_groups)
311 {
312 /* We still support Windows Vista. Which means we can't rely
313 on the API being available. So we'll have to resolve manually. */
314 HMODULE kernel = GetModuleHandleW(L"kernel32");
315
316 GetGroupCountProc GetActiveProcessorGroupCount = (GetGroupCountProc)GetProcAddress(kernel, "GetActiveProcessorGroupCount");
317 n_groups = GetActiveProcessorGroupCount();
318
319 IF_DEBUG(scheduler, debugBelch("[*] Number of processor groups detected: %u\n", n_groups));
320 }
321 #endif
322
323 if (!n_groups)
324 {
325 n_groups = 1;
326 }
327
328 return n_groups;
329 }
330
331 static uint8_t*
332 getProcessorsDistribution (void)
333 {
334 if (cpuGroupDistCache)
335 {
336 return cpuGroupDistCache;
337 }
338
339 if (!cpuGroupDistCache)
340 {
341 uint8_t n_groups = getNumberOfProcessorsGroups();
342 cpuGroupDistCache = malloc(n_groups * sizeof(uint8_t));
343 memset(cpuGroupDistCache, MAXIMUM_PROCESSORS, n_groups * sizeof(uint8_t));
344
345 #if x86_64_HOST_ARCH
346 /* We still support Windows Vista. Which means we can't rely
347 on the API being available. So we'll have to resolve manually. */
348 HMODULE kernel = GetModuleHandleW(L"kernel32");
349
350 GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
351
352 if (GetActiveProcessorCount)
353 {
354 for (int i = 0; i < n_groups; i++)
355 {
356 cpuGroupDistCache[i] = GetActiveProcessorCount(i);
357 IF_DEBUG(scheduler, debugBelch("[*] Number of active processors in group %u detected: %u\n", i, cpuGroupDistCache[i]));
358 }
359 }
360 #endif
361 }
362
363 return cpuGroupDistCache;
364 }
365
366 static uint32_t*
367 getProcessorsCumulativeSum(void)
368 {
369 if (cpuGroupCumulativeCache)
370 {
371 return cpuGroupCumulativeCache;
372 }
373
374 if (!cpuGroupCumulativeCache)
375 {
376 uint8_t n_groups = getNumberOfProcessorsGroups();
377 cpuGroupCumulativeCache = malloc(n_groups * sizeof(uint32_t));
378 memset(cpuGroupCumulativeCache, 0, n_groups * sizeof(uint32_t));
379 uint8_t* proc_dist = getProcessorsDistribution();
380 uint32_t cum_num_proc = 0;
381
382 #if x86_64_HOST_ARCH
383 for (int i = 0; i < n_groups; i++)
384 {
385 cpuGroupCumulativeCache[i] = cum_num_proc;
386 cum_num_proc += proc_dist[i];
387 IF_DEBUG(scheduler, debugBelch("[*] Cumulative active processors for group %u: %u\n", i, cpuGroupCumulativeCache[i]));
388 }
389 #endif
390 }
391
392 return cpuGroupCumulativeCache;
393 }
394
395 /*
396 Because processors can be distributed rather unpredictably inside
397 processor groups, we need to keep track of which processors are in
398 which group to be able to determine which mask to set and which bit
399 in the mask to set.
400
401 This can either be done by the typical trade-off: speed or
402 memory usage. In this case I prioritize speed.
403
404 This function will generate an array where each index is a processor
405 and the value of the array the group it belongs to. This allows us to
406 in constant time determine where a processor is.
407 */
408 static uint8_t*
409 createProcessorGroupMap (void)
410 {
411 if (cpuGroupCache)
412 {
413 return cpuGroupCache;
414 }
415
416 uint32_t numProcs = getNumberOfProcessors();
417
418 cpuGroupCache = malloc(numProcs * sizeof(uint8_t));
419 /* For 32bit Windows and 64bit older than Windows 7, create a default mapping. */
420 memset(cpuGroupCache, 0, numProcs * sizeof(uint8_t));
421
422 #if x86_64_HOST_ARCH
423 uint8_t* proc_dist = getProcessorsDistribution();
424
425 int totalProcs = 0;
426 uint8_t nGroups = getNumberOfProcessorsGroups();
427 int group;
428 for (group = 0; group < nGroups; group++)
429 {
430 uint8_t nProc = proc_dist[group];
431 memset(cpuGroupCache + totalProcs, group, nProc * sizeof(uint8_t));
432 totalProcs += nProc;
433 }
434
435 IF_DEBUG(scheduler, debugBelch("[*] Processor group map created\n"));
436 #endif
437
438 return cpuGroupCache;
439 }
440
// Total number of active logical processors.  On x86_64 with the
// processor-group API available, this counts across all groups;
// otherwise it falls back to GetSystemInfo, which only reports the
// current group (at most 64 processors).  The result is cached in a
// function-local static, so only the first call pays the lookup cost.
uint32_t
getNumberOfProcessors (void)
{
    static uint32_t nproc = 0;

#if x86_64_HOST_ARCH
    /* We still support Windows Vista. Which means we can't rely
       on the API being available. So we'll have to resolve manually. */
    HMODULE kernel = GetModuleHandleW(L"kernel32");

    GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
    if (GetActiveProcessorCount && !nproc)
    {
        nproc = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);

        if (nproc)
        {
            IF_DEBUG(scheduler, debugBelch("[*] Total number of active processors detected: %u\n", nproc));
            return nproc;
        }

        IF_DEBUG(scheduler, debugBelch("Could not determine Max number of logical processors.\n"
                                       "Falling back to old code which limits to 64 logical processors.\n"));
    }
#endif

    /* This will return the maximum number of processes
       within one processor group. It's also slower
       so use it only when needed. */
    if (nproc == 0) {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        nproc = si.dwNumberOfProcessors;
    }

    return nproc;
}
478
479 void
480 setThreadAffinity (uint32_t n, uint32_t m) // cap N of M
481 {
482 ASSERT(n <= m);
483
484 HANDLE hThread;
485 DWORD_PTR *mask, r; // 64-bit win is required to handle more than 32 procs
486 // and Windows 7+ required for more than 64 procs
487 uint32_t n_proc, i, ix;
488 uint8_t* proc_map = createProcessorGroupMap();
489 uint32_t n_groups = getNumberOfProcessorsGroups();
490 uint32_t* proc_cum = getProcessorsCumulativeSum();
491 n_proc = getNumberOfProcessors();
492 hThread = GetCurrentThread();
493
494 ASSERT(proc_map );
495 ASSERT(proc_cum );
496 ASSERT(hThread );
497 ASSERT(n_groups > 0);
498 ASSERT(n_proc > 0);
499
500 mask = malloc(n_groups * sizeof(DWORD_PTR));
501 memset(mask, 0, n_groups * sizeof(DWORD_PTR));
502
503 /* The mask for the individual groups are all 0 based
504 so we need different masks for every group. */
505 int group;
506 for (i = n; i < n_proc; i += m)
507 {
508 group = proc_map[i];
509 ix = i - proc_cum[group];
510 mask[group] |= 1 << ix;
511 }
512
513 #if x86_64_HOST_ARCH
514 /* We still support Windows Vista. Which means we can't rely
515 on the API being available. So we'll have to resolve manually. */
516 HMODULE kernel = GetModuleHandleW(L"kernel32");
517
518 SetThreadGroupAffinityProc SetThreadGroupAffinity = (SetThreadGroupAffinityProc)GetProcAddress(kernel, "SetThreadGroupAffinity");
519 #endif
520
521 for (i = 0; i < n_groups; i++)
522 {
523 #if x86_64_HOST_ARCH
524 // If we support the new API, use it.
525 if (mask[i] > 0 && SetThreadGroupAffinity)
526 {
527 GROUP_AFFINITY hGroup;
528 ZeroMemory(&hGroup, sizeof(hGroup));
529 hGroup.Mask = mask[i];
530 hGroup.Group = i;
531
532 if (!SetThreadGroupAffinity(hThread, &hGroup, NULL))
533 {
534 sysErrorBelch("SetThreadGroupAffinity");
535 }
536
537 continue;
538 }
539 #endif
540 // Fall-back methods. Only do it if there's a mask to set
541 if (mask[i] > 0)
542 {
543 r = SetThreadAffinityMask(hThread, mask[i]);
544 if (r == 0) {
545 free(mask);
546 sysErrorBelch("SetThreadAffinity");
547 stg_exit(EXIT_FAILURE);
548 }
549 }
550 }
551
552 free(mask);
553 }
554
555 typedef BOOL (WINAPI *PCSIO)(HANDLE);
556
557 void
558 interruptOSThread (OSThreadId id)
559 {
560 HANDLE hdl;
561 PCSIO pCSIO;
562 if (!(hdl = OpenThread(THREAD_TERMINATE,FALSE,id))) {
563 sysErrorBelch("interruptOSThread: OpenThread");
564 stg_exit(EXIT_FAILURE);
565 }
566 pCSIO = (PCSIO) GetProcAddress(GetModuleHandle(TEXT("Kernel32.dll")),
567 "CancelSynchronousIo");
568 if ( NULL != pCSIO ) {
569 pCSIO(hdl);
570 } else {
571 // Nothing to do, unfortunately
572 }
573 CloseHandle(hdl);
574 }
575
576 void setThreadNode (uint32_t node)
577 {
578 if (osNumaAvailable())
579 {
580 StgWord mask = 0;
581 mask |= 1 << node;
582 if (!SetThreadAffinityMask(GetCurrentThread(), mask))
583 {
584 sysErrorBelch(
585 "setThreadNode: Error setting affinity of thread to NUMA node `%u': %lu.",
586 node, GetLastError());
587 stg_exit(EXIT_FAILURE);
588 }
589 }
590 }
591
592 void releaseThreadNode (void)
593 {
594 if (osNumaAvailable())
595 {
596 StgWord processMask;
597 StgWord systemMask;
598 if (!GetProcessAffinityMask(GetCurrentProcess(),
599 &processMask,
600 &systemMask))
601 {
602 sysErrorBelch(
603 "releaseThreadNode: Error resetting affinity of thread: %lu",
604 GetLastError());
605 stg_exit(EXIT_FAILURE);
606 }
607
608 if (!SetThreadAffinityMask(GetCurrentThread(), processMask))
609 {
610 sysErrorBelch(
611 "releaseThreadNode: Error reseting NUMA affinity mask of thread: %lu.",
612 GetLastError());
613 stg_exit(EXIT_FAILURE);
614 }
615
616 }
617 }
618
619 #else /* !defined(THREADED_RTS) */
620
// Non-threaded RTS: forkOS cannot create OS threads; always fails
// with -1.
int
forkOS_createThread ( HsStablePtr entry STG_UNUSED )
{
    return -1;
}
626
// Non-threaded RTS: no processor-group caches are allocated, so
// there is nothing to free.
void freeThreadingResources (void) { /* nothing */ }
628
/* Non-threaded RTS: behave as if the machine had a single processor. */
uint32_t getNumberOfProcessors (void)
{
    return 1;
}
633
634 #endif /* !defined(THREADED_RTS) */
635
636 KernelThreadId kernelThreadId (void)
637 {
638 DWORD tid = GetCurrentThreadId();
639 return tid;
640 }