rts: Ensure that forkOS releases Task on termination
[ghc.git] / rts / win32 / OSThreads.c
1 /* ---------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 2001-2005
4 *
5 * Accessing OS threads functionality in a (mostly) OS-independent
6 * manner.
7 *
8 * --------------------------------------------------------------------------*/
9
10 #include "Rts.h"
11 #include <windows.h>
12 #include "sm/OSMem.h"
13 #if defined(THREADED_RTS)
14 #include "RtsUtils.h"
15
16 /* For reasons not yet clear, the entire contents of process.h is protected
17 * by __STRICT_ANSI__ not being defined.
18 */
19 #undef __STRICT_ANSI__
20 #include <process.h>
21
22
23 /* Processor group info cache. */
24 static uint8_t* cpuGroupCache = NULL;
25 /* Processor group cumulative summary cache. */
26 static uint32_t* cpuGroupCumulativeCache = NULL;
27 /* Processor group dist cache. */
28 static uint8_t* cpuGroupDistCache = NULL;
29
30 /* Win32 threads and synchronisation objects */
31
32 /* A Condition is represented by a Win32 Event object;
33 * a Mutex by a Mutex kernel object.
34 *
35 * ToDo: go through the defn and usage of these to
36 * make sure the semantics match up with that of
37 * the (assumed) pthreads behaviour. This is really
38 * just a first pass at getting something compilable.
39 */
40
/* Create the auto-reset Win32 Event object backing a Condition.
 * Aborts the RTS if the kernel object cannot be created. */
void
initCondition ( Condition* pCond )
{
    HANDLE ev = CreateEvent(NULL,   /* default security attributes */
                            FALSE,  /* auto reset */
                            FALSE,  /* initially not signalled */
                            NULL);  /* unnamed => process-local. */
    if (ev == NULL) {
        sysErrorBelch("initCondition: unable to create");
        stg_exit(EXIT_FAILURE);
    }
    *pCond = ev;
}
56
/* Release the Event handle behind a Condition; failure is reported
 * but is non-fatal. */
void
closeCondition ( Condition* pCond )
{
    if (!CloseHandle(*pCond)) {
        sysErrorBelch("closeCondition: failed to close");
    }
}
65
/* Wake all threads currently blocked on the condition.
 *
 * NOTE(review): PulseEvent only releases threads that are waiting at
 * the instant of the call, and Microsoft documents it as unreliable
 * (a waiter momentarily unblocked by a kernel APC can miss the
 * pulse).  Left as-is to preserve existing behaviour; confirm callers
 * tolerate lost broadcasts. */
bool
broadcastCondition ( Condition* pCond )
{
    PulseEvent(*pCond);
    return true;
}
72
/* Wake a waiter by signalling the auto-reset event; the event stays
 * signalled until one waiter is released.  Fatal if SetEvent fails. */
bool
signalCondition ( Condition* pCond )
{
    if (!SetEvent(*pCond)) {
        sysErrorBelch("SetEvent");
        stg_exit(EXIT_FAILURE);
    }
    return true;
}
82
/* Block on pCond, releasing pMut while waiting and re-acquiring it
 * before returning.
 *
 * NOTE(review): the unlock and the wait are two separate steps, not
 * atomic as in pthread_cond_wait.  A signal landing between
 * RELEASE_LOCK and WaitForSingleObject is still caught because
 * SetEvent leaves the auto-reset event latched for one waiter, but a
 * PulseEvent broadcast in that window can be lost — see the ToDo at
 * the top of this file. */
bool
waitCondition ( Condition* pCond, Mutex* pMut )
{
    RELEASE_LOCK(pMut);
    WaitForSingleObject(*pCond, INFINITE);
    /* Hmm..use WaitForMultipleObjects() ? */
    ACQUIRE_LOCK(pMut);
    return true;
}
92
/* Give up the remainder of this thread's time slice to another
 * ready thread, if any. */
void
yieldThread(void)
{
    SwitchToThread();
}
99
/* Terminate the calling OS thread; never returns. */
void
shutdownThread(void)
{
    ExitThread(0);
    barf("ExitThread() returned"); // avoid gcc warning
}
106
/* Spawn a new OS thread running startProc(param), writing its id to
 * *pId.  The name argument is unused on Windows.
 * Returns 0 on success, 1 on failure. */
int
createOSThread (OSThreadId* pId, char *name STG_UNUSED,
                OSThreadProc *startProc, void *param)
{
    HANDLE h = CreateThread(NULL,   /* default security attributes */
                            0,      /* default stack size */
                            (LPTHREAD_START_ROUTINE)startProc,
                            param,
                            0,      /* run immediately */
                            pId);
    if (h == NULL) {
        return 1;
    }
    // This handle leaks if we don't close it here. Perhaps we
    // should try to keep it around to avoid needing OpenThread()
    // later.
    CloseHandle(h);
    return 0;
}
129
/* Identifier of the calling OS thread. */
OSThreadId
osThreadId(void)
{
    return GetCurrentThreadId();
}
135
/* Return true iff the thread identified by id has not terminated.
 * Aborts the RTS if the thread cannot be opened or queried. */
bool
osThreadIsAlive (OSThreadId id)
{
    HANDLE h = OpenThread(THREAD_QUERY_INFORMATION, FALSE, id);
    if (h == NULL) {
        sysErrorBelch("osThreadIsAlive: OpenThread");
        stg_exit(EXIT_FAILURE);
    }
    DWORD exit_code;
    if (!GetExitCodeThread(h, &exit_code)) {
        sysErrorBelch("osThreadIsAlive: GetExitCodeThread");
        stg_exit(EXIT_FAILURE);
    }
    CloseHandle(h);
    return exit_code == STILL_ACTIVE;
}
152
#if defined(USE_CRITICAL_SECTIONS)
/* Mutexes backed by CRITICAL_SECTIONs: uncontended acquire/release
 * stays in user space, with a short spin before sleeping. */
void
initMutex (Mutex* pMut)
{
    InitializeCriticalSectionAndSpinCount(pMut,4000);
}
void
closeMutex (Mutex* pMut)
{
    DeleteCriticalSection(pMut);
}
#else
/* Mutexes backed by Win32 Mutex kernel objects. */
void
initMutex (Mutex* pMut)
{
    /* BUG FIX: bInitialOwner must be FALSE so the mutex starts
     * unowned (signalled) and the first ACQUIRE_LOCK — from any
     * thread — succeeds.  The previous code passed TRUE, which made
     * the mutex initially owned by the creating thread, contradicting
     * its own "not owned => initially signalled" comment and the
     * pthread_mutex_init semantics this layer emulates. */
    HANDLE h = CreateMutex ( NULL,  /* default sec. attributes */
                             FALSE, /* not owned => initially signalled */
                             NULL
                           );
    if (h == NULL) {
        sysErrorBelch("initMutex: unable to create");
        stg_exit(EXIT_FAILURE);
    }
    *pMut = h;
}
void
closeMutex (Mutex* pMut)
{
    CloseHandle(*pMut);
}
#endif
181
/* Allocate a fresh thread-local-storage slot; barfs when the process
 * has exhausted its TLS indexes. */
void
newThreadLocalKey (ThreadLocalKey *key)
{
    DWORD slot = TlsAlloc();
    if (slot == TLS_OUT_OF_INDEXES) {
        barf("newThreadLocalKey: out of keys");
    }
    *key = slot;
}
192
/* Read the calling thread's value for the given TLS key.
 *
 * A NULL result is ambiguous: it may be the stored value or an error
 * indicator.  TlsGetValue sets the last-error code to ERROR_SUCCESS
 * when it succeeds, so in DEBUG builds GetLastError() reliably
 * disambiguates the two cases. */
void *
getThreadLocalVar (ThreadLocalKey *key)
{
    void *r;
    r = TlsGetValue(*key);
#if defined(DEBUG)
    // r is allowed to be NULL - it can mean that either there was an
    // error or the stored value is in fact NULL.
    if (GetLastError() != NO_ERROR) {
        sysErrorBelch("getThreadLocalVar");
        stg_exit(EXIT_FAILURE);
    }
#endif
    return r;
}
208
/* Store value under key for the calling thread; fatal on failure. */
void
setThreadLocalVar (ThreadLocalKey *key, void *value)
{
    if (!TlsSetValue(*key, value)) {
        sysErrorBelch("setThreadLocalVar");
        stg_exit(EXIT_FAILURE);
    }
}
219
/* Return a TLS slot to the OS; barfs with the Win32 error code when
 * the index cannot be freed. */
void
freeThreadLocalKey (ThreadLocalKey *key)
{
    if (TlsFree(*key) == 0) {
        DWORD err = GetLastError();
        barf("freeThreadLocalKey failed: %lu", err);
    }
}
230
231
/* Thread entry point for forkOS threads: evaluate the Haskell IO
 * computation behind `entry` on a freshly acquired capability, then
 * release the capability and call rts_done() so the RTS can release
 * the per-thread resources (the Task) on termination — presumably the
 * point of this file's "forkOS releases Task" fix; confirm against
 * rts_done's definition. */
static unsigned __stdcall
forkOS_createThreadWrapper ( void * entry )
{
    Capability *cap;
    cap = rts_lock();
    rts_evalStableIO(&cap, (HsStablePtr) entry, NULL);
    rts_unlock(cap);
    rts_done();
    return 0;
}
242
/* Start a bound (forkOS) thread running the Haskell computation
 * referenced by `entry`.  Returns 0 on success, non-zero on failure.
 *
 * BUG FIX: _beginthreadex returns a thread HANDLE owned by the
 * caller; the previous code discarded it, leaking one kernel object
 * per forkOS call.  We never wait on or query the thread, so close
 * the handle immediately — exactly as createOSThread does above. */
int
forkOS_createThread ( HsStablePtr entry )
{
    unsigned long pId;
    uintptr_t h = _beginthreadex ( NULL,  /* default security attributes */
                                   0,     /* default stack size */
                                   forkOS_createThreadWrapper,
                                   (void*)entry,
                                   0,     /* run immediately */
                                   (unsigned*)&pId);
    if (h == 0) {
        return 1;  /* creation failed */
    }
    CloseHandle((HANDLE)h);
    return 0;
}
254
255 #if defined(x86_64_HOST_ARCH)
256 /* We still support Windows Vista, so we can't depend on it
257 and must manually resolve these. */
258 typedef DWORD(WINAPI *GetItemCountProc)(WORD);
259 typedef DWORD(WINAPI *GetGroupCountProc)(void);
260 typedef BOOL(WINAPI *SetThreadGroupAffinityProc)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
261 #if !defined(ALL_PROCESSOR_GROUPS)
262 #define ALL_PROCESSOR_GROUPS 0xffff
263 #endif
264 #endif
265
266 void freeThreadingResources (void)
267 {
268 if (cpuGroupCache)
269 {
270 free(cpuGroupCache);
271 cpuGroupCache = NULL;
272 }
273
274 if (cpuGroupCumulativeCache)
275 {
276 free(cpuGroupCumulativeCache);
277 cpuGroupCumulativeCache = NULL;
278 }
279
280 if (cpuGroupDistCache)
281 {
282 free(cpuGroupDistCache);
283 cpuGroupDistCache = NULL;
284 }
285 }
286
287 /* Processor groups are not guaranteed to be uniformly distributed
288 nor guaranteed to be filled before a next group is needed.
289 The OS will assign processors to groups based on physical proximity
290 and will never partially assign cores from one physical cpu to more
291 than one group. If one has two 48 core CPUs then you'd end up with
292 two groups of 48 logical cpus. Now add a 3rd CPU with 10 cores and
293 the group it is assigned to depends where the socket is on the board.
294
295 So we need to make a map of where the CPUs reside and how the groups are filled.
296 Since groups are created at boot time by the kernel, we can cache this information.
297
298 NOTE: This code does not support hot-swapping cores as it's caching the information.
299 If you activate a new core you have to restart the program. This builds a
300 simple lookup array for cpu -> group indexes. This gives O(1) lookup against
301 O(n) space. But n is < 256 so we'll only use 256 bytes of extra memory. */
302
/* Number of Windows processor groups, cached after the first call
 * (groups are fixed at boot, so the cache never needs invalidating).
 * Falls back to 1 when the group API is unavailable. */
static uint8_t
getNumberOfProcessorsGroups (void)
{
    /* Group count cache. */
    static uint8_t n_groups = 0;

#if defined(x86_64_HOST_ARCH)
    if (!n_groups)
    {
        /* We still support Windows Vista. Which means we can't rely
           on the API being available. So we'll have to resolve manually. */
        HMODULE kernel = GetModuleHandleW(L"kernel32");

        GetGroupCountProc GetActiveProcessorGroupCount =
            (GetGroupCountProc)GetProcAddress(kernel, "GetActiveProcessorGroupCount");
        /* BUG FIX: GetActiveProcessorGroupCount only exists on
           Windows 7+; on Vista GetProcAddress returns NULL and the
           previous unconditional call would jump through a NULL
           pointer.  Fall through to the n_groups = 1 default. */
        if (GetActiveProcessorGroupCount)
        {
            n_groups = GetActiveProcessorGroupCount();
            IF_DEBUG(scheduler, debugBelch("[*] Number of processor groups detected: %u\n", n_groups));
        }
    }
#endif

    if (!n_groups)
    {
        n_groups = 1;
    }

    return n_groups;
}
331
#if defined(x86_64_HOST_ARCH)
/* Lazily build (and cache) the count of active logical processors in
 * each processor group.  Entries default to MAXIMUM_PROCESSORS when
 * the per-group count cannot be queried (pre-Windows 7). */
static uint8_t*
getProcessorsDistribution (void)
{
    if (cpuGroupDistCache)
    {
        return cpuGroupDistCache;
    }

    uint8_t n_groups = getNumberOfProcessorsGroups();
    cpuGroupDistCache = malloc(n_groups * sizeof(uint8_t));
    /* BUG FIX: the allocation was previously used unchecked. */
    if (!cpuGroupDistCache)
    {
        barf("getProcessorsDistribution: out of memory");
    }
    memset(cpuGroupDistCache, MAXIMUM_PROCESSORS, n_groups * sizeof(uint8_t));

    /* We still support Windows Vista. Which means we can't rely
       on the API being available. So we'll have to resolve manually. */
    HMODULE kernel = GetModuleHandleW(L"kernel32");

    GetItemCountProc GetActiveProcessorCount =
        (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");

    if (GetActiveProcessorCount)
    {
        for (int i = 0; i < n_groups; i++)
        {
            cpuGroupDistCache[i] = GetActiveProcessorCount(i);
            IF_DEBUG(scheduler, debugBelch("[*] Number of active processors in group %u detected: %u\n", i, cpuGroupDistCache[i]));
        }
    }

    return cpuGroupDistCache;
}
#endif
366
/* Lazily build (and cache) the exclusive prefix sum of processors per
 * group: cpuGroupCumulativeCache[g] is the global index of group g's
 * first processor.  On non-x86_64 the array stays all-zero. */
static uint32_t*
getProcessorsCumulativeSum(void)
{
    if (cpuGroupCumulativeCache)
    {
        return cpuGroupCumulativeCache;
    }

    uint8_t n_groups = getNumberOfProcessorsGroups();
    cpuGroupCumulativeCache = malloc(n_groups * sizeof(uint32_t));
    /* BUG FIX: the allocation was previously used unchecked. */
    if (!cpuGroupCumulativeCache)
    {
        barf("getProcessorsCumulativeSum: out of memory");
    }
    memset(cpuGroupCumulativeCache, 0, n_groups * sizeof(uint32_t));

#if defined(x86_64_HOST_ARCH)
    uint8_t* proc_dist = getProcessorsDistribution();
    uint32_t cum_num_proc = 0;
    for (int i = 0; i < n_groups; i++)
    {
        cpuGroupCumulativeCache[i] = cum_num_proc;
        cum_num_proc += proc_dist[i];
        IF_DEBUG(scheduler, debugBelch("[*] Cumulative active processors for group %u: %u\n", i, cpuGroupCumulativeCache[i]));
    }
#endif

    return cpuGroupCumulativeCache;
}
395
396 /*
397 Because processors can be distributed rather unpredictably inside
398 processor groups, we need to keep track of which processors are in
399 which group to be able to determine which mask to set and which bit
400 in the mask to set.
401
402 This can either be done by the typical trade-off: speed or
403 memory usage. In this case I prioritize speed.
404
405 This function will generate an array where each index is a processor
406 and the value of the array the group it belongs to. This allows us to
407 in constant time determine where a processor is.
408 */
/* Lazily build (and cache) the processor -> group lookup array
 * described in the comment above: index is a global processor number,
 * value is the group it belongs to.  O(1) lookup, <= 256 bytes. */
static uint8_t*
createProcessorGroupMap (void)
{
    if (cpuGroupCache)
    {
        return cpuGroupCache;
    }

    uint32_t numProcs = getNumberOfProcessors();

    cpuGroupCache = malloc(numProcs * sizeof(uint8_t));
    /* BUG FIX: the allocation was previously used unchecked. */
    if (!cpuGroupCache)
    {
        barf("createProcessorGroupMap: out of memory");
    }
    /* For 32bit Windows and 64bit older than Windows 7, create a
       default mapping: every processor in group 0. */
    memset(cpuGroupCache, 0, numProcs * sizeof(uint8_t));

#if defined(x86_64_HOST_ARCH)
    uint8_t* proc_dist = getProcessorsDistribution();

    int totalProcs = 0;
    uint8_t nGroups = getNumberOfProcessorsGroups();
    for (int group = 0; group < nGroups; group++)
    {
        uint8_t nProc = proc_dist[group];
        memset(cpuGroupCache + totalProcs, group, nProc * sizeof(uint8_t));
        totalProcs += nProc;
    }

    IF_DEBUG(scheduler, debugBelch("[*] Processor group map created\n"));
#endif

    return cpuGroupCache;
}
441
/* Total number of active logical processors, cached after the first
 * successful query.
 *
 * On x86_64 we first try the Windows 7+ GetActiveProcessorCount API
 * (resolved dynamically because Vista is still supported), which
 * counts across all processor groups.  Otherwise we fall back to
 * GetSystemInfo, which only reports processors in the current group
 * — i.e. at most 64. */
uint32_t
getNumberOfProcessors (void)
{
    static uint32_t nproc = 0;

#if defined(x86_64_HOST_ARCH)
    /* We still support Windows Vista. Which means we can't rely
       on the API being available. So we'll have to resolve manually. */
    HMODULE kernel = GetModuleHandleW(L"kernel32");

    GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
    if (GetActiveProcessorCount && !nproc)
    {
        nproc = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);

        if (nproc)
        {
            IF_DEBUG(scheduler, debugBelch("[*] Total number of active processors detected: %u\n", nproc));
            return nproc;
        }

        IF_DEBUG(scheduler, debugBelch("Could not determine Max number of logical processors.\n"
                                       "Falling back to old code which limits to 64 logical processors.\n"));
    }
#endif

    /* This will return the maximum number of processes
       within one processor group. It's also slower
       so use it only when needed. */
    if (nproc == 0) {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        nproc = si.dwNumberOfProcessors;
    }

    return nproc;
}
479
/* Pin the calling thread (capability n of m) to every m-th processor
 * starting at n, across all processor groups.
 *
 * One affinity mask is built per group; each non-empty mask is then
 * applied with SetThreadGroupAffinity (Windows 7+, resolved
 * dynamically) when available, falling back to SetThreadAffinityMask
 * (single group, <= 64 processors). */
void
setThreadAffinity (uint32_t n, uint32_t m) // cap N of M
{
    ASSERT(n <= m);

    HANDLE hThread;
    DWORD_PTR *mask, r; // 64-bit win is required to handle more than 32 procs
                        // and Windows 7+ required for more than 64 procs
    uint32_t n_proc, i, ix;
    uint8_t* proc_map  = createProcessorGroupMap();
    uint32_t n_groups  = getNumberOfProcessorsGroups();
    uint32_t* proc_cum = getProcessorsCumulativeSum();
    n_proc  = getNumberOfProcessors();
    hThread = GetCurrentThread();

    ASSERT(proc_map );
    ASSERT(proc_cum );
    ASSERT(hThread  );
    ASSERT(n_groups > 0);
    ASSERT(n_proc   > 0);

    mask = malloc(n_groups * sizeof(DWORD_PTR));
    /* BUG FIX: the allocation was previously used unchecked. */
    if (!mask) {
        barf("setThreadAffinity: out of memory");
    }
    memset(mask, 0, n_groups * sizeof(DWORD_PTR));

    /* The mask for the individual groups are all 0 based
       so we need different masks for every group. */
    int group;
    for (i = n; i < n_proc; i += m)
    {
        group = proc_map[i];
        ix = i - proc_cum[group];
        /* BUG FIX: shift a DWORD_PTR, not the int literal 1 —
           "1 << ix" is undefined behaviour for ix >= 32 and silently
           drops processors 32..63 of a group from the mask. */
        mask[group] |= (DWORD_PTR)1 << ix;
    }

#if defined(x86_64_HOST_ARCH)
    /* We still support Windows Vista. Which means we can't rely
       on the API being available. So we'll have to resolve manually. */
    HMODULE kernel = GetModuleHandleW(L"kernel32");

    SetThreadGroupAffinityProc SetThreadGroupAffinity = (SetThreadGroupAffinityProc)GetProcAddress(kernel, "SetThreadGroupAffinity");
#endif

    for (i = 0; i < n_groups; i++)
    {
#if defined(x86_64_HOST_ARCH)
        // If we support the new API, use it.
        if (mask[i] > 0 && SetThreadGroupAffinity)
        {
            GROUP_AFFINITY hGroup;
            ZeroMemory(&hGroup, sizeof(hGroup));
            hGroup.Mask  = mask[i];
            hGroup.Group = i;

            if (!SetThreadGroupAffinity(hThread, &hGroup, NULL))
            {
                sysErrorBelch("SetThreadGroupAffinity");
            }

            continue;
        }
#endif
        // Fall-back methods. Only do it if there's a mask to set
        if (mask[i] > 0)
        {
            r = SetThreadAffinityMask(hThread, mask[i]);
            if (r == 0) {
                free(mask);
                sysErrorBelch("SetThreadAffinity");
                stg_exit(EXIT_FAILURE);
            }
        }
    }

    free(mask);
}
555
556 typedef BOOL (WINAPI *PCSIO)(HANDLE);
557
/* Interrupt any blocking synchronous I/O the target thread is
 * performing, via CancelSynchronousIo.
 *
 * The function is resolved dynamically (it only exists on Vista and
 * later); when absent this is a no-op.  THREAD_TERMINATE is the
 * access right requested on the thread handle — presumably the right
 * CancelSynchronousIo requires; verify against its documentation. */
void
interruptOSThread (OSThreadId id)
{
    HANDLE hdl;
    PCSIO pCSIO;
    if (!(hdl = OpenThread(THREAD_TERMINATE,FALSE,id))) {
        sysErrorBelch("interruptOSThread: OpenThread");
        stg_exit(EXIT_FAILURE);
    }
    pCSIO = (PCSIO) GetProcAddress(GetModuleHandle(TEXT("Kernel32.dll")),
                                   "CancelSynchronousIo");
    if ( NULL != pCSIO ) {
        pCSIO(hdl);
    } else {
        // Nothing to do, unfortunately
    }
    CloseHandle(hdl);
}
576
/* Restrict the calling thread's affinity to the single bit `node`
 * (no-op when NUMA support is unavailable).  Fatal on failure. */
void setThreadNode (uint32_t node)
{
    if (osNumaAvailable())
    {
        StgWord mask = 0;
        /* BUG FIX: widen before shifting — "1 << node" shifts an int,
           which is undefined behaviour for node >= 31 and truncates
           the mask on 64-bit StgWord. */
        mask |= (StgWord)1 << node;
        if (!SetThreadAffinityMask(GetCurrentThread(), mask))
        {
            sysErrorBelch(
                "setThreadNode: Error setting affinity of thread to NUMA node `%u': %lu.",
                node, GetLastError());
            stg_exit(EXIT_FAILURE);
        }
    }
}
592
/* Restore the calling thread's affinity to the full process affinity
 * mask, undoing a previous setThreadNode.  Fatal on failure. */
void releaseThreadNode (void)
{
    if (osNumaAvailable())
    {
        /* BUG FIX: the previous code passed NULL output pointers to
           GetProcessAffinityMask — which requires valid pointers —
           and then dereferenced the NULL processMask.  Use stack
           variables and pass their addresses. */
        DWORD_PTR processMask;
        DWORD_PTR systemMask;
        if (!GetProcessAffinityMask(GetCurrentProcess(),
                                    &processMask,
                                    &systemMask))
        {
            sysErrorBelch(
                "releaseThreadNode: Error resetting affinity of thread: %lu",
                GetLastError());
            stg_exit(EXIT_FAILURE);
        }

        if (!SetThreadAffinityMask(GetCurrentThread(), processMask))
        {
            sysErrorBelch(
                "releaseThreadNode: Error reseting NUMA affinity mask of thread: %lu.",
                GetLastError());
            stg_exit(EXIT_FAILURE);
        }

    }
}
619
620 #else /* !defined(THREADED_RTS) */
621
622 int
623 forkOS_createThread ( HsStablePtr entry STG_UNUSED )
624 {
625 return -1;
626 }
627
/* Non-threaded RTS: no processor-group caches exist to free. */
void freeThreadingResources (void) { /* nothing */ }
629
/* Non-threaded RTS: report a single processor. */
uint32_t getNumberOfProcessors (void)
{
    return 1;
}
634
635 #endif /* !defined(THREADED_RTS) */
636
/* Kernel-level identifier of the calling thread (built for both the
 * threaded and non-threaded RTS). */
KernelThreadId kernelThreadId (void)
{
    return GetCurrentThreadId();
}