FIX #2164: check for ThreadRelocated in isAlive()
[ghc.git] / rts / Adjustor.c
1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
3 *
4 * Copyright (c) 1998.
5 *
6 * ---------------------------------------------------------------------------*/
7
8 /* A little bit of background...
9
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
12
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
21
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
28
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
32
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so will result in memory leaks on both the C and
36 Haskell side.
37 */
38
39 #include "PosixSource.h"
40 #include "Rts.h"
41 #include "RtsExternal.h"
42 #include "RtsUtils.h"
43 #include <stdlib.h>
44
45 #if defined(USE_LIBFFI_FOR_ADJUSTORS)
46
47 #include "ffi.h"
48 #include <string.h>
49
50 void
51 freeHaskellFunctionPtr(void* ptr)
52 {
53 ffi_closure *cl;
54
55 cl = (ffi_closure*)ptr;
56 freeStablePtr(cl->user_data);
57 stgFree(cl->cif->arg_types);
58 stgFree(cl->cif);
59 freeExec(cl);
60 }
61
62 static ffi_type * char_to_ffi_type(char c)
63 {
64 switch (c) {
65 case 'v': return &ffi_type_void;
66 case 'f': return &ffi_type_float;
67 case 'd': return &ffi_type_double;
68 case 'L': return &ffi_type_sint64;
69 case 'l': return &ffi_type_uint64;
70 case 'W': return &ffi_type_sint32;
71 case 'w': return &ffi_type_uint32;
72 case 'S': return &ffi_type_sint16;
73 case 's': return &ffi_type_uint16;
74 case 'B': return &ffi_type_sint8;
75 case 'b': return &ffi_type_uint8;
76 case 'p': return &ffi_type_pointer;
77 default: barf("char_to_ffi_type: unknown type '%c'", c);
78 }
79 }
80
81 void*
82 createAdjustor (int cconv,
83 StgStablePtr hptr,
84 StgFunPtr wptr,
85 char *typeString)
86 {
87 ffi_cif *cif;
88 ffi_type **arg_types;
89 nat n_args, i;
90 ffi_type *result_type;
91 ffi_closure *cl;
92 int r, abi;
93
94 n_args = strlen(typeString) - 1;
95 cif = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
96 arg_types = stgMallocBytes(n_args * sizeof(ffi_type*), "createAdjustor");
97
98 result_type = char_to_ffi_type(typeString[0]);
99 for (i=0; i < n_args; i++) {
100 arg_types[i] = char_to_ffi_type(typeString[i+1]);
101 }
102 switch (cconv) {
103 #ifdef mingw32_TARGET_OS
104 case 0: /* stdcall */
105 abi = FFI_STDCALL;
106 break;
107 #endif
108 case 1: /* ccall */
109 abi = FFI_DEFAULT_ABI;
110 break;
111 default:
112 barf("createAdjustor: convention %d not supported on this platform", cconv);
113 }
114
115 r = ffi_prep_cif(cif, abi, n_args, result_type, arg_types);
116 if (r != FFI_OK) barf("ffi_prep_cif failed: %d", r);
117
118 // ToDo: use ffi_closure_alloc()
119 cl = allocateExec(sizeof(ffi_closure));
120
121 r = ffi_prep_closure(cl, cif, (void*)wptr, hptr/*userdata*/);
122 if (r != FFI_OK) barf("ffi_prep_closure failed: %d", r);
123
124 return (void*)cl;
125 }
126
127 #else // To end of file...
128
129 #if defined(_WIN32)
130 #include <windows.h>
131 #endif
132
133 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
134 #include <string.h>
135 #endif
136
137 #ifdef LEADING_UNDERSCORE
138 #define UNDERSCORE "_"
139 #else
140 #define UNDERSCORE ""
141 #endif
142 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
143 /*
144 Now here's something obscure for you:
145
146 When generating an adjustor thunk that uses the C calling
147 convention, we have to make sure that the thunk kicks off
148 the process of jumping into Haskell with a tail jump. Why?
149 Because as a result of jumping in into Haskell we may end
150 up freeing the very adjustor thunk we came from using
151 freeHaskellFunctionPtr(). Hence, we better not return to
152 the adjustor code on our way out, since it could by then
153 point to junk.
154
155 The fix is readily at hand, just include the opcodes
156 for the C stack fixup code that we need to perform when
157 returning in some static piece of memory and arrange
158 to return to it before tail jumping from the adjustor thunk.
159 */
160 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
161 {
162 __asm__ (
163 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
164 UNDERSCORE "obscure_ccall_ret_code:\n\t"
165 "addl $0x4, %esp\n\t"
166 "ret"
167 );
168 }
169 extern void obscure_ccall_ret_code(void);
170
171 #endif
172
173 #if defined(x86_64_HOST_ARCH)
174 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
175 {
176 __asm__ (
177 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
178 UNDERSCORE "obscure_ccall_ret_code:\n\t"
179 "addq $0x8, %rsp\n\t"
180 "ret"
181 );
182 }
183 extern void obscure_ccall_ret_code(void);
184 #endif
185
186 #if defined(alpha_HOST_ARCH)
187 /* To get the definition of PAL_imb: */
188 # if defined(linux_HOST_OS)
189 # include <asm/pal.h>
190 # else
191 # include <machine/pal.h>
192 # endif
193 #endif
194
195 #if defined(ia64_HOST_ARCH)
196
197 /* Layout of a function descriptor */
198 typedef struct _IA64FunDesc {
199 StgWord64 ip;
200 StgWord64 gp;
201 } IA64FunDesc;
202
203 static void *
204 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
205 {
206 StgArrWords* arr;
207 nat data_size_in_words, total_size_in_words;
208
209 /* round up to a whole number of words */
210 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
211 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
212
213 /* allocate and fill it in */
214 arr = (StgArrWords *)allocate(total_size_in_words);
215 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
216
217 /* obtain a stable ptr */
218 *stable = getStablePtr((StgPtr)arr);
219
220 /* and return a ptr to the goods inside the array */
221 return(&(arr->payload));
222 }
223 #endif
224
225 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
226 __asm__("obscure_ccall_ret_code:\n\t"
227 "lwz 1,0(1)\n\t"
228 "lwz 0,4(1)\n\t"
229 "mtlr 0\n\t"
230 "blr");
231 extern void obscure_ccall_ret_code(void);
232 #endif
233
234 #if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
235 #if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
236
237 /* !!! !!! WARNING: !!! !!!
238 * This structure is accessed from AdjustorAsm.s
239 * Any changes here have to be mirrored in the offsets there.
240 */
241
242 typedef struct AdjustorStub {
243 #if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
244 unsigned lis;
245 unsigned ori;
246 unsigned lwz;
247 unsigned mtctr;
248 unsigned bctr;
249 StgFunPtr code;
250 #elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
251 /* powerpc64-darwin: just guessing that it won't use fundescs. */
252 unsigned lis;
253 unsigned ori;
254 unsigned rldimi;
255 unsigned oris;
256 unsigned ori2;
257 unsigned lwz;
258 unsigned mtctr;
259 unsigned bctr;
260 StgFunPtr code;
261 #else
262 /* fundesc-based ABIs */
263 #define FUNDESCS
264 StgFunPtr code;
265 struct AdjustorStub
266 *toc;
267 void *env;
268 #endif
269 StgStablePtr hptr;
270 StgFunPtr wptr;
271 StgInt negative_framesize;
272 StgInt extrawords_plus_one;
273 } AdjustorStub;
274
275 #endif
276 #endif
277
278 #if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
279
280 /* !!! !!! WARNING: !!! !!!
281 * This structure is accessed from AdjustorAsm.s
282 * Any changes here have to be mirrored in the offsets there.
283 */
284
285 typedef struct AdjustorStub {
286 unsigned char call[8];
287 StgStablePtr hptr;
288 StgFunPtr wptr;
289 StgInt frame_size;
290 StgInt argument_size;
291 } AdjustorStub;
292 #endif
293
294 #if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
295 static int totalArgumentSize(char *typeString)
296 {
297 int sz = 0;
298 while(*typeString)
299 {
300 char t = *typeString++;
301
302 switch(t)
303 {
304 // on 32-bit platforms, Double and Int64 occupy two words.
305 case 'd':
306 case 'l':
307 case 'L':
308 if(sizeof(void*) == 4)
309 {
310 sz += 2;
311 break;
312 }
313 // everything else is one word.
314 default:
315 sz += 1;
316 }
317 }
318 return sz;
319 }
320 #endif
321
322 void*
323 createAdjustor(int cconv, StgStablePtr hptr,
324 StgFunPtr wptr,
325 char *typeString
326 #if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
327 STG_UNUSED
328 #endif
329 )
330 {
331 void *adjustor = NULL;
332
333 switch (cconv)
334 {
335 case 0: /* _stdcall */
336 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
337 /* Magic constant computed by inspecting the code length of
338 the following assembly language snippet
339 (offset and machine code prefixed):
340
341 <0>: 58 popl %eax # temp. remove ret addr..
342 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
343 # hold a StgStablePtr
344 <6>: 50 pushl %eax # put back ret. addr
345 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
346 <c>: ff e0 jmp %eax # and jump to it.
347 # the callee cleans up the stack
348 */
349 adjustor = allocateExec(14);
350 {
351 unsigned char *const adj_code = (unsigned char *)adjustor;
352 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
353
354 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
355 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
356
357 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
358
359 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
360 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
361
362 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
363 adj_code[0x0d] = (unsigned char)0xe0;
364 }
365 #endif
366 break;
367
368 case 1: /* _ccall */
369 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
370 /* Magic constant computed by inspecting the code length of
371 the following assembly language snippet
372 (offset and machine code prefixed):
373
374 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
375 # hold a StgStablePtr
376 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
377 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
378 <0f>: ff e0 jmp *%eax # jump to wptr
379
380 The ccall'ing version is a tad different, passing in the return
381 address of the caller to the auto-generated C stub (which enters
382 via the stable pointer.) (The auto-generated C stub is in on this
383 game, don't worry :-)
384
385 See the comment next to obscure_ccall_ret_code why we need to
386 perform a tail jump instead of a call, followed by some C stack
387 fixup.
388
389 Note: The adjustor makes the assumption that any return value
390 coming back from the C stub is not stored on the stack.
391 That's (thankfully) the case here with the restricted set of
392 return types that we support.
393 */
394 adjustor = allocateExec(17);
395 {
396 unsigned char *const adj_code = (unsigned char *)adjustor;
397
398 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
399 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
400
401 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
402 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
403
404 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
405 *((StgFunPtr*)(adj_code + 0x0b)) =
406 (StgFunPtr)obscure_ccall_ret_code;
407
408 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
409 adj_code[0x10] = (unsigned char)0xe0;
410 }
411 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
412 {
413 /*
414 What's special about Darwin/Mac OS X on i386?
415 It wants the stack to stay 16-byte aligned.
416
417 We offload most of the work to AdjustorAsm.S.
418 */
419 AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub));
420 adjustor = adjustorStub;
421
422 extern void adjustorCode(void);
423 int sz = totalArgumentSize(typeString);
424
425 adjustorStub->call[0] = 0xe8;
426 *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
427 adjustorStub->hptr = hptr;
428 adjustorStub->wptr = wptr;
429
430 // The adjustor puts the following things on the stack:
431 // 1.) %ebp link
432 // 2.) padding and (a copy of) the arguments
433 // 3.) a dummy argument
434 // 4.) hptr
435 // 5.) return address (for returning to the adjustor)
436 // All these have to add up to a multiple of 16.
437
438 // first, include everything in frame_size
439 adjustorStub->frame_size = sz * 4 + 16;
440 // align to 16 bytes
441 adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
442 // only count 2.) and 3.) as part of frame_size
443 adjustorStub->frame_size -= 12;
444 adjustorStub->argument_size = sz;
445 }
446
447 #elif defined(x86_64_HOST_ARCH)
448 /*
449 stack at call:
450 argn
451 ...
452 arg7
453 return address
454 %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6
455
456 if there are <6 integer args, then we can just push the
457 StablePtr into %edi and shuffle the other args up.
458
459 If there are >=6 integer args, then we have to flush one arg
460 to the stack, and arrange to adjust the stack ptr on return.
461 The stack will be rearranged to this:
462
463 argn
464 ...
465 arg7
466 return address *** <-- dummy arg in stub fn.
467 arg6
468 obscure_ccall_ret_code
469
470 This unfortunately means that the type of the stub function
471 must have a dummy argument for the original return address
472 pointer inserted just after the 6th integer argument.
473
474 Code for the simple case:
475
476 0: 4d 89 c1 mov %r8,%r9
477 3: 49 89 c8 mov %rcx,%r8
478 6: 48 89 d1 mov %rdx,%rcx
479 9: 48 89 f2 mov %rsi,%rdx
480 c: 48 89 fe mov %rdi,%rsi
481 f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
482 16: ff 25 0c 00 00 00 jmpq *12(%rip)
483 ...
484 20: .quad 0 # aligned on 8-byte boundary
485 28: .quad 0 # aligned on 8-byte boundary
486
487
488 And the version for >=6 integer arguments:
489
490 0: 41 51 push %r9
491 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
492 8: 4d 89 c1 mov %r8,%r9
493 b: 49 89 c8 mov %rcx,%r8
494 e: 48 89 d1 mov %rdx,%rcx
495 11: 48 89 f2 mov %rsi,%rdx
496 14: 48 89 fe mov %rdi,%rsi
497 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
498 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
499 ...
500 28: .quad 0 # aligned on 8-byte boundary
501 30: .quad 0 # aligned on 8-byte boundary
502 38: .quad 0 # aligned on 8-byte boundary
503 */
504
505 {
506 int i = 0;
507 char *c;
508
509 // determine whether we have 6 or more integer arguments,
510 // and therefore need to flush one to the stack.
511 for (c = typeString; *c != '\0'; c++) {
512 if (*c != 'f' && *c != 'd') i++;
513 if (i == 6) break;
514 }
515
516 if (i < 6) {
517 adjustor = allocateExec(0x30);
518
519 *(StgInt32 *)adjustor = 0x49c1894d;
520 *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
521 *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
522 *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
523 *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
524 *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
525 *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
526 *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
527 *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;
528 }
529 else
530 {
531 adjustor = allocateExec(0x40);
532
533 *(StgInt32 *)adjustor = 0x35ff5141;
534 *(StgInt32 *)(adjustor+0x4) = 0x00000020;
535 *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
536 *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
537 *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
538 *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
539 *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
540 *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
541 *(StgInt32 *)(adjustor+0x20) = 0x00000014;
542
543 *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
544 *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
545 *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
546 }
547 }
548 #elif defined(sparc_HOST_ARCH)
549 /* Magic constant computed by inspecting the code length of the following
550 assembly language snippet (offset and machine code prefixed):
551
552 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
553 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
554 <08>: D823A05C st %o4, [%sp + 92]
555 <0C>: 9A10000B mov %o3, %o5
556 <10>: 9810000A mov %o2, %o4
557 <14>: 96100009 mov %o1, %o3
558 <18>: 94100008 mov %o0, %o2
559 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
560 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
561 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
562 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
563 <2C> 00000000 ! place for getting hptr back easily
564
565 ccall'ing on SPARC is easy, because we are quite lucky to push a
566 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
567 existing arguments (note that %sp must stay double-word aligned at
568 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
569 To do this, we extend the *caller's* stack frame by 2 words and shift
570 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
571 procedure because of the tail-jump) by 2 positions. This makes room in
572 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
573 for destination addr of jump on SPARC, return address on x86, ...). This
574 shouldn't cause any problems for a C-like caller: alloca is implemented
575 similarly, and local variables should be accessed via %fp, not %sp. In a
576 nutshell: This should work! (Famous last words! :-)
577 */
578 adjustor = allocateExec(4*(11+1));
579 {
580 unsigned long *const adj_code = (unsigned long *)adjustor;
581
582 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
583 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
584 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
585 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
586 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
587 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
588 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
589 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
590 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
591 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
592 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
593 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
594 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
595 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
596 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
597
598 adj_code[11] = (unsigned long)hptr;
599
600 /* flush cache */
601 asm("flush %0" : : "r" (adj_code ));
602 asm("flush %0" : : "r" (adj_code + 2));
603 asm("flush %0" : : "r" (adj_code + 4));
604 asm("flush %0" : : "r" (adj_code + 6));
605 asm("flush %0" : : "r" (adj_code + 10));
606
607 /* max. 5 instructions latency, and we need at >= 1 for returning */
608 asm("nop");
609 asm("nop");
610 asm("nop");
611 asm("nop");
612 }
613 #elif defined(alpha_HOST_ARCH)
614 /* Magic constant computed by inspecting the code length of
615 the following assembly language snippet
616 (offset and machine code prefixed; note that the machine code
617 shown is longwords stored in little-endian order):
618
619 <00>: 46520414 mov a2, a4
620 <04>: 46100412 mov a0, a2
621 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
622 <0c>: 46730415 mov a3, a5
623 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
624 <14>: 46310413 mov a1, a3
625 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
626 <1c>: 00000000 # padding for alignment
627 <20>: [8 bytes for hptr quadword]
628 <28>: [8 bytes for wptr quadword]
629
630 The "computed" jump at <08> above is really a jump to a fixed
631 location. Accordingly, we place an always-correct hint in the
632 jump instruction, namely the address offset from <0c> to wptr,
633 divided by 4, taking the lowest 14 bits.
634
635 We only support passing 4 or fewer argument words, for the same
636 reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
637 On the Alpha the first 6 integer arguments are in a0 through a5,
638 and the rest on the stack. Hence we want to shuffle the original
639 caller's arguments by two.
640
641 On the Alpha the calling convention is so complex and dependent
642 on the callee's signature -- for example, the stack pointer has
643 to be a multiple of 16 -- that it seems impossible to me [ccshan]
644 to handle the general case correctly without changing how the
645 adjustor is called from C. For now, our solution of shuffling
646 registers only and ignoring the stack only works if the original
647 caller passed 4 or fewer argument words.
648
649 TODO: Depending on how much allocation overhead stgMallocBytes uses for
650 header information (more precisely, if the overhead is no more than
651 4 bytes), we should move the first three instructions above down by
652 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
653 */
654 ASSERT(((StgWord64)wptr & 3) == 0);
655 adjustor = allocateExec(48);
656 {
657 StgWord64 *const code = (StgWord64 *)adjustor;
658
659 code[0] = 0x4610041246520414L;
660 code[1] = 0x46730415a61b0020L;
661 code[2] = 0x46310413a77b0028L;
662 code[3] = 0x000000006bfb0000L
663 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
664
665 code[4] = (StgWord64)hptr;
666 code[5] = (StgWord64)wptr;
667
668 /* Ensure that instruction cache is consistent with our new code */
669 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
670 }
671 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
672
673 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
674 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
675 {
676 /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
677 We need to calculate all the details of the stack frame layout,
678 taking into account the types of all the arguments, and then
679 generate code on the fly. */
680
681 int src_gpr = 3, dst_gpr = 5;
682 int fpr = 3;
683 int src_offset = 0, dst_offset = 0;
684 int n = strlen(typeString),i;
685 int src_locs[n], dst_locs[n];
686 int frameSize;
687 unsigned *code;
688
689 /* Step 1:
690 Calculate where the arguments should go.
691 src_locs[] will contain the locations of the arguments in the
692 original stack frame passed to the adjustor.
693 dst_locs[] will contain the locations of the arguments after the
694 adjustor runs, on entry to the wrapper proc pointed to by wptr.
695
696 This algorithm is based on the one described on page 3-19 of the
697 System V ABI PowerPC Processor Supplement.
698 */
699 for(i=0;typeString[i];i++)
700 {
701 char t = typeString[i];
702 if((t == 'f' || t == 'd') && fpr <= 8)
703 src_locs[i] = dst_locs[i] = -32-(fpr++);
704 else
705 {
706 if((t == 'l' || t == 'L') && src_gpr <= 9)
707 {
708 if((src_gpr & 1) == 0)
709 src_gpr++;
710 src_locs[i] = -src_gpr;
711 src_gpr += 2;
712 }
713 else if((t == 'w' || t == 'W') && src_gpr <= 10)
714 {
715 src_locs[i] = -(src_gpr++);
716 }
717 else
718 {
719 if((t == 'l' || t == 'L' || t == 'd')
720 {
721 if(src_offset % 8)
722 src_offset += 4;
723 }
724 src_locs[i] = src_offset;
725 src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
726 }
727
728 if((t == 'l' || t == 'L') && dst_gpr <= 9)
729 {
730 if((dst_gpr & 1) == 0)
731 dst_gpr++;
732 dst_locs[i] = -dst_gpr;
733 dst_gpr += 2;
734 }
735 else if((t == 'w' || t == 'W') && dst_gpr <= 10)
736 {
737 dst_locs[i] = -(dst_gpr++);
738 }
739 else
740 {
741 if(t == 'l' || t == 'L' || t == 'd')
742 {
743 if(dst_offset % 8)
744 dst_offset += 4;
745 }
746 dst_locs[i] = dst_offset;
747 dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
748 }
749 }
750 }
751
752 frameSize = dst_offset + 8;
753 frameSize = (frameSize+15) & ~0xF;
754
755 /* Step 2:
756 Build the adjustor.
757 */
758 // allocate space for at most 4 insns per parameter
759 // plus 14 more instructions.
760 adjustor = allocateExec(4 * (4*n + 14));
761 code = (unsigned*)adjustor;
762
763 *code++ = 0x48000008; // b *+8
764 // * Put the hptr in a place where freeHaskellFunctionPtr
765 // can get at it.
766 *code++ = (unsigned) hptr;
767
768 // * save the link register
769 *code++ = 0x7c0802a6; // mflr r0;
770 *code++ = 0x90010004; // stw r0, 4(r1);
771 // * and build a new stack frame
772 *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
773
774 // * now generate instructions to copy arguments
775 // from the old stack frame into the new stack frame.
776 for(i=n-1;i>=0;i--)
777 {
778 if(src_locs[i] < -32)
779 ASSERT(dst_locs[i] == src_locs[i]);
780 else if(src_locs[i] < 0)
781 {
782 // source in GPR.
783 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
784 if(dst_locs[i] < 0)
785 {
786 ASSERT(dst_locs[i] > -32);
787 // dst is in GPR, too.
788
789 if(typeString[i] == 'l' || typeString[i] == 'L')
790 {
791 // mr dst+1, src+1
792 *code++ = 0x7c000378
793 | ((-dst_locs[i]+1) << 16)
794 | ((-src_locs[i]+1) << 11)
795 | ((-src_locs[i]+1) << 21);
796 }
797 // mr dst, src
798 *code++ = 0x7c000378
799 | ((-dst_locs[i]) << 16)
800 | ((-src_locs[i]) << 11)
801 | ((-src_locs[i]) << 21);
802 }
803 else
804 {
805 if(typeString[i] == 'l' || typeString[i] == 'L')
806 {
807 // stw src+1, dst_offset+4(r1)
808 *code++ = 0x90010000
809 | ((-src_locs[i]+1) << 21)
810 | (dst_locs[i] + 4);
811 }
812
813 // stw src, dst_offset(r1)
814 *code++ = 0x90010000
815 | ((-src_locs[i]) << 21)
816 | (dst_locs[i] + 8);
817 }
818 }
819 else
820 {
821 ASSERT(dst_locs[i] >= 0);
822 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
823
824 if(typeString[i] == 'l' || typeString[i] == 'L')
825 {
826 // lwz r0, src_offset(r1)
827 *code++ = 0x80010000
828 | (src_locs[i] + frameSize + 8 + 4);
829 // stw r0, dst_offset(r1)
830 *code++ = 0x90010000
831 | (dst_locs[i] + 8 + 4);
832 }
833 // lwz r0, src_offset(r1)
834 *code++ = 0x80010000
835 | (src_locs[i] + frameSize + 8);
836 // stw r0, dst_offset(r1)
837 *code++ = 0x90010000
838 | (dst_locs[i] + 8);
839 }
840 }
841
842 // * hptr will be the new first argument.
843 // lis r3, hi(hptr)
844 *code++ = OP_HI(0x3c60, hptr);
845 // ori r3,r3,lo(hptr)
846 *code++ = OP_LO(0x6063, hptr);
847
848 // * we need to return to a piece of code
849 // which will tear down the stack frame.
850 // lis r11,hi(obscure_ccall_ret_code)
851 *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
852 // ori r11,r11,lo(obscure_ccall_ret_code)
853 *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
854 // mtlr r11
855 *code++ = 0x7d6803a6;
856
857 // * jump to wptr
858 // lis r11,hi(wptr)
859 *code++ = OP_HI(0x3d60, wptr);
860 // ori r11,r11,lo(wptr)
861 *code++ = OP_LO(0x616b, wptr);
862 // mtctr r11
863 *code++ = 0x7d6903a6;
864 // bctr
865 *code++ = 0x4e800420;
866
867 // Flush the Instruction cache:
868 {
869 unsigned *p = adjustor;
870 while(p < code)
871 {
872 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
873 : : "r" (p));
874 p++;
875 }
876 __asm__ volatile ("sync\n\tisync");
877 }
878 }
879
880 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
881
882 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
883 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
884 {
885 /* The following code applies to all PowerPC and PowerPC64 platforms
886 whose stack layout is based on the AIX ABI.
887
888 Besides (obviously) AIX, this includes
889 Mac OS 9 and BeOS/PPC (may they rest in peace),
890 which use the 32-bit AIX ABI
891 powerpc64-linux,
892 which uses the 64-bit AIX ABI
893 and Darwin (Mac OS X),
894 which uses the same stack layout as AIX,
895 but no function descriptors.
896
897 The actual stack-frame shuffling is implemented out-of-line
898 in the function adjustorCode, in AdjustorAsm.S.
899 Here, we set up an AdjustorStub structure, which
900 is a function descriptor (on platforms that have function
901 descriptors) or a short piece of stub code (on Darwin) to call
902 adjustorCode with a pointer to the AdjustorStub struct loaded
903 into register r2.
904
905 One nice thing about this is that there is _no_ code generated at
906 runtime on the platforms that have function descriptors.
907 */
908 AdjustorStub *adjustorStub;
909 int sz = 0, extra_sz, total_sz;
910
911 // from AdjustorAsm.s
912 // not declared as a function so that AIX-style
913 // fundescs can never get in the way.
914 extern void *adjustorCode;
915
916 #ifdef FUNDESCS
917 adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
918 #else
919 adjustorStub = allocateExec(sizeof(AdjustorStub));
920 #endif
921 adjustor = adjustorStub;
922
923 adjustorStub->code = (void*) &adjustorCode;
924
925 #ifdef FUNDESCS
926 // function descriptors are a cool idea.
927 // We don't need to generate any code at runtime.
928 adjustorStub->toc = adjustorStub;
929 #else
930
931 // no function descriptors :-(
932 // We need to do things "by hand".
933 #if defined(powerpc_HOST_ARCH)
934 // lis r2, hi(adjustorStub)
935 adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
936 // ori r2, r2, lo(adjustorStub)
937 adjustorStub->ori = OP_LO(0x6042, adjustorStub);
938 // lwz r0, code(r2)
939 adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
940 - (char*)adjustorStub);
941 // mtctr r0
942 adjustorStub->mtctr = 0x7c0903a6;
943 // bctr
944 adjustorStub->bctr = 0x4e800420;
945 #else
946 barf("adjustor creation not supported on this platform");
947 #endif
948
949 // Flush the Instruction cache:
950 {
951 int n = sizeof(AdjustorStub)/sizeof(unsigned);
952 unsigned *p = (unsigned*)adjustor;
953 while(n--)
954 {
955 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
956 : : "r" (p));
957 p++;
958 }
959 __asm__ volatile ("sync\n\tisync");
960 }
961 #endif
962
963 // Calculate the size of the stack frame, in words.
964 sz = totalArgumentSize(typeString);
965
966 // The first eight words of the parameter area
967 // are just "backing store" for the parameters passed in
968 // the GPRs. extra_sz is the number of words beyond those first
969 // 8 words.
970 extra_sz = sz - 8;
971 if(extra_sz < 0)
972 extra_sz = 0;
973
974 // Calculate the total size of the stack frame.
975 total_sz = (6 /* linkage area */
976 + 8 /* minimum parameter area */
977 + 2 /* two extra arguments */
978 + extra_sz)*sizeof(StgWord);
979
980 // align to 16 bytes.
981 // AIX only requires 8 bytes, but who cares?
982 total_sz = (total_sz+15) & ~0xF;
983
984 // Fill in the information that adjustorCode in AdjustorAsm.S
985 // will use to create a new stack frame with the additional args.
986 adjustorStub->hptr = hptr;
987 adjustorStub->wptr = wptr;
988 adjustorStub->negative_framesize = -total_sz;
989 adjustorStub->extrawords_plus_one = extra_sz + 1;
990 }
991
992 #elif defined(ia64_HOST_ARCH)
993 /*
994 Up to 8 inputs are passed in registers. We flush the last two inputs to
995 the stack, initially into the 16-byte scratch region left by the caller.
996 We then shuffle the others along by 4 (taking 2 registers for ourselves
997 to save return address and previous function state - we need to come back
998 here on the way out to restore the stack, so this is a real function
999 rather than just a trampoline).
1000
1001 The function descriptor we create contains the gp of the target function
1002 so gp is already loaded correctly.
1003
1004 [MLX] alloc r16=ar.pfs,10,2,0
1005 movl r17=wptr
1006 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
1007 mov r41=r37 // out7 = in5 (out3)
1008 mov r40=r36;; // out6 = in4 (out2)
1009 [MII] st8.spill [r12]=r39 // spill in7 (out5)
1010 mov.sptk b6=r17,50
1011 mov r38=r34;; // out4 = in2 (out0)
1012 [MII] mov r39=r35 // out5 = in3 (out1)
1013 mov r37=r33 // out3 = in1 (loc1)
1014 mov r36=r32 // out2 = in0 (loc0)
1015 [MLX] adds r12=-24,r12 // update sp
1016 movl r34=hptr;; // out0 = hptr
1017 [MIB] mov r33=r16 // loc1 = ar.pfs
1018 mov r32=b0 // loc0 = retaddr
1019 br.call.sptk.many b0=b6;;
1020
1021 [MII] adds r12=-16,r12
1022 mov b0=r32
1023 mov.i ar.pfs=r33
1024 [MFB] nop.m 0x0
1025 nop.f 0x0
1026 br.ret.sptk.many b0;;
1027 */
1028
1029 /* These macros distribute a long constant into the two words of an MLX bundle */
1030 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
1031 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
1032 #define MOVL_HIWORD(val) ( (BITS(val,0,7) << 36) \
1033 | (BITS(val,7,9) << 50) \
1034 | (BITS(val,16,5) << 45) \
1035 | (BITS(val,21,1) << 44) \
1036 | (BITS(val,40,23)) \
1037 | (BITS(val,63,1) << 59))
1038
1039 {
1040 StgStablePtr stable;
1041 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
1042 StgWord64 wcode = wdesc->ip;
1043 IA64FunDesc *fdesc;
1044 StgWord64 *code;
1045
1046 /* we allocate on the Haskell heap since malloc'd memory isn't
1047 * executable - argh */
1048 /* Allocated memory is word-aligned (8 bytes) but functions on ia64
1049 * must be aligned to 16 bytes. We allocate an extra 8 bytes of
1050 * wiggle room so that we can put the code on a 16 byte boundary. */
1051 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);
1052
1053 fdesc = (IA64FunDesc *)adjustor;
1054 code = (StgWord64 *)(fdesc + 1);
1055 /* add 8 bytes to code if needed to align to a 16-byte boundary */
1056 if ((StgWord64)code & 15) code++;
1057 fdesc->ip = (StgWord64)code;
1058 fdesc->gp = wdesc->gp;
1059
1060 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
1061 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
1062 code[2] = 0x029015d818984001;
1063 code[3] = 0x8401200500420094;
1064 code[4] = 0x886011d8189c0001;
1065 code[5] = 0x84011004c00380c0;
1066 code[6] = 0x0250210046013800;
1067 code[7] = 0x8401000480420084;
1068 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
1069 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
1070 code[10] = 0x0200210020010811;
1071 code[11] = 0x1080006800006200;
1072 code[12] = 0x0000210018406000;
1073 code[13] = 0x00aa021000038005;
1074 code[14] = 0x000000010000001d;
1075 code[15] = 0x0084000880000200;
1076
1077 /* save stable pointers in convenient form */
1078 code[16] = (StgWord64)hptr;
1079 code[17] = (StgWord64)stable;
1080 }
1081 #else
1082 barf("adjustor creation not supported on this platform");
1083 #endif
1084 break;
1085
1086 default:
1087 ASSERT(0);
1088 break;
1089 }
1090
1091 /* Have fun! */
1092 return adjustor;
1093 }
1094
1095
1096 void
1097 freeHaskellFunctionPtr(void* ptr)
1098 {
1099 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
1100 if ( *(unsigned char*)ptr != 0x68 &&
1101 *(unsigned char*)ptr != 0x58 ) {
1102 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1103 return;
1104 }
1105
1106 /* Free the stable pointer first..*/
1107 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
1108 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
1109 } else {
1110 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
1111 }
1112 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
1113 if ( *(unsigned char*)ptr != 0xe8 ) {
1114 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1115 return;
1116 }
1117 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1118 #elif defined(x86_64_HOST_ARCH)
1119 if ( *(StgWord16 *)ptr == 0x894d ) {
1120 freeStablePtr(*(StgStablePtr*)(ptr+0x20));
1121 } else if ( *(StgWord16 *)ptr == 0x5141 ) {
1122 freeStablePtr(*(StgStablePtr*)(ptr+0x30));
1123 } else {
1124 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1125 return;
1126 }
1127 #elif defined(sparc_HOST_ARCH)
1128 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
1129 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1130 return;
1131 }
1132
1133 /* Free the stable pointer first..*/
1134 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
1135 #elif defined(alpha_HOST_ARCH)
1136 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
1137 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1138 return;
1139 }
1140
1141 /* Free the stable pointer first..*/
1142 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
1143 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
1144 if ( *(StgWord*)ptr != 0x48000008 ) {
1145 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1146 return;
1147 }
1148 freeStablePtr(((StgStablePtr*)ptr)[1]);
1149 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
1150 extern void* adjustorCode;
1151 if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
1152 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1153 return;
1154 }
1155 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1156 #elif defined(ia64_HOST_ARCH)
1157 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
1158 StgWord64 *code = (StgWord64 *)(fdesc+1);
1159
1160 if (fdesc->ip != (StgWord64)code) {
1161 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1162 return;
1163 }
1164 freeStablePtr((StgStablePtr)code[16]);
1165 freeStablePtr((StgStablePtr)code[17]);
1166 return;
1167 #else
1168 ASSERT(0);
1169 #endif
1170 *((unsigned char*)ptr) = '\0';
1171
1172 freeExec(ptr);
1173 }
1174
1175 #endif // !USE_LIBFFI_FOR_ADJUSTORS