Enable pthread_getspecific() tls for LLVM compiler
[ghc.git] / rts / Adjustor.c
/* -----------------------------------------------------------------------------
 * Foreign export adjustor thunks
 *
 * Copyright (c) 1998.
 *
 * ---------------------------------------------------------------------------*/

/* A little bit of background...

   An adjustor thunk is a dynamically allocated code snippet that allows
   Haskell closures to be viewed as C function pointers.

   Stable pointers provide a way for the outside world to get access to,
   and evaluate, Haskell heap objects, with the RTS providing a small
   range of ops for doing so. So, assuming we've got a stable pointer in
   our hand in C, we can jump into the Haskell world and evaluate a callback
   procedure, say. This works OK in some cases where callbacks are used, but
   does require the external code to know about stable pointers and how to deal
   with them. We'd like to hide the Haskell-nature of a callback and have it
   be invoked just like any other C function pointer.

   Enter adjustor thunks. An adjustor thunk is a little piece of code
   that's generated on-the-fly (one per Haskell closure being exported)
   that, when entered using some 'universal' calling convention (e.g., the
   C calling convention on platform X), pushes an implicit stable pointer
   (to the Haskell callback) before calling another (static) C function stub
   which takes care of entering the Haskell code via its stable pointer.

   An adjustor thunk is allocated on the C heap, and is called from within
   Haskell just before handing out the function pointer to the Haskell (IO)
   action. User code should never have to invoke it explicitly.

   An adjustor thunk differs from a C function pointer in one respect: when
   the code is through with it, it has to be freed in order to release Haskell
   and C resources. Failure to do so will result in memory leaks on both the C
   and Haskell side.
*/
38
39 #include "PosixSource.h"
40 #include "Rts.h"
41
42 #include "RtsUtils.h"
43 #include "Stable.h"
44
45 #if defined(USE_LIBFFI_FOR_ADJUSTORS)
46 #include "ffi.h"
47 #include <string.h>
48 #endif
49
50 #if defined(i386_HOST_ARCH)
51 extern void adjustorCode(void);
52 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
53 // from AdjustorAsm.s
54 // not declared as a function so that AIX-style
55 // fundescs can never get in the way.
56 extern void *adjustorCode;
57 #endif
58
59 #if defined(USE_LIBFFI_FOR_ADJUSTORS)
60 void
61 freeHaskellFunctionPtr(void* ptr)
62 {
63 ffi_closure *cl;
64
65 cl = (ffi_closure*)ptr;
66 freeStablePtr(cl->user_data);
67 stgFree(cl->cif->arg_types);
68 stgFree(cl->cif);
69 freeExec(cl);
70 }
71
72 static ffi_type * char_to_ffi_type(char c)
73 {
74 switch (c) {
75 case 'v': return &ffi_type_void;
76 case 'f': return &ffi_type_float;
77 case 'd': return &ffi_type_double;
78 case 'L': return &ffi_type_sint64;
79 case 'l': return &ffi_type_uint64;
80 case 'W': return &ffi_type_sint32;
81 case 'w': return &ffi_type_uint32;
82 case 'S': return &ffi_type_sint16;
83 case 's': return &ffi_type_uint16;
84 case 'B': return &ffi_type_sint8;
85 case 'b': return &ffi_type_uint8;
86 case 'p': return &ffi_type_pointer;
87 default: barf("char_to_ffi_type: unknown type '%c'", c);
88 }
89 }
90
91 void*
92 createAdjustor (int cconv,
93 StgStablePtr hptr,
94 StgFunPtr wptr,
95 char *typeString)
96 {
97 ffi_cif *cif;
98 ffi_type **arg_types;
99 nat n_args, i;
100 ffi_type *result_type;
101 ffi_closure *cl;
102 int r, abi;
103 void *code;
104
105 n_args = strlen(typeString) - 1;
106 cif = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
107 arg_types = stgMallocBytes(n_args * sizeof(ffi_type*), "createAdjustor");
108
109 result_type = char_to_ffi_type(typeString[0]);
110 for (i=0; i < n_args; i++) {
111 arg_types[i] = char_to_ffi_type(typeString[i+1]);
112 }
113 switch (cconv) {
114 #ifdef mingw32_HOST_OS
115 case 0: /* stdcall */
116 abi = FFI_STDCALL;
117 break;
118 #endif
119 case 1: /* ccall */
120 abi = FFI_DEFAULT_ABI;
121 break;
122 default:
123 barf("createAdjustor: convention %d not supported on this platform", cconv);
124 }
125
126 r = ffi_prep_cif(cif, abi, n_args, result_type, arg_types);
127 if (r != FFI_OK) barf("ffi_prep_cif failed: %d", r);
128
129 cl = allocateExec(sizeof(ffi_closure), &code);
130 if (cl == NULL) {
131 barf("createAdjustor: failed to allocate memory");
132 }
133
134 r = ffi_prep_closure(cl, cif, (void*)wptr, hptr/*userdata*/);
135 if (r != FFI_OK) barf("ffi_prep_closure failed: %d", r);
136
137 return (void*)code;
138 }
139
140 #else // To end of file...
141
142 #if defined(_WIN32)
143 #include <windows.h>
144 #endif
145
146 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
147 #include <string.h>
148 #endif
149
150 #ifdef LEADING_UNDERSCORE
151 #define UNDERSCORE "_"
152 #else
153 #define UNDERSCORE ""
154 #endif
155
156 #if defined(x86_64_HOST_ARCH)
157 /*
158 Now here's something obscure for you:
159
160 When generating an adjustor thunk that uses the C calling
161 convention, we have to make sure that the thunk kicks off
162 the process of jumping into Haskell with a tail jump. Why?
163 Because as a result of jumping in into Haskell we may end
164 up freeing the very adjustor thunk we came from using
165 freeHaskellFunctionPtr(). Hence, we better not return to
166 the adjustor code on our way out, since it could by then
167 point to junk.
168
169 The fix is readily at hand, just include the opcodes
170 for the C stack fixup code that we need to perform when
171 returning in some static piece of memory and arrange
172 to return to it before tail jumping from the adjustor thunk.
173 */
/* Container function whose only job is to emit the global label
 * obscure_ccall_ret_code: the static return path used by x86_64 ccall
 * adjustors with >= 6 integer arguments.  When the wrapper returns here,
 * %rsp points at the arg copy the adjustor flushed to the stack; we pop
 * it and return to the original caller (see the comment above for why
 * this must live in static memory rather than in the adjustor itself).
 * GNUC3_ATTRIBUTE(used) keeps the compiler from discarding the asm.
 */
static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
{
    __asm__ (
        ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
        UNDERSCORE "obscure_ccall_ret_code:\n\t"
        "addq $0x8, %rsp\n\t"   /* drop the flushed argument slot */
        "ret"
    );
}
extern void obscure_ccall_ret_code(void);
184 #endif
185
186 #if defined(alpha_HOST_ARCH)
187 /* To get the definition of PAL_imb: */
188 # if defined(linux_HOST_OS)
189 # include <asm/pal.h>
190 # else
191 # include <machine/pal.h>
192 # endif
193 #endif
194
195 #if defined(ia64_HOST_ARCH)
196
197 /* Layout of a function descriptor */
/* Layout of a function descriptor: on ia64 a "function pointer" is really
 * a pointer to one of these — the code address plus the callee's gp. */
typedef struct _IA64FunDesc {
    StgWord64 ip;   /* entry-point address of the code */
    StgWord64 gp;   /* global pointer the callee expects */
} IA64FunDesc;
202
/* Allocate size_in_bytes of storage on the Haskell heap (as an ARR_WORDS
 * object, so the payload is executable where malloc'd memory is not — see
 * the ia64 branch of createAdjustor).  A stable pointer to the array is
 * written through *stable to keep it alive; the caller is responsible for
 * eventually freeing that stable pointer.  Returns a pointer to the
 * payload, not to the heap object itself.
 */
static void *
stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
{
    StgArrWords* arr;
    nat data_size_in_words, total_size_in_words;

    /* round up to a whole number of words */
    data_size_in_words  = ROUNDUP_BYTES_TO_WDS(size_in_bytes);
    total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;

    /* allocate and fill it in */
    arr = (StgArrWords *)allocate(total_size_in_words);
    SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, size_in_bytes);

    /* obtain a stable ptr */
    *stable = getStablePtr((StgPtr)arr);

    /* and return a ptr to the goods inside the array */
    return(&(arr->payload));
}
223 #endif
224
225 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
/* Return-path stub for the powerpc-linux adjustor: tear down the frame the
 * generated adjustor code built (the adjustor saved lr at 4(r1) and pushed
 * a new frame with stwu), restore the saved return address into lr, and
 * return to the original caller. */
__asm__("obscure_ccall_ret_code:\n\t"
        "lwz 1,0(1)\n\t"   /* pop the frame via the back-chain word */
        "lwz 0,4(1)\n\t"   /* reload the saved return address */
        "mtlr 0\n\t"
        "blr");
extern void obscure_ccall_ret_code(void);
232 #endif
233
234 #if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
235 #if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
236
237 /* !!! !!! WARNING: !!! !!!
238 * This structure is accessed from AdjustorAsm.s
239 * Any changes here have to be mirrored in the offsets there.
240 */
241
/* Stub record for AIX-style PowerPC ABIs: either a function descriptor
 * (FUNDESCS platforms — no code generated at runtime) or a short machine
 * code trampoline (Darwin), followed by the data that adjustorCode in
 * AdjustorAsm.S reads at run time.  Field offsets are hard-coded in the
 * assembly — keep them in sync (see the WARNING comment above). */
typedef struct AdjustorStub {
#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
    unsigned     lis;    /* lis r2, hi(this stub's address) */
    unsigned     ori;    /* ori r2, r2, lo(this stub's address) */
    unsigned     lwz;    /* lwz r0, offsetof(code)(r2) */
    unsigned     mtctr;  /* mtctr r0 */
    unsigned     bctr;   /* bctr — jump to adjustorCode */
    StgFunPtr    code;   /* address of adjustorCode */
#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
    /* powerpc64-darwin: just guessing that it won't use fundescs. */
    unsigned     lis;
    unsigned     ori;
    unsigned     rldimi;
    unsigned     oris;
    unsigned     ori2;
    unsigned     lwz;
    unsigned     mtctr;
    unsigned     bctr;
    StgFunPtr    code;
#else
    /* fundesc-based ABIs: the stub IS the function descriptor */
#define FUNDESCS
    StgFunPtr    code;
    struct AdjustorStub *toc;
    void        *env;
#endif
    StgStablePtr hptr;                /* stable ptr to the Haskell callback */
    StgFunPtr    wptr;                /* C wrapper to tail-call into */
    StgInt       negative_framesize;  /* minus the new frame size, in bytes */
    StgInt       extrawords_plus_one; /* param words beyond the first 8, +1 */
} AdjustorStub;
274
275 #endif
276 #endif
277
278 #if defined(i386_HOST_ARCH)
279
280 /* !!! !!! WARNING: !!! !!!
281 * This structure is accessed from AdjustorAsm.s
282 * Any changes here have to be mirrored in the offsets there.
283 */
284
/* i386 adjustor stub: a 5-byte relative call into adjustorCode (only the
 * first 5 bytes of call[] are initialized; the rest is padding/alignment),
 * followed by the data AdjustorAsm.S reads relative to the return address.
 * Field offsets are hard-coded in the assembly — keep them in sync. */
typedef struct AdjustorStub {
    unsigned char call[8];      /* 0xe8 + rel32 displacement to adjustorCode */
    StgStablePtr  hptr;         /* stable ptr pushed as the hidden first arg */
    StgFunPtr     wptr;         /* C wrapper ultimately jumped to */
    StgInt        frame_size;   /* bytes of padding+args+dummy (16-aligned) */
    StgInt        argument_size;/* size of the argument block, in words */
} AdjustorStub;
292 #endif
293
294 #if defined(i386_HOST_ARCH) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
/* Count the machine words occupied by the arguments described in
 * typeString: every signature character contributes one word, except
 * that on 32-bit platforms 'd' (double) and 'l'/'L' (64-bit integers)
 * contribute two. */
static int totalArgumentSize(char *typeString)
{
    int words = 0;
    char c;

    while ((c = *typeString++) != '\0') {
        /* on 32-bit platforms, Double and Int64 occupy two words */
        if ((c == 'd' || c == 'l' || c == 'L') && sizeof(void *) == 4) {
            words += 2;
        } else {
            /* everything else is one word */
            words += 1;
        }
    }
    return words;
}
320 #endif
321
/* createAdjustor (hand-rolled, non-libffi path)
 *
 * Generate, per platform, a small executable thunk that inserts hptr (the
 * stable pointer to the Haskell callback) as a hidden first argument and
 * then transfers control to the static C wrapper wptr.  cconv selects the
 * calling convention (0 = stdcall, 1 = ccall); typeString describes the
 * argument types and is only consulted on platforms that must lay out a
 * stack frame (powerpc, powerpc64, x86_64 — hence STG_UNUSED elsewhere).
 *
 * Returns the executable entry point of the thunk; it is later released
 * with freeHaskellFunctionPtr.
 */
void*
createAdjustor(int cconv, StgStablePtr hptr,
               StgFunPtr wptr,
               char *typeString
#if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
                  STG_UNUSED
#endif
              )
{
    void *adjustor = NULL;   /* writable address of the generated thunk */
    void *code;              /* executable address handed back to callers */

    switch (cconv)
    {
    case 0: /* _stdcall */
#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
        /* Magic constant computed by inspecting the code length of
           the following assembly language snippet
           (offset and machine code prefixed):

          <0>:       58                popl   %eax              # temp. remove ret addr..
          <1>:       68 fd fc fe fa    pushl  0xfafefcfd        # constant is large enough to
                                                                # hold a StgStablePtr
          <6>:       50                pushl  %eax              # put back ret. addr
          <7>:       b8 fa ef ff 00    movl   $0x00ffeffa, %eax # load up wptr
          <c>:       ff e0             jmp    %eax              # and jump to it.
                                                                # the callee cleans up the stack
        */
        adjustor = allocateExec(14,&code);
        {
            unsigned char *const adj_code = (unsigned char *)adjustor;
            adj_code[0x00] = (unsigned char)0x58;  /* popl %eax  */

            adj_code[0x01] = (unsigned char)0x68;  /* pushl hptr (which is a dword immediate ) */
            *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;

            adj_code[0x06] = (unsigned char)0x50;  /* pushl %eax */

            adj_code[0x07] = (unsigned char)0xb8;  /* movl  $wptr, %eax */
            *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;

            adj_code[0x0c] = (unsigned char)0xff;  /* jmp %eax */
            adj_code[0x0d] = (unsigned char)0xe0;
        }
#endif
        break;

    case 1: /* _ccall */
#if defined(i386_HOST_ARCH)
        {
            /*
              Most of the trickiness here is due to the need to keep the
              stack pointer 16-byte aligned (see #5250).  That means we
              can't just push another argument on the stack and call the
              wrapper, we may have to shuffle the whole argument block.

              We offload most of the work to AdjustorAsm.S.
            */
            AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
            adjustor = adjustorStub;

            int sz = totalArgumentSize(typeString);

            adjustorStub->call[0] = 0xe8;
            /* NOTE(review): the call displacement is computed from the
               writable address (adjustorStub), not the exec address (code)
               — only correct if the two coincide; confirm against
               allocateExec's contract on this platform. */
            *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
            adjustorStub->hptr = hptr;
            adjustorStub->wptr = wptr;

            // The adjustor puts the following things on the stack:
            // 1.) %ebp link
            // 2.) padding and (a copy of) the arguments
            // 3.) a dummy argument
            // 4.) hptr
            // 5.) return address (for returning to the adjustor)
            // All these have to add up to a multiple of 16.

            // first, include everything in frame_size
            adjustorStub->frame_size = sz * 4 + 16;
            // align to 16 bytes
            adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
            // only count 2.) and 3.) as part of frame_size
            adjustorStub->frame_size -= 12;
            adjustorStub->argument_size = sz;
        }

#elif defined(x86_64_HOST_ARCH)
        /*
          stack at call:
               argn
               ...
               arg7
               return address
               %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6

          if there are <6 integer args, then we can just push the
          StablePtr into %edi and shuffle the other args up.

          If there are >=6 integer args, then we have to flush one arg
          to the stack, and arrange to adjust the stack ptr on return.
          The stack will be rearranged to this:

              argn
              ...
              arg7
              return address  *** <-- dummy arg in stub fn.
              arg6
              obscure_ccall_ret_code

          This unfortunately means that the type of the stub function
          must have a dummy argument for the original return address
          pointer inserted just after the 6th integer argument.

          Code for the simple case:

          0:   4d 89 c1                mov    %r8,%r9
          3:   49 89 c8                mov    %rcx,%r8
          6:   48 89 d1                mov    %rdx,%rcx
          9:   48 89 f2                mov    %rsi,%rdx
          c:   48 89 fe                mov    %rdi,%rsi
          f:   48 8b 3d 0a 00 00 00    mov    10(%rip),%rdi
         16:   ff 25 0c 00 00 00       jmpq   *12(%rip)
          ...
         20: .quad 0  # aligned on 8-byte boundary
         28: .quad 0  # aligned on 8-byte boundary


          And the version for >=6 integer arguments:

          0:   41 51                   push   %r9
          2:   ff 35 20 00 00 00       pushq  32(%rip)        # 28 <ccall_adjustor+0x28>
          8:   4d 89 c1                mov    %r8,%r9
          b:   49 89 c8                mov    %rcx,%r8
          e:   48 89 d1                mov    %rdx,%rcx
         11:   48 89 f2                mov    %rsi,%rdx
         14:   48 89 fe                mov    %rdi,%rsi
         17:   48 8b 3d 12 00 00 00    mov    18(%rip),%rdi   # 30 <ccall_adjustor+0x30>
         1e:   ff 25 14 00 00 00       jmpq   *20(%rip)       # 38 <ccall_adjustor+0x38>
          ...
         28: .quad 0  # aligned on 8-byte boundary
         30: .quad 0  # aligned on 8-byte boundary
         38: .quad 0  # aligned on 8-byte boundary
        */

        {
            int i = 0;
            char *c;
            StgWord8 *adj_code;

            // determine whether we have 6 or more integer arguments,
            // and therefore need to flush one to the stack.
            for (c = typeString; *c != '\0'; c++) {
                if (*c != 'f' && *c != 'd') i++;
                if (i == 6) break;
            }

            if (i < 6) {
                adjustor = allocateExec(0x30,&code);
                adj_code = (StgWord8*)adjustor;

                /* machine code from the "simple case" listing above */
                *(StgInt32 *)adj_code        = 0x49c1894d;
                *(StgInt32 *)(adj_code+0x4)  = 0x8948c889;
                *(StgInt32 *)(adj_code+0x8)  = 0xf28948d1;
                *(StgInt32 *)(adj_code+0xc)  = 0x48fe8948;
                *(StgInt32 *)(adj_code+0x10) = 0x000a3d8b;
                *(StgInt32 *)(adj_code+0x14) = 0x25ff0000;
                *(StgInt32 *)(adj_code+0x18) = 0x0000000c;
                *(StgInt64 *)(adj_code+0x20) = (StgInt64)hptr;
                *(StgInt64 *)(adj_code+0x28) = (StgInt64)wptr;
            }
            else
            {
                adjustor = allocateExec(0x40,&code);
                adj_code = (StgWord8*)adjustor;

                /* machine code from the ">=6 integer arguments" listing */
                *(StgInt32 *)adj_code        = 0x35ff5141;
                *(StgInt32 *)(adj_code+0x4)  = 0x00000020;
                *(StgInt32 *)(adj_code+0x8)  = 0x49c1894d;
                *(StgInt32 *)(adj_code+0xc)  = 0x8948c889;
                *(StgInt32 *)(adj_code+0x10) = 0xf28948d1;
                *(StgInt32 *)(adj_code+0x14) = 0x48fe8948;
                *(StgInt32 *)(adj_code+0x18) = 0x00123d8b;
                *(StgInt32 *)(adj_code+0x1c) = 0x25ff0000;
                *(StgInt32 *)(adj_code+0x20) = 0x00000014;

                *(StgInt64 *)(adj_code+0x28) = (StgInt64)obscure_ccall_ret_code;
                *(StgInt64 *)(adj_code+0x30) = (StgInt64)hptr;
                *(StgInt64 *)(adj_code+0x38) = (StgInt64)wptr;
            }
        }
#elif defined(sparc_HOST_ARCH)
        /* Magic constant computed by inspecting the code length of the following
           assembly language snippet (offset and machine code prefixed):

          <00>: 9C23A008   sub   %sp, 8, %sp         ! make room for %o4/%o5 in caller's frame
          <04>: DA23A060   st    %o5, [%sp + 96]     ! shift registers by 2 positions
          <08>: D823A05C   st    %o4, [%sp + 92]
          <0C>: 9A10000B   mov   %o3, %o5
          <10>: 9810000A   mov   %o2, %o4
          <14>: 96100009   mov   %o1, %o3
          <18>: 94100008   mov   %o0, %o2
          <1C>: 13000000   sethi %hi(wptr), %o1      ! load up wptr (1 of 2)
          <20>: 11000000   sethi %hi(hptr), %o0      ! load up hptr (1 of 2)
          <24>: 81C26000   jmp   %o1 + %lo(wptr)     ! jump to wptr (load 2 of 2)
          <28>: 90122000   or    %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
          <2C>  00000000                             ! place for getting hptr back easily

          ccall'ing on SPARC is easy, because we are quite lucky to push a
          multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
          existing arguments (note that %sp must stay double-word aligned at
          all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
          To do this, we extend the *caller's* stack frame by 2 words and shift
          the output registers used for argument passing (%o0 - %o5, we are a *leaf*
          procedure because of the tail-jump) by 2 positions.  This makes room in
          %o0 and %o1 for the additinal arguments, namely  hptr and a dummy (used
          for destination addr of jump on SPARC, return address on x86, ...).  This
          shouldn't cause any problems for a C-like caller: alloca is implemented
          similarly, and local variables should be accessed via %fp, not %sp.  In a
          nutshell: This should work! (Famous last words! :-)
        */
        adjustor = allocateExec(4*(11+1),&code);
        {
            unsigned long *const adj_code = (unsigned long *)adjustor;

            adj_code[ 0]  = 0x9C23A008UL;   /* sub   %sp, 8, %sp         */
            adj_code[ 1]  = 0xDA23A060UL;   /* st    %o5, [%sp + 96]     */
            adj_code[ 2]  = 0xD823A05CUL;   /* st    %o4, [%sp + 92]     */
            adj_code[ 3]  = 0x9A10000BUL;   /* mov   %o3, %o5            */
            adj_code[ 4]  = 0x9810000AUL;   /* mov   %o2, %o4            */
            adj_code[ 5]  = 0x96100009UL;   /* mov   %o1, %o3            */
            adj_code[ 6]  = 0x94100008UL;   /* mov   %o0, %o2            */
            adj_code[ 7]  = 0x13000000UL;   /* sethi %hi(wptr), %o1      */
            adj_code[ 7] |= ((unsigned long)wptr) >> 10;
            adj_code[ 8]  = 0x11000000UL;   /* sethi %hi(hptr), %o0      */
            adj_code[ 8] |= ((unsigned long)hptr) >> 10;
            adj_code[ 9]  = 0x81C26000UL;   /* jmp   %o1 + %lo(wptr)     */
            adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
            adj_code[10]  = 0x90122000UL;   /* or    %o0, %lo(hptr), %o0 */
            adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;

            adj_code[11]  = (unsigned long)hptr;

            /* flush cache */
            asm("flush %0" : : "r" (adj_code     ));
            asm("flush %0" : : "r" (adj_code +  2));
            asm("flush %0" : : "r" (adj_code +  4));
            asm("flush %0" : : "r" (adj_code +  6));
            asm("flush %0" : : "r" (adj_code + 10));

            /* max. 5 instructions latency, and we need at >= 1 for returning */
            asm("nop");
            asm("nop");
            asm("nop");
            asm("nop");
        }
#elif defined(alpha_HOST_ARCH)
        /* Magic constant computed by inspecting the code length of
           the following assembly language snippet
           (offset and machine code prefixed; note that the machine code
           shown is longwords stored in little-endian order):

          <00>: 46520414        mov     a2, a4
          <04>: 46100412        mov     a0, a2
          <08>: a61b0020        ldq     a0, 0x20(pv)    # load up hptr
          <0c>: 46730415        mov     a3, a5
          <10>: a77b0028        ldq     pv, 0x28(pv)    # load up wptr
          <14>: 46310413        mov     a1, a3
          <18>: 6bfb----        jmp     (pv), <hint>    # jump to wptr (with hint)
          <1c>: 00000000                                # padding for alignment
          <20>: [8 bytes for hptr quadword]
          <28>: [8 bytes for wptr quadword]

          The "computed" jump at <08> above is really a jump to a fixed
          location.  Accordingly, we place an always-correct hint in the
          jump instruction, namely the address offset from <0c> to wptr,
          divided by 4, taking the lowest 14 bits.

          We only support passing 4 or fewer argument words, for the same
          reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
          On the Alpha the first 6 integer arguments are in a0 through a5,
          and the rest on the stack.  Hence we want to shuffle the original
          caller's arguments by two.

          On the Alpha the calling convention is so complex and dependent
          on the callee's signature -- for example, the stack pointer has
          to be a multiple of 16 -- that it seems impossible to me [ccshan]
          to handle the general case correctly without changing how the
          adjustor is called from C.  For now, our solution of shuffling
          registers only and ignoring the stack only works if the original
          caller passed 4 or fewer argument words.

          TODO: Depending on how much allocation overhead stgMallocBytes uses for
          header information (more precisely, if the overhead is no more than
          4 bytes), we should move the first three instructions above down by
          4 bytes (getting rid of the nop), hence saving memory. [ccshan]
        */
        ASSERT(((StgWord64)wptr & 3) == 0);
        adjustor = allocateExec(48,&code);
        {
            StgWord64 *const code = (StgWord64 *)adjustor;

            code[0] = 0x4610041246520414L;
            code[1] = 0x46730415a61b0020L;
            code[2] = 0x46310413a77b0028L;
            code[3] = 0x000000006bfb0000L
                    | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);

            code[4] = (StgWord64)hptr;
            code[5] = (StgWord64)wptr;

            /* Ensure that instruction cache is consistent with our new code */
            __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
        }
#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)

#define OP_LO(op,lo)  ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
#define OP_HI(op,hi)  ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
    {
        /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
           We need to calculate all the details of the stack frame layout,
           taking into account the types of all the arguments, and then
           generate code on the fly. */

        int src_gpr = 3, dst_gpr = 5;
        int fpr = 3;
        int src_offset = 0, dst_offset = 0;
        int n = strlen(typeString),i;
        int src_locs[n], dst_locs[n];
        int frameSize;
        unsigned *code;   /* NOTE(review): shadows the outer 'code' */

        /* Step 1:
           Calculate where the arguments should go.
           src_locs[] will contain the locations of the arguments in the
           original stack frame passed to the adjustor.
           dst_locs[] will contain the locations of the arguments after the
           adjustor runs, on entry to the wrapper proc pointed to by wptr.

           This algorithm is based on the one described on page 3-19 of the
           System V ABI PowerPC Processor Supplement.
        */
        for(i=0;typeString[i];i++)
        {
            char t = typeString[i];
            if((t == 'f' || t == 'd') && fpr <= 8)
                src_locs[i] = dst_locs[i] = -32-(fpr++);
            else
            {
                if((t == 'l' || t == 'L') && src_gpr <= 9)
                {
                    if((src_gpr & 1) == 0)
                        src_gpr++;
                    src_locs[i] = -src_gpr;
                    src_gpr += 2;
                }
                else if((t == 'w' || t == 'W') && src_gpr <= 10)
                {
                    src_locs[i] = -(src_gpr++);
                }
                else
                {
                    if(t == 'l' || t == 'L' || t == 'd')
                    {
                        if(src_offset % 8)
                            src_offset += 4;
                    }
                    src_locs[i] = src_offset;
                    src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
                }

                if((t == 'l' || t == 'L') && dst_gpr <= 9)
                {
                    if((dst_gpr & 1) == 0)
                        dst_gpr++;
                    dst_locs[i] = -dst_gpr;
                    dst_gpr += 2;
                }
                else if((t == 'w' || t == 'W') && dst_gpr <= 10)
                {
                    dst_locs[i] = -(dst_gpr++);
                }
                else
                {
                    if(t == 'l' || t == 'L' || t == 'd')
                    {
                        if(dst_offset % 8)
                            dst_offset += 4;
                    }
                    dst_locs[i] = dst_offset;
                    dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
                }
            }
        }

        frameSize = dst_offset + 8;
        frameSize = (frameSize+15) & ~0xF;

        /* Step 2:
           Build the adjustor.
        */
        // allocate space for at most 4 insns per parameter
        // plus 14 more instructions.
        adjustor = allocateExec(4 * (4*n + 14),&code);
        code = (unsigned*)adjustor;

        *code++ = 0x48000008;   // b *+8
        // * Put the hptr in a place where freeHaskellFunctionPtr
        //   can get at it.
        *code++ = (unsigned) hptr;

        // * save the link register
        *code++ = 0x7c0802a6;   // mflr r0;
        *code++ = 0x90010004;   // stw r0, 4(r1);
        // * and build a new stack frame
        *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)

        // * now generate instructions to copy arguments
        //   from the old stack frame into the new stack frame.
        for(i=n-1;i>=0;i--)
        {
            if(src_locs[i] < -32)
                ASSERT(dst_locs[i] == src_locs[i]);
            else if(src_locs[i] < 0)
            {
                // source in GPR.
                ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
                if(dst_locs[i] < 0)
                {
                    ASSERT(dst_locs[i] > -32);
                    // dst is in GPR, too.

                    if(typeString[i] == 'l' || typeString[i] == 'L')
                    {
                        // mr dst+1, src+1
                        *code++ = 0x7c000378
                                | ((-dst_locs[i]+1) << 16)
                                | ((-src_locs[i]+1) << 11)
                                | ((-src_locs[i]+1) << 21);
                    }
                    // mr dst, src
                    *code++ = 0x7c000378
                            | ((-dst_locs[i]) << 16)
                            | ((-src_locs[i]) << 11)
                            | ((-src_locs[i]) << 21);
                }
                else
                {
                    if(typeString[i] == 'l' || typeString[i] == 'L')
                    {
                        // stw src+1, dst_offset+4(r1)
                        *code++ = 0x90010000
                                | ((-src_locs[i]+1) << 21)
                                | (dst_locs[i] + 4);
                    }

                    // stw src, dst_offset(r1)
                    *code++ = 0x90010000
                            | ((-src_locs[i]) << 21)
                            | (dst_locs[i] + 8);
                }
            }
            else
            {
                ASSERT(dst_locs[i] >= 0);
                ASSERT(typeString[i] != 'f' && typeString[i] != 'd');

                if(typeString[i] == 'l' || typeString[i] == 'L')
                {
                    // lwz r0, src_offset(r1)
                    *code++ = 0x80010000
                            | (src_locs[i] + frameSize + 8 + 4);
                    // stw r0, dst_offset(r1)
                    *code++ = 0x90010000
                            | (dst_locs[i] + 8 + 4);
                }
                // lwz r0, src_offset(r1)
                *code++ = 0x80010000
                        | (src_locs[i] + frameSize + 8);
                // stw r0, dst_offset(r1)
                *code++ = 0x90010000
                        | (dst_locs[i] + 8);
            }
        }

        // * hptr will be the new first argument.
        // lis r3, hi(hptr)
        *code++ = OP_HI(0x3c60, hptr);
        // ori r3,r3,lo(hptr)
        *code++ = OP_LO(0x6063, hptr);

        // * we need to return to a piece of code
        //   which will tear down the stack frame.
        // lis r11,hi(obscure_ccall_ret_code)
        *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
        // ori r11,r11,lo(obscure_ccall_ret_code)
        *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
        // mtlr r11
        *code++ = 0x7d6803a6;

        // * jump to wptr
        // lis r11,hi(wptr)
        *code++ = OP_HI(0x3d60, wptr);
        // ori r11,r11,lo(wptr)
        *code++ = OP_LO(0x616b, wptr);
        // mtctr r11
        *code++ = 0x7d6903a6;
        // bctr
        *code++ = 0x4e800420;

        // Flush the Instruction cache:
        {
            unsigned *p = adjustor;
            while(p < code)
            {
                __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
                                  : : "r" (p));
                p++;
            }
            __asm__ volatile ("sync\n\tisync");
        }
    }

#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)

#define OP_LO(op,lo)  ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
#define OP_HI(op,hi)  ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
    {
        /* The following code applies to all PowerPC and PowerPC64 platforms
           whose stack layout is based on the AIX ABI.

           Besides (obviously) AIX, this includes
            Mac OS 9 and BeOS/PPC (may they rest in peace),
                which use the 32-bit AIX ABI
            powerpc64-linux,
                which uses the 64-bit AIX ABI
            and Darwin (Mac OS X),
                which uses the same stack layout as AIX,
                but no function descriptors.

           The actual stack-frame shuffling is implemented out-of-line
           in the function adjustorCode, in AdjustorAsm.S.
           Here, we set up an AdjustorStub structure, which
           is a function descriptor (on platforms that have function
           descriptors) or a short piece of stub code (on Darwin) to call
           adjustorCode with a pointer to the AdjustorStub struct loaded
           into register r2.

           One nice thing about this is that there is _no_ code generated at
           runtime on the platforms that have function descriptors.
        */
        AdjustorStub *adjustorStub;
        int sz = 0, extra_sz, total_sz;

#ifdef FUNDESCS
        adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
#else
        adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
#endif
        adjustor = adjustorStub;

        adjustorStub->code = (void*) &adjustorCode;

#ifdef FUNDESCS
        // function descriptors are a cool idea.
        // We don't need to generate any code at runtime.
        adjustorStub->toc = adjustorStub;
#else

        // no function descriptors :-(
        // We need to do things "by hand".
#if defined(powerpc_HOST_ARCH)
        // lis  r2, hi(adjustorStub)
        adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
        // ori  r2, r2, lo(adjustorStub)
        adjustorStub->ori = OP_LO(0x6042, adjustorStub);
        // lwz  r0, code(r2)
        adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
                                        - (char*)adjustorStub);
        // mtctr r0
        adjustorStub->mtctr = 0x7c0903a6;
        // bctr
        adjustorStub->bctr = 0x4e800420;
#else
        barf("adjustor creation not supported on this platform");
#endif

        // Flush the Instruction cache:
        {
            int n = sizeof(AdjustorStub)/sizeof(unsigned);
            unsigned *p = (unsigned*)adjustor;
            while(n--)
            {
                __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
                                  : : "r" (p));
                p++;
            }
            __asm__ volatile ("sync\n\tisync");
        }
#endif

        // Calculate the size of the stack frame, in words.
        sz = totalArgumentSize(typeString);

        // The first eight words of the parameter area
        // are just "backing store" for the parameters passed in
        // the GPRs. extra_sz is the number of words beyond those first
        // 8 words.
        extra_sz = sz - 8;
        if(extra_sz < 0)
            extra_sz = 0;

        // Calculate the total size of the stack frame.
        total_sz = (6 /* linkage area */
                  + 8 /* minimum parameter area */
                  + 2 /* two extra arguments */
                  + extra_sz)*sizeof(StgWord);

        // align to 16 bytes.
        // AIX only requires 8 bytes, but who cares?
        total_sz = (total_sz+15) & ~0xF;

        // Fill in the information that adjustorCode in AdjustorAsm.S
        // will use to create a new stack frame with the additional args.
        adjustorStub->hptr = hptr;
        adjustorStub->wptr = wptr;
        adjustorStub->negative_framesize = -total_sz;
        adjustorStub->extrawords_plus_one = extra_sz + 1;
    }

#elif defined(ia64_HOST_ARCH)
/*
    Up to 8 inputs are passed in registers.  We flush the last two inputs to
    the stack, initially into the 16-byte scratch region left by the caller.
    We then shuffle the others along by 4 (taking 2 registers for ourselves
    to save return address and previous function state - we need to come back
    here on the way out to restore the stack, so this is a real function
    rather than just a trampoline).

    The function descriptor we create contains the gp of the target function
    so gp is already loaded correctly.

        [MLX]       alloc r16=ar.pfs,10,2,0
                    movl r17=wptr
        [MII]       st8.spill [r12]=r38,8               // spill in6 (out4)
                    mov r41=r37                         // out7 = in5 (out3)
                    mov r40=r36;;                       // out6 = in4 (out2)
        [MII]       st8.spill [r12]=r39                 // spill in7 (out5)
                    mov.sptk b6=r17,50
                    mov r38=r34;;                       // out4 = in2 (out0)
        [MII]       mov r39=r35                         // out5 = in3 (out1)
                    mov r37=r33                         // out3 = in1 (loc1)
                    mov r36=r32                         // out2 = in0 (loc0)
        [MLX]       adds r12=-24,r12                    // update sp
                    movl r34=hptr;;                     // out0 = hptr
        [MIB]       mov r33=r16                         // loc1 = ar.pfs
                    mov r32=b0                          // loc0 = retaddr
                    br.call.sptk.many b0=b6;;

        [MII]       adds r12=-16,r12
                    mov b0=r32
                    mov.i ar.pfs=r33
        [MFB]       nop.m 0x0
                    nop.f 0x0
                    br.ret.sptk.many b0;;
*/

/* These macros distribute a long constant into the two words of an MLX bundle */
#define BITS(val,start,count)   (((val) >> (start)) & ((1 << (count))-1))
#define MOVL_LOWORD(val)        (BITS(val,22,18) << 46)
#define MOVL_HIWORD(val)        ( (BITS(val,0,7)    << 36)      \
                                | (BITS(val,7,9)    << 50)      \
                                | (BITS(val,16,5)   << 45)      \
                                | (BITS(val,21,1)   << 44)      \
                                | (BITS(val,40,23))             \
                                | (BITS(val,63,1)   << 59))

        {
            StgStablePtr stable;
            IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
            StgWord64 wcode = wdesc->ip;
            IA64FunDesc *fdesc;
            StgWord64 *code;   /* NOTE(review): shadows the outer 'code' */

            /* we allocate on the Haskell heap since malloc'd memory isn't
             * executable - argh */
            /* Allocated memory is word-aligned (8 bytes) but functions on ia64
             * must be aligned to 16 bytes.  We allocate an extra 8 bytes of
             * wiggle room so that we can put the code on a 16 byte boundary. */
            adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);

            fdesc = (IA64FunDesc *)adjustor;
            code = (StgWord64 *)(fdesc + 1);
            /* add 8 bytes to code if needed to align to a 16-byte boundary */
            if ((StgWord64)code & 15) code++;
            fdesc->ip = (StgWord64)code;
            fdesc->gp = wdesc->gp;

            code[0]  = 0x0000058004288004 | MOVL_LOWORD(wcode);
            code[1]  = 0x6000000220000000 | MOVL_HIWORD(wcode);
            code[2]  = 0x029015d818984001;
            code[3]  = 0x8401200500420094;
            code[4]  = 0x886011d8189c0001;
            code[5]  = 0x84011004c00380c0;
            code[6]  = 0x0250210046013800;
            code[7]  = 0x8401000480420084;
            code[8]  = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
            code[9]  = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
            code[10] = 0x0200210020010811;
            code[11] = 0x1080006800006200;
            code[12] = 0x0000210018406000;
            code[13] = 0x00aa021000038005;
            code[14] = 0x000000010000001d;
            code[15] = 0x0084000880000200;

            /* save stable pointers in convenient form */
            code[16] = (StgWord64)hptr;
            code[17] = (StgWord64)stable;
        }
#else
        barf("adjustor creation not supported on this platform");
#endif
        break;

    default:
        ASSERT(0);
        break;
    }

    /* Have fun! */
    /* NOTE(review): 'code' is assigned via allocateExec(&code) on the
       i386/x86_64/sparc/alpha/Darwin-ppc paths, but the FUNDESCS, ia64 and
       powerpc-linux branches never write the *outer* 'code' (the latter two
       declare shadowing locals) — those paths look like they mean to return
       'adjustor'; confirm against allocateExec's users before relying on
       this on those platforms. */
    return code;
}
1052
1053
/* Free an adjustor thunk previously handed out by createAdjustor.
 *
 * ptr is the function pointer the foreign code was given.  Each
 * architecture branch first validates a "signature" -- the leading
 * machine-code bytes/words that createAdjustor emits -- and refuses
 * (with errorBelch, no crash) anything that does not look like one of
 * our own thunks.  It then frees the StgStablePtr embedded in the
 * thunk (releasing the Haskell closure) and finally releases the
 * executable memory itself via freeExec().
 *
 * The byte offsets used below mirror the layout written by the
 * corresponding createAdjustor branches; several of those branches are
 * earlier in this file, outside this excerpt, so the offsets are taken
 * on trust from the generator code.
 */
void
freeHaskellFunctionPtr(void* ptr)
{
#if defined(i386_HOST_ARCH)
 /* First byte distinguishes the two i386 thunk flavours:
  * 0xe8 (x86 CALL rel32) => ccall adjustor built around AdjustorStub;
  * 0x58 => the other (stdcall-style) code sequence. */
 if ( *(unsigned char*)ptr != 0xe8 &&
      *(unsigned char*)ptr != 0x58 ) {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }
 if (*(unsigned char*)ptr == 0xe8) { /* Aha, a ccall adjustor! */
     /* Stable pointer lives in a dedicated AdjustorStub field. */
     freeStablePtr(((AdjustorStub*)ptr)->hptr);
 } else {
    /* Stable pointer is the 32-bit immediate at code offset 0x02.
     * NOTE(review): offset matches the generator branch outside this
     * excerpt -- confirm against the i386 createAdjustor code. */
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
 }
#elif defined(x86_64_HOST_ARCH)
 /* Two x86_64 thunk flavours, told apart by the first two code bytes
  * (read as a little-endian StgWord16); the stable pointer is stored
  * as data after the code, at a flavour-specific fixed offset. */
 if ( *(StgWord16 *)ptr == 0x894d ) {
     freeStablePtr(*(StgStablePtr*)((StgWord8*)ptr+0x20));
 } else if ( *(StgWord16 *)ptr == 0x5141 ) {
     freeStablePtr(*(StgStablePtr*)((StgWord8*)ptr+0x30));
 } else {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }
#elif defined(sparc_HOST_ARCH)
 /* Signature: the first 32-bit instruction word of the thunk. */
 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }

 /* Free the stable pointer first..*/
 /* Stable pointer is stored as the 12th word of the thunk. */
 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
#elif defined(alpha_HOST_ARCH)
 /* Signature: the first two 32-bit instructions, read as one 64-bit
  * word. */
 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }

 /* Free the stable pointer first..*/
 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
 /* Signature: first instruction word (a PPC branch, 0x48000008). */
 if ( *(StgWord*)ptr != 0x48000008 ) {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }
 /* Stable pointer is the second word of the thunk. */
 freeStablePtr(((StgStablePtr*)ptr)[1]);
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
 /* On these targets the thunk is an AdjustorStub whose code field must
  * point at the shared adjustorCode entry point. */
 if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }
 freeStablePtr(((AdjustorStub*)ptr)->hptr);
#elif defined(ia64_HOST_ARCH)
 /* ptr is an IA64 function descriptor; a genuine adjustor's descriptor
  * points at the code bundle immediately following it (see the ia64
  * createAdjustor branch above). */
 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
 StgWord64 *code = (StgWord64 *)(fdesc+1);

 if (fdesc->ip != (StgWord64)code) {
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
   return;
 }
 /* code[16]/code[17] hold the two stable pointers saved at creation
  * time: the Haskell closure (hptr) and the handle of the heap
  * allocation itself. */
 freeStablePtr((StgStablePtr)code[16]);
 freeStablePtr((StgStablePtr)code[17]);
 /* Early return: the ia64 adjustor was allocated on the Haskell heap
  * with stgAllocStable, not via the executable allocator, so freeExec
  * must NOT be called on it. */
 return;
#else
 ASSERT(0);
#endif
 // Can't write to this memory, it is only executable:
 // *((unsigned char*)ptr) = '\0';

 freeExec(ptr);
}
1124
1125 #endif // !USE_LIBFFI_FOR_ADJUSTORS