Fixed x86-64 long double passing.

long double arguments require 16-byte alignment on the stack, which
requires adjustment when the stack offset is not an even number of
8-byte words.
This commit is contained in:
James Lyon 2013-04-26 16:42:12 +01:00
parent 41d76e1fcb
commit 6ee366e765
4 changed files with 206 additions and 123 deletions

View File

@ -7,13 +7,13 @@
typedef void *va_list;
va_list __va_start(void *fp);
void *__va_arg(va_list ap, int arg_type, int size);
void *__va_arg(va_list ap, int arg_type, int size, int align);
va_list __va_copy(va_list src);
void __va_end(va_list ap);
#define va_start(ap, last) ((ap) = __va_start(__builtin_frame_address(0)))
#define va_arg(ap, type) \
(*(type *)(__va_arg(ap, __builtin_va_arg_types(type), sizeof(type))))
(*(type *)(__va_arg(ap, __builtin_va_arg_types(type), sizeof(type), __alignof__(type))))
#define va_copy(dest, src) ((dest) = __va_copy(src))
#define va_end(ap) __va_end(ap)

View File

@ -645,9 +645,10 @@ void *__va_start(void *fp)
void *__va_arg(struct __va_list_struct *ap,
enum __va_arg_type arg_type,
int size)
int size, int align)
{
size = (size + 7) & ~7;
align = (align + 7) & ~7;
switch (arg_type) {
case __va_gen_reg:
if (ap->gp_offset < 48) {
@ -668,6 +669,7 @@ void *__va_arg(struct __va_list_struct *ap,
case __va_stack:
use_overflow_area:
ap->overflow_arg_area += size;
ap->overflow_arg_area = (char*)((long long)(ap->overflow_arg_area + align - 1) & -(long long)align);
return ap->overflow_arg_area - size;
default:

View File

@ -389,6 +389,24 @@ static int stdarg_struct_test(void) {
return run_callback(src, stdarg_struct_test_callback);
}
/* Test that x86-64 arranges the stack correctly for arguments with alignment >8 bytes */
typedef LONG_DOUBLE (*arg_align_test_callback_type) (LONG_DOUBLE,int,LONG_DOUBLE,int,LONG_DOUBLE);
static int arg_align_test_callback(void *ptr) {
arg_align_test_callback_type f = (arg_align_test_callback_type)ptr;
long double x = f(12, 0, 25, 0, 37);
return (x == 74) ? 0 : -1;
}
static int arg_align_test(void) {
const char *src =
"long double f(long double a, int b, long double c, int d, long double e) {\n"
" return a + c + e;\n"
"}\n";
return run_callback(src, arg_align_test_callback);
}
#define RUN_TEST(t) \
if (!testname || (strcmp(#t, testname) == 0)) { \
fputs(#t "... ", stdout); \
@ -432,5 +450,6 @@ int main(int argc, char **argv) {
RUN_TEST(many_struct_test_2);
RUN_TEST(stdarg_test);
RUN_TEST(stdarg_struct_test);
RUN_TEST(arg_align_test);
return retval;
}

View File

@ -96,9 +96,9 @@ enum {
/* long double size and alignment, in bytes */
#define LDOUBLE_SIZE 16
#define LDOUBLE_ALIGN 8
#define LDOUBLE_ALIGN 16
/* maximum alignment (for aligned attribute support) */
#define MAX_ALIGN 8
#define MAX_ALIGN 16
/******************************************************/
/* ELF defines */
@ -983,7 +983,7 @@ static X86_64_Mode classify_x86_64_inner(CType *ty) {
assert(0);
}
static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *reg_count) {
static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count) {
X86_64_Mode mode;
int size, align, ret_t;
@ -995,6 +995,7 @@ static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *r
} else {
size = type_size(ty, &align);
*psize = (size + 7) & ~7;
*palign = (align + 7) & ~7;
if (size > 16) {
mode = x86_64_mode_memory;
@ -1042,8 +1043,8 @@ ST_FUNC int classify_x86_64_va_arg(CType *ty) {
enum __va_arg_type {
__va_gen_reg, __va_float_reg, __va_stack
};
int size, reg_count;
X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &reg_count);
int size, align, reg_count;
X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, &reg_count);
switch (mode) {
default: return __va_stack;
case x86_64_mode_integer: return __va_gen_reg;
@ -1053,9 +1054,9 @@ ST_FUNC int classify_x86_64_va_arg(CType *ty) {
/* Return 1 if this function returns via an sret pointer, 0 otherwise */
int gfunc_sret(CType *vt, CType *ret, int *ret_align) {
int size, reg_count;
int size, align, reg_count;
*ret_align = 1; // Never have to re-align return values for x86-64
return (classify_x86_64_arg(vt, ret, &size, &reg_count) == x86_64_mode_memory);
return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) == x86_64_mode_memory);
}
#define REGN 6
@ -1078,51 +1079,75 @@ void gfunc_call(int nb_args)
{
X86_64_Mode mode;
CType type;
int size, align, r, args_size, i, j, reg_count;
int size, align, r, args_size, stack_adjust, run_start, run_end, i, j, reg_count;
int nb_reg_args = 0;
int nb_sse_args = 0;
int sse_reg, gen_reg;
/* calculate the number of integer/float arguments */
args_size = 0;
/* calculate the number of integer/float register arguments */
for(i = 0; i < nb_args; i++) {
mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &reg_count);
switch (mode) {
case x86_64_mode_memory:
case x86_64_mode_x87:
args_size += size;
break;
case x86_64_mode_sse:
mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
if (mode == x86_64_mode_sse)
nb_sse_args += reg_count;
if (nb_sse_args > 8) args_size += size;
break;
case x86_64_mode_integer:
else if (mode == x86_64_mode_integer)
nb_reg_args += reg_count;
if (nb_reg_args > REGN) args_size += size;
break;
}
}
/* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
and ended by a 16-byte aligned argument. This is because, from the point of view of
the callee, argument alignment is computed from the bottom up. */
/* for struct arguments, we need to call memcpy and the function
call breaks register passing arguments we are preparing.
So, we process arguments which will be passed by stack first. */
gen_reg = nb_reg_args;
sse_reg = nb_sse_args;
run_start = 0;
args_size = 0;
while (run_start != nb_args) {
int run_gen_reg = gen_reg, run_sse_reg = sse_reg;
run_end = nb_args;
stack_adjust = 0;
for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
switch (mode) {
case x86_64_mode_memory:
case x86_64_mode_x87:
stack_arg:
if (align == 16)
run_end = i;
else
stack_adjust += size;
break;
case x86_64_mode_sse:
sse_reg -= reg_count;
if (sse_reg + reg_count > 8) goto stack_arg;
break;
case x86_64_mode_integer:
gen_reg -= reg_count;
if (gen_reg + reg_count > REGN) goto stack_arg;
break;
}
}
gen_reg = run_gen_reg;
sse_reg = run_sse_reg;
/* adjust stack to align SSE boundary */
if (args_size &= 15) {
if (stack_adjust &= 15) {
/* fetch cpu flag before the following sub will change the value */
if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
gv(RC_INT);
args_size = 16 - args_size;
stack_adjust = 16 - stack_adjust;
o(0x48);
oad(0xec81, args_size); /* sub $xxx, %rsp */
oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
args_size += stack_adjust;
}
for(i = 0; i < nb_args;) {
for(i = run_start; i < run_end;) {
/* Swap argument to top, it will possibly be changed here,
and might use more temps. At the end of the loop we keep
in on the stack and swap it back to its original position
@ -1131,7 +1156,7 @@ void gfunc_call(int nb_args)
vtop[0] = vtop[-i];
vtop[-i] = tmp;
mode = classify_x86_64_arg(&vtop->type, NULL, &size, &reg_count);
mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);
int arg_stored = 1;
switch (vtop->type.t & VT_BTYPE) {
@ -1164,13 +1189,7 @@ void gfunc_call(int nb_args)
break;
case VT_LDOUBLE:
gv(RC_ST0);
size = LDOUBLE_SIZE;
oad(0xec8148, size); /* sub $xxx, %rsp */
o(0x7cdb); /* fstpt 0(%rsp) */
g(0x24);
g(0x00);
args_size += size;
assert(0);
break;
case VT_FLOAT:
@ -1212,14 +1231,53 @@ void gfunc_call(int nb_args)
if (arg_stored) {
vrotb(i+1);
assert(vtop->type.t == tmp.type.t);
assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
vpop();
--nb_args;
--run_end;
} else {
++i;
}
}
/* handle 16 byte aligned arguments at end of run */
run_start = i = run_end;
while (i < nb_args) {
/* Rotate argument to top since it will always be popped */
mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
if (align != 16)
break;
vrotb(i+1);
if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
gv(RC_ST0);
oad(0xec8148, size); /* sub $xxx, %rsp */
o(0x7cdb); /* fstpt 0(%rsp) */
g(0x24);
g(0x00);
args_size += size;
} else {
assert(mode == x86_64_mode_memory);
/* allocate the necessary size on stack */
o(0x48);
oad(0xec81, size); /* sub $xxx, %rsp */
/* generate structure store */
r = get_reg(RC_INT);
orex(1, r, 0, 0x89); /* mov %rsp, r */
o(0xe0 + REG_VALUE(r));
vset(&vtop->type, r | VT_LVAL, 0);
vswap();
vstore();
args_size += size;
}
vpop();
--nb_args;
}
}
/* XXX This should be superfluous. */
save_regs(0); /* save used temporary registers */
@ -1230,7 +1288,7 @@ void gfunc_call(int nb_args)
assert(gen_reg <= REGN);
assert(sse_reg <= 8);
for(i = 0; i < nb_args; i++) {
mode = classify_x86_64_arg(&vtop->type, &type, &size, &reg_count);
mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
/* Alter stack entry type so that gv() knows how to treat it */
vtop->type = type;
if (mode == x86_64_mode_sse) {
@ -1324,10 +1382,11 @@ void gfunc_prolog(CType *func_type)
sym = func_type->ref;
while ((sym = sym->next) != NULL) {
type = &sym->type;
mode = classify_x86_64_arg(type, NULL, &size, &reg_count);
mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
switch (mode) {
default:
seen_stack_size += size;
stack_arg:
seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
break;
case x86_64_mode_integer:
@ -1335,7 +1394,7 @@ void gfunc_prolog(CType *func_type)
seen_reg_num += reg_count;
} else {
seen_reg_num = 8;
seen_stack_size += size;
goto stack_arg;
}
break;
@ -1344,7 +1403,7 @@ void gfunc_prolog(CType *func_type)
seen_sse_num += reg_count;
} else {
seen_sse_num = 8;
seen_stack_size += size;
goto stack_arg;
}
break;
}
@ -1383,7 +1442,7 @@ void gfunc_prolog(CType *func_type)
/* if the function returns a structure, then add an
implicit pointer parameter */
func_vt = sym->type;
mode = classify_x86_64_arg(&func_vt, NULL, &size, &reg_count);
mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
if (mode == x86_64_mode_memory) {
push_arg_reg(reg_param_index);
func_vc = loc;
@ -1392,7 +1451,7 @@ void gfunc_prolog(CType *func_type)
/* define parameters */
while ((sym = sym->next) != NULL) {
type = &sym->type;
mode = classify_x86_64_arg(type, NULL, &size, &reg_count);
mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
switch (mode) {
case x86_64_mode_sse:
if (sse_param_index + reg_count <= 8) {
@ -1405,6 +1464,7 @@ void gfunc_prolog(CType *func_type)
++sse_param_index;
}
} else {
addr = (addr + align - 1) & -align;
param_addr = addr;
addr += size;
sse_param_index += reg_count;
@ -1413,6 +1473,7 @@ void gfunc_prolog(CType *func_type)
case x86_64_mode_memory:
case x86_64_mode_x87:
addr = (addr + align - 1) & -align;
param_addr = addr;
addr += size;
break;
@ -1427,6 +1488,7 @@ void gfunc_prolog(CType *func_type)
++reg_param_index;
}
} else {
addr = (addr + align - 1) & -align;
param_addr = addr;
addr += size;
reg_param_index += reg_count;