diff --git a/.gitignore b/.gitignore
index 351d3f6a..da0d094f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,3 +57,5 @@ tcc-doc.info
 conftest*
 tiny_libmaker
 *.dSYM
+*~
+
diff --git a/arm-gen.c b/arm-gen.c
index 03262aa9..0f70062a 100644
--- a/arm-gen.c
+++ b/arm-gen.c
@@ -801,16 +801,21 @@ int assign_fpreg(struct avail_regs *avregs, int align, int size)
 #endif
 
 /* Return 1 if this function returns via an sret pointer, 0 otherwise */
-ST_FUNC int gfunc_sret(CType *vt, CType *ret, int *align) {
+ST_FUNC int gfunc_sret(CType *vt, CType *ret, int *ret_align) {
+#if TCC_ARM_EABI
+    int size, align;
     size = type_size(vt, &align);
     if (size > 4) {
         return 1;
     } else {
-        *align = 4;
+        *ret_align = 4;
         ret->ref = NULL;
         ret->t = VT_INT;
+        return 0; /* fits in 4 bytes: handed back as VT_INT instead of through sret */
     }
-    return 0;
+#else
+    return 1; /* always sret on this path; *ret and *ret_align are left unwritten */
+#endif /* TCC_ARM_EABI */
 }
 
 /* Generate function call. The function address is pushed first, then
diff --git a/c67-gen.c b/c67-gen.c
index abf25dfb..7d559c89 100644
--- a/c67-gen.c
+++ b/c67-gen.c
@@ -1879,6 +1879,12 @@ static void gcall_or_jmp(int is_jmp)
     }
 }
 
+/* Return 1 if this function returns via an sret pointer, 0 otherwise (this backend always returns 1) */
+ST_FUNC int gfunc_sret(CType *vt, CType *ret, int *ret_align) {
+    *ret_align = 1; /* Never have to re-align return values on C67; *ret is left unwritten */
+    return 1;
+}
+
 /* generate function call with address in (vtop->t, vtop->c) and free function
    context. Stack entry is popped */
 void gfunc_call(int nb_args)
diff --git a/tcc.c b/tcc.c
index 25330a12..b223d39f 100644
--- a/tcc.c
+++ b/tcc.c
@@ -69,7 +69,7 @@ static void help(void)
            "  -Bdir       use 'dir' as tcc internal library and include path\n"
            "  -MD         generate target dependencies for make\n"
            "  -MF depfile put generated dependencies here\n"
-           "  -norunsrc   Do not compile the file which is the first argument after -run."
+           "  -norunsrc   Do not compile the file which is the first argument after -run.\n"
            );
 }
 
diff --git a/tcc.h b/tcc.h
index f243ed03..b336e1f7 100644
--- a/tcc.h
+++ b/tcc.h
@@ -718,6 +718,8 @@ struct TCCState {
 #define VT_LLONG           12  /* 64 bit integer */
 #define VT_LONG            13  /* long integer (NEVER USED as type, only
                                   during parsing) */
+#define VT_QLONG           14  /* 128-bit integer. Only used for x86-64 ABI */
+#define VT_QFLOAT          15  /* 128-bit float. Only used for x86-64 ABI */
 #define VT_UNSIGNED    0x0010  /* unsigned type */
 #define VT_ARRAY       0x0020  /* array type (also has VT_PTR) */
 #define VT_BITFIELD    0x0040  /* bitfield modifier */
diff --git a/tccgen.c b/tccgen.c
index febab117..83ee171a 100644
--- a/tccgen.c
+++ b/tccgen.c
@@ -2495,21 +2495,29 @@ ST_FUNC void vstore(void)
                 vtop[-1].r = t | VT_LVAL;
             }
             store(r, vtop - 1);
-#ifndef TCC_TARGET_X86_64
-            /* two word case handling : store second register at word + 4 */
+            /* two word case handling : store second register at word + 4 (or +8 for x86-64)  */
+#ifdef TCC_TARGET_X86_64
+            if ((ft & VT_BTYPE) == VT_QLONG) {
+#else
             if ((ft & VT_BTYPE) == VT_LLONG) {
+#endif
                 vswap();
                 /* convert to int to increment easily */
+#ifdef TCC_TARGET_X86_64
+                vtop->type.t = VT_LLONG;
+                gaddrof();
+                vpushi(8);
+#else
                 vtop->type.t = VT_INT;
                 gaddrof();
                 vpushi(4);
+#endif
                 gen_op('+');
                 vtop->r |= VT_LVAL;
                 vswap();
                 /* XXX: it works because r2 is spilled last ! */
                 store(vtop->r2, vtop - 1);
             }
-#endif
         }
         vswap();
         vtop--; /* NOT vpop() because on x86 it would flush the fp stack */
diff --git a/tests/Makefile b/tests/Makefile
index ae5d47d4..48668e67 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -188,8 +188,8 @@ abitest-tcc$(EXESUF): abitest.c $(top_builddir)/$(LIBTCC)
 
 abitest: abitest-cc$(EXESUF) abitest-tcc$(EXESUF)
 	@echo ------------ $@ ------------
-	abitest-cc$(EXESUF) lib_path=..
-	abitest-tcc$(EXESUF) lib_path=..
+	./abitest-cc$(EXESUF) lib_path=..
+	./abitest-tcc$(EXESUF) lib_path=..
 
 # targets for development
 %.bin: %.c tcc
diff --git a/tests/abitest.c b/tests/abitest.c
index 4f966a4e..ff948d06 100644
--- a/tests/abitest.c
+++ b/tests/abitest.c
@@ -54,6 +54,7 @@ RET_PRIMITIVE_TEST(int, int)
 RET_PRIMITIVE_TEST(longlong, long long)
 RET_PRIMITIVE_TEST(float, float)
 RET_PRIMITIVE_TEST(double, double)
+RET_PRIMITIVE_TEST(longdouble, long double)
 
 typedef struct ret_2float_test_type_s {float x, y;} ret_2float_test_type;
 typedef ret_2float_test_type (*ret_2float_test_function_type) (ret_2float_test_type);
@@ -128,8 +129,52 @@ static int sret_test(void) {
   return run_callback(src, sret_test_callback);
 }
 
+typedef union one_member_union_test_type_u {int x;} one_member_union_test_type;
+typedef one_member_union_test_type (*one_member_union_test_function_type) (one_member_union_test_type);
+/* Calls f (received as ptr) with a.x = 34; returns 0 if the result's x is doubled, -1 otherwise */
+static int one_member_union_test_callback(void *ptr) {
+  one_member_union_test_function_type f = (one_member_union_test_function_type)ptr;
+  one_member_union_test_type a, b;
+  a.x = 34;
+  b = f(a);
+  return (b.x == a.x*2) ? 0 : -1;
+}
+
+static int one_member_union_test(void) { /* union with a single int member, passed and returned by value */
+  const char *src =
+  "typedef union one_member_union_test_type_u {int x;} one_member_union_test_type;\n"
+  "one_member_union_test_type f(one_member_union_test_type a) {\n"
+  "  one_member_union_test_type b;\n"
+  "  b.x = a.x * 2;\n"
+  "  return b;\n"
+  "}\n";
+  return run_callback(src, one_member_union_test_callback); /* src and checker are handed to run_callback */
+}
+
+typedef union two_member_union_test_type_u {int x; long y;} two_member_union_test_type;
+typedef two_member_union_test_type (*two_member_union_test_function_type) (two_member_union_test_type);
+/* Calls f (received as ptr) with a.x = 34; only x is set and checked. Returns 0 on success, -1 otherwise */
+static int two_member_union_test_callback(void *ptr) {
+  two_member_union_test_function_type f = (two_member_union_test_function_type)ptr;
+  two_member_union_test_type a, b;
+  a.x = 34;
+  b = f(a);
+  return (b.x == a.x*2) ? 0 : -1;
+}
+
+static int two_member_union_test(void) { /* union of int and long, passed and returned by value */
+  const char *src =
+  "typedef union two_member_union_test_type_u {int x; long y;} two_member_union_test_type;\n"
+  "two_member_union_test_type f(two_member_union_test_type a) {\n"
+  "  two_member_union_test_type b;\n"
+  "  b.x = a.x * 2;\n"
+  "  return b;\n"
+  "}\n";
+  return run_callback(src, two_member_union_test_callback); /* src and checker are handed to run_callback */
+}
+
 #define RUN_TEST(t) \
-  do { \
+  if (!testname || (strcmp(#t, testname) == 0)) { \
     fputs(#t "... ", stdout); \
     fflush(stdout); \
     if (t() == 0) { \
@@ -138,20 +183,30 @@ static int sret_test(void) {
       fputs("failure\n", stdout); \
       retval = EXIT_FAILURE; \
     } \
-  } while (0);
+  }
 
 int main(int argc, char **argv) {
+  int i;
+  const char *testname = NULL;
   int retval = EXIT_SUCCESS;
+  /* accepted arguments: lib_path=DIR and run_test=NAME (run only the test called NAME) */
   /* if tcclib.h and libtcc1.a are not installed, where can we find them */
-  if (argc == 2 && !memcmp(argv[1], "lib_path=",9))
-    tccdir = argv[1] + 9;
+  for (i = 1; i < argc; ++i) {
+    if (!strncmp(argv[i], "lib_path=", 9))
+      tccdir = argv[i] + 9;
+    else if (!strncmp(argv[i], "run_test=", 9))
+      testname = argv[i] + 9;
+  }
 
   RUN_TEST(ret_int_test);
   RUN_TEST(ret_longlong_test);
   RUN_TEST(ret_float_test);
   RUN_TEST(ret_double_test);
+  RUN_TEST(ret_longdouble_test);
   RUN_TEST(ret_2float_test);
   RUN_TEST(reg_pack_test);
   RUN_TEST(sret_test);
+  RUN_TEST(one_member_union_test);
+  RUN_TEST(two_member_union_test);
   return retval;
 }
diff --git a/x86_64-gen.c b/x86_64-gen.c
index f85cd01b..d9873424 100644
--- a/x86_64-gen.c
+++ b/x86_64-gen.c
@@ -602,6 +602,12 @@ void gen_offs_sp(int b, int r, int d)
     }
 }
 
+/* Return 1 if this function returns via an sret pointer, 0 otherwise (this variant always returns 1) */
+ST_FUNC int gfunc_sret(CType *vt, CType *ret, int *ret_align) {
+    *ret_align = 1; /* Never have to re-align return values for x86-64; *ret is left unwritten */
+    return 1;
+}
+
 void gfunc_call(int nb_args)
 {
     int size, align, r, args_size, i, d, j, bt, struct_size;
@@ -817,6 +823,139 @@ static void gadd_sp(int val)
     }
 }
 
+typedef enum X86_64_Mode {
+  x86_64_mode_none,    /* nothing to pass (void); counts as zero registers */
+  x86_64_mode_memory,  /* passed on the stack; as a return type it selects the sret path */
+  x86_64_mode_integer, /* passed in general-purpose registers (see arg_regs) */
+  x86_64_mode_sse,     /* passed in SSE (xmm) registers */
+  x86_64_mode_x87      /* long double; loaded through RC_ST0 and stored on the stack */
+} X86_64_Mode;
+
+static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b) {
+    /* Equal classes, or nothing on the right-hand side: keep the left one. */
+    if (a == b || b == x86_64_mode_none)
+        return a;
+    if (a == x86_64_mode_none)
+        return b;
+    /* From here on the two classes differ and neither is 'none'. */
+    if (a == x86_64_mode_memory || b == x86_64_mode_memory)
+        return x86_64_mode_memory;
+    if (a == x86_64_mode_integer || b == x86_64_mode_integer)
+        return x86_64_mode_integer;
+    /* Only sse and x87 remain, so a mix involving x87 ends up in memory. */
+    if (a == x86_64_mode_x87 || b == x86_64_mode_x87)
+        return x86_64_mode_memory;
+    return x86_64_mode_sse;
+}
+
+/* Classify one type into an X86_64_Mode, recursing into the entries of a struct.
+   No size limit is applied here; classify_x86_64_arg does that before calling in. */
+static X86_64_Mode classify_x86_64_inner(CType *ty) {
+    X86_64_Mode mode;
+    Sym *f;
+
+    /* bitfields are always classified as memory */
+    if (ty->t & VT_BITFIELD)
+        return x86_64_mode_memory;
+
+    switch (ty->t & VT_BTYPE) {
+    case VT_VOID: return x86_64_mode_none;
+    case VT_INT:
+    case VT_BYTE:
+    case VT_SHORT:
+    case VT_LLONG:
+    case VT_BOOL:
+    case VT_PTR:
+    case VT_ENUM: return x86_64_mode_integer;
+    case VT_FLOAT:
+    case VT_DOUBLE: return x86_64_mode_sse;
+    case VT_LDOUBLE: return x86_64_mode_x87;
+    case VT_STRUCT:
+        f = ty->ref;
+        /* Detect union: the first two list entries carry the same 'c' value.
+           NOTE(review): assumes Sym.c is the member offset for these entries - confirm. */
+        if (f->next && (f->c == f->next->c))
+            return x86_64_mode_memory;
+        /* fold the class of every list entry into a single class */
+        mode = x86_64_mode_none;
+        for (; f; f = f->next)
+            mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
+        return mode;
+    }
+    /* Base type not listed above: never fall off the end, use the memory class. */
+    return x86_64_mode_memory;
+}
+
+/* Classify an argument type; *psize gets its size rounded up to 8. reg_count may be NULL (unwritten if size > 16). */
+static X86_64_Mode classify_x86_64_arg(CType *ty, int *psize, int *reg_count) {
+    X86_64_Mode mode;
+    int size, align;
+
+    if (ty->t & VT_ARRAY) {
+        *psize = 8;
+        if (reg_count)
+            *reg_count = 1;
+        return x86_64_mode_integer;
+    }
+    size = type_size(ty, &align);
+    size = (size + 7) & ~7;
+    *psize = size;
+    if (size > 16)
+        return x86_64_mode_memory;
+    mode = classify_x86_64_inner(ty);
+    if (reg_count) {
+        if (mode == x86_64_mode_integer)
+            *reg_count = size / 8;
+        else if (mode == x86_64_mode_none)
+            *reg_count = 0;
+        else
+            *reg_count = 1;
+    }
+    return mode;
+}
+
+/* Classify vt with classify_x86_64_arg and store a stand-in scalar type in *ret: INT/LLONG/QLONG,
+   FLOAT/DOUBLE/QFLOAT or LDOUBLE. ret->ref is cleared; ret->t is left unwritten for other modes. */
+static X86_64_Mode classify_x86_64_arg_type(CType *vt, CType *ret, int *psize, int *reg_count) {
+    X86_64_Mode mode;
+    int size;
+    
+    ret->ref = NULL;
+    mode = classify_x86_64_arg(vt, &size, reg_count);
+    *psize = size;
+    switch (mode) {
+    case x86_64_mode_integer:
+        if (size > 8)
+            ret->t = VT_QLONG;
+        else if (size > 4)
+            ret->t = VT_LLONG;
+        else
+            ret->t = VT_INT;
+        break;
+        
+    case x86_64_mode_x87:
+        ret->t = VT_LDOUBLE;
+        break;
+
+    case x86_64_mode_sse:
+        if (size > 8)
+            ret->t = VT_QFLOAT;
+        else if (size > 4)
+            ret->t = VT_DOUBLE;
+        else
+            ret->t = VT_FLOAT;
+        break;
+    }
+    return mode;
+}
+
+/* Return 1 if this function returns via an sret pointer (type classified as memory), 0 otherwise */
+ST_FUNC int gfunc_sret(CType *vt, CType *ret, int *ret_align) {
+    int size, reg_count;
+    *ret_align = 1; /* Never have to re-align return values for x86-64 */
+    return (classify_x86_64_arg_type(vt, ret, &size, &reg_count) == x86_64_mode_memory);
+}
+
 #define REGN 6
 static const uint8_t arg_regs[REGN] = {
     TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
@@ -827,7 +966,9 @@ static const uint8_t arg_regs[REGN] = {
    parameters and the function address. */
 void gfunc_call(int nb_args)
 {
-    int size, align, r, args_size, i;
+    X86_64_Mode mode;
+    CType type;
+    int size, align, r, args_size, i, j, reg_count;
     int nb_reg_args = 0;
     int nb_sse_args = 0;
     int sse_reg, gen_reg;
@@ -835,17 +976,22 @@ void gfunc_call(int nb_args)
     /* calculate the number of integer/float arguments */
     args_size = 0;
     for(i = 0; i < nb_args; i++) {
-        if ((vtop[-i].type.t & VT_BTYPE) == VT_STRUCT) {
-            args_size += type_size(&vtop[-i].type, &align);
-            args_size = (args_size + 7) & ~7;
-        } else if ((vtop[-i].type.t & VT_BTYPE) == VT_LDOUBLE) {
-            args_size += 16;
-        } else if (is_sse_float(vtop[-i].type.t)) {
-            nb_sse_args++;
-            if (nb_sse_args > 8) args_size += 8;
-        } else {
-            nb_reg_args++;
-            if (nb_reg_args > REGN) args_size += 8;
+        mode = classify_x86_64_arg(&vtop[-i].type, &size, &reg_count);
+        switch (mode) {
+        case x86_64_mode_memory:
+        case x86_64_mode_x87:
+            args_size += size;
+            break;
+            
+        case x86_64_mode_sse:
+            nb_sse_args += reg_count;
+            if (nb_sse_args > 8) args_size += size;
+            break;
+        
+        case x86_64_mode_integer:
+            nb_reg_args += reg_count;
+            if (nb_reg_args > REGN) args_size += size;
+            break;
         }
     }
 
@@ -875,10 +1021,9 @@ void gfunc_call(int nb_args)
         SValue tmp = vtop[0];
 	vtop[0] = vtop[-i];
 	vtop[-i] = tmp;
-        if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
-            size = type_size(&vtop->type, &align);
-            /* align to stack align size */
-            size = (size + 7) & ~7;
+        mode = classify_x86_64_arg(&vtop->type, &size, &reg_count);
+        switch (mode) {
+        case x86_64_mode_memory:
             /* allocate the necessary size on stack */
             o(0x48);
             oad(0xec81, size); /* sub $xxx, %rsp */
@@ -890,7 +1035,9 @@ void gfunc_call(int nb_args)
 	    vswap();
 	    vstore();
             args_size += size;
-        } else if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
+            break;
+            
+        case x86_64_mode_x87:
             gv(RC_ST0);
             size = LDOUBLE_SIZE;
             oad(0xec8148, size); /* sub $xxx, %rsp */
@@ -898,25 +1045,30 @@ void gfunc_call(int nb_args)
             g(0x24);
             g(0x00);
             args_size += size;
-        } else if (is_sse_float(vtop->type.t)) {
-            int j = --sse_reg;
-            if (j >= 8) {
+            break;
+            
+        case x86_64_mode_sse:
+            if (sse_reg > 8) {
                 gv(RC_FLOAT);
                 o(0x50); /* push $rax */
                 /* movq %xmm0, (%rsp) */
                 o(0x04d60f66);
                 o(0x24);
-                args_size += 8;
+                args_size += size;
             }
-        } else {
-            int j = --gen_reg;
+            sse_reg -= reg_count;
+            break;
+            
+        case x86_64_mode_integer:
             /* simple type */
             /* XXX: implicit cast ? */
-            if (j >= REGN) {
+            if (gen_reg > REGN) {
                 r = gv(RC_INT);
                 orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
-                args_size += 8;
+                args_size += size;
             }
+            gen_reg -= reg_count;
+            break;
         }
 
 	/* And swap the argument back to it's original position.  */
@@ -935,29 +1087,45 @@ void gfunc_call(int nb_args)
     gen_reg = nb_reg_args;
     sse_reg = nb_sse_args;
     for(i = 0; i < nb_args; i++) {
-        if ((vtop->type.t & VT_BTYPE) == VT_STRUCT ||
-            (vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
-        } else if (is_sse_float(vtop->type.t)) {
-            int j = --sse_reg;
-            if (j < 8) {
-                gv(RC_FLOAT); /* only one float register */
-                /* movaps %xmm0, %xmmN */
-                o(0x280f);
-                o(0xc0 + (sse_reg << 3));
+        mode = classify_x86_64_arg_type(&vtop->type, &type, &size, &reg_count);
+        /* Alter stack entry type so that gv() knows how to treat it */
+        vtop->type = type;
+        switch (mode) {
+        default:
+            break;
+            
+        case x86_64_mode_sse:
+            if (sse_reg > 8) {
+                sse_reg -= reg_count;
+            } else {
+                for (j = 0; j < reg_count; ++j) {
+                    --sse_reg;
+                    gv(RC_FLOAT); /* only one float register */
+                    /* movaps %xmm0, %xmmN */
+                    o(0x280f);
+                    o(0xc0 + (sse_reg << 3));
+                }
             }
-        } else {
-            int j = --gen_reg;
+            break;
+        
+        case x86_64_mode_integer:
             /* simple type */
             /* XXX: implicit cast ? */
-            if (j < REGN) {
-                int d = arg_regs[j];
-                r = gv(RC_INT);
-                if (j == 2 || j == 3)
-                    /* j=2: r10, j=3: r11 */
-                    d = j + 8;
-                orex(1,d,r,0x89); /* mov */
-                o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
+            /* arguments beyond the REGN integer registers were pushed earlier; arg_regs[] has REGN entries */
+            if (gen_reg > REGN) {
+                gen_reg -= reg_count;
+            } else {
+                for (j = 0; j < reg_count; ++j) {
+                    int d = arg_regs[--gen_reg];
+                    r = gv(RC_INT);
+                    if (gen_reg == 2 || gen_reg == 3)
+                        /* gen_reg=2: r10, gen_reg=3: r11 */
+                        d = gen_reg + 8;
+                    orex(1,d,r,0x89); /* mov */
+                    o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
+                }
             }
+            break;
         }
         vtop--;
     }
@@ -994,7 +1162,8 @@ static void push_arg_reg(int i) {
 /* generate function prolog of type 't' */
 void gfunc_prolog(CType *func_type)
 {
-    int i, addr, align, size;
+    X86_64_Mode mode;
+    int i, addr, align, size, reg_count;
     int param_index, param_addr, reg_param_index, sse_param_index;
     Sym *sym;
     CType *type;
@@ -1070,7 +1239,8 @@ void gfunc_prolog(CType *func_type)
     /* if the function returns a structure, then add an
        implicit pointer parameter */
     func_vt = sym->type;
-    if ((func_vt.t & VT_BTYPE) == VT_STRUCT) {
+    mode = classify_x86_64_arg(&func_vt, &size, &reg_count);
+    if (mode == x86_64_mode_memory) {
         push_arg_reg(reg_param_index);
         param_addr = loc;
 
@@ -1081,35 +1251,46 @@ void gfunc_prolog(CType *func_type)
     /* define parameters */
     while ((sym = sym->next) != NULL) {
         type = &sym->type;
-        size = type_size(type, &align);
-        size = (size + 7) & ~7;
-        if (is_sse_float(type->t)) {
-            if (sse_param_index < 8) {
+        mode = classify_x86_64_arg(type, &size, &reg_count);
+        switch (mode) {
+        case x86_64_mode_sse:
+            if (sse_param_index + reg_count <= 8) {
                 /* save arguments passed by register */
-                loc -= 8;
-                o(0xd60f66); /* movq */
-                gen_modrm(sse_param_index, VT_LOCAL, NULL, loc);
+                for (i = 0; i < reg_count; ++i) {
+                    loc -= 8;
+                    o(0xd60f66); /* movq */
+                    gen_modrm(sse_param_index, VT_LOCAL, NULL, loc);
+                    ++sse_param_index;
+                }
                 param_addr = loc;
             } else {
                 param_addr = addr;
                 addr += size;
+                sse_param_index += reg_count;
             }
-            sse_param_index++;
-
-        } else if ((type->t & VT_BTYPE) == VT_STRUCT ||
-                   (type->t & VT_BTYPE) == VT_LDOUBLE) {
+            break;
+            
+        case x86_64_mode_memory:
+        case x86_64_mode_x87:
             param_addr = addr;
             addr += size;
-        } else {
-            if (reg_param_index < REGN) {
+            break;
+            
+        case x86_64_mode_integer: {
+            if (reg_param_index + reg_count <= REGN) {
                 /* save arguments passed by register */
-                push_arg_reg(reg_param_index);
+                for (i = 0; i < reg_count; ++i) {
+                    push_arg_reg(reg_param_index);
+                    ++reg_param_index;
+                }
                 param_addr = loc;
             } else {
                 param_addr = addr;
-                addr += 8;
+                addr += size;
+                reg_param_index += reg_count;
             }
-            reg_param_index++;
+            break;
+        }
         }
         sym_push(sym->v & ~SYM_FIELD, type,
                  VT_LOCAL | VT_LVAL, param_addr);