diff --git a/i386-asm.c b/i386-asm.c
index 10926fef..23e1fbbb 100644
--- a/i386-asm.c
+++ b/i386-asm.c
@@ -1207,7 +1207,7 @@ ST_FUNC int asm_parse_regvar (int t)
     s = table_ident[t - TOK_IDENT]->str;
     if (s[0] != '%')
         return -1;
-    t = tok_alloc(s+1, strlen(s)-1)->tok;
+    t = tok_alloc_const(s + 1);
     unget_tok(t);
     unget_tok('%');
     parse_operand(tcc_state, &op);
@@ -1488,7 +1488,7 @@ ST_FUNC void subst_asm_operand(CString *add_str,
 		   in the C symbol table when later looking up
 		   this name.  So enter them now into the asm label
 		   list when we still know the symbol.  */
-		get_asm_sym(tok_alloc(name, strlen(name))->tok, sv->sym);
+		get_asm_sym(tok_alloc_const(name), sv->sym);
 	    }
             if (tcc_state->leading_underscore)
               cstr_ccat(add_str, '_');
@@ -1698,7 +1698,6 @@ ST_FUNC void asm_gen_code(ASMOperand *operands, int nb_operands,
 ST_FUNC void asm_clobber(uint8_t *clobber_regs, const char *str)
 {
     int reg;
-    TokenSym *ts;
 #ifdef TCC_TARGET_X86_64
     unsigned int type;
 #endif
@@ -1707,8 +1706,7 @@ ST_FUNC void asm_clobber(uint8_t *clobber_regs, const char *str)
         !strcmp(str, "cc") ||
 	!strcmp(str, "flags"))
         return;
-    ts = tok_alloc(str, strlen(str));
-    reg = ts->tok;
+    reg = tok_alloc_const(str);
     if (reg >= TOK_ASM_eax && reg <= TOK_ASM_edi) {
         reg -= TOK_ASM_eax;
     } else if (reg >= TOK_ASM_ax && reg <= TOK_ASM_di) {
diff --git a/i386-gen.c b/i386-gen.c
index d83418ef..f34072dd 100644
--- a/i386-gen.c
+++ b/i386-gen.c
@@ -423,26 +423,20 @@ ST_FUNC void gfunc_call(int nb_args)
             size = (size + 3) & ~3;
             /* allocate the necessary size on stack */
 #ifdef TCC_TARGET_PE
-            if (size >= 0x4096) {
-                /* cannot call alloca with bound checking. Do stack probing. */
-                o(0x50);               // push %eax
-                oad(0xb8, size - 4);   // mov size-4,%eax
-                oad(0x3d, 4096);       // p1: cmp $4096,%eax
-                o(0x1476);             // jbe <p2>
-                oad(0x248485,-4096);   // test %eax,-4096(%esp)
-                oad(0xec81, 4096);     // sub $4096,%esp
-                oad(0x2d, 4096);       // sub $4096,%eax
-                o(0xe5eb);             // jmp <p1>
-                o(0xc429);             // p2: sub %eax,%esp
-                oad(0xc481, size - 4); // add size-4,%esp
-                o(0x58);               // pop %eax
-            }
+            if (size >= 4096) {
+                r = get_reg(RC_EAX);
+                oad(0x68, size); // push size
+                /* cannot call normal 'alloca' with bound checking */
+                gen_static_call(tok_alloc_const("__alloca"));
+                gadd_sp(4);
+            } else
 #endif
-            oad(0xec81, size); /* sub $xxx, %esp */
-            /* generate structure store */
-            r = get_reg(RC_INT);
-            o(0x89); /* mov %esp, r */
-            o(0xe0 + r);
+            {
+                oad(0xec81, size); /* sub $xxx, %esp */
+                /* generate structure store */
+                r = get_reg(RC_INT);
+                o(0xe089 + (r << 8)); /* mov %esp, r */
+            }
             vset(&vtop->type, r | VT_LVAL, 0);
             vswap();
             vstore();
@@ -844,6 +838,12 @@ ST_FUNC void gen_opf(int op)
 {
     int a, ft, fc, swapped, r;
 
+    if (op == TOK_NEG) { /* unary minus */
+        gv(RC_FLOAT);
+        o(0xe0d9); /* fchs */
+        return;
+    }
+
     /* convert constants to memory references */
     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
         vswap();
diff --git a/lib/alloca86.S b/lib/alloca86.S
index bdc73911..426c67da 100644
--- a/lib/alloca86.S
+++ b/lib/alloca86.S
@@ -7,30 +7,30 @@
 # define _(s) s
 #endif
 
-.globl _(alloca)
+.globl _(alloca), _(__alloca)
 _(alloca):
-    pop     %edx
-    pop     %eax
+_(__alloca): /* second name for the same entry point */
+    push    %ebp /* save the caller frame pointer */
+    mov     %esp,%ebp /* 0(%ebp)=saved ebp, 4(%ebp)=return address, 8(%ebp)=size argument */
+    mov     8(%ebp),%eax /* eax = requested size, rounded up to a multiple of 4 below */
     add     $3,%eax
     and     $-4,%eax
-    jz      p3
-
 #ifdef _WIN32
+    jmp     .+16 #p2
 p1:
-    cmp     $4096,%eax
-    jbe     p2
-    test    %eax,-4096(%esp)
     sub     $4096,%esp
     sub     $4096,%eax
-    jmp p1
+    test    %eax,(%esp) /* read the new stack top (stack probe) */
 p2:
+    cmp     $4096,%eax /* run p1 once per 4096 bytes still to allocate */
+    jae     p1
 #endif
-
     sub     %eax,%esp
-    mov     %esp,%eax
-p3:
-    push    %edx
-    push    %edx
+    mov     4(%ebp),%eax /* eax = return address */
+    mov     0(%ebp),%ebp /* restore the caller frame pointer */
+    add     $8,%esp /* reclaim the 8 bytes held by the saved ebp and the old return address */
+    push    %eax /* return address goes on the new stack top */
+    lea     8(%esp),%eax /* result = esp after ret (+4) and after the caller drops the 4-byte argument (+4) */
     ret
 
 /* ---------------------------------------------- */
diff --git a/lib/libtcc1.c b/lib/libtcc1.c
index 5d1942d8..d6d8dd28 100644
--- a/lib/libtcc1.c
+++ b/lib/libtcc1.c
@@ -625,3 +625,9 @@ long long __fixxfdi (long double a1)
     return s ? ret : -ret;
 }
 #endif /* !ARM */
+
+#if defined __x86_64__
+/* float constants used for unary minus operation */
+const float __mzerosf = -0.0;
+const double __mzerodf = -0.0;
+#endif
diff --git a/tcc.h b/tcc.h
index 3bf68a58..72c8c496 100644
--- a/tcc.h
+++ b/tcc.h
@@ -1090,6 +1090,7 @@ struct filespec {
 #define TOK_SHL     '<' /* shift left */
 #define TOK_SAR     '>' /* signed shift right */
 #define TOK_SHR     0x8b /* unsigned shift right */
+#define TOK_NEG     TOK_MID /* unary minus operation (for floats) */
 
 #define TOK_ARROW   0xa0 /* -> */
 #define TOK_DOTS    0xa1 /* three dots */
@@ -1378,6 +1379,7 @@ ST_DATA TokenSym **table_ident;
 #define IS_NUM 4
 
 ST_FUNC TokenSym *tok_alloc(const char *str, int len);
+ST_FUNC int tok_alloc_const(const char *str);
 ST_FUNC const char *get_tok_str(int v, CValue *cv);
 ST_FUNC void begin_macro(TokenString *str, int alloc);
 ST_FUNC void end_macro(void);
diff --git a/tccasm.c b/tccasm.c
index 911052de..097f41ca 100644
--- a/tccasm.c
+++ b/tccasm.c
@@ -27,11 +27,8 @@ static Section *last_text_section; /* to handle .previous asm directive */
 ST_FUNC int asm_get_local_label_name(TCCState *s1, unsigned int n)
 {
     char buf[64];
-    TokenSym *ts;
-
     snprintf(buf, sizeof(buf), "L..%u", n);
-    ts = tok_alloc(buf, strlen(buf));
-    return ts->tok;
+    return tok_alloc_const(buf);
 }
 
 static int tcc_assemble_internal(TCCState *s1, int do_preprocess, int global);
@@ -54,12 +51,11 @@ static int asm2cname(int v, int *addeddot)
     if (!name)
       return v;
     if (name[0] == '_') {
-        v = tok_alloc(name + 1, strlen(name) - 1)->tok;
+        v = tok_alloc_const(name + 1);
     } else if (!strchr(name, '.')) {
-        int n = strlen(name) + 2;
         char newname[256];
         snprintf(newname, sizeof newname, ".%s", name);
-        v = tok_alloc(newname, n - 1)->tok;
+        v = tok_alloc_const(newname);
         *addeddot = 1;
     }
     return v;
@@ -111,11 +107,14 @@ ST_FUNC Sym* get_asm_sym(int name, Sym *csym)
 
 static Sym* asm_section_sym(TCCState *s1, Section *sec)
 {
     char buf[100];
-    int label = tok_alloc(buf,
-        snprintf(buf, sizeof buf, "L.%s", sec->name)
-        )->tok;
-    Sym *sym = asm_label_find(label);
+    int label;
+    Sym *sym;
+
+    /* the label token is "L." followed by the section name */
+    snprintf(buf, sizeof buf, "L.%s", sec->name);
+    label = tok_alloc_const(buf);
+    sym = asm_label_find(label);
     return sym ? sym : asm_new_label1(s1, label, 1, sec->sh_num, 0);
 }
 
diff --git a/tccelf.c b/tccelf.c
index 4cc37202..2453b2f6 100644
--- a/tccelf.c
+++ b/tccelf.c
@@ -2511,9 +2511,12 @@ static int elf_output_file(TCCState *s1, const char *filename)
 {
     int i, ret, phnum, phfill, shnum, file_type, file_offset, *sec_order;
     struct dyn_inf dyninf = {0};
-    struct ro_inf roinf, *roinf_use = &roinf;
+    struct ro_inf roinf;
     ElfW(Phdr) *phdr;
     Section *strsec, *interp, *dynamic, *dynstr, *note = NULL;
+#ifndef ELF_OBJ_ONLY
+    struct ro_inf *roinf_use = NULL;
+#endif
 
     file_type = s1->output_type;
 
@@ -2648,10 +2651,8 @@ static int elf_output_file(TCCState *s1, const char *filename)
 #if !TARGETOS_FreeBSD && !TARGETOS_NetBSD && !defined(__APPLE__) && !defined(_WIN32)
     /* GNU_RELRO */
     if (file_type != TCC_OUTPUT_OBJ)
-	phnum++;
-    else
+	phnum++, roinf_use = &roinf;
 #endif
-        roinf_use = NULL;
 
     /* allocate program segment headers */
     phdr = tcc_mallocz(phnum * sizeof(ElfW(Phdr)));
diff --git a/tccgen.c b/tccgen.c
index dabc897f..914294bd 100644
--- a/tccgen.c
+++ b/tccgen.c
@@ -2790,6 +2790,33 @@ static void gen_opic(int op)
     }
 }
 
+#if defined TCC_TARGET_X86_64 || defined TCC_TARGET_I386
+# define gen_negf gen_opf /* the x86 backends handle TOK_NEG inside gen_opf() */
+#else
+/* XXX: implement in gen_opf() for other backends too */
+void gen_negf(int op) /* 'op' is unused here; the signature matches gen_opf() */
+{
+    /* In IEEE negate(x) isn't subtract(0,x).  Without NaNs it's
+       subtract(-0, x), but with them it's really a sign flip
+       operation.  We implement this with bit manipulation and have
+       to do some type reinterpretation for this, which TCC can do
+       only via memory.  */
+
+    int align, size, bt;
+
+    size = type_size(&vtop->type, &align); /* byte size of the operand type */
+    bt = vtop->type.t & VT_BTYPE;
+    save_reg(gv(RC_TYPE(bt))); /* NOTE(review): presumably forces the value into memory -- confirm */
+    vdup();
+    incr_bf_adr(size - 1); /* presumably: address byte size-1, which holds the sign bit (little-endian only) */
+    vdup();
+    vpushi(0x80); /* flip sign */
+    gen_op('^'); /* byte ^ 0x80 */
+    vstore(); /* write the modified byte back */
+    vpop(); /* presumably leaves the (now negated) operand on top -- confirm */
+}
+#endif
+
 /* generate a floating point operation with constant propagation */
 static void gen_opif(int op)
 {
@@ -2803,6 +2830,9 @@ static void gen_opif(int op)
 
     v1 = vtop - 1;
     v2 = vtop;
+    if (op == TOK_NEG)
+        v1 = v2;
+
     /* currently, we cannot do computations with forward symbols */
     c1 = (v1->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
     c2 = (v2->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
@@ -2817,29 +2847,47 @@ static void gen_opif(int op)
             f1 = v1->c.ld;
             f2 = v2->c.ld;
         }
-
         /* NOTE: we only do constant propagation if finite number (not
            NaN or infinity) (ANSI spec) */
-        if (!ieee_finite(f1) || !ieee_finite(f2))
+        /* Unary minus only flips the sign, so it is folded for any
+           operand.  Other operations with a NaN or infinite operand
+           are folded only where a constant is required. */
+        if (op != TOK_NEG && !const_wanted
+            && (!ieee_finite(f1) || !ieee_finite(f2)))
             goto general_case;
-
         switch(op) {
         case '+': f1 += f2; break;
         case '-': f1 -= f2; break;
         case '*': f1 *= f2; break;
         case '/': 
             if (f2 == 0.0) {
+                union { float f; unsigned u; } x1, x2, y;
 		/* If not in initializer we need to potentially generate
 		   FP exceptions at runtime, otherwise we want to fold.  */
                 if (!const_wanted)
                     goto general_case;
+                /* the run-time result of 0.0/0.0 on x87, also of other compilers
+                   when used to compile the f1 /= f2 below, would be -nan */
+                x1.f = f1, x2.f = f2;
+                if (f1 == 0.0)
+                    y.u = 0x7fc00000; /* nan */
+                else
+                    y.u = 0x7f800000; /* infinity */
+                y.u |= (x1.u ^ x2.u) & 0x80000000; /* set sign */
+                f1 = y.f;
+                break;
             }
-            f1 /= f2; 
+            f1 /= f2;
             break;
+        case TOK_NEG:
+            f1 = -f1;
+            goto unary_result;
             /* XXX: also handles tests ? */
         default:
             goto general_case;
         }
+        vtop--;
+    unary_result:
         /* XXX: overflow test ? */
         if (v1->type.t == VT_FLOAT) {
             v1->c.f = f1;
@@ -2848,10 +2892,13 @@ static void gen_opif(int op)
         } else {
             v1->c.ld = f1;
         }
-        vtop--;
     } else {
     general_case:
-        gen_opf(op);
+        if (op == TOK_NEG) {
+            gen_negf(op);
+        } else {
+            gen_opf(op);
+        }
     }
 }
 
@@ -5878,44 +5925,8 @@ ST_FUNC void unary(void)
     case '-':
         next();
         unary();
-        t = vtop->type.t & VT_BTYPE;
-	if (is_float(t)) {
-            if ((vtop->r & VT_VALMASK) == VT_CONST) {
-                /* This is what gen_opif would do if we had a NEG operation.  */
-                if (t == VT_FLOAT)
-                  vtop->c.f = -vtop->c.f;
-                else if (t == VT_DOUBLE)
-                  vtop->c.d = -vtop->c.d;
-                else
-                  vtop->c.ld = -vtop->c.ld;
-            } else {
-                /* In IEEE negate(x) isn't subtract(0,x).  Without NaNs it's
-                   subtract(-0, x), but with them it's really a sign flip
-                   operation.  We implement this with bit manipulation and have
-                   to do some type reinterpretation for this, which TCC can do
-                   only via memory.  */
-                int align, size = type_size(&vtop->type, &align);
-                save_reg(gv(RC_TYPE(t)));
-                vdup();
-                gaddrof();
-                vtop->type = char_pointer_type;
-                /* Byte of sign bit.  For big endian, this would have to
-                   add zero always.  */
-#if defined(TCC_TARGET_X86_64) || defined(TCC_TARGET_I386)
-                /* sizeof long double is 12 or 16 here, but it's
-                   really the 80bit extended float format.  */
-                if (t == VT_LDOUBLE)
-                  size = 10;
-#endif
-                vpushi(size - 1);
-                gen_op('+');
-                indir();
-                vdup();
-                vpushi(0x80); /* flip sign */
-                gen_op('^');
-                vstore();
-                vpop();
-            }
+	if (is_float(vtop->type.t)) {
+            gen_opif(TOK_NEG);
 	} else {
             vpushi(0);
             vswap();
diff --git a/tccpp.c b/tccpp.c
index e0821a60..cd9bd684 100644
--- a/tccpp.c
+++ b/tccpp.c
@@ -487,6 +487,15 @@ ST_FUNC TokenSym *tok_alloc(const char *str, int len)
     return tok_alloc_new(pts, str, len);
 }
 
+/* Convenience wrapper around tok_alloc() for NUL-terminated strings:
+   returns the token number instead of the TokenSym. */
+ST_FUNC int tok_alloc_const(const char *str)
+{
+    TokenSym *ts = tok_alloc(str, strlen(str));
+
+    return ts->tok;
+}
+
 /* XXX: buffer overflow */
 /* XXX: float tokens */
 ST_FUNC const char *get_tok_str(int v, CValue *cv)
diff --git a/tcctok.h b/tcctok.h
index a7552393..6fc04af7 100644
--- a/tcctok.h
+++ b/tcctok.h
@@ -100,6 +100,8 @@
      DEF(TOK___NAN__, "__nan__")
      DEF(TOK___SNAN__, "__snan__")
      DEF(TOK___INF__, "__inf__")
+     DEF(TOK___mzerosf, "__mzerosf")
+     DEF(TOK___mzerodf, "__mzerodf")
 
 /* attribute identifiers */
 /* XXX: handle all tokens generically since speed is not critical */
diff --git a/x86_64-gen.c b/x86_64-gen.c
index 222249d4..a8eef52a 100644
--- a/x86_64-gen.c
+++ b/x86_64-gen.c
@@ -1813,14 +1813,38 @@ void gen_opl(int op)
     gen_opi(op);
 }
 
+void vpush_const(int t, int v) /* push external symbol token 'v' as an lvalue of const type 't' */
+{
+    CType ctype = { t | VT_CONSTANT, 0 };
+    vpushsym(&ctype, external_global_sym(v, &ctype));
+    vtop->r |= VT_LVAL; /* mark as lvalue, so the value is read from the symbol */
+}
+
 /* generate a floating point operation 'v = t1 op t2' instruction. The
    two operands are guaranteed to have the same floating point type */
 /* XXX: need to use ST1 too */
 void gen_opf(int op)
 {
     int a, ft, fc, swapped, r;
-    int float_type =
-        (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;
+    int bt = vtop->type.t & VT_BTYPE;
+    int float_type = bt == VT_LDOUBLE ? RC_ST0 : RC_FLOAT; /* long double uses the x87 stack */
+
+    if (op == TOK_NEG) { /* unary minus */
+        gv(float_type); /* operand into a register of the class chosen above */
+        if (float_type == RC_ST0) {
+            o(0xe0d9); /* fchs */
+        } else {
+            /* -0.0, in libtcc1.c */
+            vpush_const(bt, bt == VT_FLOAT ? TOK___mzerosf : TOK___mzerodf);
+            gv(RC_FLOAT); /* load the -0.0 sign mask into a second register */
+            if (bt == VT_DOUBLE)
+                o(0x66); /* 0x66 prefix: xorpd instead of xorps */
+            /* xorp[sd] %xmm1, %xmm0 */
+            o(0xc0570f | (REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8) << 16);
+            vtop--; /* drop the mask; the result is in the register of the operand */
+        }
+        return;
+    }
 
     /* convert constants to memory references */
     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {