diff --git a/elf.h b/elf.h
index 82fd7ed9..8ecadd17 100644
--- a/elf.h
+++ b/elf.h
@@ -949,6 +949,10 @@ typedef struct
 /* Keep this the last entry.  */
 #define R_386_NUM       11
 
+/* TCC-specific 16-bit relocs. */
+#define R_386_16        12              /* Direct 16 bit  */
+#define R_386_PC16      13              /* PC relative 16 bit */
+
 /* SUN SPARC specific definitions.  */
 
 /* Values for Elf64_Ehdr.e_flags.  */
diff --git a/i386-asm.c b/i386-asm.c
index 21b28d7a..4c312338 100644
--- a/i386-asm.c
+++ b/i386-asm.c
@@ -116,8 +116,6 @@ static const uint8_t reg_to_size[5] = {
 */
     0, 0, 1, 0, 2
 };
-    
-#define WORD_PREFIX_OPCODE 0x66
 
 #define NB_TEST_OPCODES 30
 
@@ -190,6 +188,10 @@ static inline int get_reg_shift(TCCState *s1)
 {
     int shift, v;
 
+    if (s1->seg_size == 16) {
+        error("invalid effective address");
+    }
+
     v = asm_int_expr(s1);
     switch(v) {
     case 1:
@@ -222,9 +224,13 @@ static int asm_parse_reg(void)
         reg = tok - TOK_ASM_eax;
         next();
         return reg;
+    } else if (tok >= TOK_ASM_ax && tok <= TOK_ASM_di) {
+        reg = tok - TOK_ASM_ax;
+        next();
+        return reg;
     } else {
     error_32:
-        expect("32 bit register");
+        expect("register");
         return 0;
     }
 }
@@ -336,6 +342,12 @@ static void parse_operand(TCCState *s1, Operand *op)
     op->type |= indir;
 }
 
+static void gen_le16(int v)
+{
+    g(v);
+    g(v >> 8);
+}
+
 /* XXX: unify with C code output ? */
 static void gen_expr32(ExprValue *pe)
 {
@@ -344,6 +356,13 @@ static void gen_expr32(ExprValue *pe)
     gen_le32(pe->v);
 }
 
+static void gen_expr16(ExprValue *pe)
+{
+    if (pe->sym)
+        greloc(cur_text_section, pe->sym, ind, R_386_16);
+    gen_le16(pe->v);
+}
+
 /* XXX: unify with C code output ? */
 static void gen_disp32(ExprValue *pe)
 {
@@ -368,11 +387,27 @@ static void gen_disp32(ExprValue *pe)
     }
 }
 
-
-static void gen_le16(int v)
+static void gen_disp16(ExprValue *pe)
 {
-    g(v);
-    g(v >> 8);
+    Sym *sym;
+    sym = pe->sym;
+    if (sym) {
+        if (sym->r == cur_text_section->sh_num) {
+            /* same section: we can output an absolute value. Note
+               that the TCC compiler behaves differently here because
+               it always outputs a relocation to ease (future) code
+               elimination in the linker */
+            gen_le16(pe->v + (long)sym->next - ind - 2);
+        } else {
+            greloc(cur_text_section, sym, ind, R_386_PC16);
+            gen_le16(pe->v - 2);
+        }
+    } else {
+        /* put an empty PC32 relocation */
+        put_elf_reloc(symtab_section, cur_text_section,
+                      ind, R_386_PC16, 0);
+        gen_le16(pe->v - 2);
+    }
 }
 
 /* generate the modrm operand */
@@ -384,8 +419,13 @@ static inline void asm_modrm(int reg, Operand *op)
         g(0xc0 + (reg << 3) + op->reg);
     } else if (op->reg == -1 && op->reg2 == -1) {
         /* displacement only */
-        g(0x05 + (reg << 3));
-        gen_expr32(&op->e);
+        if (tcc_state->seg_size == 16) {
+            g(0x06 + (reg << 3));
+            gen_expr16(&op->e);
+        } else if (tcc_state->seg_size == 32) {
+            g(0x05 + (reg << 3));
+            gen_expr32(&op->e);
+        }
     } else {
         sib_reg1 = op->reg;
         /* fist compute displacement encoding */
@@ -403,20 +443,59 @@ static inline void asm_modrm(int reg, Operand *op)
         reg1 = op->reg;
         if (op->reg2 != -1)
             reg1 = 4;
-        g(mod + (reg << 3) + reg1);
-        if (reg1 == 4) {
-            /* add sib byte */
-            reg2 = op->reg2;
-            if (reg2 == -1)
-                reg2 = 4; /* indicate no index */
-            g((op->shift << 6) + (reg2 << 3) + sib_reg1);
+        if (tcc_state->seg_size == 32) {
+            g(mod + (reg << 3) + reg1);
+            if (reg1 == 4) {
+                /* add sib byte */
+                reg2 = op->reg2;
+                if (reg2 == -1)
+                    reg2 = 4; /* indicate no index */
+                g((op->shift << 6) + (reg2 << 3) + sib_reg1);
+            }
+        } else if (tcc_state->seg_size == 16) {
+            /* edi = 7, esi = 6 --> di = 5, si = 4 */
+            if ((reg1 == 6) || (reg1 == 7)) {
+                reg1 -= 2;
+            /* ebx = 3 --> bx = 7 */
+            } else if (reg1 == 3) {
+                reg1 = 7;
+            /* o32 = 5 --> o16 = 6 */
+            } else if (reg1 == 5) {
+                reg1 = 6;
+            /* sib not valid in 16-bit mode */
+            } else if (reg1 == 4) {
+                reg2 = op->reg2;
+	    /* bp + si + offset */
+		if ((sib_reg1 == 5) && (reg2 == 6)) {
+		    reg1 = 2;
+		/* bp + di + offset */
+		} else if ((sib_reg1 == 5) && (reg2 == 7)) {
+		    reg1 = 3;
+		/* bx + si + offset */
+		} else if ((sib_reg1 == 3) && (reg2 == 6)) {
+		    reg1 = 0;
+		/* bx + di + offset */
+		} else if ((sib_reg1 == 3) && (reg2 == 7)) {
+		    reg1 = 1;
+		} else {
+		    error("invalid effective address");
+		}
+		if (op->e.v == 0)
+		    mod = 0;
+            } else {
+                error("invalid register");
+            }
+            g(mod + (reg << 3) + reg1);
         }
 
         /* add offset */
         if (mod == 0x40) {
             g(op->e.v);
         } else if (mod == 0x80 || op->reg == -1) {
-            gen_expr32(&op->e);
+            if (tcc_state->seg_size == 16)
+                gen_expr16(&op->e);
+            else if (tcc_state->seg_size == 32)
+                gen_expr32(&op->e);
         }
     }
 }
@@ -428,6 +507,7 @@ static void asm_opcode(TCCState *s1, int opcode)
     int nb_ops, s, ss;
     Operand ops[MAX_OPERANDS], *pop;
     int op_type[3]; /* decoded op type */
+    static int a32 = 0, o32 = 0, addr32 = 0, data32 = 0;
 
     /* get operands */
     pop = ops;
@@ -442,14 +522,17 @@ static void asm_opcode(TCCState *s1, int opcode)
         parse_operand(s1, pop);
         if (tok == ':') {
            if (pop->type != OP_SEG || seg_prefix) {
+           bad_prefix:
                error("incorrect prefix");
            }
            seg_prefix = segment_prefixes[pop->reg];
            next();
            parse_operand(s1, pop);
+#if 0
            if (!(pop->type & OP_EA)) {
                error("segment prefix must be followed by memory reference");
            }
+#endif
         }
         pop++;
         nb_ops++;
@@ -531,6 +614,17 @@ static void asm_opcode(TCCState *s1, int opcode)
         if (opcode >= TOK_ASM_pusha && opcode <= TOK_ASM_emms) {
             int b;
             b = op0_codes[opcode - TOK_ASM_pusha];
+            if (opcode == TOK_ASM_o32) {
+                if (s1->seg_size == 32)
+                    goto bad_prefix;
+                else
+                    o32 = data32 = 1;
+            } else if (opcode == TOK_ASM_a32) {
+                if (s1->seg_size == 32)
+                    goto bad_prefix;
+                else
+                    a32 = addr32 = 1;
+            }
             if (b & 0xff00) 
                 g(b >> 8);
             g(b);
@@ -555,12 +649,37 @@ static void asm_opcode(TCCState *s1, int opcode)
         }
     }
 
-    /* generate data16 prefix if needed */
+    for(i = 0; i < nb_ops; i++) {
+        if (ops[i].type & OP_REG32) {
+            if (s1->seg_size == 16)
+                o32 = 1;
+        } else if (!ops[i].type & OP_REG32) {
+            if (s1->seg_size == 32)
+                o32 = 1;
+        }
+    }
+
     ss = s;
-    if (s == 1 || (pa->instr_type & OPC_D16))
-        g(WORD_PREFIX_OPCODE);
-    else if (s == 2)
+    if (s == 1 || (pa->instr_type & OPC_D16)) {
+        if (s1->seg_size == 32)
+            o32 = 1;
+    } else if (s == 2) {
+        if (s1->seg_size == 16) {
+            if (!(pa->instr_type & OPC_D16))
+                o32 = 1;
+        }
         s = 1;
+    }
+
+    /* generate a16/a32 prefix if needed */
+    if ((a32 == 1) && (addr32 == 0))
+        g(0x67);
+    /* generate o16/o32 prefix if needed */
+    if ((o32 == 1) && (data32 == 0))
+        g(0x66);
+
+    addr32 = data32 = 0;
+
     /* now generates the operation */
     if (pa->instr_type & OPC_FWAIT)
         g(0x9b);
@@ -685,9 +804,17 @@ static void asm_opcode(TCCState *s1, int opcode)
     /* emit constants */
     if (pa->opcode == 0x9a || pa->opcode == 0xea) {
         /* ljmp or lcall kludge */
-        gen_expr32(&ops[1].e);
-        if (ops[0].e.sym)
+        if (s1->seg_size == 16) {
+            if (o32 == 0)
+                gen_expr16(&ops[1].e);
+            else if (o32 == 1)
+                gen_expr32(&ops[1].e);
+        } else
+            gen_expr32(&ops[1].e);
+        if (ops[0].e.sym) {
+        error_relocate:
             error("cannot relocate");
+        }
         gen_le16(ops[0].e.v);
     } else {
         for(i = 0;i < nb_ops; i++) {
@@ -709,24 +836,48 @@ static void asm_opcode(TCCState *s1, int opcode)
                         goto error_relocate;
                     g(ops[i].e.v);
                 } else if (v & OP_IM16) {
-                    if (ops[i].e.sym) {
-                    error_relocate:
-                        error("cannot relocate");
+                    if (s1->seg_size == 16)
+                        gen_expr16(&ops[i].e);
+                    else {
+                        if (ops[i].e.sym)
+                            goto error_relocate;
                     }
                     gen_le16(ops[i].e.v);
+                    }
                 } else {
                     if (pa->instr_type & (OPC_JMP | OPC_SHORTJMP)) {
                         if (is_short_jmp)
                             g(ops[i].e.v);
-                        else
-                            gen_disp32(&ops[i].e);
+                        else {
+                            if (s1->seg_size == 16)
+                                gen_disp16(&ops[i].e);
+                            else
+                                gen_disp32(&ops[i].e);
+                        }
                     } else {
-                        gen_expr32(&ops[i].e);
+                        if (s1->seg_size == 16) {
+                            if ((o32 == 1) && (v & OP_IM32))
+                                gen_expr32(&ops[i].e);
+                            else
+                                gen_expr16(&ops[i].e);
+                        } else if (s1->seg_size == 32) {
+                            if (o32 == 1)
+                                gen_expr16(&ops[i].e);
+                            else
+                                gen_expr32(&ops[i].e);
+                        }
                     }
                 }
+            } else if (v & (OP_REG16 | OP_REG32)) {
+                if (pa->instr_type & (OPC_JMP | OPC_SHORTJMP)) {
+                    /* jmp $r */
+                    g(0xE0 + ops[i].reg);
+                }
             }
         }
     }
+
+    a32 = o32 = 0;
 }
 
 #define NB_SAVED_REGS 3
@@ -1127,8 +1278,11 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
         /* generate reg save code */
         for(i = 0; i < NB_SAVED_REGS; i++) {
             reg = reg_saved[i];
-            if (regs_allocated[reg]) 
+            if (regs_allocated[reg]) {
+                if (tcc_state->seg_size == 16)
+                    g(0x66);
                 g(0x50 + reg);
+            }
         }
 
         /* generate load code */
@@ -1184,8 +1338,11 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
         /* generate reg restore code */
         for(i = NB_SAVED_REGS - 1; i >= 0; i--) {
             reg = reg_saved[i];
-            if (regs_allocated[reg]) 
+            if (regs_allocated[reg]) {
+                if (tcc_state->seg_size == 16)
+                    g(0x66);
                 g(0x58 + reg);
+            }
         }
     }
 }
diff --git a/i386-asm.h b/i386-asm.h
index a3b28d4d..7dfdca6d 100644
--- a/i386-asm.h
+++ b/i386-asm.h
@@ -74,10 +74,8 @@ ALT(DEF_ASM_OP2(btcw, 0x0fbb, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_REGW | OPT_EA
 ALT(DEF_ASM_OP2(btcw, 0x0fba, 7, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW | OPT_EA))
 
      /* prefixes */
-     DEF_ASM_OP0(aword, 0x67)
-     DEF_ASM_OP0(addr16, 0x67)
-     DEF_ASM_OP0(word, 0x66)
-     DEF_ASM_OP0(data16, 0x66)
+     DEF_ASM_OP0(addr32, 0x67)
+     DEF_ASM_OP0(data32, 0x66)
      DEF_ASM_OP0(lock, 0xf0)
      DEF_ASM_OP0(rep, 0xf3)
      DEF_ASM_OP0(repe, 0xf3)
@@ -120,9 +118,9 @@ ALT(DEF_ASM_OP2(movzwl, 0x0fb7, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG32))
 
 ALT(DEF_ASM_OP1(pushw, 0x50, 0, OPC_REG | OPC_WL, OPT_REGW))
 ALT(DEF_ASM_OP1(pushw, 0xff, 6, OPC_MODRM | OPC_WL, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP1(pushw, 0x6a, 0, OPC_WL, OPT_IM8S))
 ALT(DEF_ASM_OP1(pushw, 0x68, 0, OPC_WL, OPT_IM32))
 ALT(DEF_ASM_OP1(pushw, 0x06, 0, OPC_WL, OPT_SEG))
+    DEF_ASM_OP1(pushb, 0x6a, 0, OPC_B, OPT_IM8S)
 
 ALT(DEF_ASM_OP1(popw, 0x58, 0, OPC_REG | OPC_WL, OPT_REGW))
 ALT(DEF_ASM_OP1(popw, 0x8f, 0, OPC_MODRM | OPC_WL, OPT_REGW | OPT_EA))
@@ -201,6 +199,7 @@ ALT(DEF_ASM_OP1(call, 0xff, 2, OPC_MODRM, OPT_INDIR))
 ALT(DEF_ASM_OP1(call, 0xe8, 0, OPC_JMP, OPT_ADDR))
 ALT(DEF_ASM_OP1(jmp, 0xff, 4, OPC_MODRM, OPT_INDIR))
 ALT(DEF_ASM_OP1(jmp, 0xeb, 0, OPC_SHORTJMP | OPC_JMP, OPT_ADDR))
+ALT(DEF_ASM_OP1(jmp, 0xff, 0, OPC_JMP | OPC_WL, OPT_REGW))
 
 ALT(DEF_ASM_OP2(lcall, 0x9a, 0, 0, OPT_IM16, OPT_IM32))
 ALT(DEF_ASM_OP1(lcall, 0xff, 3, 0, OPT_EA))
@@ -350,6 +349,9 @@ ALT(DEF_ASM_OP2(lslw, 0x0f03, 0, OPC_MODRM | OPC_WL, OPT_EA | OPT_REG, OPT_REG))
     DEF_ASM_OP1(verr, 0x0f00, 4, OPC_MODRM, OPT_REG | OPT_EA)
     DEF_ASM_OP1(verw, 0x0f00, 5, OPC_MODRM, OPT_REG | OPT_EA)
 
+    /* 386 */
+    DEF_ASM_OP0(loadall386, 0x0f07)
+
     /* 486 */
     DEF_ASM_OP1(bswap, 0x0fc8, 0, OPC_REG, OPT_REG32 )
 ALT(DEF_ASM_OP2(xaddb, 0x0fc0, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_REG | OPT_EA ))
@@ -363,7 +365,14 @@ ALT(DEF_ASM_OP2(cmpxchgb, 0x0fb0, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_REG | OPT
     DEF_ASM_OP1(cmpxchg8b, 0x0fc7, 1, OPC_MODRM, OPT_EA )
     
     /* pentium pro */
-    ALT(DEF_ASM_OP2(cmovo, 0x0f40, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovo, 0x0f40, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovno, 0x0f41, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovc, 0x0f42, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovnc, 0x0f43, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovz, 0x0f44, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovnz, 0x0f45, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmovna, 0x0f46, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
+ALT(DEF_ASM_OP2(cmova, 0x0f47, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
 
     DEF_ASM_OP2(fcmovb, 0xdac0, 0, OPC_REG, OPT_ST, OPT_ST0 )
     DEF_ASM_OP2(fcmove, 0xdac8, 0, OPC_REG, OPT_ST, OPT_ST0 )
diff --git a/libtcc.c b/libtcc.c
index 5208ea59..513de90d 100644
--- a/libtcc.c
+++ b/libtcc.c
@@ -1937,6 +1937,9 @@ TCCState *tcc_new(void)
 #if defined(TCC_TARGET_PE) && 0
     /* XXX: currently the PE linker is not ready to support that */
     s->leading_underscore = 1;
+#endif
+#ifdef TCC_TARGET_I386
+    s->seg_size = 32;
 #endif
     return s;
 }
diff --git a/tcc.h b/tcc.h
index 61164d45..9407f63e 100644
--- a/tcc.h
+++ b/tcc.h
@@ -512,6 +512,10 @@ struct TCCState {
     struct InlineFunc **inline_fns;
     int nb_inline_fns;
 
+#ifdef TCC_TARGET_I386
+    int seg_size;
+#endif
+
 #ifndef TCC_TARGET_PE
 #ifdef TCC_TARGET_X86_64
     /* write PLT and GOT here */
diff --git a/tccasm.c b/tccasm.c
index c1194055..3d8a46ed 100644
--- a/tccasm.c
+++ b/tccasm.c
@@ -571,6 +571,20 @@ static void asm_parse_directive(TCCState *s1)
             last_text_section = sec;
         }
         break;
+#ifdef TCC_TARGET_I386
+    case TOK_ASM_code16:
+        {
+            next();
+            s1->seg_size = 16;
+        }
+        break;
+    case TOK_ASM_code32:
+        {
+            next();
+            s1->seg_size = 32;
+        }
+        break;
+#endif
     default:
         error("unknown assembler directive '.%s'", get_tok_str(tok, NULL));
         break;
diff --git a/tccelf.c b/tccelf.c
index e87f2be5..34a4df94 100644
--- a/tccelf.c
+++ b/tccelf.c
@@ -591,6 +591,18 @@ static void relocate_section(TCCState *s1, Section *s)
             /* we load the got offset */
             *(int *)ptr += s1->got_offsets[sym_index];
             break;
+        case R_386_16:
+            if (s1->output_format != TCC_OUTPUT_FORMAT_BINARY) {
+            output_file:
+		error("can only produce 16-bit binary files");
+            }
+            *(short *)ptr += val;
+            break;
+        case R_386_PC16:
+            if (s1->output_format != TCC_OUTPUT_FORMAT_BINARY)
+		goto output_file;
+            *(short *)ptr += val - addr;
+            break;
 #elif defined(TCC_TARGET_ARM)
         case R_ARM_PC24:
         case R_ARM_CALL:
diff --git a/tcctok.h b/tcctok.h
index ceb11847..8e1c6d62 100644
--- a/tcctok.h
+++ b/tcctok.h
@@ -235,6 +235,7 @@
 /* Tiny Assembler */
 
  DEF_ASM(byte)
+ DEF_ASM(word)
  DEF_ASM(align)
  DEF_ASM(skip)
  DEF_ASM(space)
@@ -250,6 +251,10 @@
  DEF_ASM(fill)
  DEF_ASM(org)
  DEF_ASM(quad)
+#if defined(TCC_TARGET_I386)
+ DEF_ASM(code16)
+ DEF_ASM(code32)
+#endif
 
 #ifdef TCC_TARGET_I386