From b14ef0e24bcb70d70360690130b2bf2a0c8f8d33 Mon Sep 17 00:00:00 2001 From: Edmund Grimley Evans Date: Fri, 13 Feb 2015 18:58:31 +0000 Subject: [PATCH] Add arm64 (AArch64) as a target architecture. --- Makefile | 20 +- arm64-gen.c | 1621 ++++++++++++++++++++++++++++++++++++++++++++++ configure | 6 + conftest.c | 2 + elf.h | 112 ++++ include/stdarg.h | 13 + lib/Makefile | 12 + lib/lib-arm64.c | 652 +++++++++++++++++++ lib/testfp.c | 510 +++++++++++++++ libtcc.c | 7 +- tcc.c | 2 + tcc.h | 30 +- tccelf.c | 137 +++- tccgen.c | 127 +++- tccrun.c | 21 + tcctok.h | 33 + 16 files changed, 3265 insertions(+), 40 deletions(-) create mode 100644 arm64-gen.c create mode 100644 lib/lib-arm64.c create mode 100644 lib/testfp.c diff --git a/Makefile b/Makefile index feb5e127..7ddc81c8 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,7 @@ NATIVE_DEFINES_$(CONFIG_arm) += -DTCC_TARGET_ARM NATIVE_DEFINES_$(CONFIG_arm_eabihf) += -DTCC_ARM_EABI -DTCC_ARM_HARDFLOAT NATIVE_DEFINES_$(CONFIG_arm_eabi) += -DTCC_ARM_EABI NATIVE_DEFINES_$(CONFIG_arm_vfp) += -DTCC_ARM_VFP +NATIVE_DEFINES_$(CONFIG_arm64) += -DTCC_TARGET_ARM64 NATIVE_DEFINES += $(NATIVE_DEFINES_yes) ifeq ($(TOP),.) @@ -86,6 +87,7 @@ ARM_VFP_CROSS = arm-linux-gnu-tcc$(EXESUF) ARM_EABI_CROSS = arm-linux-gnueabi-tcc$(EXESUF) ARM_EABIHF_CROSS = arm-linux-gnueabihf-tcc$(EXESUF) ARM_CROSS = $(ARM_FPA_CROSS) $(ARM_FPA_LD_CROSS) $(ARM_VFP_CROSS) $(ARM_EABI_CROSS) +ARM64_CROSS = arm64-tcc$(EXESUF) C67_CROSS = c67-tcc$(EXESUF) # Legacy symlinks for cross compilers @@ -107,33 +109,39 @@ WIN64_FILES = $(CORE_FILES) x86_64-gen.c i386-asm.c x86_64-asm.h tccpe.c WINCE_FILES = $(CORE_FILES) arm-gen.c tccpe.c X86_64_FILES = $(CORE_FILES) x86_64-gen.c i386-asm.c x86_64-asm.h ARM_FILES = $(CORE_FILES) arm-gen.c +ARM64_FILES = $(CORE_FILES) arm64-gen.c C67_FILES = $(CORE_FILES) c67-gen.c tcccoff.c ifdef CONFIG_WIN64 PROGS+=tiny_impdef$(EXESUF) tiny_libmaker$(EXESUF) NATIVE_FILES=$(WIN64_FILES) -PROGS_CROSS=$(WIN32_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS) +PROGS_CROSS=$(WIN32_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS) LIBTCC1_CROSS=lib/i386-win32/libtcc1.a LIBTCC1=libtcc1.a else ifdef CONFIG_WIN32 PROGS+=tiny_impdef$(EXESUF) tiny_libmaker$(EXESUF) NATIVE_FILES=$(WIN32_FILES) -PROGS_CROSS=$(WIN64_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS) +PROGS_CROSS=$(WIN64_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS) LIBTCC1_CROSS=lib/x86_64-win32/libtcc1.a LIBTCC1=libtcc1.a else ifeq ($(ARCH),i386) NATIVE_FILES=$(I386_FILES) -PROGS_CROSS=$(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS) +PROGS_CROSS=$(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS) LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a LIBTCC1=libtcc1.a else ifeq ($(ARCH),x86-64) NATIVE_FILES=$(X86_64_FILES) -PROGS_CROSS=$(I386_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS) +PROGS_CROSS=$(I386_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS) LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a lib/i386/libtcc1.a LIBTCC1=libtcc1.a else ifeq ($(ARCH),arm) NATIVE_FILES=$(ARM_FILES) -PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(C67_CROSS) $(WINCE_CROSS) +PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS) 
+LIBTCC1=libtcc1.a +LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a lib/i386/libtcc1.a +else ifeq ($(ARCH),arm64) +NATIVE_FILES=$(ARM64_FILES) +PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS) LIBTCC1=libtcc1.a LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a lib/i386/libtcc1.a endif @@ -181,6 +189,7 @@ $(ARM_FPA_CROSS): DEFINES = -DTCC_TARGET_ARM $(ARM_FPA_LD_CROSS)$(EXESUF): DEFINES = -DTCC_TARGET_ARM -DLDOUBLE_SIZE=12 $(ARM_VFP_CROSS): DEFINES = -DTCC_TARGET_ARM -DTCC_ARM_VFP -DCONFIG_MULTIARCHDIR="\"arm-linux-gnu\"" $(ARM_EABI_CROSS): DEFINES = -DTCC_TARGET_ARM -DTCC_ARM_EABI -DTCC_ARM_VFP -DCONFIG_MULTIARCHDIR="\"arm-linux-gnueabi\"" +$(ARM64_CROSS): DEFINES = -DTCC_TARGET_ARM64 $(I386_CROSS): $(I386_FILES) $(X64_CROSS): $(X86_64_FILES) @@ -189,6 +198,7 @@ $(WIN64_CROSS): $(WIN64_FILES) $(WINCE_CROSS): $(WINCE_FILES) $(C67_CROSS): $(C67_FILES) $(ARM_FPA_CROSS) $(ARM_FPA_LD_CROSS) $(ARM_VFP_CROSS) $(ARM_EABI_CROSS): $(ARM_FILES) +$(ARM64_CROSS): $(ARM64_FILES) # libtcc generation and test ifndef ONE_SOURCE diff --git a/arm64-gen.c b/arm64-gen.c new file mode 100644 index 00000000..09e74167 --- /dev/null +++ b/arm64-gen.c @@ -0,0 +1,1621 @@ +/* + * A64 code generator for TCC + * + * Copyright (c) 2014-2015 Edmund Grimley Evans + * + * Copying and distribution of this file, with or without modification, + * are permitted in any medium without royalty provided the copyright + * notice and this notice are preserved. This file is offered as-is, + * without any warranty. + */ + +#ifdef TARGET_DEFS_ONLY + +// Number of registers available to allocator: +#define NB_REGS 28 // x0-x18, x30, v0-v7 + +#define TREG_R(x) (x) // x = 0..18 +#define TREG_R30 19 +#define TREG_F(x) (x + 20) // x = 0..7 + +// Register classes sorted from more general to more precise: +#define RC_INT (1 << 0) +#define RC_FLOAT (1 << 1) +#define RC_R(x) (1 << (2 + (x))) // x = 0..18 +#define RC_R30 (1 << 21) +#define RC_F(x) (1 << (22 + (x))) // x = 0..7 + +#define RC_IRET (RC_R(0)) // int return register class +#define RC_FRET (RC_F(0)) // float return register class + +#define REG_IRET (TREG_R(0)) // int return register number +#define REG_FRET (TREG_F(0)) // float return register number + +#define PTR_SIZE 8 + +#define LDOUBLE_SIZE 16 +#define LDOUBLE_ALIGN 16 + +#define MAX_ALIGN 16 + +#define CHAR_IS_UNSIGNED + +/******************************************************/ +/* ELF defines */ + +#define EM_TCC_TARGET EM_AARCH64 + +#define R_DATA_32 R_AARCH64_ABS32 +#define R_DATA_PTR R_AARCH64_ABS64 +#define R_JMP_SLOT R_AARCH64_JUMP_SLOT +#define R_COPY R_AARCH64_COPY + +#define ELF_START_ADDR 0x00400000 +#define ELF_PAGE_SIZE 0x1000 + +/******************************************************/ +#else /* ! 
TARGET_DEFS_ONLY */
+/******************************************************/
+#include "tcc.h"
+#include <assert.h>
+
+ST_DATA const int reg_classes[NB_REGS] = {
+    RC_INT | RC_R(0),
+    RC_INT | RC_R(1),
+    RC_INT | RC_R(2),
+    RC_INT | RC_R(3),
+    RC_INT | RC_R(4),
+    RC_INT | RC_R(5),
+    RC_INT | RC_R(6),
+    RC_INT | RC_R(7),
+    RC_INT | RC_R(8),
+    RC_INT | RC_R(9),
+    RC_INT | RC_R(10),
+    RC_INT | RC_R(11),
+    RC_INT | RC_R(12),
+    RC_INT | RC_R(13),
+    RC_INT | RC_R(14),
+    RC_INT | RC_R(15),
+    RC_INT | RC_R(16),
+    RC_INT | RC_R(17),
+    RC_INT | RC_R(18),
+    RC_R30, // not in RC_INT as we make special use of x30
+    RC_FLOAT | RC_F(0),
+    RC_FLOAT | RC_F(1),
+    RC_FLOAT | RC_F(2),
+    RC_FLOAT | RC_F(3),
+    RC_FLOAT | RC_F(4),
+    RC_FLOAT | RC_F(5),
+    RC_FLOAT | RC_F(6),
+    RC_FLOAT | RC_F(7)
+};
+
+#define IS_FREG(x) ((x) >= TREG_F(0))
+
+static uint32_t intr(int r)
+{
+    assert(TREG_R(0) <= r && r <= TREG_R30);
+    return r < TREG_R30 ? r : 30;
+}
+
+static uint32_t fltr(int r)
+{
+    assert(TREG_F(0) <= r && r <= TREG_F(7));
+    return r - TREG_F(0);
+}
+
+// Add an instruction to text section:
+ST_FUNC void o(unsigned int c)
+{
+    int ind1 = ind + 4;
+    if (ind1 > cur_text_section->data_allocated)
+        section_realloc(cur_text_section, ind1);
+    *(uint32_t *)(cur_text_section->data + ind) = c;
+    ind = ind1;
+}
+
+static int arm64_encode_bimm64(uint64_t x)
+{
+    int neg = x & 1;
+    int rep, pos, len;
+
+    if (neg)
+        x = ~x;
+    if (!x)
+        return -1;
+
+    if (x >> 2 == (x & (((uint64_t)1 << (64 - 2)) - 1)))
+        rep = 2, x &= ((uint64_t)1 << 2) - 1;
+    else if (x >> 4 == (x & (((uint64_t)1 << (64 - 4)) - 1)))
+        rep = 4, x &= ((uint64_t)1 << 4) - 1;
+    else if (x >> 8 == (x & (((uint64_t)1 << (64 - 8)) - 1)))
+        rep = 8, x &= ((uint64_t)1 << 8) - 1;
+    else if (x >> 16 == (x & (((uint64_t)1 << (64 - 16)) - 1)))
+        rep = 16, x &= ((uint64_t)1 << 16) - 1;
+    else if (x >> 32 == (x & (((uint64_t)1 << (64 - 32)) - 1)))
+        rep = 32, x &= ((uint64_t)1 << 32) - 1;
+    else
+        rep = 64;
+
+    pos = 0;
+    if (!(x & (((uint64_t)1 << 32) - 1))) x >>= 32, pos += 32;
+    if (!(x & (((uint64_t)1 << 16) - 1))) x >>= 16, pos += 16;
+    if (!(x & (((uint64_t)1 << 8) - 1))) x >>= 8, pos += 8;
+    if (!(x & (((uint64_t)1 << 4) - 1))) x >>= 4, pos += 4;
+    if (!(x & (((uint64_t)1 << 2) - 1))) x >>= 2, pos += 2;
+    if (!(x & (((uint64_t)1 << 1) - 1))) x >>= 1, pos += 1;
+
+    len = 0;
+    if (!(~x & (((uint64_t)1 << 32) - 1))) x >>= 32, len += 32;
+    if (!(~x & (((uint64_t)1 << 16) - 1))) x >>= 16, len += 16;
+    if (!(~x & (((uint64_t)1 << 8) - 1))) x >>= 8, len += 8;
+    if (!(~x & (((uint64_t)1 << 4) - 1))) x >>= 4, len += 4;
+    if (!(~x & (((uint64_t)1 << 2) - 1))) x >>= 2, len += 2;
+    if (!(~x & (((uint64_t)1 << 1) - 1))) x >>= 1, len += 1;
+
+    if (x)
+        return -1;
+    if (neg) {
+        pos = (pos + len) & (rep - 1);
+        len = rep - len;
+    }
+    return ((0x1000 & rep << 6) | (((rep - 1) ^ 31) << 1 & 63) |
+            ((rep - pos) & (rep - 1)) << 6 | (len - 1));
+}
+
+static uint32_t arm64_movi(int r, uint64_t x)
+{
+    uint64_t m = 0xffff;
+    int e;
+    if (!(x & ~m))
+        return 0x52800000 | r | x << 5; // movz w(r),#(x)
+    if (!(x & ~(m << 16)))
+        return 0x52a00000 | r | x >> 11; // movz w(r),#(x >> 16),lsl #16
+    if (!(x & ~(m << 32)))
+        return 0xd2c00000 | r | x >> 27; // movz x(r),#(x >> 32),lsl #32
+    if (!(x & ~(m << 48)))
+        return 0xd2e00000 | r | x >> 43; // movz x(r),#(x >> 48),lsl #48
+    if ((x & ~m) == m << 16)
+        return (0x12800000 | r |
+                (~x << 5 & 0x1fffe0)); // movn w(r),#(~x)
+    if ((x & ~(m << 16)) == m)
+        return (0x12a00000 | r |
+                (~x >> 11 & 0x1fffe0)); // movn w(r),#(~x >> 16),lsl #16
+    if (!~(x | m))
+        return (0x92800000 | r |
+                (~x << 5 & 0x1fffe0)); // movn x(r),#(~x)
+    if (!~(x | m << 16))
+        return (0x92a00000 | r |
+                (~x >> 11 & 0x1fffe0)); // movn x(r),#(~x >> 16),lsl #16
+    if (!~(x | m << 32))
+        return (0x92c00000 | r |
+                (~x >> 27 & 0x1fffe0)); // movn x(r),#(~x >> 32),lsl #32
+    if (!~(x | m << 48))
+        return (0x92e00000 | r |
+                (~x >> 43 & 0x1fffe0)); // movn x(r),#(~x >> 48),lsl #48
+    if (!(x >> 32) && (e = arm64_encode_bimm64(x | x << 32)) >= 0)
+        return 0x320003e0 | r | (uint32_t)e << 10; // movi w(r),#(x)
+    if ((e = arm64_encode_bimm64(x)) >= 0)
+        return 0xb20003e0 | r | (uint32_t)e << 10; // movi x(r),#(x)
+    return 0;
+}
+
+static void arm64_movimm(int r, uint64_t x)
+{
+    uint32_t i;
+    if ((i = arm64_movi(r, x)))
+        o(i);
+    else {
+        // This could be improved:
+        o(0x52800000 | r | (x & 0xffff) << 5); // movz w(r),#(x & 0xffff)
+        for (i = 1; i < 4; i++)
+            if (x >> 16 * i & 0xffff) {
+                o(0xf2800000 | r | (x >> 16 * i & 0xffff) << 5 | i << 21);
+                // movk w(r),#(*),lsl #(*)
+            }
+
+    }
+}
+
+// Patch all branches in list pointed to by t to branch to a:
+ST_FUNC void gsym_addr(int t_, int a_)
+{
+    uint32_t t = t_;
+    uint32_t a = a_;
+    while (t) {
+        uint32_t *ptr = (uint32_t *)(cur_text_section->data + t);
+        uint32_t next = *ptr;
+        if (a - t + 0x8000000 >= 0x10000000)
+            tcc_error("branch out of range");
+        *ptr = (a - t == 4 ? 0xd503201f : // nop
+                0x14000000 | ((a - t) >> 2 & 0x3ffffff)); // b
+        t = next;
+    }
+}
+
+// Patch all branches in list pointed to by t to branch to current location:
+ST_FUNC void gsym(int t)
+{
+    gsym_addr(t, ind);
+}
+
+static int arm64_type_size(int t)
+{
+    switch (t & VT_BTYPE) {
+    case VT_INT: return 2;
+    case VT_BYTE: return 0;
+    case VT_SHORT: return 1;
+    case VT_PTR: return 3;
+    case VT_ENUM: return 2;
+    case VT_FUNC: return 3;
+    case VT_FLOAT: return 2;
+    case VT_DOUBLE: return 3;
+    case VT_LDOUBLE: return 4;
+    case VT_BOOL: return 0;
+    case VT_LLONG: return 3;
+    }
+    assert(0);
+    return 0;
+}
+
+static void gen_stack_addr(int reg, uint64_t off)
+{
+    arm64_movimm(30, off); // use x30 for offset
+    o(0x8b3e63e0 | reg);
+}
+
+static void gen_load(int sg, int sz, int dst, int bas, uint64_t off)
+{
+    if (sz >= 2)
+        sg = 0;
+    if (!(off & ~(0xfff << sz)))
+        o(0x39400000 | dst | bas << 5 | off << (10 - sz) |
+          !!sg << 23 | sz << 30);
+    else if (off < 256 || -off <= 256)
+        o(0x38400000 | dst | bas << 5 | (off & 511) << 12 |
+          !!sg << 23 | sz << 30);
+    else {
+        arm64_movimm(30, off); // use x30 for offset
+        o(0x38206800 | dst | bas << 5 | 30 << 16 |
+          (!!sg + 1) << 22 | sz << 30);
+    }
+}
+
+static void gen_fload(int sz, int dst, int bas, uint64_t off)
+{
+    if (!(off & ~(0xfff << sz)))
+        o(0x3d400000 | dst | bas << 5 | off << (10 - sz) |
+          (sz & 4) << 21 | (sz & 3) << 30);
+    else if (off < 256 || -off <= 256)
+        o(0x3c400000 | dst | bas << 5 | (off & 511) << 12 |
+          (sz & 4) << 21 | (sz & 3) << 30);
+    else {
+        arm64_movimm(30, off); // use x30 for offset
+        o(0x3c606800 | dst | bas << 5 | 30 << 16 | sz << 30 | (sz & 4) << 21);
+    }
+}
+
+static void gen_sload(int reg, int size)
+{
+    // Use x30 for intermediate value in some cases.
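+    // gen_sload loads a struct of the given size (1..16 bytes) whose
+    // address is in x(reg) into x(reg) itself, and also into x(reg+1)
+    // for sizes over 8, so that small structs can be passed and returned
+    // in registers. Sizes that are not a power of two are assembled from
+    // two loads combined with orr/lsr. Because x(reg) is both the base
+    // address and the destination, the load that overwrites it always
+    // comes last. For example, size 3 emits:
+    //     ldrh w30,[x(reg)]          // bytes 0-1
+    //     ldrb w(reg),[x(reg),#2]    // byte 2
+    //     orr  w(reg),w30,w(reg),lsl #16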
+ switch (size) { + default: assert(0); break; + case 1: + gen_load(0, 0, reg, reg, 0); + break; + case 2: + gen_load(0, 1, reg, reg, 0); + break; + case 3: + gen_load(0, 1, 30, reg, 0); + gen_load(0, 0, reg, reg, 2); + o(0x2a0043c0 | reg | reg << 16); // orr x(reg),x30,x(reg),lsl #16 + break; + case 4: + gen_load(0, 2, reg, reg, 0); + break; + case 5: + gen_load(0, 2, 30, reg, 0); + gen_load(0, 0, reg, reg, 4); + o(0xaa0083c0 | reg | reg << 16); // orr x(reg),x30,x(reg),lsl #32 + break; + case 6: + gen_load(0, 2, 30, reg, 0); + gen_load(0, 1, reg, reg, 4); + o(0xaa0083c0 | reg | reg << 16); // orr x(reg),x30,x(reg),lsl #32 + break; + case 7: + gen_load(0, 2, 30, reg, 0); + gen_load(0, 2, reg, reg, 3); + o(0x53087c00 | reg | reg << 5); // lsr w(reg), w(reg), #8 + o(0xaa0083c0 | reg | reg << 16); // orr x(reg),x30,x(reg),lsl #32 + break; + case 8: + gen_load(0, 3, reg, reg, 0); + break; + case 9: + gen_load(0, 0, reg + 1, reg, 8); + gen_load(0, 3, reg, reg, 0); + break; + case 10: + gen_load(0, 1, reg + 1, reg, 8); + gen_load(0, 3, reg, reg, 0); + break; + case 11: + gen_load(0, 2, reg + 1, reg, 7); + o(0x53087c00 | (reg+1) | (reg+1) << 5); // lsr w(reg+1), w(reg+1), #8 + gen_load(0, 3, reg, reg, 0); + break; + case 12: + gen_load(0, 2, reg + 1, reg, 8); + gen_load(0, 3, reg, reg, 0); + break; + case 13: + gen_load(0, 3, reg + 1, reg, 5); + o(0xd358fc00 | (reg+1) | (reg+1) << 5); // lsr x(reg+1), x(reg+1), #24 + gen_load(0, 3, reg, reg, 0); + break; + case 14: + gen_load(0, 3, reg + 1, reg, 6); + o(0xd350fc00 | (reg+1) | (reg+1) << 5); // lsr x(reg+1), x(reg+1), #16 + gen_load(0, 3, reg, reg, 0); + break; + case 15: + gen_load(0, 3, reg + 1, reg, 7); + o(0xd348fc00 | (reg+1) | (reg+1) << 5); // lsr x(reg+1), x(reg+1), #8 + gen_load(0, 3, reg, reg, 0); + break; + case 16: + o(0xa9400000 | reg | (reg+1) << 10 | reg << 5); + // ldp x(reg),x(reg+1),[x(reg)] + break; + } +} + +static void gen_store(int sz, int dst, int bas, uint64_t off) +{ + if (!(off & ~(0xfff << sz))) + o(0x39000000 | dst | bas << 5 | off << (10 - sz) | sz << 30); + else if (off < 256 || -off <= 256) + o(0x38000000 | dst | bas << 5 | (off & 511) << 12 | sz << 30); + else { + arm64_movimm(30, off); // use x30 for offset + o(0x38206800 | dst | bas << 5 | 30 << 16 | sz << 30); + } +} + +static void gen_fstore(int sz, int dst, int bas, uint64_t off) +{ + if (!(off & ~(0xfff << sz))) + o(0x3d000000 | dst | bas << 5 | off << (10 - sz) | + (sz & 4) << 21 | (sz & 3) << 30); + else if (off < 256 || -off <= 256) + o(0x3c000000 | dst | bas << 5 | (off & 511) << 12 | + (sz & 4) << 21 | (sz & 3) << 30); + else { + arm64_movimm(30, off); // use x30 for offset + o(0x3c206800 | dst | bas << 5 | 30 << 16 | sz << 30 | (sz & 4) << 21); + } +} + +static void gen_addr(int r, Sym *sym, unsigned long addend) +{ +#if 0 + // This is normally the right way to do it, I think, + // but it does not work with "-run" when stdin or stderr is + // used by the program: "R_AARCH64_ADR_PREL_PG_HI21 relocation failed". + greloca(cur_text_section, sym, ind, R_AARCH64_ADR_PREL_PG_HI21, addend); + o(0x90000000 | r); + greloca(cur_text_section, sym, ind, R_AARCH64_ADD_ABS_LO12_NC, addend); + o(0x91000000 | r | r << 5); +#else + // This seems to work in all cases, unless you try to use an old buggy + // GCC for linking, which says: "unresolvable R_AARCH64_MOVW_UABS_G0_NC + // relocation against symbol `stderr@@GLIBC_2.17'". 
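+    // Each of the four movk instructions below writes one 16-bit slice of
+    // x(r); the immediate fields are filled in at relocation time through
+    // the MOVW_UABS_G0..G3 relocations. Together the slices cover all 64
+    // bits of the absolute address, so the register's previous contents
+    // do not matter and no initial movz is needed.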
+ greloca(cur_text_section, sym, ind, R_AARCH64_MOVW_UABS_G0_NC, addend); + o(0xf2800000 | r); // movk x(rt),#...,lsl #0 + greloca(cur_text_section, sym, ind, R_AARCH64_MOVW_UABS_G1_NC, addend); + o(0xf2a00000 | r); // movk x(rt),#...,lsl #16 + greloca(cur_text_section, sym, ind, R_AARCH64_MOVW_UABS_G2_NC, addend); + o(0xf2c00000 | r); // movk x(rt),#...,lsl #32 + greloca(cur_text_section, sym, ind, R_AARCH64_MOVW_UABS_G3, addend); + o(0xf2e00000 | r); // movk x(rt),#...,lsl #48 +#endif +} + +ST_FUNC void load(int r, SValue *sv) +{ + int svtt = sv->type.t; + int svr = sv->r & ~VT_LVAL_TYPE; + int svrv = svr & VT_VALMASK; + uint64_t svcul = (int32_t)sv->c.ul; + + if (svr == (VT_LOCAL | VT_LVAL)) { + if (IS_FREG(r)) + gen_fload(arm64_type_size(svtt), fltr(r), 29, svcul); + else + gen_load(!(svtt & VT_UNSIGNED), arm64_type_size(svtt), + intr(r), 29, svcul); + return; + } + + if ((svr & ~VT_VALMASK) == VT_LVAL && svrv < VT_CONST) { + if (IS_FREG(r)) + gen_fload(arm64_type_size(svtt), + fltr(r), intr(svrv), 0); + else + gen_load(!(svtt & VT_UNSIGNED), arm64_type_size(svtt), + intr(r), intr(svrv), 0); + return; + } + + if (svr == (VT_CONST | VT_LVAL | VT_SYM)) { + gen_addr(30, sv->sym, svcul); // use x30 for address + if (IS_FREG(r)) + gen_fload(arm64_type_size(svtt), fltr(r), 30, 0); + else + gen_load(!(svtt & VT_UNSIGNED), arm64_type_size(svtt), + intr(r), 30, 0); + return; + } + + if (svr == (VT_CONST | VT_SYM)) { + gen_addr(intr(r), sv->sym, svcul); + return; + } + + if (svr == VT_CONST) { + if ((svtt & VT_BTYPE) != VT_VOID) + arm64_movimm(intr(r), + arm64_type_size(svtt) == 3 ? sv->c.ull : svcul); + return; + } + + if (svr < VT_CONST) { + if (IS_FREG(r) && IS_FREG(svr)) + if (svtt == VT_LDOUBLE) + o(0x4ea01c00 | fltr(r) | fltr(svr) << 5); + // mov v(r).16b,v(svr).16b + else + o(0x1e604000 | fltr(r) | fltr(svr) << 5); // fmov d(r),d(svr) + else if (!IS_FREG(r) && !IS_FREG(svr)) + o(0xaa0003e0 | intr(r) | intr(svr) << 16); // mov x(r),x(svr) + else + assert(0); + return; + } + + if (svr == VT_LOCAL) { + if (-svcul < 0x1000) + o(0xd10003a0 | intr(r) | -svcul << 10); // sub x(r),x29,#... 
+ else { + arm64_movimm(30, -svcul); // use x30 for offset + o(0xcb0003a0 | intr(r) | 30 << 16); // sub x(r),x29,x30 + } + return; + } + + if (svr == VT_JMP || svr == VT_JMPI) { + int t = (svr == VT_JMPI); + arm64_movimm(intr(r), t); + o(0x14000002); // b .+8 + gsym(svcul); + arm64_movimm(intr(r), t ^ 1); + return; + } + + if (svr == (VT_LLOCAL | VT_LVAL)) { + gen_load(0, 3, 30, 29, svcul); // use x30 for offset + if (IS_FREG(r)) + gen_fload(arm64_type_size(svtt), fltr(r), 30, 0); + else + gen_load(!(svtt & VT_UNSIGNED), arm64_type_size(svtt), + intr(r), 30, 0); + return; + } + + printf("load(%x, (%x, %x, %llx))\n", r, svtt, sv->r, (long long)svcul); + assert(0); +} + +ST_FUNC void store(int r, SValue *sv) +{ + int svtt = sv->type.t; + int svr = sv->r & ~VT_LVAL_TYPE; + int svrv = svr & VT_VALMASK; + uint64_t svcul = (int32_t)sv->c.ul; + + if (svr == (VT_LOCAL | VT_LVAL)) { + if (IS_FREG(r)) + gen_fstore(arm64_type_size(svtt), fltr(r), 29, svcul); + else + gen_store(arm64_type_size(svtt), intr(r), 29, svcul); + return; + } + + if ((svr & ~VT_VALMASK) == VT_LVAL && svrv < VT_CONST) { + if (IS_FREG(r)) + gen_fstore(arm64_type_size(svtt), fltr(r), intr(svrv), 0); + else + gen_store(arm64_type_size(svtt), intr(r), intr(svrv), 0); + return; + } + + if (svr == (VT_CONST | VT_LVAL | VT_SYM)) { + gen_addr(30, sv->sym, svcul); // use x30 for address + if (IS_FREG(r)) + gen_fstore(arm64_type_size(svtt), fltr(r), 30, 0); + else + gen_store(arm64_type_size(svtt), intr(r), 30, 0); + return; + } + + printf("store(%x, (%x, %x, %llx))\n", r, svtt, sv->r, (long long)svcul); + assert(0); +} + +static void arm64_gen_bl_or_b(int b) +{ + if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) { + assert(!b); + if (vtop->r & VT_SYM) + greloc(cur_text_section, vtop->sym, ind, R_AARCH64_CALL26); + else + assert(0); + o(0x94000000); // bl . + } + else + o(0xd61f0000 | !b << 21 | intr(gv(RC_R30)) << 5); // br/blr +} + +static int arm64_hfa_aux(CType *type, int *fsize, int num) +{ + if (is_float(type->t)) { + int a, n = type_size(type, &a); + if (num >= 4 || (*fsize && *fsize != n)) + return -1; + *fsize = n; + return num + 1; + } + else if ((type->t & VT_BTYPE) == VT_STRUCT) { + int is_struct = 0; // rather than union + Sym *field; + for (field = type->ref->next; field; field = field->next) + if (field->c) { + is_struct = 1; + break; + } + if (is_struct) { + int num0 = num; + for (field = type->ref->next; field; field = field->next) { + if (field->c != (num - num0) * *fsize) + return -1; + num = arm64_hfa_aux(&field->type, fsize, num); + if (num == -1) + return -1; + } + if (type->ref->c != (num - num0) * *fsize) + return -1; + return num; + } + else { // union + int num0 = num; + for (field = type->ref->next; field; field = field->next) { + int num1 = arm64_hfa_aux(&field->type, fsize, num0); + if (num1 == -1) + return -1; + num = num1 < num ? 
num : num1; + } + if (type->ref->c != (num - num0) * *fsize) + return -1; + return num; + } + } + else if (type->t & VT_ARRAY) { + int num1; + if (!type->ref->c) + return num; + num1 = arm64_hfa_aux(&type->ref->type, fsize, num); + if (num1 == -1 || (num1 != num && type->ref->c > 4)) + return -1; + num1 = num + type->ref->c * (num1 - num); + if (num1 > 4) + return -1; + return num1; + } + return -1; +} + +static int arm64_hfa(CType *type, int *fsize) +{ + if ((type->t & VT_BTYPE) == VT_STRUCT || (type->t & VT_ARRAY)) { + int sz = 0; + int n = arm64_hfa_aux(type, &sz, 0); + if (0 < n && n <= 4) { + if (fsize) + *fsize = sz; + return n; + } + } + return 0; +} + +static unsigned long arm64_pcs_aux(int n, CType **type, unsigned long *a) +{ + int nx = 0; // next integer register + int nv = 0; // next vector register + unsigned long ns = 32; // next stack offset + int i; + + for (i = 0; i < n; i++) { + int hfa = arm64_hfa(type[i], 0); + int size, align; + + if ((type[i]->t & VT_ARRAY) || + (type[i]->t & VT_BTYPE) == VT_FUNC) + size = align = 8; + else + size = type_size(type[i], &align); + + if (hfa) + // B.2 + ; + else if (size > 16) { + // B.3: replace with pointer + if (nx < 8) + a[i] = nx++ << 1 | 1; + else { + ns = (ns + 7) & ~7; + a[i] = ns | 1; + ns += 8; + } + continue; + } + else if ((type[i]->t & VT_BTYPE) == VT_STRUCT) + // B.4 + size = (size + 7) & ~7; + + // C.1 + if (is_float(type[i]->t) && nv < 8) { + a[i] = 16 + (nv++ << 1); + continue; + } + + // C.2 + if (hfa && nv + hfa <= 8) { + a[i] = 16 + (nv << 1); + nv += hfa; + continue; + } + + // C.3 + if (hfa) { + nv = 8; + size = (size + 7) & ~7; + } + + // C.4 + if (hfa || (type[i]->t & VT_BTYPE) == VT_LDOUBLE) { + ns = (ns + 7) & ~7; + ns = (ns + align - 1) & -align; + } + + // C.5 + if ((type[i]->t & VT_BTYPE) == VT_FLOAT) + size = 8; + + // C.6 + if (hfa || is_float(type[i]->t)) { + a[i] = ns; + ns += size; + continue; + } + + // C.7 + if ((type[i]->t & VT_BTYPE) != VT_STRUCT && size <= 8 && nx < 8) { + a[i] = nx++ << 1; + continue; + } + + // C.8 + if (align == 16) + nx = (nx + 1) & ~1; + + // C.9 + if ((type[i]->t & VT_BTYPE) != VT_STRUCT && size == 16 && nx < 7) { + a[i] = nx << 1; + nx += 2; + continue; + } + + // C.10 + if ((type[i]->t & VT_BTYPE) == VT_STRUCT && size <= (8 - nx) * 8) { + a[i] = nx << 1; + nx += (size + 7) >> 3; + continue; + } + + // C.11 + nx = 8; + + // C.12 + ns = (ns + 7) & ~7; + ns = (ns + align - 1) & -align; + + // C.13 + if ((type[i]->t & VT_BTYPE) == VT_STRUCT) { + a[i] = ns; + ns += size; + continue; + } + + // C.14 + if (size < 8) + size = 8; + + // C.15 + a[i] = ns; + ns += size; + } + + return ns - 32; +} + +static unsigned long arm64_pcs(int n, CType **type, unsigned long *a) +{ + unsigned long stack; + + // Return type: + if ((type[0]->t & VT_BTYPE) == VT_VOID) + a[0] = -1; + else { + arm64_pcs_aux(1, type, a); + assert(a[0] == 0 || a[0] == 1 || a[0] == 16); + } + + // Argument types: + stack = arm64_pcs_aux(n, type + 1, a + 1); + + if (0) { + int i; + for (i = 0; i <= n; i++) { + if (!i) + printf("arm64_pcs return: "); + else + printf("arm64_pcs arg %d: ", i); + if (a[i] == (unsigned long)-1) + printf("void\n"); + else if (a[i] == 1 && !i) + printf("X8 pointer\n"); + else if (a[i] < 16) + printf("X%lu%s\n", a[i] / 2, a[i] & 1 ? " pointer" : ""); + else if (a[i] < 32) + printf("V%lu\n", a[i] / 2 - 8); + else + printf("stack %lu%s\n", + (a[i] - 32) & ~1, a[i] & 1 ? 
" pointer" : ""); + } + } + + return stack; +} + +ST_FUNC void gfunc_call(int nb_args) +{ + CType *return_type; + CType **t; + unsigned long *a, *a1; + unsigned long stack; + int i; + + return_type = &vtop[-nb_args].type.ref->type; + if ((return_type->t & VT_BTYPE) == VT_STRUCT) + --nb_args; + + t = tcc_malloc((nb_args + 1) * sizeof(*t)); + a = tcc_malloc((nb_args + 1) * sizeof(*a)); + a1 = tcc_malloc((nb_args + 1) * sizeof(*a1)); + + t[0] = return_type; + for (i = 0; i < nb_args; i++) + t[nb_args - i] = &vtop[-i].type; + + stack = arm64_pcs(nb_args, t, a); + + // Allocate space for structs replaced by pointer: + for (i = nb_args; i; i--) + if (a[i] & 1) { + SValue *arg = &vtop[i - nb_args]; + int align, size = type_size(&arg->type, &align); + assert((arg->type.t & VT_BTYPE) == VT_STRUCT); + stack = (stack + align - 1) & -align; + a1[i] = stack; + stack += size; + } + + stack = (stack + 15) >> 4 << 4; + + assert(stack < 0x1000); + if (stack) + o(0xd10003ff | stack << 10); // sub sp,sp,#(n) + + // First pass: set all values on stack + for (i = nb_args; i; i--) { + vpushv(vtop - nb_args + i); + + if (a[i] & 1) { + // struct replaced by pointer + int r = get_reg(RC_INT); + gen_stack_addr(intr(r), a1[i]); + vset(&vtop->type, r | VT_LVAL, 0); + vswap(); + vstore(); + if (a[i] >= 32) { + // pointer on stack + r = get_reg(RC_INT); + gen_stack_addr(intr(r), a1[i]); + gen_store(3, intr(r), 31, (a[i] - 32) >> 1 << 1); + } + } + else if (a[i] >= 32) { + // value on stack + if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) { + int r = get_reg(RC_INT); + gen_stack_addr(intr(r), a[i] - 32); + vset(&vtop->type, r | VT_LVAL, 0); + vswap(); + vstore(); + } + else if (is_float(vtop->type.t)) { + gv(RC_FLOAT); + gen_fstore(arm64_type_size(vtop[0].type.t), + fltr(vtop[0].r), 31, a[i] - 32); + } + else { + gv(RC_INT); + gen_store(arm64_type_size(vtop[0].type.t), + intr(vtop[0].r), 31, a[i] - 32); + } + } + + --vtop; + } + + // Second pass: assign values to registers + for (i = nb_args; i; i--, vtop--) { + if (a[i] < 16 && !(a[i] & 1)) { + // value in general-purpose registers + if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) { + int align, size = type_size(&vtop->type, &align); + vtop->type.t = VT_PTR; + gaddrof(); + gv(RC_R(a[i] / 2)); + gen_sload(a[i] / 2, size); + } + else + gv(RC_R(a[i] / 2)); + } + else if (a[i] < 16) + // struct replaced by pointer in register + gen_stack_addr(a[i] / 2, a1[i]); + else if (a[i] < 32) { + // value in floating-point registers + if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) { + int j, sz, n = arm64_hfa(&vtop->type, &sz); + vtop->type.t = VT_PTR; + gaddrof(); + gv(RC_R30); + for (j = 0; j < n; j++) + o(0x3d4003c0 | + (sz & 16) << 19 | -(sz & 8) << 27 | (sz & 4) << 29 | + (a[i] / 2 - 8 + j) | + j << 10); // ldr ([sdq])(*),[x30,#(j * sz)] + } + else + gv(RC_F(a[i] / 2 - 8)); + } + } + + if ((return_type->t & VT_BTYPE) == VT_STRUCT) { + if (a[0] == 1) { + // indirect return: set x8 and discard the stack value + gv(RC_R(8)); + --vtop; + } + else + // return in registers: keep the address for after the call + vswap(); + } + + save_regs(0); + arm64_gen_bl_or_b(0); + --vtop; + if (stack) + o(0x910003ff | stack << 10); // add sp,sp,#(n) + + { + int rt = return_type->t; + int bt = rt & VT_BTYPE; + if (bt == VT_BYTE || bt == VT_SHORT) + // Promote small integers: + o(0x13001c00 | (bt == VT_SHORT) << 13 | + !!(rt & VT_UNSIGNED) << 30); // [su]xt[bh] w0,w0 + else if (bt == VT_STRUCT && !(a[0] & 1)) { + // A struct was returned in registers, so write it out: + gv(RC_R(8)); + --vtop; + if (a[0] == 0) 
{ + int align, size = type_size(return_type, &align); + assert(size <= 16); + if (size > 8) + o(0xa9000500); // stp x0,x1,[x8] + else if (size) + gen_store(size > 4 ? 3 : size > 2 ? 2 : size > 1, + 0, 8, 0); + + } + else if (a[0] == 16) { + int j, sz, n = arm64_hfa(return_type, &sz); + for (j = 0; j < n; j++) + o(0x3d000100 | + (sz & 16) << 19 | -(sz & 8) << 27 | (sz & 4) << 29 | + (a[i] / 2 - 8 + j) | + j << 10); // str ([sdq])(*),[x8,#(j * sz)] + } + } + } + + tcc_free(a1); + tcc_free(a); + tcc_free(t); +} + +static unsigned long arm64_func_va_list_stack; +static int arm64_func_va_list_gr_offs; +static int arm64_func_va_list_vr_offs; +static int arm64_func_sub_sp_offset; + +ST_FUNC void gfunc_prolog(CType *func_type) +{ + int n = 0; + int i = 0; + Sym *sym; + CType **t; + unsigned long *a; + + // Why doesn't the caller (gen_function) set func_vt? + func_vt = func_type->ref->type; + func_vc = 144; // offset of where x8 is stored + + for (sym = func_type->ref; sym; sym = sym->next) + ++n; + t = tcc_malloc(n * sizeof(*t)); + a = tcc_malloc(n * sizeof(*a)); + + for (sym = func_type->ref; sym; sym = sym->next) + t[i++] = &sym->type; + + arm64_func_va_list_stack = arm64_pcs(n - 1, t, a); + + o(0xa9b27bfd); // stp x29,x30,[sp,#-224]! + o(0xad0087e0); // stp q0,q1,[sp,#16] + o(0xad018fe2); // stp q2,q3,[sp,#48] + o(0xad0297e4); // stp q4,q5,[sp,#80] + o(0xad039fe6); // stp q6,q7,[sp,#112] + o(0xa90923e8); // stp x8,x8,[sp,#144] + o(0xa90a07e0); // stp x0,x1,[sp,#160] + o(0xa90b0fe2); // stp x2,x3,[sp,#176] + o(0xa90c17e4); // stp x4,x5,[sp,#192] + o(0xa90d1fe6); // stp x6,x7,[sp,#208] + + arm64_func_va_list_gr_offs = -64; + arm64_func_va_list_vr_offs = -128; + + for (i = 1, sym = func_type->ref->next; sym; i++, sym = sym->next) { + int off = (a[i] < 16 ? 160 + a[i] / 2 * 8 : + a[i] < 32 ? 16 + (a[i] - 16) / 2 * 16 : + 224 + ((a[i] - 32) >> 1 << 1)); + sym_push(sym->v & ~SYM_FIELD, &sym->type, + (a[i] & 1 ? VT_LLOCAL : VT_LOCAL) | lvalue_type(sym->type.t), + off); + + if (a[i] < 16) { + int align, size = type_size(&sym->type, &align); + arm64_func_va_list_gr_offs = (a[i] / 2 - 7 + + (!(a[i] & 1) && size > 8)) * 8; + } + else if (a[i] < 32) { + int hfa = arm64_hfa(&sym->type, 0); + arm64_func_va_list_vr_offs = (a[i] / 2 - 16 + + (hfa ? 
hfa : 1)) * 16; + } + + // HFAs of float and double need to be written differently: + if (16 <= a[i] && a[i] < 32 && (sym->type.t & VT_BTYPE) == VT_STRUCT) { + int j, sz, k = arm64_hfa(&sym->type, &sz); + if (sz < 16) + for (j = 0; j < k; j++) { + o(0x3d0003e0 | -(sz & 8) << 27 | (sz & 4) << 29 | + ((a[i] - 16) / 2 + j) | (off / sz + j) << 10); + // str ([sdq])(*),[sp,#(j * sz)] + } + } + } + + tcc_free(a); + tcc_free(t); + + o(0x910003fd); // mov x29,sp + arm64_func_sub_sp_offset = ind; + // In gfunc_epilog these will be replaced with code to decrement SP: + o(0xd503201f); // nop + o(0xd503201f); // nop + loc = 0; +} + +ST_FUNC void gen_va_start(void) +{ + int r; + --vtop; // we don't need the "arg" + gaddrof(); + r = intr(gv(RC_INT)); + + if (arm64_func_va_list_stack) { + //xx could use add (immediate) here + arm64_movimm(30, arm64_func_va_list_stack + 224); + o(0x8b1e03be); // add x30,x29,x30 + } + else + o(0x910383be); // add x30,x29,#224 + o(0xf900001e | r << 5); // str x30,[x(r)] + + if (arm64_func_va_list_gr_offs) { + if (arm64_func_va_list_stack) + o(0x910383be); // add x30,x29,#224 + o(0xf900041e | r << 5); // str x30,[x(r),#8] + } + + if (arm64_func_va_list_vr_offs) { + o(0x910243be); // add x30,x29,#144 + o(0xf900081e | r << 5); // str x30,[x(r),#16] + } + + arm64_movimm(30, arm64_func_va_list_gr_offs); + o(0xb900181e | r << 5); // str w30,[x(r),#24] + + arm64_movimm(30, arm64_func_va_list_vr_offs); + o(0xb9001c1e | r << 5); // str w30,[x(r),#28] + + --vtop; +} + +ST_FUNC void gen_va_arg(CType *t) +{ + int align, size = type_size(t, &align); + int fsize, hfa = arm64_hfa(t, &fsize); + int r0, r1; + + if (is_float(t->t)) { + hfa = 1; + fsize = size; + } + + gaddrof(); + r0 = intr(gv(RC_INT)); + r1 = get_reg(RC_INT); + vtop[0].r = r1 | lvalue_type(t->t); + r1 = intr(r1); + + if (!hfa) { + uint32_t n = size > 16 ? 
8 : (size + 7) & -8; + if (size == 16 && align == 16) + tcc_error("va_arg(ap, __uint128_t) unimplemented"); + o(0xb940181e | r0 << 5); // ldr w30,[x(r0),#24] // __gr_offs + o(0x310003c0 | r1 | n << 10); // adds w(r1),w30,#(n) + o(0x540000ad); // b.le .+20 + o(0xf9400000 | r1 | r0 << 5); // ldr x(r1),[x(r0)] // __stack + o(0x9100001e | r1 << 5 | n << 10); // add x30,x(r1),#(n) + o(0xf900001e | r0 << 5); // str x30,[x(r0)] // __stack + o(0x14000004); // b .+16 + o(0xb9001800 | r1 | r0 << 5); // str w(r1),[x(r0),#24] // __gr_offs + o(0xf9400400 | r1 | r0 << 5); // ldr x(r1),[x(r0),#8] // __gr_top + o(0x8b3ec000 | r1 | r1 << 5); // add x(r1),x(r1),w30,sxtw + if (size > 16) + o(0xf9400000 | r1 | r1 << 5); // ldr x(r1),[x(r1)] + } + else { + uint32_t rsz = hfa << 4; + uint32_t ssz = (size + 7) & -(uint32_t)8; + uint32_t b1; + if (hfa > 1 && fsize < 16) + // We may need to change the layout of this HFA + tcc_error("va_arg(ap, HFA) unimplemented"); + o(0xb9401c1e | r0 << 5); // ldr w30,[x(r0),#28] // __vr_offs + o(0x310003c0 | r1 | rsz << 10); // adds w(r1),w30,#(rsz) + b1 = ind; o(0x5400000d); // b.le lab1 + o(0xf9400000 | r1 | r0 << 5); // ldr x(r1),[x(r0)] // __stack + if (fsize == 16) { + o(0x91003c00 | r1 | r1 << 5); // add x(r1),x(r1),#15 + o(0x927cec00 | r1 | r1 << 5); // and x(r1),x(r1),#-16 + } + o(0x9100001e | r1 << 5 | ssz << 10); // add x30,x(r1),#(ssz) + o(0xf900001e | r0 << 5); // str x30,[x(r0)] // __stack + o(0x14000004); // b .+16 + // lab1: + *(uint32_t *)(cur_text_section->data + b1) = + (0x5400000d | (ind - b1) << 3); + o(0xb9001c00 | r1 | r0 << 5); // str w(r1),[x(r0),#28] // __vr_offs + o(0xf9400800 | r1 | r0 << 5); // ldr x(r1),[x(r0),#16] // __vr_top + o(0x8b3ec000 | r1 | r1 << 5); // add x(r1),x(r1),w30,sxtw + } +} + +ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *align) +{ + return 0; +} + +ST_FUNC void greturn(void) +{ + CType *t = &func_vt; + unsigned long a; + + arm64_pcs(0, &t, &a); + switch (a) { + case -1: + break; + case 0: + if ((func_vt.t & VT_BTYPE) == VT_STRUCT) { + int align, size = type_size(&func_vt, &align); + gaddrof(); + gv(RC_R(0)); + gen_sload(0, size); + } + else + gv(RC_IRET); + break; + case 1: { + CType type = func_vt; + mk_pointer(&type); + vset(&type, VT_LOCAL | VT_LVAL, func_vc); + indir(); + vswap(); + vstore(); + break; + } + case 16: + if ((func_vt.t & VT_BTYPE) == VT_STRUCT) { + int j, sz, n = arm64_hfa(&vtop->type, &sz); + gaddrof(); + gv(RC_R(0)); + for (j = 0; j < n; j++) + o(0x3d400000 | + (sz & 16) << 19 | -(sz & 8) << 27 | (sz & 4) << 29 | + j | j << 10); // ldr ([sdq])(*),[x0,#(j * sz)] + } + else + gv(RC_FRET); + break; + default: + assert(0); + } +} + +ST_FUNC void gfunc_epilog(void) +{ + if (loc) { + // Insert instructions to subtract size of stack frame from SP. + uint32_t *ptr = + (uint32_t *)(cur_text_section->data + arm64_func_sub_sp_offset); + uint64_t diff = (-loc + 15) & ~15; + if (!(diff >> 24)) { + if (diff & 0xfff) // sub sp,sp,#(diff & 0xfff) + ptr[0] = 0xd10003ff | (diff & 0xfff) << 10; + if (diff >> 12) // sub sp,sp,#(diff >> 12),lsl #12 + ptr[1] = 0xd14003ff | (diff >> 12) << 10; + } + else { + // In this case we may subtract more than necessary, + // but always less than 17/16 of what we were aiming for. 
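+        // Round diff up until it fits in 16 bits: the first loop counts
+        // 16-bit chunks for the mov's shift (i), the second counts single
+        // bits for the sub's shift (j); each rounding step can only
+        // overshoot. For example, diff = 0x123456 becomes
+        //     mov x16,#0x13,lsl #16 ; sub sp,sp,x16
+        // which subtracts 0x130000, about 4% more than requested.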
+ int i = 0; + int j = 0; + while (diff >> 20) { + diff = (diff + 0xffff) >> 16; + ++i; + } + while (diff >> 16) { + diff = (diff + 1) >> 1; + ++j; + } + ptr[0] = 0xd2800010 | diff << 5 | i << 21; + // mov x16,#(diff),lsl #(16 * i) + ptr[1] = 0xcb3063ff | j << 10; + // sub sp,sp,x16,lsl #(j) + } + } + o(0x910003bf); // mov sp,x29 + o(0xa8ce7bfd); // ldp x29,x30,[sp],#224 + + o(0xd65f03c0); // ret +} + +// Generate forward branch to label: +ST_FUNC int gjmp(int t) +{ + int r = ind; + o(t); + return r; +} + +// Generate branch to known address: +ST_FUNC void gjmp_addr(int a) +{ + assert(a - ind + 0x8000000 < 0x10000000); + o(0x14000000 | ((a - ind) >> 2 & 0x3ffffff)); +} + +ST_FUNC int gtst(int inv, int t) +{ + int bt = vtop->type.t & VT_BTYPE; + if (bt == VT_LDOUBLE) { + int a, b, f = fltr(gv(RC_FLOAT)); + a = get_reg(RC_INT); + vpushi(0); + vtop[0].r = a; + b = get_reg(RC_INT); + a = intr(a); + b = intr(b); + o(0x4e083c00 | a | f << 5); // mov x(a),v(f).d[0] + o(0x4e183c00 | b | f << 5); // mov x(b),v(f).d[1] + o(0xaa000400 | a | a << 5 | b << 16); // orr x(a),x(a),x(b),lsl #1 + o(0xb4000040 | a | !!inv << 24); // cbz/cbnz x(a),.+8 + --vtop; + } + else if (bt == VT_FLOAT || bt == VT_DOUBLE) { + int a = fltr(gv(RC_FLOAT)); + o(0x1e202008 | a << 5 | (bt != VT_FLOAT) << 22); // fcmp + o(0x54000040 | !!inv); // b.eq/b.ne .+8 + } + else { + int ll = (bt == VT_PTR || bt == VT_LLONG); + int a = intr(gv(RC_INT)); + o(0x34000040 | a | !!inv << 24 | ll << 31); // cbz/cbnz wA,.+8 + } + --vtop; + return gjmp(t); +} + +static void arm64_gen_opil(int op, int l) +{ + int x, a, b; + gv2(RC_INT, RC_INT); + assert(vtop[-1].r < VT_CONST && vtop[0].r < VT_CONST); + a = intr(vtop[-1].r); + b = intr(vtop[0].r); + vtop -= 2; + x = get_reg(RC_INT); + ++vtop; + vtop[0].r = x; + x = intr(x); + + switch (op) { + case '%': + // Use x30 for quotient: + o(0x1ac00c00 | l << 31 | 30 | a << 5 | b << 16); // sdiv + o(0x1b008000 | l << 31 | x | 30 << 5 | b << 16 | a << 10); // msub + break; + case '&': + o(0x0a000000 | l << 31 | x | a << 5 | b << 16); // and + break; + case '*': + o(0x1b007c00 | l << 31 | x | a << 5 | b << 16); // mul + break; + case '+': + o(0x0b000000 | l << 31 | x | a << 5 | b << 16); // add + break; + case '-': + o(0x4b000000 | l << 31 | x | a << 5 | b << 16); // sub + break; + case '/': + o(0x1ac00c00 | l << 31 | x | a << 5 | b << 16); // sdiv + break; + case '^': + o(0x4a000000 | l << 31 | x | a << 5 | b << 16); // eor + break; + case '|': + o(0x2a000000 | l << 31 | x | a << 5 | b << 16); // orr + break; + case TOK_EQ: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9f17e0 | x); // cset wA,eq + break; + case TOK_GE: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9fb7e0 | x); // cset wA,ge + break; + case TOK_GT: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9fd7e0 | x); // cset wA,gt + break; + case TOK_LE: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9fc7e0 | x); // cset wA,le + break; + case TOK_LT: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9fa7e0 | x); // cset wA,lt + break; + case TOK_NE: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9f07e0 | x); // cset wA,ne + break; + case TOK_SAR: + o(0x1ac02800 | l << 31 | x | a << 5 | b << 16); // asr + break; + case TOK_SHL: + o(0x1ac02000 | l << 31 | x | a << 5 | b << 16); // lsl + break; + case TOK_SHR: + o(0x1ac02400 | l << 31 | x | a << 5 | b << 16); // lsr + break; + case TOK_UDIV: + case TOK_PDIV: + o(0x1ac00800 | l << 31 | x | a << 5 | b << 16); // udiv + 
break; + case TOK_UGE: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9f37e0 | x); // cset wA,cs + break; + case TOK_UGT: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9f97e0 | x); // cset wA,hi + break; + case TOK_ULT: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9f27e0 | x); // cset wA,cc + break; + case TOK_ULE: + o(0x6b00001f | l << 31 | a << 5 | b << 16); // cmp + o(0x1a9f87e0 | x); // cset wA,ls + break; + case TOK_UMOD: + // Use x30 for quotient: + o(0x1ac00800 | l << 31 | 30 | a << 5 | b << 16); // udiv + o(0x1b008000 | l << 31 | x | 30 << 5 | b << 16 | a << 10); // msub + break; + default: + assert(0); + } +} + +ST_FUNC void gen_opi(int op) +{ + arm64_gen_opil(op, 0); +} + +ST_FUNC void gen_opl(int op) +{ + arm64_gen_opil(op, 1); +} + +ST_FUNC void gen_opf(int op) +{ + int x, a, b, dbl; + + if (vtop[0].type.t == VT_LDOUBLE) { + CType type = vtop[0].type; + int func = 0; + int cond = -1; + switch (op) { + case '*': func = TOK___multf3; break; + case '+': func = TOK___addtf3; break; + case '-': func = TOK___subtf3; break; + case '/': func = TOK___divtf3; break; + case TOK_EQ: func = TOK___eqtf2; cond = 1; break; + case TOK_NE: func = TOK___netf2; cond = 0; break; + case TOK_LT: func = TOK___lttf2; cond = 10; break; + case TOK_GE: func = TOK___getf2; cond = 11; break; + case TOK_LE: func = TOK___letf2; cond = 12; break; + case TOK_GT: func = TOK___gttf2; cond = 13; break; + default: assert(0); break; + } + vpush_global_sym(&func_old_type, func); + vrott(3); + gfunc_call(2); + vpushi(0); + vtop->r = cond < 0 ? REG_FRET : REG_IRET; + if (cond < 0) + vtop->type = type; + else { + o(0x7100001f); // cmp w0,#0 + o(0x1a9f07e0 | cond << 12); // cset w0,(cond) + } + return; + } + + dbl = vtop[0].type.t != VT_FLOAT; + gv2(RC_FLOAT, RC_FLOAT); + assert(vtop[-1].r < VT_CONST && vtop[0].r < VT_CONST); + a = fltr(vtop[-1].r); + b = fltr(vtop[0].r); + vtop -= 2; + switch (op) { + case TOK_EQ: case TOK_NE: + case TOK_LT: case TOK_GE: case TOK_LE: case TOK_GT: + x = get_reg(RC_INT); + ++vtop; + vtop[0].r = x; + x = intr(x); + break; + default: + x = get_reg(RC_FLOAT); + ++vtop; + vtop[0].r = x; + x = fltr(x); + break; + } + + switch (op) { + case '*': + o(0x1e200800 | dbl << 22 | x | a << 5 | b << 16); // fmul + break; + case '+': + o(0x1e202800 | dbl << 22 | x | a << 5 | b << 16); // fadd + break; + case '-': + o(0x1e203800 | dbl << 22 | x | a << 5 | b << 16); // fsub + break; + case '/': + o(0x1e201800 | dbl << 22 | x | a << 5 | b << 16); // fdiv + break; + case TOK_EQ: + o(0x1e202000 | dbl << 22 | a << 5 | b << 16); // fcmp + o(0x1a9f17e0 | x); // cset w(x),eq + break; + case TOK_GE: + o(0x1e202000 | dbl << 22 | a << 5 | b << 16); // fcmp + o(0x1a9fb7e0 | x); // cset w(x),ge + break; + case TOK_GT: + o(0x1e202000 | dbl << 22 | a << 5 | b << 16); // fcmp + o(0x1a9fd7e0 | x); // cset w(x),gt + break; + case TOK_LE: + o(0x1e202000 | dbl << 22 | a << 5 | b << 16); // fcmp + o(0x1a9f87e0 | x); // cset w(x),ls + break; + case TOK_LT: + o(0x1e202000 | dbl << 22 | a << 5 | b << 16); // fcmp + o(0x1a9f57e0 | x); // cset w(x),mi + break; + case TOK_NE: + o(0x1e202000 | dbl << 22 | a << 5 | b << 16); // fcmp + o(0x1a9f07e0 | x); // cset w(x),ne + break; + default: + assert(0); + } +} + +// Generate sign extension from 32 to 64 bits: +ST_FUNC void gen_cvt_sxtw(void) +{ + int r = intr(gv(RC_INT)); + o(0x93407c00 | r | r << 5); // sxtw x(r),w(r) +} + +ST_FUNC void gen_cvt_itof(int t) +{ + if (t == VT_LDOUBLE) { + int f = vtop->type.t; + int func = (f & VT_BTYPE) == 
VT_LLONG ? + (f & VT_UNSIGNED ? TOK___floatunditf : TOK___floatditf) : + (f & VT_UNSIGNED ? TOK___floatunsitf : TOK___floatsitf); + vpush_global_sym(&func_old_type, func); + vrott(2); + gfunc_call(1); + vpushi(0); + vtop->type.t = t; + vtop->r = REG_FRET; + return; + } + else { + int d, n = intr(gv(RC_INT)); + int s = !(vtop->type.t & VT_UNSIGNED); + int l = ((vtop->type.t & VT_BTYPE) == VT_LLONG); + --vtop; + d = get_reg(RC_FLOAT); + ++vtop; + vtop[0].r = d; + o(0x1e220000 | !s << 16 | (t != VT_FLOAT) << 22 | fltr(d) | + l << 31 | n << 5); // [us]cvtf [sd](d),[wx](n) + } +} + +ST_FUNC void gen_cvt_ftoi(int t) +{ + if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { + int func = (t & VT_BTYPE) == VT_LLONG ? + (t & VT_UNSIGNED ? TOK___fixunstfdi : TOK___fixtfdi) : + (t & VT_UNSIGNED ? TOK___fixunstfsi : TOK___fixtfsi); + vpush_global_sym(&func_old_type, func); + vrott(2); + gfunc_call(1); + vpushi(0); + vtop->type.t = t; + vtop->r = REG_IRET; + return; + } + else { + int d, n = fltr(gv(RC_FLOAT)); + int l = ((vtop->type.t & VT_BTYPE) != VT_FLOAT); + --vtop; + d = get_reg(RC_INT); + ++vtop; + vtop[0].r = d; + o(0x1e380000 | + !!(t & VT_UNSIGNED) << 16 | + ((t & VT_BTYPE) == VT_LLONG) << 31 | intr(d) | + l << 22 | n << 5); // fcvtz[su] [wx](d),[sd](n) + } +} + +ST_FUNC void gen_cvt_ftof(int t) +{ + int f = vtop[0].type.t; + assert(t == VT_FLOAT || t == VT_DOUBLE || t == VT_LDOUBLE); + assert(f == VT_FLOAT || f == VT_DOUBLE || f == VT_LDOUBLE); + if (t == f) + return; + + if (t == VT_LDOUBLE || f == VT_LDOUBLE) { + int func = (t == VT_LDOUBLE) ? + (f == VT_FLOAT ? TOK___extendsftf2 : TOK___extenddftf2) : + (t == VT_FLOAT ? TOK___trunctfsf2 : TOK___trunctfdf2); + vpush_global_sym(&func_old_type, func); + vrott(2); + gfunc_call(1); + vpushi(0); + vtop->type.t = t; + vtop->r = REG_FRET; + } + else { + int x, a; + gv(RC_FLOAT); + assert(vtop[0].r < VT_CONST); + a = fltr(vtop[0].r); + --vtop; + x = get_reg(RC_FLOAT); + ++vtop; + vtop[0].r = x; + x = fltr(x); + + if (f == VT_FLOAT) + o(0x1e22c000 | x | a << 5); // fcvt d(x),s(a) + else + o(0x1e624000 | x | a << 5); // fcvt s(x),d(a) + } +} + +ST_FUNC void ggoto(void) +{ + arm64_gen_bl_or_b(1); + --vtop; +} + +ST_FUNC void gen_vla_sp_save(int addr) { + tcc_error("variable length arrays unsupported for this target"); +} + +ST_FUNC void gen_vla_sp_restore(int addr) { + tcc_error("variable length arrays unsupported for this target"); +} + +ST_FUNC void gen_vla_alloc(CType *type, int align) { + tcc_error("variable length arrays unsupported for this target"); +} + +/* end of A64 code generator */ +/*************************************************************/ +#endif +/*************************************************************/ diff --git a/configure b/configure index f6df9ada..c75d4601 100755 --- a/configure +++ b/configure @@ -99,6 +99,9 @@ classify_cpu () esac cpu="armv4l" ;; + aarch64) + cpu="aarch64" + ;; alpha) cpu="alpha" ;; @@ -435,6 +438,9 @@ elif test "$cpu" = "armv4l" ; then echo "ARCH=arm" >> config.mak echo "#define HOST_ARM 1" >> $TMPH echo "#define TCC_ARM_VERSION $cpuver" >> $TMPH +elif test "$cpu" = "aarch64" ; then + echo "ARCH=arm64" >> config.mak + echo "#define HOST_ARM64 1" >> $TMPH elif test "$cpu" = "powerpc" ; then echo "ARCH=ppc" >> config.mak echo "#define HOST_PPC 1" >> $TMPH diff --git a/conftest.c b/conftest.c index ddb7a20b..fa07a1b5 100644 --- a/conftest.c +++ b/conftest.c @@ -7,6 +7,8 @@ # define TRIPLET_ARCH "x86_64" #elif defined(__arm__) # define TRIPLET_ARCH "arm" +#elif defined(__aarch64__) +# define TRIPLET_ARCH 
"aarch64" #else # define TRIPLET_ARCH "unknown" #endif diff --git a/elf.h b/elf.h index a3597f99..a40c736c 100644 --- a/elf.h +++ b/elf.h @@ -2336,6 +2336,117 @@ typedef Elf32_Addr Elf32_Conflict; #define R_AARCH64_NONE 0 /* No relocation. */ #define R_AARCH64_ABS64 257 /* Direct 64 bit. */ #define R_AARCH64_ABS32 258 /* Direct 32 bit. */ +#define R_AARCH64_ABS16 259 /* Direct 16-bit. */ +#define R_AARCH64_PREL64 260 /* PC-relative 64-bit. */ +#define R_AARCH64_PREL32 261 /* PC-relative 32-bit. */ +#define R_AARCH64_PREL16 262 /* PC-relative 16-bit. */ +#define R_AARCH64_MOVW_UABS_G0 263 /* Dir. MOVZ imm. from bits 15:0. */ +#define R_AARCH64_MOVW_UABS_G0_NC 264 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_UABS_G1 265 /* Dir. MOVZ imm. from bits 31:16. */ +#define R_AARCH64_MOVW_UABS_G1_NC 266 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_UABS_G2 267 /* Dir. MOVZ imm. from bits 47:32. */ +#define R_AARCH64_MOVW_UABS_G2_NC 268 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_UABS_G3 269 /* Dir. MOV{K,Z} imm. from 63:48. */ +#define R_AARCH64_MOVW_SABS_G0 270 /* Dir. MOV{N,Z} imm. from 15:0. */ +#define R_AARCH64_MOVW_SABS_G1 271 /* Dir. MOV{N,Z} imm. from 31:16. */ +#define R_AARCH64_MOVW_SABS_G2 272 /* Dir. MOV{N,Z} imm. from 47:32. */ +#define R_AARCH64_LD_PREL_LO19 273 /* PC-rel. LD imm. from bits 20:2. */ +#define R_AARCH64_ADR_PREL_LO21 274 /* PC-rel. ADR imm. from bits 20:0. */ +#define R_AARCH64_ADR_PREL_PG_HI21 275 /* Page-rel. ADRP imm. from 32:12. */ +#define R_AARCH64_ADR_PREL_PG_HI21_NC 276 /* Likewise; no overflow check. */ +#define R_AARCH64_ADD_ABS_LO12_NC 277 /* Dir. ADD imm. from bits 11:0. */ +#define R_AARCH64_LDST8_ABS_LO12_NC 278 /* Likewise for LD/ST; no check. */ +#define R_AARCH64_TSTBR14 279 /* PC-rel. TBZ/TBNZ imm. from 15:2. */ +#define R_AARCH64_CONDBR19 280 /* PC-rel. cond. br. imm. from 20:2. */ +#define R_AARCH64_JUMP26 282 /* PC-rel. B imm. from bits 27:2. */ +#define R_AARCH64_CALL26 283 /* Likewise for CALL. */ +#define R_AARCH64_LDST16_ABS_LO12_NC 284 /* Dir. ADD imm. from bits 11:1. */ +#define R_AARCH64_LDST32_ABS_LO12_NC 285 /* Likewise for bits 11:2. */ +#define R_AARCH64_LDST64_ABS_LO12_NC 286 /* Likewise for bits 11:3. */ +#define R_AARCH64_MOVW_PREL_G0 287 /* PC-rel. MOV{N,Z} imm. from 15:0. */ +#define R_AARCH64_MOVW_PREL_G0_NC 288 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_PREL_G1 289 /* PC-rel. MOV{N,Z} imm. from 31:16. */ +#define R_AARCH64_MOVW_PREL_G1_NC 290 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_PREL_G2 291 /* PC-rel. MOV{N,Z} imm. from 47:32. */ +#define R_AARCH64_MOVW_PREL_G2_NC 292 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_PREL_G3 293 /* PC-rel. MOV{N,Z} imm. from 63:48. */ +#define R_AARCH64_LDST128_ABS_LO12_NC 299 /* Dir. ADD imm. from bits 11:4. */ +#define R_AARCH64_MOVW_GOTOFF_G0 300 /* GOT-rel. off. MOV{N,Z} imm. 15:0. */ +#define R_AARCH64_MOVW_GOTOFF_G0_NC 301 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_GOTOFF_G1 302 /* GOT-rel. o. MOV{N,Z} imm. 31:16. */ +#define R_AARCH64_MOVW_GOTOFF_G1_NC 303 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_GOTOFF_G2 304 /* GOT-rel. o. MOV{N,Z} imm. 47:32. */ +#define R_AARCH64_MOVW_GOTOFF_G2_NC 305 /* Likewise for MOVK; no check. */ +#define R_AARCH64_MOVW_GOTOFF_G3 306 /* GOT-rel. o. MOV{N,Z} imm. 63:48. */ +#define R_AARCH64_GOTREL64 307 /* GOT-relative 64-bit. */ +#define R_AARCH64_GOTREL32 308 /* GOT-relative 32-bit. */ +#define R_AARCH64_GOT_LD_PREL19 309 /* PC-rel. GOT off. 
load imm. 20:2. */ +#define R_AARCH64_LD64_GOTOFF_LO15 310 /* GOT-rel. off. LD/ST imm. 14:3. */ +#define R_AARCH64_ADR_GOT_PAGE 311 /* P-page-rel. GOT off. ADRP 32:12. */ +#define R_AARCH64_LD64_GOT_LO12_NC 312 /* Dir. GOT off. LD/ST imm. 11:3. */ +#define R_AARCH64_LD64_GOTPAGE_LO15 313 /* GOT-page-rel. GOT off. LD/ST 14:3 */ +#define R_AARCH64_TLSGD_ADR_PREL21 512 /* PC-relative ADR imm. 20:0. */ +#define R_AARCH64_TLSGD_ADR_PAGE21 513 /* page-rel. ADRP imm. 32:12. */ +#define R_AARCH64_TLSGD_ADD_LO12_NC 514 /* direct ADD imm. from 11:0. */ +#define R_AARCH64_TLSGD_MOVW_G1 515 /* GOT-rel. MOV{N,Z} 31:16. */ +#define R_AARCH64_TLSGD_MOVW_G0_NC 516 /* GOT-rel. MOVK imm. 15:0. */ +#define R_AARCH64_TLSLD_ADR_PREL21 517 /* Like 512; local dynamic model. */ +#define R_AARCH64_TLSLD_ADR_PAGE21 518 /* Like 513; local dynamic model. */ +#define R_AARCH64_TLSLD_ADD_LO12_NC 519 /* Like 514; local dynamic model. */ +#define R_AARCH64_TLSLD_MOVW_G1 520 /* Like 515; local dynamic model. */ +#define R_AARCH64_TLSLD_MOVW_G0_NC 521 /* Like 516; local dynamic model. */ +#define R_AARCH64_TLSLD_LD_PREL19 522 /* TLS PC-rel. load imm. 20:2. */ +#define R_AARCH64_TLSLD_MOVW_DTPREL_G2 523 /* TLS DTP-rel. MOV{N,Z} 47:32. */ +#define R_AARCH64_TLSLD_MOVW_DTPREL_G1 524 /* TLS DTP-rel. MOV{N,Z} 31:16. */ +#define R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC 525 /* Likewise; MOVK; no check. */ +#define R_AARCH64_TLSLD_MOVW_DTPREL_G0 526 /* TLS DTP-rel. MOV{N,Z} 15:0. */ +#define R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC 527 /* Likewise; MOVK; no check. */ +#define R_AARCH64_TLSLD_ADD_DTPREL_HI12 528 /* DTP-rel. ADD imm. from 23:12. */ +#define R_AARCH64_TLSLD_ADD_DTPREL_LO12 529 /* DTP-rel. ADD imm. from 11:0. */ +#define R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC 530 /* Likewise; no ovfl. check. */ +#define R_AARCH64_TLSLD_LDST8_DTPREL_LO12 531 /* DTP-rel. LD/ST imm. 11:0. */ +#define R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC 532 /* Likewise; no check. */ +#define R_AARCH64_TLSLD_LDST16_DTPREL_LO12 533 /* DTP-rel. LD/ST imm. 11:1. */ +#define R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC 534 /* Likewise; no check. */ +#define R_AARCH64_TLSLD_LDST32_DTPREL_LO12 535 /* DTP-rel. LD/ST imm. 11:2. */ +#define R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC 536 /* Likewise; no check. */ +#define R_AARCH64_TLSLD_LDST64_DTPREL_LO12 537 /* DTP-rel. LD/ST imm. 11:3. */ +#define R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC 538 /* Likewise; no check. */ +#define R_AARCH64_TLSIE_MOVW_GOTTPREL_G1 539 /* GOT-rel. MOV{N,Z} 31:16. */ +#define R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC 540 /* GOT-rel. MOVK 15:0. */ +#define R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 541 /* Page-rel. ADRP 32:12. */ +#define R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC 542 /* Direct LD off. 11:3. */ +#define R_AARCH64_TLSIE_LD_GOTTPREL_PREL19 543 /* PC-rel. load imm. 20:2. */ +#define R_AARCH64_TLSLE_MOVW_TPREL_G2 544 /* TLS TP-rel. MOV{N,Z} 47:32. */ +#define R_AARCH64_TLSLE_MOVW_TPREL_G1 545 /* TLS TP-rel. MOV{N,Z} 31:16. */ +#define R_AARCH64_TLSLE_MOVW_TPREL_G1_NC 546 /* Likewise; MOVK; no check. */ +#define R_AARCH64_TLSLE_MOVW_TPREL_G0 547 /* TLS TP-rel. MOV{N,Z} 15:0. */ +#define R_AARCH64_TLSLE_MOVW_TPREL_G0_NC 548 /* Likewise; MOVK; no check. */ +#define R_AARCH64_TLSLE_ADD_TPREL_HI12 549 /* TP-rel. ADD imm. 23:12. */ +#define R_AARCH64_TLSLE_ADD_TPREL_LO12 550 /* TP-rel. ADD imm. 11:0. */ +#define R_AARCH64_TLSLE_ADD_TPREL_LO12_NC 551 /* Likewise; no ovfl. check. */ +#define R_AARCH64_TLSLE_LDST8_TPREL_LO12 552 /* TP-rel. LD/ST off. 11:0. */ +#define R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC 553 /* Likewise; no ovfl. 
check. */ +#define R_AARCH64_TLSLE_LDST16_TPREL_LO12 554 /* TP-rel. LD/ST off. 11:1. */ +#define R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC 555 /* Likewise; no check. */ +#define R_AARCH64_TLSLE_LDST32_TPREL_LO12 556 /* TP-rel. LD/ST off. 11:2. */ +#define R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC 557 /* Likewise; no check. */ +#define R_AARCH64_TLSLE_LDST64_TPREL_LO12 558 /* TP-rel. LD/ST off. 11:3. */ +#define R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC 559 /* Likewise; no check. */ +#define R_AARCH64_TLSDESC_LD_PREL19 560 /* PC-rel. load immediate 20:2. */ +#define R_AARCH64_TLSDESC_ADR_PREL21 561 /* PC-rel. ADR immediate 20:0. */ +#define R_AARCH64_TLSDESC_ADR_PAGE21 562 /* Page-rel. ADRP imm. 32:12. */ +#define R_AARCH64_TLSDESC_LD64_LO12 563 /* Direct LD off. from 11:3. */ +#define R_AARCH64_TLSDESC_ADD_LO12 564 /* Direct ADD imm. from 11:0. */ +#define R_AARCH64_TLSDESC_OFF_G1 565 /* GOT-rel. MOV{N,Z} imm. 31:16. */ +#define R_AARCH64_TLSDESC_OFF_G0_NC 566 /* GOT-rel. MOVK imm. 15:0; no ck. */ +#define R_AARCH64_TLSDESC_LDR 567 /* Relax LDR. */ +#define R_AARCH64_TLSDESC_ADD 568 /* Relax ADD. */ +#define R_AARCH64_TLSDESC_CALL 569 /* Relax BLR. */ +#define R_AARCH64_TLSLE_LDST128_TPREL_LO12 570 /* TP-rel. LD/ST off. 11:4. */ +#define R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC 571 /* Likewise; no check. */ +#define R_AARCH64_TLSLD_LDST128_DTPREL_LO12 572 /* DTP-rel. LD/ST imm. 11:4. */ +#define R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC 573 /* Likewise; no check. */ #define R_AARCH64_COPY 1024 /* Copy symbol at runtime. */ #define R_AARCH64_GLOB_DAT 1025 /* Create GOT entry. */ #define R_AARCH64_JUMP_SLOT 1026 /* Create PLT entry. */ @@ -2344,6 +2455,7 @@ typedef Elf32_Addr Elf32_Conflict; #define R_AARCH64_TLS_DTPREL64 1029 /* Module-relative offset, 64 bit. */ #define R_AARCH64_TLS_TPREL64 1030 /* TP-relative offset, 64 bit. */ #define R_AARCH64_TLSDESC 1031 /* TLS Descriptor. */ +#define R_AARCH64_IRELATIVE 1032 /* STT_GNU_IFUNC relocation. */ /* ARM relocs. 
*/ diff --git a/include/stdarg.h b/include/stdarg.h index 5aa9d57b..06d592b9 100644 --- a/include/stdarg.h +++ b/include/stdarg.h @@ -46,6 +46,19 @@ typedef char *va_list; #define va_copy(dest, src) (dest) = (src) #define va_end(ap) +#elif defined(__aarch64__) +typedef struct { + void *__stack; + void *__gr_top; + void *__vr_top; + int __gr_offs; + int __vr_offs; +} va_list; +#define va_start(ap, last) __va_start(ap, last) +#define va_arg(ap, type) __va_arg(ap, type) +#define va_end(ap) +#define va_copy(dest, src) ((dest) = (src)) + #else /* __i386__ */ typedef char *va_list; /* only correct for i386 */ diff --git a/lib/Makefile b/lib/Makefile index 37d4711d..6e192b93 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -28,6 +28,11 @@ ifndef TARGET # native library ifeq ($(ARCH),arm) TARGET = arm XCC = $(CC) + else + ifeq ($(ARCH),arm64) + TARGET = arm64 + else + endif endif endif endif @@ -49,6 +54,7 @@ X86_64_O = libtcc1.o alloca86_64.o ARM_O = libtcc1.o armeabi.o alloca-arm.o WIN32_O = $(I386_O) crt1.o wincrt1.o dllcrt1.o dllmain.o chkstk.o WIN64_O = $(X86_64_O) crt1.o wincrt1.o dllcrt1.o dllmain.o chkstk.o +ARM64_O = lib-arm64.o # build TCC runtime library to contain PIC code, so it can be linked # into shared libraries @@ -86,6 +92,11 @@ ifeq "$(TARGET)" "arm" OBJ = $(addprefix $(DIR)/,$(ARM_O)) TGT = -DTCC_TARGET_ARM XCC ?= $(TCC) -B$(TOP) +else +ifeq "$(TARGET)" "arm64" + OBJ = $(addprefix $(DIR)/,$(ARM64_O)) + TGT = -DTCC_TARGET_ARM64 + XCC ?= $(TCC) -B$(TOP) else $(error libtcc1.a not supported on target '$(TARGET)') endif @@ -93,6 +104,7 @@ endif endif endif endif +endif XFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAGS) $(TGT) diff --git a/lib/lib-arm64.c b/lib/lib-arm64.c new file mode 100644 index 00000000..fd73506e --- /dev/null +++ b/lib/lib-arm64.c @@ -0,0 +1,652 @@ +/* + * TCC runtime library for arm64. + * + * Copyright (c) 2015 Edmund Grimley Evans + * + * Copying and distribution of this file, with or without modification, + * are permitted in any medium without royalty provided the copyright + * notice and this notice are preserved. This file is offered as-is, + * without any warranty. 
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+void __clear_cache(char *beg, char *end)
+{
+#warning __clear_cache not yet implemented
+}
+
+typedef struct {
+    uint64_t x0, x1;
+} u128_t;
+
+static long double f3_zero(int sgn)
+{
+    long double f;
+    u128_t x = { 0, (uint64_t)sgn << 63 };
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+static long double f3_infinity(int sgn)
+{
+    long double f;
+    u128_t x = { 0, (uint64_t)sgn << 63 | 0x7fff000000000000 };
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+static long double f3_NaN(void)
+{
+    long double f;
+#if 0
+    // ARM's default NaN usually has just the top fraction bit set:
+    u128_t x = { 0, 0x7fff800000000000 };
+#else
+    // GCC's library sets all fraction bits:
+    u128_t x = { -1, 0x7fffffffffffffff };
+#endif
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+static int fp3_convert_NaN(long double *f, int sgn, u128_t mnt)
+{
+    u128_t x = { mnt.x0,
+                 mnt.x1 | 0x7fff800000000000 | (uint64_t)sgn << 63 };
+    memcpy(f, &x, 16);
+    return 1;
+}
+
+static int fp3_detect_NaNs(long double *f,
+                           int a_sgn, int a_exp, u128_t a,
+                           int b_sgn, int b_exp, u128_t b)
+{
+    // Detect signalling NaNs:
+    if (a_exp == 32767 && (a.x0 | a.x1 << 16) && !(a.x1 >> 47 & 1))
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_exp == 32767 && (b.x0 | b.x1 << 16) && !(b.x1 >> 47 & 1))
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    // Detect quiet NaNs:
+    if (a_exp == 32767 && (a.x0 | a.x1 << 16))
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_exp == 32767 && (b.x0 | b.x1 << 16))
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    return 0;
+}
+
+/* Unpack an IEEE binary128 value into sign, biased exponent and mantissa.
+   For a non-zero exponent field the implicit integer bit is made explicit
+   at bit 48 of x1; a zero (subnormal) exponent is replaced by 1. */
+static void f3_unpack(int *sgn, int32_t *exp, u128_t *mnt, long double f)
+{
+    u128_t x;
+    memcpy(&x, &f, 16);
+    *sgn = x.x1 >> 63;
+    *exp = x.x1 >> 48 & 32767;
+    x.x1 = x.x1 << 16 >> 16;
+    if (*exp)
+        x.x1 |= (uint64_t)1 << 48;
+    else
+        *exp = 1;
+    *mnt = x;
+}
+
+static u128_t f3_normalise(int32_t *exp, u128_t mnt)
+{
+    int sh;
+    if (!(mnt.x0 | mnt.x1))
+        return mnt;
+    if (!mnt.x1) {
+        mnt.x1 = mnt.x0;
+        mnt.x0 = 0;
+        *exp -= 64;
+    }
+    for (sh = 32; sh; sh >>= 1) {
+        if (!(mnt.x1 >> (64 - sh))) {
+            mnt.x1 = mnt.x1 << sh | mnt.x0 >> (64 - sh);
+            mnt.x0 = mnt.x0 << sh;
+            *exp -= sh;
+        }
+    }
+    return mnt;
+}
+
+/* Shift right by sh, folding any bits shifted out into bit 0 (the
+   "sticky" bit), so rounding can still tell the result is inexact. */
+static u128_t f3_sticky_shift(int32_t sh, u128_t x)
+{
+    if (sh >= 128) {
+        x.x0 = !!(x.x0 | x.x1);
+        x.x1 = 0;
+        return x;
+    }
+    if (sh >= 64) {
+        x.x0 = x.x1 | !!x.x0;
+        x.x1 = 0;
+        sh -= 64;
+    }
+    if (sh > 0) {
+        x.x0 = x.x0 >> sh | x.x1 << (64 - sh) | !!(x.x0 << (64 - sh));
+        x.x1 = x.x1 >> sh;
+    }
+    return x;
+}
+
+static long double f3_round(int sgn, int32_t exp, u128_t x)
+{
+    long double f;
+    int error;
+
+    if (exp > 0) {
+        x = f3_sticky_shift(13, x);
+    }
+    else {
+        x = f3_sticky_shift(14 - exp, x);
+        exp = 0;
+    }
+
+    error = x.x0 & 3;
+    x.x0 = x.x0 >> 2 | x.x1 << 62;
+    x.x1 = x.x1 >> 2;
+
+    if (error == 3 || ((error == 2) & (x.x0 & 1))) {
+        if (!++x.x0) {
+            ++x.x1;
+            if (x.x1 == (uint64_t)1 << 48)
+                exp = 1;
+            else if (x.x1 == (uint64_t)1 << 49) {
+                ++exp;
+                x.x0 = x.x0 >> 1 | x.x1 << 63;
+                x.x1 = x.x1 >> 1;
+            }
+        }
+    }
+
+    if (exp >= 32767)
+        return f3_infinity(sgn);
+
+    x.x1 = x.x1 << 16 >> 16 | (uint64_t)exp << 48 | (uint64_t)sgn << 63;
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+static long double f3_add(long double fa, long double fb, int neg)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    b_sgn ^= neg;
+
+    // Handle infinities and zeroes:
+    if (a_exp == 32767 && b_exp ==
32767 && a_sgn != b_sgn) + return f3_NaN(); + if (a_exp == 32767) + return f3_infinity(a_sgn); + if (b_exp == 32767) + return f3_infinity(b_sgn); + if (!(a.x0 | a.x1 | b.x0 | b.x1)) + return f3_zero(a_sgn & b_sgn); + + a.x1 = a.x1 << 3 | a.x0 >> 61; + a.x0 = a.x0 << 3; + b.x1 = b.x1 << 3 | b.x0 >> 61; + b.x0 = b.x0 << 3; + + if (a_exp <= b_exp) { + a = f3_sticky_shift(b_exp - a_exp, a); + a_exp = b_exp; + } + else { + b = f3_sticky_shift(a_exp - b_exp, b); + b_exp = a_exp; + } + + x_sgn = a_sgn; + x_exp = a_exp; + if (a_sgn == b_sgn) { + x.x0 = a.x0 + b.x0; + x.x1 = a.x1 + b.x1 + (x.x0 < a.x0); + } + else { + x.x0 = a.x0 - b.x0; + x.x1 = a.x1 - b.x1 - (x.x0 > a.x0); + if (x.x1 >> 63) { + x_sgn ^= 1; + x.x0 = -x.x0; + x.x1 = -x.x1 - !!x.x0; + } + } + + if (!(x.x0 | x.x1)) + return f3_zero(0); + + x = f3_normalise(&x_exp, x); + + return f3_round(x_sgn, x_exp + 12, x); +} + +long double __addtf3(long double a, long double b) +{ + return f3_add(a, b, 0); +} + +long double __subtf3(long double a, long double b) +{ + return f3_add(a, b, 1); +} + +long double __multf3(long double fa, long double fb) +{ + u128_t a, b, x; + int32_t a_exp, b_exp, x_exp; + int a_sgn, b_sgn, x_sgn; + long double fx; + + f3_unpack(&a_sgn, &a_exp, &a, fa); + f3_unpack(&b_sgn, &b_exp, &b, fb); + + if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b)) + return fx; + + // Handle infinities and zeroes: + if ((a_exp == 32767 && !(b.x0 | b.x1)) || + (b_exp == 32767 && !(a.x0 | a.x1))) + return f3_NaN(); + if (a_exp == 32767 || b_exp == 32767) + return f3_infinity(a_sgn ^ b_sgn); + if (!(a.x0 | a.x1) || !(b.x0 | b.x1)) + return f3_zero(a_sgn ^ b_sgn); + + a = f3_normalise(&a_exp, a); + b = f3_normalise(&b_exp, b); + + x_sgn = a_sgn ^ b_sgn; + x_exp = a_exp + b_exp - 16352; + + { + // Convert to base (1 << 30), discarding bottom 6 bits, which are zero, + // so there are (32, 30, 30, 30) bits in (a3, a2, a1, a0): + uint64_t a0 = a.x0 << 28 >> 34; + uint64_t b0 = b.x0 << 28 >> 34; + uint64_t a1 = a.x0 >> 36 | a.x1 << 62 >> 34; + uint64_t b1 = b.x0 >> 36 | b.x1 << 62 >> 34; + uint64_t a2 = a.x1 << 32 >> 34; + uint64_t b2 = b.x1 << 32 >> 34; + uint64_t a3 = a.x1 >> 32; + uint64_t b3 = b.x1 >> 32; + // Use 16 small multiplications and additions that do not overflow: + uint64_t x0 = a0 * b0; + uint64_t x1 = (x0 >> 30) + a0 * b1 + a1 * b0; + uint64_t x2 = (x1 >> 30) + a0 * b2 + a1 * b1 + a2 * b0; + uint64_t x3 = (x2 >> 30) + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + uint64_t x4 = (x3 >> 30) + a1 * b3 + a2 * b2 + a3 * b1; + uint64_t x5 = (x4 >> 30) + a2 * b3 + a3 * b2; + uint64_t x6 = (x5 >> 30) + a3 * b3; + // We now have (64, 30, 30, ...) bits in (x6, x5, x4, ...). + // Take the top 128 bits, setting bottom bit if any lower bits were set: + uint64_t y0 = (x5 << 34 | x4 << 34 >> 30 | x3 << 34 >> 60 | + !!(x3 << 38 | (x2 | x1 | x0) << 34)); + uint64_t y1 = x6; + // Top bit may be zero. 
Renormalise: + if (!(y1 >> 63)) { + y1 = y1 << 1 | y0 >> 63; + y0 = y0 << 1; + --x_exp; + } + x.x0 = y0; + x.x1 = y1; + } + + return f3_round(x_sgn, x_exp, x); +} + +long double __divtf3(long double fa, long double fb) +{ + u128_t a, b, x; + int32_t a_exp, b_exp, x_exp; + int a_sgn, b_sgn, x_sgn, i; + long double fx; + + f3_unpack(&a_sgn, &a_exp, &a, fa); + f3_unpack(&b_sgn, &b_exp, &b, fb); + + if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b)) + return fx; + + // Handle infinities and zeroes: + if ((a_exp == 32767 && b_exp == 32767) || + (!(a.x0 | a.x1) && !(b.x0 | b.x1))) + return f3_NaN(); + if (a_exp == 32767 || !(b.x0 | b.x1)) + return f3_infinity(a_sgn ^ b_sgn); + if (!(a.x0 | a.x1) || b_exp == 32767) + return f3_zero(a_sgn ^ b_sgn); + + a = f3_normalise(&a_exp, a); + b = f3_normalise(&b_exp, b); + + x_sgn = a_sgn ^ b_sgn; + x_exp = a_exp - b_exp + 16395; + + a.x0 = a.x0 >> 1 | a.x1 << 63; + a.x1 = a.x1 >> 1; + b.x0 = b.x0 >> 1 | b.x1 << 63; + b.x1 = b.x1 >> 1; + x.x0 = 0; + x.x1 = 0; + for (i = 0; i < 116; i++) { + x.x1 = x.x1 << 1 | x.x0 >> 63; + x.x0 = x.x0 << 1; + if (a.x1 > b.x1 || (a.x1 == b.x1 && a.x0 >= b.x0)) { + a.x1 = a.x1 - b.x1 - (a.x0 < b.x0); + a.x0 = a.x0 - b.x0; + x.x0 |= 1; + } + a.x1 = a.x1 << 1 | a.x0 >> 63; + a.x0 = a.x0 << 1; + } + x.x0 |= !!(a.x0 | a.x1); + + x = f3_normalise(&x_exp, x); + + return f3_round(x_sgn, x_exp, x); +} + +long double __extendsftf2(float f) +{ + long double fx; + u128_t x; + uint32_t a; + uint64_t aa; + memcpy(&a, &f, 4); + aa = a; + x.x0 = 0; + if (!(a << 1)) + x.x1 = aa << 32; + else if (a << 1 >> 24 == 255) + x.x1 = (0x7fff000000000000 | aa >> 31 << 63 | aa << 41 >> 16 | + (uint64_t)!!(a << 9) << 47); + else + x.x1 = (aa >> 31 << 63 | ((aa >> 23 & 255) + 16256) << 48 | + aa << 41 >> 16); + memcpy(&fx, &x, 16); + return fx; +} + +long double __extenddftf2(double f) +{ + long double fx; + u128_t x; + uint64_t a; + memcpy(&a, &f, 8); + x.x0 = a << 60; + if (!(a << 1)) + x.x1 = a; + else if (a << 1 >> 53 == 2047) + x.x1 = (0x7fff000000000000 | a >> 63 << 63 | a << 12 >> 16 | + (uint64_t)!!(a << 12) << 47); + else + x.x1 = a >> 63 << 63 | ((a >> 52 & 2047) + 15360) << 48 | a << 12 >> 16; + memcpy(&fx, &x, 16); + return fx; +} + +float __trunctfsf2(long double f) +{ + u128_t mnt; + int32_t exp; + int sgn; + uint32_t x; + float fx; + + f3_unpack(&sgn, &exp, &mnt, f); + + if (exp == 32767 && (mnt.x0 | mnt.x1 << 16)) + x = 0x7fc00000 | (uint32_t)sgn << 31 | (mnt.x1 >> 25 & 0x007fffff); + else if (exp > 16510) + x = 0x7f800000 | (uint32_t)sgn << 31; + else if (exp < 16233) + x = (uint32_t)sgn << 31; + else { + exp -= 16257; + x = mnt.x1 >> 23 | !!(mnt.x0 | mnt.x1 << 41); + if (exp < 0) { + x = x >> -exp | !!(x << (32 + exp)); + exp = 0; + } + if ((x & 3) == 3 || (x & 7) == 6) + x += 4; + x = ((x >> 2) + (exp << 23)) | (uint32_t)sgn << 31; + } + memcpy(&fx, &x, 4); + return fx; +} + +double __trunctfdf2(long double f) +{ + u128_t mnt; + int32_t exp; + int sgn; + uint64_t x; + double fx; + + f3_unpack(&sgn, &exp, &mnt, f); + + if (exp == 32767 && (mnt.x0 | mnt.x1 << 16)) + x = (0x7ff8000000000000 | (uint64_t)sgn << 63 | + mnt.x1 << 16 >> 12 | mnt.x0 >> 60); + else if (exp > 17406) + x = 0x7ff0000000000000 | (uint64_t)sgn << 63; + else if (exp < 15308) + x = (uint64_t)sgn << 63; + else { + exp -= 15361; + x = mnt.x1 << 6 | mnt.x0 >> 58 | !!(mnt.x0 << 6); + if (exp < 0) { + x = x >> -exp | !!(x << (64 + exp)); + exp = 0; + } + if ((x & 3) == 3 || (x & 7) == 6) + x += 4; + x = ((x >> 2) + ((uint64_t)exp << 52)) | (uint64_t)sgn << 63; + } 
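+    /* The rounding just above is round-to-nearest, ties-to-even: bit 1 of
+       x is the round bit and bit 0 the sticky bit, so (x & 3) == 3 means
+       more than half an ULP was discarded, while (x & 7) == 6 is a
+       half-way case whose truncated result would be odd; both round up
+       via x += 4, and the carry may ripple into the exponent field. */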
+ memcpy(&fx, &x, 8); + return fx; +} + +int32_t __fixtfsi(long double fa) +{ + u128_t a; + int32_t a_exp; + int a_sgn; + int32_t x; + f3_unpack(&a_sgn, &a_exp, &a, fa); + if (a_exp < 16369) + return 0; + if (a_exp > 16413) + return a_sgn ? -0x80000000 : 0x7fffffff; + x = a.x1 >> (16431 - a_exp); + return a_sgn ? -x : x; +} + +int64_t __fixtfdi(long double fa) +{ + u128_t a; + int32_t a_exp; + int a_sgn; + int64_t x; + f3_unpack(&a_sgn, &a_exp, &a, fa); + if (a_exp < 16383) + return 0; + if (a_exp > 16445) + return a_sgn ? -0x8000000000000000 : 0x7fffffffffffffff; + x = (a.x1 << 15 | a.x0 >> 49) >> (16446 - a_exp); + return a_sgn ? -x : x; +} + +uint32_t __fixunstfsi(long double fa) +{ + u128_t a; + int32_t a_exp; + int a_sgn; + f3_unpack(&a_sgn, &a_exp, &a, fa); + if (a_sgn || a_exp < 16369) + return 0; + if (a_exp > 16414) + return -1; + return a.x1 >> (16431 - a_exp); +} + +uint64_t __fixunstfdi(long double fa) +{ + u128_t a; + int32_t a_exp; + int a_sgn; + f3_unpack(&a_sgn, &a_exp, &a, fa); + if (a_sgn || a_exp < 16383) + return 0; + if (a_exp > 16446) + return -1; + return (a.x1 << 15 | a.x0 >> 49) >> (16446 - a_exp); +} + +long double __floatsitf(int32_t a) +{ + int sgn = 0; + int exp = 16414; + uint32_t mnt = a; + u128_t x = { 0, 0 }; + long double f; + int i; + if (a) { + if (a < 0) { + sgn = 1; + mnt = -mnt; + } + for (i = 16; i; i >>= 1) + if (!(mnt >> (32 - i))) { + mnt <<= i; + exp -= i; + } + x.x1 = ((uint64_t)sgn << 63 | (uint64_t)exp << 48 | + (uint64_t)(mnt << 1) << 16); + } + memcpy(&f, &x, 16); + return f; +} + +long double __floatditf(int64_t a) +{ + int sgn = 0; + int exp = 16446; + uint64_t mnt = a; + u128_t x = { 0, 0 }; + long double f; + int i; + if (a) { + if (a < 0) { + sgn = 1; + mnt = -mnt; + } + for (i = 32; i; i >>= 1) + if (!(mnt >> (64 - i))) { + mnt <<= i; + exp -= i; + } + x.x0 = mnt << 49; + x.x1 = (uint64_t)sgn << 63 | (uint64_t)exp << 48 | mnt << 1 >> 16; + } + memcpy(&f, &x, 16); + return f; +} + +long double __floatunsitf(uint32_t a) +{ + int exp = 16414; + uint32_t mnt = a; + u128_t x = { 0, 0 }; + long double f; + int i; + if (a) { + for (i = 16; i; i >>= 1) + if (!(mnt >> (32 - i))) { + mnt <<= i; + exp -= i; + } + x.x1 = (uint64_t)exp << 48 | (uint64_t)(mnt << 1) << 16; + } + memcpy(&f, &x, 16); + return f; +} + +long double __floatunditf(uint64_t a) +{ + int exp = 16446; + uint64_t mnt = a; + u128_t x = { 0, 0 }; + long double f; + int i; + if (a) { + for (i = 32; i; i >>= 1) + if (!(mnt >> (64 - i))) { + mnt <<= i; + exp -= i; + } + x.x0 = mnt << 49; + x.x1 = (uint64_t)exp << 48 | mnt << 1 >> 16; + } + memcpy(&f, &x, 16); + return f; +} + +static int f3_cmp(long double fa, long double fb) +{ + u128_t a, b; + memcpy(&a, &fa, 16); + memcpy(&b, &fb, 16); + return (!(a.x0 | a.x1 << 1 | b.x0 | b.x1 << 1) ? 0 : + ((a.x1 << 1 >> 49 == 0x7fff && (a.x0 | a.x1 << 16)) || + (b.x1 << 1 >> 49 == 0x7fff && (b.x0 | b.x1 << 16))) ? 2 : + a.x1 >> 63 != b.x1 >> 63 ? (int)(b.x1 >> 63) - (int)(a.x1 >> 63) : + a.x1 < b.x1 ? (int)(a.x1 >> 63 << 1) - 1 : + a.x1 > b.x1 ? 1 - (int)(a.x1 >> 63 << 1) : + a.x0 < b.x0 ? (int)(a.x1 >> 63 << 1) - 1 : + b.x0 < a.x0 ? 
            1 - (int)(a.x1 >> 63 << 1) : 0);
+}
+
+int __eqtf2(long double a, long double b)
+{
+    return !!f3_cmp(a, b);
+}
+
+int __netf2(long double a, long double b)
+{
+    return !!f3_cmp(a, b);
+}
+
+int __lttf2(long double a, long double b)
+{
+    return f3_cmp(a, b);
+}
+
+int __letf2(long double a, long double b)
+{
+    return f3_cmp(a, b);
+}
+
+int __gttf2(long double a, long double b)
+{
+    return -f3_cmp(b, a);
+}
+
+int __getf2(long double a, long double b)
+{
+    return -f3_cmp(b, a);
+}
diff --git a/lib/testfp.c b/lib/testfp.c
new file mode 100644
index 00000000..63342b42
--- /dev/null
+++ b/lib/testfp.c
@@ -0,0 +1,510 @@
+/*
+ * Test 128-bit floating-point arithmetic on arm64:
+ * build with two different compilers and compare the output.
+ *
+ * Copyright (c) 2015 Edmund Grimley Evans
+ *
+ * Copying and distribution of this file, with or without modification,
+ * are permitted in any medium without royalty provided the copyright
+ * notice and this notice are preserved. This file is offered as-is,
+ * without any warranty.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define check(x) ((x) ? (void)0 : check_fail(#x, __FILE__, __LINE__))
+
+void check_fail(const char *assertion, const char *file, unsigned int line)
+{
+    printf("%s:%d: Check (%s) failed.\n", file, line, assertion);
+    exit(1);
+}
+
+typedef struct {
+    unsigned long long x0, x1;
+} u128_t;
+
+float copy_fi(uint32_t x)
+{
+    float f;
+    memcpy(&f, &x, 4);
+    return f;
+}
+
+double copy_di(uint64_t x)
+{
+    double f;
+    memcpy(&f, &x, 8);
+    return f;
+}
+
+long double copy_ldi(u128_t x)
+{
+    long double f;
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+uint32_t copy_if(float f)
+{
+    uint32_t x;
+    memcpy(&x, &f, 4);
+    return x;
+}
+
+uint64_t copy_id(double f)
+{
+    uint64_t x;
+    memcpy(&x, &f, 8);
+    return x;
+}
+
+u128_t copy_ild(long double f)
+{
+    u128_t x;
+    memcpy(&x, &f, 16);
+    return x;
+}
+
+long double make(int sgn, int exp, uint64_t high, uint64_t low)
+{
+    u128_t x = { low,
+                 (0x0000ffffffffffff & high) |
+                 (0x7fff000000000000 & (uint64_t)exp << 48) |
+                 (0x8000000000000000 & (uint64_t)sgn << 63) };
+    return copy_ldi(x);
+}
+
+void cmp(long double a, long double b)
+{
+    u128_t ax = copy_ild(a);
+    u128_t bx = copy_ild(b);
+    int eq = (a == b);
+    int ne = (a != b);
+    int lt = (a < b);
+    int le = (a <= b);
+    int gt = (a > b);
+    int ge = (a >= b);
+
+    check(eq == 0 || eq == 1);
+    check(lt == 0 || lt == 1);
+    check(gt == 0 || gt == 1);
+    check(ne == !eq && le == (lt | eq) && ge == (gt | eq));
+    check(eq + lt + gt < 2);
+
+    printf("cmp %016llx%016llx %016llx%016llx %d %d %d\n",
+           ax.x1, ax.x0, bx.x1, bx.x0, lt, eq, gt);
+}
+
+void cmps(void)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+        for (j = 0; j < 2; j++)
+            cmp(make(i, 0, 0, 0), make(j, 0, 0, 0));
+
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 64; j++) {
+            long double f1 = make(i, 32767, (uint64_t)1 << j, 0);
+            long double f2 = make(i, 32767, 0, (uint64_t)1 << j);
+            cmp(f1, 0);
+            cmp(f2, 0);
+            cmp(0, f1);
+            cmp(0, f2);
+        }
+    }
+
+    for (i = 0; i < 6; i++)
+        for (j = 0; j < 6; j++)
+            cmp(make(i & 1, i >> 1, 0, 0),
+                make(j & 1, j >> 1, 0, 0));
+
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 2; j++) {
+            int a, b;
+            for (a = 0; a < 2; a++) {
+                for (b = 0; b < 2; b++) {
+                    cmp(make(i, j, a, b), make(i, j, 0, 0));
+                    cmp(make(i, j, 0, 0), make(i, j, a, b));
+                }
+            }
+        }
+    }
+}
+
+void xop(const char *name, long double a, long double b, long double c)
+{
+    u128_t ax = copy_ild(a);
+    u128_t bx = copy_ild(b);
+    u128_t cx = copy_ild(c);
+    printf("%s %016llx%016llx %016llx%016llx %016llx%016llx\n",
+ name, ax.x1, ax.x0, bx.x1, bx.x0, cx.x1, cx.x0); +} + +void fadd(long double a, long double b) +{ + xop("add", a, b, a + b); +} + +void fsub(long double a, long double b) +{ + xop("sub", a, b, a - b); +} + +void fmul(long double a, long double b) +{ + xop("mul", a, b, a * b); +} + +void fdiv(long double a, long double b) +{ + xop("div", a, b, a / b); +} + +void nanz(void) +{ + // Check NaNs: + { + long double x[7]; + int i, j, n = 0; + x[n++] = make(0, 32000, 0x95132b76effc, 0xd79035214b4f8d53); + x[n++] = make(1, 32001, 0xbe71d7a51587, 0x30601c6815d6c3ac); + x[n++] = make(0, 32767, 0, 1); + x[n++] = make(0, 32767, (uint64_t)1 << 46, 0); + x[n++] = make(1, 32767, (uint64_t)1 << 47, 0); + x[n++] = make(1, 32767, 0x7596c7099ad5, 0xe25fed2c58f73fc9); + x[n++] = make(0, 32767, 0x835d143360f9, 0x5e315efb35630666); + check(n == sizeof(x) / sizeof(*x)); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + fadd(x[i], x[j]); + fsub(x[i], x[j]); + fmul(x[i], x[j]); + fdiv(x[i], x[j]); + } + } + } + + // Check infinities and zeroes: + { + long double x[6]; + int i, j, n = 0; + x[n++] = make(1, 32000, 0x62acda85f700, 0x47b6c9f35edc4044); + x[n++] = make(0, 32001, 0x94b7abf55af7, 0x9f425fe354428e19); + x[n++] = make(0, 32767, 0, 0); + x[n++] = make(1, 32767, 0, 0); + x[n++] = make(0, 0, 0, 0); + x[n++] = make(1, 0, 0, 0); + check(n == sizeof(x) / sizeof(*x)); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + fadd(x[i], x[j]); + fsub(x[i], x[j]); + fmul(x[i], x[j]); + fdiv(x[i], x[j]); + } + } + } +} + +void adds(void) +{ + // Check shifting and add/sub: + { + int i; + for (i = -130; i <= 130; i++) { + int s1 = (uint32_t)i % 3 < 1; + int s2 = (uint32_t)i % 5 < 2; + fadd(make(s1, 16384 , 0x502c065e4f71a65d, 0xd2f9bdb031f4f031), + make(s2, 16384 + i, 0xae267395a9bc1033, 0xb56b5800da1ba448)); + } + } + + // Check normalisation: + { + uint64_t a0 = 0xc6bab0a6afbef5ed; + uint64_t a1 = 0x4f84136c4a2e9b52; + int ee[] = { 0, 1, 10000 }; + int e, i; + for (e = 0; e < sizeof(ee) / sizeof(*ee); e++) { + int exp = ee[e]; + fsub(make(0, exp, a1, a0), make(0, 0, 0, 0)); + for (i = 63; i >= 0; i--) + fsub(make(0, exp, a1 | (uint64_t)1 << i >> 1, a0), + make(0, exp, a1 >> i << i, 0)); + for (i = 63; i >=0; i--) + fsub(make(0, exp, a1, a0 | (uint64_t)1 << i >> 1), + make(0, exp, a1, a0 >> i << i)); + } + } + + // Carry/overflow from rounding: + { + fadd(make(0, 114, -1, -1), make(0, 1, 0, 0)); + fadd(make(0, 32766, -1, -1), make(0, 32653, 0, 0)); + fsub(make(1, 32766, -1, -1), make(0, 32653, 0, 0)); + } +} + +void muls(void) +{ + int i, j; + + { + long double max = make(0, 32766, -1, -1); + long double min = make(0, 0, 0, 1); + fmul(max, max); + fmul(max, min); + fmul(min, min); + } + + for (i = 117; i > 0; i--) + fmul(make(0, 16268, 0x643dcea76edc, 0xe0877a598403627a), + make(i & 1, i, 0, 0)); + + fmul(make(0, 16383, -1, -3), make(0, 16383, 0, 1)); + // Round to next exponent: + fmul(make(0, 16383, -1, -2), make(0, 16383, 0, 1)); + // Round from subnormal to normal: + fmul(make(0, 1, -1, -1), make(0, 16382, 0, 0)); + + for (i = 0; i < 2; i++) + for (j = 0; j < 112; j++) + fmul(make(0, 16383, (uint64_t)1 << i, 0), + make(0, 16383, + j < 64 ? 0 : (uint64_t)1 << (j - 64), + j < 64 ? 
(uint64_t)1 << j : 0)); +} + +void divs(void) +{ + int i; + + { + long double max = make(0, 32766, -1, -1); + long double min = make(0, 0, 0, 1); + fdiv(max, max); + fdiv(max, min); + fdiv(min, max); + fdiv(min, min); + } + + for (i = 0; i < 64; i++) + fdiv(make(0, 16383, -1, -1), make(0, 16383, -1, -(uint64_t)1 << i)); + for (i = 0; i < 48; i++) + fdiv(make(0, 16383, -1, -1), make(0, 16383, -(uint64_t)1 << i, 0)); +} + +void cvtlsw(int32_t a) +{ + long double f = a; + u128_t x = copy_ild(f); + printf("cvtlsw %08lx %016llx%016llx\n", (long)(uint32_t)a, x.x1, x.x0); +} + +void cvtlsx(int64_t a) +{ + long double f = a; + u128_t x = copy_ild(f); + printf("cvtlsx %016llx %016llx%016llx\n", + (long long)(uint64_t)a, x.x1, x.x0); +} + +void cvtluw(uint32_t a) +{ + long double f = a; + u128_t x = copy_ild(f); + printf("cvtluw %08lx %016llx%016llx\n", (long)a, x.x1, x.x0); +} + +void cvtlux(uint64_t a) +{ + long double f = a; + u128_t x = copy_ild(f); + printf("cvtlux %016llx %016llx%016llx\n", (long long)a, x.x1, x.x0); +} + +void cvtil(long double a) +{ + u128_t x = copy_ild(a); + int32_t b1 = a; + int64_t b2 = a; + uint32_t b3 = a; + uint64_t b4 = a; + printf("cvtswl %016llx%016llx %08lx\n", + x.x1, x.x0, (long)(uint32_t)b1); + printf("cvtsxl %016llx%016llx %016llx\n", + x.x1, x.x0, (long long)(uint64_t)b2); + printf("cvtuwl %016llx%016llx %08lx\n", + x.x1, x.x0, (long)b3); + printf("cvtuxl %016llx%016llx %016llx\n", + x.x1, x.x0, (long long)b4); +} + +void cvtlf(float a) +{ + uint32_t ax = copy_if(a); + long double b = a; + u128_t bx = copy_ild(b); + printf("cvtlf %08lx %016llx%016llx\n", (long)ax, bx.x1, bx.x0); +} + +void cvtld(double a) +{ + uint64_t ax = copy_id(a); + long double b = a; + u128_t bx = copy_ild(b); + printf("cvtld %016llx %016llx%016llx\n", (long long)ax, bx.x1, bx.x0); +} + +void cvtfl(long double a) +{ + u128_t ax = copy_ild(a); + float b = a; + uint32_t bx = copy_if(b); + printf("cvtfl %016llx%016llx %08lx\n", ax.x1, ax.x0, (long)bx); +} + +void cvtdl(long double a) +{ + u128_t ax = copy_ild(a); + double b = a; + uint64_t bx = copy_id(b); + printf("cvtdl %016llx%016llx %016llx\n", ax.x1, ax.x0, (long long)bx); +} + +void cvts(void) +{ + int i, j; + + { + uint32_t x = 0xad040c5b; + cvtlsw(0); + for (i = 0; i < 31; i++) + cvtlsw(x >> (31 - i)); + for (i = 0; i < 31; i++) + cvtlsw(-(x >> (31 - i))); + cvtlsw(0x80000000); + } + { + uint64_t x = 0xb630a248cad9afd2; + cvtlsx(0); + for (i = 0; i < 63; i++) + cvtlsx(x >> (63 - i)); + for (i = 0; i < 63; i++) + cvtlsx(-(x >> (63 - i))); + cvtlsx(0x8000000000000000); + } + { + uint32_t x = 0xad040c5b; + cvtluw(0); + for (i = 0; i < 32; i++) + cvtluw(x >> (31 - i)); + } + { + uint64_t x = 0xb630a248cad9afd2; + cvtlux(0); + for (i = 0; i < 64; i++) + cvtlux(x >> (63 - i)); + } + + for (i = 0; i < 2; i++) { + cvtil(make(i, 32767, 0, 1)); + cvtil(make(i, 32767, (uint64_t)1 << 47, 0)); + cvtil(make(i, 32767, 123, 456)); + cvtil(make(i, 32767, 0, 0)); + cvtil(make(i, 16382, -1, -1)); + cvtil(make(i, 16383, -1, -1)); + cvtil(make(i, 16384, 0x7fffffffffff, -1)); + cvtil(make(i, 16384, 0x800000000000, 0)); + for (j = 0; j < 68; j++) + cvtil(make(i, 16381 + j, 0xd4822c0a10ec, 0x1fe2f8b2669f5c9d)); + } + + cvtlf(copy_fi(0x00000000)); + cvtlf(copy_fi(0x456789ab)); + cvtlf(copy_fi(0x7f800000)); + cvtlf(copy_fi(0x7f923456)); + cvtlf(copy_fi(0x7fdbcdef)); + cvtlf(copy_fi(0x80000000)); + cvtlf(copy_fi(0xabcdef12)); + cvtlf(copy_fi(0xff800000)); + cvtlf(copy_fi(0xff923456)); + cvtlf(copy_fi(0xffdbcdef)); + + cvtld(copy_di(0x0000000000000000)); + 
cvtld(copy_di(0x456789abcdef0123)); + cvtld(copy_di(0x7ff0000000000000)); + cvtld(copy_di(0x7ff123456789abcd)); + cvtld(copy_di(0x7ffabcdef1234567)); + cvtld(copy_di(0x8000000000000000)); + cvtld(copy_di(0xcdef123456789abc)); + cvtld(copy_di(0xfff0000000000000)); + cvtld(copy_di(0xfff123456789abcd)); + cvtld(copy_di(0xfffabcdef1234567)); + + for (i = 0; i < 2; i++) { \ + cvtfl(make(i, 0, 0, 0)); + cvtfl(make(i, 16232, -1, -1)); + cvtfl(make(i, 16233, 0, 0)); + cvtfl(make(i, 16233, 0, 1)); + cvtfl(make(i, 16383, 0xab0ffd000000, 0)); + cvtfl(make(i, 16383, 0xab0ffd000001, 0)); + cvtfl(make(i, 16383, 0xab0ffeffffff, 0)); + cvtfl(make(i, 16383, 0xab0fff000000, 0)); + cvtfl(make(i, 16383, 0xab0fff000001, 0)); + cvtfl(make(i, 16510, 0xfffffeffffff, -1)); + cvtfl(make(i, 16510, 0xffffff000000, 0)); + cvtfl(make(i, 16511, 0, 0)); + cvtfl(make(i, 32767, 0, 0)); + cvtfl(make(i, 32767, 0, 1)); + cvtfl(make(i, 32767, 0x4cbe01ac5f40, 0x75cee3c6afbb00b5)); + cvtfl(make(i, 32767, 0x800000000000, 1)); + cvtfl(make(i, 32767, 0xa11caaaf6a52, 0x696033e871eab099)); + } + + for (i = 0; i < 2; i++) { + cvtdl(make(i, 0, 0, 0)); + cvtdl(make(i, 15307, -1, -1)); + cvtdl(make(i, 15308, 0, 0)); + cvtdl(make(i, 15308, 0, 1)); + cvtdl(make(i, 16383, 0xabc123abc0ff, 0xe800000000000000)); + cvtdl(make(i, 16383, 0xabc123abc0ff, 0xe800000000000001)); + cvtdl(make(i, 16383, 0xabc123abc0ff, 0xf7ffffffffffffff)); + cvtdl(make(i, 16383, 0xabc123abc0ff, 0xf800000000000000)); + cvtdl(make(i, 16383, 0xabc123abc0ff, 0xf800000000000001)); + cvtdl(make(i, 17406, 0xffffffffffff, 0xf7ffffffffffffff)); + cvtdl(make(i, 17406, 0xffffffffffff, 0xf800000000000000)); + cvtdl(make(i, 17407, 0, 0)); + cvtdl(make(i, 32767, 0, 0)); + cvtdl(make(i, 32767, 0, 1)); + cvtdl(make(i, 32767, 0x4cbe01ac5f40, 0x75cee3c6afbb00b5)); + cvtdl(make(i, 32767, 0x800000000000, 1)); + cvtdl(make(i, 32767, 0xa11caaaf6a52, 0x696033e871eab099)); + } +} + +void tests(void) +{ + cmps(); + nanz(); + adds(); + muls(); + divs(); + cvts(); +} + +int main() +{ +#ifdef __aarch64__ + tests(); +#else + printf("This test program is intended for a little-endian architecture\n" + "with an IEEE-standard 128-bit long double.\n"); +#endif + return 0; +} diff --git a/libtcc.c b/libtcc.c index 01497b28..c68221e0 100644 --- a/libtcc.c +++ b/libtcc.c @@ -45,6 +45,9 @@ ST_DATA struct TCCState *tcc_state; #ifdef TCC_TARGET_ARM #include "arm-gen.c" #endif +#ifdef TCC_TARGET_ARM64 +#include "arm64-gen.c" +#endif #ifdef TCC_TARGET_C67 #include "c67-gen.c" #endif @@ -959,6 +962,8 @@ LIBTCCAPI TCCState *tcc_new(void) #else s->float_abi = ARM_SOFTFP_FLOAT; #endif +#elif defined(TCC_TARGET_ARM64) + tcc_define_symbol(s, "__aarch64__", NULL); #endif #ifdef TCC_TARGET_PE @@ -1560,7 +1565,7 @@ static int tcc_set_linker(TCCState *s, const char *option) } else if (link_option(option, "oformat=", &p)) { #if defined(TCC_TARGET_PE) if (strstart("pe-", &p)) { -#elif defined(TCC_TARGET_X86_64) +#elif defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) if (strstart("elf64-", &p)) { #else if (strstart("elf32-", &p)) { diff --git a/tcc.c b/tcc.c index c80bb4fd..52afca70 100644 --- a/tcc.c +++ b/tcc.c @@ -203,6 +203,8 @@ static void display_info(TCCState *s, int what) # endif #elif defined TCC_TARGET_ARM "ARM" +#elif defined TCC_TARGET_ARM64 + "AArch64" # ifdef TCC_ARM_HARDFLOAT " Hard Float" # endif diff --git a/tcc.h b/tcc.h index aaf5be0a..0a052329 100644 --- a/tcc.h +++ b/tcc.h @@ -113,7 +113,7 @@ #endif #include "elf.h" -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || 
defined(TCC_TARGET_X86_64) # define ELFCLASSW ELFCLASS64 # define ElfW(type) Elf##64##_##type # define ELFW(type) ELF##64##_##type @@ -151,23 +151,26 @@ /* target selection */ /* #define TCC_TARGET_I386 *//* i386 code generator */ /* #define TCC_TARGET_ARM *//* ARMv4 code generator */ +/* #define TCC_TARGET_ARM64 *//* ARMv8 code generator */ /* #define TCC_TARGET_C67 *//* TMS320C67xx code generator */ /* #define TCC_TARGET_X86_64 *//* x86-64 code generator */ /* default target is I386 */ #if !defined(TCC_TARGET_I386) && !defined(TCC_TARGET_ARM) && \ - !defined(TCC_TARGET_C67) && !defined(TCC_TARGET_X86_64) + !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_C67) && \ + !defined(TCC_TARGET_X86_64) #define TCC_TARGET_I386 #endif #if !defined(TCC_UCLIBC) && !defined(TCC_TARGET_ARM) && \ - !defined(TCC_TARGET_C67) && !defined(TCC_TARGET_X86_64) && \ - !defined(CONFIG_USE_LIBGCC) + !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_C67) && \ + !defined(TCC_TARGET_X86_64) && !defined(CONFIG_USE_LIBGCC) #define CONFIG_TCC_BCHECK /* enable bound checking code */ #endif /* define it to include assembler support */ -#if !defined(TCC_TARGET_ARM) && !defined(TCC_TARGET_C67) +#if !defined(TCC_TARGET_ARM) && !defined(TCC_TARGET_ARM64) && \ + !defined(TCC_TARGET_C67) #define CONFIG_TCC_ASM #endif @@ -184,6 +187,8 @@ # define TCC_IS_NATIVE # elif defined __arm__ && defined TCC_TARGET_ARM # define TCC_IS_NATIVE +# elif defined __aarch64__ && defined TCC_TARGET_ARM64 +# define TCC_IS_NATIVE # endif #endif @@ -256,6 +261,8 @@ # define CONFIG_TCC_ELFINTERP "/usr/libexec/ld-elf.so.2" # elif defined __GNU__ # define CONFIG_TCC_ELFINTERP "/lib/ld.so" +# elif defined TCC_TARGET_ARM64 +# define CONFIG_TCC_ELFINTERP "/lib/ld-linux-aarch64.so.1" # elif defined(TCC_TARGET_X86_64) # define CONFIG_TCC_ELFINTERP "/lib64/ld-linux-x86-64.so.2" # elif defined(TCC_UCLIBC) @@ -290,6 +297,9 @@ #ifdef TCC_TARGET_ARM # include "arm-gen.c" #endif +#ifdef TCC_TARGET_ARM64 +# include "arm64-gen.c" +#endif #ifdef TCC_TARGET_C67 # include "coff.h" # include "c67-gen.c" @@ -1214,6 +1224,7 @@ ST_FUNC void vpushv(SValue *v); ST_FUNC void save_reg(int r); ST_FUNC int get_reg(int rc); ST_FUNC void save_regs(int n); +ST_FUNC void gaddrof(void); ST_FUNC int gv(int rc); ST_FUNC void gv2(int rc1, int rc2); ST_FUNC void vpop(void); @@ -1357,6 +1368,15 @@ ST_FUNC uint32_t encbranch(int pos, int addr, int fail); ST_FUNC void gen_cvt_itof1(int t); #endif +/* ------------ arm64-gen.c ------------ */ +#ifdef TCC_TARGET_ARM64 +ST_FUNC void gen_cvt_sxtw(void); +ST_FUNC void gen_opl(int op); +ST_FUNC void greturn(void); +ST_FUNC void gen_va_start(void); +ST_FUNC void gen_va_arg(CType *t); +#endif + /* ------------ c67-gen.c ------------ */ #ifdef TCC_TARGET_C67 #endif diff --git a/tccelf.c b/tccelf.c index dc0a1443..4f892245 100644 --- a/tccelf.c +++ b/tccelf.c @@ -291,7 +291,7 @@ ST_FUNC void put_elf_reloca(Section *symtab, Section *s, unsigned long offset, rel = section_ptr_add(sr, sizeof(ElfW_Rel)); rel->r_offset = offset; rel->r_info = ELFW(R_INFO)(symbol, type); -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) rel->r_addend = addend; #else if (addend) @@ -506,7 +506,7 @@ ST_FUNC void relocate_section(TCCState *s1, Section *s) sym_index = ELFW(R_SYM)(rel->r_info); sym = &((ElfW(Sym) *)symtab_section->data)[sym_index]; val = sym->st_value; -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) val += rel->r_addend; #endif type = ELFW(R_TYPE)(rel->r_info); @@ -760,6 +760,69 @@ 
ST_FUNC void relocate_section(TCCState *s1, Section *s) fprintf(stderr,"FIXME: handle reloc type %x at %x [%p] to %x\n", type, (unsigned)addr, ptr, (unsigned)val); break; +#elif defined(TCC_TARGET_ARM64) + case R_AARCH64_ABS64: + *(uint64_t *)ptr = val; + break; + case R_AARCH64_ABS32: + *(uint32_t *)ptr = val; + break; + case R_AARCH64_MOVW_UABS_G0_NC: + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) | + (val & 0xffff) << 5; + break; + case R_AARCH64_MOVW_UABS_G1_NC: + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) | + (val >> 16 & 0xffff) << 5; + break; + case R_AARCH64_MOVW_UABS_G2_NC: + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) | + (val >> 32 & 0xffff) << 5; + break; + case R_AARCH64_MOVW_UABS_G3: + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) | + (val >> 48 & 0xffff) << 5; + break; + case R_AARCH64_ADR_PREL_PG_HI21: { + uint64_t off = (val >> 12) - (addr >> 12); + if ((off + ((uint64_t)1 << 20)) >> 21) + tcc_error("R_AARCH64_ADR_PREL_PG_HI21 relocation failed"); + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0x9f00001f) | + (off & 0x1ffffc) << 3 | (off & 3) << 29; + break; + } + case R_AARCH64_ADD_ABS_LO12_NC: + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffc003ff) | + (val & 0xfff) << 10; + break; + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + if (((val - addr) + ((uint64_t)1 << 27)) & ~(uint64_t)0xffffffc) + tcc_error("R_AARCH64_(JUMP|CALL)26 relocation failed"); + *(uint32_t *)ptr = 0x14000000 | (type == R_AARCH64_CALL26) << 31 | + ((val - addr) >> 2 & 0x3ffffff); + break; + case R_AARCH64_ADR_GOT_PAGE: { + uint64_t off = + (((s1->got->sh_addr + + s1->sym_attrs[sym_index].got_offset) >> 12) - (addr >> 12)); + if ((off + ((uint64_t)1 << 20)) >> 21) + tcc_error("R_AARCH64_ADR_GOT_PAGE relocation failed"); + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0x9f00001f) | + (off & 0x1ffffc) << 3 | (off & 3) << 29; + break; + } + case R_AARCH64_LD64_GOT_LO12_NC: + *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xfff803ff) | + ((s1->got->sh_addr + s1->sym_attrs[sym_index].got_offset) + & 0xff8) << 7; + break; + case R_AARCH64_COPY: + break; + default: + fprintf(stderr, "FIXME: handle reloc type %x at %x [%p] to %x\n", + type, (unsigned)addr, ptr, (unsigned)val); + break; #elif defined(TCC_TARGET_C67) case R_C60_32: *(int *)ptr += val; @@ -955,7 +1018,7 @@ static void put32(unsigned char *p, uint32_t val) } #if defined(TCC_TARGET_I386) || defined(TCC_TARGET_ARM) || \ - defined(TCC_TARGET_X86_64) + defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) static uint32_t get32(unsigned char *p) { return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); @@ -1014,6 +1077,8 @@ static unsigned long put_got_entry(TCCState *s1, (reloc_type == R_386_JMP_SLOT); #elif defined(TCC_TARGET_ARM) (reloc_type == R_ARM_JUMP_SLOT); +#elif defined(TCC_TARGET_ARM64) + (reloc_type == R_AARCH64_JUMP_SLOT); #else 0; #endif @@ -1135,6 +1200,24 @@ static unsigned long put_got_entry(TCCState *s1, if (sym->st_shndx == SHN_UNDEF) offset = plt->data_offset - 16; } +#elif defined(TCC_TARGET_ARM64) + if (need_plt_entry) { + Section *plt; + uint8_t *p; + + if (s1->output_type == TCC_OUTPUT_DLL) + tcc_error("DLLs unimplemented!"); + + plt = s1->plt; + if (plt->data_offset == 0) + section_ptr_add(plt, 32); + p = section_ptr_add(plt, 16); + put32(p, s1->got->data_offset); + put32(p + 4, (uint64_t)s1->got->data_offset >> 32); + + if (sym->st_shndx == SHN_UNDEF) + offset = plt->data_offset - 16; + } #elif defined(TCC_TARGET_C67) if (s1->dynsym) { tcc_error("C67 got not implemented"); @@ -1277,6 +1360,18 @@ ST_FUNC void 
build_got_entries(TCCState *s1) put32(p+2, 0x46c0); /* nop */ put32(p+4, 0xeafffffe); /* b $sym */ } +#elif defined(TCC_TARGET_ARM64) + //xx Other cases may be required here: + case R_AARCH64_ADR_GOT_PAGE: + case R_AARCH64_LD64_GOT_LO12_NC: + if (!s1->got) + build_got(s1); + sym_index = ELFW(R_SYM)(rel->r_info); + sym = &((ElfW(Sym) *)symtab_section->data)[sym_index]; + reloc_type = R_AARCH64_GLOB_DAT; + put_got_entry(s1, reloc_type, sym->st_size, sym->st_info, + sym_index); + break; #elif defined(TCC_TARGET_C67) case R_C60_GOT32: case R_C60_GOTOFF: @@ -1796,6 +1891,40 @@ ST_FUNC void relocate_plt(TCCState *s1) put32(p + 12, x + get32(p + 12) + s1->plt->data - p); p += 16; } +#elif defined(TCC_TARGET_ARM64) + uint64_t plt = s1->plt->sh_addr; + uint64_t got = s1->got->sh_addr; + uint64_t off = (got >> 12) - (plt >> 12); + if ((off + ((uint64_t)1 << 20)) >> 21) + tcc_error("Failed relocating PLT"); + put32(p, 0xa9bf7bf0); // stp x16,x30,[sp,#-16]! + put32(p + 4, (0x90000010 | // adrp x16,... + (off & 0x1ffffc) << 3 | (off & 3) << 29)); + put32(p + 8, (0xf9400211 | // ldr x17,[x16,#...] + (got & 0xff8) << 7)); + put32(p + 12, (0x91000210 | // add x16,x16,#... + (got & 0xfff) << 10)); + put32(p + 16, 0xd61f0220); // br x17 + put32(p + 20, 0xd503201f); // nop + put32(p + 24, 0xd503201f); // nop + put32(p + 28, 0xd503201f); // nop + p += 32; + while (p < p_end) { + uint64_t pc = plt + (p - s1->plt->data); + uint64_t addr = got + + (get32(p) | (uint64_t)get32(p + 4) << 32); + uint32_t off = (addr >> 12) - (pc >> 12); + if ((off + ((uint64_t)1 << 20)) >> 21) + tcc_error("Failed relocating PLT"); + put32(p, (0x90000010 | // adrp x16,... + (off & 0x1ffffc) << 3 | (off & 3) << 29)); + put32(p + 4, (0xf9400211 | // ldr x17,[x16,#...] + (addr & 0xff8) << 7)); + put32(p + 8, (0x91000210 | // add x16,x16,#... + (addr & 0xfff) << 10)); + put32(p + 12, 0xd61f0220); // br x17 + p += 16; + } #elif defined(TCC_TARGET_C67) /* XXX: TODO */ #else @@ -2093,7 +2222,7 @@ static void fill_dynamic(TCCState *s1, struct dyn_inf *dyninf) put_dt(dynamic, DT_SYMTAB, s1->dynsym->sh_addr); put_dt(dynamic, DT_STRSZ, dyninf->dynstr->data_offset); put_dt(dynamic, DT_SYMENT, sizeof(ElfW(Sym))); -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) put_dt(dynamic, DT_RELA, dyninf->rel_addr); put_dt(dynamic, DT_RELASZ, dyninf->rel_size); put_dt(dynamic, DT_RELAENT, sizeof(ElfW_Rel)); diff --git a/tccgen.c b/tccgen.c index ae075634..b96b2857 100644 --- a/tccgen.c +++ b/tccgen.c @@ -545,7 +545,7 @@ ST_FUNC void save_reg(int r) type = &p->type; if ((p->r & VT_LVAL) || (!is_float(type->t) && (type->t & VT_BTYPE) != VT_LLONG)) -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) type = &char_pointer_type; #else type = &int_type; @@ -562,7 +562,7 @@ ST_FUNC void save_reg(int r) o(0xd8dd); /* fstp %st(0) */ } #endif -#ifndef TCC_TARGET_X86_64 +#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64) /* special long long case */ if ((type->t & VT_BTYPE) == VT_LLONG) { sv.c.ul += 4; @@ -681,7 +681,7 @@ static void move_reg(int r, int s, int t) } /* get address of vtop (vtop MUST BE an lvalue) */ -static void gaddrof(void) +ST_FUNC void gaddrof(void) { if (vtop->r & VT_REF) gv(RC_INT); @@ -803,11 +803,13 @@ ST_FUNC int gv(int rc) r = vtop->r & VT_VALMASK; rc2 = (rc & RC_FLOAT) ? 
RC_FLOAT : RC_INT; +#ifndef TCC_TARGET_ARM64 if (rc == RC_IRET) rc2 = RC_LRET; #ifdef TCC_TARGET_X86_64 else if (rc == RC_FRET) rc2 = RC_QRET; +#endif #endif /* need to reload if: @@ -817,7 +819,7 @@ ST_FUNC int gv(int rc) if (r >= VT_CONST || (vtop->r & VT_LVAL) || !(reg_classes[r] & rc) -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) || ((vtop->type.t & VT_BTYPE) == VT_QLONG && !(reg_classes[vtop->r2] & rc2)) || ((vtop->type.t & VT_BTYPE) == VT_QFLOAT && !(reg_classes[vtop->r2] & rc2)) #else @@ -826,7 +828,7 @@ ST_FUNC int gv(int rc) ) { r = get_reg(rc); -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) if (((vtop->type.t & VT_BTYPE) == VT_QLONG) || ((vtop->type.t & VT_BTYPE) == VT_QFLOAT)) { int addr_type = VT_LLONG, load_size = 8, load_type = ((vtop->type.t & VT_BTYPE) == VT_QLONG) ? VT_LLONG : VT_DOUBLE; #else @@ -838,7 +840,7 @@ ST_FUNC int gv(int rc) original_type = vtop->type.t; /* two register type load : expand to two words temporarily */ -#ifndef TCC_TARGET_X86_64 +#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64) if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) { /* load constant */ ll = vtop->c.ull; @@ -890,7 +892,7 @@ ST_FUNC int gv(int rc) t1 = t; /* compute memory access type */ if (vtop->r & VT_REF) -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) t = VT_PTR; #else t = VT_INT; @@ -952,6 +954,7 @@ ST_FUNC void gv2(int rc1, int rc2) } } +#ifndef TCC_TARGET_ARM64 /* wrapper around RC_FRET to return a register by type */ static int rc_fret(int t) { @@ -962,6 +965,7 @@ static int rc_fret(int t) #endif return RC_FRET; } +#endif /* wrapper around REG_FRET to return a register by type */ static int reg_fret(int t) @@ -1147,7 +1151,7 @@ ST_FUNC int gvtst(int inv, int t) return gtst(inv, t); } -#ifndef TCC_TARGET_X86_64 +#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64) /* generate CPU independent (unsigned) long long operations */ static void gen_opl(int op) { @@ -1358,7 +1362,7 @@ static void gen_opl(int op) #elif defined(TCC_TARGET_ARM) b = ind; o(0x1A000000 | encbranch(ind, 0, 1)); -#elif defined(TCC_TARGET_C67) +#elif defined(TCC_TARGET_C67) || defined(TCC_TARGET_ARM64) tcc_error("not implemented"); #else #error not supported @@ -1512,7 +1516,8 @@ static void gen_opic(int op) general_case: if (!nocode_wanted) { /* call low level op generator */ - if (t1 == VT_LLONG || t2 == VT_LLONG) + if (t1 == VT_LLONG || t2 == VT_LLONG || + (PTR_SIZE == 8 && (t1 == VT_PTR || t2 == VT_PTR))) gen_opl(op); else gen_opi(op); @@ -1679,7 +1684,7 @@ ST_FUNC void gen_op(int op) if (op >= TOK_ULT && op <= TOK_LOR) { check_comparison_pointer_types(vtop - 1, vtop, op); /* pointers are handled are unsigned */ -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) t = VT_LLONG | VT_UNSIGNED; #else t = VT_INT | VT_UNSIGNED; @@ -1700,7 +1705,7 @@ ST_FUNC void gen_op(int op) vrott(3); gen_opic(op); /* set to integer type */ -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) vtop->type.t = VT_LLONG; #else vtop->type.t = VT_INT; @@ -1724,7 +1729,7 @@ ST_FUNC void gen_op(int op) u = pointed_size(&vtop[-1].type); if (u < 0) tcc_error("unknown array element size"); -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) vpushll(u); #else /* XXX: cast to int ? 
(long long case) */ @@ -1833,6 +1838,9 @@ ST_FUNC void gen_op(int op) /* generic itof for unsigned long long case */ static void gen_cvt_itof1(int t) { +#ifdef TCC_TARGET_ARM64 + gen_cvt_itof(t); +#else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) == (VT_LLONG | VT_UNSIGNED)) { @@ -1851,12 +1859,16 @@ static void gen_cvt_itof1(int t) } else { gen_cvt_itof(t); } +#endif } #endif /* generic ftoi for unsigned long long case */ static void gen_cvt_ftoi1(int t) { +#ifdef TCC_TARGET_ARM64 + gen_cvt_ftoi(t); +#else int st; if (t == (VT_LLONG | VT_UNSIGNED)) { @@ -1878,6 +1890,7 @@ static void gen_cvt_ftoi1(int t) } else { gen_cvt_ftoi(t); } +#endif } /* force char or short cast */ @@ -1968,7 +1981,7 @@ static void gen_cast(CType *type) vtop->c.ll = vtop->c.ull; else if (sbt & VT_UNSIGNED) vtop->c.ll = vtop->c.ui; -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) else if (sbt == VT_PTR) ; #endif @@ -1979,7 +1992,7 @@ static void gen_cast(CType *type) vtop->c.ull = vtop->c.ll; else if (dbt == VT_BOOL) vtop->c.i = (vtop->c.ll != 0); -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) else if (dbt == VT_PTR) ; #endif @@ -2024,7 +2037,7 @@ static void gen_cast(CType *type) gen_cast(type); } } -#ifndef TCC_TARGET_X86_64 +#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64) } else if ((dbt & VT_BTYPE) == VT_LLONG) { if ((sbt & VT_BTYPE) != VT_LLONG) { /* scalar to long long */ @@ -2056,11 +2069,18 @@ static void gen_cast(CType *type) (sbt & VT_BTYPE) != VT_PTR && (sbt & VT_BTYPE) != VT_FUNC) { /* need to convert from 32bit to 64bit */ - int r = gv(RC_INT); + gv(RC_INT); if (sbt != (VT_INT | VT_UNSIGNED)) { +#if defined(TCC_TARGET_ARM64) + gen_cvt_sxtw(); +#elif defined(TCC_TARGET_X86_64) + int r = gv(RC_INT); /* x86_64 specific: movslq */ o(0x6348); o(0xc0 + (REG_VALUE(r) << 3) + REG_VALUE(r)); +#else +#error +#endif } } #endif @@ -2589,7 +2609,7 @@ ST_FUNC void vstore(void) if ((vtop[-1].r & VT_VALMASK) == VT_LLOCAL) { SValue sv; t = get_reg(RC_INT); -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) sv.type.t = VT_PTR; #else sv.type.t = VT_INT; @@ -2600,7 +2620,7 @@ ST_FUNC void vstore(void) vtop[-1].r = t | VT_LVAL; } /* two word case handling : store second register at word + 4 (or +8 for x86-64) */ -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) if (((ft & VT_BTYPE) == VT_QLONG) || ((ft & VT_BTYPE) == VT_QFLOAT)) { int addr_type = VT_LLONG, load_size = 8, load_type = ((vtop->type.t & VT_BTYPE) == VT_QLONG) ? VT_LLONG : VT_DOUBLE; #else @@ -3098,6 +3118,13 @@ static int parse_btype(CType *type, AttributeDef *ad) goto basic_type1; } break; +#ifdef TCC_TARGET_ARM64 + case TOK_UINT128: + /* GCC's __uint128_t appears in some Linux header files. Make it a + synonym for long double to get the size and alignment right. 
*/ + u = VT_LDOUBLE; + goto basic_type; +#endif case TOK_BOOL: u = VT_BOOL; goto basic_type; @@ -3233,7 +3260,8 @@ the_end: /* long is never used as type */ if ((t & VT_BTYPE) == VT_LONG) -#if !defined TCC_TARGET_X86_64 || defined TCC_TARGET_PE +#if (!defined TCC_TARGET_X86_64 && !defined TCC_TARGET_ARM64) || \ + defined TCC_TARGET_PE t = (t & ~VT_BTYPE) | VT_INT; #else t = (t & ~VT_BTYPE) | VT_LLONG; @@ -3881,6 +3909,36 @@ ST_FUNC void unary(void) break; #endif #endif + +#ifdef TCC_TARGET_ARM64 + case TOK___va_start: { + next(); + skip('('); + expr_eq(); + skip(','); + expr_eq(); + skip(')'); + //xx check types + gen_va_start(); + vpushi(0); + vtop->type.t = VT_VOID; + break; + } + case TOK___va_arg: { + CType type; + next(); + skip('('); + expr_eq(); + skip(','); + parse_type(&type); + skip(')'); + //xx check types + gen_va_arg(&type); + vtop->type = type; + break; + } +#endif + case TOK_INC: case TOK_DEC: t = tok; @@ -4071,6 +4129,15 @@ ST_FUNC void unary(void) if (!ret_nregs) { /* get some space for the returned structure */ size = type_size(&s->type, &align); +#ifdef TCC_TARGET_ARM64 + /* On arm64, a small struct is return in registers. + It is much easier to write it to memory if we know + that we are allowed to write some extra bytes, so + round the allocated space up to a power of 2: */ + if (size < 16) + while (size & (size - 1)) + size = (size | (size - 1)) + 1; +#endif loc = (loc - size) & -align; ret.type = s->type; ret.r = VT_LOCAL | VT_LVAL; @@ -4094,12 +4161,14 @@ ST_FUNC void unary(void) ret.r2 = REG_QRET; #endif } else { +#ifndef TCC_TARGET_ARM64 #ifdef TCC_TARGET_X86_64 if ((ret.type.t & VT_BTYPE) == VT_QLONG) #else if ((ret.type.t & VT_BTYPE) == VT_LLONG) #endif ret.r2 = REG_LRET; +#endif ret.r = REG_IRET; } ret.c.i = 0; @@ -4717,6 +4786,10 @@ static void block(int *bsym, int *csym, int *case_sym, int *def_sym, if (tok != ';') { gexpr(); gen_assign_cast(&func_vt); +#ifdef TCC_TARGET_ARM64 + // Perhaps it would be better to use this for all backends: + greturn(); +#else if ((func_vt.t & VT_BTYPE) == VT_STRUCT) { CType type, ret_type; int ret_align, ret_nregs; @@ -4770,6 +4843,7 @@ static void block(int *bsym, int *csym, int *case_sym, int *def_sym, } else { gv(RC_IRET); } +#endif vtop--; /* NOT vpop() because on x86 it would flush the fp stack */ } skip(';'); @@ -5160,9 +5234,9 @@ static void init_putv(CType *type, Section *sec, unsigned long c, /* XXX: generate error if incorrect relocation */ gen_assign_cast(&dtype); bt = type->t & VT_BTYPE; - /* we'll write at most 12 bytes */ - if (c + 12 > sec->data_allocated) { - section_realloc(sec, c + 12); + /* we'll write at most 16 bytes */ + if (c + 16 > sec->data_allocated) { + section_realloc(sec, c + 16); } ptr = sec->data + c; /* XXX: make code faster ? 
*/ @@ -5184,6 +5258,9 @@ static void init_putv(CType *type, Section *sec, unsigned long c, (bt == VT_INT && bit_size != 32))) tcc_error("initializer element is not computable at load time"); switch(bt) { + /* XXX: when cross-compiling we assume that each type has the + same representation on host and target, which is likely to + be wrong in the case of long double */ case VT_BOOL: vtop->c.i = (vtop->c.i != 0); case VT_BYTE: @@ -5203,7 +5280,7 @@ static void init_putv(CType *type, Section *sec, unsigned long c, break; case VT_PTR: { addr_t val = (vtop->c.ptr_offset & bit_mask) << bit_pos; -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) if (vtop->r & VT_SYM) greloca(sec, vtop->sym, c, R_DATA_PTR, val); else @@ -5217,7 +5294,7 @@ static void init_putv(CType *type, Section *sec, unsigned long c, } default: { int val = (vtop->c.i & bit_mask) << bit_pos; -#ifdef TCC_TARGET_X86_64 +#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64) if (vtop->r & VT_SYM) greloca(sec, vtop->sym, c, R_DATA_PTR, val); else diff --git a/tccrun.c b/tccrun.c index 13c20120..264322bf 100644 --- a/tccrun.c +++ b/tccrun.c @@ -604,6 +604,27 @@ static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level) } } +/* ------------------------------------------------------------- */ +#elif defined(__aarch64__) + +static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level) +{ + if (level < 0) + return -1; + else if (level == 0) { + *paddr = uc->uc_mcontext.pc; + return 0; + } + else { + addr_t *fp = (addr_t *)uc->uc_mcontext.regs[29]; + int i; + for (i = 1; i < level; i++) + fp = (addr_t *)fp[0]; + *paddr = fp[1]; + return 0; + } +} + /* ------------------------------------------------------------- */ #else diff --git a/tcctok.h b/tcctok.h index 53d0659c..64d1288f 100644 --- a/tcctok.h +++ b/tcctok.h @@ -59,6 +59,10 @@ DEF(TOK_ASM2, "__asm") DEF(TOK_ASM3, "__asm__") +#ifdef TCC_TARGET_ARM64 + DEF(TOK_UINT128, "__uint128_t") +#endif + /*********************************************************************/ /* the following are not keywords. They are included to ease parsing */ /* preprocessor only */ @@ -136,6 +140,11 @@ DEF(TOK_REGPARM1, "regparm") DEF(TOK_REGPARM2, "__regparm__") +#ifdef TCC_TARGET_ARM64 + DEF(TOK___va_start, "__va_start") + DEF(TOK___va_arg, "__va_arg") +#endif + /* pragma */ DEF(TOK_pack, "pack") #if !defined(TCC_TARGET_I386) && !defined(TCC_TARGET_X86_64) @@ -229,6 +238,30 @@ #if defined TCC_TARGET_PE DEF(TOK___chkstk, "__chkstk") #endif +#ifdef TCC_TARGET_ARM64 + DEF(TOK___addtf3, "__addtf3") + DEF(TOK___subtf3, "__subtf3") + DEF(TOK___multf3, "__multf3") + DEF(TOK___divtf3, "__divtf3") + DEF(TOK___extendsftf2, "__extendsftf2") + DEF(TOK___extenddftf2, "__extenddftf2") + DEF(TOK___trunctfsf2, "__trunctfsf2") + DEF(TOK___trunctfdf2, "__trunctfdf2") + DEF(TOK___fixtfsi, "__fixtfsi") + DEF(TOK___fixtfdi, "__fixtfdi") + DEF(TOK___fixunstfsi, "__fixunstfsi") + DEF(TOK___fixunstfdi, "__fixunstfdi") + DEF(TOK___floatsitf, "__floatsitf") + DEF(TOK___floatditf, "__floatditf") + DEF(TOK___floatunsitf, "__floatunsitf") + DEF(TOK___floatunditf, "__floatunditf") + DEF(TOK___eqtf2, "__eqtf2") + DEF(TOK___netf2, "__netf2") + DEF(TOK___lttf2, "__lttf2") + DEF(TOK___letf2, "__letf2") + DEF(TOK___gttf2, "__gttf2") + DEF(TOK___getf2, "__getf2") +#endif /* bound checking symbols */ #ifdef CONFIG_TCC_BCHECK