diff --git a/arm-asm.c b/arm-asm.c index c31df7d0..9c390d24 100644 --- a/arm-asm.c +++ b/arm-asm.c @@ -1293,6 +1293,7 @@ static void asm_single_data_transfer_opcode(TCCState *s1, int token) } } +// Note: Only call this using a VFP register if you know exactly what you are doing (i.e. cp_number is 10 or 11 and you are doing a vmov) static void asm_emit_coprocessor_data_transfer(uint32_t high_nibble, uint8_t cp_number, uint8_t CRd, const Operand* Rn, const Operand* offset, int offset_minus, int preincrement, int writeback, int long_transfer, int load) { uint32_t opcode = 0x0; opcode |= 1 << 26; // Load/Store @@ -1306,12 +1307,14 @@ static void asm_emit_coprocessor_data_transfer(uint32_t high_nibble, uint8_t cp_ opcode |= cp_number << 8; + //assert(CRd < 16); opcode |= ENCODE_RD(CRd); if (Rn->type != OP_REG32) { expect("register"); return; } + //assert(Rn->reg < 16); opcode |= ENCODE_RN(Rn->reg); if (preincrement) opcode |= 1 << 24; // add offset before transfer @@ -1344,6 +1347,9 @@ static void asm_emit_coprocessor_data_transfer(uint32_t high_nibble, uint8_t cp_ opcode |= offset->reg; tcc_error("Using register offset to register address is not possible here"); return; + } else if (offset->type == OP_VREG64) { + opcode |= 16; + opcode |= offset->reg; } else expect("immediate or register"); @@ -1419,6 +1425,9 @@ static void asm_coprocessor_data_transfer_opcode(TCCState *s1, int token) tcc_error("Using 'pc' for register offset in '%s' is not implemented by ARM", get_tok_str(token, NULL)); return; } + } else if (ops[2].type == OP_VREG64) { + tcc_error("'%s' does not support VFP register operand", get_tok_str(token, NULL)); + return; } } else { // end of input expression in brackets--assume 0 offset @@ -1816,12 +1825,77 @@ static void asm_floating_point_immediate_data_processing_opcode_tail(TCCState *s asm_emit_coprocessor_opcode(condition_code_of_token(token), coprocessor, opcode1, operands[0], operands[1], operands[2], opcode2, 0); } +static void 
asm_floating_point_reg_arm_reg_transfer_opcode_tail(TCCState *s1, int token, int coprocessor, int nb_arm_regs, int nb_ops, Operand ops[3]) { + uint8_t opcode1 = 0; + uint8_t opcode2 = 0; + switch (coprocessor) { + case CP_SINGLE_PRECISION_FLOAT: + // "vmov.f32 r2, s3" or "vmov.f32 s3, r2" + if (nb_ops != 2 || nb_arm_regs != 1) { + tcc_error("vmov.f32 only implemented for one VFP register operand and one ARM register operand"); + return; + } + if (ops[0].type != OP_REG32) { // determine mode: load or store + // need to swap operands 0 and 1 + memcpy(&ops[2], &ops[1], sizeof(ops[2])); + memcpy(&ops[1], &ops[0], sizeof(ops[1])); + memcpy(&ops[0], &ops[2], sizeof(ops[0])); + } else + opcode1 |= 1; + + if (ops[1].type == OP_VREG32) { + if (ops[1].reg & 1) + opcode2 |= 4; + ops[1].reg >>= 1; + } + + if (ops[0].type == OP_VREG32) { + if (ops[0].reg & 1) + opcode1 |= 4; + ops[0].reg >>= 1; + } + + asm_emit_coprocessor_opcode(condition_code_of_token(token), coprocessor, opcode1, ops[0].reg, (ops[1].type == OP_IM8) ?
ops[1].e.v : ops[1].reg, 0x10, opcode2, 0); + break; + case CP_DOUBLE_PRECISION_FLOAT: + if (nb_ops != 3 || nb_arm_regs != 2) { + tcc_error("vmov.f64 only implemented for one VFP register operand and two ARM register operands"); + return; + } + // Determine whether it's a store into a VFP register (vmov "d1, r2, r3") rather than "vmov r2, r3, d1" + if (ops[0].type == OP_VREG64) { + if (ops[2].type == OP_REG32) { + Operand temp; + // need to rotate operand list to the left + memcpy(&temp, &ops[0], sizeof(temp)); + memcpy(&ops[0], &ops[1], sizeof(ops[0])); + memcpy(&ops[1], &ops[2], sizeof(ops[1])); + memcpy(&ops[2], &temp, sizeof(ops[2])); + } else { + tcc_error("vmov.f64 only implemented for one VFP register operand and two ARM register operands"); + return; + } + } else if (ops[0].type != OP_REG32 || ops[1].type != OP_REG32 || ops[2].type != OP_VREG64) { + tcc_error("vmov.f64 only implemented for one VFP register operand and two ARM register operands"); + return; + } else { + opcode1 |= 1; + } + asm_emit_coprocessor_data_transfer(condition_code_of_token(token), coprocessor, ops[0].reg, &ops[1], &ops[2], 0, 0, 0, 1, opcode1); + break; + default: + tcc_internal_error("unknown coprocessor"); + } +} + static void asm_floating_point_data_processing_opcode(TCCState *s1, int token) { uint8_t coprocessor = CP_SINGLE_PRECISION_FLOAT; uint8_t opcode1 = 0; uint8_t opcode2 = 0; // (0 || 2) | register selection Operand ops[3]; uint8_t nb_ops = 0; + int vmov = 0; + int nb_arm_regs = 0; /* TODO: Instruction opcode opcode2 Reason @@ -1835,12 +1909,8 @@ static void asm_floating_point_data_processing_opcode(TCCState *s1, int token) { VCVT* VMOV Fd, Fm - VMOV Sn, Rd - VMOV Rd, Sn VMOV Sn, Sm, Rd, Rn VMOV Rd, Rn, Sn, Sm - VMOV Dm, Rd, Rn - VMOV Rd, Rn, Dm VMOV Dn[0], Rd VMOV Rd, Dn[0] VMOV Dn[1], Rd @@ -1870,13 +1940,23 @@ static void asm_floating_point_data_processing_opcode(TCCState *s1, int token) { coprocessor = CP_DOUBLE_PRECISION_FLOAT; } + switch (ARM_INSTRUCTION_GROUP(token))
{ + case TOK_ASM_vmoveq_f32: + case TOK_ASM_vmoveq_f64: + vmov = 1; + break; + } + for (nb_ops = 0; nb_ops < 3; ) { + // Note: Necessary because parse_operand can't parse decimal numerals. if (nb_ops == 1 && (tok == '#' || tok == '$')) { asm_floating_point_immediate_data_processing_opcode_tail(s1, token, coprocessor, ops[0].reg); return; } parse_operand(s1, &ops[nb_ops]); - if (ops[nb_ops].type == OP_VREG32) { + if (vmov && ops[nb_ops].type == OP_REG32) { + ++nb_arm_regs; + } else if (ops[nb_ops].type == OP_VREG32) { if (coprocessor != CP_SINGLE_PRECISION_FLOAT) { expect("'s'"); return; @@ -1897,14 +1977,16 @@ static void asm_floating_point_data_processing_opcode(TCCState *s1, int token) { break; } - if (nb_ops == 2) { // implicit - memcpy(&ops[2], &ops[1], sizeof(ops[1])); // move ops[2] - memcpy(&ops[1], &ops[0], sizeof(ops[0])); // ops[1] was implicit - nb_ops = 3; - } - if (nb_ops < 3) { - tcc_error("Not enough operands for '%s' (%u)", get_tok_str(token, NULL), nb_ops); - return; + if (nb_arm_regs == 0) { + if (nb_ops == 2) { // implicit + memcpy(&ops[2], &ops[1], sizeof(ops[1])); // move ops[2] + memcpy(&ops[1], &ops[0], sizeof(ops[0])); // ops[1] was implicit + nb_ops = 3; + } + if (nb_ops < 3) { + tcc_error("Not enough operands for '%s' (%u)", get_tok_str(token, NULL), nb_ops); + return; + } } switch (ARM_INSTRUCTION_GROUP(token)) { @@ -1990,11 +2072,15 @@ static void asm_floating_point_data_processing_opcode(TCCState *s1, int token) { break; case TOK_ASM_vmoveq_f32: case TOK_ASM_vmoveq_f64: - // FIXME: Check for ARM registers--and allow only very little. 
- opcode1 = 11; // "Other" instruction - opcode2 = 2; - ops[1].type = OP_IM8; - ops[1].e.v = 0; + if (nb_arm_regs > 0) { // vmov.f32 r2, s3 or similar + asm_floating_point_reg_arm_reg_transfer_opcode_tail(s1, token, coprocessor, nb_arm_regs, nb_ops, ops); + return; + } else { + opcode1 = 11; // "Other" instruction + opcode2 = 2; + ops[1].type = OP_IM8; + ops[1].e.v = 0; + } break; // TODO: vcvt; vcvtr default: diff --git a/tests/arm-asm-testsuite.sh b/tests/arm-asm-testsuite.sh index 6a3845e5..19acf907 100755 --- a/tests/arm-asm-testsuite.sh +++ b/tests/arm-asm-testsuite.sh @@ -151,6 +151,10 @@ do "d3, #0.0" \ "s4, #-0.1796875" \ "d4, #0.1796875" \ + "r2, r3, d1" \ + "d1, r2, r3" \ + "s1, r2" \ + "r2, s1" \ "" do #echo ".syntax unified" > a.s @@ -214,6 +218,9 @@ else "bl r3"|"b r3"|"mov r2, #0xEFFF"|"mov r4, #0x0201") known_failure=" (known failure)" ;; + "vmov.f32 r2, r3, d1"|"vmov.f32 d1, r2, r3") # GNU as bug + known_failure=" (known failure)" + ;; *) known_failure="" status=1