From 79a8229fb5da15135ac0c54c2c696a8da8988b24 Mon Sep 17 00:00:00 2001
From: herman ten brugge <hermantenbrugge@home.nl>
Date: Sun, 16 Oct 2022 18:51:56 +0200
Subject: [PATCH] Add atomic functions for arm/arm64/riscv

Make the code more compatible with gcc by switching these builtins to
gcc's generic, pointer-based signatures:
__atomic_store
__atomic_load
__atomic_exchange
__atomic_compare_exchange
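
Values are now passed and returned through pointers, as with gcc.  A minimal
sketch, for illustration only (not taken from the tests):

    #include <stdatomic.h>

    int main(void)
    {
        int a, val = 1, newval = 2, expected = 1, old;

        __atomic_store(&a, &val, __ATOMIC_SEQ_CST);              /* a = 1          */
        __atomic_load(&a, &old, __ATOMIC_SEQ_CST);               /* old = 1        */
        __atomic_exchange(&a, &newval, &old, __ATOMIC_SEQ_CST);  /* old = 1, a = 2 */
        __atomic_compare_exchange(&a, &expected, &val, 0,        /* a != expected: */
                                  __ATOMIC_SEQ_CST,              /* returns false, */
                                  __ATOMIC_SEQ_CST);             /* expected = 2   */
        return 0;
    }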

Also add (include/stdatomic.h, lib/stdatomic.c):
atomic_thread_fence
atomic_signal_fence
atomic_is_lock_free

And gcc extensions (tcctok.h, tccgen.c, lib/stdatomic.c), illustrated in
the sketch after this list:
__atomic_fetch_nand
__atomic_add_fetch
__atomic_sub_fetch
__atomic_or_fetch
__atomic_xor_fetch
__atomic_and_fetch
__atomic_nand_fetch
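
The *_fetch variants return the new value and the fetch_* variants the
previous value, as checked by test_atomic_op2 in 125_atomic_misc.c.  A
minimal sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void)
    {
        int v = 5;
        int old  = __atomic_fetch_add(&v, 3, __ATOMIC_SEQ_CST);    /* old  = 5, v = 8 */
        int newv = __atomic_and_fetch(&v, 0x0c, __ATOMIC_SEQ_CST); /* newv = 8, v = 8 */
        int nand = __atomic_nand_fetch(&v, 1, __ATOMIC_SEQ_CST);   /* ~(8 & 1) = -1   */
        printf("%d %d %d %d\n", old, newv, nand, v);                /* 5 8 -1 -1       */
        return 0;
    }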

Add a new file, lib/atomic.S, with assembly implementations of the
__atomic_compare_exchange_<size> helpers for arm/arm64/riscv, and update
lib/Makefile accordingly.

Update the test code in 124_atomic_counter and 125_atomic_misc to exercise
the new functions, and update tests/tests2/Makefile so these tests also run
on arm/arm64/riscv.
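
For reference, a minimal sketch of how the new <stdatomic.h> entry points
are meant to be used; the flag loop mirrors adder_test_and_set in
124_atomic_counter.c:

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void)
    {
        atomic_int counter;
        atomic_flag lock = ATOMIC_FLAG_INIT;

        atomic_init(&counter, 0);
        printf("%d\n", atomic_is_lock_free(&counter)); /* 1 on these targets */

        while (atomic_flag_test_and_set(&lock))        /* spin until we own the flag */
            ;
        ++counter;                                     /* protected by the flag */
        atomic_flag_clear(&lock);

        atomic_thread_fence(__ATOMIC_SEQ_CST);         /* full memory barrier */
        atomic_signal_fence(__ATOMIC_SEQ_CST);         /* compiler barrier only */
        return 0;
    }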
---
 include/stdatomic.h                    |  88 +++-
 lib/Makefile                           |  10 +-
 lib/atomic.S                           | 606 +++++++++++++++++++++++++
 lib/stdatomic.c                        | 175 ++++---
 tccgen.c                               |  57 ++-
 tcctok.h                               |   7 +
 tests/tests2/124_atomic_counter.c      | 103 ++++-
 tests/tests2/124_atomic_counter.expect |   1 +
 tests/tests2/125_atomic_misc.c         | 119 +++++
 tests/tests2/125_atomic_misc.expect    |  40 +-
 tests/tests2/Makefile                  |   2 -
 11 files changed, 1078 insertions(+), 130 deletions(-)
 create mode 100644 lib/atomic.S

diff --git a/include/stdatomic.h b/include/stdatomic.h
index f00c62f0..f1e16134 100644
--- a/include/stdatomic.h
+++ b/include/stdatomic.h
@@ -14,6 +14,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <stdbool.h>
 
 #define __ATOMIC_RELAXED 0
 #define __ATOMIC_CONSUME 1
@@ -78,41 +79,68 @@ typedef struct {
 
 #define ATOMIC_FLAG_INIT {0}
 
-#define atomic_flag_test_and_set(object) \
-    __atomic_exchange(&(object)->value, 1, __ATOMIC_SEQ_CST)
-#define atomic_flag_test_and_set_explicit(object, order) \
-    __atomic_exchange(&(object)->value, 1, order)
+#define atomic_flag_test_and_set_explicit(object, order)                  \
+    ({ bool ret, value = 1;                                               \
+       __atomic_exchange(&(object)->value, &value, &ret, order);          \
+       ret;                                                               \
+    })
+#define atomic_flag_test_and_set(object)                                  \
+    atomic_flag_test_and_set_explicit(object, __ATOMIC_SEQ_CST)
 
+#define atomic_flag_clear_explicit(object, order)                         \
+    ({ bool value = 0;                                                    \
+       __atomic_store(&(object)->value, &value, order);                   \
+    })
 #define atomic_flag_clear(object) \
-    __atomic_store(&(object)->value, 0, __ATOMIC_SEQ_CST)
-#define atomic_flag_clear_explicit(object, order) \
-    __atomic_store(&(object)->value, 0, order)
+    atomic_flag_clear_explicit(object, __ATOMIC_SEQ_CST)
 
 /* Generic routines */
-#define atomic_init(object, desired) \
-    __atomic_store(object, desired, __ATOMIC_RELAXED)
+#define atomic_init(object, desired)                                      \
+    atomic_store_explicit(object, desired, __ATOMIC_RELAXED)
 
-#define atomic_store(object, desired) \
-    __atomic_store(object, desired, __ATOMIC_SEQ_CST)
-#define atomic_store_explicit __atomic_store
+#define atomic_store_explicit(object, desired, order)                     \
+    ({ __typeof__ (object) ptr = (object);                                \
+       __typeof__ (*ptr) tmp = (desired);                                 \
+       __atomic_store (ptr, &tmp, (order));                               \
+    })
+#define atomic_store(object, desired)                                     \
+     atomic_store_explicit (object, desired, __ATOMIC_SEQ_CST)
 
-#define atomic_load(object) \
-    __atomic_load(object, __ATOMIC_SEQ_CST)
-#define atomic_load_explicit __atomic_load
+#define atomic_load_explicit(object, order)                               \
+    ({ __typeof__ (object) ptr = (object);                                \
+       __typeof__ (*ptr) tmp;                                             \
+       __atomic_load (ptr, &tmp, (order));                                \
+       tmp;                                                               \
+    })
+#define atomic_load(object) atomic_load_explicit (object, __ATOMIC_SEQ_CST)
 
-#define atomic_exchange(object, desired) \
-    __atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
-#define atomic_exchange_explicit __atomic_exchange
+#define atomic_exchange_explicit(object, desired, order)                  \
+    ({ __typeof__ (object) ptr = (object);                                \
+       __typeof__ (*ptr) val = (desired);                                 \
+       __typeof__ (*ptr) tmp;                                             \
+       __atomic_exchange (ptr, &val, &tmp, (order));                      \
+       tmp;                                                               \
+    })
+#define atomic_exchange(object, desired)                                  \
+  atomic_exchange_explicit (object, desired, __ATOMIC_SEQ_CST)
 
-#define atomic_compare_exchange_strong(object, expected, desired) \
-    __atomic_compare_exchange(object, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
 #define atomic_compare_exchange_strong_explicit(object, expected, desired, success, failure) \
-    __atomic_compare_exchange(object, expected, desired, 0, success, failure)
+    ({ __typeof__ (object) ptr = (object);                                \
+       __typeof__ (*ptr) tmp = desired;                                   \
+       __atomic_compare_exchange(ptr, expected, &tmp, 0, success, failure); \
+    })
+#define atomic_compare_exchange_strong(object, expected, desired)         \
+    atomic_compare_exchange_strong_explicit (object, expected, desired,   \
+                                             __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
 
-#define atomic_compare_exchange_weak(object, expected, desired) \
-    __atomic_compare_exchange(object, expected, desired, 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
 #define atomic_compare_exchange_weak_explicit(object, expected, desired, success, failure) \
-    __atomic_compare_exchange(object, expected, desired, 1, success, failure)
+    ({ __typeof__ (object) ptr = (object);                                \
+       __typeof__ (*ptr) tmp = desired;                                   \
+       __atomic_compare_exchange(ptr, expected, &tmp, 1, success, failure); \
+    })
+#define atomic_compare_exchange_weak(object, expected, desired)           \
+    atomic_compare_exchange_weak_explicit (object, expected, desired,     \
+                                           __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
 
 #define atomic_fetch_add(object, operand) \
     __atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
@@ -134,4 +162,16 @@ typedef struct {
     __atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
 #define atomic_fetch_and_explicit __atomic_fetch_and
 
+extern void atomic_thread_fence (memory_order);
+extern void __atomic_thread_fence (memory_order);
+#define atomic_thread_fence(order) __atomic_thread_fence (order)
+extern void atomic_signal_fence (memory_order);
+extern void __atomic_signal_fence (memory_order);
+#define atomic_signal_fence(order) __atomic_signal_fence (order)
+extern bool __atomic_is_lock_free(size_t size, void *ptr);
+#define atomic_is_lock_free(OBJ) __atomic_is_lock_free (sizeof (*(OBJ)), (OBJ))
+
+extern bool __atomic_test_and_set (void *, memory_order);
+extern void __atomic_clear (bool *, memory_order);
+
 #endif /* _STDATOMIC_H */
diff --git a/lib/Makefile b/lib/Makefile
index 74722b61..046ea3c8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -42,11 +42,11 @@ $(X)BT_O += tcov.o
 
 DSO_O = dsohandle.o
 
-I386_O = libtcc1.o alloca.o alloca-bt.o $(BT_O) stdatomic.o
-X86_64_O = libtcc1.o alloca.o alloca-bt.o $(BT_O) stdatomic.o
-ARM_O = libtcc1.o armeabi.o alloca.o armflush.o fetch_and_add.o $(BT_O)
-ARM64_O = lib-arm64.o fetch_and_add.o $(BT_O)
-RISCV64_O = lib-arm64.o fetch_and_add.o $(BT_O)
+I386_O = libtcc1.o alloca.o alloca-bt.o stdatomic.o $(BT_O)
+X86_64_O = libtcc1.o alloca.o alloca-bt.o stdatomic.o $(BT_O)
+ARM_O = libtcc1.o armeabi.o alloca.o armflush.o fetch_and_add.o stdatomic.o atomic.o $(BT_O)
+ARM64_O = lib-arm64.o fetch_and_add.o stdatomic.o atomic.o $(BT_O)
+RISCV64_O = lib-arm64.o fetch_and_add.o stdatomic.o atomic.o $(BT_O)
 WIN_O = crt1.o crt1w.o wincrt1.o wincrt1w.o dllcrt1.o dllmain.o
 
 OBJ-i386 = $(I386_O) $(BCHECK_O) $(DSO_O)
diff --git a/lib/atomic.S b/lib/atomic.S
new file mode 100644
index 00000000..84bf0f5c
--- /dev/null
+++ b/lib/atomic.S
@@ -0,0 +1,606 @@
+/* ---------------------------------------------- */
+/* This file implements for arm/arm64/riscv:
+ * __atomic_compare_exchange_1
+ * __atomic_compare_exchange_2
+ * __atomic_compare_exchange_4
+ * __atomic_compare_exchange_8
+ */
+#if defined __arm__
+
+#ifndef __TINYC__
+	.arch armv6k
+	.syntax unified
+#endif
+        .text
+        .align  2
+
+        .global __atomic_compare_exchange_1
+        .type   __atomic_compare_exchange_1, %function
+__atomic_compare_exchange_1:
+#ifdef __TINYC__
+	.int	0xe52de004
+	.int	0xe5d13000
+	.int	0xf57ff05b
+	.int	0xe1d0cf9f
+	.int	0xe15c0003
+	.int	0x1a000002
+	.int	0xe1c0ef92
+	.int	0xe35e0000
+	.int	0x1afffff9
+	.int	0x03a00001
+	.int	0x13a00000
+	.int	0xf57ff05b
+	.int	0x15c1c000
+	.int	0xe49df004
+#else
+        str     lr, [sp, #-4]!
+        ldrb    r3, [r1]
+        mcr     p15, 0, r0, c7, c10, 5
+.L1:
+        ldrexb  ip, [r0]
+        cmp     ip, r3
+        bne     .L2
+        strexb  lr, r2, [r0]
+        cmp     lr, #0
+        bne     .L1
+.L2:
+        mcr     p15, 0, r0, c7, c10, 5
+        moveq   r0, #1
+        movne   r0, #0
+        strbne  ip, [r1]
+        ldr     pc, [sp], #4
+#endif
+	.size   __atomic_compare_exchange_1, .-__atomic_compare_exchange_1
+
+        .global __atomic_compare_exchange_2
+        .type   __atomic_compare_exchange_2, %function
+__atomic_compare_exchange_2:
+#ifdef __TINYC__
+	.int	0xe52de004
+	.int	0xe1d130b0
+	.int	0xf57ff05b
+	.int	0xe1f0cf9f
+	.int	0xe15c0003
+	.int	0x1a000002
+	.int	0xe1e0ef92
+	.int	0xe35e0000
+	.int	0x1afffff9
+	.int	0x03a00001
+	.int	0x13a00000
+	.int	0xf57ff05b
+	.int	0x11c1c0b0
+	.int	0xe49df004
+#else
+        str     lr, [sp, #-4]!
+        ldrh    r3, [r1]
+        mcr     p15, 0, r0, c7, c10, 5
+.L3:
+        ldrexh  ip, [r0]
+        cmp     ip, r3
+        bne     .L4
+        strexh  lr, r2, [r0]
+        cmp     lr, #0
+        bne     .L3
+.L4:
+        mcr     p15, 0, r0, c7, c10, 5
+        moveq   r0, #1
+        movne   r0, #0
+        strhne  ip, [r1]
+	ldr     pc, [sp], #4
+#endif
+	.size   __atomic_compare_exchange_2, .-__atomic_compare_exchange_2
+
+        .global __atomic_compare_exchange_4
+        .type   __atomic_compare_exchange_4, %function
+__atomic_compare_exchange_4:
+#ifdef __TINYC__
+	.int	0xe52de004
+	.int	0xe5913000
+	.int	0xf57ff05b
+	.int	0xe190cf9f
+	.int	0xe15c0003
+	.int	0x1a000002
+	.int	0xe180ef92
+	.int	0xe35e0000
+	.int	0x1afffff9
+	.int	0x03a00001
+	.int	0x13a00000
+	.int	0xf57ff05b
+	.int	0x1581c000
+	.int	0xe49df004
+#else
+        str     lr, [sp, #-4]!
+        ldr     r3, [r1]
+        mcr     p15, 0, r0, c7, c10, 5
+.L5:
+        ldrex   ip, [r0]
+        cmp     ip, r3
+        bne     .L6
+        strex   lr, r2, [r0]
+        cmp     lr, #0
+        bne     .L5
+.L6:
+        mcr     p15, 0, r0, c7, c10, 5
+        moveq   r0, #1
+        movne   r0, #0
+        strne   ip, [r1]
+        ldr     pc, [sp], #4
+#endif
+	.size   __atomic_compare_exchange_4, .-__atomic_compare_exchange_4
+
+/* ---------------------------------------------- */
+#elif defined __aarch64__
+
+        .text
+        .align  2
+
+        .global __atomic_compare_exchange_1
+        .type   __atomic_compare_exchange_1, %function
+__atomic_compare_exchange_1:
+#ifdef __TINYC__
+	.int	0xa9be7bfd
+	.int	0x910003fd
+	.int	0xa90153f3
+	.int	0xaa0103f3
+	.int	0x12001c41
+	.int	0xaa0003e2
+	.int	0x39400274
+	.int	0x2a1403e0
+	.int	0x53001c10
+	.int	0x085ffc40
+	.int	0x6b10001f
+	.int	0x54000061
+	.int	0x0811fc41
+	.int	0x35ffff91
+	.int	0x6b34001f
+	.int	0x1a9f17e1
+	.int	0x54000040
+	.int	0x39000260
+	.int	0x2a0103e0
+	.int	0xa94153f3
+	.int	0xa8c27bfd
+	.int	0xd65f03c0
+#else
+        stp     x29, x30, [sp, -32]!
+        mov     x29, sp
+        stp     x19, x20, [sp, 16]
+        mov     x19, x1
+        and     w1, w2, 255
+        mov     x2, x0
+        ldrb    w20, [x19]
+        mov     w0, w20
+	uxtb    w16, w0
+.L1:
+	ldaxrb  w0, [x2]
+	cmp     w0, w16
+	b.ne    .L2
+	stlxrb  w17, w1, [x2]
+	cbnz    w17, .L1
+.L2:
+        cmp     w0, w20, uxtb
+        cset    w1, eq
+        beq     .L3
+        strb    w0, [x19]
+.L3:
+        mov     w0, w1
+        ldp     x19, x20, [sp, 16]
+        ldp     x29, x30, [sp], 32
+        ret
+#endif
+        .size   __atomic_compare_exchange_1, .-__atomic_compare_exchange_1
+
+        .global __atomic_compare_exchange_2
+        .type   __atomic_compare_exchange_2, %function
+__atomic_compare_exchange_2:
+#ifdef __TINYC__
+	.int	0xa9be7bfd
+	.int	0x910003fd
+	.int	0xa90153f3
+	.int	0xaa0103f3
+	.int	0x12003c41
+	.int	0xaa0003e2
+	.int	0x79400274
+	.int	0x2a1403e0
+	.int	0x53003c10
+	.int	0x485ffc40
+	.int	0x6b10001f
+	.int	0x54000061
+	.int	0x4811fc41
+	.int	0x35ffff91
+	.int	0x6b34201f
+	.int	0x1a9f17e1
+	.int	0x54000040
+	.int	0x79000260
+	.int	0x2a0103e0
+	.int	0xa94153f3
+	.int	0xa8c27bfd
+	.int	0xd65f03c0
+#else
+        stp     x29, x30, [sp, -32]!
+        mov     x29, sp
+        stp     x19, x20, [sp, 16]
+        mov     x19, x1
+        and     w1, w2, 65535
+        mov     x2, x0
+        ldrh    w20, [x19]
+        mov     w0, w20
+	uxth    w16, w0
+.L4:
+	ldaxrh  w0, [x2]
+	cmp     w0, w16
+	b.ne    .L5
+	stlxrh  w17, w1, [x2]
+	cbnz    w17, .L4
+.L5:
+        cmp     w0, w20, uxth
+        cset    w1, eq
+        beq     .L6
+        strh    w0, [x19]
+.L6:
+        mov     w0, w1
+        ldp     x19, x20, [sp, 16]
+        ldp     x29, x30, [sp], 32
+        ret
+#endif
+        .size   __atomic_compare_exchange_2, .-__atomic_compare_exchange_2
+
+        .global __atomic_compare_exchange_4
+        .type   __atomic_compare_exchange_4, %function
+__atomic_compare_exchange_4:
+#ifdef __TINYC__
+	.int	0xa9be7bfd
+	.int	0x910003fd
+	.int	0xa90153f3
+	.int	0xaa0103f3
+	.int	0x2a0203e1
+	.int	0xaa0003e2
+	.int	0xb9400274
+	.int	0x2a1403e0
+	.int	0x2a0003f0
+	.int	0x885ffc40
+	.int	0x6b10001f
+	.int	0x54000061
+	.int	0x8811fc41
+	.int	0x35ffff91
+	.int	0x6b14001f
+	.int	0x1a9f17e1
+	.int	0x54000040
+	.int	0xb9000260
+	.int	0x2a0103e0
+	.int	0xa94153f3
+	.int	0xa8c27bfd
+	.int	0xd65f03c0
+#else
+        stp     x29, x30, [sp, -32]!
+        mov     x29, sp
+        stp     x19, x20, [sp, 16]
+        mov     x19, x1
+        mov     w1, w2
+        mov     x2, x0
+        ldr     w20, [x19]
+        mov     w0, w20
+	mov     w16, w0
+.L7:
+	ldaxr   w0, [x2]
+	cmp     w0, w16
+	b.ne    .L8
+	stlxr   w17, w1, [x2]
+	cbnz    w17, .L7
+.L8:
+        cmp     w0, w20
+        cset    w1, eq
+        beq     .L9
+        str     w0, [x19]
+.L9:
+        mov     w0, w1
+        ldp     x19, x20, [sp, 16]
+        ldp     x29, x30, [sp], 32
+        ret
+#endif
+        .size   __atomic_compare_exchange_4, .-__atomic_compare_exchange_4
+
+        .global __atomic_compare_exchange_8
+        .type   __atomic_compare_exchange_8, %function
+__atomic_compare_exchange_8:
+#ifdef __TINYC__
+	.int	0xa9be7bfd
+	.int	0x910003fd
+	.int	0xa90153f3
+	.int	0xaa0103f3
+	.int	0xaa0203e1
+	.int	0xaa0003e2
+	.int	0xf9400274
+	.int	0xaa1403e0
+	.int	0xaa0003f0
+	.int	0xc85ffc40
+	.int	0xeb10001f
+	.int	0x54000061
+	.int	0xc811fc41
+	.int	0x35ffff91
+	.int	0xeb14001f
+	.int	0x1a9f17e1
+	.int	0x54000040
+	.int	0xf9000260
+	.int	0x2a0103e0
+	.int	0xa94153f3
+	.int	0xa8c27bfd
+	.int	0xd65f03c0
+#else
+        stp     x29, x30, [sp, -32]!
+        mov     x29, sp
+        stp     x19, x20, [sp, 16]
+        mov     x19, x1
+        mov     x1, x2
+        mov     x2, x0
+        ldr     x20, [x19]
+        mov     x0, x20
+	mov     x16, x0
+.L10:
+	ldaxr   x0, [x2]
+	cmp     x0, x16
+	b.ne    .L11
+	stlxr   w17, x1, [x2]
+	cbnz    w17, .L10
+.L11:
+        cmp     x0, x20
+        cset    w1, eq
+        beq     .L12
+        str     x0, [x19]
+.L12:
+        mov     w0, w1
+        ldp     x19, x20, [sp, 16]
+        ldp     x29, x30, [sp], 32
+        ret
+#endif
+        .size   __atomic_compare_exchange_8, .-__atomic_compare_exchange_8
+
+/* ---------------------------------------------- */
+#elif defined __riscv
+
+        .text
+        .align  2
+
+        .global __atomic_compare_exchange_1
+        .type   __atomic_compare_exchange_1, %function
+__atomic_compare_exchange_1:
+#ifdef __TINYC__
+	.short	0x1141
+	.short	0x86ba
+	.short	0x873e
+	.short	0xe406
+	.int	0x0ff0000f
+	.int	0x0005c803
+	.int	0xff857893
+	.int	0x0008b783
+	.short	0x891d
+	.short	0x050e
+	.int	0x0ff00693
+	.int	0x00a696b3
+	.int	0x00a81833
+	.int	0x00a61633
+	.int	0xfff6c713
+	.short	0x8f7d
+	.int	0x00f6f333
+	.short	0x8f51
+	.int	0x03031263
+	.int	0x1008b32f
+	.int	0x00f31663
+	.int	0x18e8be2f
+	.int	0xfe0e1ae3
+	.int	0x40f30733
+	.short	0x879a
+	.short	0xff69
+	.int	0x0ff0000f
+	.short	0x4505
+	.short	0xa801
+	.int	0x00a7d7b3
+	.int	0x00f58023
+	.int	0x0ff0000f
+	.short	0x4501
+	.short	0x60a2
+	.short	0x0141
+	.short	0x8082
+#else
+        addi    sp,sp,-16
+        mv      a3,a4
+        mv      a4,a5
+        sd      ra,8(sp)
+	fence
+	lbu     a6,0(a1)
+	andi    a7,a0,-8
+	ld      a5,0(a7)
+	andi    a0,a0,7
+	slli    a0,a0,0x3
+	li      a3,255
+	sll     a3,a3,a0
+	sll     a6,a6,a0
+	sll     a2,a2,a0
+.L1:
+	not     a4,a3
+	and     a4,a4,a5
+	and     t1,a3,a5
+	or      a4,a4,a2
+	bne     t1,a6,.L4
+.L2:
+	lr.d    t1,(a7)
+	bne     t1,a5,.L3
+	sc.d    t3,a4,(a7)
+	bnez    t3,.L2
+.L3:
+	sub     a4,t1,a5
+	mv      a5,t1
+	bnez    a4,.L1
+	fence
+	li      a0,1
+	j	.L5
+.L4:
+	srl     a5,a5,a0
+	sb      a5,0(a1)
+	fence
+	li      a0,0
+.L5:
+        ld      ra,8(sp)
+        addi    sp,sp,16
+        jr      ra
+#endif
+        .size   __atomic_compare_exchange_1, .-__atomic_compare_exchange_1
+
+        .global __atomic_compare_exchange_2
+        .type   __atomic_compare_exchange_2, %function
+__atomic_compare_exchange_2:
+#ifdef __TINYC__
+	.short	0x1141
+	.short	0x86ba
+	.short	0x873e
+	.short	0xe406
+	.int	0x0ff0000f
+	.int	0x0005d803
+	.int	0xff857893
+	.short	0x67c1
+	.short	0x891d
+	.int	0x0008b703
+	.short	0x050e
+	.short	0x17fd
+	.int	0x00a797b3
+	.int	0x00a81833
+	.int	0x00a61633
+	.int	0xfff7c693
+	.short	0x8ef9
+	.int	0x00e7f333
+	.short	0x8ed1
+	.int	0x03031263
+	.int	0x1008b32f
+	.int	0x00e31663
+	.int	0x18d8be2f
+	.int	0xfe0e1ae3
+	.int	0x40e306b3
+	.short	0x871a
+	.short	0xfee9
+	.int	0x0ff0000f
+	.short	0x4505
+	.short	0xa801
+	.int	0x00a75733
+	.int	0x00e59023
+	.int	0x0ff0000f
+	.short	0x4501
+	.short	0x60a2
+	.short	0x0141
+	.short	0x8082
+#else
+        addi    sp,sp,-16
+        mv      a3,a4
+        mv      a4,a5
+        sd      ra,8(sp)
+	fence
+	lhu     a6,0(a1)
+	andi    a7,a0,-8
+	lui     a5,0x10
+	andi    a0,a0,7
+	ld      a4,0(a7)
+	slli    a0,a0,0x3
+	addi    a5,a5,-1
+	sll     a5,a5,a0
+	sll     a6,a6,a0
+	sll     a2,a2,a0
+.L6:
+	not     a3,a5
+	and     a3,a3,a4
+	and     t1,a5,a4
+	or      a3,a3,a2
+	bne     t1,a6,.L9
+.L7:
+	lr.d    t1,(a7)
+	bne     t1,a4,.L8
+	sc.d    t3,a3,(a7)
+	bnez    t3,.L7
+.L8:
+	sub     a3,t1,a4
+	mv      a4,t1
+	bnez    a3,.L6
+	fence
+	li      a0,1
+	j	.L10
+.L9:
+	srl     a4,a4,a0
+	sh      a4,0(a1)
+	fence
+	li      a0,0
+.L10:
+        ld      ra,8(sp)
+        addi    sp,sp,16
+        jr      ra
+#endif
+        .size   __atomic_compare_exchange_2, .-__atomic_compare_exchange_2
+
+        .global __atomic_compare_exchange_4
+        .type   __atomic_compare_exchange_4, %function
+__atomic_compare_exchange_4:
+#ifdef __TINYC__
+        .short 0x419c
+        .int   0x0f50000f
+        .int   0x1405272f
+        .int   0x00f71663
+        .int   0x1cc5282f
+        .int   0xfe081ae3
+        .int   0x40f707bb
+        .int   0x0017b513
+        .short 0xc391
+        .short 0xc198
+        .short 0x8905
+        .short 0x8082
+#else
+	lw      a5,0(a1)
+	fence	iorw,ow;
+.L11:
+	lr.w.aq	a4,0(a0)
+	bne	a4,a5,.L12
+	sc.w.aq	a6,a2,0(a0)
+	bnez	a6,.L11
+.L12:
+	subw    a5,a4,a5
+	seqz    a0,a5
+	beq     a5,zero,.L13
+	sw      a4,0(a1)
+.L13:
+	andi    a0,a0,1
+        ret
+#endif
+        .size   __atomic_compare_exchange_4, .-__atomic_compare_exchange_4
+
+        .global __atomic_compare_exchange_8
+        .type   __atomic_compare_exchange_8, %function
+__atomic_compare_exchange_8:
+#ifdef __TINYC__
+        .short 0x619c
+        .int   0x0f50000f
+        .int   0x1405372f
+        .int   0x00f71563
+        .int   0x1cc536af
+        .short 0xfaf5
+        .int   0x40f707b3
+        .int   0x0017b513
+        .short 0xc391
+        .short 0xe198
+        .short 0x8905
+        .short 0x8082
+#else
+	ld      a5,0(a1)
+	fence	iorw,ow;
+.L14:
+	lr.d.aq	a4,0(a0)
+	bne	a4,a5,.L15
+	sc.d.aq	a3,a2,0(a0)
+	bnez	a3,.L14
+.L15:
+	sub	a5,a4,a5
+	seqz    a0,a5
+	beq     a5,zero,.L16
+	sd      a4,0(a1)
+.L16:
+	andi    a0,a0,1
+        ret
+#endif
+        .size   __atomic_compare_exchange_8, .-__atomic_compare_exchange_8
+
+/* ---------------------------------------------- */
+#endif
diff --git a/lib/stdatomic.c b/lib/stdatomic.c
index eea70855..6f0754ef 100644
--- a/lib/stdatomic.c
+++ b/lib/stdatomic.c
@@ -5,6 +5,7 @@
 #define uint32_t unsigned int
 #define uint64_t unsigned long long
 #define bool _Bool
+#define false 0
 #define true 1
 #define __ATOMIC_RELAXED 0
 #define __ATOMIC_CONSUME 1
@@ -12,8 +13,10 @@
 #define __ATOMIC_RELEASE 3
 #define __ATOMIC_ACQ_REL 4
 #define __ATOMIC_SEQ_CST 5
+typedef __SIZE_TYPE__ size_t;
 
-#define ATOMIC_X86_COMPARE_EXCHANGE(TYPE, MODE, SUFFIX) \
+#if defined __i386__ || defined __x86_64__
+#define ATOMIC_COMPARE_EXCHANGE(TYPE, MODE, SUFFIX) \
     bool __atomic_compare_exchange_##MODE \
         (volatile void *atom, void *ref, TYPE xchg, \
          bool weak, int success_memorder, int failure_memorder) \
@@ -29,96 +32,132 @@
         *(TYPE *)ref = rv; \
         return (rv == cmp); \
     }
+#else
+#define ATOMIC_COMPARE_EXCHANGE(TYPE, MODE, SUFFIX) \
+    extern bool __atomic_compare_exchange_##MODE \
+        (volatile void *atom, void *ref, TYPE xchg, \
+         bool weak, int success_memorder, int failure_memorder);
+#endif
 
-#define ATOMIC_X86_LOAD(TYPE, MODE) \
+#define ATOMIC_LOAD(TYPE, MODE) \
     TYPE __atomic_load_##MODE(const volatile void *atom, int memorder) \
     { \
         return *(volatile TYPE *)atom; \
     }
 
-#define ATOMIC_X86_STORE(TYPE, MODE) \
+#define ATOMIC_STORE(TYPE, MODE) \
     void __atomic_store_##MODE(volatile void *atom, TYPE value, int memorder) \
     { \
         *(volatile TYPE *)atom = value; \
     }
 
-/* Some tcc targets set __GNUC__ */
-#if defined(__GNUC__) && !defined(__TINYC__)
-#define	ATOMIC_LOAD(t,a,b,c)              t b; __atomic_load((t *)a, (t *)&b, c)
-#define COMPARE_EXCHANGE(t,a,b,c,d,e,f)   __atomic_compare_exchange((t *)a,b,&c,d,e,f)
-#else
-#define ATOMIC_LOAD(t,a,b,c)              t b = __atomic_load((t *)a, c)
-#define COMPARE_EXCHANGE(t,a,b,c,d,e,f)   __atomic_compare_exchange((t *)a,b,c,d,e,f)
-#endif
-
-#define ATOMIC_GEN_OP(TYPE, MODE, NAME, OP) \
+#define ATOMIC_GEN_OP(TYPE, MODE, NAME, OP, RET) \
     TYPE __atomic_##NAME##_##MODE(volatile void *atom, TYPE value, int memorder) \
     { \
-        TYPE xchg; \
-        ATOMIC_LOAD(TYPE, atom, cmp, __ATOMIC_RELAXED); \
+        TYPE xchg, cmp; \
+        __atomic_load((TYPE *)atom, (TYPE *)&cmp, __ATOMIC_RELAXED); \
         do { \
             xchg = (OP); \
-        } while (!COMPARE_EXCHANGE(TYPE, atom, &cmp, xchg, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)); \
-        return cmp; \
+        } while (!__atomic_compare_exchange((TYPE *)atom, &cmp, &xchg, true, \
+                                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)); \
+        return RET; \
     }
 
 #define ATOMIC_EXCHANGE(TYPE, MODE) \
-    ATOMIC_GEN_OP(TYPE, MODE, exchange, value)
+    ATOMIC_GEN_OP(TYPE, MODE, exchange, value, cmp)
+#define ATOMIC_ADD_FETCH(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, add_fetch, (cmp + value), xchg)
+#define ATOMIC_SUB_FETCH(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, sub_fetch, (cmp - value), xchg)
+#define ATOMIC_AND_FETCH(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, and_fetch, (cmp & value), xchg)
+#define ATOMIC_OR_FETCH(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, or_fetch, (cmp | value), xchg)
+#define ATOMIC_XOR_FETCH(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, xor_fetch, (cmp ^ value), xchg)
+#define ATOMIC_NAND_FETCH(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, nand_fetch, ~(cmp & value), xchg)
 #define ATOMIC_FETCH_ADD(TYPE, MODE) \
-    ATOMIC_GEN_OP(TYPE, MODE, fetch_add, (cmp + value))
+    ATOMIC_GEN_OP(TYPE, MODE, fetch_add, (cmp + value), cmp)
 #define ATOMIC_FETCH_SUB(TYPE, MODE) \
-    ATOMIC_GEN_OP(TYPE, MODE, fetch_sub, (cmp - value))
+    ATOMIC_GEN_OP(TYPE, MODE, fetch_sub, (cmp - value), cmp)
 #define ATOMIC_FETCH_AND(TYPE, MODE) \
-    ATOMIC_GEN_OP(TYPE, MODE, fetch_and, (cmp & value))
+    ATOMIC_GEN_OP(TYPE, MODE, fetch_and, (cmp & value), cmp)
 #define ATOMIC_FETCH_OR(TYPE, MODE) \
-    ATOMIC_GEN_OP(TYPE, MODE, fetch_or, (cmp | value))
+    ATOMIC_GEN_OP(TYPE, MODE, fetch_or, (cmp | value), cmp)
 #define ATOMIC_FETCH_XOR(TYPE, MODE) \
-    ATOMIC_GEN_OP(TYPE, MODE, fetch_xor, (cmp ^ value))
+    ATOMIC_GEN_OP(TYPE, MODE, fetch_xor, (cmp ^ value), cmp)
+#define ATOMIC_FETCH_NAND(TYPE, MODE) \
+    ATOMIC_GEN_OP(TYPE, MODE, fetch_nand, ~(cmp & value), cmp)
 
-ATOMIC_X86_STORE(uint8_t, 1)
-ATOMIC_X86_STORE(uint16_t, 2)
-ATOMIC_X86_STORE(uint32_t, 4)
+#define ATOMIC_GEN(TYPE, SIZE, SUFFIX) \
+    ATOMIC_STORE(TYPE, SIZE) \
+    ATOMIC_LOAD(TYPE, SIZE) \
+    ATOMIC_COMPARE_EXCHANGE(TYPE, SIZE, SUFFIX) \
+    ATOMIC_EXCHANGE(TYPE, SIZE) \
+    ATOMIC_ADD_FETCH(TYPE, SIZE) \
+    ATOMIC_SUB_FETCH(TYPE, SIZE) \
+    ATOMIC_AND_FETCH(TYPE, SIZE) \
+    ATOMIC_OR_FETCH(TYPE, SIZE) \
+    ATOMIC_XOR_FETCH(TYPE, SIZE) \
+    ATOMIC_NAND_FETCH(TYPE, SIZE) \
+    ATOMIC_FETCH_ADD(TYPE, SIZE) \
+    ATOMIC_FETCH_SUB(TYPE, SIZE) \
+    ATOMIC_FETCH_AND(TYPE, SIZE) \
+    ATOMIC_FETCH_OR(TYPE, SIZE) \
+    ATOMIC_FETCH_XOR(TYPE, SIZE) \
+    ATOMIC_FETCH_NAND(TYPE, SIZE)
 
-ATOMIC_X86_LOAD(uint8_t, 1)
-ATOMIC_X86_LOAD(uint16_t, 2)
-ATOMIC_X86_LOAD(uint32_t, 4)
-
-ATOMIC_X86_COMPARE_EXCHANGE(uint8_t, 1, "b")
-ATOMIC_X86_COMPARE_EXCHANGE(uint16_t, 2, "w")
-ATOMIC_X86_COMPARE_EXCHANGE(uint32_t, 4, "l")
-
-ATOMIC_EXCHANGE(uint8_t, 1)
-ATOMIC_EXCHANGE(uint16_t, 2)
-ATOMIC_EXCHANGE(uint32_t, 4)
-
-ATOMIC_FETCH_ADD(uint8_t, 1)
-ATOMIC_FETCH_ADD(uint16_t, 2)
-ATOMIC_FETCH_ADD(uint32_t, 4)
-
-ATOMIC_FETCH_SUB(uint8_t, 1)
-ATOMIC_FETCH_SUB(uint16_t, 2)
-ATOMIC_FETCH_SUB(uint32_t, 4)
-
-ATOMIC_FETCH_AND(uint8_t, 1)
-ATOMIC_FETCH_AND(uint16_t, 2)
-ATOMIC_FETCH_AND(uint32_t, 4)
-
-ATOMIC_FETCH_OR(uint8_t, 1)
-ATOMIC_FETCH_OR(uint16_t, 2)
-ATOMIC_FETCH_OR(uint32_t, 4)
-
-ATOMIC_FETCH_XOR(uint8_t, 1)
-ATOMIC_FETCH_XOR(uint16_t, 2)
-ATOMIC_FETCH_XOR(uint32_t, 4)
-
-#if defined __x86_64__
-ATOMIC_X86_STORE(uint64_t, 8)
-ATOMIC_X86_LOAD(uint64_t, 8)
-ATOMIC_X86_COMPARE_EXCHANGE(uint64_t, 8, "q")
-ATOMIC_EXCHANGE(uint64_t, 8)
-ATOMIC_FETCH_ADD(uint64_t, 8)
-ATOMIC_FETCH_SUB(uint64_t, 8)
-ATOMIC_FETCH_AND(uint64_t, 8)
-ATOMIC_FETCH_OR(uint64_t, 8)
-ATOMIC_FETCH_XOR(uint64_t, 8)
+ATOMIC_GEN(uint8_t, 1, "b")
+ATOMIC_GEN(uint16_t, 2, "w")
+ATOMIC_GEN(uint32_t, 4, "l")
+#if defined __x86_64__ || defined __aarch64__ || defined __riscv
+ATOMIC_GEN(uint64_t, 8, "q")
 #endif
+
+bool __atomic_test_and_set (volatile void *ptr, int memorder)
+{
+    return __atomic_exchange_1(ptr, 1, memorder);
+}
+
+void __atomic_clear (volatile void *ptr, int memorder)
+{
+    __atomic_store_1(ptr, 0, memorder);
+}
+
+void __atomic_signal_fence (int memorder)
+{
+}
+
+void __atomic_thread_fence (int memorder)
+{
+#if defined __i386__
+        asm volatile("lock orl $0, (%esp)");
+#elif defined __x86_64__
+        asm volatile("lock orq $0, (%rsp)");
+#elif defined __arm__
+        asm volatile(".int 0xee070fba"); // mcr p15, 0, r0, c7, c10, 5
+#elif defined __aarch64__
+        asm volatile(".int 0xd5033bbf"); // dmb ish
+#elif defined __riscv
+        asm volatile(".int 0x0ff0000f"); // fence iorw,iorw
+#endif
+}
+
+bool __atomic_is_lock_free(size_t size, void *ptr)
+{
+    bool ret;
+
+    switch (size) {
+    case 1: ret = true; break;
+    case 2: ret = true; break;
+    case 4: ret = true; break;
+#if defined __x86_64__ || defined __aarch64__ || defined __riscv
+    case 8: ret = true; break;
+#else
+    case 8: ret = false; break;
+#endif
+    default: ret = false; break;
+    }
+    return ret;
+}
diff --git a/tccgen.c b/tccgen.c
index f8ecc4ee..2f5214ec 100644
--- a/tccgen.c
+++ b/tccgen.c
@@ -5171,8 +5171,9 @@ static void parse_builtin_params(int nc, const char *args)
 
 static void parse_atomic(int atok)
 {
-    int size, align, arg;
+    int size, align, arg, t, save = 0;
     CType *atom, *atom_ptr, ct = {0};
+    SValue store;
     char buf[40];
     static const char *const templates[] = {
         /*
@@ -5185,19 +5186,28 @@ static void parse_atomic(int atok)
          * A read-only atomic
          * p pointer to memory
          * v value
+         * l load pointer
+         * s save pointer
          * m memory model
          */
 
         /* keep in order of appearance in tcctok.h: */
-        /* __atomic_store */            "avm.?",
-        /* __atomic_load */             "Am.v",
-        /* __atomic_exchange */         "avm.v",
-        /* __atomic_compare_exchange */ "apvbmm.b",
+        /* __atomic_store */            "alm.?",
+        /* __atomic_load */             "Asm.v",
+        /* __atomic_exchange */         "alsm.v",
+        /* __atomic_compare_exchange */ "aplbmm.b",
         /* __atomic_fetch_add */        "avm.v",
         /* __atomic_fetch_sub */        "avm.v",
         /* __atomic_fetch_or */         "avm.v",
         /* __atomic_fetch_xor */        "avm.v",
-        /* __atomic_fetch_and */        "avm.v"
+        /* __atomic_fetch_and */        "avm.v",
+        /* __atomic_fetch_nand */       "avm.v",
+        /* __atomic_add_fetch */        "avm.v",
+        /* __atomic_sub_fetch */        "avm.v",
+        /* __atomic_or_fetch */         "avm.v",
+        /* __atomic_xor_fetch */        "avm.v",
+        /* __atomic_and_fetch */        "avm.v",
+        /* __atomic_nand_fetch */       "avm.v"
     };
     const char *template = templates[(atok - TOK___atomic_store)];
 
@@ -5235,6 +5245,16 @@ static void parse_atomic(int atok)
         case 'v':
             gen_assign_cast(atom);
             break;
+        case 'l':
+            indir();
+            gen_assign_cast(atom);
+            break;
+        case 's':
+            save = 1;
+            indir();
+            store = *vtop;
+            vpop();
+            break;
         case 'm':
             gen_assign_cast(&int_type);
             break;
@@ -5261,18 +5281,26 @@ static void parse_atomic(int atok)
 
     sprintf(buf, "%s_%d", get_tok_str(atok, 0), size);
     vpush_helper_func(tok_alloc_const(buf));
-    vrott(arg + 1);
-    gfunc_call(arg);
+    vrott(arg - save + 1);
+    gfunc_call(arg - save);
 
     vpush(&ct);
     PUT_R_RET(vtop, ct.t);
-    if (ct.t == VT_BOOL) {
+    t = ct.t & VT_BTYPE;
+    if (t == VT_BYTE || t == VT_SHORT || t == VT_BOOL) {
 #ifdef PROMOTE_RET
-	vtop->r |= BFVAL(VT_MUSTCAST, 1);
+        vtop->r |= BFVAL(VT_MUSTCAST, 1);
 #else
-	vtop->type.t = VT_INT;
+        vtop->type.t = VT_INT;
 #endif
     }
+    gen_cast(&ct);
+    if (save) {
+        vpush(&ct);
+        *vtop = store;
+        vswap();
+        vstore();
+    }
 }
 
 ST_FUNC void unary(void)
@@ -5660,6 +5688,13 @@ ST_FUNC void unary(void)
     case TOK___atomic_fetch_or:
     case TOK___atomic_fetch_xor:
     case TOK___atomic_fetch_and:
+    case TOK___atomic_fetch_nand:
+    case TOK___atomic_add_fetch:
+    case TOK___atomic_sub_fetch:
+    case TOK___atomic_or_fetch:
+    case TOK___atomic_xor_fetch:
+    case TOK___atomic_and_fetch:
+    case TOK___atomic_nand_fetch:
         parse_atomic(tok);
         break;
 
diff --git a/tcctok.h b/tcctok.h
index e3ca7169..52a6e0cc 100644
--- a/tcctok.h
+++ b/tcctok.h
@@ -187,6 +187,13 @@
      DEF_ATOMIC(atomic_fetch_or)
      DEF_ATOMIC(atomic_fetch_xor)
      DEF_ATOMIC(atomic_fetch_and)
+     DEF_ATOMIC(atomic_fetch_nand)
+     DEF_ATOMIC(atomic_add_fetch)
+     DEF_ATOMIC(atomic_sub_fetch)
+     DEF_ATOMIC(atomic_or_fetch)
+     DEF_ATOMIC(atomic_xor_fetch)
+     DEF_ATOMIC(atomic_and_fetch)
+     DEF_ATOMIC(atomic_nand_fetch)
 
 /* pragma */
      DEF(TOK_pack, "pack")
diff --git a/tests/tests2/124_atomic_counter.c b/tests/tests2/124_atomic_counter.c
index a817fc26..67a500a7 100644
--- a/tests/tests2/124_atomic_counter.c
+++ b/tests/tests2/124_atomic_counter.c
@@ -6,7 +6,7 @@
 #include <stdatomic.h>
 
 #define NR_THREADS 16
-#define NR_STEPS   ((uint32_t)UINT16_MAX * 4)
+#define NR_STEPS   ((uint32_t)UINT16_MAX)
 
 #define BUG_ON(COND) \
     do { \
@@ -14,14 +14,34 @@
             abort(); \
     } while (0)
 
+#if defined __x86_64__ || defined __aarch64__ || defined __riscv
+#define HAS_64BITS
+#endif
+
+typedef struct {
+    atomic_flag flag;
+    atomic_uchar uc;
+    atomic_ushort us;
+    atomic_uint ui;
+#ifdef HAS_64BITS
+    atomic_size_t ul;
+#endif
+} counter_type;
+
 static
 void *adder_simple(void *arg)
 {
     size_t step;
-    atomic_size_t *counter = arg;
+    counter_type *counter = arg;
 
-    for (step = 0; step < NR_STEPS; ++step)
-        atomic_fetch_add_explicit(counter, 1, memory_order_relaxed);
+    for (step = 0; step < NR_STEPS; ++step) {
+        atomic_fetch_add_explicit(&counter->uc, 1, memory_order_relaxed);
+        atomic_fetch_add_explicit(&counter->us, 1, memory_order_relaxed);
+        atomic_fetch_add_explicit(&counter->ui, 1, memory_order_relaxed);
+#ifdef HAS_64BITS
+        atomic_fetch_add_explicit(&counter->ul, 1, memory_order_relaxed);
+#endif
+    }
 
     return NULL;
 }
@@ -30,16 +50,60 @@ static
 void *adder_cmpxchg(void *arg)
 {
     size_t step;
-    atomic_size_t *counter = arg;
+    counter_type *counter = arg;
 
     for (step = 0; step < NR_STEPS; ++step) {
-        size_t xchg;
-        size_t cmp = atomic_load_explicit(counter, memory_order_relaxed);
+        unsigned char xchgc;
+        unsigned short xchgs;
+        unsigned int xchgi;
+#ifdef HAS_64BITS
+        size_t xchgl;
+#endif
+        unsigned char cmpc = atomic_load_explicit(&counter->uc, memory_order_relaxed);
+        unsigned short cmps = atomic_load_explicit(&counter->us, memory_order_relaxed);
+        unsigned int cmpi = atomic_load_explicit(&counter->ui, memory_order_relaxed);
+#ifdef HAS_64BITS
+        size_t cmpl = atomic_load_explicit(&counter->ul, memory_order_relaxed);
+#endif
 
         do {
-            xchg = (cmp + 1);
-        } while (!atomic_compare_exchange_strong_explicit(counter,
-            &cmp, xchg, memory_order_relaxed, memory_order_relaxed));
+            xchgc = (cmpc + 1);
+        } while (!atomic_compare_exchange_strong_explicit(&counter->uc,
+            &cmpc, xchgc, memory_order_relaxed, memory_order_relaxed));
+        do {
+            xchgs = (cmps + 1);
+        } while (!atomic_compare_exchange_strong_explicit(&counter->us,
+            &cmps, xchgs, memory_order_relaxed, memory_order_relaxed));
+        do {
+            xchgi = (cmpi + 1);
+        } while (!atomic_compare_exchange_strong_explicit(&counter->ui,
+            &cmpi, xchgi, memory_order_relaxed, memory_order_relaxed));
+#ifdef HAS_64BITS
+        do {
+            xchgl = (cmpl + 1);
+        } while (!atomic_compare_exchange_strong_explicit(&counter->ul,
+            &cmpl, xchgl, memory_order_relaxed, memory_order_relaxed));
+#endif
+    }
+
+    return NULL;
+}
+
+static
+void *adder_test_and_set(void *arg)
+{
+    size_t step;
+    counter_type *counter = arg;
+
+    for (step = 0; step < NR_STEPS; ++step) {
+        while (atomic_flag_test_and_set(&counter->flag));
+        ++counter->uc;
+        ++counter->us;
+        ++counter->ui;
+#ifdef HAS_64BITS
+        ++counter->ul;
+#endif
+        atomic_flag_clear(&counter->flag);
     }
 
     return NULL;
@@ -49,10 +113,16 @@ static
 void atomic_counter_test(void *(*adder)(void *arg))
 {
     size_t index;
-    atomic_size_t counter;
+    counter_type counter;
     pthread_t thread[NR_THREADS];
 
-    atomic_init(&counter, 0);
+    atomic_flag_clear(&counter.flag);
+    atomic_init(&counter.uc, 0);
+    atomic_init(&counter.us, 0);
+    atomic_init(&counter.ui, 0);
+#ifdef HAS_64BITS
+    atomic_init(&counter.ul, 0);
+#endif
 
     for (index = 0; index < NR_THREADS; ++index)
         BUG_ON(pthread_create(&thread[index], NULL, adder, (void *)&counter));
@@ -60,7 +130,13 @@ void atomic_counter_test(void *(*adder)(void *arg))
     for (index = 0; index < NR_THREADS; ++index)
         BUG_ON(pthread_join(thread[index], NULL));
 
-    if (atomic_load(&counter) == (NR_THREADS * NR_STEPS))
+    if (atomic_load(&counter.uc) == ((NR_THREADS * NR_STEPS) & 0xffu)
+        && atomic_load(&counter.us) == ((NR_THREADS * NR_STEPS) & 0xffffu)
+        && atomic_load(&counter.ui) == (NR_THREADS * NR_STEPS)
+#ifdef HAS_64BITS
+        && atomic_load(&counter.ul) == (NR_THREADS * NR_STEPS)
+#endif
+        )
         printf("SUCCESS\n");
     else
         printf("FAILURE\n");
@@ -70,6 +146,7 @@ int main(void)
 {
     atomic_counter_test(adder_simple);
     atomic_counter_test(adder_cmpxchg);
+    atomic_counter_test(adder_test_and_set);
 
     return 0;
 }
diff --git a/tests/tests2/124_atomic_counter.expect b/tests/tests2/124_atomic_counter.expect
index 1db2652c..21667a71 100644
--- a/tests/tests2/124_atomic_counter.expect
+++ b/tests/tests2/124_atomic_counter.expect
@@ -1,2 +1,3 @@
 SUCCESS
 SUCCESS
+SUCCESS
diff --git a/tests/tests2/125_atomic_misc.c b/tests/tests2/125_atomic_misc.c
index 40e7cba8..2d106316 100644
--- a/tests/tests2/125_atomic_misc.c
+++ b/tests/tests2/125_atomic_misc.c
@@ -16,6 +16,14 @@ int main()
     r = atomic_compare_exchange_strong(&a, &b, 99);
     printf("%d %d %d\n", r, a, b);
 
+    atomic_store(&a, b + 5);
+    r = atomic_exchange(&a,  33);
+    printf("%d %d %d\n", r, a, b);
+
+    atomic_store(&a, b + 10);
+    r = atomic_exchange(&a, 66);
+    printf("%d %d %d\n", r, a, b);
+
     return 0;
 }
 
@@ -50,6 +58,117 @@ int main()
     printf("%d %d %d %d\n", p.c[0], p.c[1], p.c[2], p.c[3]);
 }
 
+#elif defined test_atomic_op
+
+#define OP1(func, v, e1, e2) atomic_##func(&c, v) == e1 && c == e2
+#define OP2(func, v, e1, e2) atomic_##func(&s, v) == e1 && s == e2
+#define OP4(func, v, e1, e2) atomic_##func(&i, v) == e1 && i == e2
+#if defined __x86_64__ || defined __aarch64__ || defined __riscv
+#define OP8(func, v, e1, e2) atomic_##func(&l, v) == e1 && l == e2
+#define HAS_64BITS
+#else
+#define OP8(func, v, e1, e2) 1
+#endif
+
+#define OP(func, v, e1, e2) printf ("%s: %s\n", #func,                        \
+                                    OP1(func,v,e1,e2) && OP2(func,v,e1,e2) && \
+                                    OP4(func,v,e1,e2) && OP8(func,v,e1,e2)    \
+                                    ? "SUCCESS" : "FAIL");
+
+int main()
+{
+    atomic_char c;
+    atomic_short s;
+    atomic_int i;
+#ifdef HAS_64BITS
+    atomic_size_t l;
+#endif
+
+    atomic_init(&c, 0);
+    atomic_init(&s, 0);
+    atomic_init(&i, 0);
+#ifdef HAS_64BITS
+    atomic_init(&l, 0);
+#endif
+
+    OP(fetch_add, 10, 0, 10);
+    OP(fetch_sub, 5, 10, 5);
+    OP(fetch_or, 0x10, 5, 21);
+    OP(fetch_xor, 0x20, 21, 53);
+    OP(fetch_and, 0x0f, 53, 5);
+}
+
+#elif defined test_atomic_op2
+
+typedef __SIZE_TYPE__ size64_t;
+
+#define OP1(func, v, e1, e2) \
+    __atomic_##func(&c, v, __ATOMIC_SEQ_CST) == e1 && c == e2
+#define OP2(func, v, e1, e2)\
+    __atomic_##func(&s, v, __ATOMIC_SEQ_CST) == e1 && s == e2
+#define OP4(func, v, e1, e2)\
+    __atomic_##func(&i, v, __ATOMIC_SEQ_CST) == e1 && i == e2
+#if defined __x86_64__ || defined __aarch64__ || defined __riscv
+#define OP8(func, v, e1, e2)\
+    __atomic_##func(&l, v, __ATOMIC_SEQ_CST) == e1 && l == e2
+#define HAS_64BITS
+#else
+#define OP8(func, v, e1, e2) 1
+#endif
+
+#define OP(func, v, e1, e2) printf ("%s: %s\n", #func,                        \
+                                    OP1(func,v,e1,e2) && OP2(func,v,e1,e2) && \
+                                    OP4(func,v,e1,e2) && OP8(func,v,e1,e2)    \
+                                    ? "SUCCESS" : "FAIL");
+
+int main()
+{
+    signed char c;
+    short s;
+    int i;
+#ifdef HAS_64BITS
+    size64_t l;
+#endif
+
+    atomic_init(&c, 0);
+    atomic_init(&s, 0);
+    atomic_init(&i, 0);
+#ifdef HAS_64BITS
+    atomic_init(&l, 0);
+#endif
+
+    OP(fetch_add, 10, 0, 10);
+    OP(fetch_sub, 5, 10, 5);
+    OP(fetch_or, 0x10, 5, 21);
+    OP(fetch_xor, 0x20, 21, 53);
+    OP(fetch_and, 0x0f, 53, 5);
+    OP(fetch_nand, 0x01, 5, -2);
+
+    atomic_init(&c, 0);
+    atomic_init(&s, 0);
+    atomic_init(&i, 0);
+#ifdef HAS_64BITS
+    atomic_init(&l, 0);
+#endif
+
+    OP(add_fetch, 10, 10, 10);
+    OP(sub_fetch, 5, 5, 5);
+    OP(or_fetch, 0x10, 21, 21);
+    OP(xor_fetch, 0x20, 53, 53);
+    OP(and_fetch, 0x0f, 5, 5);
+    OP(nand_fetch, 0x01, -2, -2);
+}
+
+#elif defined test_atomic_thread_signal
+int main()
+{
+    int c;
+
+    atomic_thread_fence(__ATOMIC_SEQ_CST);
+    atomic_signal_fence(__ATOMIC_SEQ_CST);
+    printf ("%d\n", atomic_is_lock_free(&c));
+}
+
 #elif defined test_atomic_error_1
 int main()
 {
diff --git a/tests/tests2/125_atomic_misc.expect b/tests/tests2/125_atomic_misc.expect
index 7aa03eba..61bd29f4 100644
--- a/tests/tests2/125_atomic_misc.expect
+++ b/tests/tests2/125_atomic_misc.expect
@@ -1,6 +1,8 @@
 [test_atomic_compare_exchange]
 1 99 77
 0 80 80
+85 33 80
+90 66 80
 
 [test_atomic_store]
 r = 12, i = 24
@@ -11,23 +13,47 @@ r = 12, i = 24
 [test_atomic_store_struct]
 1 2 3 4
 
+[test_atomic_op]
+fetch_add: SUCCESS
+fetch_sub: SUCCESS
+fetch_or: SUCCESS
+fetch_xor: SUCCESS
+fetch_and: SUCCESS
+
+[test_atomic_op2]
+fetch_add: SUCCESS
+fetch_sub: SUCCESS
+fetch_or: SUCCESS
+fetch_xor: SUCCESS
+fetch_and: SUCCESS
+fetch_nand: SUCCESS
+add_fetch: SUCCESS
+sub_fetch: SUCCESS
+or_fetch: SUCCESS
+xor_fetch: SUCCESS
+and_fetch: SUCCESS
+nand_fetch: SUCCESS
+
+[test_atomic_thread_signal]
+1
+
 [test_atomic_error_1]
-125_atomic_misc.c:57: error: pointer expected
+125_atomic_misc.c:176: error: pointer expected
 
 [test_atomic_error_2]
-125_atomic_misc.c:64: error: integral or integer-sized pointer target type expected
+125_atomic_misc.c:183: error: integral or integer-sized pointer target type expected
 
 [test_atomic_error_3]
-125_atomic_misc.c:71: error: integral or integer-sized pointer target type expected
+125_atomic_misc.c:190: error: integral or integer-sized pointer target type expected
 
 [test_atomic_error_4]
-125_atomic_misc.c:79: error: pointer target type mismatch in argument 2
+125_atomic_misc.c:198: error: pointer target type mismatch in argument 2
 
 [test_atomic_warn_1]
-125_atomic_misc.c:87: warning: assignment makes integer from pointer without a cast
+125_atomic_misc.c:206: warning: assignment makes integer from pointer without a cast
 
 [test_atomic_warn_2]
-125_atomic_misc.c:97: warning: assignment from incompatible pointer type
+125_atomic_misc.c:216: warning: assignment from incompatible pointer type
 
 [test_atomic_warn_3]
-125_atomic_misc.c:105: warning: assignment of read-only location
+125_atomic_misc.c:224: warning: assignment of read-only location
diff --git a/tests/tests2/Makefile b/tests/tests2/Makefile
index c0e10c07..4347093a 100644
--- a/tests/tests2/Makefile
+++ b/tests/tests2/Makefile
@@ -23,8 +23,6 @@ ifeq (,$(filter i386,$(ARCH)))
 endif
 ifeq (,$(filter i386 x86_64,$(ARCH)))
  SKIP += 85_asm-outside-function.test # x86 asm
- SKIP += 124_atomic_counter.test
- SKIP += 125_atomic_misc.test # currently only x86 supported
  SKIP += 127_asm_goto.test    # hardcodes x86 asm
 endif
 ifeq ($(CONFIG_backtrace),no)