* Re: stdatomic library implementation, code
2015-07-27 7:51 stdatomic library implementation Jens Gustedt
@ 2015-07-27 7:53 ` Jens Gustedt
2015-07-27 10:36 ` Joakim Sindholt
2015-07-27 8:08 ` stdatomic library implementation, documentation Jens Gustedt
2015-08-03 11:47 ` stdatomic library, v2 Jens Gustedt
2 siblings, 1 reply; 6+ messages in thread
From: Jens Gustedt @ 2015-07-27 7:53 UTC (permalink / raw)
To: musl
[-- Attachment #1.1: Type: text/plain, Size: 532 bytes --]
Am Montag, den 27.07.2015, 09:51 +0200 schrieb Jens Gustedt:
> Hello,
> in two follow up mails I will send you the first version of all of
> this as a patch and a documentation of it as .org and .pdf files.
the patch
--
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536 ::
:: :::::::::::::::::::::: gsm France : +33 651400183 ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::
[-- Attachment #1.2: stdatomic-patch-v1.txt --]
[-- Type: text/plain, Size: 73188 bytes --]
diff --git a/src/internal/atomic_clang_c11.h b/src/internal/atomic_clang_c11.h
new file mode 100644
index 0000000..bb3629d
--- /dev/null
+++ b/src/internal/atomic_clang_c11.h
@@ -0,0 +1,82 @@
+#ifndef _STDATOMIC_CLANG_C11_H_
+#define _STDATOMIC_CLANG_C11_H_ 1
+
+#include <atomic_stub.h>
+
+#define ATOMIC_VAR_INIT(...) __VA_ARGS__
+#define atomic_init __c11_atomic_init
+
+/* Map operations to the special builtins that clang provides. */
+
+/* Map all non-explicit macros to the builtin with forced memory order. */
+#define atomic_fetch_add(X, Y) __c11_atomic_fetch_add((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_sub(X, Y) __c11_atomic_fetch_sub((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_and(X, Y) __c11_atomic_fetch_and((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_or(X, Y) __c11_atomic_fetch_or((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_xor(X, Y) __c11_atomic_fetch_xor((X), (Y), memory_order_seq_cst)
+#define atomic_load(X) __c11_atomic_load((X), memory_order_seq_cst)
+#define atomic_store(X, V) __c11_atomic_store((X), (V), memory_order_seq_cst)
+#define atomic_exchange(X, V) __c11_atomic_exchange((X), (V), memory_order_seq_cst)
+#define atomic_compare_exchange_weak(X, E, V) __c11_atomic_compare_exchange_weak((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_strong(X, E, V) __c11_atomic_compare_exchange_strong((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+
+/* Map all explicit macros to the corresponding builtin. */
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+#define atomic_load_explicit __c11_atomic_load
+#define atomic_store_explicit __c11_atomic_store
+#define atomic_exchange_explicit __c11_atomic_exchange
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define INSTANTIATE_STUB_LF(N, T) \
+T __impl_fetch_add_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_add(_X, _V, _mo); \
+} \
+T __impl_fetch_sub_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_sub(_X, _V, _mo); \
+} \
+T __impl_fetch_and_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_and(_X, _V, _mo); \
+} \
+T __impl_fetch_xor_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_xor(_X, _V, _mo); \
+} \
+T __impl_fetch_or_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_or(_X, _V, _mo); \
+} \
+T __impl_add_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_add(_X, _V, _mo) + _V; \
+} \
+T __impl_sub_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_sub(_X, _V, _mo) - _V; \
+} \
+T __impl_and_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_and(_X, _V, _mo) & _V; \
+} \
+T __impl_xor_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_xor(_X, _V, _mo) ^ _V; \
+} \
+T __impl_or_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_or(_X, _V, _mo) | _V; \
+} \
+T __impl_load_ ## N(_Atomic(T)* _X, int _mo) { \
+ return __c11_atomic_load(_X, _mo); \
+} \
+void __impl_store_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ __c11_atomic_store(_X, _V, _mo); \
+} \
+T __impl_exchange_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_exchange(_X, _V, _mo); \
+} \
+_Bool __impl_compare_exchange_ ## N(_Atomic(T)* _X, T* _E, T const _V, int _mos, int _mof) { \
+ return __c11_atomic_compare_exchange_strong(_X, _E, _V, _mos, _mof); \
+} \
+ INSTANTIATE_STUB_NAND(N, T)
+
+#define INSTANTIATE_STUB(N, T) INSTANTIATE_STUB_ ## N(T)
+
+#endif
diff --git a/src/internal/atomic_constants.h b/src/internal/atomic_constants.h
new file mode 100644
index 0000000..327241b
--- /dev/null
+++ b/src/internal/atomic_constants.h
@@ -0,0 +1,162 @@
+#ifndef _STDATOMIC_ATOMIC_CONSTANTS_H_
+#define _STDATOMIC_ATOMIC_CONSTANTS_H_ 1
+
+#include <stdint.h>
+
+#if !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1) || __GNUC__ < 4
+# error "this implementation of stdatomic need support that is compatible with the gcc ABI"
+#endif
+
+/* gcc 4.7 and 4.8 implement atomic operations but not atomic
+ types. This test is meant to stay simple, we don't know of any
+ other compiler that fakes to be gcc 4.[78] or 4.[78].x */
+#if !defined(__ATOMIC_RELAXED) && __GNUC__ == 4 && __GNUC_MINOR__ < 7
+#undef __ATOMIC_FORCE_SYNC
+#define __ATOMIC_FORCE_SYNC 1
+#endif
+
+#ifdef __SIZEOF_INT128__
+# define __UINT128__ 1
+typedef __uint128_t __impl_uint128_t;
+#else
+# define __UINT128__ 0
+typedef struct { uint64_t a[2]; } __impl_uint128_t;
+#endif
+
+#define __atomic_align(T) \
+(sizeof(T) == 1 ? __alignof__(uint8_t) \
+ : (sizeof(T) == 2 ? __alignof__(uint16_t) \
+ : (sizeof(T) == 4 ? __alignof__(uint32_t) \
+ : ((sizeof(T) == 8) ? __alignof__(uint64_t) \
+ : ((sizeof(T) == 16) ? __alignof__(__impl_uint128_t) \
+ : __alignof__(T))))))
+
+#if __ATOMIC_FORCE_SYNC
+/* There is no compiler support for _Atomic type qualification, so we
+ use the type specifier variant. The idea is to use a one element
+ array to ensure that such an _Atomic(something) can never be used
+ in operators.
+
+ Underneath we will use uintXX_t for special cases. To be sure that
+ no bad things can happen, then, we ensure that the alignment for
+ these special cases is as wide as possible, namely sizeof the
+ type. */
+#define _Atomic(T) __typeof__(T volatile[1])
+#define _Atomic_aligned(T) __attribute__ ((__aligned__(__atomic_align(T)))) __typeof__(T[1])
+#endif
+
+#ifndef __ATOMIC_RELAXED
+#define __ATOMIC_RELAXED 0
+#endif
+#ifndef __ATOMIC_CONSUME
+#define __ATOMIC_CONSUME 1
+#endif
+#ifndef __ATOMIC_ACQUIRE
+#define __ATOMIC_ACQUIRE 2
+#endif
+#ifndef __ATOMIC_RELEASE
+#define __ATOMIC_RELEASE 3
+#endif
+#ifndef __ATOMIC_ACQ_REL
+#define __ATOMIC_ACQ_REL 4
+#endif
+#ifndef __ATOMIC_SEQ_CST
+#define __ATOMIC_SEQ_CST 5
+#endif
+
+enum memory_order {
+ memory_order_relaxed = __ATOMIC_RELAXED,
+ memory_order_consume = __ATOMIC_CONSUME,
+ memory_order_acquire = __ATOMIC_ACQUIRE,
+ memory_order_release = __ATOMIC_RELEASE,
+ memory_order_acq_rel = __ATOMIC_ACQ_REL,
+ memory_order_seq_cst = __ATOMIC_SEQ_CST,
+};
+typedef enum memory_order memory_order;
+
+#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE
+#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+#define __GCC_ATOMIC_SHORT_T_LOCK_FREE 2
+# if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16)
+# define __GCC_ATOMIC_INT_T_LOCK_FREE 2
+# define __GCC_ATOMIC_LONG_T_LOCK_FREE 2
+# define __GCC_ATOMIC_LLONG_T_LOCK_FREE 2
+# define __GCC_ATOMIC_POINTER_T_LOCK_FREE 2
+# define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+# define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+# define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+# elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)
+# define __GCC_ATOMIC_INT_T_LOCK_FREE ((UINT_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LONG_T_LOCK_FREE ((ULONG_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LLONG_T_LOCK_FREE ((ULLONG_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_POINTER_T_LOCK_FREE ((UINTPTR_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR16_T_LOCK_FREE ((UINT_LEAST16_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR32_T_LOCK_FREE ((UINT_LEAST32_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_WCHAR_T_LOCK_FREE ((WCHAR_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+# define __GCC_ATOMIC_INT_T_LOCK_FREE ((UINT_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LONG_T_LOCK_FREE ((ULONG_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LLONG_T_LOCK_FREE ((ULLONG_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_POINTER_T_LOCK_FREE ((UINTPTR_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR16_T_LOCK_FREE ((UINT_LEAST16_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR32_T_LOCK_FREE ((UINT_LEAST32_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_WCHAR_T_LOCK_FREE ((WCHAR_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# endif
+#endif
+
+
+#define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE
+#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE
+#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE
+#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE
+
+#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE
+
+#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+# define ATOMIC_UINT8_LOCK_FREE 2
+#else
+# define ATOMIC_UINT8_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
+# define ATOMIC_UINT16_LOCK_FREE 2
+#else
+# define ATOMIC_UINT16_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+# define ATOMIC_UINT32_LOCK_FREE 2
+#else
+# define ATOMIC_UINT32_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+# define ATOMIC_UINT64_LOCK_FREE 2
+#else
+# define ATOMIC_UINT64_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
+# define ATOMIC_UINT128_LOCK_FREE 2
+#else
+# define ATOMIC_UINT128_LOCK_FREE 0
+#endif
+
+
+#define atomic_is_lock_free(O) \
+(sizeof*(O) == 1 ? ATOMIC_UINT8_LOCK_FREE \
+ : (sizeof*(O) == 2 ? ATOMIC_UINT16_LOCK_FREE \
+ : (sizeof*(O) == 4 ? ATOMIC_UINT32_LOCK_FREE \
+ : ((sizeof*(O) == 8) ? ATOMIC_UINT64_LOCK_FREE \
+ : ((sizeof*(O) == 16) ? ATOMIC_UINT128_LOCK_FREE \
+ : 0)))))
+
+
+#endif
diff --git a/src/internal/atomic_fence.h b/src/internal/atomic_fence.h
new file mode 100644
index 0000000..162507a
--- /dev/null
+++ b/src/internal/atomic_fence.h
@@ -0,0 +1,29 @@
+#ifndef _STDATOMIC_ATOMIC_FENCE_H_
+#define _STDATOMIC_ATOMIC_FENCE_H_ 1
+
+#include <atomic_constants.h>
+
+
+void atomic_thread_fence(memory_order mo);
+
+void atomic_signal_fence(memory_order mo);
+
+#define kill_dependency(X) \
+({ \
+ register __typeof__(X) kill_dependency = (X); \
+ kill_dependency; \
+ })
+
+#ifndef __ATOMIC_FORCE_SYNC
+# define atomic_thread_fence(MO) __atomic_thread_fence(MO)
+# define atomic_signal_fence(MO) __atomic_signal_fence(MO)
+#else
+# define atomic_thread_fence(MO) \
+({ \
+ if (MO != memory_order_relaxed) __sync_synchronize(); \
+ else __asm__ volatile("# relaxed fence"); \
+ })
+#define atomic_signal_fence(MO) __asm__ volatile("# signal fence")
+#endif
+
+#endif
diff --git a/src/internal/atomic_flag.h b/src/internal/atomic_flag.h
new file mode 100644
index 0000000..0d3344b
--- /dev/null
+++ b/src/internal/atomic_flag.h
@@ -0,0 +1,47 @@
+#ifndef _STDATOMIC_ATOMIC_FLAG_H_
+#define _STDATOMIC_ATOMIC_FLAG_H_ 1
+
+#include <atomic_constants.h>
+
+#ifndef __GCC_ATOMIC_TEST_AND_SET_TRUEVAL
+# define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+#endif
+
+typedef struct atomic_flag atomic_flag;
+struct atomic_flag {
+ _Bool f;
+};
+
+_Bool atomic_flag_test_and_set(volatile atomic_flag*);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag*, memory_order);
+void atomic_flag_clear(volatile atomic_flag*);
+void atomic_flag_clear_explicit(volatile atomic_flag*, memory_order);
+
+#define ATOMIC_FLAG_INIT { .f = 0, }
+
+#define atomic_flag_test_and_set(A) atomic_flag_test_and_set_explicit((A), memory_order_seq_cst)
+#define atomic_flag_clear(A) atomic_flag_clear_explicit((A), memory_order_seq_cst)
+
+#ifndef __ATOMIC_FORCE_SYNC
+# define atomic_flag_test_and_set_explicit(A, MO) (__atomic_test_and_set(&((A)->f), MO) == __GCC_ATOMIC_TEST_AND_SET_TRUEVAL)
+# define atomic_flag_clear_explicit(A, MO) __atomic_clear(&(A)->f, MO)
+#else
+# define atomic_flag_test_and_set_explicit(A, O) \
+({ \
+ register _Bool atomic_flag_test_and_set_explicit \
+ = (__sync_lock_test_and_set(&(A)->f, __GCC_ATOMIC_TEST_AND_SET_TRUEVAL) == __GCC_ATOMIC_TEST_AND_SET_TRUEVAL); \
+ /* gcc guarantees that this was an acquire operation. */ \
+ /* synchronize even stronger if we need to */ \
+ if ((O) == memory_order_seq_cst) __sync_synchronize(); \
+ atomic_flag_test_and_set_explicit; \
+ })
+# define atomic_flag_clear_explicit(A, O) \
+({ \
+ /* gcc guarantees that this will be a release operation. */ \
+ /* synchronize even stronger if we need to */ \
+ if ((O) == memory_order_seq_cst) __sync_synchronize(); \
+ __sync_lock_release(&(A)->f); \
+ })
+#endif
+
+#endif
diff --git a/src/internal/atomic_gcc_atomic.h b/src/internal/atomic_gcc_atomic.h
new file mode 100644
index 0000000..e030b85
--- /dev/null
+++ b/src/internal/atomic_gcc_atomic.h
@@ -0,0 +1,107 @@
+#ifndef _STDATOMIC_GCC_ATOMIC_H_
+#define _STDATOMIC_GCC_ATOMIC_H_ 1
+
+#include <atomic_stub.h>
+
+#define ATOMIC_VAR_INIT(...) __VA_ARGS__
+#define atomic_init(X, V) ((void)((*(X))=(V)))
+
+/* Map all non-explicit macros to the explicit version. */
+#define atomic_fetch_add(X, Y) atomic_fetch_add_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_sub(X, Y) atomic_fetch_sub_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_and(X, Y) atomic_fetch_and_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_or(X, Y) atomic_fetch_or_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_xor(X, Y) atomic_fetch_xor_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_load(X) atomic_load_explicit((X), memory_order_seq_cst)
+#define atomic_store(X, V) atomic_store_explicit((X), (V), memory_order_seq_cst)
+#define atomic_exchange(X, V) atomic_exchange_explicit((X), (V), memory_order_seq_cst)
+#define atomic_compare_exchange_weak(X, E, V) atomic_compare_exchange_weak_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_strong(X, E, V) atomic_compare_exchange_strong_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+
+/* Map all explicit macros to the corresponding builtin. */
+/* The arithmetic operations don't have to use a memory operand. */
+#define atomic_fetch_add_explicit(X, Y, MO) __atomic_fetch_add((X), (Y), (MO))
+#define atomic_fetch_sub_explicit(X, Y, MO) __atomic_fetch_sub((X), (Y), (MO))
+#define atomic_fetch_and_explicit(X, Y, MO) __atomic_fetch_and((X), (Y), (MO))
+#define atomic_fetch_or_explicit(X, Y, MO) __atomic_fetch_or((X), (Y), (MO))
+#define atomic_fetch_xor_explicit(X, Y, MO) __atomic_fetch_xor((X), (Y), (MO))
+
+/* The interfaces for the universal functions need to operate on
+ memory operands, only. */
+
+#define atomic_load_explicit(X, MO) \
+({ \
+ __atyp(*X) _r; \
+ __atomic_load((X), _r, (MO)); \
+ __aret(_r[0]); \
+ })
+
+#define atomic_store_explicit(X, V, MO) \
+ __atomic_store((X), &__atmp(*X, V), (MO))
+
+#define atomic_exchange_explicit(X, V, MO) \
+({ \
+ __atyp(*X) _r; \
+ __atomic_exchange((X), &__atmp(*X, V), _r, (MO)); \
+ __aret(_r[0]); \
+ })
+
+#define atomic_compare_exchange_weak_explicit(X, E, V, MOS, MOF) \
+ __atomic_compare_exchange((X), (E), &__atmp(*(X), (V)), 1, (MOS), (MOF))
+
+#define atomic_compare_exchange_strong_explicit(X, E, V, MOS, MOF) \
+ __atomic_compare_exchange((X), (E), &__atmp(*(X), (V)), 0, (MOS), (MOF))
+
+
+#define INSTANTIATE_STUB_LF(N, T) \
+T __impl_fetch_add_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_add(X, V, M); \
+} \
+T __impl_fetch_sub_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_sub(X, V, M); \
+} \
+T __impl_fetch_and_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_and(X, V, M); \
+} \
+T __impl_fetch_xor_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_xor(X, V, M); \
+} \
+T __impl_fetch_nand_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_nand(X, V, M); \
+} \
+T __impl_fetch_or_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_or(X, V, M); \
+} \
+T __impl_add_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_add_fetch(X, V, M); \
+} \
+T __impl_sub_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_sub_fetch(X, V, M); \
+} \
+T __impl_and_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_and_fetch(X, V, M); \
+} \
+T __impl_xor_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_xor_fetch(X, V, M); \
+} \
+T __impl_nand_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_nand_fetch(X, V, M); \
+} \
+T __impl_or_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_or_fetch(X, V, M); \
+} \
+T __impl_load_ ## N(_Atomic(T)* X, int M) { \
+ return __atomic_load_n(X, M); \
+} \
+void __impl_store_ ## N(_Atomic(T)* X, T const V, int M) { \
+ __atomic_store_n(X, V, M); \
+} \
+T __impl_exchange_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_exchange_n(X, V, M); \
+} \
+_Bool __impl_compare_exchange_ ## N(_Atomic(T)* X, T* E, T const V, int MS, int MF) { \
+ return __atomic_compare_exchange_n(X, E, V, 0, MS, MF); \
+}
+
+
+#endif
diff --git a/src/internal/atomic_gcc_sync.h b/src/internal/atomic_gcc_sync.h
new file mode 100644
index 0000000..2d42a57
--- /dev/null
+++ b/src/internal/atomic_gcc_sync.h
@@ -0,0 +1,250 @@
+#ifndef _STDATOMIC_GCC_SYNC_H_
+#define _STDATOMIC_GCC_SYNC_H_ 1
+
+#define ATOMIC_VAR_INIT(...) { [0] = __VA_ARGS__, }
+#define atomic_init(X, V) ((void)((*(X))[0]=(V)))
+
+/* Map all non-explicit macros to the explicit version. */
+#define atomic_fetch_add(X, Y) atomic_fetch_add_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_sub(X, Y) atomic_fetch_sub_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_and(X, Y) atomic_fetch_and_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_or(X, Y) atomic_fetch_or_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_xor(X, Y) atomic_fetch_xor_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_load(X) atomic_load_explicit((X), memory_order_seq_cst)
+#define atomic_store(X, V) atomic_store_explicit((X), (V), memory_order_seq_cst)
+#define atomic_exchange(X, V) atomic_exchange_explicit((X), (V), memory_order_seq_cst)
+#define atomic_compare_exchange_weak(X, E, V) atomic_compare_exchange_strong_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_strong(X, E, V) atomic_compare_exchange_strong_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+
+/* The argument X is supposed to be pointer to a one element array of
+   the base type. In evaluation context ``*(X)'' decays to a pointer
+   to the base type. In __typeof__ context we have to use
+   ``&(*(X))[0]'' for that. */
+#define atomic_fetch_add_explicit(X, Y, MO) __sync_fetch_and_add(*(X), (Y))
+#define atomic_fetch_sub_explicit(X, Y, MO) __sync_fetch_and_sub(*(X), (Y))
+#define atomic_fetch_and_explicit(X, Y, MO) __sync_fetch_and_and(*(X), (Y))
+#define atomic_fetch_or_explicit(X, Y, MO) __sync_fetch_and_or(*(X), (Y))
+#define atomic_fetch_xor_explicit(X, Y, MO) __sync_fetch_and_xor(*(X), (Y))
+
+#define atomic_compare_exchange_weak_explicit(X, E, D, MOS, MOF) atomic_compare_exchange_strong_explicit((X), (E), (D), (MOS), (MOF))
+
+#define INSTANTIATE_STUB_LF(N, T) \
+T __impl_fetch_add_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_add(&((*X)[0]), _V); \
+} \
+T __impl_fetch_sub_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_sub(&((*X)[0]), _V); \
+} \
+T __impl_fetch_and_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_and(&((*X)[0]), _V); \
+} \
+T __impl_fetch_xor_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_xor(&((*X)[0]), _V); \
+} \
+T __impl_fetch_or_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_or(&((*X)[0]), _V); \
+} \
+T __impl_add_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_add_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_sub_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_sub_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_and_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_and_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_xor_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_xor_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_or_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_or_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_load_ ## N(__typeof__(T volatile[1])* X, int _mo) { \
+ return __sync_val_compare_and_swap(&((*X)[0]), 0, 0); \
+} \
+T __impl_exchange_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ T _r = _V, _e; \
+ do { \
+ _e = _r; \
+ _r = __sync_val_compare_and_swap(&((*X)[0]), _e, _V); \
+ } while (_r != _e); \
+ return _r; \
+} \
+void __impl_store_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ (void)__impl_exchange_ ## N(X, _V, _mo); \
+} \
+_Bool __impl_compare_exchange_ ## N(__typeof__(T volatile[1])* X, T* _E, T const _D, int _mos, int _mof) { \
+ T _v = *_E; \
+ T _n = __sync_val_compare_and_swap(&((*X)[0]), _v, _D); \
+ if (_v != _n) { \
+ *_E = _n; \
+ return 0; \
+ } \
+ return 1; \
+} \
+ INSTANTIATE_STUB_NAND(N, T)
+
+
+#define atomic_compare_exchange_strong_explicit(X, E, D, MOS, MOF) \
+({ \
+ _Bool ret; \
+ __typeof__((*X)[0])* _e = (E); \
+ __typeof__((*X)[0]) const _d = (D); \
+ switch (sizeof _d) { \
+ case 8: ret = __sync_bool_compare_and_swap((uint64_t*)(X), *_e, (D)); break; \
+ case 4: ret = __sync_bool_compare_and_swap((uint32_t*)(X), *_e, (D)); break; \
+ case 2: ret = __sync_bool_compare_and_swap((uint16_t*)(X), *_e, (D)); break; \
+ case 1: ret = __sync_bool_compare_and_swap((uint8_t*)(X), *_e, (D)); break; \
+ default: ret = __impl_compare_exchange(sizeof (*X), (void*)(X), _e, &_d, MOS, MOF); \
+ } \
+ __aret(ret); \
+ })
+
+#define __impl_union(T, X) union { __typeof__(*(X)) x; T t; }
+#define __impl_union2T(T, X) (((__impl_union(T, X)){ .x = (*(X)), }).t)
+#define __impl_union2X(T, X, V) (((__impl_union(T, X)){ .t = (V), }).x)
+
+#define __impl_load_union(T, X) \
+__impl_union2X(T, X, __sync_val_compare_and_swap((T*)X, 0, 0))
+
+#define __impl_exchange_union(T, X, V) \
+({ \
+ __impl_union(T, X) _V = { .t = (V), }; \
+ T _r = _V.t, _e; \
+ do { \
+ _e = _r; \
+ _r = __sync_val_compare_and_swap((T*)X, _e, _V.t); \
+ } while (_r != _e); \
+ __impl_union2X(T, X, _r); \
+ })
+
+#define __impl_store_union(T, X, V) \
+({ \
+ __impl_union(T, X) _V = { .t = (V), }; \
+ T _r = _V.t, _e; \
+ do { \
+ _e = _r; \
+ _r = __sync_val_compare_and_swap((T*)X, _e, _V.t); \
+ } while (_r != _e); \
+ })
+
+#define __impl_compare_exchange_union(T, X, E, V) \
+({ \
+ __typeof__(*E)* _e = (E); \
+ __impl_union(T, X) _V = { .x = (V), }; \
+ __impl_union(T, X) _E = { .x = *_e, }; \
+ __impl_union(T, X) _R = { .t = __sync_val_compare_and_swap((T*)X, _E.t, _V.t), }; \
+ _Bool _r = (_E.t == _R.t); \
+ if (!_r) *_e = _R.x; \
+ _r; \
+ })
+
+#define atomic_load_explicit(X, MO) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_load_union(__impl_uint128_t, &((*X)[0])), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_load_union(uint64_t, &((*X)[0])), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_load_union(uint32_t, &((*X)[0])), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_load_union(uint16_t, &((*X)[0])), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_load_union(uint8_t, &((*X)[0])), \
+ ({ \
+ __typeof__((*X)[0]) _r; \
+ __impl_load(sizeof _r, (void*)(&((*X)[0])), &_r, MO); \
+ _r; \
+ }))))))
+
+#define atomic_store_explicit(X, V, MO) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_store_union(__impl_uint128_t, &((*X)[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_store_union(uint64_t, &((*X)[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_store_union(uint32_t, &((*X)[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_store_union(uint16_t, &((*X)[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_store_union(uint8_t, &((*X)[0]), (V)), \
+ ({ \
+ __typeof__((*X)[0]) const _v = (V); \
+ __impl_store(sizeof _v, &((*X)[0]), &_v, MO); \
+ }))))))
+
+#define atomic_exchange_explicit(X, V, MO) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_exchange_union(__impl_uint128_t, &((*(X))[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_exchange_union(uint64_t, &((*(X))[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_exchange_union(uint32_t, &((*(X))[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_exchange_union(uint16_t, &((*(X))[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_exchange_union(uint8_t, &((*(X))[0]), (V)), \
+ ({ \
+ __typeof__((*X)[0]) const _v = (V); \
+ __typeof__((*X)[0]) _r = (V); \
+ __impl_exchange(sizeof _r, (&((*X)[0])), &_r, &_v, MO); \
+ _r; \
+ }))))))
+
+#define atomic_compare_exchange_explicit(X, E, V, MOS, MOF) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_compare_exchange_union(__impl_uint128_t, &((*(X))[0]), (E), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_compare_exchange_union(uint64_t, &((*(X))[0]), (E), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_compare_exchange_union(uint32_t, &((*(X))[0]), (E), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_compare_exchange_union(uint16_t, &((*(X))[0]), (E), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_compare_exchange_union(uint8_t, &((*(X))[0]), (E), (V)), \
+ ({ \
+ __typeof__((*X)[0])* _e = (E); \
+ __typeof__((*X)[0]) const _v = (V); \
+ __impl_compare_exchange(sizeof _v, (&((*X)[0])), _e, &_v, MOS, MOF); \
+ }))))))
+
+#endif
diff --git a/src/internal/atomic_generic.h b/src/internal/atomic_generic.h
new file mode 100644
index 0000000..6ec6bb8
--- /dev/null
+++ b/src/internal/atomic_generic.h
@@ -0,0 +1,10 @@
+#ifndef _STDATOMIC_ATOMIC_GENERIC_H_
+#define _STDATOMIC_ATOMIC_GENERIC_H_ 1
+#include <stddef.h>
+void __impl_load (size_t size, void volatile* ptr, void volatile* ret, int mo);
+void __impl_store (size_t size, void volatile* ptr, void const volatile* val, int mo);
+void __impl_exchange (size_t size, void volatile*__restrict__ ptr, void const volatile* val, void volatile* ret, int mo);
+_Bool __impl_compare_exchange (size_t size, void volatile* ptr, void volatile* expected, void const volatile* desired, int mos, int mof);
+void __impl_print_stat(void);
+
+#endif
diff --git a/src/internal/atomic_stub.h b/src/internal/atomic_stub.h
new file mode 100644
index 0000000..c5e3329
--- /dev/null
+++ b/src/internal/atomic_stub.h
@@ -0,0 +1,208 @@
+#ifndef _STDATOMIC_STUB_H_
+#define _STDATOMIC_STUB_H_ 1
+
+#include <atomic_generic.h>
+
+#define INSTANTIATE_STUB_LCA(N, T) \
+T __impl_fetch_add_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E + V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_sub_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = -V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E - V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_and_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = 0; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E & V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_xor_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E ^ V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_or_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = MO == memory_order_relaxed ? memory_order_relaxed : memory_order_consume; \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E | V; \
+ } \
+ return E; \
+} \
+T __impl_add_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E + V; \
+ } \
+ return R; \
+} \
+T __impl_sub_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = -V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E - V; \
+ } \
+ return R; \
+} \
+T __impl_and_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = 0; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E & V; \
+ } \
+ return R; \
+} \
+T __impl_xor_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E ^ V; \
+ } \
+ return R; \
+} \
+T __impl_or_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = MO == memory_order_relaxed ? memory_order_relaxed : memory_order_consume; \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E | V; \
+ } \
+ return R; \
+}
+
+#define INSTANTIATE_STUB_NAND(N, T) \
+T __impl_fetch_nand_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = ~0; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!atomic_compare_exchange_strong_explicit((_Atomic(T)*)X, &E, R, MO, mof)){ \
+ R = ~(E & V); \
+ } \
+ return E; \
+} \
+T __impl_nand_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = ~E; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!atomic_compare_exchange_strong_explicit((_Atomic(T)*)X, &E, R, MO, mof)){ \
+ R = ~(E & V); \
+ } \
+ return R; \
+}
+
+
+#define INSTANTIATE_STUB_LCM(N, T) \
+T __impl_load_ ## N(void volatile* X, int MO) { \
+ T ret; \
+ __impl_load(N, X, &ret, MO); \
+ return ret; \
+} \
+void __impl_store_ ## N(void volatile* X, T const V, int MO) { \
+ __impl_store(N, X, &V, MO); \
+} \
+T __impl_exchange_ ## N(void volatile* X, T const V, int MO) { \
+ T ret; \
+ __impl_exchange(N, X, &V, &ret, MO); \
+ return ret; \
+} \
+_Bool __impl_compare_exchange_ ## N(void volatile* X, T* E, T const V, int MOS, int MOf) { \
+ return __impl_compare_exchange(N, X, E, &V, MOS, MOf); \
+} \
+ INSTANTIATE_STUB_NAND(N, T)
+
+#define INSTANTIATE_STUB_LC(N, T) INSTANTIATE_STUB_LCA(N, T) INSTANTIATE_STUB_LCM(N, T)
+
+
+#define INSTANTIATE_SYNCA(N, T) \
+T __impl_fetch_and_add_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_add_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_sub_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_sub_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_and_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_and_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_or_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_or_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_xor_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_xor_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_add_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_add_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_sub_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_sub_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_and_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_and_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_or_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_or_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_xor_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_xor_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+}
+
+#define INSTANTIATE_SYNCM(N, T) \
+_Bool __impl_bool_compare_and_swap_ ## N(void volatile* X, T E, T const V) { \
+ T R = E; \
+ return __impl_compare_exchange_ ## N((_Atomic(T)*)X, &R, V, \
+ memory_order_seq_cst, memory_order_seq_cst); \
+} \
+T __impl_val_compare_and_swap_ ## N(void volatile* X, T E, T const V) { \
+ T R = E; \
+ __impl_compare_exchange_ ## N((_Atomic(T)*)X, &R, V, \
+ memory_order_seq_cst, memory_order_seq_cst); \
+ return R; \
+}
+
+#define INSTANTIATE_SYNC(N, T) INSTANTIATE_SYNCA(N, T) INSTANTIATE_SYNCM(N, T)
+
+#endif
diff --git a/src/internal/atomic_types.h b/src/internal/atomic_types.h
new file mode 100644
index 0000000..e873f35
--- /dev/null
+++ b/src/internal/atomic_types.h
@@ -0,0 +1,46 @@
+#ifndef _STDATOMIC_TYPES_H_
+#define _STDATOMIC_TYPES_ 1
+
+#include <wchar.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef _Atomic(_Bool) atomic_bool;
+typedef _Atomic(char) atomic_char;
+typedef _Atomic(int) atomic_int;
+typedef _Atomic(int_fast16_t) atomic_int_fast16_t;
+typedef _Atomic(int_fast32_t) atomic_int_fast32_t;
+typedef _Atomic(int_fast64_t) atomic_int_fast64_t;
+typedef _Atomic(int_fast8_t) atomic_int_fast8_t;
+typedef _Atomic(int_least16_t) atomic_int_least16_t;
+typedef _Atomic(int_least32_t) atomic_int_least32_t;
+typedef _Atomic(int_least64_t) atomic_int_least64_t;
+typedef _Atomic(int_least8_t) atomic_int_least8_t;
+typedef _Atomic(intmax_t) atomic_intmax_t;
+typedef _Atomic(intptr_t) atomic_intptr_t;
+typedef _Atomic(long long) atomic_llong;
+typedef _Atomic(long) atomic_long;
+typedef _Atomic(ptrdiff_t) atomic_ptrdiff_t;
+typedef _Atomic(short) atomic_short;
+typedef _Atomic(signed char) atomic_schar;
+typedef _Atomic(size_t) atomic_size_t;
+typedef _Atomic(uint_fast16_t) atomic_uint_fast16_t;
+typedef _Atomic(uint_fast32_t) atomic_uint_fast32_t;
+typedef _Atomic(uint_fast64_t) atomic_uint_fast64_t;
+typedef _Atomic(uint_fast8_t) atomic_uint_fast8_t;
+typedef _Atomic(uint_least16_t) atomic_char16_t;
+typedef _Atomic(uint_least16_t) atomic_uint_least16_t;
+typedef _Atomic(uint_least32_t) atomic_char32_t;
+typedef _Atomic(uint_least32_t) atomic_uint_least32_t;
+typedef _Atomic(uint_least64_t) atomic_uint_least64_t;
+typedef _Atomic(uint_least8_t) atomic_uint_least8_t;
+typedef _Atomic(uintmax_t) atomic_uintmax_t;
+typedef _Atomic(uintptr_t) atomic_uintptr_t;
+typedef _Atomic(unsigned char) atomic_uchar;
+typedef _Atomic(unsigned int) atomic_uint;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(unsigned long) atomic_ulong;
+typedef _Atomic(unsigned short) atomic_ushort;
+typedef _Atomic(wchar_t) atomic_wchar_t;
+
+#endif
diff --git a/src/stdatomic/atomic_fence.c b/src/stdatomic/atomic_fence.c
new file mode 100644
index 0000000..996286f
--- /dev/null
+++ b/src/stdatomic/atomic_fence.c
@@ -0,0 +1,9 @@
+#include "atomic_fence.h"
+
+void (atomic_thread_fence)(memory_order mo) {
+ atomic_thread_fence(mo);
+}
+
+void (atomic_signal_fence)(memory_order mo) {
+ atomic_signal_fence(mo);
+}
diff --git a/src/stdatomic/atomic_flag.c b/src/stdatomic/atomic_flag.c
new file mode 100644
index 0000000..a27fbe3
--- /dev/null
+++ b/src/stdatomic/atomic_flag.c
@@ -0,0 +1,17 @@
+#include "atomic_flag.h"
+
+_Bool (atomic_flag_test_and_set)(volatile atomic_flag* f) {
+ return atomic_flag_test_and_set(f);
+}
+
+_Bool (atomic_flag_test_and_set_explicit)(volatile atomic_flag* f, memory_order mo) {
+ return atomic_flag_test_and_set_explicit(f, mo);
+}
+
+void (atomic_flag_clear)(volatile atomic_flag* f) {
+ atomic_flag_clear(f);
+}
+
+void (atomic_flag_clear_explicit)(volatile atomic_flag* f, memory_order mo) {
+ atomic_flag_clear_explicit(f, mo);
+}
diff --git a/src/stdatomic/atomic_generic.c b/src/stdatomic/atomic_generic.c
new file mode 100644
index 0000000..d08e71f
--- /dev/null
+++ b/src/stdatomic/atomic_generic.c
@@ -0,0 +1,224 @@
+#include <limits.h>
+#include <string.h>
+#include "stdatomic-impl.h"
+#include "atomic_generic.h"
+#include "libc.h"
+
+#ifdef HASH_STAT
+# include <math.h>
+# include <stdio.h>
+#endif
+
+/* This is compatible with musl's internal locks. */
+/* The lock itself must be lock-free, so in general it can only be an
+ atomic_flag if we know nothing else about the platform. */
+
+typedef int volatile __impl_lock[2];
+
+
+
+/* the size of this table has a trade off between the probability of
+ collisions (the bigger the table, the better) and the waste of
+ space (the smaller, the better). */
+
+#ifndef HBIT
+# define HBIT 8
+#endif
+/* len is a power of two such that we just can mask out higher bits */
+enum { LEN = 1<<HBIT, };
+enum { ptrbit = sizeof(uintptr_t)*CHAR_BIT, };
+
+static __impl_lock table[LEN];
+
+#ifdef HASH_STAT
+static _Atomic(size_t) draw[LEN];
+#endif
+
+/* Chose a medium sized prime number as a factor. The multiplication
+ by it is a bijection modulo any LEN. */
+#define MAGIC 14530039U
+
+__attribute__((__unused__))
+static
+unsigned __impl_hash(void volatile const* X) {
+ uintptr_t const len = LEN;
+ uintptr_t x = (uintptr_t)X;
+ x *= MAGIC;
+ /* Be sure to use all bits in the result. */
+ if (ptrbit > 8*HBIT) x ^= (x / (len*len*len*len*len*len*len*len));
+ if (ptrbit > 4*HBIT) x ^= (x / (len*len*len*len));
+ if (ptrbit > 2*HBIT) x ^= (x / (len*len));
+ if (ptrbit > 1*HBIT) x ^= (x / len);
+ x %= len;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[x], 1, memory_order_relaxed);
+#endif
+ return x;
+}
+
+__attribute__((__unused__))
+static
+unsigned __impl_jenkins_one_at_a_time_hash(void volatile const* k) {
+ union {
+ unsigned char b[sizeof k];
+ uintptr_t v;
+ void volatile const* k;
+ } key = { .k = k, };
+ uintptr_t i, x = 0;
+ for(i = 0; i < sizeof(uintptr_t); ++i) {
+ x += key.b[i];
+ x += (x << 10);
+ x ^= (x >> 6);
+ }
+ x += (x << 3);
+ x ^= (x >> 11);
+ x += (x << 15);
+ x %= LEN;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[x], 1, memory_order_relaxed);
+#endif
+ return x;
+}
+
+__attribute__((__unused__))
+static
+uintptr_t __impl_mix(void volatile const* x) {
+ uintptr_t h = (uintptr_t)x;
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+ h %= LEN;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[h], 1, memory_order_relaxed);
+#endif
+ return h;
+}
+
+__attribute__((__unused__))
+static
+uintptr_t __impl_8(void volatile const* x) {
+ uintptr_t h = (uintptr_t)x;
+ h ^= (h >> 8);
+ h %= LEN;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[h], 1, memory_order_relaxed);
+#endif
+ return h;
+}
+
+#ifndef __ATOMIC_HASH
+# define __ATOMIC_HASH __impl_hash
+#endif
+#define hash __ATOMIC_HASH
+
+
+void __impl_load (size_t size, void volatile* ptr, void volatile* ret, int mo) {
+ unsigned pos = hash(ptr);
+ LOCK(table[pos]);
+ if (mo == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ memcpy((void*)ret, (void*)ptr, size);
+ UNLOCK(table[pos]);
+}
+
+void __impl_store (size_t size, void volatile* ptr, void const volatile* val, int mo) {
+ unsigned pos = hash(ptr);
+ LOCK(table[pos]);
+ memcpy((void*)ptr, (void*)val, size);
+ if (mo == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ UNLOCK(table[pos]);
+}
+
+static
+void atomic_exchange_restrict (size_t size, void volatile*__restrict__ ptr, void const volatile*__restrict__ val, void volatile*__restrict__ ret, int mo) {
+ unsigned pos = hash(ptr);
+ LOCK(table[pos]);
+ memcpy((void*)ret, (void*)ptr, size);
+ if (mo == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ memcpy((void*)ptr, (void*)val, size);
+ UNLOCK(table[pos]);
+}
+
+void __impl_exchange (size_t size, void volatile*__restrict__ ptr, void const volatile* val, void volatile* ret, int mo) {
+ if (val == ret) {
+ unsigned char buffer[size];
+ atomic_exchange_restrict(size, ptr, val, buffer, mo);
+ memcpy((void*)ret, buffer, size);
+ } else {
+ atomic_exchange_restrict(size, ptr, val, ret, mo);
+ }
+}
+
+_Bool __impl_compare_exchange (size_t size, void volatile* ptr, void volatile* expected, void const volatile* desired, int mos, int mof) {
+ unsigned pos = hash(ptr);
+ LOCK(table[pos]);
+ _Bool ret = !memcmp((void*)ptr, (void*)expected, size);
+ if (ret) {
+ memcpy((void*)ptr, (void*)desired, size);
+ if (mos == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ } else {
+ if (mof == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ memcpy((void*)expected, (void*)ptr, size);
+ }
+ UNLOCK(table[pos]);
+ /* fprintf(stderr, "cas for %p (%zu) at pos %u, %s, exp %p, des %p\n", */
+ /* ptr, size, pos, ret ? "suceeded" : "failed", */
+ /* expected, desired); */
+ return ret;
+}
+
+/* To collect hash statistics about atomics, compile with
+ ``HASH_STAT'' */
+void __impl_print_stat(void) {
+#ifdef HASH_STAT
+ size_t x1 = 0;
+ size_t x2 = 0;
+ size_t min = -1;
+ size_t max = 0;
+ size_t i;
+ for (i = 0; i < LEN; i++) {
+ size_t val = atomic_load(&draw[i]);
+ fprintf(stderr, "\t%zu", val);
+ if ((i % 8) == 7) fputc('\n', stderr);
+ x1 += val;
+ x2 += val*val;
+ if (val < min) min = val;
+ if (val > max) max = val;
+ }
+ fputc('\n', stderr);
+ double avg1 = (x1+0.0)/LEN;
+ double avg2 = (x2+0.0)/LEN;
+ double var = avg2 - avg1*avg1;
+ fprintf(stderr, "hash utilisation, %d positions, %zu draws: %zu < %e (+%e) < %zu\n",
+ LEN, x1, min, avg1, sqrt(var), max);
+#endif
+}
+
+/* For the four functions defined here, we need two entries in the
+ symbol table. One will be the final link name of the replacement
+ stub, something like __atomic_load, e.g. The other one is the
+ __impl prefixed name. It may eventually be used by the fixed-sized
+ stub functions, since these can't use the name that corresponds to
+ the builtin.
+
+ The replacement to the final name is not done within compiling,
+ since the name clashes can only create conflicts for a C
+ compiler. Instead, we use an external tool (objcopy) that does the
+ renaming.
+
+ We want these to be strong aliases, so they can't accidentally be
+ replaced. Therefore we can't use musl's weak_alias macro but create
+ one of our own. */
+
+#define alias(X, Y) __attribute__((__alias__(#X))) __typeof__(X) Y
+
+alias(__impl_load, __impl_load_replace);
+alias(__impl_store, __impl_store_replace);
+alias(__impl_exchange, __impl_exchange_replace);
+alias(__impl_compare_exchange, __impl_compare_exchange_replace);
diff --git a/src/stdatomic/atomic_generic_1.c b/src/stdatomic/atomic_generic_1.c
new file mode 100644
index 0000000..722657b
--- /dev/null
+++ b/src/stdatomic/atomic_generic_1.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+INSTANTIATE_STUB_LF(1, uint8_t)
+#else
+INSTANTIATE_STUB_LC(1, uint8_t)
+#endif
+
+INSTANTIATE_SYNC(1, uint8_t)
diff --git a/src/stdatomic/atomic_generic_16.c b/src/stdatomic/atomic_generic_16.c
new file mode 100644
index 0000000..ac5105f
--- /dev/null
+++ b/src/stdatomic/atomic_generic_16.c
@@ -0,0 +1,15 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
+INSTANTIATE_STUB_LF(16, __impl_uint128_t)
+INSTANTIATE_SYNC(16, __impl_uint128_t)
+#else
+INSTANTIATE_STUB_LCM(16, __impl_uint128_t)
+INSTANTIATE_SYNCM(16, __impl_uint128_t)
+# if __UINT128__
+INSTANTIATE_STUB_LCA(16, __impl_uint128_t)
+INSTANTIATE_SYNCA(16, __impl_uint128_t)
+# endif
+#endif
+
diff --git a/src/stdatomic/atomic_generic_2.c b/src/stdatomic/atomic_generic_2.c
new file mode 100644
index 0000000..a8244f2
--- /dev/null
+++ b/src/stdatomic/atomic_generic_2.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
+INSTANTIATE_STUB_LF(2, uint16_t)
+#else
+INSTANTIATE_STUB_LC(2, uint16_t)
+#endif
+
+INSTANTIATE_SYNC(2, uint16_t)
diff --git a/src/stdatomic/atomic_generic_4.c b/src/stdatomic/atomic_generic_4.c
new file mode 100644
index 0000000..7b1693f
--- /dev/null
+++ b/src/stdatomic/atomic_generic_4.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+INSTANTIATE_STUB_LF(4, uint32_t)
+#else
+INSTANTIATE_STUB_LC(4, uint32_t)
+#endif
+
+INSTANTIATE_SYNC(4, uint32_t)
diff --git a/src/stdatomic/atomic_generic_8.c b/src/stdatomic/atomic_generic_8.c
new file mode 100644
index 0000000..d652497
--- /dev/null
+++ b/src/stdatomic/atomic_generic_8.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+INSTANTIATE_STUB_LF(8, uint64_t)
+#else
+INSTANTIATE_STUB_LC(8, uint64_t)
+#endif
+
+INSTANTIATE_SYNC(8, uint64_t)
diff --git a/src/stdatomic/redefine_syms.sh b/src/stdatomic/redefine_syms.sh
new file mode 100755
index 0000000..31740c5
--- /dev/null
+++ b/src/stdatomic/redefine_syms.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+objects=$(ls *.o)
+
+for obj in ${objects} ; do
+ objcopy --redefine-syms=redefine_syms.txt ${obj} tmp.o
+ mv tmp.o ${obj}
+ objcopy --redefine-syms=redefine_syms.txt ${obj%.o}.lo tmp.o
+ mv tmp.o ${obj%.o}.lo
+done
diff --git a/src/stdatomic/redefine_syms.txt b/src/stdatomic/redefine_syms.txt
new file mode 100644
index 0000000..5197a49
--- /dev/null
+++ b/src/stdatomic/redefine_syms.txt
@@ -0,0 +1,144 @@
+__impl_load_1 __atomic_load_1
+__impl_store_1 __atomic_store_1
+__impl_exchange_1 __atomic_exchange_1
+__impl_compare_exchange_1 __atomic_compare_exchange_1
+__impl_fetch_add_1 __atomic_fetch_add_1
+__impl_fetch_sub_1 __atomic_fetch_sub_1
+__impl_fetch_and_1 __atomic_fetch_and_1
+__impl_fetch_xor_1 __atomic_fetch_xor_1
+__impl_fetch_nand_1 __atomic_fetch_nand_1
+__impl_fetch_or_1 __atomic_fetch_or_1
+__impl_add_fetch_1 __atomic_add_fetch_1
+__impl_sub_fetch_1 __atomic_sub_fetch_1
+__impl_and_fetch_1 __atomic_and_fetch_1
+__impl_xor_fetch_1 __atomic_xor_fetch_1
+__impl_nand_fetch_1 __atomic_nand_fetch_1
+__impl_or_fetch_1 __atomic_or_fetch_1
+__impl_load_2 __atomic_load_2
+__impl_store_2 __atomic_store_2
+__impl_exchange_2 __atomic_exchange_2
+__impl_compare_exchange_2 __atomic_compare_exchange_2
+__impl_fetch_add_2 __atomic_fetch_add_2
+__impl_fetch_sub_2 __atomic_fetch_sub_2
+__impl_fetch_and_2 __atomic_fetch_and_2
+__impl_fetch_xor_2 __atomic_fetch_xor_2
+__impl_fetch_nand_2 __atomic_fetch_nand_2
+__impl_fetch_or_2 __atomic_fetch_or_2
+__impl_add_fetch_2 __atomic_add_fetch_2
+__impl_sub_fetch_2 __atomic_sub_fetch_2
+__impl_and_fetch_2 __atomic_and_fetch_2
+__impl_xor_fetch_2 __atomic_xor_fetch_2
+__impl_nand_fetch_2 __atomic_nand_fetch_2
+__impl_or_fetch_2 __atomic_or_fetch_2
+__impl_load_4 __atomic_load_4
+__impl_store_4 __atomic_store_4
+__impl_exchange_4 __atomic_exchange_4
+__impl_compare_exchange_4 __atomic_compare_exchange_4
+__impl_fetch_add_4 __atomic_fetch_add_4
+__impl_fetch_sub_4 __atomic_fetch_sub_4
+__impl_fetch_and_4 __atomic_fetch_and_4
+__impl_fetch_xor_4 __atomic_fetch_xor_4
+__impl_fetch_nand_4 __atomic_fetch_nand_4
+__impl_fetch_or_4 __atomic_fetch_or_4
+__impl_add_fetch_4 __atomic_add_fetch_4
+__impl_sub_fetch_4 __atomic_sub_fetch_4
+__impl_and_fetch_4 __atomic_and_fetch_4
+__impl_xor_fetch_4 __atomic_xor_fetch_4
+__impl_nand_fetch_4 __atomic_nand_fetch_4
+__impl_or_fetch_4 __atomic_or_fetch_4
+__impl_load_8 __atomic_load_8
+__impl_store_8 __atomic_store_8
+__impl_exchange_8 __atomic_exchange_8
+__impl_compare_exchange_8 __atomic_compare_exchange_8
+__impl_fetch_add_8 __atomic_fetch_add_8
+__impl_fetch_sub_8 __atomic_fetch_sub_8
+__impl_fetch_and_8 __atomic_fetch_and_8
+__impl_fetch_xor_8 __atomic_fetch_xor_8
+__impl_fetch_nand_8 __atomic_fetch_nand_8
+__impl_fetch_or_8 __atomic_fetch_or_8
+__impl_add_fetch_8 __atomic_add_fetch_8
+__impl_sub_fetch_8 __atomic_sub_fetch_8
+__impl_and_fetch_8 __atomic_and_fetch_8
+__impl_xor_fetch_8 __atomic_xor_fetch_8
+__impl_nand_fetch_8 __atomic_nand_fetch_8
+__impl_or_fetch_8 __atomic_or_fetch_8
+__impl_load_16 __atomic_load_16
+__impl_store_16 __atomic_store_16
+__impl_exchange_16 __atomic_exchange_16
+__impl_compare_exchange_16 __atomic_compare_exchange_16
+__impl_fetch_add_16 __atomic_fetch_add_16
+__impl_fetch_sub_16 __atomic_fetch_sub_16
+__impl_fetch_and_16 __atomic_fetch_and_16
+__impl_fetch_xor_16 __atomic_fetch_xor_16
+__impl_fetch_nand_16 __atomic_fetch_nand_16
+__impl_fetch_or_16 __atomic_fetch_or_16
+__impl_add_fetch_16 __atomic_add_fetch_16
+__impl_sub_fetch_16 __atomic_sub_fetch_16
+__impl_and_fetch_16 __atomic_and_fetch_16
+__impl_xor_fetch_16 __atomic_xor_fetch_16
+__impl_nand_fetch_16 __atomic_nand_fetch_16
+__impl_or_fetch_16 __atomic_or_fetch_16
+__impl_bool_compare_and_swap_1 __sync_bool_compare_and_swap_1
+__impl_val_compare_and_swap_1 __sync_val_compare_and_swap_1
+__impl_fetch_and_add_1 __sync_fetch_and_add_1
+__impl_fetch_and_sub_1 __sync_fetch_and_sub_1
+__impl_fetch_and_and_1 __sync_fetch_and_and_1
+__impl_fetch_and_xor_1 __sync_fetch_and_xor_1
+__impl_fetch_and_or_1 __sync_fetch_and_or_1
+__impl_add_and_fetch_1 __sync_add_and_fetch_1
+__impl_sub_and_fetch_1 __sync_sub_and_fetch_1
+__impl_and_and_fetch_1 __sync_and_and_fetch_1
+__impl_xor_and_fetch_1 __sync_xor_and_fetch_1
+__impl_or_and_fetch_1 __sync_or_and_fetch_1
+__impl_bool_compare_and_swap_2 __sync_bool_compare_and_swap_2
+__impl_val_compare_and_swap_2 __sync_val_compare_and_swap_2
+__impl_fetch_and_add_2 __sync_fetch_and_add_2
+__impl_fetch_and_sub_2 __sync_fetch_and_sub_2
+__impl_fetch_and_and_2 __sync_fetch_and_and_2
+__impl_fetch_and_xor_2 __sync_fetch_and_xor_2
+__impl_fetch_and_or_2 __sync_fetch_and_or_2
+__impl_add_and_fetch_2 __sync_add_and_fetch_2
+__impl_sub_and_fetch_2 __sync_sub_and_fetch_2
+__impl_and_and_fetch_2 __sync_and_and_fetch_2
+__impl_xor_and_fetch_2 __sync_xor_and_fetch_2
+__impl_or_and_fetch_2 __sync_or_and_fetch_2
+__impl_bool_compare_and_swap_4 __sync_bool_compare_and_swap_4
+__impl_val_compare_and_swap_4 __sync_val_compare_and_swap_4
+__impl_fetch_and_add_4 __sync_fetch_and_add_4
+__impl_fetch_and_sub_4 __sync_fetch_and_sub_4
+__impl_fetch_and_and_4 __sync_fetch_and_and_4
+__impl_fetch_and_xor_4 __sync_fetch_and_xor_4
+__impl_fetch_and_or_4 __sync_fetch_and_or_4
+__impl_add_and_fetch_4 __sync_add_and_fetch_4
+__impl_sub_and_fetch_4 __sync_sub_and_fetch_4
+__impl_and_and_fetch_4 __sync_and_and_fetch_4
+__impl_xor_and_fetch_4 __sync_xor_and_fetch_4
+__impl_or_and_fetch_4 __sync_or_and_fetch_4
+__impl_bool_compare_and_swap_8 __sync_bool_compare_and_swap_8
+__impl_val_compare_and_swap_8 __sync_val_compare_and_swap_8
+__impl_fetch_and_add_8 __sync_fetch_and_add_8
+__impl_fetch_and_sub_8 __sync_fetch_and_sub_8
+__impl_fetch_and_and_8 __sync_fetch_and_and_8
+__impl_fetch_and_xor_8 __sync_fetch_and_xor_8
+__impl_fetch_and_or_8 __sync_fetch_and_or_8
+__impl_add_and_fetch_8 __sync_add_and_fetch_8
+__impl_sub_and_fetch_8 __sync_sub_and_fetch_8
+__impl_and_and_fetch_8 __sync_and_and_fetch_8
+__impl_xor_and_fetch_8 __sync_xor_and_fetch_8
+__impl_or_and_fetch_8 __sync_or_and_fetch_8
+__impl_bool_compare_and_swap_16 __sync_bool_compare_and_swap_16
+__impl_val_compare_and_swap_16 __sync_val_compare_and_swap_16
+__impl_fetch_and_add_16 __sync_fetch_and_add_16
+__impl_fetch_and_sub_16 __sync_fetch_and_sub_16
+__impl_fetch_and_and_16 __sync_fetch_and_and_16
+__impl_fetch_and_xor_16 __sync_fetch_and_xor_16
+__impl_fetch_and_or_16 __sync_fetch_and_or_16
+__impl_add_and_fetch_16 __sync_add_and_fetch_16
+__impl_sub_and_fetch_16 __sync_sub_and_fetch_16
+__impl_and_and_fetch_16 __sync_and_and_fetch_16
+__impl_xor_and_fetch_16 __sync_xor_and_fetch_16
+__impl_or_and_fetch_16 __sync_or_and_fetch_16
+__impl_load_replace __atomic_load
+__impl_store_replace __atomic_store
+__impl_exchange_replace __atomic_exchange
+__impl_compare_exchange_replace __atomic_compare_exchange
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* stdatomic library, v2
2015-07-27 7:51 stdatomic library implementation Jens Gustedt
2015-07-27 7:53 ` stdatomic library implementation, code Jens Gustedt
2015-07-27 8:08 ` stdatomic library implementation, documentation Jens Gustedt
@ 2015-08-03 11:47 ` Jens Gustedt
2 siblings, 0 replies; 6+ messages in thread
From: Jens Gustedt @ 2015-08-03 11:47 UTC (permalink / raw)
To: musl
[-- Attachment #1.1: Type: text/plain, Size: 9611 bytes --]
Hello,
here is the second version of my stdatomic implementation.
The main difference from the previous one is a handcrafted lock. This
lock is there for a very particular situation, namely critical
sections that are extremely short. It is not yet clear if the same
strategy could or should apply to other parts of musl, maybe in
replacement of the __lock/__unlock pair.
The situation is so special here, since every cycle counts.
- On the fast-path, the shorter it gets, the less conflicts there are
with other lockers.
- On the mild-conflict path, we spin only once on average, so here
also, the less cycles the less interaction with other lockers.
This lock is already implemented with stdatomic semantics. First,
because it is nicer and we finally have it, here :)
More seriously, the assembler produced is much better than with the
inline assembler macros, simply because the compiler can integrate
everything more smoothly and take advantage of particular properties
of the instructions. E.g on x86 the cmpxchg instruction already places
the result of the operation in a flag, there is no need to retest the
condition. And since, every cycle counts, see above, using stdatomic
alone seems to have benefits over __lock/__unlock.
The gain of this new lock is that on my machine my bench (list element
allocations-insertions or removal-deallocation) goes from about 700000
up to about 1000000 per second.
I also tried to benchmark the individual parts of the new algorithm to
get a hand on where the problem spots are, but exact measurement
doesn't seem possible due to Heisenberg effects.
As before, please find the "implementation" part of my notes below, I
also attach the full .org file of it.
Jens
* The implementation
** Requirements
*** Compilers
You should be able to compile this implementation with any version
of modern gcc and clang. (Versions are hard to tell, gcc should work
for 4.1) The quality of the resulting binary will depend on the
implementation of atomic support by the compiler.
There are three different implementations, for modern clang and gcc,
and one for those compilers that only support the =__sync_=
built-ins. They are only tested with clang and gcc, but might work
with other compilers that implement one of the sets of built-ins and
are otherwise compatible with some gcc extensions:
- compound expressions with =({ })=
- =__attribute__= with =__alias__= and =__unused__=
- =__builtin_choose_expr= for the =__sync= version as a precursor of
C11's =_Generic=
There are some heuristics in place to decide at compile time which
case applies, namely =__clang__= to detect clang, =__ATOMIC_...=
macros to detect the C11 versions of the built-ins.
*** OS or C library support
The library may work with different lock constructs, currently we
implement one simple generic approach that only uses spinning, and
a mixed approach that uses Linux' =futex= as an inactive sleep
strategy as a last resort. The latter has been tested with the
=musl= C library.
This locking strategy can be a performance bottleneck for
applications with a strong congestion on one particular atomic
data, e.g code that would insert list elements through a
centralized list head. If this list head can not be realized with
a lock-free atomic, the critical section of modifying it is
protected by our lock. Such code has very particular properties.
- Since the critical section usually is really short compared to a
scheduling interval of the OS, the probability that the lock can
be taken immediately is high. So the fast path for taking the
lock must be *really fast*. Our implementation essentially has
an =atomic_compare_exchange_strong_explicit=, here. One memory
instruction on the fast path must be enough.
- If locking fails on the first try, the probability is still very
high that it will succeed soon after. This is because only
scheduled threads compete, here, so there are never more threads
in play than we have processors. Therefore as a second strategy
we spin for a while until we get the lock. In our experiments on
average one single round of spinning was enough.
- A third exceptional case may occur, when the thread that is
holding the lock is descheduled in the middle of the critical
section. The probability for that event is quite rare (0.1 % in
our experiments) but still this case occurs. If it does, the
world changes drastically, a herd of threads all have to wait
for a long time (until the locker is rescheduled) to have any
chance to obtain the lock. Active wait here is
counterproductive. In the contrary, by going into an inactive OS
sleep, the possibility for the locker to regain an execution
slot increases.
We implement this strategy a bit differently than classical locks
with wait-counters would do. We just have a single =unsigned= value
that at the same time holds the lock bit (HO bit) and a
counter. That counter is not viewed as a counter of the threads
that are in a kernel wait, but just counts the number of threads
inside the critical section. This has the following advantages:
- An update to the counter part is relatively rare. So we save
memory bandwidth, and we also avoid too much interaction between
the different threads that compete for the lock.
- The fast path occurs when the value is =0=, initially. It sets
the HO bit (the lock bit) and the LO bit (for a counter of value
=1=) in one go. The resulting value is =UINT_MAX/2u+2u=.
- If the fast path fails, the counter is atomically incremented by
one, and we enter a spin lock to set the HO bit as well.
- After having spun for some time, we suppose that we are in the bad
situation and go into a =futex_wait=. Going into the =futex_wait=
may fail if the value changes. Since additional threads only
change the counter when they arrive, this can't happen too often
and the thread goes to sleep, eventually.
- Unlocking is a very simple operation. The locker has contributed
=UINT_MAX/2u+2u= to the value, and so just has to decrement the
value atomically by that amount. By doing so, the thread also
notices if other threads still are in the critical section and
wakens one of them.
** Caveats
*** Symbol renaming
There is one important difficulty when compiling this. The original
=__atomic= library interface was developed with C++ in mind and not
C. Therefore it freely uses function overloading for the built-ins
versus the library interface. Since we also use the library
functions as fallbacks in the implementation of some of the =_X=
variants this naming scheme is not supportable with a C compiler.
We get away with it by using internal names, prefixed with =__impl_=
for all functions. Then we rename symbols to the intended ones using
=objcopy=.
- The current integration into musl does this with a *script* that
you have to run *manually* after compilation.
- You then have to launch =make= a *second time* to do the final link.
This technique is certainly not ideal and subject to improvement.
*** Support of 16 byte atomic instructions
The main difference for modern processors that is relevant here is
if it supports 16 byte atomic instructions or not. There is no
difficulty to detect this at compile time, but if the library is
used with code that is compiled with a different compiler or just
different compiler options, incompatible binary code may be
produced.
My plan is to freeze that feature at compile time of the library
and reflect the capacity in the =<stdatomic.h>= that is
provided. This then may result in code that is a bit less
optimized than it could, but that is compatible.
- If the library is *not* compiled with direct 16 byte support the
application may not use it, and thus use a memory implementation
for such operations.
- If the library *is* compiled with direct 16 byte support but the
application compiler doesn't support it, the user code should
fallback to library calls, but which in turn use the atomic
instructions. So such a variant would have a call overhead and
would not be able to inline the atomics in the user binary.
All of this is not yet done, though. Be careful when using this
preliminary version.
** Leftovers
There are some leftovers that will hopefully disappear.
- There are several hash functions and an instrumentation
infrastructure for the hashes. I didn't have enough test cases
yet to see what would be best, here.
- There is optional instrumentation for the lock
functions. Switching it on changes overall performance
substantially, and thus I'd expect a noticeable Heisenberg
effect. So these counters can give qualitative information about
what happens; you shouldn't take the figures literally.
--
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536 ::
:: :::::::::::::::::::::: gsm France : +33 651400183 ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::
[-- Attachment #1.2: stdatomic-patch-v2.txt --]
[-- Type: text/plain, Size: 82181 bytes --]
diff --git a/src/internal/atomic_clang_c11.h b/src/internal/atomic_clang_c11.h
new file mode 100644
index 0000000..bb3629d
--- /dev/null
+++ b/src/internal/atomic_clang_c11.h
@@ -0,0 +1,82 @@
+#ifndef _STDATOMIC_CLANG_C11_H_
+#define _STDATOMIC_CLANG_C11_H_ 1
+
+#include <atomic_stub.h>
+
+#define ATOMIC_VAR_INIT(...) __VA_ARGS__
+#define atomic_init __c11_atomic_init
+
+/* Map operations to the special builtins that clang provides. */
+
+/* Map all non-explicit macros to the builtin with forced memory order. */
+#define atomic_fetch_add(X, Y) __c11_atomic_fetch_add((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_sub(X, Y) __c11_atomic_fetch_sub((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_and(X, Y) __c11_atomic_fetch_and((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_or(X, Y) __c11_atomic_fetch_or((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_xor(X, Y) __c11_atomic_fetch_xor((X), (Y), memory_order_seq_cst)
+#define atomic_load(X) __c11_atomic_load((X), memory_order_seq_cst)
+#define atomic_store(X, V) __c11_atomic_store((X), (V), memory_order_seq_cst)
+#define atomic_exchange(X, V) __c11_atomic_exchange((X), (V), memory_order_seq_cst)
+#define atomic_compare_exchange_weak(X, E, V) __c11_atomic_compare_exchange_weak((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_strong(X, E, V) __c11_atomic_compare_exchange_strong((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+
+/* Map all explicit macros to the corresponding builtin. */
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+#define atomic_load_explicit __c11_atomic_load
+#define atomic_store_explicit __c11_atomic_store
+#define atomic_exchange_explicit __c11_atomic_exchange
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define INSTANTIATE_STUB_LF(N, T) \
+T __impl_fetch_add_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_add(_X, _V, _mo); \
+} \
+T __impl_fetch_sub_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_sub(_X, _V, _mo); \
+} \
+T __impl_fetch_and_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_and(_X, _V, _mo); \
+} \
+T __impl_fetch_xor_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_xor(_X, _V, _mo); \
+} \
+T __impl_fetch_or_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_or(_X, _V, _mo); \
+} \
+T __impl_add_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_add(_X, _V, _mo) + _V; \
+} \
+T __impl_sub_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_sub(_X, _V, _mo) - _V; \
+} \
+T __impl_and_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_and(_X, _V, _mo) & _V; \
+} \
+T __impl_xor_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_xor(_X, _V, _mo) ^ _V; \
+} \
+T __impl_or_fetch_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_fetch_or(_X, _V, _mo) | _V; \
+} \
+T __impl_load_ ## N(_Atomic(T)* _X, int _mo) { \
+ return __c11_atomic_load(_X, _mo); \
+} \
+void __impl_store_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ __c11_atomic_store(_X, _V, _mo); \
+} \
+T __impl_exchange_ ## N(_Atomic(T)* _X, T const _V, int _mo) { \
+ return __c11_atomic_exchange(_X, _V, _mo); \
+} \
+_Bool __impl_compare_exchange_ ## N(_Atomic(T)* _X, T* _E, T const _V, int _mos, int _mof) { \
+ return __c11_atomic_compare_exchange_strong(_X, _E, _V, _mos, _mof); \
+} \
+ INSTANTIATE_STUB_NAND(N, T)
+
+#define INSTANTIATE_STUB(N, T) INSTANTIATE_STUB_ ## N(T)
+
+#endif
diff --git a/src/internal/atomic_constants.h b/src/internal/atomic_constants.h
new file mode 100644
index 0000000..327241b
--- /dev/null
+++ b/src/internal/atomic_constants.h
@@ -0,0 +1,162 @@
+#ifndef _STDATOMIC_ATOMIC_CONSTANTS_H_
+#define _STDATOMIC_ATOMIC_CONSTANTS_H_ 1
+
+#include <stdint.h>
+
+#if !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1) || __GNUC__ < 4
+# error "this implementation of stdatomic needs support that is compatible with the gcc ABI"
+#endif
+
+/* gcc 4.7 and 4.8 implement atomic operations but not atomic
+ types. This test is meant to stay simple, we don't know of any
+ other compiler that fakes to be gcc 4.[78] or 4.[78].x */
+#if !defined(__ATOMIC_RELAXED) && __GNUC__ == 4 && __GNUC_MINOR__ < 7
+#undef __ATOMIC_FORCE_SYNC
+#define __ATOMIC_FORCE_SYNC 1
+#endif
+
+#ifdef __SIZEOF_INT128__
+# define __UINT128__ 1
+typedef __uint128_t __impl_uint128_t;
+#else
+# define __UINT128__ 0
+typedef struct { uint64_t a[2]; } __impl_uint128_t;
+#endif
+
+#define __atomic_align(T) \
+(sizeof(T) == 1 ? __alignof__(uint8_t) \
+ : (sizeof(T) == 2 ? __alignof__(uint16_t) \
+ : (sizeof(T) == 4 ? __alignof__(uint32_t) \
+ : ((sizeof(T) == 8) ? __alignof__(uint64_t) \
+ : ((sizeof(T) == 16) ? __alignof__(__impl_uint128_t) \
+ : __alignof__(T))))))
+
+#if __ATOMIC_FORCE_SYNC
+/* There is no compiler support for _Atomic type qualification, so we
+ use the type specifier variant. The idea is to use a one element
+ array to ensure that such an _Atomic(something) can never be used
+ in operators.
+
+ Underneath we will use uintXX_t for special cases. To be sure that
+ no bad things can happen, then, we ensure that the alignment for
+ these special cases is as wide as possible, namely sizeof the
+ type. */
+#define _Atomic(T) __typeof__(T volatile[1])
+#define _Atomic_aligned(T) __attribute__ ((__aligned__(__atomic_align(T)))) __typeof__(T[1])
+#endif
+
+#ifndef __ATOMIC_RELAXED
+#define __ATOMIC_RELAXED 0
+#endif
+#ifndef __ATOMIC_CONSUME
+#define __ATOMIC_CONSUME 1
+#endif
+#ifndef __ATOMIC_ACQUIRE
+#define __ATOMIC_ACQUIRE 2
+#endif
+#ifndef __ATOMIC_RELEASE
+#define __ATOMIC_RELEASE 3
+#endif
+#ifndef __ATOMIC_ACQ_REL
+#define __ATOMIC_ACQ_REL 4
+#endif
+#ifndef __ATOMIC_SEQ_CST
+#define __ATOMIC_SEQ_CST 5
+#endif
+
+enum memory_order {
+ memory_order_relaxed = __ATOMIC_RELAXED,
+ memory_order_consume = __ATOMIC_CONSUME,
+ memory_order_acquire = __ATOMIC_ACQUIRE,
+ memory_order_release = __ATOMIC_RELEASE,
+ memory_order_acq_rel = __ATOMIC_ACQ_REL,
+ memory_order_seq_cst = __ATOMIC_SEQ_CST,
+};
+typedef enum memory_order memory_order;
+
+#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE
+#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+#define __GCC_ATOMIC_SHORT_T_LOCK_FREE 2
+# if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16)
+# define __GCC_ATOMIC_INT_T_LOCK_FREE 2
+# define __GCC_ATOMIC_LONG_T_LOCK_FREE 2
+# define __GCC_ATOMIC_LLONG_T_LOCK_FREE 2
+# define __GCC_ATOMIC_POINTER_T_LOCK_FREE 2
+# define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+# define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+# define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+# elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)
+# define __GCC_ATOMIC_INT_T_LOCK_FREE ((UINT_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LONG_T_LOCK_FREE ((ULONG_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LLONG_T_LOCK_FREE ((ULLONG_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_POINTER_T_LOCK_FREE ((UINTPTR_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR16_T_LOCK_FREE ((UINT_LEAST16_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR32_T_LOCK_FREE ((UINT_LEAST32_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_WCHAR_T_LOCK_FREE ((WCHAR_MAX <= 0xFFFFFFFFFFFFFFFFU) ? 2 : 0)
+# elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+# define __GCC_ATOMIC_INT_T_LOCK_FREE ((UINT_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LONG_T_LOCK_FREE ((ULONG_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_LLONG_T_LOCK_FREE ((ULLONG_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_POINTER_T_LOCK_FREE ((UINTPTR_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR16_T_LOCK_FREE ((UINT_LEAST16_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_CHAR32_T_LOCK_FREE ((UINT_LEAST32_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# define __GCC_ATOMIC_WCHAR_T_LOCK_FREE ((WCHAR_MAX <= 0xFFFFFFFFU) ? 2 : 0)
+# endif
+#endif
+
+
+#define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE
+#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE
+#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE
+#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE
+
+#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE
+
+#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+# define ATOMIC_UINT8_LOCK_FREE 2
+#else
+# define ATOMIC_UINT8_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
+# define ATOMIC_UINT16_LOCK_FREE 2
+#else
+# define ATOMIC_UINT16_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+# define ATOMIC_UINT32_LOCK_FREE 2
+#else
+# define ATOMIC_UINT32_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+# define ATOMIC_UINT64_LOCK_FREE 2
+#else
+# define ATOMIC_UINT64_LOCK_FREE 0
+#endif
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
+# define ATOMIC_UINT128_LOCK_FREE 2
+#else
+# define ATOMIC_UINT128_LOCK_FREE 0
+#endif
+
+
+#define atomic_is_lock_free(O) \
+(sizeof*(O) == 1 ? ATOMIC_UINT8_LOCK_FREE \
+ : (sizeof*(O) == 2 ? ATOMIC_UINT16_LOCK_FREE \
+ : (sizeof*(O) == 4 ? ATOMIC_UINT32_LOCK_FREE \
+ : ((sizeof*(O) == 8) ? ATOMIC_UINT64_LOCK_FREE \
+ : ((sizeof*(O) == 16) ? ATOMIC_UINT128_LOCK_FREE \
+ : 0)))))
+
+
+#endif
diff --git a/src/internal/atomic_fence.h b/src/internal/atomic_fence.h
new file mode 100644
index 0000000..162507a
--- /dev/null
+++ b/src/internal/atomic_fence.h
@@ -0,0 +1,29 @@
+#ifndef _STDATOMIC_ATOMIC_FENCE_H_
+#define _STDATOMIC_ATOMIC_FENCE_H_ 1
+
+#include <atomic_constants.h>
+
+
+void atomic_thread_fence(memory_order mo);
+
+void atomic_signal_fence(memory_order mo);
+
+#define kill_dependency(X) \
+({ \
+ register __typeof__(X) kill_dependency = (X); \
+ kill_dependency; \
+ })
+
+#ifndef __ATOMIC_FORCE_SYNC
+# define atomic_thread_fence(MO) __atomic_thread_fence(MO)
+# define atomic_signal_fence(MO) __atomic_signal_fence(MO)
+#else
+# define atomic_thread_fence(MO) \
+({ \
+ if (MO != memory_order_relaxed) __sync_synchronize(); \
+ else __asm__ volatile("# relaxed fence"); \
+ })
+#define atomic_signal_fence(MO) __asm__ volatile("# signal fence")
+#endif
+
+#endif
diff --git a/src/internal/atomic_flag.h b/src/internal/atomic_flag.h
new file mode 100644
index 0000000..0d3344b
--- /dev/null
+++ b/src/internal/atomic_flag.h
@@ -0,0 +1,47 @@
+#ifndef _STDATOMIC_ATOMIC_FLAG_H_
+#define _STDATOMIC_ATOMIC_FLAG_H_ 1
+
+#include <atomic_constants.h>
+
+#ifndef __GCC_ATOMIC_TEST_AND_SET_TRUEVAL
+# define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+#endif
+
+typedef struct atomic_flag atomic_flag;
+struct atomic_flag {
+ _Bool f;
+};
+
+_Bool atomic_flag_test_and_set(volatile atomic_flag*);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag*, memory_order);
+void atomic_flag_clear(volatile atomic_flag*);
+void atomic_flag_clear_explicit(volatile atomic_flag*, memory_order);
+
+#define ATOMIC_FLAG_INIT { .f = 0, }
+
+#define atomic_flag_test_and_set(A) atomic_flag_test_and_set_explicit((A), memory_order_seq_cst)
+#define atomic_flag_clear(A) atomic_flag_clear_explicit((A), memory_order_seq_cst)
+
+#ifndef __ATOMIC_FORCE_SYNC
+# define atomic_flag_test_and_set_explicit(A, MO) (__atomic_test_and_set(&((A)->f), MO) == __GCC_ATOMIC_TEST_AND_SET_TRUEVAL)
+# define atomic_flag_clear_explicit(A, MO) __atomic_clear(&(A)->f, MO)
+#else
+# define atomic_flag_test_and_set_explicit(A, O) \
+({ \
+ register _Bool atomic_flag_test_and_set_explicit \
+ = (__sync_lock_test_and_set(&(A)->f, __GCC_ATOMIC_TEST_AND_SET_TRUEVAL) == __GCC_ATOMIC_TEST_AND_SET_TRUEVAL); \
+ /* gcc guarantees that this was an acquire operation. */ \
+ /* synchronize even stronger if we need to */ \
+ if ((O) == memory_order_seq_cst) __sync_synchronize(); \
+ atomic_flag_test_and_set_explicit; \
+ })
+# define atomic_flag_clear_explicit(A, O) \
+({ \
+ /* gcc guarantees that this will be a release operation. */ \
+ /* synchronize even stronger if we need to */ \
+ if ((O) == memory_order_seq_cst) __sync_synchronize(); \
+ __sync_lock_release(&(A)->f); \
+ })
+#endif
+
+#endif
diff --git a/src/internal/atomic_gcc_atomic.h b/src/internal/atomic_gcc_atomic.h
new file mode 100644
index 0000000..e030b85
--- /dev/null
+++ b/src/internal/atomic_gcc_atomic.h
@@ -0,0 +1,107 @@
+#ifndef _STDATOMIC_GCC_ATOMIC_H_
+#define _STDATOMIC_GCC_ATOMIC_H_ 1
+
+#include <atomic_stub.h>
+
+#define ATOMIC_VAR_INIT(...) __VA_ARGS__
+#define atomic_init(X, V) ((void)((*(X))=(V)))
+
+/* Map all non-explicit macros to the explicit version. */
+#define atomic_fetch_add(X, Y) atomic_fetch_add_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_sub(X, Y) atomic_fetch_sub_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_and(X, Y) atomic_fetch_and_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_or(X, Y) atomic_fetch_or_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_xor(X, Y) atomic_fetch_xor_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_load(X) atomic_load_explicit((X), memory_order_seq_cst)
+#define atomic_store(X, V) atomic_store_explicit((X), (V), memory_order_seq_cst)
+#define atomic_exchange(X, V) atomic_exchange_explicit((X), (V), memory_order_seq_cst)
+#define atomic_compare_exchange_weak(X, E, V) atomic_compare_exchange_weak_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_strong(X, E, V) atomic_compare_exchange_strong_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+
+/* Map all explicit macros to the corresponding builtin. */
+/* The arithmetic operations don't have to use a memory operand. */
+#define atomic_fetch_add_explicit(X, Y, MO) __atomic_fetch_add((X), (Y), (MO))
+#define atomic_fetch_sub_explicit(X, Y, MO) __atomic_fetch_sub((X), (Y), (MO))
+#define atomic_fetch_and_explicit(X, Y, MO) __atomic_fetch_and((X), (Y), (MO))
+#define atomic_fetch_or_explicit(X, Y, MO) __atomic_fetch_or((X), (Y), (MO))
+#define atomic_fetch_xor_explicit(X, Y, MO) __atomic_fetch_xor((X), (Y), (MO))
+
+/* The interfaces for the universal functions need to operate on
+ memory operands, only. */
+
+#define atomic_load_explicit(X, MO) \
+({ \
+ __atyp(*X) _r; \
+ __atomic_load((X), _r, (MO)); \
+ __aret(_r[0]); \
+ })
+
+#define atomic_store_explicit(X, V, MO) \
+ __atomic_store((X), &__atmp(*X, V), (MO))
+
+#define atomic_exchange_explicit(X, V, MO) \
+({ \
+ __atyp(*X) _r; \
+ __atomic_exchange((X), &__atmp(*X, V), _r, (MO)); \
+ __aret(_r[0]); \
+ })
+
+#define atomic_compare_exchange_weak_explicit(X, E, V, MOS, MOF) \
+ __atomic_compare_exchange((X), (E), &__atmp(*(X), (V)), 1, (MOS), (MOF))
+
+#define atomic_compare_exchange_strong_explicit(X, E, V, MOS, MOF) \
+ __atomic_compare_exchange((X), (E), &__atmp(*(X), (V)), 0, (MOS), (MOF))
+
+
+#define INSTANTIATE_STUB_LF(N, T) \
+T __impl_fetch_add_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_add(X, V, M); \
+} \
+T __impl_fetch_sub_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_sub(X, V, M); \
+} \
+T __impl_fetch_and_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_and(X, V, M); \
+} \
+T __impl_fetch_xor_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_xor(X, V, M); \
+} \
+T __impl_fetch_nand_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_nand(X, V, M); \
+} \
+T __impl_fetch_or_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_fetch_or(X, V, M); \
+} \
+T __impl_add_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_add_fetch(X, V, M); \
+} \
+T __impl_sub_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_sub_fetch(X, V, M); \
+} \
+T __impl_and_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_and_fetch(X, V, M); \
+} \
+T __impl_xor_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_xor_fetch(X, V, M); \
+} \
+T __impl_nand_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_nand_fetch(X, V, M); \
+} \
+T __impl_or_fetch_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_or_fetch(X, V, M); \
+} \
+T __impl_load_ ## N(_Atomic(T)* X, int M) { \
+ return __atomic_load_n(X, M); \
+} \
+void __impl_store_ ## N(_Atomic(T)* X, T const V, int M) { \
+ __atomic_store_n(X, V, M); \
+} \
+T __impl_exchange_ ## N(_Atomic(T)* X, T const V, int M) { \
+ return __atomic_exchange_n(X, V, M); \
+} \
+_Bool __impl_compare_exchange_ ## N(_Atomic(T)* X, T* E, T const V, int MS, int MF) { \
+ return __atomic_compare_exchange_n(X, E, V, 0, MS, MF); \
+}
+
+
+#endif
diff --git a/src/internal/atomic_gcc_sync.h b/src/internal/atomic_gcc_sync.h
new file mode 100644
index 0000000..2d42a57
--- /dev/null
+++ b/src/internal/atomic_gcc_sync.h
@@ -0,0 +1,250 @@
+#ifndef _STDATOMIC_GCC_SYNC_H_
+#define _STDATOMIC_GCC_SYNC_H_ 1
+
+#define ATOMIC_VAR_INIT(...) { [0] = __VA_ARGS__, }
+#define atomic_init(X, V) ((void)((*(X))[0]=(V)))
+
+/* Map all non-explicit macros to the explicit version. */
+#define atomic_fetch_add(X, Y) atomic_fetch_add_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_sub(X, Y) atomic_fetch_sub_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_and(X, Y) atomic_fetch_and_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_or(X, Y) atomic_fetch_or_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_fetch_xor(X, Y) atomic_fetch_xor_explicit((X), (Y), memory_order_seq_cst)
+#define atomic_load(X) atomic_load_explicit((X), memory_order_seq_cst)
+#define atomic_store(X, V) atomic_store_explicit((X), (V), memory_order_seq_cst)
+#define atomic_exchange(X, V) atomic_exchange_explicit((X), (V), memory_order_seq_cst)
+#define atomic_compare_exchange_weak(X, E, V) atomic_compare_exchange_strong_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_strong(X, E, V) atomic_compare_exchange_strong_explicit((X), (E), (V), memory_order_seq_cst, memory_order_seq_cst)
+
+/* The argument X is supposed to be pointer to a one element array of
+ the base type. In evaluation context ``*(X)'' decays to a pointer
+ to the base type. In __typeof__ context we have to use
+ ``&(*(X))[0]'' for that. */
+#define atomic_fetch_add_explicit(X, Y, MO) __sync_fetch_and_add(*(X), (Y))
+#define atomic_fetch_sub_explicit(X, Y, MO) __sync_fetch_and_sub(*(X), (Y))
+#define atomic_fetch_and_explicit(X, Y, MO) __sync_fetch_and_and(*(X), (Y))
+#define atomic_fetch_or_explicit(X, Y, MO) __sync_fetch_and_or(*(X), (Y))
+#define atomic_fetch_xor_explicit(X, Y, MO) __sync_fetch_and_xor(*(X), (Y))
+
+#define atomic_compare_exchange_weak_explicit(X, E, D, MOS, MOF) atomic_compare_exchange_strong_explicit((X), (E), (D), (MOS), (MOF))
+
+#define INSTANTIATE_STUB_LF(N, T) \
+T __impl_fetch_add_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_add(&((*X)[0]), _V); \
+} \
+T __impl_fetch_sub_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_sub(&((*X)[0]), _V); \
+} \
+T __impl_fetch_and_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_and(&((*X)[0]), _V); \
+} \
+T __impl_fetch_xor_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_xor(&((*X)[0]), _V); \
+} \
+T __impl_fetch_or_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_fetch_and_or(&((*X)[0]), _V); \
+} \
+T __impl_add_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_add_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_sub_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_sub_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_and_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_and_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_xor_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_xor_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_or_fetch_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ return __sync_or_and_fetch(&((*X)[0]), _V); \
+} \
+T __impl_load_ ## N(__typeof__(T volatile[1])* X, int _mo) { \
+ return __sync_val_compare_and_swap(&((*X)[0]), 0, 0); \
+} \
+T __impl_exchange_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ T _r = _V, _e; \
+ do { \
+ _e = _r; \
+ _r = __sync_val_compare_and_swap(&((*X)[0]), _e, _V); \
+ } while (_r != _e); \
+ return _r; \
+} \
+void __impl_store_ ## N(__typeof__(T volatile[1])* X, T const _V, int _mo) { \
+ (void)__impl_exchange_ ## N(X, _V, _mo); \
+} \
+_Bool __impl_compare_exchange_ ## N(__typeof__(T volatile[1])* X, T* _E, T const _D, int _mos, int _mof) { \
+ T _v = *_E; \
+ T _n = __sync_val_compare_and_swap(&((*X)[0]), _v, _D); \
+ if (_v != _n) { \
+ *_E = _n; \
+ return 0; \
+ } \
+ return 1; \
+} \
+ INSTANTIATE_STUB_NAND(N, T)
+
+
+#define atomic_compare_exchange_strong_explicit(X, E, D, MOS, MOF) \
+({ \
+ _Bool ret; \
+ __typeof__((*X)[0])* _e = (E); \
+ __typeof__((*X)[0]) const _d = (D); \
+ switch (sizeof _d) { \
+ case 8: ret = __sync_bool_compare_and_swap((uint64_t*)(X), *_e, (D)); break; \
+ case 4: ret = __sync_bool_compare_and_swap((uint32_t*)(X), *_e, (D)); break; \
+ case 2: ret = __sync_bool_compare_and_swap((uint16_t*)(X), *_e, (D)); break; \
+ case 1: ret = __sync_bool_compare_and_swap((uint8_t*)(X), *_e, (D)); break; \
+ default: ret = __impl_compare_exchange(sizeof (*X), (void*)(X), _e, &_d, MOS, MOF); \
+ } \
+ __aret(ret); \
+ })
+
+#define __impl_union(T, X) union { __typeof__(*(X)) x; T t; }
+#define __impl_union2T(T, X) (((__impl_union(T, X)){ .x = (*(X)), }).t)
+#define __impl_union2X(T, X, V) (((__impl_union(T, X)){ .t = (V), }).x)
+
+#define __impl_load_union(T, X) \
+__impl_union2X(T, X, __sync_val_compare_and_swap((T*)X, 0, 0))
+
+#define __impl_exchange_union(T, X, V) \
+({ \
+ __impl_union(T, X) _V = { .t = (V), }; \
+ T _r = _V.t, _e; \
+ do { \
+ _e = _r; \
+ _r = __sync_val_compare_and_swap((T*)X, _e, _V.t); \
+ } while (_r != _e); \
+ __impl_union2X(T, X, _r); \
+ })
+
+#define __impl_store_union(T, X, V) \
+({ \
+ __impl_union(T, X) _V = { .t = (V), }; \
+ T _r = _V.t, _e; \
+ do { \
+ _e = _r; \
+ _r = __sync_val_compare_and_swap((T*)X, _e, _V.t); \
+ } while (_r != _e); \
+ })
+
+#define __impl_compare_exchange_union(T, X, E, V) \
+({ \
+ __typeof__(*E)* _e = (E); \
+ __impl_union(T, X) _V = { .x = (V), }; \
+ __impl_union(T, X) _E = { .x = *_e, }; \
+ __impl_union(T, X) _R = { .t = __sync_val_compare_and_swap((T*)X, _E.t, _V.t), }; \
+ _Bool _r = (_E.t == _R.t); \
+ if (!_r) *_e = _R.x; \
+ _r; \
+ })
+
+#define atomic_load_explicit(X, MO) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_load_union(__impl_uint128_t, &((*X)[0])), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_load_union(uint64_t, &((*X)[0])), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_load_union(uint32_t, &((*X)[0])), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_load_union(uint16_t, &((*X)[0])), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_load_union(uint8_t, &((*X)[0])), \
+ ({ \
+ __typeof__((*X)[0]) _r; \
+ __impl_load(sizeof _r, (void*)(&((*X)[0])), &_r, MO); \
+ _r; \
+ }))))))
+
+#define atomic_store_explicit(X, V, MO) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_store_union(__impl_uint128_t, &((*X)[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_store_union(uint64_t, &((*X)[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_store_union(uint32_t, &((*X)[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_store_union(uint16_t, &((*X)[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_store_union(uint8_t, &((*X)[0]), (V)), \
+ ({ \
+ __typeof__((*X)[0]) const _v = (V); \
+ __impl_store(sizeof _v, &((*X)[0]), &_v, MO); \
+ }))))))
+
+#define atomic_exchange_explicit(X, V, MO) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_exchange_union(__impl_uint128_t, &((*(X))[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_exchange_union(uint64_t, &((*(X))[0]), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_exchange_union(uint32_t, &((*(X))[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_exchange_union(uint16_t, &((*(X))[0]), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_exchange_union(uint8_t, &((*(X))[0]), (V)), \
+ ({ \
+ __typeof__((*X)[0]) const _v = (V); \
+ __typeof__((*X)[0]) _r = (V); \
+ __impl_exchange(sizeof _r, (&((*X)[0])), &_r, &_v, MO); \
+ _r; \
+ }))))))
+
+#define atomic_compare_exchange_explicit(X, E, V, MOS, MOF) \
+__builtin_choose_expr \
+( \
+ __UINT128__ && sizeof(*X)==16, \
+ __impl_compare_exchange_union(__impl_uint128_t, &((*(X))[0]), (E), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==8, \
+ __impl_compare_exchange_union(uint64_t, &((*(X))[0]), (E), (V)), \
+__builtin_choose_expr \
+( \
+ sizeof(*X)==4, \
+ __impl_compare_exchange_union(uint32_t, &((*(X))[0]), (E), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==2, \
+ __impl_compare_exchange_union(uint16_t, &((*(X))[0]), (E), (V)), \
+ __builtin_choose_expr \
+( \
+ sizeof(*X)==1, \
+ __impl_compare_exchange_union(uint8_t, &((*(X))[0]), (E), (V)), \
+ ({ \
+ __typeof__((*X)[0])* _e = (E); \
+ __typeof__((*X)[0]) const _v = (V); \
+ __impl_compare_exchange(sizeof _v, (&((*X)[0])), _e, &_v, MOS, MOF); \
+ }))))))
+
+#endif
diff --git a/src/internal/atomic_generic.h b/src/internal/atomic_generic.h
new file mode 100644
index 0000000..6ec6bb8
--- /dev/null
+++ b/src/internal/atomic_generic.h
@@ -0,0 +1,10 @@
+#ifndef _STDATOMIC_ATOMIC_GENERIC_H_
+#define _STDATOMIC_ATOMIC_GENERIC_H_ 1
+#include <stddef.h>
+void __impl_load (size_t size, void volatile* ptr, void volatile* ret, int mo);
+void __impl_store (size_t size, void volatile* ptr, void const volatile* val, int mo);
+void __impl_exchange (size_t size, void volatile*__restrict__ ptr, void const volatile* val, void volatile* ret, int mo);
+_Bool __impl_compare_exchange (size_t size, void volatile* ptr, void volatile* expected, void const volatile* desired, int mos, int mof);
+void __impl_print_stat(void);
+
+#endif
diff --git a/src/internal/atomic_stub.h b/src/internal/atomic_stub.h
new file mode 100644
index 0000000..c5e3329
--- /dev/null
+++ b/src/internal/atomic_stub.h
@@ -0,0 +1,208 @@
+#ifndef _STDATOMIC_STUB_H_
+#define _STDATOMIC_STUB_H_ 1
+
+#include <atomic_generic.h>
+
+#define INSTANTIATE_STUB_LCA(N, T) \
+T __impl_fetch_add_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E + V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_sub_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = -V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E - V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_and_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = 0; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E & V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_xor_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E ^ V; \
+ } \
+ return E; \
+} \
+T __impl_fetch_or_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = MO == memory_order_relaxed ? memory_order_relaxed : memory_order_consume; \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E | V; \
+ } \
+ return E; \
+} \
+T __impl_add_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E + V; \
+ } \
+ return R; \
+} \
+T __impl_sub_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = -V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E - V; \
+ } \
+ return R; \
+} \
+T __impl_and_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = 0; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E & V; \
+ } \
+ return R; \
+} \
+T __impl_xor_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E ^ V; \
+ } \
+ return R; \
+} \
+T __impl_or_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = V; \
+ int mof = MO == memory_order_relaxed ? memory_order_relaxed : memory_order_consume; \
+ while (!__impl_compare_exchange(N, X, &E, &R, MO, mof)){ \
+ R = E | V; \
+ } \
+ return R; \
+}
+
+#define INSTANTIATE_STUB_NAND(N, T) \
+T __impl_fetch_nand_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = ~0; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!atomic_compare_exchange_strong_explicit((_Atomic(T)*)X, &E, R, MO, mof)){ \
+ R = ~(E & V); \
+ } \
+ return E; \
+} \
+T __impl_nand_fetch_ ## N(void volatile* X, T const V, int MO) { \
+ T E = 0; \
+ T R = ~E; \
+ int mof = (MO == memory_order_relaxed \
+ ? memory_order_relaxed \
+ : memory_order_consume); \
+ while (!atomic_compare_exchange_strong_explicit((_Atomic(T)*)X, &E, R, MO, mof)){ \
+ R = ~(E & V); \
+ } \
+ return R; \
+}
+
+
+#define INSTANTIATE_STUB_LCM(N, T) \
+T __impl_load_ ## N(void volatile* X, int MO) { \
+ T ret; \
+ __impl_load(N, X, &ret, MO); \
+ return ret; \
+} \
+void __impl_store_ ## N(void volatile* X, T const V, int MO) { \
+ __impl_store(N, X, &V, MO); \
+} \
+T __impl_exchange_ ## N(void volatile* X, T const V, int MO) { \
+ T ret; \
+ __impl_exchange(N, X, &V, &ret, MO); \
+ return ret; \
+} \
+_Bool __impl_compare_exchange_ ## N(void volatile* X, T* E, T const V, int MOS, int MOf) { \
+ return __impl_compare_exchange(N, X, E, &V, MOS, MOf); \
+} \
+ INSTANTIATE_STUB_NAND(N, T)
+
+#define INSTANTIATE_STUB_LC(N, T) INSTANTIATE_STUB_LCA(N, T) INSTANTIATE_STUB_LCM(N, T)
+
+
+#define INSTANTIATE_SYNCA(N, T) \
+T __impl_fetch_and_add_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_add_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_sub_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_sub_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_and_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_and_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_or_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_or_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_fetch_and_xor_ ## N(void volatile* X, T const V) { \
+ return __impl_fetch_xor_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_add_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_add_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_sub_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_sub_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_and_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_and_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_or_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_or_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+} \
+T __impl_xor_and_fetch_ ## N(void volatile* X, T const V) { \
+ return __impl_xor_fetch_ ## N((_Atomic(T)*)X, V, memory_order_seq_cst); \
+}
+
+#define INSTANTIATE_SYNCM(N, T) \
+_Bool __impl_bool_compare_and_swap_ ## N(void volatile* X, T E, T const V) { \
+ T R = E; \
+ return __impl_compare_exchange_ ## N((_Atomic(T)*)X, &R, V, \
+ memory_order_seq_cst, memory_order_seq_cst); \
+} \
+T __impl_val_compare_and_swap_ ## N(void volatile* X, T E, T const V) { \
+ T R = E; \
+ __impl_compare_exchange_ ## N((_Atomic(T)*)X, &R, V, \
+ memory_order_seq_cst, memory_order_seq_cst); \
+ return R; \
+}
+
+#define INSTANTIATE_SYNC(N, T) INSTANTIATE_SYNCA(N, T) INSTANTIATE_SYNCM(N, T)
+
+#endif
diff --git a/src/internal/atomic_types.h b/src/internal/atomic_types.h
new file mode 100644
index 0000000..e873f35
--- /dev/null
+++ b/src/internal/atomic_types.h
@@ -0,0 +1,46 @@
+#ifndef _STDATOMIC_TYPES_H_
+#define _STDATOMIC_TYPES_H_ 1
+
+#include <wchar.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef _Atomic(_Bool) atomic_bool;
+typedef _Atomic(char) atomic_char;
+typedef _Atomic(int) atomic_int;
+typedef _Atomic(int_fast16_t) atomic_int_fast16_t;
+typedef _Atomic(int_fast32_t) atomic_int_fast32_t;
+typedef _Atomic(int_fast64_t) atomic_int_fast64_t;
+typedef _Atomic(int_fast8_t) atomic_int_fast8_t;
+typedef _Atomic(int_least16_t) atomic_int_least16_t;
+typedef _Atomic(int_least32_t) atomic_int_least32_t;
+typedef _Atomic(int_least64_t) atomic_int_least64_t;
+typedef _Atomic(int_least8_t) atomic_int_least8_t;
+typedef _Atomic(intmax_t) atomic_intmax_t;
+typedef _Atomic(intptr_t) atomic_intptr_t;
+typedef _Atomic(long long) atomic_llong;
+typedef _Atomic(long) atomic_long;
+typedef _Atomic(ptrdiff_t) atomic_ptrdiff_t;
+typedef _Atomic(short) atomic_short;
+typedef _Atomic(signed char) atomic_schar;
+typedef _Atomic(size_t) atomic_size_t;
+typedef _Atomic(uint_fast16_t) atomic_uint_fast16_t;
+typedef _Atomic(uint_fast32_t) atomic_uint_fast32_t;
+typedef _Atomic(uint_fast64_t) atomic_uint_fast64_t;
+typedef _Atomic(uint_fast8_t) atomic_uint_fast8_t;
+typedef _Atomic(uint_least16_t) atomic_char16_t;
+typedef _Atomic(uint_least16_t) atomic_uint_least16_t;
+typedef _Atomic(uint_least32_t) atomic_char32_t;
+typedef _Atomic(uint_least32_t) atomic_uint_least32_t;
+typedef _Atomic(uint_least64_t) atomic_uint_least64_t;
+typedef _Atomic(uint_least8_t) atomic_uint_least8_t;
+typedef _Atomic(uintmax_t) atomic_uintmax_t;
+typedef _Atomic(uintptr_t) atomic_uintptr_t;
+typedef _Atomic(unsigned char) atomic_uchar;
+typedef _Atomic(unsigned int) atomic_uint;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(unsigned long) atomic_ulong;
+typedef _Atomic(unsigned short) atomic_ushort;
+typedef _Atomic(wchar_t) atomic_wchar_t;
+
+#endif
diff --git a/src/internal/stdatomic-impl.h b/src/internal/stdatomic-impl.h
new file mode 100644
index 0000000..7aaa0d5
--- /dev/null
+++ b/src/internal/stdatomic-impl.h
@@ -0,0 +1,75 @@
+#ifndef _STDATOMIC_H_
+#define _STDATOMIC_H_ 1
+
+/* Copyright 2015, Jens Gustedt, France. */
+
+/**
+ ** @file
+ **
+ ** @brief A realization of the stdatomic.h interface by means of gcc
+ ** or clang compiler extensions.
+ **
+ ** This has three different realizations, using intrinsics for modern
+ ** clang (__c11_atomic ...), modern gcc (__atomic ...) or for the old
+ ** gcc __sync interface. The latter should be available on a lot of
+ ** platforms; many other compilers, including clang, implement these
+ ** interfaces.
+ **
+ ** For the first two, user code should be able to use all C11 atomic
+ ** features without problems.
+ **
+ ** For the __sync interface, we can't assume that there is support
+ ** for operators on atomics, so such code should simply not use
+ ** them. But the "functional" approach to atomics should work even
+ ** then. That is code that uses the _Atomic() variant to declare
+ ** atomic objects and only uses the atomic_... macros as of the C11
+ ** standard to act upon these objects should work.
+ **
+ ** The sync code also depends on a lot of other gcc extensions to C:
+ **
+ ** - compound expressions
+ ** - __typeof__
+ ** - __alignof__
+ ** - __attribute__((aligned(something)))
+ **/
+
+
+#include <atomic_constants.h>
+#include <atomic_flag.h>
+#include <atomic_fence.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+
+/* In some places we need a type that is almost the same as base type
+ T, but
+
+ - returns a pointer to T in evaluation context
+ - can't be assigned to
+
+ T can be a type or an expression.
+*/
+#define __atyp(T) __typeof__(__typeof__(T)[1])
+
+/* To evaluate expressions we sometimes need temporaries of that type
+ with a certain value. */
+#define __atmp(T, V) (__atyp(T)){ [0] = (V), }
+
+/* When evaluating lvalues in gcc's compound expressions to return a
+ value, we want to take care that these lvalues can't be
+ accidentally be subject to the & operator. Force it to be an
+ rvalue. */
+#define __aret(V) (1 ? (V) : (V))
+
+
+#ifdef __ATOMIC_FORCE_SYNC
+#include <atomic_gcc_sync.h>
+#elif defined(__clang__)
+#include <atomic_clang_c11.h>
+#else
+#include <atomic_gcc_atomic.h>
+#endif
+
+#include <atomic_types.h>
+
+#endif
diff --git a/src/stdatomic/atomic_fence.c b/src/stdatomic/atomic_fence.c
new file mode 100644
index 0000000..996286f
--- /dev/null
+++ b/src/stdatomic/atomic_fence.c
@@ -0,0 +1,9 @@
+#include "atomic_fence.h"
+
+void (atomic_thread_fence)(memory_order mo) {
+ atomic_thread_fence(mo);
+}
+
+void (atomic_signal_fence)(memory_order mo) {
+ atomic_signal_fence(mo);
+}
diff --git a/src/stdatomic/atomic_flag.c b/src/stdatomic/atomic_flag.c
new file mode 100644
index 0000000..a27fbe3
--- /dev/null
+++ b/src/stdatomic/atomic_flag.c
@@ -0,0 +1,17 @@
+#include "atomic_flag.h"
+
+_Bool (atomic_flag_test_and_set)(volatile atomic_flag* f) {
+ return atomic_flag_test_and_set(f);
+}
+
+_Bool (atomic_flag_test_and_set_explicit)(volatile atomic_flag* f, memory_order mo) {
+ return atomic_flag_test_and_set_explicit(f, mo);
+}
+
+void (atomic_flag_clear)(volatile atomic_flag* f) {
+ atomic_flag_clear(f);
+}
+
+void (atomic_flag_clear_explicit)(volatile atomic_flag* f, memory_order mo) {
+ atomic_flag_clear_explicit(f, mo);
+}
diff --git a/src/stdatomic/atomic_futex_lock.c b/src/stdatomic/atomic_futex_lock.c
new file mode 100644
index 0000000..ded9eb0
--- /dev/null
+++ b/src/stdatomic/atomic_futex_lock.c
@@ -0,0 +1,88 @@
+#include "pthread_impl.h"
+#include "stdatomic-impl.h"
+
+/* The HO bit. */
+static unsigned const lockbit = (UINT_MAX/2u)+1u;
+static unsigned const contrib = (UINT_MAX/2u)+2u;
+
+size_t __impl_total = 0;
+size_t __impl_fast = 0;
+size_t __impl_slow = 0;
+size_t __impl_futex = 0;
+size_t __impl_again = 0;
+size_t __impl_spin = 0;
+
+#ifdef BENCH
+# define ACCOUNT(X, V) (X) += (V)
+#else
+# define ACCOUNT(X, V) do { } while(0)
+#endif
+
+void __impl_mut_lock_slow(_Atomic(unsigned)* loc)
+{
+#ifdef BENCH
+ size_t slow = 0;
+ size_t futex = 0;
+ size_t again = 0;
+ size_t spin = 0;
+#endif
+ register unsigned spins = 0;
+ unsigned val = 1+atomic_fetch_add_explicit(loc, 1, memory_order_relaxed);
+ if (!(val & lockbit)) goto BIT_UNSET;
+ /* The lock acquisition loop. This has been designed such that the
+ only possible change that is done inside that loop is setting
+ the lock bit. This has a double objective. First all atomic
+ operations are expensive and doing a pair of ++ and -- inside
+ the loop would just waste memory bandwidth. Then, less changes
+ to the count, means that other threads that are inside this
+ same loop are less perturbed. */
+ for (;;) {
+ /* The lock bit is set by someone else, wait until it is
+ unset. */
+ for (spins = 0; spins < 10; ++spins) {
+ a_spin();
+ /* be optimistic and hope that the lock has been released */
+ register unsigned des = val-1;
+ val -= contrib;
+ if (atomic_compare_exchange_strong_explicit(loc, &val, des, memory_order_acq_rel, memory_order_consume))
+ goto FINISH;
+ if (!(val & lockbit)) goto BIT_UNSET;
+ }
+ /* The same inner loop as before, but with futex wait instead of
+ a_spin. */
+ for (;;) {
+ ACCOUNT(futex, 1);
+ if (__syscall(SYS_futex, loc, FUTEX_WAIT|FUTEX_PRIVATE, val, 0) == -EAGAIN)
+ ACCOUNT(again, 1);
+ /* be optimistic and hope that the lock has been released */
+ register unsigned des = val-1;
+ val -= contrib;
+ if (atomic_compare_exchange_strong_explicit(loc, &val, des, memory_order_acq_rel, memory_order_consume))
+ goto FINISH;
+ if (!(val & lockbit)) goto BIT_UNSET;
+ }
+ /* The lock bit isn't set, try to acquire it. */
+ BIT_UNSET:
+ ACCOUNT(spin, spins);
+ ACCOUNT(slow, 1);
+ do {
+ a_spin();
+ if (atomic_compare_exchange_strong_explicit(loc, &val, val|lockbit, memory_order_acq_rel, memory_order_consume))
+ goto FINISH;
+ } while (!(val & lockbit));
+ }
+ FINISH:
+#ifdef BENCH
+ __impl_total += 1;
+ __impl_slow += slow;
+ __impl_futex += futex;
+ __impl_again += again;
+ __impl_spin += spin;
+#endif
+ return;
+}
+
+void __impl_mut_unlock_slow(_Atomic(unsigned)* loc)
+{
+ __syscall(SYS_futex, loc, FUTEX_WAKE|FUTEX_PRIVATE, 1);
+}
diff --git a/src/stdatomic/atomic_generic.c b/src/stdatomic/atomic_generic.c
new file mode 100644
index 0000000..c51e4dc
--- /dev/null
+++ b/src/stdatomic/atomic_generic.c
@@ -0,0 +1,242 @@
+#include <limits.h>
+#include <string.h>
+#include "stdatomic-impl.h"
+#include "atomic_generic.h"
+#include "libc.h"
+
+#ifdef HASH_STAT
+# include <math.h>
+# include <stdio.h>
+#endif
+
+/* This is compatible with musl's internal locks. */
+/* The lock itself must be lock-free, so in general it can only be an
+ atomic_flag if we know nothing else about the platform. */
+
+typedef _Atomic(unsigned) __impl_lock;
+void __impl_mut_lock_slow(_Atomic(unsigned)* loc);
+void __impl_mut_unlock_slow(_Atomic(unsigned)* loc);
+
+static unsigned const contrib = ((UINT_MAX/2u)+2u);
+
+__attribute__((__always_inline__))
+static inline
+void __impl_mut_lock(_Atomic(unsigned)* loc)
+{
+ if (!atomic_compare_exchange_strong_explicit(loc, (unsigned[1]){ 0 }, contrib, memory_order_acq_rel, memory_order_consume))
+ __impl_mut_lock_slow(loc);
+}
+
+__attribute__((__always_inline__))
+static inline
+void __impl_mut_unlock(_Atomic(unsigned)* loc)
+{
+ if (contrib != atomic_fetch_sub_explicit(loc, contrib, memory_order_release))
+ __impl_mut_unlock_slow(loc);
+}
+
+/* the size of this table has a trade off between the probability of
+ collisions (the bigger the table, the better) and the waste of
+ space (the smaller, the better). */
+
+#ifndef HBIT
+# define HBIT 8
+#endif
+/* len is a power of two such that we just can mask out higher bits */
+enum { LEN = 1<<HBIT, };
+enum { ptrbit = sizeof(uintptr_t)*CHAR_BIT, };
+
+static __impl_lock table[LEN];
+
+#ifdef HASH_STAT
+static _Atomic(size_t) draw[LEN];
+#endif
+
+/* Chose a medium sized prime number as a factor. The multiplication
+ by it is a bijection modulo any LEN. */
+#define MAGIC 14530039U
+
+__attribute__((__unused__))
+static
+unsigned __impl_hash(void volatile const* X) {
+ uintptr_t const len = LEN;
+ uintptr_t x = (uintptr_t)X;
+ x *= MAGIC;
+ /* Be sure to use all bits in the result. */
+ if (ptrbit > 8*HBIT) x ^= (x / (len*len*len*len*len*len*len*len));
+ if (ptrbit > 4*HBIT) x ^= (x / (len*len*len*len));
+ if (ptrbit > 2*HBIT) x ^= (x / (len*len));
+ if (ptrbit > 1*HBIT) x ^= (x / len);
+ x %= len;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[x], 1, memory_order_relaxed);
+#endif
+ return x;
+}
+
+__attribute__((__unused__))
+static
+unsigned __impl_jenkins_one_at_a_time_hash(void volatile const* k) {
+ union {
+ unsigned char b[sizeof k];
+ uintptr_t v;
+ void volatile const* k;
+ } key = { .k = k, };
+ uintptr_t i, x = 0;
+ for(i = 0; i < sizeof(uintptr_t); ++i) {
+ x += key.b[i];
+ x += (x << 10);
+ x ^= (x >> 6);
+ }
+ x += (x << 3);
+ x ^= (x >> 11);
+ x += (x << 15);
+ x %= LEN;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[x], 1, memory_order_relaxed);
+#endif
+ return x;
+}
+
+__attribute__((__unused__))
+static
+uintptr_t __impl_mix(void volatile const* x) {
+ uintptr_t h = (uintptr_t)x;
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+ h %= LEN;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[h], 1, memory_order_relaxed);
+#endif
+ return h;
+}
+
+__attribute__((__unused__))
+static
+uintptr_t __impl_8(void volatile const* x) {
+ uintptr_t h = (uintptr_t)x;
+ h ^= (h >> 8);
+ h %= LEN;
+#ifdef HASH_STAT
+ atomic_fetch_add_explicit(&draw[h], 1, memory_order_relaxed);
+#endif
+ return h;
+}
+
+#ifndef __ATOMIC_HASH
+# define __ATOMIC_HASH __impl_hash
+#endif
+#define hash __ATOMIC_HASH
+
+
+void __impl_load (size_t size, void volatile* ptr, void volatile* ret, int mo) {
+ unsigned pos = hash(ptr);
+ __impl_mut_lock(&table[pos]);
+ if (mo == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ memcpy((void*)ret, (void*)ptr, size);
+ __impl_mut_unlock(&table[pos]);
+}
+
+void __impl_store (size_t size, void volatile* ptr, void const volatile* val, int mo) {
+ unsigned pos = hash(ptr);
+ __impl_mut_lock(&table[pos]);
+ memcpy((void*)ptr, (void*)val, size);
+ if (mo == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ __impl_mut_unlock(&table[pos]);
+}
+
+static
+void atomic_exchange_restrict (size_t size, void volatile*__restrict__ ptr, void const volatile*__restrict__ val, void volatile*__restrict__ ret, int mo) {
+ unsigned pos = hash(ptr);
+ __impl_mut_lock(&table[pos]);
+ memcpy((void*)ret, (void*)ptr, size);
+ if (mo == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ memcpy((void*)ptr, (void*)val, size);
+ __impl_mut_unlock(&table[pos]);
+}
+
+void __impl_exchange (size_t size, void volatile*__restrict__ ptr, void const volatile* val, void volatile* ret, int mo) {
+ if (val == ret) {
+ unsigned char buffer[size];
+ atomic_exchange_restrict(size, ptr, val, buffer, mo);
+ memcpy((void*)ret, buffer, size);
+ } else {
+ atomic_exchange_restrict(size, ptr, val, ret, mo);
+ }
+}
+
+_Bool __impl_compare_exchange (size_t size, void volatile* ptr, void volatile* expected, void const volatile* desired, int mos, int mof) {
+ unsigned pos = hash(ptr);
+ __impl_mut_lock(&table[pos]);
+ _Bool ret = !memcmp((void*)ptr, (void*)expected, size);
+ if (ret) {
+ memcpy((void*)ptr, (void*)desired, size);
+ if (mos == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ } else {
+ if (mof == memory_order_seq_cst)
+ atomic_thread_fence(memory_order_seq_cst);
+ memcpy((void*)expected, (void*)ptr, size);
+ }
+ __impl_mut_unlock(&table[pos]);
+ /* fprintf(stderr, "cas for %p (%zu) at pos %u, %s, exp %p, des %p\n", */
+ /* ptr, size, pos, ret ? "succeeded" : "failed", */
+ /* expected, desired); */
+ return ret;
+}
+
+/* To collect hash statistics about atomics, compile with
+ ``HASH_STAT'' */
+void __impl_print_stat(void) {
+#ifdef HASH_STAT
+ size_t x1 = 0;
+ size_t x2 = 0;
+ size_t min = -1;
+ size_t max = 0;
+ size_t i;
+ for (i = 0; i < LEN; i++) {
+ size_t val = atomic_load(&draw[i]);
+ fprintf(stderr, "\t%zu", val);
+ if ((i % 8) == 7) fputc('\n', stderr);
+ x1 += val;
+ x2 += val*val;
+ if (val < min) min = val;
+ if (val > max) max = val;
+ }
+ fputc('\n', stderr);
+ double avg1 = (x1+0.0)/LEN;
+ double avg2 = (x2+0.0)/LEN;
+ double var = avg2 - avg1*avg1;
+ fprintf(stderr, "hash utilisation, %d positions, %zu draws: %zu < %e (+%e) < %zu\n",
+ LEN, x1, min, avg1, sqrt(var), max);
+#endif
+}
+
+/* For the four functions defined here, we need two entries in the
+ symbol table. One will be the final link name of the replacement
+ stub, something like __atomic_load, e.g. The other one is the
+ __impl prefixed name. It may eventually be used by the fixed-sized
+ stub functions, since these can't use the name that corresponds to
+ the builtin.
+
+ The replacement to the final name is not done within compiling,
+ since the name clashes can only create conflicts for a C
+ compiler. Instead, we use an external tool (objcopy) that does the
+ renaming.
+
+ We want these to be strong aliases, so they can't accidentally be
+ replaced. Therefore we can't use musl's weak_alias macro but create
+ one of our own. */
+
+#define alias(X, Y) __attribute__((__alias__(#X))) __typeof__(X) Y
+
+alias(__impl_load, __impl_load_replace);
+alias(__impl_store, __impl_store_replace);
+alias(__impl_exchange, __impl_exchange_replace);
+alias(__impl_compare_exchange, __impl_compare_exchange_replace);
diff --git a/src/stdatomic/atomic_generic_1.c b/src/stdatomic/atomic_generic_1.c
new file mode 100644
index 0000000..722657b
--- /dev/null
+++ b/src/stdatomic/atomic_generic_1.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+INSTANTIATE_STUB_LF(1, uint8_t)
+#else
+INSTANTIATE_STUB_LC(1, uint8_t)
+#endif
+
+INSTANTIATE_SYNC(1, uint8_t)
diff --git a/src/stdatomic/atomic_generic_16.c b/src/stdatomic/atomic_generic_16.c
new file mode 100644
index 0000000..ac5105f
--- /dev/null
+++ b/src/stdatomic/atomic_generic_16.c
@@ -0,0 +1,15 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
+INSTANTIATE_STUB_LF(16, __impl_uint128_t)
+INSTANTIATE_SYNC(16, __impl_uint128_t)
+#else
+INSTANTIATE_STUB_LCM(16, __impl_uint128_t)
+INSTANTIATE_SYNCM(16, __impl_uint128_t)
+# if __SIZEOF_INT128__
+INSTANTIATE_STUB_LCA(16, __impl_uint128_t)
+INSTANTIATE_SYNCA(16, __impl_uint128_t)
+# endif
+#endif
+
diff --git a/src/stdatomic/atomic_generic_2.c b/src/stdatomic/atomic_generic_2.c
new file mode 100644
index 0000000..a8244f2
--- /dev/null
+++ b/src/stdatomic/atomic_generic_2.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
+INSTANTIATE_STUB_LF(2, uint16_t)
+#else
+INSTANTIATE_STUB_LC(2, uint16_t)
+#endif
+
+INSTANTIATE_SYNC(2, uint16_t)
diff --git a/src/stdatomic/atomic_generic_4.c b/src/stdatomic/atomic_generic_4.c
new file mode 100644
index 0000000..7b1693f
--- /dev/null
+++ b/src/stdatomic/atomic_generic_4.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+INSTANTIATE_STUB_LF(4, uint32_t)
+#else
+INSTANTIATE_STUB_LC(4, uint32_t)
+#endif
+
+INSTANTIATE_SYNC(4, uint32_t)
diff --git a/src/stdatomic/atomic_generic_8.c b/src/stdatomic/atomic_generic_8.c
new file mode 100644
index 0000000..d652497
--- /dev/null
+++ b/src/stdatomic/atomic_generic_8.c
@@ -0,0 +1,10 @@
+
+#include "stdatomic-impl.h"
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+INSTANTIATE_STUB_LF(8, uint64_t)
+#else
+INSTANTIATE_STUB_LC(8, uint64_t)
+#endif
+
+INSTANTIATE_SYNC(8, uint64_t)
diff --git a/src/stdatomic/redefine_syms.sh b/src/stdatomic/redefine_syms.sh
new file mode 100755
index 0000000..31740c5
--- /dev/null
+++ b/src/stdatomic/redefine_syms.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+objects=$(ls *.o)
+
+for obj in ${objects} ; do
+ objcopy --redefine-syms=redefine_syms.txt ${obj} tmp.o
+ mv tmp.o ${obj}
+ objcopy --redefine-syms=redefine_syms.txt ${obj%.o}.lo tmp.o
+ mv tmp.o ${obj%.o}.lo
+done
diff --git a/src/stdatomic/redefine_syms.txt b/src/stdatomic/redefine_syms.txt
new file mode 100644
index 0000000..5197a49
--- /dev/null
+++ b/src/stdatomic/redefine_syms.txt
@@ -0,0 +1,144 @@
+__impl_load_1 __atomic_load_1
+__impl_store_1 __atomic_store_1
+__impl_exchange_1 __atomic_exchange_1
+__impl_compare_exchange_1 __atomic_compare_exchange_1
+__impl_fetch_add_1 __atomic_fetch_add_1
+__impl_fetch_sub_1 __atomic_fetch_sub_1
+__impl_fetch_and_1 __atomic_fetch_and_1
+__impl_fetch_xor_1 __atomic_fetch_xor_1
+__impl_fetch_nand_1 __atomic_fetch_nand_1
+__impl_fetch_or_1 __atomic_fetch_or_1
+__impl_add_fetch_1 __atomic_add_fetch_1
+__impl_sub_fetch_1 __atomic_sub_fetch_1
+__impl_and_fetch_1 __atomic_and_fetch_1
+__impl_xor_fetch_1 __atomic_xor_fetch_1
+__impl_nand_fetch_1 __atomic_nand_fetch_1
+__impl_or_fetch_1 __atomic_or_fetch_1
+__impl_load_2 __atomic_load_2
+__impl_store_2 __atomic_store_2
+__impl_exchange_2 __atomic_exchange_2
+__impl_compare_exchange_2 __atomic_compare_exchange_2
+__impl_fetch_add_2 __atomic_fetch_add_2
+__impl_fetch_sub_2 __atomic_fetch_sub_2
+__impl_fetch_and_2 __atomic_fetch_and_2
+__impl_fetch_xor_2 __atomic_fetch_xor_2
+__impl_fetch_nand_2 __atomic_fetch_nand_2
+__impl_fetch_or_2 __atomic_fetch_or_2
+__impl_add_fetch_2 __atomic_add_fetch_2
+__impl_sub_fetch_2 __atomic_sub_fetch_2
+__impl_and_fetch_2 __atomic_and_fetch_2
+__impl_xor_fetch_2 __atomic_xor_fetch_2
+__impl_nand_fetch_2 __atomic_nand_fetch_2
+__impl_or_fetch_2 __atomic_or_fetch_2
+__impl_load_4 __atomic_load_4
+__impl_store_4 __atomic_store_4
+__impl_exchange_4 __atomic_exchange_4
+__impl_compare_exchange_4 __atomic_compare_exchange_4
+__impl_fetch_add_4 __atomic_fetch_add_4
+__impl_fetch_sub_4 __atomic_fetch_sub_4
+__impl_fetch_and_4 __atomic_fetch_and_4
+__impl_fetch_xor_4 __atomic_fetch_xor_4
+__impl_fetch_nand_4 __atomic_fetch_nand_4
+__impl_fetch_or_4 __atomic_fetch_or_4
+__impl_add_fetch_4 __atomic_add_fetch_4
+__impl_sub_fetch_4 __atomic_sub_fetch_4
+__impl_and_fetch_4 __atomic_and_fetch_4
+__impl_xor_fetch_4 __atomic_xor_fetch_4
+__impl_nand_fetch_4 __atomic_nand_fetch_4
+__impl_or_fetch_4 __atomic_or_fetch_4
+__impl_load_8 __atomic_load_8
+__impl_store_8 __atomic_store_8
+__impl_exchange_8 __atomic_exchange_8
+__impl_compare_exchange_8 __atomic_compare_exchange_8
+__impl_fetch_add_8 __atomic_fetch_add_8
+__impl_fetch_sub_8 __atomic_fetch_sub_8
+__impl_fetch_and_8 __atomic_fetch_and_8
+__impl_fetch_xor_8 __atomic_fetch_xor_8
+__impl_fetch_nand_8 __atomic_fetch_nand_8
+__impl_fetch_or_8 __atomic_fetch_or_8
+__impl_add_fetch_8 __atomic_add_fetch_8
+__impl_sub_fetch_8 __atomic_sub_fetch_8
+__impl_and_fetch_8 __atomic_and_fetch_8
+__impl_xor_fetch_8 __atomic_xor_fetch_8
+__impl_nand_fetch_8 __atomic_nand_fetch_8
+__impl_or_fetch_8 __atomic_or_fetch_8
+__impl_load_16 __atomic_load_16
+__impl_store_16 __atomic_store_16
+__impl_exchange_16 __atomic_exchange_16
+__impl_compare_exchange_16 __atomic_compare_exchange_16
+__impl_fetch_add_16 __atomic_fetch_add_16
+__impl_fetch_sub_16 __atomic_fetch_sub_16
+__impl_fetch_and_16 __atomic_fetch_and_16
+__impl_fetch_xor_16 __atomic_fetch_xor_16
+__impl_fetch_nand_16 __atomic_fetch_nand_16
+__impl_fetch_or_16 __atomic_fetch_or_16
+__impl_add_fetch_16 __atomic_add_fetch_16
+__impl_sub_fetch_16 __atomic_sub_fetch_16
+__impl_and_fetch_16 __atomic_and_fetch_16
+__impl_xor_fetch_16 __atomic_xor_fetch_16
+__impl_nand_fetch_16 __atomic_nand_fetch_16
+__impl_or_fetch_16 __atomic_or_fetch_16
+__impl_bool_compare_and_swap_1 __sync_bool_compare_and_swap_1
+__impl_val_compare_and_swap_1 __sync_val_compare_and_swap_1
+__impl_fetch_and_add_1 __sync_fetch_and_add_1
+__impl_fetch_and_sub_1 __sync_fetch_and_sub_1
+__impl_fetch_and_and_1 __sync_fetch_and_and_1
+__impl_fetch_and_xor_1 __sync_fetch_and_xor_1
+__impl_fetch_and_or_1 __sync_fetch_and_or_1
+__impl_add_and_fetch_1 __sync_add_and_fetch_1
+__impl_sub_and_fetch_1 __sync_sub_and_fetch_1
+__impl_and_and_fetch_1 __sync_and_and_fetch_1
+__impl_xor_and_fetch_1 __sync_xor_and_fetch_1
+__impl_or_and_fetch_1 __sync_or_and_fetch_1
+__impl_bool_compare_and_swap_2 __sync_bool_compare_and_swap_2
+__impl_val_compare_and_swap_2 __sync_val_compare_and_swap_2
+__impl_fetch_and_add_2 __sync_fetch_and_add_2
+__impl_fetch_and_sub_2 __sync_fetch_and_sub_2
+__impl_fetch_and_and_2 __sync_fetch_and_and_2
+__impl_fetch_and_xor_2 __sync_fetch_and_xor_2
+__impl_fetch_and_or_2 __sync_fetch_and_or_2
+__impl_add_and_fetch_2 __sync_add_and_fetch_2
+__impl_sub_and_fetch_2 __sync_sub_and_fetch_2
+__impl_and_and_fetch_2 __sync_and_and_fetch_2
+__impl_xor_and_fetch_2 __sync_xor_and_fetch_2
+__impl_or_and_fetch_2 __sync_or_and_fetch_2
+__impl_bool_compare_and_swap_4 __sync_bool_compare_and_swap_4
+__impl_val_compare_and_swap_4 __sync_val_compare_and_swap_4
+__impl_fetch_and_add_4 __sync_fetch_and_add_4
+__impl_fetch_and_sub_4 __sync_fetch_and_sub_4
+__impl_fetch_and_and_4 __sync_fetch_and_and_4
+__impl_fetch_and_xor_4 __sync_fetch_and_xor_4
+__impl_fetch_and_or_4 __sync_fetch_and_or_4
+__impl_add_and_fetch_4 __sync_add_and_fetch_4
+__impl_sub_and_fetch_4 __sync_sub_and_fetch_4
+__impl_and_and_fetch_4 __sync_and_and_fetch_4
+__impl_xor_and_fetch_4 __sync_xor_and_fetch_4
+__impl_or_and_fetch_4 __sync_or_and_fetch_4
+__impl_bool_compare_and_swap_8 __sync_bool_compare_and_swap_8
+__impl_val_compare_and_swap_8 __sync_val_compare_and_swap_8
+__impl_fetch_and_add_8 __sync_fetch_and_add_8
+__impl_fetch_and_sub_8 __sync_fetch_and_sub_8
+__impl_fetch_and_and_8 __sync_fetch_and_and_8
+__impl_fetch_and_xor_8 __sync_fetch_and_xor_8
+__impl_fetch_and_or_8 __sync_fetch_and_or_8
+__impl_add_and_fetch_8 __sync_add_and_fetch_8
+__impl_sub_and_fetch_8 __sync_sub_and_fetch_8
+__impl_and_and_fetch_8 __sync_and_and_fetch_8
+__impl_xor_and_fetch_8 __sync_xor_and_fetch_8
+__impl_or_and_fetch_8 __sync_or_and_fetch_8
+__impl_bool_compare_and_swap_16 __sync_bool_compare_and_swap_16
+__impl_val_compare_and_swap_16 __sync_val_compare_and_swap_16
+__impl_fetch_and_add_16 __sync_fetch_and_add_16
+__impl_fetch_and_sub_16 __sync_fetch_and_sub_16
+__impl_fetch_and_and_16 __sync_fetch_and_and_16
+__impl_fetch_and_xor_16 __sync_fetch_and_xor_16
+__impl_fetch_and_or_16 __sync_fetch_and_or_16
+__impl_add_and_fetch_16 __sync_add_and_fetch_16
+__impl_sub_and_fetch_16 __sync_sub_and_fetch_16
+__impl_and_and_fetch_16 __sync_and_and_fetch_16
+__impl_xor_and_fetch_16 __sync_xor_and_fetch_16
+__impl_or_and_fetch_16 __sync_or_and_fetch_16
+__impl_load_replace __atomic_load
+__impl_store_replace __atomic_store
+__impl_exchange_replace __atomic_exchange
+__impl_compare_exchange_replace __atomic_compare_exchange
diff --git a/src/thread/__lock1.c b/src/thread/__lock1.c
new file mode 100644
index 0000000..1a7123d
--- /dev/null
+++ b/src/thread/__lock1.c
@@ -0,0 +1,79 @@
+#include "pthread_impl.h"
+
+#if INT_MIN == -INT_MAX
+# error "this implementation supposes that INT_MIN has only the HO bit set and all others 0"
+#endif
+
+int volatile __cnt_slow = 0;
+int volatile __cnt_futex = 0;
+int volatile __cnt_again = 0;
+
+#ifndef NBENCH
+# define ACCOUNT(X) a_inc(&(X))
+#else
+# define ACCOUNT(X) do { } while(0)
+#endif
+
+/* A lock is an int that is interpreted specially. The HO bit tells if
+ the lock is already taken by some thread or not. The other bits are
+ a counter for the number of threads that are inside the critical section.
+
+ If loc is non-negative it holds the number of threads that entered
+ the critical section.
+
+ If loc is negative (the lock is taken) *loc - INT_MIN is the number
+ of threads in the critical section, including the lock holder. Or in
+ other words the number is *loc where the sign bit is zeroed. */
+
+void __lock1(int volatile* loc)
+{
+ if (libc.threads_minus_1) {
+ int spins;
+ /* fast path */
+ /* -INT_MAX is binary 10000…00001 */
+ int val = a_cas(loc, 0, -INT_MAX);
+ if (!val) return;
+ val = 1+a_fetch_add(loc, 1);
+ /* The lock acquisition loop. This has been designed such that the
+ only possible change that is done inside that loop is setting
+ the lock bit. This has a double objective. First all atomic
+ operations are expensive and doing a pair of ++ and -- inside
+ the loop would just waste memory bandwidth. Then, less changes
+ to the count, means that other threads that are inside this
+ same loop are less perturbed. */
+ for (;;) {
+ /* The lock bit isn't yet set. */
+ NOLOCK:
+ while (val >= 0) {
+ /* This just sets the sign bit without causing overflow. */
+ int other = (val-INT_MAX)-1;
+ other = a_cas(loc, val, other);
+ if (other == val) return;
+ else val = other;
+ }
+ /* The lock bit is set by someone else. */
+ ACCOUNT(__cnt_slow);
+ for (spins = 0; spins < 100; ++spins) {
+ a_spin();
+ val = *loc;
+ if (val >= 0) goto NOLOCK;
+ }
+ while (val < 0) {
+ ACCOUNT(__cnt_futex);
+ if (__syscall(SYS_futex, loc, FUTEX_WAIT|FUTEX_PRIVATE, val, 0) == -EAGAIN)
+ ACCOUNT(__cnt_again);
+ val = *loc;
+ }
+ }
+ }
+}
+
+void __unlock1(int volatile* loc)
+{
+ if (*loc < 0) {
+ /** *loc must be negative when unlock1 is called, so INT_MAX can
+ always be added without overflow. */
+ if (-INT_MAX != a_fetch_add(loc, INT_MAX))
+ __syscall(SYS_futex, loc, FUTEX_WAKE|FUTEX_PRIVATE, 1);
+ }
+}
[-- Attachment #1.3: README.org --]
[-- Type: text/plain, Size: 26842 bytes --]
#+TITLE: An implementation of the C11 =<stdatomic.h>= interface
#+AUTHOR: Jens Gustedt
#+DATE: July, 2015
#+LATEX_HEADER: \usepackage{color}
#+LATEX_HEADER: \usepackage{listings}
#+LATEX_HEADER: \lstset{
#+LATEX_HEADER: keywordstyle=\color{blue},
#+LATEX_HEADER: commentstyle=\color{red},
#+LATEX_HEADER: stringstyle=\color{green},
#+LATEX_HEADER: basicstyle=\ttfamily\small,
#+LATEX_HEADER: columns=fullflexible,
#+LATEX_HEADER: frame=single,
#+LATEX_HEADER: basewidth={0.4em,0.4em},
#+LATEX_HEADER: }
* Short description
The implementation of the C11 atomic interface typically sits
between the implementation of the core language by the C compiler
and the implementation of the C library. It needs compiler support
for the individual atomic operations and library supports for the
cases where no low-level atomic instruction is available and a lock
must be taken.
- This implementation builds entirely on the two gcc ABIs for
atomics. It doesn't even attempt to go down to assembly level by
itself.
- We provide all function interfaces that the two gcc ABIs and the
C standard need.
- For compilers that don't offer the direct language support for
atomics this provides a reduced but fully functional approach to
atomic operations.
* Implemented library features
We distinguish between the implementation of library functions and
the =<stdatomic.h>= header file.
The latter already contains a lot of intelligence, because it has
to do type generic stuff. This is more involved than usual C
library header files.
The header file is optional, the created function interface should
be compatible with the header files that gcc and clang may provide.
** Type, constants and function interfaces
These are the types and proper functions that are foreseen by the
standard:
- =atomic_flag= and its four functions
- the =memory_order= enumeration type
- fences
- object-like macros to test for lock-freeness and similar things
- =typedef= for atomic integer and pointer types.
All of these are provided in forms that are compatible with gcc and
clang.
** Type generic functions
These are all implemented as macros, and should in many cases
result in optimized inlined assembler instructions, and not in
library calls. Library calls are only needed as fall back, when
there is no reasonable instruction set available.
This implementation uses predefined macros of the form
=__GCC_HAVE_SYNC_COMPARE_AND_SWAP_X=
where =X= can be one of 1, 2, 4, 8 or 16. All versions of gcc and
clang since at least ten years implement these macros and the
underlying operations consistently.
If that macro exists, we suppose that the compiler is able to
synthesize all corresponding memory functions for types of size =X=
and all necessary arithmetic function for integer types of that
size, as lock-free (= stateless) instructions.
This doesn't mean that there are direct assembler instruction for
all these operations. They can well be implemented as an unbounded
loop that uses a =compare_and_swap= (CAS) primitive for atomic
exchange. Gcc typically does this for the less common atomic
arithmetic instructions such as =atomic_fetch_and=, for
example. Lock-free doesn't mean a bounded number of instructions.
For the operations that cannot be mapped to assembler instructions
the compiler inserts calls to external functions. The names for
these functions are typically composed of the operation and
prefixed either by =__sync_= (the older gcc ABI) or =__atomic_=
(the newer gcc ABI). The names of these calls can be suffixed by
=_X= for =X= as above if this concerns an operation on a type of
the corresponding width.
All external functions that the gcc ABI's require are provided.
*** The =__atomic_= ABI
is already close to the C11 call interface. Relevant for C11 are 9
operations
- =fetch_add= for integer addition, returning the previous value
- =fetch_sub= for integer subtraction, returning the previous value
- =fetch_or= for bitwise or, returning the previous value
- =fetch_and= for bitwise and, returning the previous value
- =fetch_xor= for bitwise xor, returning the previous value
- =load= for an atomic load operation
- =store= for an atomic store operation
- =exchange= for an atomic exchange operation, equivalent to a
=store= that returns the previous value
- =compare_exchange= for an atomic compare and exchange
operation, equivalent to a conditional =store= that also saves
the previous value, and returns =false= or =true= according to
the success of the condition.
In addition to the more or less obvious operands, the built-in
functions take one or two additional parameters that reflect an
eventual requirement for the =memory_order= of the operation. So
the functions represent the C11 "explicit" features such as
=atomic_fetch_add_explicit=.
Observe that the built-in functions only foresee one interface
=compare_exchange=.
- The distinction between "weak" and "strong" versions of these
built-in functions are ruled through an additional parameter,
not through a different function interface.
- The function symbol fall-back =__atomic_compare_exchange=
confusingly has a different semantic and prototype than the
built-in function. It misses the parameter to chose between the
"weak" and the "strong" version, and solely corresponds to the
C11 operation
=atomic_compare_exchange_strong_explicit=
Load, store and compare operations have /memory/ semantics, that is
they are equivalent to the use of =memcpy= and =memcmp= library
functions. The implementation may use = or == operators in some
places for optimization, but it then does so with objects of
=uintXX_t=, so every bit is accounted for. For data types where
memory and value comparison are different, the result of an
=atomic_compare_exchange= operation can be different than you'd
expect:
- =_Bool= objects where other bits than the lowest-order bit have
been polluted, will not compare equal to =false= or =true=.
- Floating point types may compare different representations of
=0= not to be equal.
- Two floating point =NaN= may compare equal, though as value
comparison =NaN= never compares equal to anything.
- Objects of =struct= or =union= type may be considered unequal
because they differ on some padding bytes.
This behavior is in alignment with the intended interpretation by
the C and C++ standard's committees.
Function call interfaces for the arithmetic operations are only
generated if we can suppose that an integer type for the
corresponding size exists. We can reasonably assume that there are
always types =uint8_t=, =uint16_t=, =uint32_t= and =uint64_t=, so
the variants for 1, 2, 4 and 8 can always be generated.
For a 128 bit type these are only generated if =__SIZEOF_INT128__=
or =__GCC_HAVE_SYNC_COMPARE_AND_SWAP_X= exist. If so, we assume
that =__uint128_t= is such an integer type and known to the
compiler.
Arithmetic operations can safely use these =uintXX_t= types
internally, since the standard imposes two's complement
representation for signed atomic types and also enforces that
atomic operations may not produce traps on overflow.
Additionally to the operations that have generic function
interfaces in the C11 standard, gcc additionally implements six
other built-ins, namely
- =__atomic_add_fetch= for integer addition, returning the updated value
- =__atomic_sub_fetch= for integer subtraction, returning the updated value
- =__atomic_or_fetch= for bitwise or, returning the updated value
- =__atomic_and_fetch= for bitwise and, returning the updated value
- =__atomic_xor_fetch= for bitwise xor, returning the updated value
- =__atomic_fetch_nand= for bitwise nand (=x = ~(x & v)=), returning the previous value
- =__atomic_nand_fetch= for bitwise nand (=x = ~(x & v)=), returning the
updated value
For the completeness of the library interface we supply analogous
functions with the =_X= suffix for these. They might be called by
the compiler if the user code uses assign and add or similar
operators on atomic integers. The =__atomic_add_fetch= and
=__atomic_sub_fetch= functions may also eventually be used by the
compiler to implement an atomic prefix increment or decrement
operation (=++x= and =--x=). This would e.g. happen if =x= is an
object of type =__int128_t= and the platform doesn't implement
lock-free atomics for types of size 16.
*** Clang's =__c11_atomic= built-ins
Clang has gone a different path for the built-ins that implement
C11 atomics, prefixed with =__c11_atomic=. These are a directly
feature equivalent to the C11 generic functions that have
=memory_order= arguments (=_explicit= suffix).
For the cases that no atomic instructions can be synthesized,
clang falls back to the same external calls as described for gcc's
=__atomic= ABI.
*** The =__sync= ABI
It dates back long before the C11 atomic interface had been
designed and thus cannot be directly conforming to it. It has
basically the same built-ins for arithmetic types as above, only
that
- The functions are named a bit differently.
- They only implement sequential consistency.
- There are no =load=, =store= or =exchange= features.
- The =nand= operations changed their meaning from version 4.4
onward. Therefore this operation cannot be used portably in an
environment that might use different versions of compilers. So
we don't implement these function interfaces and we deprecate
the use of this built-in.
Additionally this interface also implements a =test_and_set=
functionality that is used to implement the =atomic_flag=
functions. This built-in is documented to have acquire-release
consistency. If used with sequential consistency, an additional
fence is inserted to ensure that.
These features are sufficient to provide a decent implementation of
C11 atomics.
*** The lock-full fallback functions
In absence of proper architecture support, all fallbacks (for
the three built-in families) with =_X= suffix use the ones without
suffix underneath. These external interfaces receive the size of
the data type as an additional, leading parameter:
- =__atomic_load=
- =__atomic_store=
- =__atomic_exchange=
- =__atomic_compare_exchange=
They have pure memory semantics and their basic operations are
=memcpy= and =memcmp= for load, store and comparison.
These functions *cannot be called directly* from within your code,
because the compiler cannot distinguish them from the gcc built-ins,
/and/ they have different prototypes than these.
We implement these functions as critical sections that are
protected with a lock, similar to a mutex. This implementation
uses a table of locks and a hash function to choose one of the
entries that only depends on the address of the atomic object.
At the moment, this implementation has several address-hash
functions that can be chosen at library compile time. Any function
that mixes the bits of the address should perform reasonably well.
More important for performance is the choice of the lock. Such a
lock can be relatively simple, since C11 atomics that are not
lock-free don't have to be asynchronous signal safe.
There are several possibilities, in order of preference:
- An OS-specific lightweight lock with non-active waits. The
integration into =musl= uses Linux' =futex= underneath to do an
efficient wait. If by coincidence these are called in an
un-threaded process, they are close to no-ops.
- C11's =mtx_t= type has a shallow interface that should allow
it to be implemented a bit more simply and efficiently than OS
specific mutexes that implement a lot of functionality. This
solution should be portable to all platforms that implement
this part of C11. In a relatively near future these could be
all POSIX and Windows platforms. This approach has the
disadvantage that a table of =mtx_t= must be initialized at
process startup because =mtx_t= doesn't guarantee static
initialization.
- POSIX' =pthread_mutex_t= is a little less portable, but allows
for static initialization.
- A spinlock similar to =atomic_flag=. Such an approach is
portable to all platforms that implement atomics and allows for
static initialization. This is the only choice when compiled
without OS or library support.
The wait functionality is an active wait that burns CPU cycles
and memory bandwidth. In many circumstances this should do
well, since the critical sections that are protected by this are
nice and small.
* The =<stdatomic.h>= header file
** Full C11 support
Versions of gcc and clang that fully implement the C11 atomics
interface will not need a special header file but can use their own
that is shipped with the compiler:
- gcc starting with version 4.9
- clang starting with version 3.6
This full support of atomics allows using atomic objects just like
other objects in whatever operations the base type supports.
These default operations on atomics use sequential consistency. That
is, each such operation will enforce a full memory transfer and
the perceived effect is as if all these operations, even if issued
in different threads, have been done one after another. Thus, thread
parallelism can only play between such operations:
#+BEGIN_CENTER
*atomics operations are expensive*
#+END_CENTER
The functional interfaces with different =memory_order= arguments
(=_explicit= suffix to the name) that we described above may be used
to weaken the memory effect that atomic operations have. The
possible gains of such different memory consistency models are very
architecture dependent. E.g. on x86 platforms they offer almost
no advantage, whereas on ARM platforms acquire/release semantics may
bring some noticeable gain.
But beware that this gain is bought with a noticeable complication
of the code. Only use this if the atomic operations are a measurable
performance bottleneck /and/ you already have reduced the number of
these operations to a minimum.
** Partial C11 atomics support
A series of compiler versions offers partial atomics support that
already implements most of the C11 semantic:
- gcc versions 4.7 and 4.8
- clang versions 3.2 to 3.5
These versions provide the built-in functions as described above but
lack full compiler support for atomic types and operations.
With the =<stdatomic.h>= header that we supply for these compilers,
application code can use the functional interfaces. A macro
=_Atomic(T)= is provided that can be used to issue emulated
declarations of atomic types that should be *forward compatible* to
platforms with complete C11 atomics support. Example:
#+begin_src C
// global variables
_Atomic(size_t) thread_inside_count = ATOMIC_VAR_INIT(0);
_Atomic(size_t) thread_total_count = ATOMIC_VAR_INIT(1);
int my_thread_function(void* arg) {
atomic_fetch_add(&thread_inside_count, 1);
atomic_fetch_add(&thread_total_count, 1);
// do something complicated here
// at the end
atomic_fetch_sub(&thread_inside_count, 1);
}
#+end_src
Underneath such emulated atomic objects are implemented as arrays of
=volatile= base type of size 1. This has the following desired
effects:
- They can't be assigned to.
- They evaluate to a pointer in almost any context.
- Operations with them cannot be reordered by the compiler.
So you should be relatively safe from programming errors that would
access such objects without passing through the type generic atomic
functions. The compiler will error out on improper usage of such
atomic objects, but the diagnostics may be a bit crude.
*** Issues
Since this approach may reinterpret data through pointer casts, it
could potentially be dangerous. So let us discuss the possible
issues.
- The generic fallbacks for memory access only use =memcpy= and
=memcmp= to access the data itself. So the access of the data is
within the constraints of the standard.
- The generic fallbacks for memory access ensure that their
arguments have compatible base types (if a pointer is passed in)
or are assignment compatible with the base type of the atomic
(if a value is passed in). So data that is copied across can
never be misinterpreted as being of a wrong type because the two
target types are compatible.
- The specialized functions with =_X= suffix may reinterpret their
data as the corresponding =uintXX_t= for the size. Copying or
comparing such data is always guaranteed to use all bits, so in
that sense it is equivalent to =memcpy= and =memcmp=.
- The arithmetic operations that are executed then are operations
on an unsigned integer type that has no padding bits. This
arithmetic is compatible for all integer types that have no
padding bits and, for the signed types, are represented with
two's complement.
- An emulated atomic with this approach is implemented as an array
to the base type, and so in the user code the base type of the
object remains visible to the compiler. As a consequence this
approach has no effect on the aliasing rules, the compiler
always has complete information about the type of each object.
The only potential problem for our approach that remains is
alignment. Since the stub functions that are provided may use
casts to =uintXX_t= of "atomic" objects you have to ensure that
these objects are at least aligned as these types would be. This
should not be a problem, if the base type is an integer type,
too. Integer types with the same size should have the same alignment.
If you encounter problems with a user defined type that has a size
that is a small power of two you could force alignment
#+begin_src C
_Alignas(sizeof(toto)) _Atomic(toto) toto1;
__attribute__((__aligned__(sizeof(toto)))) _Atomic(toto) toto2;
#+end_src
with whatever of the two constructs works for you.
I am currently struggling to provide a version of the =_Atomic(T)=
macro that ensures that automatically. It seems to be possible but
produces a lot of noise for function parameters that are pointers
to atomics.
** Basic atomics support
Even older versions of gcc and clang implement the =__sync= built-in
functions and can thereby be made to accept the same <stdatomic.h>
header as discussed above. Since, as their names indicate, these
built-ins only have fully synchronizing versions, they will not be
able to take advantage of the different consistency models. But
implementing atomics with stronger consistency than required — here
always sequential consistency — still conforms to the C standard.
* The implementation
** Requirements
*** Compilers
You should be able to compile this implementation with any version
of modern gcc and clang. (Versions are hard to tell, gcc should work
for 4.1) The quality of the resulting binary will depend on the
implementation of atomic support by the compiler.
There are three different implementations, for modern clang and gcc,
and one for those compilers that only support the =__sync_=
built-ins. They are only tested with clang and gcc, but might work
with other compilers that implement one of the sets of built-ins and
is otherwise compatible to some gcc extensions:
- compound expressions with =({ })=
- =__attribute__= with =__alias__= and =__unused__=
- =__builtin_choose_expr= for the =__sync= version as a precursor of
C11's =_Generic=
There are some heuristics in place to decide at compile time which
case applies, namely =__clang__= to detect clang, =__ATOMIC_...=
macros to detect the C11 versions of the built-ins.
*** OS or C library support
The library may work with different lock constructs, currently we
implement one simple generic approach that only uses spinning, and
a mixed approach that uses Linux' =futex= as an inactive sleep
strategy as a last resort. The latter has been tested with the
=musl= C library.
This locking strategy can be a performance bottleneck for
applications with a strong congestion on one particular atomic
data, e.g. code that would insert list elements through a
centralized list head. If this list head cannot be realized with
a lock-free atomic, the critical section of modifying it is
protected by our lock. Such code has very particular properties.
- Since the critical section usually is really short compared to a
scheduling interval of the OS, the probability that the lock can
be taken immediately is high. So the fast path for taking the
lock must be *really fast*. Our implementation essentially has
an =atomic_compare_exchange_strong_explicit=, here. One memory
instruction on the fast path must be enough.
- If locking fails on the first try, the probability is still very
high that it will succeed soon after. This is because only
scheduled threads compete, here, so there are never more threads
in play than we have processors. Therefore as a second strategy
we spin for a while until we get the lock. In our experiments on
average one single round of spinning was enough.
- A third exceptional case may occur, when the thread that is
holding the lock is descheduled in the middle of the critical
section. The probability for that event is quite rare (0.1 % in
our experiments) but still this case occurs. If it does, the
world changes drastically, a herd of threads all have to wait
for a long time (until the locker is rescheduled) to have any
chance to obtain the lock. Active wait here is
counterproductive. On the contrary, by going into an inactive OS
sleep, the possibility for the locker to regain an execution
slot increases.
We implement this strategy a bit differently than classical locks
with wait-counters would do. We just have a single =unsigned= value
that at the same time holds the lock bit (HO bit) and a
counter. That counter is not viewed as a counter of the threads
that are in a kernel wait, but just counts the number of threads
inside the critical section. This has the following advantages:
- An update to the counter part is relatively rare. So we save
memory bandwidth, and we also avoid too much interaction between
the different threads that compete for the lock.
- The fast path occurs when the value is =0=, initially. It sets
the HO bit (the lock bit) and the LO bit (for a counter of value
=1=) in one go. The resulting value is =UINT_MAX/2u+2u=.
- If the fast path fails, the counter is atomically incremented by
one, and we enter a spin lock to set the HO bit as well.
- After having spun for some time, we suppose that we are in the bad
situation and go into a =futex_wait=. Going into the =futex_wait=
may fail if the value changes. Since additional threads only
change the counter when they arrive, this can't happen too often
and the thread goes to sleep, eventually.
- Unlocking is a very simple operation. The locker has contributed
=UINT_MAX/2u+2u= to the value, and so just has to decrement the
value atomically by that amount. By doing so, the thread also
notices if other threads still are in the critical section and
wakens one of them.
** Caveats
*** Symbol renaming
There is one important difficulty when compiling this. The original
=__atomic= library interface was developed with C++ in mind and not
C. Therefore it freely uses function overloading for the built-ins
versus the library interface. Since we also use the library
functions as fallbacks in the implementation of some of the =_X=
variants this naming scheme is not supportable with a C compiler.
We get away with it by using internal names, prefixed with =__impl_=
for all functions. Then we rename symbols to the intended ones using
=objcopy=.
- The current integration into musl does this with a *script* that
you have to run *manually* after compilation.
- You then have to launch =make= a *second time* to do the final link.
This technique is certainly not ideal and subject to improvement.
*** Support of 16 byte atomic instructions
The main difference for modern processors that is relevant here is
if it supports 16 byte atomic instructions or not. There is no
difficulty to detect this at compile time, but if the library is
used with code that is compiled with a different compiler or just
different compiler options, incompatible binary code may be
produced.
My plan is to freeze that feature at compile time of the library
and reflect the capacity in the =<stdatomic.h>= that is
provided. This then may result in code that is a bit less
optimized than it could, but that is compatible.
- If the library is *not* compiled with direct 16 byte support the
application may not use it, and thus use a memory implementation
for such operations.
- If the library *is* compiled with direct 16 byte support but the
application compiler doesn't support it, the user code should
fallback to library calls, but which in turn use the atomic
instructions. So such a variant would have a call overhead and
would not be able to inline the atomics in the user binary.
All of this is not yet done, though. Be careful when using this
preliminary version.
** Leftovers
There are some leftovers that will hopefully disappear.
- There are several hash functions and an instrumentation
infrastructure for the hashes. I didn't have enough test cases
yet to see what would be best, here.
- There is optional instrumentation for the lock
functions. Switching it on changes overall performance
substantially, and thus I'd expect a noticeable Heisenberg
effect. So these counters can give qualitative information about
what happens, but you shouldn't take the figures literally.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread