mailing list of musl libc
* [PATCH] ARM atomics overhaul, try 1
@ 2014-11-18  3:00 Rich Felker
  0 siblings, 0 replies; only message in thread
From: Rich Felker @ 2014-11-18  3:00 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 956 bytes --]

Here's my first draft of the ARM atomics overhaul. The results should
be ideal for v7+, but I'm not entirely happy with the code for
supporting fallbacks. Right now the concept is to have multiple
fallback versions of the code laid out exactly like the kuser helper
page, and just select one of them. A relative address is used so that
the default of zero at early load time results in using safe "fake
atomics" that don't depend on the kuser helper page or v6/v7
instructions. But there's a lot of overhead loading the address to
make the indirect call that happens on each atomic operation. It
probably doesn't perform that badly, but it bloats the code size a
good bit.
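
For reference, the layout mirrors the kuser helper page: each
implementation lives at a fixed 32-byte offset from a common base
(barrier at +0, cas at +32, get_tp at +64). A minimal sketch of the
dispatch, using the same names and offsets as in the attached patch:

#include <stdint.h>

/* __atomics_selector stays 0 until __set_thread_area picks an
 * implementation, so calls made before then hit the safe dummy
 * versions at __atomics_base itself. */
extern const unsigned char __atomics_base[];
extern uintptr_t __atomics_selector;

static inline int a_cas(volatile int *p, int t, int s)
{
	/* cas follows the kuser cmpxchg convention: returns 0 on success */
	int (*cas)(int, int, volatile int *) =
		(int (*)(int, int, volatile int *))
		((uintptr_t)__atomics_base + __atomics_selector + 32);
	int old;
	for (;;) {
		if (!cas(t, s, p)) return t;
		if ((old=*p) != t) return old;
	}
}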

An alternative that might be less ugly is making direct calls to
external functions written in asm, and having those functions do the
dispatch to the different versions of the code. I've tried this a few
times, though, and I wasn't too happy with where it was going.
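
For comparison, that variant would look something like this
(hypothetical sketch, not part of the attached patch; __a_cas is a
made-up name):

/* atomic.h makes a plain direct call; the asm implementation of
 * __a_cas does the dispatch internally, so no address load is
 * needed at each call site. Returns 0 on success, like the kuser
 * cmpxchg helper. */
extern int __a_cas(int t, int s, volatile int *p);

static inline int a_cas(volatile int *p, int t, int s)
{
	int old;
	for (;;) {
		if (!__a_cas(t, s, p)) return t;
		if ((old=*p) != t) return old;
	}
}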

Other ideas/comments are welcome!

Rich

[-- Attachment #2: arm_atomics_overhaul_try1.diff --]
[-- Type: text/plain, Size: 10455 bytes --]

diff --git a/arch/arm/atomic.h b/arch/arm/atomic.h
index 8665c87..c73d3b5 100644
--- a/arch/arm/atomic.h
+++ b/arch/arm/atomic.h
@@ -22,67 +22,193 @@ static inline int a_ctz_64(uint64_t x)
 	return a_ctz_l(y);
 }
 
-#if ((__ARM_ARCH_6__ || __ARM_ARCH_6K__ || __ARM_ARCH_6ZK__) && !__thumb__) \
- || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7
-
 #if __ARM_ARCH_7A__ || __ARM_ARCH_7R__ ||  __ARM_ARCH >= 7
-#define MEM_BARRIER "dmb ish"
-#else
-#define MEM_BARRIER "mcr p15,0,r0,c7,c10,5"
-#endif
 
-static inline int __k_cas(int t, int s, volatile int *p)
+static inline void a_barrier()
+{
+	__asm__ __volatile__("dmb ish");
+}
+
+static inline int a_cas(volatile int *p, int t, int s)
 {
-	int ret;
-	__asm__(
-		"	" MEM_BARRIER "\n"
+	int old;
+	__asm__ __volatile__(
+		"	dmb ish\n"
 		"1:	ldrex %0,%3\n"
-		"	subs %0,%0,%1\n"
-#ifdef __thumb__
-		"	itt eq\n"
-#endif
-		"	strexeq %0,%2,%3\n"
-		"	teqeq %0,#1\n"
-		"	beq 1b\n"
-		"	" MEM_BARRIER "\n"
-		: "=&r"(ret)
+		"	cmp %0,%1\n"
+		"	bne 1f\n"
+		"	strex %0,%2,%3\n"
+		"	cmp %0, #0\n"
+		"	bne 1b\n"
+		"	mov %0, %1\n"
+		"1:	dmb ish\n"
+		: "=&r"(old)
 		: "r"(t), "r"(s), "Q"(*p)
 		: "memory", "cc" );
-	return ret;
+	return old;
+}
+
+static inline int a_swap(volatile int *x, int v)
+{
+	int old, tmp;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	strex %1,%2,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+	return old;
+}
+
+static inline int a_fetch_add(volatile int *x, int v)
+{
+	int old, tmp;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	add %0,%0,%2\n"
+		"	strex %1,%0,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+	return old-v;
+}
+
+static inline void a_inc(volatile int *x)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%2\n"
+		"	add %0,%0,#1\n"
+		"	strex %1,%0,%2\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_dec(volatile int *x)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%2\n"
+		"	sub %0,%0,#1\n"
+		"	strex %1,%0,%2\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "Q"(*x)
+		: "memory", "cc" );
 }
+
+static inline void a_and(volatile int *x, int v)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	and %0,%0,%2\n"
+		"	strex %1,%0,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_or(volatile int *x, int v)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	orr %0,%0,%2\n"
+		"	strex %1,%0,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_store(volatile int *p, int x)
+{
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"	str %1,%0\n"
+		"	dmb ish\n"
+		: "=m"(*p)
+		: "r"(x)
+		: "memory", "cc" );
+}
+
+#else
+
+extern const unsigned char __atomics_base[] __attribute__((__visibility__("hidden")));
+extern uintptr_t __atomics_selector __attribute__((__visibility__("hidden")));
+
+static inline void a_barrier()
+{
+	__asm__ __volatile__(
+		"	mov lr,pc\n"
+#ifdef __thumb__
+		"	bx %0\n"
 #else
-#define __k_cas ((int (*)(int, int, volatile int *))0xffff0fc0)
+		"	mov pc,%0\n"
 #endif
+		:
+		: "r"((uintptr_t)__atomics_base + __atomics_selector + 0)
+		: "memory", "lr" );
+}
+
 
 static inline int a_cas(volatile int *p, int t, int s)
 {
+	int (*cas)(int, int, volatile int *) = 
+		((int (*)(int, int, volatile int *))
+		((uintptr_t)__atomics_base + __atomics_selector + 32));
 	int old;
 	for (;;) {
-		if (!__k_cas(t, s, p))
+		if (!cas(t, s, p))
 			return t;
 		if ((old=*p) != t)
 			return old;
 	}
 }
 
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
 static inline int a_swap(volatile int *x, int v)
 {
+	int (*cas)(int, int, volatile int *) = 
+		((int (*)(int, int, volatile int *))
+		((uintptr_t)__atomics_base + __atomics_selector + 32));
 	int old;
 	do old = *x;
-	while (__k_cas(old, v, x));
+	while (cas(old, v, x));
 	return old;
 }
 
 static inline int a_fetch_add(volatile int *x, int v)
 {
+	int (*cas)(int, int, volatile int *) = 
+		((int (*)(int, int, volatile int *))
+		((uintptr_t)__atomics_base + __atomics_selector + 32));
 	int old;
 	do old = *x;
-	while (__k_cas(old, old+v, x));
+	while (cas(old, old+v, x));
 	return old;
 }
 
@@ -96,35 +222,45 @@ static inline void a_dec(volatile int *x)
 	a_fetch_add(x, -1);
 }
 
-static inline void a_store(volatile int *p, int x)
+static inline void a_and(volatile int *p, int v)
 {
-	while (__k_cas(*p, x, p));
+	int (*cas)(int, int, volatile int *) = 
+		((int (*)(int, int, volatile int *))
+		((uintptr_t)__atomics_base + __atomics_selector + 32));
+	int old;
+	do old = *p;
+	while (cas(old, old&v, p));
 }
 
-#define a_spin a_barrier
-
-static inline void a_barrier()
+static inline void a_or(volatile int *p, int v)
 {
-	__k_cas(0, 0, &(int){0});
+	int (*cas)(int, int, volatile int *) = 
+		((int (*)(int, int, volatile int *))
+		((uintptr_t)__atomics_base + __atomics_selector + 32));
+	int old;
+	do old = *p;
+	while (cas(old, old|v, p));
 }
 
-static inline void a_crash()
+static inline void a_store(volatile int *p, int x)
 {
-	*(volatile char *)0=0;
+	a_barrier();
+	*p = x;
+	a_barrier();
 }
 
-static inline void a_and(volatile int *p, int v)
+#endif
+
+static inline void *a_cas_p(volatile void *p, void *t, void *s)
 {
-	int old;
-	do old = *p;
-	while (__k_cas(old, old&v, p));
+	return (void *)a_cas(p, (int)t, (int)s);
 }
 
-static inline void a_or(volatile int *p, int v)
+#define a_spin a_barrier
+
+static inline void a_crash()
 {
-	int old;
-	do old = *p;
-	while (__k_cas(old, old|v, p));
+	*(volatile char *)0=0;
 }
 
 static inline void a_or_l(volatile void *p, long v)
diff --git a/arch/arm/pthread_arch.h b/arch/arm/pthread_arch.h
index 6d9dc3a..b292e01 100644
--- a/arch/arm/pthread_arch.h
+++ b/arch/arm/pthread_arch.h
@@ -10,9 +10,13 @@ static inline __attribute__((const)) pthread_t __pthread_self()
 
 #else
 
+extern const unsigned char __atomics_base[] __attribute__((__visibility__("hidden")));
+extern uintptr_t __atomics_selector __attribute__((__visibility__("hidden")));
+
 typedef char *(*__ptr_func_t)(void) __attribute__((const));
-#define __pthread_self() \
-	((pthread_t)(((__ptr_func_t)0xffff0fe0)()+8-sizeof(struct pthread)))
+#define __pthread_self() ((pthread_t)( \
+	((__ptr_func_t)((unsigned long)__atomics_base+__atomics_selector+64))()  \
+	+8-sizeof(struct pthread)))
 
 #endif
 
diff --git a/arch/arm/src/__set_thread_area.c b/arch/arm/src/__set_thread_area.c
index e69de29..ba6a30b 100644
--- a/arch/arm/src/__set_thread_area.c
+++ b/arch/arm/src/__set_thread_area.c
@@ -0,0 +1,37 @@
+#include <stdint.h>
+#include <elf.h>
+#include "pthread_impl.h"
+#include "libc.h"
+
+#define HWCAP_TLS (1 << 15)
+
+extern const unsigned char __attribute__((__visibility__("hidden")))
+	__atomics_base[], __atomics_oldkuser_base[],
+	__atomics_v6_base[], __atomics_v7_base[];
+
+extern uintptr_t __atomics_selector __attribute__((__visibility__("hidden")));
+
+int __set_thread_area(void *p)
+{
+#if !__ARM_ARCH_7A__ && !__ARM_ARCH_7R__ && __ARM_ARCH < 7
+	uintptr_t sel;
+	if (__hwcap & HWCAP_TLS) {
+		size_t *aux;
+		sel = (uintptr_t)__atomics_v7_base;
+		for (aux=libc.auxv; *aux; aux+=2) {
+			if (*aux != AT_PLATFORM) continue;
+			const char *s = (void *)aux[1];
+			if (s[0]=='v' && s[1]=='6' && s[2]-'0'>=10u)
+				sel = (uintptr_t)__atomics_v6_base;
+			break;
+		}
+	} else {
+		int ver = *(int *)0xffff0ffc;
+		sel = (uintptr_t)0xffff0fa0;
+		if (ver < 2) a_crash();
+		if (ver < 3) sel = (uintptr_t)__atomics_oldkuser_base;
+	}
+	__atomics_selector = sel - (uintptr_t)__atomics_base;
+#endif
+	return __syscall(0xf0005, p);
+}
diff --git a/arch/arm/src/arm/atomics.s b/arch/arm/src/arm/atomics.s
index e69de29..b55abdf 100644
--- a/arch/arm/src/arm/atomics.s
+++ b/arch/arm/src/arm/atomics.s
@@ -0,0 +1,114 @@
+.text
+
+.global __atomics_base
+.global __atomics_dummy_base
+.global __atomics_oldkuser_base
+.global __atomics_v6_base
+.global __atomics_v7_base
+
+.hidden __atomics_base
+.hidden __atomics_dummy_base
+.hidden __atomics_oldkuser_base
+.hidden __atomics_v6_base
+.hidden __atomics_v7_base
+
+.p2align 5
+__atomics_base:
+__atomics_dummy_base:
+dummy_barrier:
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+
+.p2align 5
+dummy_cas:
+	ldr r3,[r2]
+	subs r3,r3,r0
+	streq r1,[r2]
+	rsbs r0,r3,#0
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+
+.p2align 5
+dummy_gettp:
+	mov r0,#0
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+
+.p2align 5
+__atomics_oldkuser_base:
+oldkuser_barrier:
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+
+.p2align 5
+oldkuser_cas:
+	adr ip,1f
+	ldr ip,[ip]
+	mov pc,ip
+1:	.word 0xffff0fc0
+
+.p2align 5
+oldkuser_gettp:
+	adr r0,1f
+	ldr r0,[r0]
+	mov pc,r0
+1:	.word 0xffff0fe0
+
+.p2align 5
+__atomics_v6_base:
+v6_barrier:
+	mcr p15,0,r0,c7,c10,5
+	bx lr
+
+.p2align 5
+v6_cas:
+	mov r3,r0
+	mcr p15,0,r0,c7,c10,5
+1:	.word 0xe1920f9f        /* ldrex r0,[r2] */
+	subs r0,r0,r3
+	.word 0x01820f91        /* strexeq r0,r1,[r2] */
+	teqeq r0,#1
+	beq 1b
+	b v6_barrier
+
+.p2align 5
+v6_gettp:
+	mrc p15,0,r0,c13,c0,3
+	bx lr
+
+
+
+.p2align 5
+__atomics_v7_base:
+v7_barrier:
+	.word 0xf57ff05b        /* dmb ish */
+	bx lr
+
+.p2align 5
+v7_cas:
+	mov r3,r0
+	.word 0xf57ff05b        /* dmb ish */
+1:	.word 0xe1920f9f        /* ldrex r0,[r2] */
+	subs r0,r0,r3
+	.word 0x01820f91        /* strexeq r0,r1,[r2] */
+	teqeq r0,#1
+	beq 1b
+	b v7_barrier
+
+.p2align 5
+v7_gettp:
+	mrc p15,0,r0,c13,c0,3
+	bx lr
+
+
+
+
+.data
+.global __atomics_selector
+.hidden __atomics_selector
+__atomics_selector:
+	.word 0
diff --git a/src/thread/arm/__set_thread_area.s b/src/thread/arm/__set_thread_area.s
index 63d8884..4a4cd0d 100644
--- a/src/thread/arm/__set_thread_area.s
+++ b/src/thread/arm/__set_thread_area.s
@@ -1,12 +1 @@
-.text
-.global __set_thread_area
-.type   __set_thread_area,%function
-__set_thread_area:
-	mov r1,r7
-	mov r7,#0x0f0000
-	add r7,r7,#5
-	svc 0
-	mov r7,r1
-	tst lr,#1
-	moveq pc,lr
-	bx lr
+/* Replaced by C code in arch/arm/src */
