mailing list of musl libc
* [PATCH] ARM atomics overhaul, try 2
From: Rich Felker @ 2014-11-18 19:15 UTC
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1379 bytes --]

Here's a new version of the ARM atomics overhaul patch which I'm much
happier with. Whereas the old version imposed a heavy address
computation in the caller at each point where an atomic was used, the
new version achieves a light computed jump inside the callee, using an
idiom of the form:

	ldr ip,1f
	ldr ip,[pc,ip]
	add pc,pc,ip
1:	.word relativeptr-1b

When relativeptr contains zero, as at program startup, the code
continues with the instruction after the .word directive (a dummy
version that's safe to use before initialization). Later, relativeptr
is filled with the difference between the address of the desired
version and the address of this dummy code.
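
To spell out the address arithmetic: in ARM state, reading pc yields
the address of the current instruction plus 8, so each pc-relative
step lands exactly where intended. Here is the idiom annotated, using
__a_barrier_ptr from the patch as the relativeptr:

	ldr ip,1f        @ ip = offset of __a_barrier_ptr from label 1
	ldr ip,[pc,ip]   @ pc reads as 1b here, so ip = value stored
	                 @ at __a_barrier_ptr
	add pc,pc,ip     @ pc reads as the dummy entry point; zero falls
	                 @ through, nonzero jumps to dummy + stored delta
1:	.word __a_barrier_ptr-1b

At startup the word at __a_barrier_ptr is zero; __set_thread_area
later fills it via the SET() macro in the patch below, e.g.
__a_barrier_ptr = (uintptr_t)__a_barrier_v7 - (uintptr_t)__a_barrier_dummy.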

As before, v7+ is the most highly optimized, with special versions of
the various atomics using ldrex/strex directly to avoid a nested cas
loop. For atomics, compile-time v6 builds are not significantly better
than baseline (v4t) builds, although the thread-pointer load is
optimized with a hard-coded instruction. I could make v6 builds use
the same inline asm approach as v7+, with "bl __a_barrier" in place
of "dmb ish", but I'm not sure how much of a win this would be, if
any.

Comments? If no problems are noticed right away, I'll probably
commit this soon as a basis for future improvements, since I think
it's already reasonably good (and much better than what we had).

Rich

[-- Attachment #2: arm_atomics_overhaul_try2.diff --]
[-- Type: text/plain, Size: 10130 bytes --]

diff --git a/arch/arm/atomic.h b/arch/arm/atomic.h
index 8665c87..8ae35bb 100644
--- a/arch/arm/atomic.h
+++ b/arch/arm/atomic.h
@@ -22,37 +22,150 @@ static inline int a_ctz_64(uint64_t x)
 	return a_ctz_l(y);
 }
 
-#if ((__ARM_ARCH_6__ || __ARM_ARCH_6K__ || __ARM_ARCH_6ZK__) && !__thumb__) \
- || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7
-
 #if __ARM_ARCH_7A__ || __ARM_ARCH_7R__ ||  __ARM_ARCH >= 7
-#define MEM_BARRIER "dmb ish"
-#else
-#define MEM_BARRIER "mcr p15,0,r0,c7,c10,5"
-#endif
 
-static inline int __k_cas(int t, int s, volatile int *p)
+static inline void a_barrier()
 {
-	int ret;
-	__asm__(
-		"	" MEM_BARRIER "\n"
+	__asm__ __volatile__("dmb ish");
+}
+
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	int old;
+	__asm__ __volatile__(
+		"	dmb ish\n"
 		"1:	ldrex %0,%3\n"
-		"	subs %0,%0,%1\n"
-#ifdef __thumb__
-		"	itt eq\n"
-#endif
-		"	strexeq %0,%2,%3\n"
-		"	teqeq %0,#1\n"
-		"	beq 1b\n"
-		"	" MEM_BARRIER "\n"
-		: "=&r"(ret)
+		"	cmp %0,%1\n"
+		"	bne 1f\n"
+		"	strex %0,%2,%3\n"
+		"	cmp %0, #0\n"
+		"	bne 1b\n"
+		"	mov %0, %1\n"
+		"1:	dmb ish\n"
+		: "=&r"(old)
 		: "r"(t), "r"(s), "Q"(*p)
 		: "memory", "cc" );
-	return ret;
+	return old;
+}
+
+static inline int a_swap(volatile int *x, int v)
+{
+	int old, tmp;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	strex %1,%2,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+	return old;
+}
+
+static inline int a_fetch_add(volatile int *x, int v)
+{
+	int old, tmp;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	add %0,%0,%2\n"
+		"	strex %1,%0,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+	return old-v;
+}
+
+static inline void a_inc(volatile int *x)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%2\n"
+		"	add %0,%0,#1\n"
+		"	strex %1,%0,%2\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_dec(volatile int *x)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%2\n"
+		"	sub %0,%0,#1\n"
+		"	strex %1,%0,%2\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_and(volatile int *x, int v)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	and %0,%0,%2\n"
+		"	strex %1,%0,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_or(volatile int *x, int v)
+{
+	int tmp, tmp2;
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"1:	ldrex %0,%3\n"
+		"	orr %0,%0,%2\n"
+		"	strex %1,%0,%3\n"
+		"	cmp %1, #0\n"
+		"	bne 1b\n"
+		"	dmb ish\n"
+		: "=&r"(tmp), "=&r"(tmp2)
+		: "r"(v), "Q"(*x)
+		: "memory", "cc" );
+}
+
+static inline void a_store(volatile int *p, int x)
+{
+	__asm__ __volatile__(
+		"	dmb ish\n"
+		"	str %1,%0\n"
+		"	dmb ish\n"
+		: "=m"(*p)
+		: "r"(x)
+		: "memory", "cc" );
 }
+
 #else
-#define __k_cas ((int (*)(int, int, volatile int *))0xffff0fc0)
-#endif
+
+int __a_cas(int, int, volatile int *) __attribute__((__visibility__("hidden")));
+#define __k_cas __a_cas
+
+static inline void a_barrier()
+{
+	__asm__ __volatile__("bl __a_barrier"
+		: : : "memory", "cc", "ip", "lr" );
+}
 
 static inline int a_cas(volatile int *p, int t, int s)
 {
@@ -65,11 +178,6 @@ static inline int a_cas(volatile int *p, int t, int s)
 	}
 }
 
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
 static inline int a_swap(volatile int *x, int v)
 {
 	int old;
@@ -98,19 +206,9 @@ static inline void a_dec(volatile int *x)
 
 static inline void a_store(volatile int *p, int x)
 {
-	while (__k_cas(*p, x, p));
-}
-
-#define a_spin a_barrier
-
-static inline void a_barrier()
-{
-	__k_cas(0, 0, &(int){0});
-}
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
+	a_barrier();
+	*p = x;
+	a_barrier();
 }
 
 static inline void a_and(volatile int *p, int v)
@@ -127,6 +225,20 @@ static inline void a_or(volatile int *p, int v)
 	while (__k_cas(old, old|v, p));
 }
 
+#endif
+
+static inline void *a_cas_p(volatile void *p, void *t, void *s)
+{
+	return (void *)a_cas(p, (int)t, (int)s);
+}
+
+#define a_spin a_barrier
+
+static inline void a_crash()
+{
+	*(volatile char *)0=0;
+}
+
 static inline void a_or_l(volatile void *p, long v)
 {
 	a_or(p, v);
diff --git a/arch/arm/pthread_arch.h b/arch/arm/pthread_arch.h
index 6d9dc3a..e72f74d 100644
--- a/arch/arm/pthread_arch.h
+++ b/arch/arm/pthread_arch.h
@@ -10,9 +10,17 @@ static inline __attribute__((const)) pthread_t __pthread_self()
 
 #else
 
-typedef char *(*__ptr_func_t)(void) __attribute__((const));
-#define __pthread_self() \
-	((pthread_t)(((__ptr_func_t)0xffff0fe0)()+8-sizeof(struct pthread)))
+static inline __attribute__((const)) pthread_t __pthread_self()
+{
+#ifdef __clang__
+	char *p;
+	__asm__( "bl __a_gettp\n\tmov %0,r0" : "=r"(p) : : "cc", "r0", "ip", "lr" );
+#else
+	register char *p __asm__("r0");
+	__asm__( "bl __a_gettp" : "=r"(p) : : "cc", "ip", "lr" );
+#endif
+	return (void *)(p+8-sizeof(struct pthread));
+}
 
 #endif
 
diff --git a/arch/arm/src/__set_thread_area.c b/arch/arm/src/__set_thread_area.c
index e69de29..680510e 100644
--- a/arch/arm/src/__set_thread_area.c
+++ b/arch/arm/src/__set_thread_area.c
@@ -0,0 +1,50 @@
+#include <stdint.h>
+#include <elf.h>
+#include "pthread_impl.h"
+#include "libc.h"
+
+#define HWCAP_TLS (1 << 15)
+
+extern const unsigned char __attribute__((__visibility__("hidden")))
+	__a_barrier_dummy[], __a_barrier_oldkuser[],
+	__a_barrier_v6[], __a_barrier_v7[],
+	__a_cas_dummy[], __a_cas_v6[], __a_cas_v7[],
+	__a_gettp_dummy[], __a_gettp_native[];
+
+#define __a_barrier_kuser 0xffff0fa0
+#define __a_cas_kuser 0xffff0fc0
+#define __a_gettp_kuser 0xffff0fe0
+
+extern uintptr_t __attribute__((__visibility__("hidden")))
+	__a_barrier_ptr, __a_cas_ptr, __a_gettp_ptr;
+
+#define SET(op,ver) (__a_##op##_ptr = \
+	(uintptr_t)__a_##op##_##ver - (uintptr_t)__a_##op##_dummy)
+
+int __set_thread_area(void *p)
+{
+#if !__ARM_ARCH_7A__ && !__ARM_ARCH_7R__ && __ARM_ARCH < 7
+	if (__hwcap & HWCAP_TLS) {
+		size_t *aux;
+		SET(gettp, native);
+		SET(cas, v7);
+		SET(barrier, v7);
+		for (aux=libc.auxv; *aux; aux+=2) {
+			if (*aux != AT_PLATFORM) continue;
+			const char *s = (void *)aux[1];
+			if (s[0]!='v' || s[1]!='6' || s[2]-'0'<10u) break;
+			SET(cas, v6);
+			SET(barrier, v6);
+			break;
+		}
+	} else {
+		int ver = *(int *)0xffff0ffc;
+		SET(gettp, kuser);
+		SET(cas, kuser);
+		SET(barrier, kuser);
+		if (ver < 2) a_crash();
+		if (ver < 3) SET(barrier, oldkuser);
+	}
+#endif
+	return __syscall(0xf0005, p);
+}
diff --git a/arch/arm/src/arm/atomics.s b/arch/arm/src/arm/atomics.s
index e69de29..93e5928 100644
--- a/arch/arm/src/arm/atomics.s
+++ b/arch/arm/src/arm/atomics.s
@@ -0,0 +1,119 @@
+.text
+
+.global __a_barrier
+.hidden __a_barrier
+.type __a_barrier,%function
+__a_barrier:
+	ldr ip,1f
+	ldr ip,[pc,ip]
+	add pc,pc,ip
+1:	.word __a_barrier_ptr-1b
+.global __a_barrier_dummy
+.hidden __a_barrier_dummy
+__a_barrier_dummy:
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+.global __a_barrier_oldkuser
+.hidden __a_barrier_oldkuser
+__a_barrier_oldkuser:
+	push {r0,r1,r2,r3,ip,lr}
+	mov r1,r0
+	mov r2,sp
+	ldr ip,=0xffff0fc0
+	mov lr,pc
+	mov pc,ip
+	pop {r0,r1,r2,r3,ip,lr}
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+.global __a_barrier_v6
+.hidden __a_barrier_v6
+__a_barrier_v6:
+	mcr p15,0,r0,c7,c10,5
+	bx lr
+.global __a_barrier_v7
+.hidden __a_barrier_v7
+__a_barrier_v7:
+	.word 0xf57ff05b        /* dmb ish */
+	bx lr
+
+.global __a_cas
+.hidden __a_cas
+.type __a_cas,%function
+__a_cas:
+	ldr ip,1f
+	ldr ip,[pc,ip]
+	add pc,pc,ip
+1:	.word __a_cas_ptr-1b
+.global __a_cas_dummy
+.hidden __a_cas_dummy
+__a_cas_dummy:
+	mov r3,r0
+	ldr r0,[r2]
+	subs r0,r3,r0
+	streq r1,[r2]
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+.global __a_cas_v6
+.hidden __a_cas_v6
+__a_cas_v6:
+	mov r3,r0
+	mcr p15,0,r0,c7,c10,5
+1:	.word 0xe1920f9f        /* ldrex r0,[r2] */
+	subs r0,r3,r0
+	.word 0x01820f91        /* strexeq r0,r1,[r2] */
+	teqeq r0,#1
+	beq 1b
+	mcr p15,0,r0,c7,c10,5
+	bx lr
+.global __a_cas_v7
+.hidden __a_cas_v7
+__a_cas_v7:
+	mov r3,r0
+	.word 0xf57ff05b        /* dmb ish */
+1:	.word 0xe1920f9f        /* ldrex r0,[r2] */
+	subs r0,r3,r0
+	.word 0x01820f91        /* strexeq r0,r1,[r2] */
+	teqeq r0,#1
+	beq 1b
+	.word 0xf57ff05b        /* dmb ish */
+	bx lr
+
+.global __a_gettp
+.hidden __a_gettp
+.type __a_gettp,%function
+__a_gettp:
+	ldr ip,1f
+	ldr ip,[pc,ip]
+	add pc,pc,ip
+1:	.word __a_gettp_ptr-1b
+.global __a_gettp_dummy
+.hidden __a_gettp_dummy
+__a_gettp_dummy:
+	.word 0xe7fddef1
+	tst lr,#1
+	moveq pc,lr
+	bx lr
+.global __a_gettp_native
+.hidden __a_gettp_native
+__a_gettp_native:
+	mrc p15,0,r0,c13,c0,3
+	bx lr
+
+.data
+.global __a_barrier_ptr
+.hidden __a_barrier_ptr
+__a_barrier_ptr:
+	.word 0
+
+.global __a_cas_ptr
+.hidden __a_cas_ptr
+__a_cas_ptr:
+	.word 0
+
+.global __a_gettp_ptr
+.hidden __a_gettp_ptr
+__a_gettp_ptr:
+	.word 0
diff --git a/src/ldso/dynlink.c b/src/ldso/dynlink.c
index c90fe99..1de430c 100644
--- a/src/ldso/dynlink.c
+++ b/src/ldso/dynlink.c
@@ -1126,6 +1126,7 @@ void *__dynlink(int argc, char **argv)
 		libc.secure = 1;
 	}
 	libc.page_size = aux[AT_PAGESZ];
+	libc.auxv = auxv;
 
 	/* If the dynamic linker was invoked as a program itself, AT_BASE
 	 * will not be set. In that case, we assume the base address is
diff --git a/src/thread/arm/__set_thread_area.s b/src/thread/arm/__set_thread_area.s
index 63d8884..4a4cd0d 100644
--- a/src/thread/arm/__set_thread_area.s
+++ b/src/thread/arm/__set_thread_area.s
@@ -1,12 +1 @@
-.text
-.global __set_thread_area
-.type   __set_thread_area,%function
-__set_thread_area:
-	mov r1,r7
-	mov r7,#0x0f0000
-	add r7,r7,#5
-	svc 0
-	mov r7,r1
-	tst lr,#1
-	moveq pc,lr
-	bx lr
+/* Replaced by C code in arch/arm/src */


* Re: [PATCH] ARM atomics overhaul, try 2
From: Rich Felker @ 2014-11-19  6:12 UTC
  To: musl

On Tue, Nov 18, 2014 at 02:15:45PM -0500, Rich Felker wrote:
> Here's a new version of the ARM atomics overhaul patch which I'm much
> happier with. Whereas the old version imposed a heavy address
> [...]

Committed, with minor changes.

Rich

