mailing list of musl libc
* [PATCH] inline llsc atomics when compiling for sh4a
@ 2015-05-17 18:55 Bobby Bingham
  2015-05-18  2:34 ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Bobby Bingham @ 2015-05-17 18:55 UTC (permalink / raw)
  To: musl


[-- Attachment #1.1: Type: text/plain, Size: 47 bytes --]

As suggested by Rich on IRC.

--
Bobby Bingham

[-- Attachment #1.2: 0001-inline-llsc-atomics-when-building-for-sh4a.patch --]
[-- Type: text/x-diff, Size: 7796 bytes --]

From d49efad3d8cce0c8ba0bd72f572d75d4f8a1d949 Mon Sep 17 00:00:00 2001
From: Bobby Bingham <koorogi@koorogi.info>
Date: Sun, 17 May 2015 13:46:38 -0500
Subject: [PATCH] inline llsc atomics when building for sh4a

If we're building for sh4a, the compiler is already free to use
instructions only available on sh4a, so we can do the same and inline the
llsc atomics. If we're building for an older processor, we still do the
same runtime atomics selection as before.
---
 arch/sh/atomic.h     |  83 +++++++++++++++++++++++++++++++
 arch/sh/src/atomic.c | 135 +++++++++++++++++----------------------------------
 2 files changed, 128 insertions(+), 90 deletions(-)

diff --git a/arch/sh/atomic.h b/arch/sh/atomic.h
index a1d22e4..f2e6dac 100644
--- a/arch/sh/atomic.h
+++ b/arch/sh/atomic.h
@@ -22,6 +22,88 @@ static inline int a_ctz_64(uint64_t x)
 	return a_ctz_l(y);
 }
 
+#define LLSC_CLOBBERS "r0", "t", "memory"
+#define LLSC_START(mem) "synco\n"  \
+	"0:	movli.l @" mem ", r0\n"
+#define LLSC_END(mem)              \
+	"1:	movco.l r0, @" mem "\n"    \
+	"	bf 0b\n"                   \
+	"	synco\n"
+
+static inline int __sh_cas_llsc(volatile int *p, int t, int s)
+{
+	int old;
+	__asm__ __volatile__(
+		LLSC_START("%1")
+		"	mov r0, %0\n"
+		"	cmp/eq %0, %2\n"
+		"	bf 1f\n"
+		"	mov %3, r0\n"
+		LLSC_END("%1")
+		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
+	return old;
+}
+
+static inline int __sh_swap_llsc(volatile int *x, int v)
+{
+	int old;
+	__asm__ __volatile__(
+		LLSC_START("%1")
+		"	mov r0, %0\n"
+		"	mov %2, r0\n"
+		LLSC_END("%1")
+		: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
+	return old;
+}
+
+static inline int __sh_fetch_add_llsc(volatile int *x, int v)
+{
+	int old;
+	__asm__ __volatile__(
+		LLSC_START("%1")
+		"	mov r0, %0\n"
+		"	add %2, r0\n"
+		LLSC_END("%1")
+		: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
+	return old;
+}
+
+static inline void __sh_store_llsc(volatile int *p, int x)
+{
+	__asm__ __volatile__(
+		"	synco\n"
+		"	mov.l %1, @%0\n"
+		"	synco\n"
+		: : "r"(p), "r"(x) : "memory");
+}
+
+static inline void __sh_and_llsc(volatile int *x, int v)
+{
+	__asm__ __volatile__(
+		LLSC_START("%0")
+		"	and %1, r0\n"
+		LLSC_END("%0")
+		: : "r"(x), "r"(v) : LLSC_CLOBBERS);
+}
+
+static inline void __sh_or_llsc(volatile int *x, int v)
+{
+	__asm__ __volatile__(
+		LLSC_START("%0")
+		"	or %1, r0\n"
+		LLSC_END("%0")
+		: : "r"(x), "r"(v) : LLSC_CLOBBERS);
+}
+
+#ifdef __SH4A__
+#define a_cas(p,t,s)     __sh_cas_llsc(p,t,s)
+#define a_swap(x,v)      __sh_swap_llsc(x,v)
+#define a_fetch_add(x,v) __sh_fetch_add_llsc(x, v)
+#define a_store(x,v)     __sh_store_llsc(x, v)
+#define a_and(x,v)       __sh_and_llsc(x, v)
+#define a_or(x,v)        __sh_or_llsc(x, v)
+#else
+
 int  __sh_cas(volatile int *, int, int);
 int  __sh_swap(volatile int *, int);
 int  __sh_fetch_add(volatile int *, int);
@@ -35,6 +117,7 @@ void __sh_or(volatile int *, int);
 #define a_store(x,v)     __sh_store(x, v)
 #define a_and(x,v)       __sh_and(x, v)
 #define a_or(x,v)        __sh_or(x, v)
+#endif
 
 static inline void *a_cas_p(volatile void *p, void *t, void *s)
 {
diff --git a/arch/sh/src/atomic.c b/arch/sh/src/atomic.c
index 1339567..f8c615f 100644
--- a/arch/sh/src/atomic.c
+++ b/arch/sh/src/atomic.c
@@ -1,12 +1,7 @@
-#include "libc.h"
+#ifndef __SH4A__
 
-#define LLSC_CLOBBERS   "r0", "t", "memory"
-#define LLSC_START(mem) "synco\n"  \
-	"0:	movli.l @" mem ", r0\n"
-#define LLSC_END(mem)              \
-	"1:	movco.l r0, @" mem "\n"    \
-	"	bf 0b\n"                   \
-	"	synco\n"
+#include "atomic.h"
+#include "libc.h"
 
 /* gusa is a hack in the kernel which lets you create a sequence of instructions
  * which will be restarted if the process is preempted in the middle of the
@@ -34,114 +29,74 @@
 
 int __sh_cas(volatile int *p, int t, int s)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_cas_llsc(p, t, s);
+
 	int old;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%1")
-			"	mov r0, %0\n"
-			"	cmp/eq %0, %2\n"
-			"	bf 1f\n"
-			"	mov %3, r0\n"
-			LLSC_END("%1")
-			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_EVEN("%1", "%0")
-			"	cmp/eq %0, %2\n"
-			"	bf 1f\n"
-			GUSA_END("%1", "%3")
-			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
-	}
+	__asm__ __volatile__(
+		GUSA_START_EVEN("%1", "%0")
+		"	cmp/eq %0, %2\n"
+		"	bf 1f\n"
+		GUSA_END("%1", "%3")
+		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
 	return old;
 }
 
 int __sh_swap(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_swap_llsc(x, v);
+
 	int old;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%1")
-			"	mov r0, %0\n"
-			"	mov %2, r0\n"
-			LLSC_END("%1")
-			: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_EVEN("%1", "%0")
-			GUSA_END("%1", "%2")
-			: "=&r"(old) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_EVEN("%1", "%0")
+		GUSA_END("%1", "%2")
+		: "=&r"(old) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 	return old;
 }
 
 int __sh_fetch_add(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_fetch_add_llsc(x, v);
+
 	int old, dummy;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%1")
-			"	mov r0, %0\n"
-			"	add %2, r0\n"
-			LLSC_END("%1")
-			: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_EVEN("%2", "%0")
-			"	mov %0, %1\n"
-			"	add %3, %1\n"
-			GUSA_END("%2", "%1")
-			: "=&r"(old), "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_EVEN("%2", "%0")
+		"	mov %0, %1\n"
+		"	add %3, %1\n"
+		GUSA_END("%2", "%1")
+		: "=&r"(old), "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 	return old;
 }
 
 void __sh_store(volatile int *p, int x)
 {
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			"	synco\n"
-			"	mov.l %1, @%0\n"
-			"	synco\n"
-			: : "r"(p), "r"(x) : "memory");
-	} else {
-		__asm__ __volatile__(
-			"	mov.l %1, @%0\n"
-			: : "r"(p), "r"(x) : "memory");
-	}
+	if (__hwcap & CPU_HAS_LLSC) return __sh_store_llsc(p, x);
+	__asm__ __volatile__(
+		"	mov.l %1, @%0\n"
+		: : "r"(p), "r"(x) : "memory");
 }
 
 void __sh_and(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_and_llsc(x, v);
+
 	int dummy;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%0")
-			"	and %1, r0\n"
-			LLSC_END("%0")
-			: : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_ODD("%1", "%0")
-			"	and %2, %0\n"
-			GUSA_END("%1", "%0")
-			: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_ODD("%1", "%0")
+		"	and %2, %0\n"
+		GUSA_END("%1", "%0")
+		: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 }
 
 void __sh_or(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_or_llsc(x, v);
+
 	int dummy;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%0")
-			"	or %1, r0\n"
-			LLSC_END("%0")
-			: : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_ODD("%1", "%0")
-			"	or %2, %0\n"
-			GUSA_END("%1", "%0")
-			: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_ODD("%1", "%0")
+		"	or %2, %0\n"
+		GUSA_END("%1", "%0")
+		: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 }
+
+#endif
-- 
2.3.3


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]


* Re: [PATCH] inline llsc atomics when compiling for sh4a
  2015-05-17 18:55 [PATCH] inline llsc atomics when compiling for sh4a Bobby Bingham
@ 2015-05-18  2:34 ` Rich Felker
  2015-05-18  4:28   ` Rich Felker
  2015-05-18 22:56   ` Bobby Bingham
  0 siblings, 2 replies; 7+ messages in thread
From: Rich Felker @ 2015-05-18  2:34 UTC (permalink / raw)
  To: musl

On Sun, May 17, 2015 at 01:55:16PM -0500, Bobby Bingham wrote:
> If we're building for sh4a, the compiler is already free to use
> instructions only available on sh4a, so we can do the same and inline the
> llsc atomics. If we're building for an older processor, we still do the
> same runtime atomics selection as before.

Thanks! I think it's ok for commit as-is, but based on re-reading this
code I have some ideas for improving it that are orthogonal to this
change. See comments inline:

> ---
>  arch/sh/atomic.h     |  83 +++++++++++++++++++++++++++++++
>  arch/sh/src/atomic.c | 135 +++++++++++++++++----------------------------------
>  2 files changed, 128 insertions(+), 90 deletions(-)
> 
> diff --git a/arch/sh/atomic.h b/arch/sh/atomic.h
> index a1d22e4..f2e6dac 100644
> --- a/arch/sh/atomic.h
> +++ b/arch/sh/atomic.h
> @@ -22,6 +22,88 @@ static inline int a_ctz_64(uint64_t x)
>  	return a_ctz_l(y);
>  }
>  
> +#define LLSC_CLOBBERS "r0", "t", "memory"
> +#define LLSC_START(mem) "synco\n"  \
> +	"0:	movli.l @" mem ", r0\n"
> +#define LLSC_END(mem)              \
> +	"1:	movco.l r0, @" mem "\n"    \
> +	"	bf 0b\n"                   \
> +	"	synco\n"
> +
> +static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> +{
> +	int old;
> +	__asm__ __volatile__(
> +		LLSC_START("%1")
> +		"	mov r0, %0\n"
> +		"	cmp/eq %0, %2\n"
> +		"	bf 1f\n"
> +		"	mov %3, r0\n"
> +		LLSC_END("%1")
> +		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
> +	return old;
> +}

The mov from r0 to %0 seems unnecessary here. Presumably it's because
you didn't have a constraint to force old to come out via r0. Could
you do the following?

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	register int old __asm__("r0");
	__asm__ __volatile__(
		LLSC_START("%1")
		"	cmp/eq r0, %2\n"
		"	bf 1f\n"
		"	mov %3, r0\n"
		LLSC_END("%1")
		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : "t", "memory");
	return old;
}

Note that I had to drop LLSC_CLOBBERS because "r0" was in it and you
can't clobber an output register.

I've actually always wondered about the value of having the LLSC_*
macros. I usually prefer for the whole asm to be written out
explicitly and readable unless there's a compelling reason to wrap it
up in macros. Then it would look like:

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	register int old __asm__("r0");
	__asm__ __volatile__(
		"	synco\n"
		"0:	movli.l @%1, r0\n"
		"	cmp/eq r0, %2\n"
		"	bf 1f\n"
		"	mov %3, r0\n"
		"1:	movco.l r0, @%1\n"
		"	bf 0b\n"
		"	synco\n"
		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : "t", "memory");
	return old;
}

and similar for other functions. Part of the motivation of not hiding
the outer logic in macros is that it might make it possible to
fold/simplify some special cases like I did above for CAS.

Another idea is letting the compiler simplify, with something like the
following, which could actually be used cross-platform for all
llsc-type archs:

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	do old = llsc_start(p);
	while (*p == t && !llsc_end(p, s));
	return old;
}

Here llsc_start and llsc_end would be inline functions using asm with
appropriate constraints. Unfortunately I don't see a way to model
using the value of the truth flag "t" as the output of the asm for
llsc_end, though. I suspect this would be a problem on a number of
other archs too; the asm would have to waste an instruction (or
several) converting the flag to an integer. Unless there's a solution
to that problem, it makes an approach like this less appealing.
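
For concreteness, a rough sketch of what such helpers might look like on
sh (the names, constraints, and barrier placement here are purely
illustrative; the movt in llsc_end is the wasted flag-to-integer
conversion just described):

static inline int llsc_start(volatile int *p)
{
	register int old __asm__("r0");
	__asm__ __volatile__(
		"	synco\n"
		"	movli.l @%1, r0\n"
		: "=&r"(old) : "r"(p) : "memory");
	return old;
}

static inline int llsc_end(volatile int *p, int v)
{
	int r;
	__asm__ __volatile__(
		"	movco.l %2, @%1\n"
		"	movt %0\n"
		: "=r"(r) : "r"(p), "z"(v) : "t", "memory");
	return r;
}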

>  int __sh_cas(volatile int *p, int t, int s)
>  {
> +	if (__hwcap & CPU_HAS_LLSC) return __sh_cas_llsc(p, t, s);
> +
>  	int old;
> -	if (__hwcap & CPU_HAS_LLSC) {
> -		__asm__ __volatile__(
> -			LLSC_START("%1")
> -			"	mov r0, %0\n"
> -			"	cmp/eq %0, %2\n"
> -			"	bf 1f\n"
> -			"	mov %3, r0\n"
> -			LLSC_END("%1")
> -			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
> -	} else {
> -		__asm__ __volatile__(
> -			GUSA_START_EVEN("%1", "%0")
> -			"	cmp/eq %0, %2\n"
> -			"	bf 1f\n"
> -			GUSA_END("%1", "%3")
> -			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
> -	}
> +	__asm__ __volatile__(
> +		GUSA_START_EVEN("%1", "%0")
> +		"	cmp/eq %0, %2\n"
> +		"	bf 1f\n"
> +		GUSA_END("%1", "%3")
> +		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
>  	return old;
>  }

For the GUSA stuff, do you really need the ODD/EVEN macros? I think
you could just add appropriate .align inside that would cause the
assembler to insert a nop if necessary.

Rich



* Re: [PATCH] inline llsc atomics when compiling for sh4a
  2015-05-18  2:34 ` Rich Felker
@ 2015-05-18  4:28   ` Rich Felker
  2015-05-18 22:56   ` Bobby Bingham
  1 sibling, 0 replies; 7+ messages in thread
From: Rich Felker @ 2015-05-18  4:28 UTC (permalink / raw)
  To: musl

On Sun, May 17, 2015 at 10:34:02PM -0400, Rich Felker wrote:
> Another idea is letting the compiler simplify, with something like the
> following, which could actually be used cross-platform for all
> llsc-type archs:
> 
> static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> {
> 	do old = llsc_start(p);
> 	while (*p == t && !llsc_end(p, s));
> 	return old;
> }
> 
> Here llsc_start and llsc_end would be inline functions using asm with
> appropriate constraints. Unfortunately I don't see a way to model
> using the value of the truth flag "t" as the output of the asm for
> llsc_end, though. I suspect this would be a problem on a number of
> other archs too; the asm would have to waste an instruction (or
> several) converting the flag to an integer. Unless there's a solution
> to that problem, it makes an approach like this less appealing.

Indeed, I can't find a way to make it work without two useless
instructions. This is what I get (for an extern function a_cas based
on the above approach):

a_cas:
        .align 2
.L4:
#APP
        synco ; mov.li @r4, r0
#NO_APP
        cmp/eq  r0,r5
        bf/s    .L7
        mov     r0,r1
        mov     r6,r0
#APP
        mov.co r0, @r4 ; movt r2
#NO_APP
        tst     r2,r2
        bt      .L4
.L7:
        rts
        mov     r1,r0

Both the movt (which I had to write) and the tst (which the compiler
generated) are useless. It's a shame, because otherwise this approach
is really clean and elegant.
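
For reference, an extern a_cas built on hypothetical llsc_start/llsc_end
helpers along the lines sketched in my previous mail, which compiles to
roughly the listing above, looks something like:

int a_cas(volatile int *p, int t, int s)
{
	int old;
	do old = llsc_start(p);
	while (old == t && !llsc_end(p, s));
	return old;
}

(The movt/tst pair corresponds to materializing the sc result and then
re-testing it for the loop condition.)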

Rich



* Re: [PATCH] inline llsc atomics when compiling for sh4a
  2015-05-18  2:34 ` Rich Felker
  2015-05-18  4:28   ` Rich Felker
@ 2015-05-18 22:56   ` Bobby Bingham
  2015-05-19  0:30     ` Rich Felker
  2015-05-19  4:52     ` Rich Felker
  1 sibling, 2 replies; 7+ messages in thread
From: Bobby Bingham @ 2015-05-18 22:56 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 5970 bytes --]

On Sun, May 17, 2015 at 10:34:02PM -0400, Rich Felker wrote:
> On Sun, May 17, 2015 at 01:55:16PM -0500, Bobby Bingham wrote:
> > If we're building for sh4a, the compiler is already free to use
> > instructions only available on sh4a, so we can do the same and inline the
> > llsc atomics. If we're building for an older processor, we still do the
> > same runtime atomics selection as before.
>
> Thanks! I think it's ok for commit as-is, but based on re-reading this
> code I have some ideas for improving it that are orthogonal to this
> change. See comments inline:

Would you prefer I resend this patch to remove the LLSC_* macros at the
same time, or another patch to remove them separately?

>
> > ---
> >  arch/sh/atomic.h     |  83 +++++++++++++++++++++++++++++++
> >  arch/sh/src/atomic.c | 135 +++++++++++++++++----------------------------------
> >  2 files changed, 128 insertions(+), 90 deletions(-)
> >
> > diff --git a/arch/sh/atomic.h b/arch/sh/atomic.h
> > index a1d22e4..f2e6dac 100644
> > --- a/arch/sh/atomic.h
> > +++ b/arch/sh/atomic.h
> > @@ -22,6 +22,88 @@ static inline int a_ctz_64(uint64_t x)
> >  	return a_ctz_l(y);
> >  }
> >
> > +#define LLSC_CLOBBERS "r0", "t", "memory"
> > +#define LLSC_START(mem) "synco\n"  \
> > +	"0:	movli.l @" mem ", r0\n"
> > +#define LLSC_END(mem)              \
> > +	"1:	movco.l r0, @" mem "\n"    \
> > +	"	bf 0b\n"                   \
> > +	"	synco\n"
> > +
> > +static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> > +{
> > +	int old;
> > +	__asm__ __volatile__(
> > +		LLSC_START("%1")
> > +		"	mov r0, %0\n"
> > +		"	cmp/eq %0, %2\n"
> > +		"	bf 1f\n"
> > +		"	mov %3, r0\n"
> > +		LLSC_END("%1")
> > +		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
> > +	return old;
> > +}
>
> The mov from r0 to %0 seems unnecessary here. Presumably it's because
> you didn't have a constraint to force old to come out via r0. Could
> you do the following?

No, because the "mov %3, r0" a couple instructions down clobbers r0, and
this is necessary because movco.l only accept r0 as the source operand.

>
> static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> {
> 	register int old __asm__("r0");
> 	__asm__ __volatile__(
> 		LLSC_START("%1")
> 		"	cmp/eq r0, %2\n"
> 		"	bf 1f\n"
> 		"	mov %3, r0\n"
> 		LLSC_END("%1")
> 		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : "t", "memory");
> 	return old;
> }
>
> Note that I had to drop LLSC_CLOBBERS because "r0" was in it and you
> can't clobber an output register.
>
> I've actually always wondered about the value of having the LLSC_*
> macros. I usually prefer for the whole asm to be written out
> explicitly and readable unless there's a compelling reason to wrap it
> up in macros. Then it would look like:

I think it made a bigger difference for the gusa version, so I mostly
did it with llsc for consistency.  And before this patch, when the gusa
and llsc versions were side by side in the same function, it made it
easier for me to verify both versions were doing the same thing as I
wrote them.

Now that the llsc version is moving, I'm less attached to the LLSC_*
macros.  I do think the gusa stuff is ugly and magical enough that I'd
still prefer to keep it hidden away, if you don't object.

>
> static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> {
> 	register int old __asm__("r0");
> 	__asm__ __volatile__(
> 		"	synco\n"
> 		"0:	movli.l @%1, r0\n"
> 		"	cmp/eq r0, %2\n"
> 		"	bf 1f\n"
> 		"	mov %3, r0\n"
> 		"1:	movco.l r0, @%1\n"
> 		"	bf 0b\n"
> 		"	synco\n"
> 		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : "t", "memory");
> 	return old;
> }
>
> and similar for other functions. Part of the motivation of not hiding
> the outer logic in macros is that it might make it possible to
> fold/simplify some special cases like I did above for CAS.

I don't mind in principle, but I think the fact that movco.l requires
its input be in r0 is going to mean there's not actually any
simplification you can do.

>
> Another idea is letting the compiler simplify, with something like the
> following, which could actually be used cross-platform for all
> llsc-type archs:
>
> static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> {
> 	do old = llsc_start(p);
> 	while (*p == t && !llsc_end(p, s));
> 	return old;
> }
>
> Here llsc_start and llsc_end would be inline functions using asm with
> appropriate constraints. Unfortunately I don't see a way to model
> using the value of the truth flag "t" as the output of the asm for
> llsc_end, though. I suspect this would be a problem on a number of
> other archs too; the asm would have to waste an instruction (or
> several) converting the flag to an integer. Unless there's a solution
> to that problem, it makes an approach like this less appealing.

I agree this would be even nicer if we could make it work.

>
> >  int __sh_cas(volatile int *p, int t, int s)
> >  {
> > +	if (__hwcap & CPU_HAS_LLSC) return __sh_cas_llsc(p, t, s);
> > +
> >  	int old;
> > -	if (__hwcap & CPU_HAS_LLSC) {
> > -		__asm__ __volatile__(
> > -			LLSC_START("%1")
> > -			"	mov r0, %0\n"
> > -			"	cmp/eq %0, %2\n"
> > -			"	bf 1f\n"
> > -			"	mov %3, r0\n"
> > -			LLSC_END("%1")
> > -			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
> > -	} else {
> > -		__asm__ __volatile__(
> > -			GUSA_START_EVEN("%1", "%0")
> > -			"	cmp/eq %0, %2\n"
> > -			"	bf 1f\n"
> > -			GUSA_END("%1", "%3")
> > -			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
> > -	}
> > +	__asm__ __volatile__(
> > +		GUSA_START_EVEN("%1", "%0")
> > +		"	cmp/eq %0, %2\n"
> > +		"	bf 1f\n"
> > +		GUSA_END("%1", "%3")
> > +		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
> >  	return old;
> >  }
>
> > For the GUSA stuff, do you really need the ODD/EVEN macros? I think
> you could just add appropriate .align inside that would cause the
> assembler to insert a nop if necessary.

Should be possible.  I'll work on it and send another patch.

>
> Rich

--
Bobby Bingham

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]


* Re: [PATCH] inline llsc atomics when compiling for sh4a
  2015-05-18 22:56   ` Bobby Bingham
@ 2015-05-19  0:30     ` Rich Felker
  2015-05-19  2:12       ` Rich Felker
  2015-05-19  4:52     ` Rich Felker
  1 sibling, 1 reply; 7+ messages in thread
From: Rich Felker @ 2015-05-19  0:30 UTC (permalink / raw)
  To: musl

On Mon, May 18, 2015 at 05:56:18PM -0500, Bobby Bingham wrote:
> On Sun, May 17, 2015 at 10:34:02PM -0400, Rich Felker wrote:
> > On Sun, May 17, 2015 at 01:55:16PM -0500, Bobby Bingham wrote:
> > > If we're building for sh4a, the compiler is already free to use
> > > instructions only available on sh4a, so we can do the same and inline the
> > > llsc atomics. If we're building for an older processor, we still do the
> > > same runtime atomics selection as before.
> >
> > Thanks! I think it's ok for commit as-is, but based on re-reading this
> > code I have some ideas for improving it that are orthogonal to this
> > change. See comments inline:
> 
> Would you prefer I resend this patch to remove the LLSC_* macros at the
> same time, or another patch to remove them separately?

No, let's do that separately. I like keeping independent changes
separate in commits. I've tested the current patch already and it
seems to be fine. If we do the second LLSC_* removal patch it
shouldn't affect the generated binaries, which is easy to verify.

> 
> >
> > > ---
> > >  arch/sh/atomic.h     |  83 +++++++++++++++++++++++++++++++
> > >  arch/sh/src/atomic.c | 135 +++++++++++++++++----------------------------------
> > >  2 files changed, 128 insertions(+), 90 deletions(-)
> > >
> > > diff --git a/arch/sh/atomic.h b/arch/sh/atomic.h
> > > index a1d22e4..f2e6dac 100644
> > > --- a/arch/sh/atomic.h
> > > +++ b/arch/sh/atomic.h
> > > @@ -22,6 +22,88 @@ static inline int a_ctz_64(uint64_t x)
> > >  	return a_ctz_l(y);
> > >  }
> > >
> > > +#define LLSC_CLOBBERS "r0", "t", "memory"
> > > +#define LLSC_START(mem) "synco\n"  \
> > > +	"0:	movli.l @" mem ", r0\n"
> > > +#define LLSC_END(mem)              \
> > > +	"1:	movco.l r0, @" mem "\n"    \
> > > +	"	bf 0b\n"                   \
> > > +	"	synco\n"
> > > +
> > > +static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> > > +{
> > > +	int old;
> > > +	__asm__ __volatile__(
> > > +		LLSC_START("%1")
> > > +		"	mov r0, %0\n"
> > > +		"	cmp/eq %0, %2\n"
> > > +		"	bf 1f\n"
> > > +		"	mov %3, r0\n"
> > > +		LLSC_END("%1")
> > > +		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
> > > +	return old;
> > > +}
> >
> > The mov from r0 to %0 seems unnecessary here. Presumably it's because
> > you didn't have a constraint to force old to come out via r0. Could
> > you do the following?
> 
> No, because the "mov %3, r0" a couple of instructions down clobbers r0, and
> this is necessary because movco.l only accepts r0 as the source operand.

Oh, I see. We lose the old value that the function needs to return. So
a version of cas that just returns success/failure rather than the old
value would be able to omit it, but musl's can't, right? I should keep
that in mind, since at some point, if I can determine that the old
value isn't important anywhere, I might consider changing the a_cas
API to just have success/failure as its result. This is mildly cheaper
on some other archs too, I think.
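
As a rough, purely hypothetical sketch of that idea (this is not musl's
current a_cas API), a boolean-result CAS on sh could drop the extra mov
entirely, though it still needs a movt to turn the flag into a return
value:

static inline int a_cas_bool(volatile int *p, int t, int s)
{
	/* returns 1 on success, 0 on mismatch; the barrier on the
	 * failure path is glossed over here */
	register int r __asm__("r0");
	__asm__ __volatile__(
		"	synco\n"
		"0:	movli.l @%1, r0\n"
		"	cmp/eq r0, %2\n"
		"	bf 1f\n"
		"	mov %3, r0\n"
		"	movco.l r0, @%1\n"
		"	bf 0b\n"
		"	synco\n"
		"1:	movt %0\n"
		: "=&r"(r) : "r"(p), "r"(t), "r"(s) : "t", "memory");
	return r;
}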

> > I've actually always wondered about the value of having the LLSC_*
> > macros. I usually prefer for the whole asm to be written out
> > explicitly and readable unless there's a compelling reason to wrap it
> > up in macros. Then it would look like:
> 
> I think it made a bigger difference for the gusa version, so I mostly
> did it with llsc for consistency.  And before this patch, when the gusa
> and llsc versions were side by side in the same function, it made it
> easier for me to verify both versions were doing the same thing as I
> wrote them.
> 
> Now that the llsc version is moving, I'm less attached to the LLSC_*
> macros.  I do think the gusa stuff is ugly and magical enough that I'd
> still prefer to keep it hidden away, if you don't object.

Yeah, I don't mind.

> > static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> > {
> > 	register int old __asm__("r0");
> > 	__asm__ __volatile__(
> > 		"	synco\n"
> > 		"0:	movli.l @%1, r0\n"
> > 		"	cmp/eq r0, %2\n"
> > 		"	bf 1f\n"
> > 		"	mov %3, r0\n"
> > 		"1:	movco.l r0, @%1\n"
> > 		"	bf 0b\n"
> > 		"	synco\n"
> > 		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : "t", "memory");
> > 	return old;
> > }
> >
> > and similar for other functions. Part of the motivation of not hiding
> > the outer logic in macros is that it might make it possible to
> > fold/simplify some special cases like I did above for CAS.
> 
> I don't mind in principle, but I think the fact that movco.l requires
> its input be in r0 is going to mean there's not actually any
> simplification you can do.

I think you may be right.

> > Another idea is letting the compiler simplify, with something like the
> > following, which could actually be used cross-platform for all
> > llsc-type archs:
> >
> > static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> > {
> > 	do old = llsc_start(p);
> > 	while (*p == t && !llsc_end(p, s));
> > 	return old;
> > }
> >
> > Here llsc_start and llsc_end would be inline functions using asm with
> > appropriate constraints. Unfortunately I don't see a way to model
> > using the value of the truth flag "t" as the output of the asm for
> > llsc_end, though. I suspect this would be a problem on a number of
> > other archs too; the asm would have to waste an instruction (or
> > several) converting the flag to an integer. Unless there's a solution
> > to that problem, it makes an approach like this less appealing.
> 
> I agree this would be even nicer if we could make it work.

Yes. FWIW the above code has a bug. It should be:

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	do old = llsc_start(p);
	while (old == t && !llsc_end(p, s));
	return old;
}

I think there would need to be a barrier before the return statement
too. Embedding the barrier in llsc_end would not work because (1) it
won't get executed on old!=t, and (2) it should be avoided when
llsc_end fails since llsc_start will do a barrier anyway, but if we
put the conditional inside llsc_end's asm then the condition would get
branched-on twice. Issue (2) may be solvable by using a C conditional
inside llsc_end, but that still leaves issue (1), so I think something
like this would be needed:

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	do old = llsc_start(p);
	while (old == t && !llsc_end(p, s));
	a_barrier();
	return old;
}

But that's suboptimal on archs where the 'sc' part of llsc has an
implicit barrier (microblaze and or1k, IIRC). So perhaps the ideal
general version would be:

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	do old = llsc_start(p);
	while (old == t ? !llsc_end(p, s) : (a_barrier(), 0));
	return old;
}

with a barrier in the success path for llsc_end. Alternatively we
could aim to always do the sc:

static inline int __sh_cas_llsc(volatile int *p, int t, int s)
{
	do old = llsc_start(p);
	while (!llsc_end(p, old==t ? s : old));
	return old;
}

This version is structurally analogous to the non-CAS atomics, but
perhaps more costly in the old!=t case.

Anyway, at this point I don't see an efficient way to do the
conditional, so it's mostly a theoretical topic.

> > For the GUSA stuff, do you really need the ODD/EVEN macros? I think
> > you could just add appropriate .align inside that would cause the
> > assembler to insert a nop if necessary.
> 
> Should be possible.  I'll work on it and send another patch.

Thanks!

Rich



* Re: [PATCH] inline llsc atomics when compiling for sh4a
  2015-05-19  0:30     ` Rich Felker
@ 2015-05-19  2:12       ` Rich Felker
  0 siblings, 0 replies; 7+ messages in thread
From: Rich Felker @ 2015-05-19  2:12 UTC (permalink / raw)
  To: musl

On Mon, May 18, 2015 at 08:30:45PM -0400, Rich Felker wrote:
> [...]
> static inline int __sh_cas_llsc(volatile int *p, int t, int s)
> {
> 	do old = llsc_start(p);
> 	while (!llsc_end(p, old==t ? s : old));
> 	return old;
> }
> 
> This version is structurally analogous to the non-CAS atomics, but
> perhaps more costly in the old!=t case.
> 
> Anyway, at this point I don't see an efficient way to do the
> conditional, so it's mostly a theoretical topic.

I have a partial solution but it's utterly hideous:

static inline int llsc_end(volatile int *p, int v)
{
	__asm__ __volatile__ goto ( "mov.co %1, @%0 ; bf %l[fail]" : :
	"r"(p), "z"(v) : "memory" : fail );
	return 1;
fail:
	return 0;
}

This doesn't eliminate the branch in the asm, but it does allow gcc to
eliminate the branch outside the asm. The resulting code is:

a_cas:
        .align 2
.L2:
#APP
        synco ; mov.li @r4, r0
#NO_APP
        cmp/eq  r0,r5
        bf      .L4
        mov     r6,r0
#APP
        mov.co r0, @r4 ; bf .L2
#NO_APP
.L6:
        rts
        mov     r5,r0
        .align 1
.L4:
        bra     .L6
        mov     r0,r5

Of course asm goto is utterly hideous, so I don't propose actually
using this. I doubt any of the non-GCC compilers we support would
accept it.

Rich



* Re: [PATCH] inline llsc atomics when compiling for sh4a
  2015-05-18 22:56   ` Bobby Bingham
  2015-05-19  0:30     ` Rich Felker
@ 2015-05-19  4:52     ` Rich Felker
  1 sibling, 0 replies; 7+ messages in thread
From: Rich Felker @ 2015-05-19  4:52 UTC (permalink / raw)
  To: musl

On Mon, May 18, 2015 at 05:56:18PM -0500, Bobby Bingham wrote:
> On Sun, May 17, 2015 at 10:34:02PM -0400, Rich Felker wrote:
> > On Sun, May 17, 2015 at 01:55:16PM -0500, Bobby Bingham wrote:
> > > If we're building for sh4a, the compiler is already free to use
> > > instructions only available on sh4a, so we can do the same and inline the
> > > llsc atomics. If we're building for an older processor, we still do the
> > > same runtime atomics selection as before.
> >
> > Thanks! I think it's ok for commit as-is, but based on re-reading this
> > code I have some ideas for improving it that are orthogonal to this
> > change. See comments inline:
> 
> Would you prefer I resend this patch to remove the LLSC_* macros at the
> same time, or another patch to remove them separately?

I've committed the original patch.

Rich




Thread overview: 7+ messages
2015-05-17 18:55 [PATCH] inline llsc atomics when compiling for sh4a Bobby Bingham
2015-05-18  2:34 ` Rich Felker
2015-05-18  4:28   ` Rich Felker
2015-05-18 22:56   ` Bobby Bingham
2015-05-19  0:30     ` Rich Felker
2015-05-19  2:12       ` Rich Felker
2015-05-19  4:52     ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/
