mailing list of musl libc
 help / color / mirror / Atom feed
* [musl] [PATCH v2] add qsort_r.
@ 2021-03-09  3:56 Érico Nogueira
  2021-03-09  9:11 ` Alexander Monakov
  0 siblings, 1 reply; 9+ messages in thread
From: Érico Nogueira @ 2021-03-09  3:56 UTC (permalink / raw)
  To: musl; +Cc: Érico Nogueira

since most discussion around the addition of this function has centered
around the possible code duplication it requires or that qsort would
become much slower if implemented as a wrapper around qsort_r, this
commit uses a qsort.c file that's included with the appropriate
definitions in order to generate qsort_r. this avoids
code duplication and the efficiency loss, at the cost of greater binary
size - size(1) reports 602282 vs 600673.

this commit requires that the extra argument used by qsort_r always be
called 'arg', and it should be the first argument of the function, which
allows the usage of variadic macros to call the implementation internal
functions, instead of requiring the addition of multiple macros with
hard to read definitions. it also requires the comparison function
always be called 'cmp'.

some of the other functions which don't deal with 'cmp' or 'arg' can
probably be factored out into a separate file, allowing for possibly
smaller code size, but that hasn't been tried yet.
---

And of course, right after I sent the patch I realized there was a much
smaller diff that could be applied instead. Sorry for the noise.

 include/stdlib.h     |  1 +
 src/stdlib/qsort.c   | 38 ++++++++++++++++++++++++++++++--------
 src/stdlib/qsort_r.c |  2 ++
 3 files changed, 33 insertions(+), 8 deletions(-)
 create mode 100644 src/stdlib/qsort_r.c

diff --git a/include/stdlib.h b/include/stdlib.h
index b54a051f..0c0ced5f 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -158,6 +158,7 @@ struct __locale_struct;
 float strtof_l(const char *__restrict, char **__restrict, struct __locale_struct *);
 double strtod_l(const char *__restrict, char **__restrict, struct __locale_struct *);
 long double strtold_l(const char *__restrict, char **__restrict, struct __locale_struct *);
+void qsort_r (void *, size_t, size_t, int (*)(const void *, const void *, void *), void *);
 #endif
 
 #if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
diff --git a/src/stdlib/qsort.c b/src/stdlib/qsort.c
index da58fd31..16dbb50b 100644
--- a/src/stdlib/qsort.c
+++ b/src/stdlib/qsort.c
@@ -31,7 +31,13 @@
 #include "atomic.h"
 #define ntz(x) a_ctz_l((x))
 
+#ifdef QSORT_R
+typedef int (*cmpfun)(const void *, const void *, void *);
+# define call_cmp(v1,v2) ((*cmp)(v1,v2,arg))
+#else
 typedef int (*cmpfun)(const void *, const void *);
+# define call_cmp(v1,v2) ((*cmp)(v1,v2))
+#endif
 
 static inline int pntz(size_t p[2]) {
 	int r = ntz(p[0] - 1);
@@ -88,7 +94,13 @@ static inline void shr(size_t p[2], int n)
 	p[1] >>= n;
 }
 
-static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size_t lp[])
+#ifdef QSORT_R
+# define sift(...) sift_impl(arg, __VA_ARGS__)
+static void sift_impl(void *arg, unsigned char *head, size_t width, cmpfun cmp, int pshift, size_t lp[])
+#else
+# define sift sift_impl
+static void sift_impl(unsigned char *head, size_t width, cmpfun cmp, int pshift, size_t lp[])
+#endif
 {
 	unsigned char *rt, *lf;
 	unsigned char *ar[14 * sizeof(size_t) + 1];
@@ -99,10 +111,10 @@ static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size
 		rt = head - width;
 		lf = head - width - lp[pshift - 2];
 
-		if((*cmp)(ar[0], lf) >= 0 && (*cmp)(ar[0], rt) >= 0) {
+		if(call_cmp(ar[0], lf) >= 0 && call_cmp(ar[0], rt) >= 0) {
 			break;
 		}
-		if((*cmp)(lf, rt) >= 0) {
+		if(call_cmp(lf, rt) >= 0) {
 			ar[i++] = lf;
 			head = lf;
 			pshift -= 1;
@@ -115,7 +127,13 @@ static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size
 	cycle(width, ar, i);
 }
 
-static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2], int pshift, int trusty, size_t lp[])
+#ifdef QSORT_R
+# define trinkle(...) trinkle_impl(arg, __VA_ARGS__)
+static void trinkle_impl(void *arg, unsigned char *head, size_t width, cmpfun cmp, size_t pp[2], int pshift, int trusty, size_t lp[])
+#else
+# define trinkle trinkle_impl
+static void trinkle_impl(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2], int pshift, int trusty, size_t lp[])
+#endif
 {
 	unsigned char *stepson,
 	              *rt, *lf;
@@ -130,13 +148,13 @@ static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2],
 	ar[0] = head;
 	while(p[0] != 1 || p[1] != 0) {
 		stepson = head - lp[pshift];
-		if((*cmp)(stepson, ar[0]) <= 0) {
+		if(call_cmp(stepson, ar[0]) <= 0) {
 			break;
 		}
 		if(!trusty && pshift > 1) {
 			rt = head - width;
 			lf = head - width - lp[pshift - 2];
-			if((*cmp)(rt, stepson) >= 0 || (*cmp)(lf, stepson) >= 0) {
+			if(call_cmp(rt, stepson) >= 0 || call_cmp(lf, stepson) >= 0) {
 				break;
 			}
 		}
@@ -154,7 +172,11 @@ static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2],
 	}
 }
 
+#ifdef QSORT_R
+void qsort_r(void *base, size_t nel, size_t width, cmpfun cmp, void *arg)
+#else
 void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
+#endif
 {
 	size_t lp[12*sizeof(size_t)];
 	size_t i, size = width * nel;
@@ -182,7 +204,7 @@ void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
 			} else {
 				sift(head, width, cmp, pshift, lp);
 			}
-			
+
 			if(pshift == 1) {
 				shl(p, 1);
 				pshift = 0;
@@ -191,7 +213,7 @@ void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
 				pshift = 1;
 			}
 		}
-		
+
 		p[0] |= 1;
 		head += width;
 	}
diff --git a/src/stdlib/qsort_r.c b/src/stdlib/qsort_r.c
new file mode 100644
index 00000000..48dc04f1
--- /dev/null
+++ b/src/stdlib/qsort_r.c
@@ -0,0 +1,2 @@
+#define QSORT_R
+#include "qsort.c"
-- 
2.30.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09  3:56 [musl] [PATCH v2] add qsort_r Érico Nogueira
@ 2021-03-09  9:11 ` Alexander Monakov
  2021-03-09 13:42   ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2021-03-09  9:11 UTC (permalink / raw)
  To: musl; +Cc: Érico Nogueira

[-- Attachment #1: Type: text/plain, Size: 638 bytes --]

On Tue, 9 Mar 2021, Érico Nogueira wrote:

> since most discussion around the addition of this function has centered
> around the possible code duplication it requires or that qsort would
> become much slower if implemented as a wrapper around qsort_r

How much is "much slower", did anyone provide figures to support this claim?
The extra cost that a wrapper brings is either one indirect jump instruction,
or one trivially-predictable conditional branch per one comparator invocation.

Constant factor in musl qsort is quite high, I'd be surprised if the extra
overhead from one additional branch is even possible to measure.

Alexander

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09  9:11 ` Alexander Monakov
@ 2021-03-09 13:42   ` Rich Felker
  2021-03-09 14:13     ` Alexander Monakov
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2021-03-09 13:42 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: musl, Érico Nogueira

On Tue, Mar 09, 2021 at 12:11:37PM +0300, Alexander Monakov wrote:
> On Tue, 9 Mar 2021, Érico Nogueira wrote:
> 
> > since most discussion around the addition of this function has centered
> > around the possible code duplication it requires or that qsort would
> > become much slower if implemented as a wrapper around qsort_r
> 
> How much is "much slower", did anyone provide figures to support this claim?
> The extra cost that a wrapper brings is either one indirect jump instruction,
> or one trivially-predictable conditional branch per one comparator invocation.

Quite a bit I'd expect. Each call to cmp would involve an extra level
of call wrapper. With full IPA/inlining it could be optimized out, but
only by making a non-_r copy of all the qsort code in the process at
optimize time.

> Constant factor in musl qsort is quite high, I'd be surprised if the extra
> overhead from one additional branch is even possible to measure.

I don't think it's just a branch. It's a call layer. qsort_r internals
with cmp=wrapper_cmp, ctx=real_cmp -> wrapper_cmp(x, y, real_cmp) ->
real_cmp(x, y). But I'm not opposed to looking at some numbers if you
think it might not matter. Maybe because it's a tail call it does
collapse to essentially just a branch in terms of cost..

Rich

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09 13:42   ` Rich Felker
@ 2021-03-09 14:13     ` Alexander Monakov
  2021-03-09 15:03       ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2021-03-09 14:13 UTC (permalink / raw)
  To: musl; +Cc: Érico Nogueira

[-- Attachment #1: Type: text/plain, Size: 2092 bytes --]

> On Tue, Mar 09, 2021 at 12:11:37PM +0300, Alexander Monakov wrote:
> > On Tue, 9 Mar 2021, Érico Nogueira wrote:
> > 
> > > since most discussion around the addition of this function has centered
> > > around the possible code duplication it requires or that qsort would
> > > become much slower if implemented as a wrapper around qsort_r
> > 
> > How much is "much slower", did anyone provide figures to support this claim?
> > The extra cost that a wrapper brings is either one indirect jump instruction,
> > or one trivially-predictable conditional branch per one comparator invocation.
> 
> Quite a bit I'd expect. Each call to cmp would involve an extra level
> of call wrapper. With full IPA/inlining it could be optimized out, but
> only by making a non-_r copy of all the qsort code in the process at
> optimize time.
> 
> > Constant factor in musl qsort is quite high, I'd be surprised if the extra
> > overhead from one additional branch is even possible to measure.
> 
> I don't think it's just a branch. It's a call layer. qsort_r internals
> with cmp=wrapper_cmp, ctx=real_cmp -> wrapper_cmp(x, y, real_cmp) ->
> real_cmp(x, y). But I'm not opposed to looking at some numbers if you
> think it might not matter. Maybe because it's a tail call it does
> collapse to essentially just a branch in terms of cost..

First of all it's not necessarily a "call layer".

You could change cmp call site such that NULL comparator implies that
non-_r version was called and the original comparator address is in ctx:

static inline int call_cmp(void *v1, void *v2, void *ctx, cmpfun cmp)
{
	if (cmp)
		return cmp(v1, v2, ctx);
	return ((cmpfun)ctx)(v1, v2);
}

This is just a conditional branch at call site after trivial inlining.

Second, if you make a "conventional" wrapper, then on popular architectures
it is a single instruction (powerpc64 ABI demonstrates its insanity here):

static int wrapper_cmp(void *v1, void *v2, void *ctx)
{
	return ((cmpfun)ctx)(v1, v2);
}

Some examples:

amd64:	jmp %rdx
i386:	jmp *12(%esp)
arm:	bx r2
aarch64:br x2

How is this not obvious?

Alexander

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09 14:13     ` Alexander Monakov
@ 2021-03-09 15:03       ` Rich Felker
  2021-03-09 16:54         ` Markus Wichmann
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2021-03-09 15:03 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: musl, Érico Nogueira

On Tue, Mar 09, 2021 at 05:13:39PM +0300, Alexander Monakov wrote:
> > On Tue, Mar 09, 2021 at 12:11:37PM +0300, Alexander Monakov wrote:
> > > On Tue, 9 Mar 2021, Érico Nogueira wrote:
> > > 
> > > > since most discussion around the addition of this function has centered
> > > > around the possible code duplication it requires or that qsort would
> > > > become much slower if implemented as a wrapper around qsort_r
> > > 
> > > How much is "much slower", did anyone provide figures to support this claim?
> > > The extra cost that a wrapper brings is either one indirect jump instruction,
> > > or one trivially-predictable conditional branch per one comparator invocation.
> > 
> > Quite a bit I'd expect. Each call to cmp would involve an extra level
> > of call wrapper. With full IPA/inlining it could be optimized out, but
> > only by making a non-_r copy of all the qsort code in the process at
> > optimize time.
> > 
> > > Constant factor in musl qsort is quite high, I'd be surprised if the extra
> > > overhead from one additional branch is even possible to measure.
> > 
> > I don't think it's just a branch. It's a call layer. qsort_r internals
> > with cmp=wrapper_cmp, ctx=real_cmp -> wrapper_cmp(x, y, real_cmp) ->
> > real_cmp(x, y). But I'm not opposed to looking at some numbers if you
> > think it might not matter. Maybe because it's a tail call it does
> > collapse to essentially just a branch in terms of cost..
> 
> First of all it's not necessarily a "call layer".
> 
> You could change cmp call site such that NULL comparator implies that
> non-_r version was called and the original comparator address is in ctx:
> 
> static inline int call_cmp(void *v1, void *v2, void *ctx, cmpfun cmp)
> {
> 	if (cmp)
> 		return cmp(v1, v2, ctx);
> 	return ((cmpfun)ctx)(v1, v2);
> }
> 
> This is just a conditional branch at call site after trivial inlining.

This works, but it's not what I would call writing qsort as a wrapper
around qsort_r, because it depends on qsort_r having this additional
libc-internal contract to treat null cmp specially, and it might be
undesirable because it then does something rather awful if the
application calls qsort_r with a null cmp pointer (rather than just
crashing with PC=0).

> Second, if you make a "conventional" wrapper, then on popular architectures
> it is a single instruction (powerpc64 ABI demonstrates its insanity here):
> 
> static int wrapper_cmp(void *v1, void *v2, void *ctx)
> {
> 	return ((cmpfun)ctx)(v1, v2);
> }
> 
> Some examples:
> 
> amd64:	jmp %rdx
> i386:	jmp *12(%esp)
> arm:	bx r2
> aarch64:br x2
> 
> How is this not obvious?

Now that you mention it it's obvious that the compiler should be able
to do this. gcc -Os alone does not, which looks like yet another
reason to nuke -Os, but I think in musl it would do the right thing
already. It turns out the problem is that gcc emits spurious frame
pointer setup and teardown without -fomit-frame-pointer.

For some reason though it's gigantic on powerpc64. It fails to do a
tail call at all...

Rich

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09 15:03       ` Rich Felker
@ 2021-03-09 16:54         ` Markus Wichmann
  2021-03-09 18:04           ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Markus Wichmann @ 2021-03-09 16:54 UTC (permalink / raw)
  To: musl

On Tue, Mar 09, 2021 at 10:03:22AM -0500, Rich Felker wrote:
> On Tue, Mar 09, 2021 at 05:13:39PM +0300, Alexander Monakov wrote:
> > Second, if you make a "conventional" wrapper, then on popular architectures
> > it is a single instruction (powerpc64 ABI demonstrates its insanity here):
> >
> > static int wrapper_cmp(void *v1, void *v2, void *ctx)
> > {
> > 	return ((cmpfun)ctx)(v1, v2);
> > }
> >
> > Some examples:
> >
> > amd64:	jmp %rdx
> > i386:	jmp *12(%esp)
> > arm:	bx r2
> > aarch64:br x2
> >
> > How is this not obvious?
>
> [...]
>
> For some reason though it's gigantic on powerpc64. It fails to do a
> tail call at all...
>

So, I experimented a bit with clang (for simplicity, since clang can
switch targets with a compiler switch). And indeed the above is
reproducible. Had to search around a bit for the ELFv2 ABI switch for
clang (the ELFv1 version is even worse, since it uses function
descriptors, so calling through a function pointer requires reading out
the function descriptors before being able to use them).

So with ELFv2, the function consists of buildup and teardown of a stack
frame, save and restore of R2, and the actual indirect call. The stack
frame is necessary because of R2 being spilled, and R2 being spilled is
necessary since the wrapper function might be called locally (so the
contract is that R2 is preserved), but the function pointer might point
to a function in another module, so R2 would be overwritten by the call.

That makes sense. What doesn't make sense is that the stack frame is
still used in 32-bit powerpc. Nothing is saved into that stack frame;
"mtctr 5; bctr" would be a valid implementation. But no matter what
switches I threw at it, the stack frame remained.

For other architectures: I could not test microblaze, mipsn32, m68k,
or1k, riscv64, and sh, since clang did not recognize those
architectures. Probably not included by default. MIPS and MIPS64 both
establish a stack frame, and s390x does not.

Ciao,
Markus

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09 16:54         ` Markus Wichmann
@ 2021-03-09 18:04           ` Rich Felker
  2021-03-15 23:49             ` Alexander Monakov
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2021-03-09 18:04 UTC (permalink / raw)
  To: Markus Wichmann; +Cc: musl

On Tue, Mar 09, 2021 at 05:54:04PM +0100, Markus Wichmann wrote:
> On Tue, Mar 09, 2021 at 10:03:22AM -0500, Rich Felker wrote:
> > On Tue, Mar 09, 2021 at 05:13:39PM +0300, Alexander Monakov wrote:
> > > Second, if you make a "conventional" wrapper, then on popular architectures
> > > it is a single instruction (powerpc64 ABI demonstrates its insanity here):
> > >
> > > static int wrapper_cmp(void *v1, void *v2, void *ctx)
> > > {
> > > 	return ((cmpfun)ctx)(v1, v2);
> > > }
> > >
> > > Some examples:
> > >
> > > amd64:	jmp %rdx
> > > i386:	jmp *12(%esp)
> > > arm:	bx r2
> > > aarch64:br x2
> > >
> > > How is this not obvious?
> >
> > [...]
> >
> > For some reason though it's gigantic on powerpc64. It fails to do a
> > tail call at all...
> >
> 
> So, I experimented a bit with clang (for simplicity, since clang can
> switch targets with a compiler switch). And indeed the above is
> reproducible. Had to search around a bit for the ELFv2 ABI switch for
> clang (the ELFv1 version is even worse, since it uses function
> descriptors, so calling through a function pointer requires reading out
> the function descriptors before being able to use them).
> 
> So with ELFv2, the function consists of buildup and teardown of a stack
> frame, save and restore of R2, and the actual indirect call. The stack
> frame is necessary because of R2 being spilled, and R2 being spilled is
> necessary since the wrapper function might be called locally (so the
> contract is that R2 is preserved), but the function pointer might point
> to a function in another module, so R2 would be overwritten by the call.

Yes, I found the comment in GCC source to that effect (rs6000.c,
rs6000_function_ok_for_sibcall):

  /* Under the AIX or ELFv2 ABIs we can't allow calls to non-local
     functions, because the callee may have a different TOC pointer to
     the caller and there's no way to ensure we restore the TOC when
     we return.  With the secure-plt SYSV ABI we can't make non-local
     calls when -fpic/PIC because the plt call stubs use r30.  */
                       
However, this problem is solvable: just don't have a local entry. If
the function is defined without a local entry, the caller is forced to
save the TOC pointer. So GCC should be enhanced not to suppress tail
calls, but to suppress emitting a local entry point whenever there are
tail calls in function. (Tail call saves A LOT more than eliding the
TOC spill savaes!)

> That makes sense. What doesn't make sense is that the stack frame is
> still used in 32-bit powerpc. Nothing is saved into that stack frame;
> "mtctr 5; bctr" would be a valid implementation. But no matter what
> switches I threw at it, the stack frame remained.

It works for me, but my 32-bit ppc gcc is still very old (5.3). Maybe
this is a regression? Or if you're using clang, a clang-only
limitation?

It's known that 32-bit ppc can't tail call to the PLT with secure-plt
mode, but this is an indirect call not thru the PLT so it shouldn't
matter.

> For other architectures: I could not test microblaze, mipsn32, m68k,
> or1k, riscv64, and sh, since clang did not recognize those
> architectures. Probably not included by default. MIPS and MIPS64 both
> establish a stack frame, and s390x does not.

I tested sh4, sh2/fdpic, rv64, s390x, or1k, m68k, and mips (32-bit)
and they all do the tail call properly. But mips64 (n64 and n32) both
fail to. According to the GCC source, it's something to allow lazy
binding. MIPS64 does not use a real PLT, but actually has GOT entries
that might go through a lazy resolver and that expect %gp (call-saved)
to be valid on entry.

musl does not, and will never, do lazy binding, so this is purely
counterproductive for musl and we should probably teach GCC not to do
it. The current logic is:

  /* Sibling calls should not prevent lazy binding.  Lazy-binding stubs
     require $gp to be valid on entry, so sibcalls can only use stubs
     if $gp is call-clobbered.  */
  if (decl
      && TARGET_CALL_SAVED_GP
      && !TARGET_ABICALLS_PIC0
      && !targetm.binds_local_p (decl))
    return false;

TARGET_CALL_SAVED_GP is rightly true (it's the ABI).

TARGET_ABICALLS_PIC0 is rightly false (I'm pretty sure that's a bogus
alt ABI, and defined as TARGET_ABSOLUTE_ABICALLS && TARGET_PLT).

It probably needs an additional condition && TARGET_LAZY_BINDING that we
can define as false. Alternatively the issue could just be fixed not
to go through lazy resolver anywhere.

I opened a bug for it here:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99491


Rich

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-09 18:04           ` Rich Felker
@ 2021-03-15 23:49             ` Alexander Monakov
  2021-03-16  2:18               ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2021-03-15 23:49 UTC (permalink / raw)
  To: musl; +Cc: Markus Wichmann

On Tue, 9 Mar 2021, Rich Felker wrote:

> I tested sh4, sh2/fdpic, rv64, s390x, or1k, m68k, and mips (32-bit)
> and they all do the tail call properly. But mips64 (n64 and n32) both
> fail to. According to the GCC source, it's some thing to allow lazy
> binding. MIPS64 does not use a real PLT, but actually has GOT entries
> that might go through a lazy resolver and that expect %gp (call-saved)
> to be valid on entry.

I think you botched something in your MIPS64 testing, probably making
a direct call instead of an indirect call. An indirect call should not
go to a lazy resolver (because then if you take the address once and
reuse it many times, you risk entering the resolver multiple times).

Here's a Compiler Explorer link demonstrating that indirect call
compiles to a simple jump on MIPS64: https://godbolt.org/z/vroTs9

> musl does not, and will never, do lazy binding, so this is purely
> counterproductive for musl and we should probably teach GCC not to do
> it. The current logic is:
> 
>   /* Sibling calls should not prevent lazy binding.  Lazy-binding stubs
>      require $gp to be valid on entry, so sibcalls can only use stubs
>      if $gp is call-clobbered.  */
>   if (decl

decl will be NULL for an indirect call

>       && TARGET_CALL_SAVED_GP
>       && !TARGET_ABICALLS_PIC0
>       && !targetm.binds_local_p (decl))
>     return false;
> 
> TARGET_CALL_SAVED_GP is rightly true (it's the ABI).
> 
> TARGET_ABICALLS_PIC0 is rightly false (I'm pretty sure that's a bogus
> alt ABI, and defined as TARGET_ABSOLUTE_ABICALLS && TARGET_PLT).
> 
> It probably needs an addition condition && TARGET_LAZY_BINDING that we
> can define as false. Alternatively the issue could just be fixed not
> to go through lazy resolver anywhere.
> 
> I opened a bug for it here:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99491
> 
> 
> Rich
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [PATCH v2] add qsort_r.
  2021-03-15 23:49             ` Alexander Monakov
@ 2021-03-16  2:18               ` Rich Felker
  0 siblings, 0 replies; 9+ messages in thread
From: Rich Felker @ 2021-03-16  2:18 UTC (permalink / raw)
  To: musl

On Tue, Mar 16, 2021 at 02:49:59AM +0300, Alexander Monakov wrote:
> On Tue, 9 Mar 2021, Rich Felker wrote:
> 
> > I tested sh4, sh2/fdpic, rv64, s390x, or1k, m68k, and mips (32-bit)
> > and they all do the tail call properly. But mips64 (n64 and n32) both
> > fail to. According to the GCC source, it's some thing to allow lazy
> > binding. MIPS64 does not use a real PLT, but actually has GOT entries
> > that might go through a lazy resolver and that expect %gp (call-saved)
> > to be valid on entry.
> 
> I think you botched something in your MIPS64 testing, probably making
> a direct call instead of an indirect call. An indirect call should not
> go to a lazy resolver (because then if you take the address once and
> reuse it many times, you risk entering the resolver multiple times).

No, it's even stupider. Somehow when I deleted the $TARGET-gcc to
replace with mips64-*-gcc I also deleted the -O2 right after it, so I
was looking at -O0...

> Here's a Compiler Explorer link demonstrating that indirect call
> compiles to a simple jump on MIPS64: https://godbolt.org/z/vroTs9

Yes, confirmed that now. However direct call indeed does not tail-call.

> > musl does not, and will never, do lazy binding, so this is purely
> > counterproductive for musl and we should probably teach GCC not to do
> > it. The current logic is:
> > 
> >   /* Sibling calls should not prevent lazy binding.  Lazy-binding stubs
> >      require $gp to be valid on entry, so sibcalls can only use stubs
> >      if $gp is call-clobbered.  */
> >   if (decl
> 
> decl will be NULL for an indirect call

*nod* ...

> 
> >       && TARGET_CALL_SAVED_GP
> >       && !TARGET_ABICALLS_PIC0
> >       && !targetm.binds_local_p (decl))
> >     return false;
> > 
> > TARGET_CALL_SAVED_GP is rightly true (it's the ABI).
> > 
> > TARGET_ABICALLS_PIC0 is rightly false (I'm pretty sure that's a bogus
> > alt ABI, and defined as TARGET_ABSOLUTE_ABICALLS && TARGET_PLT).
> > 
> > It probably needs an addition condition && TARGET_LAZY_BINDING that we
> > can define as false. Alternatively the issue could just be fixed not
> > to go through lazy resolver anywhere.
> > 
> > I opened a bug for it here:
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99491

...but this bug is still valid, as it's undesirable for this to happen
with direct calls too.

Rich

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-03-16  2:18 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-09  3:56 [musl] [PATCH v2] add qsort_r Érico Nogueira
2021-03-09  9:11 ` Alexander Monakov
2021-03-09 13:42   ` Rich Felker
2021-03-09 14:13     ` Alexander Monakov
2021-03-09 15:03       ` Rich Felker
2021-03-09 16:54         ` Markus Wichmann
2021-03-09 18:04           ` Rich Felker
2021-03-15 23:49             ` Alexander Monakov
2021-03-16  2:18               ` Rich Felker

mailing list of musl libc

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://inbox.vuxu.org/musl

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 musl musl/ http://inbox.vuxu.org/musl \
		musl@inbox.vuxu.org
	public-inbox-index musl

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.vuxu.org/vuxu.archive.musl


code repositories for the project(s) associated with this inbox:

	https://git.vuxu.org/mirror/musl/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git