[musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling

mailing list of musl libc
 help / color / mirror / code / Atom feed

* [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
@ 2021-08-01 15:59 Stefan Kanthak
  2021-08-03 20:27 ` Szabolcs Nagy
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2021-08-01 15:59 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1624 bytes --]

<https://git.musl-libc.org/cgit/musl/plain/src/math/i386/remquo.s>

Halve the number of instructions (from 12 to 6) to fetch the
(3-bit partial) quotient from the FPU flags C0:C3:C1, and
perform its negation without conditional branch.

--- -/math/i386/remquo.s
+++ +/math/i386/remquo.s
@@ -2,49 +2,44 @@
 .type remquof,@function
 remquof:
        mov 12(%esp),%ecx
+       mov 8(%esp),%eax
+       xor 4(%esp),%eax
        flds 8(%esp)
        flds 4(%esp)
-       mov 11(%esp),%dh
-       xor 7(%esp),%dh
-       jmp 1f
+       jmp 0f

 .global remquol
 .type remquol,@function
 remquol:
        mov 28(%esp),%ecx
+       mov 24(%esp),%eax
+       xor 12(%esp),%eax
+       cwtl
        fldt 16(%esp)
        fldt 4(%esp)
-       mov 25(%esp),%dh
-       xor 13(%esp),%dh
-       jmp 1f
+       jmp 0f

 .global remquo
 .type remquo,@function
 remquo:
        mov 20(%esp),%ecx
+       mov 16(%esp),%eax
+       xor 8(%esp),%eax
        fldl 12(%esp)
        fldl 4(%esp)
-       mov 19(%esp),%dh
-       xor 11(%esp),%dh
+0:     cltd
 1:     fprem1
        fnstsw %ax
        sahf
        jp 1b
        fstp %st(1)
-       mov %ah,%dl
-       shr %dl
-       and $1,%dl
-       mov %ah,%al
-       shr $5,%al
-       and $2,%al
-       or %al,%dl
-       mov %ah,%al
-       shl $2,%al
-       and $4,%al
-       or %al,%dl
-       test %dh,%dh
-       jns 1f
-       neg %dl
-1:     movsbl %dl,%edx
-       mov %edx,(%ecx)
+       adc %al,%al
+       shl $2,%ah
+       adc %al,%al
+       shl $5,%ah
+       adc %al,%al
+       and $7,%eax
+       xor %edx,%eax
+       sub %edx,%eax
+       mov %eax,(%ecx)
        ret

[-- Attachment #2: remquo.patch --]
[-- Type: application/octet-stream, Size: 1097 bytes --]

--- -remquo.s
+++ +remquo.s
@@ -2,49 +2,44 @@
 .type remquof,@function
 remquof:
 	mov 12(%esp),%ecx
+	mov 8(%esp),%eax
+	xor 4(%esp),%eax
 	flds 8(%esp)
 	flds 4(%esp)
-	mov 11(%esp),%dh
-	xor 7(%esp),%dh
-	jmp 1f
+	jmp 0f
 
 .global remquol
 .type remquol,@function
 remquol:
 	mov 28(%esp),%ecx
+	mov 24(%esp),%eax
+	xor 12(%esp),%eax
+	cwtl
 	fldt 16(%esp)
 	fldt 4(%esp)
-	mov 25(%esp),%dh
-	xor 13(%esp),%dh
-	jmp 1f
+	jmp 0f
 
 .global remquo
 .type remquo,@function
 remquo:
 	mov 20(%esp),%ecx
+	mov 16(%esp),%eax
+	xor 8(%esp),%eax
 	fldl 12(%esp)
 	fldl 4(%esp)
-	mov 19(%esp),%dh
-	xor 11(%esp),%dh
+0:	cltd
 1:	fprem1
 	fnstsw %ax
 	sahf
 	jp 1b
 	fstp %st(1)
-	mov %ah,%dl
-	shr %dl
-	and $1,%dl
-	mov %ah,%al
-	shr $5,%al
-	and $2,%al
-	or %al,%dl
-	mov %ah,%al
-	shl $2,%al
-	and $4,%al
-	or %al,%dl
-	test %dh,%dh
-	jns 1f
-	neg %dl
-1:	movsbl %dl,%edx
-	mov %edx,(%ecx)
+	adc %al,%al
+	shl $2,%ah
+	adc %al,%al
+	shl $5,%ah
+	adc %al,%al
+	and $7,%eax
+	xor %edx,%eax
+	sub %edx,%eax
+	mov %eax,(%ecx)
 	ret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-01 15:59 [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling Stefan Kanthak
@ 2021-08-03 20:27 ` Szabolcs Nagy
  2021-08-04 10:02   ` Stefan Kanthak
  0 siblings, 1 reply; 9+ messages in thread
From: Szabolcs Nagy @ 2021-08-03 20:27 UTC (permalink / raw)
  To: Stefan Kanthak; +Cc: musl

* Stefan Kanthak <stefan.kanthak@nexgo.de> [2021-08-01 17:59:52 +0200]:
> Halve the number of instructions (from 12 to 6) to fetch the
> (3-bit partial) quotient from the FPU flags C0:C3:C1, and
> perform its negation without conditional branch.

i haven't tested it but it looks good.

i think we should not tweak x87 asm code too much though.
it can introduce bugs and there are not many users of it.
i think only the size saving can justify keeping any i386
math code at all.

but i'm not against committing this.
thanks for the patch.

> --- -/math/i386/remquo.s
> +++ +/math/i386/remquo.s
> @@ -2,49 +2,44 @@
>  .type remquof,@function
>  remquof:
>         mov 12(%esp),%ecx
> +       mov 8(%esp),%eax
> +       xor 4(%esp),%eax
>         flds 8(%esp)
>         flds 4(%esp)
> -       mov 11(%esp),%dh
> -       xor 7(%esp),%dh
> -       jmp 1f
> +       jmp 0f
> 
>  .global remquol
>  .type remquol,@function
>  remquol:
>         mov 28(%esp),%ecx
> +       mov 24(%esp),%eax
> +       xor 12(%esp),%eax
> +       cwtl
>         fldt 16(%esp)
>         fldt 4(%esp)
> -       mov 25(%esp),%dh
> -       xor 13(%esp),%dh
> -       jmp 1f
> +       jmp 0f
> 
>  .global remquo
>  .type remquo,@function
>  remquo:
>         mov 20(%esp),%ecx
> +       mov 16(%esp),%eax
> +       xor 8(%esp),%eax
>         fldl 12(%esp)
>         fldl 4(%esp)
> -       mov 19(%esp),%dh
> -       xor 11(%esp),%dh
> +0:     cltd
>  1:     fprem1
>         fnstsw %ax
>         sahf
>         jp 1b
>         fstp %st(1)
> -       mov %ah,%dl
> -       shr %dl
> -       and $1,%dl
> -       mov %ah,%al
> -       shr $5,%al
> -       and $2,%al
> -       or %al,%dl
> -       mov %ah,%al
> -       shl $2,%al
> -       and $4,%al
> -       or %al,%dl
> -       test %dh,%dh
> -       jns 1f
> -       neg %dl
> -1:     movsbl %dl,%edx
> -       mov %edx,(%ecx)
> +       adc %al,%al
> +       shl $2,%ah
> +       adc %al,%al
> +       shl $5,%ah
> +       adc %al,%al
> +       and $7,%eax
> +       xor %edx,%eax
> +       sub %edx,%eax
> +       mov %eax,(%ecx)
>         ret



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-03 20:27 ` Szabolcs Nagy
@ 2021-08-04 10:02   ` Stefan Kanthak
  2021-08-05 13:40     ` Alexander Monakov
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2021-08-04 10:02 UTC (permalink / raw)
  To: Szabolcs Nagy; +Cc: musl

"Szabolcs Nagy" <nsz@port70.net> wrote:


>* Stefan Kanthak <stefan.kanthak@nexgo.de> [2021-08-01 17:59:52 +0200]:
>> Halve the number of instructions (from 12 to 6) to fetch the
>> (3-bit partial) quotient from the FPU flags C0:C3:C1, and
>> perform its negation without conditional branch.
> 
> i haven't tested it but it looks good.

This is basically well-tested code I wrote about 20 years ago
for my own NOMSVCRT.LIB: I always found the bit twiddling of
J.T.Conklins code rather awful.

> i think we should not tweak x87 asm code too much though.
> it can introduce bugs and there are not many users of it.
> i think only the size saving can justify keeping any i386
> math code at all.

From your own FAQ <http://www.musl-libc.org/faq.html>

| When will it be finished?
| When there's nothing left to remove.

The change just follows by removing 6 LOC/instructions.-)

> but i'm not against committing this.
> thanks for the patch.

regards
Stefan

>> --- -/math/i386/remquo.s
>> +++ +/math/i386/remquo.s
>> @@ -2,49 +2,44 @@
>>  .type remquof,@function
>>  remquof:
>>         mov 12(%esp),%ecx
>> +       mov 8(%esp),%eax
>> +       xor 4(%esp),%eax
>>         flds 8(%esp)
>>         flds 4(%esp)
>> -       mov 11(%esp),%dh
>> -       xor 7(%esp),%dh
>> -       jmp 1f
>> +       jmp 0f
>> 
>>  .global remquol
>>  .type remquol,@function
>>  remquol:
>>         mov 28(%esp),%ecx
>> +       mov 24(%esp),%eax
>> +       xor 12(%esp),%eax
>> +       cwtl
>>         fldt 16(%esp)
>>         fldt 4(%esp)
>> -       mov 25(%esp),%dh
>> -       xor 13(%esp),%dh
>> -       jmp 1f
>> +       jmp 0f
>> 
>>  .global remquo
>>  .type remquo,@function
>>  remquo:
>>         mov 20(%esp),%ecx
>> +       mov 16(%esp),%eax
>> +       xor 8(%esp),%eax
>>         fldl 12(%esp)
>>         fldl 4(%esp)
>> -       mov 19(%esp),%dh
>> -       xor 11(%esp),%dh
>> +0:     cltd
>>  1:     fprem1
>>         fnstsw %ax
>>         sahf
>>         jp 1b
>>         fstp %st(1)
>> -       mov %ah,%dl
>> -       shr %dl
>> -       and $1,%dl
>> -       mov %ah,%al
>> -       shr $5,%al
>> -       and $2,%al
>> -       or %al,%dl
>> -       mov %ah,%al
>> -       shl $2,%al
>> -       and $4,%al
>> -       or %al,%dl
>> -       test %dh,%dh
>> -       jns 1f
>> -       neg %dl
>> -1:     movsbl %dl,%edx
>> -       mov %edx,(%ecx)
>> +       adc %al,%al
>> +       shl $2,%ah
>> +       adc %al,%al
>> +       shl $5,%ah
>> +       adc %al,%al
>> +       and $7,%eax
>> +       xor %edx,%eax
>> +       sub %edx,%eax
>> +       mov %eax,(%ecx)
>>         ret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-04 10:02   ` Stefan Kanthak
@ 2021-08-05 13:40     ` Alexander Monakov
  2021-08-06 10:17       ` Stefan Kanthak
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2021-08-05 13:40 UTC (permalink / raw)
  To: Stefan Kanthak; +Cc: Szabolcs Nagy, musl

On Wed, 4 Aug 2021, Stefan Kanthak wrote:
> The change just follows by removing 6 LOC/instructions.-)

Have you considered collecting the three bits in one go via a multiplication?
You can first isolate the necessary bits with 'and $0x4300, %eax', then do
'imul $0x910000, %eax, %eax' to put the required bits in EAX[31:29] in the
right order, then shift right by 29. Three instructions, 14 bytes.

Alexander

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-05 13:40     ` Alexander Monakov
@ 2021-08-06 10:17       ` Stefan Kanthak
  2021-08-06 14:27         ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2021-08-06 10:17 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Szabolcs Nagy, musl

[-- Attachment #1: Type: text/plain, Size: 601 bytes --]

Alexander Monakov <amonakov@ispras.ru> wrote:

> On Wed, 4 Aug 2021, Stefan Kanthak wrote:
>> The change just follows by removing 6 LOC/instructions.-)
> 
> Have you considered collecting the three bits in one go via a multiplication?

No. My mind is not that twisted;-)

> You can first isolate the necessary bits with 'and $0x4300, %eax', then do
> 'imul $0x910000, %eax, %eax' to put the required bits in EAX[31:29] in the
> right order, then shift right by 29. Three instructions, 14 bytes.

Thanks, VERY NICE! How did you come up to it?

Revised patch with shorter bit twiddling attached.

Stefan

[-- Attachment #2: remquo.patch --]
[-- Type: application/octet-stream, Size: 1073 bytes --]

--- -remquo.s
+++ +remquo.s
@@ -2,49 +2,41 @@
 .type remquof,@function
 remquof:
 	mov 12(%esp),%ecx
+	mov 8(%esp),%eax
+	xor 4(%esp),%eax
 	flds 8(%esp)
 	flds 4(%esp)
-	mov 11(%esp),%dh
-	xor 7(%esp),%dh
-	jmp 1f
+	jmp 0f
 
 .global remquol
 .type remquol,@function
 remquol:
 	mov 28(%esp),%ecx
+	mov 24(%esp),%eax
+	xor 12(%esp),%eax
+	cwtl
 	fldt 16(%esp)
 	fldt 4(%esp)
-	mov 25(%esp),%dh
-	xor 13(%esp),%dh
-	jmp 1f
+	jmp 0f
 
 .global remquo
 .type remquo,@function
 remquo:
 	mov 20(%esp),%ecx
+	mov 16(%esp),%eax
+	xor 8(%esp),%eax
 	fldl 12(%esp)
 	fldl 4(%esp)
-	mov 19(%esp),%dh
-	xor 11(%esp),%dh
+0:	cltd
 1:	fprem1
 	fnstsw %ax
 	sahf
 	jp 1b
 	fstp %st(1)
-	mov %ah,%dl
-	shr %dl
-	and $1,%dl
-	mov %ah,%al
-	shr $5,%al
-	and $2,%al
-	or %al,%dl
-	mov %ah,%al
-	shl $2,%al
-	and $4,%al
-	or %al,%dl
-	test %dh,%dh
-	jns 1f
-	neg %dl
-1:	movsbl %dl,%edx
-	mov %edx,(%ecx)
+	and $0x4300,%eax
+	imul $0x910000,%eax,%eax
+	shr $29,%eax
+	xor %edx,%eax
+	sub %edx,%eax
+	mov %eax,(%ecx)
 	ret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-06 10:17       ` Stefan Kanthak
@ 2021-08-06 14:27         ` Rich Felker
  2021-08-06 17:23           ` Stefan Kanthak
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2021-08-06 14:27 UTC (permalink / raw)
  To: Stefan Kanthak; +Cc: Alexander Monakov, Szabolcs Nagy, musl

On Fri, Aug 06, 2021 at 12:17:12PM +0200, Stefan Kanthak wrote:
> Alexander Monakov <amonakov@ispras.ru> wrote:
> 
> > On Wed, 4 Aug 2021, Stefan Kanthak wrote:
> >> The change just follows by removing 6 LOC/instructions.-)
> > 
> > Have you considered collecting the three bits in one go via a multiplication?
> 
> No. My mind is not that twisted;-)
> 
> > You can first isolate the necessary bits with 'and $0x4300, %eax', then do
> > 'imul $0x910000, %eax, %eax' to put the required bits in EAX[31:29] in the
> > right order, then shift right by 29. Three instructions, 14 bytes.
> 
> Thanks, VERY NICE! How did you come up to it?
> 
> Revised patch with shorter bit twiddling attached.

The path forward for all the math asm is moving it to inline asm in C
files, with no flow control or bit/register shuffling in the asm, only
using asm for the single instructions. See how Alexander Monakov did
x86_64 remquol in commit 19f870c3a68a959c7c6ef1de12086ac908920e5e. I
haven't read the mul trick here in detail but I believe it should be
duplicable with plain C * operator.

I really do not want to review/merge asm changes that keep this kind
of complex logic in asm when there's no strong motivation for it (like
fixing an actual bug, vs just reducing size or improving speed). The
risk to reward ratio is just not reasonable.

Rich

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-06 14:27         ` Rich Felker
@ 2021-08-06 17:23           ` Stefan Kanthak
  2021-08-07  0:55             ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2021-08-06 17:23 UTC (permalink / raw)
  To: Rich Felker; +Cc: Alexander Monakov, Szabolcs Nagy, musl

[-- Attachment #1: Type: text/plain, Size: 2223 bytes --]

Rich Felker <dalias@libc.org> wrote:

If you don't want patches for assembly modules, please state so VERY
CLEAR in your FAQ.


> On Fri, Aug 06, 2021 at 12:17:12PM +0200, Stefan Kanthak wrote:
>> Alexander Monakov <amonakov@ispras.ru> wrote:
>> 
>> > On Wed, 4 Aug 2021, Stefan Kanthak wrote:
>> >> The change just follows by removing 6 LOC/instructions.-)
>> > 
>> > Have you considered collecting the three bits in one go via a multiplication?
>> 
>> No. My mind is not that twisted;-)
>> 
>> > You can first isolate the necessary bits with 'and $0x4300, %eax', then do
>> > 'imul $0x910000, %eax, %eax' to put the required bits in EAX[31:29] in the
>> > right order, then shift right by 29. Three instructions, 14 bytes.
>> 
>> Thanks, VERY NICE! How did you come up to it?
>> 
>> Revised patch with shorter bit twiddling attached.
> 
> The path forward for all the math asm is moving it to inline asm in C
> files, with no flow control or bit/register shuffling in the asm, only
> using asm for the single instructions. See how Alexander Monakov did
> x86_64 remquol in commit 19f870c3a68a959c7c6ef1de12086ac908920e5e.

This commit is for i386 fmod/fmodf/fmodl. The bit twiddling used in
<https://git.musl-libc.org/cgit/musl/plain/src/math/x86_64/remquol.c>
(which I hadn't noticed yet) and the code GCC generates for it is but
(almost) as bad as the original assembly code:

|        shrl    $8, %eax
|        movl    %eax, %ecx
|        movabsq $8463725162920157216, %rax
|        rolb    $4, %cl
|        andl    $60, %ecx
|        sarq    %cl, %rax
|        andl    $7, %eax

vs.

|        mov %ah,%dl
|        shr %dl
|        and $1,%dl
|        mov %ah,%al
|        shr $5,%al
|        and $2,%al
|        or %al,%dl
|        mov %ah,%al
|        shl $2,%al
|        and $4,%al
|        or %al,%dl

> I haven't read the mul trick here in detail but I believe it should be
> duplicable with plain C * operator.

It is.

> I really do not want to review/merge asm changes that keep this kind
> of complex logic in asm when there's no strong motivation for it (like
> fixing an actual bug, vs just reducing size or improving speed). The
> risk to reward ratio is just not reasonable.

Final patch attached!

Stefan

[-- Attachment #2: remquo.patch --]
[-- Type: application/octet-stream, Size: 3263 bytes --]

--- -/src/math/x86_64/remquol.c
+++ +/src/math/x86_64/remquol.c
@@ -16,2 +16,2 @@
-	unsigned fpsr;
-	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y));
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
@@ -23,6 +23,1 @@
-	unsigned char i = fpsr >> 8;
-	i = i>>4 | i<<4;
-	/* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via
-	 * in-register table lookup. */
-	unsigned qbits = 0x7575313164642020 >> (i & 60);
-	qbits &= 7;
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;

--- /dev/null
+++ +/src/math/i386/remquof.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+float remquof(float x, float y, int *quo)
+{
+	/* see ../x86_64/remquol.c */
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	__asm__ ("" :: "X"(cx), "X"(cy));
+	float t = x;
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
+	while (fpsr & 0x400);
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;
+	*quo = (cx[sizeof(x) - 1]^cy[sizeof(y) - 1]) < 0 ? -qbits : qbits;
+	return t;
+}

--- -/src/math/i386/remquof.s
+++ /dev/null
@@ -1,1 +0,0 @@
-# see remquo.s

--- /dev/null
+++ +/src/math/i386/remquol.c
@@ -0,0 +1,17 @@
+#include <math.h>
+
+long double remquol(long double x, long double y, int *quo)
+{
+	/* see ../x86_64/remquol.c */
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	__asm__ ("" :: "X"(cx), "X"(cy));
+	long double t = x;
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
+	while (fpsr & 0x400);
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;
+	/* [sizeof(long double) - 1] not usable here due to
+	   GCC's braindead handling of long double alias tbyte */
+	*quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits;
+	return t;
+}

--- -/src/math/i386/remquol.s
+++ /dev/null
@@ -1,1 +0,0 @@
-# see remquo.s

--- /dev/null
+++ +/src/math/i386/remquo.c
@@ -0,0 +1,14 @@
+#include <math.h>
+
+double remquo(double x, double y, int *quo)
+{
+	/* see ../x86_64/remquol.c */
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	__asm__ ("" :: "X"(cx), "X"(cy));
+	double t = x;
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
+	while (fpsr & 0x400);
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;
+	*quo = (cx[sizeof(x) - 1]^cy[sizeof(y) - 1]) < 0 ? -qbits : qbits;
+	return t;
+}

--- -/src/math/i386/remquo.s
+++ /dev/null
@@ -1,50 +0,0 @@
-.global remquof
-.type remquof,@function
-remquof:
-	mov 12(%esp),%ecx
-	flds 8(%esp)
-	flds 4(%esp)
-	mov 11(%esp),%dh
-	xor 7(%esp),%dh
-	jmp 1f
-
-.global remquol
-.type remquol,@function
-remquol:
-	mov 28(%esp),%ecx
-	fldt 16(%esp)
-	fldt 4(%esp)
-	mov 25(%esp),%dh
-	xor 13(%esp),%dh
-	jmp 1f
-
-.global remquo
-.type remquo,@function
-remquo:
-	mov 20(%esp),%ecx
-	fldl 12(%esp)
-	fldl 4(%esp)
-	mov 19(%esp),%dh
-	xor 11(%esp),%dh
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	mov %ah,%dl
-	shr %dl
-	and $1,%dl
-	mov %ah,%al
-	shr $5,%al
-	and $2,%al
-	or %al,%dl
-	mov %ah,%al
-	shl $2,%al
-	and $4,%al
-	or %al,%dl
-	test %dh,%dh
-	jns 1f
-	neg %dl
-1:	movsbl %dl,%edx
-	mov %edx,(%ecx)
-	ret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-06 17:23           ` Stefan Kanthak
@ 2021-08-07  0:55             ` Rich Felker
  2021-08-07 13:12               ` Stefan Kanthak
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2021-08-07  0:55 UTC (permalink / raw)
  To: Stefan Kanthak; +Cc: Alexander Monakov, Szabolcs Nagy, musl

On Fri, Aug 06, 2021 at 07:23:19PM +0200, Stefan Kanthak wrote:
> Rich Felker <dalias@libc.org> wrote:
> 
> If you don't want patches for assembly modules, please state so VERY
> CLEAR in your FAQ.

I'm sorry we don't have better material published on this. A lot of
this knowledge lives collectively in mailing list and people's
memories and I can see how that's not friendly to new contributors. 

My reason for not wanting to take small changes to code like this is
partly a matter of budgeting my own time as maintainer, but more
importantly it's a matter of respecting the time of folks who rely on
musl in environments where trust and correctness matter, where each
change introduces a burden to assure oneself that it hasn't introduced
bugs. That comes with changing to the C-with-inline-asm too, but at
least the new form is more accessible to read, and carries with it
other potential advantages like ease of reviewing that there are not
x87 stack balancing bugs like the ones fixed in
f3ed8bfe8a82af1870ddc8696ed4cc1d5aa6b441 or excess precision bugs like
the ones fixed in 1c9afd69051a64cf085c6fb3674a444ff9a43857,
ab9e20905dc1cb74d955cc94b5b76e6e1892b8fe,
a662220df547e5c2446518e74440a7d834f9ebe6, and possibly others.

In any case, I should really see to it that the FAQ/wiki get updated
with information relevant saving new contributors to frustration of
finding out about things like this late.

> > On Fri, Aug 06, 2021 at 12:17:12PM +0200, Stefan Kanthak wrote:
> >> Alexander Monakov <amonakov@ispras.ru> wrote:
> >> 
> >> > On Wed, 4 Aug 2021, Stefan Kanthak wrote:
> >> >> The change just follows by removing 6 LOC/instructions.-)
> >> > 
> >> > Have you considered collecting the three bits in one go via a multiplication?
> >> 
> >> No. My mind is not that twisted;-)
> >> 
> >> > You can first isolate the necessary bits with 'and $0x4300, %eax', then do
> >> > 'imul $0x910000, %eax, %eax' to put the required bits in EAX[31:29] in the
> >> > right order, then shift right by 29. Three instructions, 14 bytes.
> >> 
> >> Thanks, VERY NICE! How did you come up to it?
> >> 
> >> Revised patch with shorter bit twiddling attached.
> > 
> > The path forward for all the math asm is moving it to inline asm in C
> > files, with no flow control or bit/register shuffling in the asm, only
> > using asm for the single instructions. See how Alexander Monakov did
> > x86_64 remquol in commit 19f870c3a68a959c7c6ef1de12086ac908920e5e.
> 
> This commit is for i386 fmod/fmodf/fmodl. The bit twiddling used in
> <https://git.musl-libc.org/cgit/musl/plain/src/math/x86_64/remquol.c>
> (which I hadn't noticed yet) and the code GCC generates for it is but
> (almost) as bad as the original assembly code:
> 
> |        shrl    $8, %eax
> |        movl    %eax, %ecx
> |        movabsq $8463725162920157216, %rax
> |        rolb    $4, %cl
> |        andl    $60, %ecx
> |        sarq    %cl, %rax
> |        andl    $7, %eax
> 
> vs.
> 
> |        mov %ah,%dl
> |        shr %dl
> |        and $1,%dl
> |        mov %ah,%al
> |        shr $5,%al
> |        and $2,%al
> |        or %al,%dl
> |        mov %ah,%al
> |        shl $2,%al
> |        and $4,%al
> |        or %al,%dl
> 
> > I haven't read the mul trick here in detail but I believe it should be
> > duplicable with plain C * operator.
> 
> It is.

Great! Does it improve the code generation?

> > I really do not want to review/merge asm changes that keep this kind
> > of complex logic in asm when there's no strong motivation for it (like
> > fixing an actual bug, vs just reducing size or improving speed). The
> > risk to reward ratio is just not reasonable.
> 
> Final patch attached!

Thanks. I don't mind making any final tweaks needed, if necessary.
Normally some care is required when treating outputs of asm as "float"
or "double" type, but I think in this case they're okay, simply
because the fprem1 instruction is exact and always produces a result
which fits in the same or fewer number of mantissa bits as the input
(so that 'dropping excess precision' is a no-op). I'll check that
before merging and follow up on this thread if I see any other
possible problems while reviewing.

Rich

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
  2021-08-07  0:55             ` Rich Felker
@ 2021-08-07 13:12               ` Stefan Kanthak
  0 siblings, 0 replies; 9+ messages in thread
From: Stefan Kanthak @ 2021-08-07 13:12 UTC (permalink / raw)
  To: Rich Felker; +Cc: Alexander Monakov, Szabolcs Nagy, musl

Rich Felker <dalias@libc.org> wrote:


> On Fri, Aug 06, 2021 at 07:23:19PM +0200, Stefan Kanthak wrote:
>> Rich Felker <dalias@libc.org> wrote:

>> > The path forward for all the math asm is moving it to inline asm in C
>> > files, with no flow control or bit/register shuffling in the asm, only
>> > using asm for the single instructions.

I'd even go a step further and put all those short functions into header
files to give the compiler a chance to properly them in context.
And then get rid of all the __builtin_* bloat GCC unfortunately carries
with it...

>> > I haven't read the mul trick here in detail but I believe it should be
>> > duplicable with plain C * operator.
>> 
>> It is.
> 
> Great! Does it improve the code generation?

Of course: the same 3 instructions Alexander showed here:

remquol:
        fldt    16(%esp)
        fldt    4(%esp)
.L2:
#APP
# 19 "remquol.c" 1
        fprem1; fnstsw %ax
# 0 "" 2
#NO_APP
        testb   $4, %ah
        jne     .L2
        fstp    %st(1)
        andl    $17152, %eax         #
        movzbl  15(%esp), %ecx       # this should be movb 15(%esp), %cl
        imull   $9502720, %eax, %eax #
        shrl    $29, %eax            #
        movl    %eax, %edx
        negl    %edx
        xorb    27(%esp), %cl
        cmovs   %edx, %eax
        movl    28(%esp), %edx
        movl    %eax, (%edx)
        ret

The code to flip the sign is but not as short as in the assembly version
I posted; the following change may give shorter code:

    int sign = (*cx^*cy) >> 7;
    qbits += sign;
    qbits ^= sign;

>> > I really do not want to review/merge asm changes that keep this kind
>> > of complex logic in asm when there's no strong motivation for it (like
>> > fixing an actual bug, vs just reducing size or improving speed). The
>> > risk to reward ratio is just not reasonable.
>> 
>> Final patch attached!
> 
> Thanks. I don't mind making any final tweaks needed, if necessary.

Do so as you will and wish: I don't mind^W^Wappreciate if someone can
improve code I've written.

Stefan

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-08-07 13:26 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-01 15:59 [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling Stefan Kanthak
2021-08-03 20:27 ` Szabolcs Nagy
2021-08-04 10:02   ` Stefan Kanthak
2021-08-05 13:40     ` Alexander Monakov
2021-08-06 10:17       ` Stefan Kanthak
2021-08-06 14:27         ` Rich Felker
2021-08-06 17:23           ` Stefan Kanthak
2021-08-07  0:55             ` Rich Felker
2021-08-07 13:12               ` Stefan Kanthak

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).