Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling

mailing list of musl libc
 help / color / mirror / code / Atom feed

From: "Stefan Kanthak" <stefan.kanthak@nexgo.de>
To: "Rich Felker" <dalias@libc.org>
Cc: "Alexander Monakov" <amonakov@ispras.ru>,
	"Szabolcs Nagy" <nsz@port70.net>, <musl@lists.openwall.com>
Subject: Re: [musl] [Patch] src/math/i386/remquo.s: remove conditional branch, shorter bit twiddling
Date: Fri, 6 Aug 2021 19:23:19 +0200	[thread overview]
Message-ID: <35E4CF2B08294E46B0451043F46D2C7C@H270> (raw)
In-Reply-To: <20210806142702.GV13220@brightrain.aerifal.cx>

[-- Attachment #1: Type: text/plain, Size: 2223 bytes --]

Rich Felker <dalias@libc.org> wrote:

If you don't want patches for assembly modules, please state so VERY
CLEAR in your FAQ.


> On Fri, Aug 06, 2021 at 12:17:12PM +0200, Stefan Kanthak wrote:
>> Alexander Monakov <amonakov@ispras.ru> wrote:
>> 
>> > On Wed, 4 Aug 2021, Stefan Kanthak wrote:
>> >> The change just follows by removing 6 LOC/instructions.-)
>> > 
>> > Have you considered collecting the three bits in one go via a multiplication?
>> 
>> No. My mind is not that twisted;-)
>> 
>> > You can first isolate the necessary bits with 'and $0x4300, %eax', then do
>> > 'imul $0x910000, %eax, %eax' to put the required bits in EAX[31:29] in the
>> > right order, then shift right by 29. Three instructions, 14 bytes.
>> 
>> Thanks, VERY NICE! How did you come up to it?
>> 
>> Revised patch with shorter bit twiddling attached.
> 
> The path forward for all the math asm is moving it to inline asm in C
> files, with no flow control or bit/register shuffling in the asm, only
> using asm for the single instructions. See how Alexander Monakov did
> x86_64 remquol in commit 19f870c3a68a959c7c6ef1de12086ac908920e5e.

This commit is for i386 fmod/fmodf/fmodl. The bit twiddling used in
<https://git.musl-libc.org/cgit/musl/plain/src/math/x86_64/remquol.c>
(which I hadn't noticed yet) and the code GCC generates for it is but
(almost) as bad as the original assembly code:

|        shrl    $8, %eax
|        movl    %eax, %ecx
|        movabsq $8463725162920157216, %rax
|        rolb    $4, %cl
|        andl    $60, %ecx
|        sarq    %cl, %rax
|        andl    $7, %eax

vs.

|        mov %ah,%dl
|        shr %dl
|        and $1,%dl
|        mov %ah,%al
|        shr $5,%al
|        and $2,%al
|        or %al,%dl
|        mov %ah,%al
|        shl $2,%al
|        and $4,%al
|        or %al,%dl

> I haven't read the mul trick here in detail but I believe it should be
> duplicable with plain C * operator.

It is.

> I really do not want to review/merge asm changes that keep this kind
> of complex logic in asm when there's no strong motivation for it (like
> fixing an actual bug, vs just reducing size or improving speed). The
> risk to reward ratio is just not reasonable.

Final patch attached!

Stefan

[-- Attachment #2: remquo.patch --]
[-- Type: application/octet-stream, Size: 3263 bytes --]

--- -/src/math/x86_64/remquol.c
+++ +/src/math/x86_64/remquol.c
@@ -16,2 +16,2 @@
-	unsigned fpsr;
-	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y));
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
@@ -23,6 +23,1 @@
-	unsigned char i = fpsr >> 8;
-	i = i>>4 | i<<4;
-	/* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via
-	 * in-register table lookup. */
-	unsigned qbits = 0x7575313164642020 >> (i & 60);
-	qbits &= 7;
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;

--- /dev/null
+++ +/src/math/i386/remquof.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+float remquof(float x, float y, int *quo)
+{
+	/* see ../x86_64/remquol.c */
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	__asm__ ("" :: "X"(cx), "X"(cy));
+	float t = x;
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
+	while (fpsr & 0x400);
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;
+	*quo = (cx[sizeof(x) - 1]^cy[sizeof(y) - 1]) < 0 ? -qbits : qbits;
+	return t;
+}

--- -/src/math/i386/remquof.s
+++ /dev/null
@@ -1,1 +0,0 @@
-# see remquo.s

--- /dev/null
+++ +/src/math/i386/remquol.c
@@ -0,0 +1,17 @@
+#include <math.h>
+
+long double remquol(long double x, long double y, int *quo)
+{
+	/* see ../x86_64/remquol.c */
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	__asm__ ("" :: "X"(cx), "X"(cy));
+	long double t = x;
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
+	while (fpsr & 0x400);
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;
+	/* [sizeof(long double) - 1] not usable here due to
+	   GCC's braindead handling of long double alias tbyte */
+	*quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits;
+	return t;
+}

--- -/src/math/i386/remquol.s
+++ /dev/null
@@ -1,1 +0,0 @@
-# see remquo.s

--- /dev/null
+++ +/src/math/i386/remquo.c
@@ -0,0 +1,14 @@
+#include <math.h>
+
+double remquo(double x, double y, int *quo)
+{
+	/* see ../x86_64/remquol.c */
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	__asm__ ("" :: "X"(cx), "X"(cy));
+	double t = x;
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %0" : "=a"(fpsr), "+t"(t) : "u"(y));
+	while (fpsr & 0x400);
+	unsigned qbits = (fpsr & 0x4300) * 0x910000u >> 29;
+	*quo = (cx[sizeof(x) - 1]^cy[sizeof(y) - 1]) < 0 ? -qbits : qbits;
+	return t;
+}

--- -/src/math/i386/remquo.s
+++ /dev/null
@@ -1,50 +0,0 @@
-.global remquof
-.type remquof,@function
-remquof:
-	mov 12(%esp),%ecx
-	flds 8(%esp)
-	flds 4(%esp)
-	mov 11(%esp),%dh
-	xor 7(%esp),%dh
-	jmp 1f
-
-.global remquol
-.type remquol,@function
-remquol:
-	mov 28(%esp),%ecx
-	fldt 16(%esp)
-	fldt 4(%esp)
-	mov 25(%esp),%dh
-	xor 13(%esp),%dh
-	jmp 1f
-
-.global remquo
-.type remquo,@function
-remquo:
-	mov 20(%esp),%ecx
-	fldl 12(%esp)
-	fldl 4(%esp)
-	mov 19(%esp),%dh
-	xor 11(%esp),%dh
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	mov %ah,%dl
-	shr %dl
-	and $1,%dl
-	mov %ah,%al
-	shr $5,%al
-	and $2,%al
-	or %al,%dl
-	mov %ah,%al
-	shl $2,%al
-	and $4,%al
-	or %al,%dl
-	test %dh,%dh
-	jns 1f
-	neg %dl
-1:	movsbl %dl,%edx
-	mov %edx,(%ecx)
-	ret

next prev parent reply	other threads:[~2021-08-06 17:30 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-01 15:59 Stefan Kanthak
2021-08-03 20:27 ` Szabolcs Nagy
2021-08-04 10:02   ` Stefan Kanthak
2021-08-05 13:40     ` Alexander Monakov
2021-08-06 10:17       ` Stefan Kanthak
2021-08-06 14:27         ` Rich Felker
2021-08-06 17:23           ` Stefan Kanthak [this message]
2021-08-07  0:55             ` Rich Felker
2021-08-07 13:12               ` Stefan Kanthak

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=35E4CF2B08294E46B0451043F46D2C7C@H270 \
    --to=stefan.kanthak@nexgo.de \
    --cc=amonakov@ispras.ru \
    --cc=dalias@libc.org \
    --cc=musl@lists.openwall.com \
    --cc=nsz@port70.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).