From mboxrd@z Thu Jan 1 00:00:00 1970
X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/14093
Path: news.gmane.org!.POSTED.blaine.gmane.org!not-for-mail
From: Szabolcs Nagy
Newsgroups: gmane.linux.lib.musl.general
Subject: [PATCH 0/3] fp_barrier improvements
Date: Sun, 28 Apr 2019 00:13:34 +0200
Message-ID: <20190427221334.GK26605@port70.net>
Reply-To: musl@lists.openwall.com
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="raC6veAxrt5nqIoY"
Injection-Info: blaine.gmane.org; posting-host="blaine.gmane.org:195.159.176.226";
	logging-data="88951"; mail-complaints-to="usenet@blaine.gmane.org"
User-Agent: Mutt/1.10.1 (2018-07-13)
To: musl@lists.openwall.com
Original-X-From: musl-return-14109-gllmg-musl=m.gmane.org@lists.openwall.com Sun Apr 28 00:13:50 2019
Return-path:
Envelope-to: gllmg-musl@m.gmane.org
Original-Received: from mother.openwall.net ([195.42.179.200]) by blaine.gmane.org
	with smtp (Exim 4.89) (envelope-from ) id 1hKVZt-000N3Y-Ve
	for gllmg-musl@m.gmane.org; Sun, 28 Apr 2019 00:13:50 +0200
Original-Received: (qmail 1474 invoked by uid 550); 27 Apr 2019 22:13:46 -0000
Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm
Precedence: bulk
List-Post:
List-Help:
List-Unsubscribe:
List-Subscribe:
List-ID:
Original-Received: (qmail 1443 invoked from network); 27 Apr 2019 22:13:46 -0000
Mail-Followup-To: musl@lists.openwall.com
Content-Disposition: inline
Xref: news.gmane.org gmane.linux.lib.musl.general:14093
Archived-At:

--raC6veAxrt5nqIoY
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

fp_barrier does not need to drop excess precision, so the type is
changed accordingly. The i386 fp_barrier is now "more efficient" than
in the previous patch. At least the first two patches should improve
things on x86.
Szabolcs Nagy (3):
  math: keep excess precision in fp_barrier
  x86: optimize fp_arch.h
  math: use fp_force_eval and fp_barrier in fma

 arch/aarch64/fp_arch.h |  6 ++++--
 arch/i386/fp_arch.h    | 48 ++++++++++++++++++++++++++++++++++++++++++
 arch/x32/fp_arch.h     | 40 +++++++++++++++++++++++++++++++++++
 arch/x86_64/fp_arch.h  | 40 +++++++++++++++++++++++++++++++++++
 src/internal/libm.h    | 14 ++++++------
 src/math/fma.c         |  8 +++----
 6 files changed, 143 insertions(+), 13 deletions(-)
 create mode 100644 arch/i386/fp_arch.h
 create mode 100644 arch/x32/fp_arch.h
 create mode 100644 arch/x86_64/fp_arch.h

-- 
2.21.0

--raC6veAxrt5nqIoY
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment;
	filename="0001-math-keep-excess-precision-in-fp_barrier.patch"

>From d5d1d9670df51c237280f7fdb32f92df0aba47ed Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Sat, 27 Apr 2019 16:21:31 +0000
Subject: [PATCH 1/3] math: keep excess precision in fp_barrier

---
 arch/aarch64/fp_arch.h |  6 ++++--
 src/internal/libm.h    | 14 +++++++-------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/aarch64/fp_arch.h b/arch/aarch64/fp_arch.h
index f3d445b9..e4c4c868 100644
--- a/arch/aarch64/fp_arch.h
+++ b/arch/aarch64/fp_arch.h
@@ -1,12 +1,14 @@
+#include <math.h>
+
 #define fp_barrierf fp_barrierf
-static inline float fp_barrierf(float x)
+static inline float_t fp_barrierf(float_t x)
 {
 	__asm__ __volatile__ ("" : "+w"(x));
 	return x;
 }
 
 #define fp_barrier fp_barrier
-static inline double fp_barrier(double x)
+static inline double_t fp_barrier(double_t x)
 {
 	__asm__ __volatile__ ("" : "+w"(x));
 	return x;
diff --git a/src/internal/libm.h b/src/internal/libm.h
index b5bd26b8..f77dfa4d 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -113,24 +113,24 @@ static inline double eval_as_double(double x)
 	return y;
 }
 
-/* fp_barrier returns its input, but limits code transformations
-   as if it had a side-effect (e.g. observable io) and returned
-   an arbitrary value. */
+/* fp_barrier returns its input (without dropping excess precision),
+   but limits code transformations as if it had a side-effect
+   (e.g. observable io) and returned an arbitrary value. */
 
 #ifndef fp_barrierf
 #define fp_barrierf fp_barrierf
-static inline float fp_barrierf(float x)
+static inline float_t fp_barrierf(float_t x)
 {
-	volatile float y = x;
+	volatile float_t y = x;
 	return y;
 }
 #endif
 
 #ifndef fp_barrier
 #define fp_barrier fp_barrier
-static inline double fp_barrier(double x)
+static inline double_t fp_barrier(double_t x)
 {
-	volatile double y = x;
+	volatile double_t y = x;
 	return y;
 }
 #endif
-- 
2.21.0

--raC6veAxrt5nqIoY
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="0002-x86-optimize-fp_arch.h.patch"

>From 75ad4e8ec4abc6ce1d801017679c9e9e50fdfcf5 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Wed, 24 Apr 2019 23:29:05 +0000
Subject: [PATCH 2/3] x86: optimize fp_arch.h

Use inline asm constraints instead of a volatile store: fp_barrier
does not need to drop excess precision when the x87 fpu is used,
fp_force_eval uses a memory constraint to drop excess precision, and
when sse2 math is available an xmm register constraint is used.

This saves 416 and 322 bytes of .text on x86_64 and i386 respectively.
---
 arch/i386/fp_arch.h   | 48 +++++++++++++++++++++++++++++++++++++++++++
 arch/x32/fp_arch.h    | 40 ++++++++++++++++++++++++++++++++++++
 arch/x86_64/fp_arch.h | 40 ++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 arch/i386/fp_arch.h
 create mode 100644 arch/x32/fp_arch.h
 create mode 100644 arch/x86_64/fp_arch.h

diff --git a/arch/i386/fp_arch.h b/arch/i386/fp_arch.h
new file mode 100644
index 00000000..33ac222d
--- /dev/null
+++ b/arch/i386/fp_arch.h
@@ -0,0 +1,48 @@
+#include <math.h>
+
+#ifdef __SSE2_MATH__
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+x"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+x"(x))
+#else
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+t"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+m"(x))
+#endif
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	FP_BARRIER(x);
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	FP_BARRIER(x);
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	FP_EVAL(x);
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	FP_EVAL(x);
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x32/fp_arch.h b/arch/x32/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x32/fp_arch.h
@@ -0,0 +1,40 @@
+#include <math.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x86_64/fp_arch.h b/arch/x86_64/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x86_64/fp_arch.h
@@ -0,0 +1,40 @@
+#include <math.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
-- 
2.21.0

--raC6veAxrt5nqIoY
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment;
	filename="0003-math-use-fp_force_eval-and-fp_barrier-in-fma.patch"

>From 34341757cbaffc878f10676aeb6820f3a02ac29d Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Sat, 27 Apr 2019 19:32:49 +0000
Subject: [PATCH 3/3] math: use fp_force_eval and fp_barrier in fma

Idiomatic fenv handling with the new fp_arch.h: this is less
portable, but it means a bit smaller code size.
---
 src/math/fma.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/math/fma.c b/src/math/fma.c
index 0c6f90c9..84dfeec1 100644
--- a/src/math/fma.c
+++ b/src/math/fma.c
@@ -1,6 +1,7 @@
 #include <stdint.h>
 #include <float.h>
 #include <math.h>
+#include "libm.h"
 #include "atomic.h"
 
 #define ASUINT64(x) ((union {double f; uint64_t i;}){x}).i
@@ -163,11 +164,10 @@ double fma(double x, double y, double z)
 			r = i;
 			r = 2*r - c; /* remove top bit */
 
-			/* raise underflow portably, such that it
-			   cannot be optimized away */
+			/* raise underflow */
 			{
-				double_t tiny = DBL_MIN/FLT_MIN * r;
-				r += (double)(tiny*tiny) * (r-r);
+				double_t tiny = fp_barrier(DBL_MIN/FLT_MIN);
+				fp_force_eval(tiny*tiny);
 			}
 		}
 	} else {
-- 
2.21.0

--raC6veAxrt5nqIoY--