[PATCH 0/3] fp_barrier improvements

mailing list of musl libc
 help / color / mirror / code / Atom feed

* [PATCH 0/3] fp_barrier improvements
@ 2019-04-27 22:13 Szabolcs Nagy
  0 siblings, 0 replies; only message in thread
From: Szabolcs Nagy @ 2019-04-27 22:13 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 859 bytes --]

fp_barrier does not need to drop excess precision, so the type is
changed accordingly.

i386 fp_barrier is now "more efficient" than in the previous patch.

at least the first two patches should improve things on x86.

Szabolcs Nagy (3):
  math: keep excess precision in fp_barrier
  x86: optimize fp_arch.h
  math: use fp_force_eval and fp_barrier in fma

 arch/aarch64/fp_arch.h |  6 ++++--
 arch/i386/fp_arch.h    | 48 ++++++++++++++++++++++++++++++++++++++++++
 arch/x32/fp_arch.h     | 40 +++++++++++++++++++++++++++++++++++
 arch/x86_64/fp_arch.h  | 40 +++++++++++++++++++++++++++++++++++
 src/internal/libm.h    | 14 ++++++------
 src/math/fma.c         |  8 +++----
 6 files changed, 143 insertions(+), 13 deletions(-)
 create mode 100644 arch/i386/fp_arch.h
 create mode 100644 arch/x32/fp_arch.h
 create mode 100644 arch/x86_64/fp_arch.h

-- 
2.21.0

[-- Attachment #2: 0001-math-keep-excess-precision-in-fp_barrier.patch --]
[-- Type: text/x-diff, Size: 1921 bytes --]

From d5d1d9670df51c237280f7fdb32f92df0aba47ed Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sat, 27 Apr 2019 16:21:31 +0000
Subject: [PATCH 1/3] math: keep excess precision in fp_barrier

---
 arch/aarch64/fp_arch.h |  6 ++++--
 src/internal/libm.h    | 14 +++++++-------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/aarch64/fp_arch.h b/arch/aarch64/fp_arch.h
index f3d445b9..e4c4c868 100644
--- a/arch/aarch64/fp_arch.h
+++ b/arch/aarch64/fp_arch.h
@@ -1,12 +1,14 @@
+#include <float.h>
+
 #define fp_barrierf fp_barrierf
-static inline float fp_barrierf(float x)
+static inline float_t fp_barrierf(float_t x)
 {
 	__asm__ __volatile__ ("" : "+w"(x));
 	return x;
 }
 
 #define fp_barrier fp_barrier
-static inline double fp_barrier(double x)
+static inline double_t fp_barrier(double_t x)
 {
 	__asm__ __volatile__ ("" : "+w"(x));
 	return x;
diff --git a/src/internal/libm.h b/src/internal/libm.h
index b5bd26b8..f77dfa4d 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -113,24 +113,24 @@ static inline double eval_as_double(double x)
 	return y;
 }
 
-/* fp_barrier returns its input, but limits code transformations
-   as if it had a side-effect (e.g. observable io) and returned
-   an arbitrary value.  */
+/* fp_barrier returns its input (without dropping excess precision),
+   but limits code transformations as if it had a side-effect
+   (e.g. observable io) and returned an arbitrary value.  */
 
 #ifndef fp_barrierf
 #define fp_barrierf fp_barrierf
-static inline float fp_barrierf(float x)
+static inline float fp_barrierf(float_t x)
 {
-	volatile float y = x;
+	volatile float_t y = x;
 	return y;
 }
 #endif
 
 #ifndef fp_barrier
 #define fp_barrier fp_barrier
-static inline double fp_barrier(double x)
+static inline double_t fp_barrier(double_t x)
 {
-	volatile double y = x;
+	volatile double_t y = x;
 	return y;
 }
 #endif
-- 
2.21.0


[-- Attachment #3: 0002-x86-optimize-fp_arch.h.patch --]
[-- Type: text/x-diff, Size: 4047 bytes --]

From 75ad4e8ec4abc6ce1d801017679c9e9e50fdfcf5 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Wed, 24 Apr 2019 23:29:05 +0000
Subject: [PATCH 2/3] x86: optimize fp_arch.h

Use inline asm constraints instead of volatile store: fp_barrier does
not need to drop excess precision when x87 fpu is used, fp_force_eval
uses memory constraint to drop excess precision, when sse2 math is
available xmm register constraint is used.

This saves 416 and 322 bytes in .text on x86_64 and i386 respectively.
---
 arch/i386/fp_arch.h   | 48 +++++++++++++++++++++++++++++++++++++++++++
 arch/x32/fp_arch.h    | 40 ++++++++++++++++++++++++++++++++++++
 arch/x86_64/fp_arch.h | 40 ++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 arch/i386/fp_arch.h
 create mode 100644 arch/x32/fp_arch.h
 create mode 100644 arch/x86_64/fp_arch.h

diff --git a/arch/i386/fp_arch.h b/arch/i386/fp_arch.h
new file mode 100644
index 00000000..33ac222d
--- /dev/null
+++ b/arch/i386/fp_arch.h
@@ -0,0 +1,48 @@
+#include <float.h>
+
+#ifdef __SSE2_MATH__
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+x"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+x"(x))
+#else
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+t"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+m"(x))
+#endif
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	FP_BARRIER(x);
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	FP_BARRIER(x);
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	FP_EVAL(x);
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	FP_EVAL(x);
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x32/fp_arch.h b/arch/x32/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x32/fp_arch.h
@@ -0,0 +1,40 @@
+#include <float.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x86_64/fp_arch.h b/arch/x86_64/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x86_64/fp_arch.h
@@ -0,0 +1,40 @@
+#include <float.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
-- 
2.21.0


[-- Attachment #4: 0003-math-use-fp_force_eval-and-fp_barrier-in-fma.patch --]
[-- Type: text/x-diff, Size: 1131 bytes --]

From 34341757cbaffc878f10676aeb6820f3a02ac29d Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sat, 27 Apr 2019 19:32:49 +0000
Subject: [PATCH 3/3] math: use fp_force_eval and fp_barrier in fma

Idiomatic fenv handling with the new fp_arch.h, less portable, but
it means a bit smaller code size.
---
 src/math/fma.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/math/fma.c b/src/math/fma.c
index 0c6f90c9..84dfeec1 100644
--- a/src/math/fma.c
+++ b/src/math/fma.c
@@ -1,6 +1,7 @@
 #include <stdint.h>
 #include <float.h>
 #include <math.h>
+#include "libm.h"
 #include "atomic.h"
 
 #define ASUINT64(x) ((union {double f; uint64_t i;}){x}).i
@@ -163,11 +164,10 @@ double fma(double x, double y, double z)
 				r = i;
 				r = 2*r - c; /* remove top bit */
 
-				/* raise underflow portably, such that it
-				   cannot be optimized away */
+				/* raise underflow */
 				{
-					double_t tiny = DBL_MIN/FLT_MIN * r;
-					r += (double)(tiny*tiny) * (r-r);
+					double_t tiny = fp_barrier(DBL_MIN/FLT_MIN);
+					fp_force_eval(tiny*tiny);
 				}
 			}
 		} else {
-- 
2.21.0


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2019-04-27 22:13 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-27 22:13 [PATCH 0/3] fp_barrier improvements Szabolcs Nagy

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).