* [PATCH 0/3] fp_barrier improvements
@ 2019-04-27 22:13 Szabolcs Nagy
0 siblings, 0 replies; only message in thread
From: Szabolcs Nagy @ 2019-04-27 22:13 UTC (permalink / raw)
To: musl
[-- Attachment #1: Type: text/plain, Size: 859 bytes --]
fp_barrier does not need to drop excess precision, so its argument and
return types are changed to float_t/double_t accordingly.
i386 fp_barrier is now "more efficient" than in the previous patch.
at least the first two patches should improve things on x86.
Szabolcs Nagy (3):
math: keep excess precision in fp_barrier
x86: optimize fp_arch.h
math: use fp_force_eval and fp_barrier in fma
arch/aarch64/fp_arch.h | 6 ++++--
arch/i386/fp_arch.h | 48 ++++++++++++++++++++++++++++++++++++++++++
arch/x32/fp_arch.h | 40 +++++++++++++++++++++++++++++++++++
arch/x86_64/fp_arch.h | 40 +++++++++++++++++++++++++++++++++++
src/internal/libm.h | 14 ++++++------
src/math/fma.c | 8 +++----
6 files changed, 143 insertions(+), 13 deletions(-)
create mode 100644 arch/i386/fp_arch.h
create mode 100644 arch/x32/fp_arch.h
create mode 100644 arch/x86_64/fp_arch.h
--
2.21.0
[-- Attachment #2: 0001-math-keep-excess-precision-in-fp_barrier.patch --]
[-- Type: text/x-diff, Size: 1921 bytes --]
From d5d1d9670df51c237280f7fdb32f92df0aba47ed Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sat, 27 Apr 2019 16:21:31 +0000
Subject: [PATCH 1/3] math: keep excess precision in fp_barrier
---
arch/aarch64/fp_arch.h | 6 ++++--
src/internal/libm.h | 14 +++++++-------
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/arch/aarch64/fp_arch.h b/arch/aarch64/fp_arch.h
index f3d445b9..e4c4c868 100644
--- a/arch/aarch64/fp_arch.h
+++ b/arch/aarch64/fp_arch.h
@@ -1,12 +1,14 @@
+#include <float.h>
+
#define fp_barrierf fp_barrierf
-static inline float fp_barrierf(float x)
+static inline float_t fp_barrierf(float_t x)
{
__asm__ __volatile__ ("" : "+w"(x));
return x;
}
#define fp_barrier fp_barrier
-static inline double fp_barrier(double x)
+static inline double_t fp_barrier(double_t x)
{
__asm__ __volatile__ ("" : "+w"(x));
return x;
diff --git a/src/internal/libm.h b/src/internal/libm.h
index b5bd26b8..f77dfa4d 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -113,24 +113,24 @@ static inline double eval_as_double(double x)
return y;
}
-/* fp_barrier returns its input, but limits code transformations
- as if it had a side-effect (e.g. observable io) and returned
- an arbitrary value. */
+/* fp_barrier returns its input (without dropping excess precision),
+ but limits code transformations as if it had a side-effect
+ (e.g. observable io) and returned an arbitrary value. */
#ifndef fp_barrierf
#define fp_barrierf fp_barrierf
-static inline float fp_barrierf(float x)
+static inline float fp_barrierf(float_t x)
{
- volatile float y = x;
+ volatile float_t y = x;
return y;
}
#endif
#ifndef fp_barrier
#define fp_barrier fp_barrier
-static inline double fp_barrier(double x)
+static inline double_t fp_barrier(double_t x)
{
- volatile double y = x;
+ volatile double_t y = x;
return y;
}
#endif
--
2.21.0
[-- Attachment #3: 0002-x86-optimize-fp_arch.h.patch --]
[-- Type: text/x-diff, Size: 4047 bytes --]
From 75ad4e8ec4abc6ce1d801017679c9e9e50fdfcf5 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Wed, 24 Apr 2019 23:29:05 +0000
Subject: [PATCH 2/3] x86: optimize fp_arch.h
Use inline asm constraints instead of a volatile store. fp_barrier does
not need to drop excess precision when the x87 FPU is used, while
fp_force_eval uses a memory constraint to drop excess precision. When
SSE2 math is available, an xmm register constraint is used instead.
This saves 416 and 322 bytes in .text on x86_64 and i386 respectively.
---
arch/i386/fp_arch.h | 48 +++++++++++++++++++++++++++++++++++++++++++
arch/x32/fp_arch.h | 40 ++++++++++++++++++++++++++++++++++++
arch/x86_64/fp_arch.h | 40 ++++++++++++++++++++++++++++++++++++
3 files changed, 128 insertions(+)
create mode 100644 arch/i386/fp_arch.h
create mode 100644 arch/x32/fp_arch.h
create mode 100644 arch/x86_64/fp_arch.h
diff --git a/arch/i386/fp_arch.h b/arch/i386/fp_arch.h
new file mode 100644
index 00000000..33ac222d
--- /dev/null
+++ b/arch/i386/fp_arch.h
@@ -0,0 +1,48 @@
+#include <float.h>
+
+#ifdef __SSE2_MATH__
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+x"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+x"(x))
+#else
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+t"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+m"(x))
+#endif
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+ FP_BARRIER(x);
+ return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+ FP_BARRIER(x);
+ return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+ __asm__ __volatile__ ("" : "+t"(x));
+ return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+ FP_EVAL(x);
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+ FP_EVAL(x);
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+ __asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x32/fp_arch.h b/arch/x32/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x32/fp_arch.h
@@ -0,0 +1,40 @@
+#include <float.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+ return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+ return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+ __asm__ __volatile__ ("" : "+t"(x));
+ return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+ __asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x86_64/fp_arch.h b/arch/x86_64/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x86_64/fp_arch.h
@@ -0,0 +1,40 @@
+#include <float.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+ return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+ return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+ __asm__ __volatile__ ("" : "+t"(x));
+ return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+ __asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+ __asm__ __volatile__ ("" : "+t"(x));
+}
--
2.21.0
[-- Attachment #4: 0003-math-use-fp_force_eval-and-fp_barrier-in-fma.patch --]
[-- Type: text/x-diff, Size: 1131 bytes --]
From 34341757cbaffc878f10676aeb6820f3a02ac29d Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sat, 27 Apr 2019 19:32:49 +0000
Subject: [PATCH 3/3] math: use fp_force_eval and fp_barrier in fma
Idiomatic fenv handling with the new fp_arch.h, less portable, but
it means a bit smaller code size.
---
src/math/fma.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/math/fma.c b/src/math/fma.c
index 0c6f90c9..84dfeec1 100644
--- a/src/math/fma.c
+++ b/src/math/fma.c
@@ -1,6 +1,7 @@
#include <stdint.h>
#include <float.h>
#include <math.h>
+#include "libm.h"
#include "atomic.h"
#define ASUINT64(x) ((union {double f; uint64_t i;}){x}).i
@@ -163,11 +164,10 @@ double fma(double x, double y, double z)
r = i;
r = 2*r - c; /* remove top bit */
- /* raise underflow portably, such that it
- cannot be optimized away */
+ /* raise underflow */
{
- double_t tiny = DBL_MIN/FLT_MIN * r;
- r += (double)(tiny*tiny) * (r-r);
+ double_t tiny = fp_barrier(DBL_MIN/FLT_MIN);
+ fp_force_eval(tiny*tiny);
}
}
} else {
--
2.21.0
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2019-04-27 22:13 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-27 22:13 [PATCH 0/3] fp_barrier improvements Szabolcs Nagy
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).