From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/13319 Path: news.gmane.org!.POSTED!not-for-mail From: Szabolcs Nagy Newsgroups: gmane.linux.lib.musl.general Subject: Re: [PATCH 0/5] add FP_FAST_FMA to math.h Date: Wed, 26 Sep 2018 22:52:39 +0200 Message-ID: <20180926205238.GH10209@port70.net> References: <20180923150933.GC10209@port70.net> Reply-To: musl@lists.openwall.com NNTP-Posting-Host: blaine.gmane.org Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="s9fJI615cBHmzTOP" X-Trace: blaine.gmane.org 1537995047 20783 195.159.176.226 (26 Sep 2018 20:50:47 GMT) X-Complaints-To: usenet@blaine.gmane.org NNTP-Posting-Date: Wed, 26 Sep 2018 20:50:47 +0000 (UTC) User-Agent: Mutt/1.10.1 (2018-07-13) To: musl@lists.openwall.com Original-X-From: musl-return-13335-gllmg-musl=m.gmane.org@lists.openwall.com Wed Sep 26 22:50:43 2018 Return-path: Envelope-to: gllmg-musl@m.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by blaine.gmane.org with smtp (Exim 4.84_2) (envelope-from ) id 1g5Glf-0005KQ-5I for gllmg-musl@m.gmane.org; Wed, 26 Sep 2018 22:50:43 +0200 Original-Received: (qmail 30195 invoked by uid 550); 26 Sep 2018 20:52:51 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Original-Received: (qmail 30177 invoked from network); 26 Sep 2018 20:52:50 -0000 Mail-Followup-To: musl@lists.openwall.com Content-Disposition: inline In-Reply-To: <20180923150933.GC10209@port70.net> Xref: news.gmane.org gmane.linux.lib.musl.general:13319 Archived-At: --s9fJI615cBHmzTOP Content-Type: text/plain; charset=us-ascii Content-Disposition: inline v2: fixed the arm patch to work around a clang bug. 
--s9fJI615cBHmzTOP Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="0001-s390x-add-single-instruction-fma-and-fmaf.patch" >From 58cff58a39fe21d0ad55b572670b2ece0ed6c00e Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 13 Sep 2018 22:35:13 +0000 Subject: [PATCH 1/5] s390x: add single instruction fma and fmaf These are available in the s390x baseline isa -march=z900. --- src/math/s390x/fma.c | 7 +++++++ src/math/s390x/fmaf.c | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 src/math/s390x/fma.c create mode 100644 src/math/s390x/fmaf.c diff --git a/src/math/s390x/fma.c b/src/math/s390x/fma.c new file mode 100644 index 00000000..86da0e49 --- /dev/null +++ b/src/math/s390x/fma.c @@ -0,0 +1,7 @@ +#include <math.h> + +double fma(double x, double y, double z) +{ + __asm__ ("madbr %0, %1, %2" : "+f"(z) : "f"(x), "f"(y)); + return z; +} diff --git a/src/math/s390x/fmaf.c b/src/math/s390x/fmaf.c new file mode 100644 index 00000000..f1aec6ad --- /dev/null +++ b/src/math/s390x/fmaf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float fmaf(float x, float y, float z) +{ + __asm__ ("maebr %0, %1, %2" : "+f"(z) : "f"(x), "f"(y)); + return z; +} -- 2.18.0 --s9fJI615cBHmzTOP Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="0002-powerpc-add-single-instruction-fabs-fabsf-fma-fmaf-s.patch" >From 6adb9c5683f8e9846223d6c062bfcfa1f21a85fe Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 20 Sep 2018 23:14:11 +0000 Subject: [PATCH 2/5] powerpc: add single instruction fabs, fabsf, fma, fmaf, sqrt, sqrtf These are only available on hard float target and sqrt is not available in the base ISA, so further check is used. 
--- src/math/powerpc/fabs.c | 15 +++++++++++++++ src/math/powerpc/fabsf.c | 15 +++++++++++++++ src/math/powerpc/fma.c | 15 +++++++++++++++ src/math/powerpc/fmaf.c | 15 +++++++++++++++ src/math/powerpc/sqrt.c | 15 +++++++++++++++ src/math/powerpc/sqrtf.c | 15 +++++++++++++++ 6 files changed, 90 insertions(+) create mode 100644 src/math/powerpc/fabs.c create mode 100644 src/math/powerpc/fabsf.c create mode 100644 src/math/powerpc/fma.c create mode 100644 src/math/powerpc/fmaf.c create mode 100644 src/math/powerpc/sqrt.c create mode 100644 src/math/powerpc/sqrtf.c diff --git a/src/math/powerpc/fabs.c b/src/math/powerpc/fabs.c new file mode 100644 index 00000000..f6ec4433 --- /dev/null +++ b/src/math/powerpc/fabs.c @@ -0,0 +1,15 @@ +#include <math.h> + +#ifdef _SOFT_FLOAT + +#include "../fabs.c" + +#else + +double fabs(double x) +{ + __asm__ ("fabs %0, %1" : "=d"(x) : "d"(x)); + return x; +} + +#endif diff --git a/src/math/powerpc/fabsf.c b/src/math/powerpc/fabsf.c new file mode 100644 index 00000000..d88b5911 --- /dev/null +++ b/src/math/powerpc/fabsf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#ifdef _SOFT_FLOAT + +#include "../fabsf.c" + +#else + +float fabsf(float x) +{ + __asm__ ("fabs %0, %1" : "=f"(x) : "f"(x)); + return x; +} + +#endif diff --git a/src/math/powerpc/fma.c b/src/math/powerpc/fma.c new file mode 100644 index 00000000..fd268f5f --- /dev/null +++ b/src/math/powerpc/fma.c @@ -0,0 +1,15 @@ +#include <math.h> + +#ifdef _SOFT_FLOAT + +#include "../fma.c" + +#else + +double fma(double x, double y, double z) +{ + __asm__("fmadd %0, %1, %2, %3" : "=d"(x) : "d"(x), "d"(y), "d"(z)); + return x; +} + +#endif diff --git a/src/math/powerpc/fmaf.c b/src/math/powerpc/fmaf.c new file mode 100644 index 00000000..a99a2a3b --- /dev/null +++ b/src/math/powerpc/fmaf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#ifdef _SOFT_FLOAT + +#include "../fmaf.c" + +#else + +float fmaf(float x, float y, float z) +{ + __asm__("fmadds %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z)); + return x; +} + +#endif diff --git 
a/src/math/powerpc/sqrt.c b/src/math/powerpc/sqrt.c new file mode 100644 index 00000000..8718dbd0 --- /dev/null +++ b/src/math/powerpc/sqrt.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if !defined _SOFT_FLOAT && defined _ARCH_PPCSQ + +double sqrt(double x) +{ + __asm__ ("fsqrt %0, %1\n" : "=d" (x) : "d" (x)); + return x; +} + +#else + +#include "../sqrt.c" + +#endif diff --git a/src/math/powerpc/sqrtf.c b/src/math/powerpc/sqrtf.c new file mode 100644 index 00000000..3431b672 --- /dev/null +++ b/src/math/powerpc/sqrtf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if !defined _SOFT_FLOAT && defined _ARCH_PPCSQ + +float sqrtf(float x) +{ + __asm__ ("fsqrts %0, %1\n" : "=f" (x) : "f" (x)); + return x; +} + +#else + +#include "../sqrtf.c" + +#endif -- 2.18.0 --s9fJI615cBHmzTOP Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="0003-arm-add-single-instruction-fma.patch" >From e33ac3fd4a39416c4b681d610c7ad9737a279260 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Sat, 22 Sep 2018 18:47:27 +0000 Subject: [PATCH 3/5] arm: add single instruction fma vfma is available in the vfpv4 fpu and above, the ACLE standard feature test for double precision hardware fma support is __ARM_FEATURE_FMA && __ARM_FP&8 we need further checks to work around clang bugs (fixed in clang >=7.0) && !__SOFTFP__ because __ARM_FP is defined even with -mfloat-abi=soft && !BROKEN_VFP_ASM to disable the single precision code when inline asm handling is broken. For runtime selection the HWCAP_ARM_VFPv4 hwcap flag can be used, but that requires further work. 
--- src/math/arm/fma.c | 15 +++++++++++++++ src/math/arm/fmaf.c | 15 +++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 src/math/arm/fma.c create mode 100644 src/math/arm/fmaf.c diff --git a/src/math/arm/fma.c b/src/math/arm/fma.c new file mode 100644 index 00000000..2a9b8efa --- /dev/null +++ b/src/math/arm/fma.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __ARM_FEATURE_FMA && __ARM_FP&8 && !__SOFTFP__ + +double fma(double x, double y, double z) +{ + __asm__ ("vfma.f64 %P0, %P1, %P2" : "+w"(z) : "w"(x), "w"(y)); + return z; +} + +#else + +#include "../fma.c" + +#endif diff --git a/src/math/arm/fmaf.c b/src/math/arm/fmaf.c new file mode 100644 index 00000000..a1793d27 --- /dev/null +++ b/src/math/arm/fmaf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __ARM_FEATURE_FMA && __ARM_FP&4 && !__SOFTFP__ && !BROKEN_VFP_ASM + +float fmaf(float x, float y, float z) +{ + __asm__ ("vfma.f32 %0, %1, %2" : "+t"(z) : "t"(x), "t"(y)); + return z; +} + +#else + +#include "../fmaf.c" + +#endif -- 2.18.0 --s9fJI615cBHmzTOP Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="0004-x86_64-add-single-instruction-fma.patch" >From 7a54c4fee1771cdc9de42445c813d9e7d43d272e Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Sat, 22 Sep 2018 21:43:42 +0000 Subject: [PATCH 4/5] x86_64: add single instruction fma fma is only available on recent x86_64 cpus and it is much faster than a software fma, so this should be done with a runtime check, however that requires more changes, this patch just adds the code so it can be tested when musl is compiled with -mfma or -mfma4. 
--- src/math/x32/fma.c | 23 +++++++++++++++++++++++ src/math/x32/fmaf.c | 23 +++++++++++++++++++++++ src/math/x86_64/fma.c | 23 +++++++++++++++++++++++ src/math/x86_64/fmaf.c | 23 +++++++++++++++++++++++ 4 files changed, 92 insertions(+) create mode 100644 src/math/x32/fma.c create mode 100644 src/math/x32/fmaf.c create mode 100644 src/math/x86_64/fma.c create mode 100644 src/math/x86_64/fmaf.c diff --git a/src/math/x32/fma.c b/src/math/x32/fma.c new file mode 100644 index 00000000..4dd53f2a --- /dev/null +++ b/src/math/x32/fma.c @@ -0,0 +1,23 @@ +#include <math.h> + +#if __FMA__ + +double fma(double x, double y, double z) +{ + __asm__ ("vfmadd132sd %1, %2, %0" : "+x" (x) : "x" (y), "x" (z)); + return x; +} + +#elif __FMA4__ + +double fma(double x, double y, double z) +{ + __asm__ ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + +#else + +#include "../fma.c" + +#endif diff --git a/src/math/x32/fmaf.c b/src/math/x32/fmaf.c new file mode 100644 index 00000000..30b971ff --- /dev/null +++ b/src/math/x32/fmaf.c @@ -0,0 +1,23 @@ +#include <math.h> + +#if __FMA__ + +float fmaf(float x, float y, float z) +{ + __asm__ ("vfmadd132ss %1, %2, %0" : "+x" (x) : "x" (y), "x" (z)); + return x; +} + +#elif __FMA4__ + +float fmaf(float x, float y, float z) +{ + __asm__ ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + +#else + +#include "../fmaf.c" + +#endif diff --git a/src/math/x86_64/fma.c b/src/math/x86_64/fma.c new file mode 100644 index 00000000..4dd53f2a --- /dev/null +++ b/src/math/x86_64/fma.c @@ -0,0 +1,23 @@ +#include <math.h> + +#if __FMA__ + +double fma(double x, double y, double z) +{ + __asm__ ("vfmadd132sd %1, %2, %0" : "+x" (x) : "x" (y), "x" (z)); + return x; +} + +#elif __FMA4__ + +double fma(double x, double y, double z) +{ + __asm__ ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + +#else + +#include "../fma.c" + +#endif diff --git a/src/math/x86_64/fmaf.c b/src/math/x86_64/fmaf.c new 
file mode 100644 index 00000000..30b971ff --- /dev/null +++ b/src/math/x86_64/fmaf.c @@ -0,0 +1,23 @@ +#include <math.h> + +#if __FMA__ + +float fmaf(float x, float y, float z) +{ + __asm__ ("vfmadd132ss %1, %2, %0" : "+x" (x) : "x" (y), "x" (z)); + return x; +} + +#elif __FMA4__ + +float fmaf(float x, float y, float z) +{ + __asm__ ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + +#else + +#include "../fmaf.c" + +#endif -- 2.18.0 --s9fJI615cBHmzTOP Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="0005-define-FP_FAST_FMA-and-FP_FAST_FMAF-when-fma-and-fma.patch" >From 03768f71f9ece09838c86828f710180d9e1803d6 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Sun, 19 Mar 2017 03:56:01 +0000 Subject: [PATCH 5/5] define FP_FAST_FMA and FP_FAST_FMAF when fma and fmaf can be inlined FP_FAST_FMA can be defined if "the fma function generally executes about as fast as, or faster than, a multiply and an add of double operands", which can only be true if the fma call is inlined as an instruction. gcc sets __FP_FAST_FMA if __builtin_fma is inlined as an instruction, but that does not mean an fma call will be inlined (e.g. it is defined with -fno-builtin-fma), other compilers (clang) don't even have such macro, so there is no reliable way to tell when fma is inlined. one approach is to define FP_FAST_FMA based on the libc implementation: when it has a single instruction implementation, then the compiler should also be able to do the inlining and in case that fails at least the libc code is still fast (there is just an extern call overhead). on aarch64, powerpc, powerpc64, s390x we can give this guarantee, but on arm, x32 and x86_64 runtime checks would be needed to do the same. for now arm, x32 and x86_64 set FP_FAST_FMA when the compiler should be able to inline fma, but if that fails the libc code will be slow (unless musl is built for an isa baseline that includes an fma instruction). 
--- arch/aarch64/bits/math.h | 2 ++ arch/arm/bits/math.h | 6 ++++++ arch/generic/bits/math.h | 0 arch/powerpc/bits/math.h | 4 ++++ arch/powerpc64/bits/math.h | 2 ++ arch/s390x/bits/math.h | 2 ++ arch/x32/bits/math.h | 4 ++++ arch/x86_64/bits/math.h | 4 ++++ include/math.h | 2 ++ 9 files changed, 26 insertions(+) create mode 100644 arch/aarch64/bits/math.h create mode 100644 arch/arm/bits/math.h create mode 100644 arch/generic/bits/math.h create mode 100644 arch/powerpc/bits/math.h create mode 100644 arch/powerpc64/bits/math.h create mode 100644 arch/s390x/bits/math.h create mode 100644 arch/x32/bits/math.h create mode 100644 arch/x86_64/bits/math.h diff --git a/arch/aarch64/bits/math.h b/arch/aarch64/bits/math.h new file mode 100644 index 00000000..c7ec28c5 --- /dev/null +++ b/arch/aarch64/bits/math.h @@ -0,0 +1,2 @@ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 diff --git a/arch/arm/bits/math.h b/arch/arm/bits/math.h new file mode 100644 index 00000000..f87817f0 --- /dev/null +++ b/arch/arm/bits/math.h @@ -0,0 +1,6 @@ +#if __ARM_FEATURE_FMA && __ARM_FP&8 && !__SOFTFP__ +#define FP_FAST_FMA 1 +#endif +#if __ARM_FEATURE_FMA && __ARM_FP&4 && !__SOFTFP__ +#define FP_FAST_FMAF 1 +#endif diff --git a/arch/generic/bits/math.h b/arch/generic/bits/math.h new file mode 100644 index 00000000..e69de29b diff --git a/arch/powerpc/bits/math.h b/arch/powerpc/bits/math.h new file mode 100644 index 00000000..3913b15e --- /dev/null +++ b/arch/powerpc/bits/math.h @@ -0,0 +1,4 @@ +#ifndef _SOFT_FLOAT +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/arch/powerpc64/bits/math.h b/arch/powerpc64/bits/math.h new file mode 100644 index 00000000..c7ec28c5 --- /dev/null +++ b/arch/powerpc64/bits/math.h @@ -0,0 +1,2 @@ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 diff --git a/arch/s390x/bits/math.h b/arch/s390x/bits/math.h new file mode 100644 index 00000000..c7ec28c5 --- /dev/null +++ b/arch/s390x/bits/math.h @@ -0,0 +1,2 @@ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 
diff --git a/arch/x32/bits/math.h b/arch/x32/bits/math.h new file mode 100644 index 00000000..c7569d6c --- /dev/null +++ b/arch/x32/bits/math.h @@ -0,0 +1,4 @@ +#if __FMA__ || __FMA4__ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/arch/x86_64/bits/math.h b/arch/x86_64/bits/math.h new file mode 100644 index 00000000..c7569d6c --- /dev/null +++ b/arch/x86_64/bits/math.h @@ -0,0 +1,4 @@ +#if __FMA__ || __FMA4__ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/include/math.h b/include/math.h index fea34686..58da26c2 100644 --- a/include/math.h +++ b/include/math.h @@ -11,6 +11,8 @@ extern "C" { #define __NEED_double_t #include <bits/alltypes.h> +#include <bits/math.h> + #if 100*__GNUC__+__GNUC_MINOR__ >= 303 #define NAN __builtin_nanf("") #define INFINITY __builtin_inff() -- 2.18.0 --s9fJI615cBHmzTOP--