From: Szabolcs Nagy
To: musl@lists.openwall.com
Subject: [PATCH] math: rewrite fma with mostly int arithmetics
Date: Wed, 19 Apr 2017 00:41:40 +0200
Message-ID: <20170418224140.GN2082@port70.net>

the freebsd fma code failed to raise the underflow exception in some
cases in nearest rounding mode (this affects fmal too), e.g.

  fma(-0x1p-1000, 0x1.000001p-74, 0x1p-1022)

and the inexact exception may be raised spuriously, since the fenv is
not saved/restored around the exact multiplication algorithm (this
affects the x86 fma too).

another issue is that the underflow behaviour when the rounded result
is the minimal normal number is target dependent: ieee754 allows two
ways to raise underflow for inexact results, either when the result
before rounding is in the subnormal range (e.g. aarch64, arm, powerpc)
or when the result after rounding with unbounded exponent range is in
the subnormal range (e.g. x86, mips, sh).

to avoid all these issues the algorithm was rewritten with mostly
integer arithmetics; floating-point arithmetics is only used to get
correct rounding and to raise exceptions according to the behaviour
of the target, without any fenv.h dependency. it also unifies the
x86 and non-x86 fma.

fmaf is not affected; fmal needs to be fixed too.

the algorithm depends on a_clz_64 and it required a nasty volatile
hack: gcc seems to miscompile the FORCE_EVAL macro of libm.h on i386.
---
 src/math/fma.c | 582 ++++++++++++++++-----------------------------------
 1 file changed, 158 insertions(+), 424 deletions(-)

attaching the new fma.c instead of a diff, it's more readable. it
depends on the a_clz_64 patch and the previous scalbn fix. fmal
should be possible to do in a similar way.

i expect it to be faster than the previous code on most targets, as
the rounding mode is not changed and there are fewer multiplications
(it is faster on x86_64 and i386); the code size is a bit bigger
though.
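for reference, the first issue can be reproduced with a small
standalone harness like the sketch below (not part of the patch; it
only assumes a hosted compiler with fenv support). the result here is
subnormal and inexact, so underflow should be raised under both
tininess conventions, but the old code missed it in nearest rounding:

#include <stdio.h>
#include <math.h>
#include <fenv.h>

int main(void)
{
	/* inputs from the report above; volatile so the compiler
	   cannot constant-fold the operation and its flags away */
	volatile double x = -0x1p-1000, y = 0x1.000001p-74, z = 0x1p-1022;
	feclearexcept(FE_ALL_EXCEPT);
	volatile double r = fma(x, y, z);
	printf("fma = %a\n", r);
	printf("underflow: %d inexact: %d\n",
	       !!fetestexcept(FE_UNDERFLOW), !!fetestexcept(FE_INEXACT));
	return 0;
}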
[attachment: fma.c]

#include <stdint.h>
#include <float.h>
#include <math.h>
#include "atomic.h"

static inline uint64_t asuint64(double x)
{
	union {double f; uint64_t i;} u = {x};
	return u.i;
}

static inline double asdouble(uint64_t x)
{
	union {uint64_t i; double f;} u = {x};
	return u.f;
}

struct num {
	uint64_t m;
	int e;
	int sign;
};

static struct num normalize(uint64_t x)
{
	int e = x>>52;
	int sign = e & 1<<11;
	e &= (1<<11)-1;
	x &= (1ull<<52)-1;
	if (!e) {
		/* subnormal; x != 0 here, zeros are handled in fma */
		int k = a_clz_64(x);
		x <<= k-11;
		e = -k+12;
	}
	x |= 1ull<<52;
	x <<= 1;
	e -= 0x3ff + 52 + 1;
	return (struct num){x,e,sign};
}

static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
	uint64_t t1,t2,t3;
	uint64_t xlo = (uint32_t)x, xhi = x>>32;
	uint64_t ylo = (uint32_t)y, yhi = y>>32;

	t1 = xlo*ylo;
	t2 = xlo*yhi + xhi*ylo;
	t3 = xhi*yhi;
	*lo = t1 + (t2<<32);
	*hi = t3 + (t2>>32) + (t1 > *lo);
}

static int zeroinfnan(uint64_t x)
{
	/* 2*x drops the sign bit; for x == +-0 the 2*x-1 wraps
	   around, so zeros are included too */
	return 2*x-1 >= 2*asuint64(INFINITY)-1;
}

double fma(double x, double y, double z)
{
	#pragma STDC FENV_ACCESS ON
	uint64_t ix = asuint64(x);
	uint64_t iy = asuint64(y);
	uint64_t iz = asuint64(z);

	if (zeroinfnan(ix) || zeroinfnan(iy))
		return x*y + z;
	if (zeroinfnan(iz)) {
		if (z == 0)
			return x*y + z;
		return z;
	}

	/* normalize so top 10bits and last bit are 0 */
	struct num nx, ny, nz;
	nx = normalize(ix);
	ny = normalize(iy);
	nz = normalize(iz);

	/* mul: r = x*y */
	uint64_t rhi, rlo, zhi, zlo;
	mul(&rhi, &rlo, nx.m, ny.m);
	/* either top 20 or 21 bits of rhi and last 2 bits of rlo are 0 */

	/* align exponents */
	int e = nx.e + ny.e;
	int d = nz.e - e;
	/* shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz) */
	if (d > 0) {
		if (d < 64) {
			zlo = nz.m<<d;
			zhi = nz.m>>64-d;
		} else {
			zlo = 0;
			zhi = nz.m;
			e = nz.e - 64;
			d -= 64;
			if (d == 0) {
			} else if (d < 64) {
				rlo = rhi<<64-d | rlo>>d | !!(rlo<<64-d);
				rhi = rhi>>d;
			} else {
				rlo = 1;
				rhi = 0;
			}
		}
	} else {
		zhi = 0;
		d = -d;
		if (d == 0) {
			zlo = nz.m;
		} else if (d < 64) {
			zlo = nz.m>>d | !!(nz.m<<64-d);
		} else {
			zlo = 1;
		}
	}

	/* add */
	int sign = nx.sign^ny.sign;
	int samesign = !(sign^nz.sign);
	int nonzero = 1;
	if (samesign) {
		/* r += z */
		rlo += zlo;
		rhi += zhi + (rlo < zlo);
	} else {
		/* r -= z */
		uint64_t t = rlo;
		rlo -= zlo;
		rhi = rhi - zhi - (t < rlo);
		if (rhi>>63) {
			rlo = -rlo;
			rhi = -rhi-!!rlo;
			sign = !sign;
		}
		nonzero = !!rhi;
	}

	/* set rhi to top 63bit of the result (last bit is sticky) */
	if (nonzero) {
		e += 64;
		d = a_clz_64(rhi)-1;
		/* note: d > 0 */
		rhi = rhi<<d | rlo>>64-d | !!(rlo<<d);
	} else if (rlo) {
		d = a_clz_64(rlo)-1;
		if (d < 0)
			rhi = rlo>>1 | (rlo&1);
		else
			rhi = rlo<<d;
	} else {
		/* exact +-0 */
		return x*y + z;
	}
	e -= d;

	/* convert to double */
	int64_t i = rhi; /* i is in [1<<62,(1<<63)-1] */
	if (sign)
		i = -i;
	double r = i; /* |r| is in [0x1p62,0x1p63] */

	if (e < -1022-62) {
		/* result is subnormal before rounding */
		if (e == -1022-63) {
			double c = 0x1p63;
			if (sign)
				c = -c;
			if (r == c) {
				/* min normal after rounding, underflow depends
				   on arch behaviour which can be imitated by
				   a double to float conversion */
				float fltmin = 0x0.000002p-126*FLT_MIN * r;
				return DBL_MIN/FLT_MIN * fltmin;
			}
			/* one bit is lost when scaled, add another top bit to
			   only round once at conversion if it is inexact */
			if (rhi << 53) {
				i = rhi>>1 | (rhi&1) | 1ull<<62;
				if (sign)
					i = -i;
				r = i;
				r = 2*r - c; /* remove top bit */
				/* raise underflow and inexact; volatile keeps
				   the dead computation from being removed */
				volatile double uflow = DBL_MIN/FLT_MIN;
				uflow *= uflow;
			}
		} else {
			/* only round once when scaled */
			d = 10;
			i = ( rhi>>d | !!(rhi<<64-d) ) << d;
			if (sign)
				i = -i;
			r = i;
		}
	}
	return scalbn(r, e);
}
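as a side note (not part of the patch, and it assumes the gcc/clang
unsigned __int128 extension), the 64x64 -> 128 bit mul() above can be
sanity-checked against the compiler's wide multiply. mul() is only
exact when the cross products in t2 cannot carry, which is guaranteed
for the <=54bit mantissas fma feeds it, so the throwaway test below
masks its random inputs down to 54 bits:

#include <stdint.h>
#include <stdio.h>

static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
	uint64_t t1,t2,t3;
	uint64_t xlo = (uint32_t)x, xhi = x>>32;
	uint64_t ylo = (uint32_t)y, yhi = y>>32;

	t1 = xlo*ylo;
	t2 = xlo*yhi + xhi*ylo;
	t3 = xhi*yhi;
	*lo = t1 + (t2<<32);
	*hi = t3 + (t2>>32) + (t1 > *lo);
}

static uint64_t rnd64(uint64_t *s)
{
	/* xorshift64, seed must be nonzero */
	*s ^= *s<<13;
	*s ^= *s>>7;
	*s ^= *s<<17;
	return *s;
}

int main(void)
{
	uint64_t s = 0x123456789abcdef;
	for (int i = 0; i < 1000000; i++) {
		uint64_t x = rnd64(&s)>>10; /* 54bit, like a normalized mantissa */
		uint64_t y = rnd64(&s)>>10;
		uint64_t hi, lo;
		mul(&hi, &lo, x, y);
		unsigned __int128 ref = (unsigned __int128)x * y;
		if (hi != (uint64_t)(ref>>64) || lo != (uint64_t)ref) {
			printf("mul mismatch at x=%llx y=%llx\n",
				(unsigned long long)x, (unsigned long long)y);
			return 1;
		}
	}
	return 0;
}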