[PATCH] math: rewrite fma with mostly int arithmetics

mailing list of musl libc
 help / color / mirror / code / Atom feed

From: Szabolcs Nagy <nsz@port70.net>
To: musl@lists.openwall.com
Subject: [PATCH] math: rewrite fma with mostly int arithmetics
Date: Wed, 19 Apr 2017 00:41:40 +0200	[thread overview]
Message-ID: <20170418224140.GN2082@port70.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 1675 bytes --]

the freebsd fma code failed to raise underflow exception in some
cases in nearest rounding mode (affects fmal too) e.g.

  fma(-0x1p-1000, 0x1.000001p-74, 0x1p-1022)

and the inexact exception may be raised spuriously since the fenv
is not saved/restored around the exact multiplication algorithm
(affects x86 fma too).

another issue is that the underflow behaviour when the rounded result
is the minimal normal number is target dependent, ieee754 allows two
ways to raise underflow for inexact results: raise if the result before
rounding is in the subnormal range (e.g. aarch64, arm, powerpc) or if
the result after rounding with infinite exponent range is in the
subnormal range (e.g. x86, mips, sh).

to avoid all these issues the algorithm was rewritten with mostly int
arithmetics and float arithmetics is only used to get correct rounding
and raise exceptions according to the behaviour of the target without
any fenv.h dependency. it also unifies x86 and non-x86 fma.

fmaf is not affected, fmal need to be fixed too.

this algorithm depends on a_clz_64 and it required a nasty volatile
hack: gcc seems to miscompile the FORCE_EVAL macro of libm.h on i386.
---
 src/math/fma.c | 582 ++++++++++++++++-----------------------------------------
 1 file changed, 158 insertions(+), 424 deletions(-)

attaching the new fma.c instead of a diff, it's more readable.

depends on the a_clz_64 patch and previous scalbn fix.

fmal should be possible to do in a similar way.

i expect it to be faster than the previous code on most targets as
the rounding mode is not changed and has less multiplications
(it is faster on x86_64 and i386), the code size is a bit bigger
though.

[-- Attachment #2: fma.c --]
[-- Type: text/x-csrc, Size: 3797 bytes --]

#include <stdint.h>
#include <float.h>
#include <math.h>
#include "atomic.h"

static inline uint64_t asuint64(double x)
{
	union {double f; uint64_t i;} u = {x};
	return u.i;
}

static inline double asdouble(uint64_t x)
{
	union {uint64_t i; double f;} u = {x};
	return u.f;
}

struct num { uint64_t m; int e; int sign; };

static struct num normalize(uint64_t x)
{
	int e = x>>52;
	int sign = e & 1<<11;
	e &= (1<<11)-1;
	x &= (1ull<<52)-1;
	if (!e) {
		int k = a_clz_64(x);
		x <<= k-11;
		e = -k+12;
	}
	x |= 1ull<<52;
	x <<= 1;
	e -= 0x3ff + 52 + 1;
	return (struct num){x,e,sign};
}

static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
	uint64_t t1,t2,t3;
	uint64_t xlo = (uint32_t)x, xhi = x>>32;
	uint64_t ylo = (uint32_t)y, yhi = y>>32;

	t1 = xlo*ylo;
	t2 = xlo*yhi + xhi*ylo;
	t3 = xhi*yhi;
	*lo = t1 + (t2<<32);
	*hi = t3 + (t2>>32) + (t1 > *lo);
}

static int zeroinfnan(uint64_t x)
{
	return 2*x-1 >= 2*asuint64(INFINITY)-1;
}

double fma(double x, double y, double z)
{
	#pragma STDC FENV_ACCESS ON
	uint64_t ix = asuint64(x);
	uint64_t iy = asuint64(y);
	uint64_t iz = asuint64(z);

	if (zeroinfnan(ix) || zeroinfnan(iy))
		return x*y + z;
	if (zeroinfnan(iz)) {
		if (z == 0)
			return x*y + z;
		return z;
	}

	/* normalize so top 10bits and last bit are 0 */
	struct num nx, ny, nz;
	nx = normalize(ix);
	ny = normalize(iy);
	nz = normalize(iz);

	/* mul: r = x*y */
	uint64_t rhi, rlo, zhi, zlo;
	mul(&rhi, &rlo, nx.m, ny.m);
	/* either top 20 or 21 bits of rhi and last 2 bits of rlo are 0 */

	/* align exponents */
	int e = nx.e + ny.e;
	int d = nz.e - e;
	/* shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz) */
	if (d > 0) {
		if (d < 64) {
			zlo = nz.m<<d;
			zhi = nz.m>>64-d;
		} else {
			zlo = 0;
			zhi = nz.m;
			e = nz.e - 64;
			d -= 64;
			if (d == 0) {
			} else if (d < 64) {
				rlo = rhi<<64-d | rlo>>d | !!(rlo<<64-d);
				rhi = rhi>>d;
			} else {
				rlo = 1;
				rhi = 0;
			}
		}
	} else {
		zhi = 0;
		d = -d;
		if (d == 0) {
			zlo = nz.m;
		} else if (d < 64) {
			zlo = nz.m>>d | !!(nz.m<<64-d);
		} else {
			zlo = 1;
		}
	}

	/* add */
	int sign = nx.sign^ny.sign;
	int samesign = !(sign^nz.sign);
	int nonzero = 1;
	if (samesign) {
		/* r += z */
		rlo += zlo;
		rhi += zhi + (rlo < zlo);
	} else {
		/* r -= z */
		uint64_t t = rlo;
		rlo -= zlo;
		rhi = rhi - zhi - (t < rlo);
		if (rhi>>63) {
			rlo = -rlo;
			rhi = -rhi-!!rlo;
			sign = !sign;
		}
		nonzero = !!rhi;
	}

	/* set rhi to top 63bit of the result (last bit is sticky) */
	if (nonzero) {
		e += 64;
		d = a_clz_64(rhi)-1;
		/* note: d > 0 */
		rhi = rhi<<d | rlo>>64-d | !!(rlo<<d);
	} else if (rlo) {
		d = a_clz_64(rlo)-1;
		if (d < 0)
			rhi = rlo>>1 | (rlo&1);
		else
			rhi = rlo<<d;
	} else {
		/* exact +-0 */
		return x*y + z;
	}
	e -= d;

	/* convert to double */
	int64_t i = rhi; /* in [1<<62,(1<<63)-1] */
	if (sign)
		i = -i;
	double r = i; /* in [0x1p62,0x1p63] */

	if (e < -1022-62) {
		/* result is subnormal before rounding */
		if (e == -1022-63) {
			double c = 0x1p63;
			if (sign)
				c = -c;
			if (r == c) {
				/* min normal after rounding, underflow depends
				   on arch behaviour which can be imitated by
				   a double to float conversion */
				float fltmin = 0x0.ffffff8p-63*FLT_MIN * r;
				return DBL_MIN/FLT_MIN * fltmin;
			}
			/* one bit is lost when scaled, add another top bit to
			   only round once at conversion if it is inexact */
			if (rhi << 53) {
				i = rhi>>1 | (rhi&1) | 1ull<<62;
				if (sign)
					i = -i;
				r = i;
				r = 2*r - c; /* remove top bit */
				volatile double uflow = DBL_MIN/FLT_MIN;
				uflow *= uflow;
			}
		} else {
			/* only round once when scaled */
			d = 10;
			i = ( rhi>>d | !!(rhi<<64-d) ) << d;
			if (sign)
				i = -i;
			r = i;
		}
	}
	return scalbn(r, e);
}

next             reply	other threads:[~2017-04-18 22:41 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-18 22:41 Szabolcs Nagy [this message]
2017-04-22 22:24 ` Rich Felker
2017-04-23 11:00   ` Szabolcs Nagy
2017-04-23 15:15     ` Rich Felker
2017-04-23 22:34       ` Szabolcs Nagy
2017-04-23 22:35         ` Szabolcs Nagy
2017-08-30  2:04           ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170418224140.GN2082@port70.net \
    --to=nsz@port70.net \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).