[PATCH] math: rewrite fma with mostly int arithmetics

mailing list of musl libc
 help / color / mirror / code / Atom feed

* [PATCH] math: rewrite fma with mostly int arithmetics
@ 2017-04-18 22:41 Szabolcs Nagy
  2017-04-22 22:24 ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Szabolcs Nagy @ 2017-04-18 22:41 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1675 bytes --]

the freebsd fma code failed to raise underflow exception in some
cases in nearest rounding mode (affects fmal too) e.g.

  fma(-0x1p-1000, 0x1.000001p-74, 0x1p-1022)

and the inexact exception may be raised spuriously since the fenv
is not saved/restored around the exact multiplication algorithm
(affects x86 fma too).

another issue is that the underflow behaviour when the rounded result
is the minimal normal number is target dependent, ieee754 allows two
ways to raise underflow for inexact results: raise if the result before
rounding is in the subnormal range (e.g. aarch64, arm, powerpc) or if
the result after rounding with infinite exponent range is in the
subnormal range (e.g. x86, mips, sh).

to avoid all these issues the algorithm was rewritten with mostly int
arithmetics and float arithmetics is only used to get correct rounding
and raise exceptions according to the behaviour of the target without
any fenv.h dependency. it also unifies x86 and non-x86 fma.

fmaf is not affected, fmal need to be fixed too.

this algorithm depends on a_clz_64 and it required a nasty volatile
hack: gcc seems to miscompile the FORCE_EVAL macro of libm.h on i386.
---
 src/math/fma.c | 582 ++++++++++++++++-----------------------------------------
 1 file changed, 158 insertions(+), 424 deletions(-)

attaching the new fma.c instead of a diff, it's more readable.

depends on the a_clz_64 patch and previous scalbn fix.

fmal should be possible to do in a similar way.

i expect it to be faster than the previous code on most targets as
the rounding mode is not changed and has less multiplications
(it is faster on x86_64 and i386), the code size is a bit bigger
though.

[-- Attachment #2: fma.c --]
[-- Type: text/x-csrc, Size: 3797 bytes --]

#include <stdint.h>
#include <float.h>
#include <math.h>
#include "atomic.h"

static inline uint64_t asuint64(double x)
{
	union {double f; uint64_t i;} u = {x};
	return u.i;
}

static inline double asdouble(uint64_t x)
{
	union {uint64_t i; double f;} u = {x};
	return u.f;
}

struct num { uint64_t m; int e; int sign; };

static struct num normalize(uint64_t x)
{
	int e = x>>52;
	int sign = e & 1<<11;
	e &= (1<<11)-1;
	x &= (1ull<<52)-1;
	if (!e) {
		int k = a_clz_64(x);
		x <<= k-11;
		e = -k+12;
	}
	x |= 1ull<<52;
	x <<= 1;
	e -= 0x3ff + 52 + 1;
	return (struct num){x,e,sign};
}

static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
	uint64_t t1,t2,t3;
	uint64_t xlo = (uint32_t)x, xhi = x>>32;
	uint64_t ylo = (uint32_t)y, yhi = y>>32;

	t1 = xlo*ylo;
	t2 = xlo*yhi + xhi*ylo;
	t3 = xhi*yhi;
	*lo = t1 + (t2<<32);
	*hi = t3 + (t2>>32) + (t1 > *lo);
}

static int zeroinfnan(uint64_t x)
{
	return 2*x-1 >= 2*asuint64(INFINITY)-1;
}

double fma(double x, double y, double z)
{
	#pragma STDC FENV_ACCESS ON
	uint64_t ix = asuint64(x);
	uint64_t iy = asuint64(y);
	uint64_t iz = asuint64(z);

	if (zeroinfnan(ix) || zeroinfnan(iy))
		return x*y + z;
	if (zeroinfnan(iz)) {
		if (z == 0)
			return x*y + z;
		return z;
	}

	/* normalize so top 10bits and last bit are 0 */
	struct num nx, ny, nz;
	nx = normalize(ix);
	ny = normalize(iy);
	nz = normalize(iz);

	/* mul: r = x*y */
	uint64_t rhi, rlo, zhi, zlo;
	mul(&rhi, &rlo, nx.m, ny.m);
	/* either top 20 or 21 bits of rhi and last 2 bits of rlo are 0 */

	/* align exponents */
	int e = nx.e + ny.e;
	int d = nz.e - e;
	/* shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz) */
	if (d > 0) {
		if (d < 64) {
			zlo = nz.m<<d;
			zhi = nz.m>>64-d;
		} else {
			zlo = 0;
			zhi = nz.m;
			e = nz.e - 64;
			d -= 64;
			if (d == 0) {
			} else if (d < 64) {
				rlo = rhi<<64-d | rlo>>d | !!(rlo<<64-d);
				rhi = rhi>>d;
			} else {
				rlo = 1;
				rhi = 0;
			}
		}
	} else {
		zhi = 0;
		d = -d;
		if (d == 0) {
			zlo = nz.m;
		} else if (d < 64) {
			zlo = nz.m>>d | !!(nz.m<<64-d);
		} else {
			zlo = 1;
		}
	}

	/* add */
	int sign = nx.sign^ny.sign;
	int samesign = !(sign^nz.sign);
	int nonzero = 1;
	if (samesign) {
		/* r += z */
		rlo += zlo;
		rhi += zhi + (rlo < zlo);
	} else {
		/* r -= z */
		uint64_t t = rlo;
		rlo -= zlo;
		rhi = rhi - zhi - (t < rlo);
		if (rhi>>63) {
			rlo = -rlo;
			rhi = -rhi-!!rlo;
			sign = !sign;
		}
		nonzero = !!rhi;
	}

	/* set rhi to top 63bit of the result (last bit is sticky) */
	if (nonzero) {
		e += 64;
		d = a_clz_64(rhi)-1;
		/* note: d > 0 */
		rhi = rhi<<d | rlo>>64-d | !!(rlo<<d);
	} else if (rlo) {
		d = a_clz_64(rlo)-1;
		if (d < 0)
			rhi = rlo>>1 | (rlo&1);
		else
			rhi = rlo<<d;
	} else {
		/* exact +-0 */
		return x*y + z;
	}
	e -= d;

	/* convert to double */
	int64_t i = rhi; /* in [1<<62,(1<<63)-1] */
	if (sign)
		i = -i;
	double r = i; /* in [0x1p62,0x1p63] */

	if (e < -1022-62) {
		/* result is subnormal before rounding */
		if (e == -1022-63) {
			double c = 0x1p63;
			if (sign)
				c = -c;
			if (r == c) {
				/* min normal after rounding, underflow depends
				   on arch behaviour which can be imitated by
				   a double to float conversion */
				float fltmin = 0x0.ffffff8p-63*FLT_MIN * r;
				return DBL_MIN/FLT_MIN * fltmin;
			}
			/* one bit is lost when scaled, add another top bit to
			   only round once at conversion if it is inexact */
			if (rhi << 53) {
				i = rhi>>1 | (rhi&1) | 1ull<<62;
				if (sign)
					i = -i;
				r = i;
				r = 2*r - c; /* remove top bit */
				volatile double uflow = DBL_MIN/FLT_MIN;
				uflow *= uflow;
			}
		} else {
			/* only round once when scaled */
			d = 10;
			i = ( rhi>>d | !!(rhi<<64-d) ) << d;
			if (sign)
				i = -i;
			r = i;
		}
	}
	return scalbn(r, e);
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] math: rewrite fma with mostly int arithmetics
  2017-04-18 22:41 [PATCH] math: rewrite fma with mostly int arithmetics Szabolcs Nagy
@ 2017-04-22 22:24 ` Rich Felker
  2017-04-23 11:00   ` Szabolcs Nagy
  0 siblings, 1 reply; 7+ messages in thread
From: Rich Felker @ 2017-04-22 22:24 UTC (permalink / raw)
  To: musl

A few thoughts, inline below. I'm not entirely opposed to this, if it
turns out to be better than the alternatives, but I would like to
understand whether it really is...

On Wed, Apr 19, 2017 at 12:41:40AM +0200, Szabolcs Nagy wrote:
> the freebsd fma code failed to raise underflow exception in some
> cases in nearest rounding mode (affects fmal too) e.g.
> 
>   fma(-0x1p-1000, 0x1.000001p-74, 0x1p-1022)
> 
> and the inexact exception may be raised spuriously since the fenv
> is not saved/restored around the exact multiplication algorithm
> (affects x86 fma too).

Is it difficult to determine when the multiplication part of an fma is
exact? If you can determine this quickly, you can just return x*y+z in
this special case and avoid all the costly operations. For normal
range, I think it's roughly just using ctz to count mantissa bits of x
and y, and checking whether the sum is <= 53. Some additional handling
for denormals is needed of course.

> another issue is that the underflow behaviour when the rounded result
> is the minimal normal number is target dependent, ieee754 allows two
> ways to raise underflow for inexact results: raise if the result before
> rounding is in the subnormal range (e.g. aarch64, arm, powerpc) or if
> the result after rounding with infinite exponent range is in the
> subnormal range (e.g. x86, mips, sh).
> 
> to avoid all these issues the algorithm was rewritten with mostly int
> arithmetics and float arithmetics is only used to get correct rounding
> and raise exceptions according to the behaviour of the target without
> any fenv.h dependency. it also unifies x86 and non-x86 fma.
> 
> fmaf is not affected, fmal need to be fixed too.
> 
> this algorithm depends on a_clz_64 and it required a nasty volatile
> hack: gcc seems to miscompile the FORCE_EVAL macro of libm.h on i386.

These are two particular aspects I don't like; (1) I'd rather reduce
the number of a_* primitives we have rather than add new ones, and (2)
I'd like to avoid volatile dances to get the compiler to do what it's
supposed to do. I know the latter might be inevitable in some cases,
though, because compilers are buggy... :(

> ---
>  src/math/fma.c | 582 ++++++++++++++++-----------------------------------------
>  1 file changed, 158 insertions(+), 424 deletions(-)
> 
> attaching the new fma.c instead of a diff, it's more readable.

Thanks, much preferred!

> depends on the a_clz_64 patch and previous scalbn fix.
> 
> fmal should be possible to do in a similar way.
> 
> i expect it to be faster than the previous code on most targets as
> the rounding mode is not changed and has less multiplications
> (it is faster on x86_64 and i386), the code size is a bit bigger
> though.

Kinda surprising on i386 -- I'd expect the 64x64 multiplications to be
costly compared to float ones.

> #include <stdint.h>
> #include <float.h>
> #include <math.h>
> #include "atomic.h"
> 
> static inline uint64_t asuint64(double x)
> {
> 	union {double f; uint64_t i;} u = {x};
> 	return u.i;
> }
> 
> static inline double asdouble(uint64_t x)
> {
> 	union {uint64_t i; double f;} u = {x};
> 	return u.f;
> }

These could just be written with compound literals, making macros an
option, though I don't know if there's any reason one would prefer
macros.

> struct num { uint64_t m; int e; int sign; };
> 
> static struct num normalize(uint64_t x)
> {
> 	int e = x>>52;
> 	int sign = e & 1<<11;
> 	e &= (1<<11)-1;
> 	x &= (1ull<<52)-1;
> 	if (!e) {
> 		int k = a_clz_64(x);
> 		x <<= k-11;
> 		e = -k+12;
> 	}
> 	x |= 1ull<<52;
> 	x <<= 1;
> 	e -= 0x3ff + 52 + 1;
> 	return (struct num){x,e,sign};
> }
> 
> static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
> {
> 	uint64_t t1,t2,t3;
> 	uint64_t xlo = (uint32_t)x, xhi = x>>32;
> 	uint64_t ylo = (uint32_t)y, yhi = y>>32;
> 
> 	t1 = xlo*ylo;
> 	t2 = xlo*yhi + xhi*ylo;
> 	t3 = xhi*yhi;
> 	*lo = t1 + (t2<<32);
> 	*hi = t3 + (t2>>32) + (t1 > *lo);
> }
> 
> static int zeroinfnan(uint64_t x)
> {
> 	return 2*x-1 >= 2*asuint64(INFINITY)-1;
> }
> 
> double fma(double x, double y, double z)
> {
> 	#pragma STDC FENV_ACCESS ON
> 	uint64_t ix = asuint64(x);
> 	uint64_t iy = asuint64(y);
> 	uint64_t iz = asuint64(z);
> 
> 	if (zeroinfnan(ix) || zeroinfnan(iy))
> 		return x*y + z;
> 	if (zeroinfnan(iz)) {
> 		if (z == 0)
> 			return x*y + z;
> 		return z;
> 	}
> 
> 	/* normalize so top 10bits and last bit are 0 */
> 	struct num nx, ny, nz;
> 	nx = normalize(ix);
> 	ny = normalize(iy);
> 	nz = normalize(iz);

If the only constraint here is that top 10 bits and last bit are 0, I
don't see why clz is even needed. You can meet this constraint for
denormals by always multiplying by 2 and using a fixed exponent value.

> 	/* mul: r = x*y */
> 	uint64_t rhi, rlo, zhi, zlo;
> 	mul(&rhi, &rlo, nx.m, ny.m);
> 	/* either top 20 or 21 bits of rhi and last 2 bits of rlo are 0 */
> 
> 	/* align exponents */
> 	int e = nx.e + ny.e;
> 	int d = nz.e - e;
> 	/* shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz) */
> 	if (d > 0) {
> 		if (d < 64) {
> 			zlo = nz.m<<d;
> 			zhi = nz.m>>64-d;
> 		} else {
> 			zlo = 0;
> 			zhi = nz.m;
> 			e = nz.e - 64;
> 			d -= 64;
> 			if (d == 0) {
> 			} else if (d < 64) {
> 				rlo = rhi<<64-d | rlo>>d | !!(rlo<<64-d);
> 				rhi = rhi>>d;
> 			} else {
> 				rlo = 1;
> 				rhi = 0;
> 			}
> 		}
> 	} else {
> 		zhi = 0;
> 		d = -d;
> 		if (d == 0) {
> 			zlo = nz.m;
> 		} else if (d < 64) {
> 			zlo = nz.m>>d | !!(nz.m<<64-d);
> 		} else {
> 			zlo = 1;
> 		}
> 	}
> 
> 	/* add */
> 	int sign = nx.sign^ny.sign;
> 	int samesign = !(sign^nz.sign);
> 	int nonzero = 1;
> 	if (samesign) {
> 		/* r += z */
> 		rlo += zlo;
> 		rhi += zhi + (rlo < zlo);
> 	} else {
> 		/* r -= z */
> 		uint64_t t = rlo;
> 		rlo -= zlo;
> 		rhi = rhi - zhi - (t < rlo);
> 		if (rhi>>63) {
> 			rlo = -rlo;
> 			rhi = -rhi-!!rlo;
> 			sign = !sign;
> 		}
> 		nonzero = !!rhi;
> 	}
> 
> 	/* set rhi to top 63bit of the result (last bit is sticky) */
> 	if (nonzero) {
> 		e += 64;
> 		d = a_clz_64(rhi)-1;
> 		/* note: d > 0 */
> 		rhi = rhi<<d | rlo>>64-d | !!(rlo<<d);
> 	} else if (rlo) {
> 		d = a_clz_64(rlo)-1;
> 		if (d < 0)
> 			rhi = rlo>>1 | (rlo&1);
> 		else
> 			rhi = rlo<<d;
> 	} else {
> 		/* exact +-0 */
> 		return x*y + z;
> 	}
> 	e -= d;
> 
> 	/* convert to double */
> 	int64_t i = rhi; /* in [1<<62,(1<<63)-1] */
> 	if (sign)
> 		i = -i;
> 	double r = i; /* in [0x1p62,0x1p63] */
> 
> 	if (e < -1022-62) {
> 		/* result is subnormal before rounding */
> 		if (e == -1022-63) {
> 			double c = 0x1p63;
> 			if (sign)
> 				c = -c;
> 			if (r == c) {
> 				/* min normal after rounding, underflow depends
> 				   on arch behaviour which can be imitated by
> 				   a double to float conversion */
> 				float fltmin = 0x0.ffffff8p-63*FLT_MIN * r;
> 				return DBL_MIN/FLT_MIN * fltmin;
> 			}
> 			/* one bit is lost when scaled, add another top bit to
> 			   only round once at conversion if it is inexact */
> 			if (rhi << 53) {
> 				i = rhi>>1 | (rhi&1) | 1ull<<62;
> 				if (sign)
> 					i = -i;
> 				r = i;
> 				r = 2*r - c; /* remove top bit */
> 				volatile double uflow = DBL_MIN/FLT_MIN;
> 				uflow *= uflow;
> 			}
> 		} else {
> 			/* only round once when scaled */
> 			d = 10;
> 			i = ( rhi>>d | !!(rhi<<64-d) ) << d;
> 			if (sign)
> 				i = -i;
> 			r = i;
> 		}
> 	}
> 	return scalbn(r, e);
> }



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] math: rewrite fma with mostly int arithmetics
  2017-04-22 22:24 ` Rich Felker
@ 2017-04-23 11:00   ` Szabolcs Nagy
  2017-04-23 15:15     ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Szabolcs Nagy @ 2017-04-23 11:00 UTC (permalink / raw)
  To: musl

* Rich Felker <dalias@libc.org> [2017-04-22 18:24:25 -0400]:
> A few thoughts, inline below. I'm not entirely opposed to this, if it
> turns out to be better than the alternatives, but I would like to
> understand whether it really is...
> 
> On Wed, Apr 19, 2017 at 12:41:40AM +0200, Szabolcs Nagy wrote:
> > the freebsd fma code failed to raise underflow exception in some
> > cases in nearest rounding mode (affects fmal too) e.g.
> > 
> >   fma(-0x1p-1000, 0x1.000001p-74, 0x1p-1022)
> > 
> > and the inexact exception may be raised spuriously since the fenv
> > is not saved/restored around the exact multiplication algorithm
> > (affects x86 fma too).
> 
> Is it difficult to determine when the multiplication part of an fma is
> exact? If you can determine this quickly, you can just return x*y+z in
> this special case and avoid all the costly operations. For normal
> range, I think it's roughly just using ctz to count mantissa bits of x
> and y, and checking whether the sum is <= 53. Some additional handling
> for denormals is needed of course.
> 

it is a bit more difficult than that:

bits(a) + bits(b) < 54 || (bits(a) + bits(b) == 54 && a*b < 2)

this is probably possible to handle when i do the int mul.

however the rounding mode special cases don't get simpler
and inexact flag still may be raised incorrectly when tail
bits of x*y beyond 53 bits are eliminated when z is added
(the result is exact but the dekker algorithm raises inexact).

> > another issue is that the underflow behaviour when the rounded result
> > is the minimal normal number is target dependent, ieee754 allows two
> > ways to raise underflow for inexact results: raise if the result before
> > rounding is in the subnormal range (e.g. aarch64, arm, powerpc) or if
> > the result after rounding with infinite exponent range is in the
> > subnormal range (e.g. x86, mips, sh).
> > 
> > to avoid all these issues the algorithm was rewritten with mostly int
> > arithmetics and float arithmetics is only used to get correct rounding
> > and raise exceptions according to the behaviour of the target without
> > any fenv.h dependency. it also unifies x86 and non-x86 fma.
> > 
> > fmaf is not affected, fmal need to be fixed too.
> > 
> > this algorithm depends on a_clz_64 and it required a nasty volatile
> > hack: gcc seems to miscompile the FORCE_EVAL macro of libm.h on i386.
> 
> These are two particular aspects I don't like; (1) I'd rather reduce
> the number of a_* primitives we have rather than add new ones, and (2)
> I'd like to avoid volatile dances to get the compiler to do what it's
> supposed to do. I know the latter might be inevitable in some cases,
> though, because compilers are buggy... :(
> 
> > ---
> >  src/math/fma.c | 582 ++++++++++++++++-----------------------------------------
> >  1 file changed, 158 insertions(+), 424 deletions(-)
> > 
> > attaching the new fma.c instead of a diff, it's more readable.
> 
> Thanks, much preferred!
> 
> > depends on the a_clz_64 patch and previous scalbn fix.
> > 
> > fmal should be possible to do in a similar way.
> > 
> > i expect it to be faster than the previous code on most targets as
> > the rounding mode is not changed and has less multiplications
> > (it is faster on x86_64 and i386), the code size is a bit bigger
> > though.
> 
> Kinda surprising on i386 -- I'd expect the 64x64 multiplications to be
> costly compared to float ones.
> 

i implement 64x64 int mul by four 32x32->64 mul,
i386 has 32x32->64 mul op so that works out well.

the float code has to do dekker's exact multiplication
which uses six(!) fp mul (each of which is probably
slower than an int mul) and a lot more fp add.

> > #include <stdint.h>
> > #include <float.h>
> > #include <math.h>
> > #include "atomic.h"
> > 
> > static inline uint64_t asuint64(double x)
> > {
> > 	union {double f; uint64_t i;} u = {x};
> > 	return u.i;
> > }
> > 
> > static inline double asdouble(uint64_t x)
> > {
> > 	union {uint64_t i; double f;} u = {x};
> > 	return u.f;
> > }
> 
> These could just be written with compound literals, making macros an
> option, though I don't know if there's any reason one would prefer
> macros.
> 
> > struct num { uint64_t m; int e; int sign; };
> > 
> > static struct num normalize(uint64_t x)
> > {
> > 	int e = x>>52;
> > 	int sign = e & 1<<11;
> > 	e &= (1<<11)-1;
> > 	x &= (1ull<<52)-1;
> > 	if (!e) {
> > 		int k = a_clz_64(x);
> > 		x <<= k-11;
> > 		e = -k+12;
> > 	}
> > 	x |= 1ull<<52;
> > 	x <<= 1;
> > 	e -= 0x3ff + 52 + 1;
> > 	return (struct num){x,e,sign};
> > }
> > 
> > static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
> > {
> > 	uint64_t t1,t2,t3;
> > 	uint64_t xlo = (uint32_t)x, xhi = x>>32;
> > 	uint64_t ylo = (uint32_t)y, yhi = y>>32;
> > 
> > 	t1 = xlo*ylo;
> > 	t2 = xlo*yhi + xhi*ylo;
> > 	t3 = xhi*yhi;
> > 	*lo = t1 + (t2<<32);
> > 	*hi = t3 + (t2>>32) + (t1 > *lo);
> > }
> > 
> > static int zeroinfnan(uint64_t x)
> > {
> > 	return 2*x-1 >= 2*asuint64(INFINITY)-1;
> > }
> > 
> > double fma(double x, double y, double z)
> > {
> > 	#pragma STDC FENV_ACCESS ON
> > 	uint64_t ix = asuint64(x);
> > 	uint64_t iy = asuint64(y);
> > 	uint64_t iz = asuint64(z);
> > 
> > 	if (zeroinfnan(ix) || zeroinfnan(iy))
> > 		return x*y + z;
> > 	if (zeroinfnan(iz)) {
> > 		if (z == 0)
> > 			return x*y + z;
> > 		return z;
> > 	}
> > 
> > 	/* normalize so top 10bits and last bit are 0 */
> > 	struct num nx, ny, nz;
> > 	nx = normalize(ix);
> > 	ny = normalize(iy);
> > 	nz = normalize(iz);
> 
> If the only constraint here is that top 10 bits and last bit are 0, I
> don't see why clz is even needed. You can meet this constraint for
> denormals by always multiplying by 2 and using a fixed exponent value.
> 

yeah that should work, but i also use clz later

> > 	/* mul: r = x*y */
> > 	uint64_t rhi, rlo, zhi, zlo;
> > 	mul(&rhi, &rlo, nx.m, ny.m);
> > 	/* either top 20 or 21 bits of rhi and last 2 bits of rlo are 0 */
> > 
> > 	/* align exponents */
> > 	int e = nx.e + ny.e;
> > 	int d = nz.e - e;
> > 	/* shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz) */
> > 	if (d > 0) {
> > 		if (d < 64) {
> > 			zlo = nz.m<<d;
> > 			zhi = nz.m>>64-d;
> > 		} else {
> > 			zlo = 0;
> > 			zhi = nz.m;
> > 			e = nz.e - 64;
> > 			d -= 64;
> > 			if (d == 0) {
> > 			} else if (d < 64) {
> > 				rlo = rhi<<64-d | rlo>>d | !!(rlo<<64-d);
> > 				rhi = rhi>>d;
> > 			} else {
> > 				rlo = 1;
> > 				rhi = 0;
> > 			}
> > 		}
> > 	} else {
> > 		zhi = 0;
> > 		d = -d;
> > 		if (d == 0) {
> > 			zlo = nz.m;
> > 		} else if (d < 64) {
> > 			zlo = nz.m>>d | !!(nz.m<<64-d);
> > 		} else {
> > 			zlo = 1;
> > 		}
> > 	}
> > 
> > 	/* add */
> > 	int sign = nx.sign^ny.sign;
> > 	int samesign = !(sign^nz.sign);
> > 	int nonzero = 1;
> > 	if (samesign) {
> > 		/* r += z */
> > 		rlo += zlo;
> > 		rhi += zhi + (rlo < zlo);
> > 	} else {
> > 		/* r -= z */
> > 		uint64_t t = rlo;
> > 		rlo -= zlo;
> > 		rhi = rhi - zhi - (t < rlo);
> > 		if (rhi>>63) {
> > 			rlo = -rlo;
> > 			rhi = -rhi-!!rlo;
> > 			sign = !sign;
> > 		}
> > 		nonzero = !!rhi;
> > 	}
> > 
> > 	/* set rhi to top 63bit of the result (last bit is sticky) */

i need at least 55 bits of result in a 64bit int here,
ideally with known top bit position for the exponent
computation and later checks.

> > 	if (nonzero) {
> > 		e += 64;
> > 		d = a_clz_64(rhi)-1;
> > 		/* note: d > 0 */
> > 		rhi = rhi<<d | rlo>>64-d | !!(rlo<<d);
> > 	} else if (rlo) {
> > 		d = a_clz_64(rlo)-1;
> > 		if (d < 0)
> > 			rhi = rlo>>1 | (rlo&1);
> > 		else
> > 			rhi = rlo<<d;
> > 	} else {
> > 		/* exact +-0 */
> > 		return x*y + z;
> > 	}
> > 	e -= d;
> > 
> > 	/* convert to double */
> > 	int64_t i = rhi; /* in [1<<62,(1<<63)-1] */
> > 	if (sign)
> > 		i = -i;
> > 	double r = i; /* in [0x1p62,0x1p63] */
> > 
> > 	if (e < -1022-62) {
> > 		/* result is subnormal before rounding */
> > 		if (e == -1022-63) {
> > 			double c = 0x1p63;
> > 			if (sign)
> > 				c = -c;
> > 			if (r == c) {
> > 				/* min normal after rounding, underflow depends
> > 				   on arch behaviour which can be imitated by
> > 				   a double to float conversion */
> > 				float fltmin = 0x0.ffffff8p-63*FLT_MIN * r;
> > 				return DBL_MIN/FLT_MIN * fltmin;
> > 			}
> > 			/* one bit is lost when scaled, add another top bit to
> > 			   only round once at conversion if it is inexact */
> > 			if (rhi << 53) {
> > 				i = rhi>>1 | (rhi&1) | 1ull<<62;
> > 				if (sign)
> > 					i = -i;
> > 				r = i;
> > 				r = 2*r - c; /* remove top bit */
> > 				volatile double uflow = DBL_MIN/FLT_MIN;
> > 				uflow *= uflow;
> > 			}
> > 		} else {
> > 			/* only round once when scaled */
> > 			d = 10;
> > 			i = ( rhi>>d | !!(rhi<<64-d) ) << d;
> > 			if (sign)
> > 				i = -i;
> > 			r = i;
> > 		}
> > 	}
> > 	return scalbn(r, e);
> > }


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] math: rewrite fma with mostly int arithmetics
  2017-04-23 11:00   ` Szabolcs Nagy
@ 2017-04-23 15:15     ` Rich Felker
  2017-04-23 22:34       ` Szabolcs Nagy
  0 siblings, 1 reply; 7+ messages in thread
From: Rich Felker @ 2017-04-23 15:15 UTC (permalink / raw)
  To: musl

On Sun, Apr 23, 2017 at 01:00:52PM +0200, Szabolcs Nagy wrote:
> * Rich Felker <dalias@libc.org> [2017-04-22 18:24:25 -0400]:
> > A few thoughts, inline below. I'm not entirely opposed to this, if it
> > turns out to be better than the alternatives, but I would like to
> > understand whether it really is...
> > 
> > On Wed, Apr 19, 2017 at 12:41:40AM +0200, Szabolcs Nagy wrote:
> > > the freebsd fma code failed to raise underflow exception in some
> > > cases in nearest rounding mode (affects fmal too) e.g.
> > > 
> > >   fma(-0x1p-1000, 0x1.000001p-74, 0x1p-1022)
> > > 
> > > and the inexact exception may be raised spuriously since the fenv
> > > is not saved/restored around the exact multiplication algorithm
> > > (affects x86 fma too).
> > 
> > Is it difficult to determine when the multiplication part of an fma is
> > exact? If you can determine this quickly, you can just return x*y+z in
> > this special case and avoid all the costly operations. For normal
> > range, I think it's roughly just using ctz to count mantissa bits of x
> > and y, and checking whether the sum is <= 53. Some additional handling
> > for denormals is needed of course.
> 
> it is a bit more difficult than that:
> 
> bits(a) + bits(b) < 54 || (bits(a) + bits(b) == 54 && a*b < 2)
> 
> this is probably possible to handle when i do the int mul.
> 
> however the rounding mode special cases don't get simpler
> and inexact flag still may be raised incorrectly when tail
> bits of x*y beyond 53 bits are eliminated when z is added
> (the result is exact but the dekker algorithm raises inexact).

One thing to note: even if it's not a replacement for the whole
algorithm, this seems like a very useful optimization for a case
that's easy to test. "return x*y+z;" is going to be a lot faster than
anything else you can do. But maybe it's rare to hit cases where the
optimization works; it certainly "should" be rare if people are using
fma for the semantics rather than as a misguided optimization.

> > > depends on the a_clz_64 patch and previous scalbn fix.
> > > 
> > > fmal should be possible to do in a similar way.
> > > 
> > > i expect it to be faster than the previous code on most targets as
> > > the rounding mode is not changed and has less multiplications
> > > (it is faster on x86_64 and i386), the code size is a bit bigger
> > > though.
> > 
> > Kinda surprising on i386 -- I'd expect the 64x64 multiplications to be
> > costly compared to float ones.
> > 
> 
> i implement 64x64 int mul by four 32x32->64 mul,
> i386 has 32x32->64 mul op so that works out well.

Most archs should have a 32x32->64; if not this is an ISA quality
issue and not one I really want to focus on remedying.

> the float code has to do dekker's exact multiplication
> which uses six(!) fp mul (each of which is probably
> slower than an int mul) and a lot more fp add.

OK, this makes your approach make a lot more sense! Thanks for sharing
that info.

> > > 	/* normalize so top 10bits and last bit are 0 */
> > > 	struct num nx, ny, nz;
> > > 	nx = normalize(ix);
> > > 	ny = normalize(iy);
> > > 	nz = normalize(iz);
> > 
> > If the only constraint here is that top 10 bits and last bit are 0, I
> > don't see why clz is even needed. You can meet this constraint for
> > denormals by always multiplying by 2 and using a fixed exponent value.
> 
> yeah that should work, but i also use clz later

Ah, I missed that. Still it might be a worthwhile optimization here; I
think it shaves off a few ops in normalize().

Rich


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] math: rewrite fma with mostly int arithmetics
  2017-04-23 15:15     ` Rich Felker
@ 2017-04-23 22:34       ` Szabolcs Nagy
  2017-04-23 22:35         ` Szabolcs Nagy
  0 siblings, 1 reply; 7+ messages in thread
From: Szabolcs Nagy @ 2017-04-23 22:34 UTC (permalink / raw)
  To: musl

* Rich Felker <dalias@libc.org> [2017-04-23 11:15:39 -0400]:
> On Sun, Apr 23, 2017 at 01:00:52PM +0200, Szabolcs Nagy wrote:
> > * Rich Felker <dalias@libc.org> [2017-04-22 18:24:25 -0400]:
> > > Is it difficult to determine when the multiplication part of an fma is
> > > exact? If you can determine this quickly, you can just return x*y+z in
> > > this special case and avoid all the costly operations. For normal
> > > range, I think it's roughly just using ctz to count mantissa bits of x
> > > and y, and checking whether the sum is <= 53. Some additional handling
> > > for denormals is needed of course.
> > 
> > it is a bit more difficult than that:
> > 
> > bits(a) + bits(b) < 54 || (bits(a) + bits(b) == 54 && a*b < 2)
> > 
> > this is probably possible to handle when i do the int mul.
> > 
> > however the rounding mode special cases don't get simpler
> > and inexact flag still may be raised incorrectly when tail
> > bits of x*y beyond 53 bits are eliminated when z is added
> > (the result is exact but the dekker algorithm raises inexact).
> 
> One thing to note: even if it's not a replacement for the whole
> algorithm, this seems like a very useful optimization for a case
> that's easy to test. "return x*y+z;" is going to be a lot faster than
> anything else you can do. But maybe it's rare to hit cases where the
> optimization works; it certainly "should" be rare if people are using
> fma for the semantics rather than as a misguided optimization.

i didn't see a simple way to check for exact x*y result
(if it were easy then that could capture the exact 0 result
case which means one less special case later, but this is
not easy if x*y is in the subnormal range or overflows)

> > > If the only constraint here is that top 10 bits and last bit are 0, I
> > > don't see why clz is even needed. You can meet this constraint for
> > > denormals by always multiplying by 2 and using a fixed exponent value.
> > 
> > yeah that should work, but i also use clz later
> 
> Ah, I missed that. Still it might be a worthwhile optimization here; I
> think it shaves off a few ops in normalize().

attached a new version with updated normalize.

on my laptop latency and code size:

old x86_64: 67 ns/call  893 bytes
new x86_64: 20 ns/call  960 bytes
old i386:   80 ns/call  942 bytes
new i386:   75 ns/call 1871 bytes
old  arm:   -           960 bytes
new  arm:   -          1200 bytes


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] math: rewrite fma with mostly int arithmetics
  2017-04-23 22:34       ` Szabolcs Nagy
@ 2017-04-23 22:35         ` Szabolcs Nagy
  2017-08-30  2:04           ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Szabolcs Nagy @ 2017-04-23 22:35 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 2582 bytes --]

* Szabolcs Nagy <nsz@port70.net> [2017-04-24 00:34:48 +0200]:
> * Rich Felker <dalias@libc.org> [2017-04-23 11:15:39 -0400]:
> > On Sun, Apr 23, 2017 at 01:00:52PM +0200, Szabolcs Nagy wrote:
> > > * Rich Felker <dalias@libc.org> [2017-04-22 18:24:25 -0400]:
> > > > Is it difficult to determine when the multiplication part of an fma is
> > > > exact? If you can determine this quickly, you can just return x*y+z in
> > > > this special case and avoid all the costly operations. For normal
> > > > range, I think it's roughly just using ctz to count mantissa bits of x
> > > > and y, and checking whether the sum is <= 53. Some additional handling
> > > > for denormals is needed of course.
> > > 
> > > it is a bit more difficult than that:
> > > 
> > > bits(a) + bits(b) < 54 || (bits(a) + bits(b) == 54 && a*b < 2)
> > > 
> > > this is probably possible to handle when i do the int mul.
> > > 
> > > however the rounding mode special cases don't get simpler
> > > and inexact flag still may be raised incorrectly when tail
> > > bits of x*y beyond 53 bits are eliminated when z is added
> > > (the result is exact but the dekker algorithm raises inexact).
> > 
> > One thing to note: even if it's not a replacement for the whole
> > algorithm, this seems like a very useful optimization for a case
> > that's easy to test. "return x*y+z;" is going to be a lot faster than
> > anything else you can do. But maybe it's rare to hit cases where the
> > optimization works; it certainly "should" be rare if people are using
> > fma for the semantics rather than as a misguided optimization.
> 
> i didn't see a simple way to check for exact x*y result
> (if it were easy then that could capture the exact 0 result
> case which means one less special case later, but this is
> not easy if x*y is in the subnormal range or overflows)
> 
> > > > If the only constraint here is that top 10 bits and last bit are 0, I
> > > > don't see why clz is even needed. You can meet this constraint for
> > > > denormals by always multiplying by 2 and using a fixed exponent value.
> > > 
> > > yeah that should work, but i also use clz later
> > 
> > Ah, I missed that. Still it might be a worthwhile optimization here; I
> > think it shaves off a few ops in normalize().
> 
> attached a new version with updated normalize.
> 

now really

> on my laptop latency and code size:
> 
> old x86_64: 67 ns/call  893 bytes
> new x86_64: 20 ns/call  960 bytes
> old i386:   80 ns/call  942 bytes
> new i386:   75 ns/call 1871 bytes
> old  arm:   -           960 bytes
> new  arm:   -          1200 bytes

[-- Attachment #2: fma.c --]
[-- Type: text/x-csrc, Size: 3624 bytes --]

#include <stdint.h>
#include <float.h>
#include <math.h>
#include "atomic.h"

#define ASUINT64(x) ((union {double f; uint64_t i;}){x}).i
#define ZEROINFNAN (0x7ff-0x3ff-52-1)

struct num { uint64_t m; int e; int sign; };

static struct num normalize(double x)
{
	uint64_t ix = ASUINT64(x);
	int e = ix>>52;
	int sign = e & 0x800;
	e &= 0x7ff;
	if (!e) {
		ix = ASUINT64(x*0x1p63);
		e = ix>>52 & 0x7ff;
		e = e ? e-63 : 0x800;
	}
	ix &= (1ull<<52)-1;
	ix |= 1ull<<52;
	ix <<= 1;
	e -= 0x3ff + 52 + 1;
	return (struct num){ix,e,sign};
}

static void mul(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
	uint64_t t1,t2,t3;
	uint64_t xlo = (uint32_t)x, xhi = x>>32;
	uint64_t ylo = (uint32_t)y, yhi = y>>32;

	t1 = xlo*ylo;
	t2 = xlo*yhi + xhi*ylo;
	t3 = xhi*yhi;
	*lo = t1 + (t2<<32);
	*hi = t3 + (t2>>32) + (t1 > *lo);
}

double fma(double x, double y, double z)
{
	#pragma STDC FENV_ACCESS ON

	/* normalize so top 10bits and last bit are 0 */
	struct num nx, ny, nz;
	nx = normalize(x);
	ny = normalize(y);
	nz = normalize(z);

	if (nx.e >= ZEROINFNAN || ny.e >= ZEROINFNAN)
		return x*y + z;
	if (nz.e >= ZEROINFNAN) {
		if (nz.e > ZEROINFNAN) /* z==0 */
			return x*y + z;
		return z;
	}

	/* mul: r = x*y */
	uint64_t rhi, rlo, zhi, zlo;
	mul(&rhi, &rlo, nx.m, ny.m);
	/* either top 20 or 21 bits of rhi and last 2 bits of rlo are 0 */

	/* align exponents */
	int e = nx.e + ny.e;
	int d = nz.e - e;
	/* shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz) */
	if (d > 0) {
		if (d < 64) {
			zlo = nz.m<<d;
			zhi = nz.m>>64-d;
		} else {
			zlo = 0;
			zhi = nz.m;
			e = nz.e - 64;
			d -= 64;
			if (d == 0) {
			} else if (d < 64) {
				rlo = rhi<<64-d | rlo>>d | !!(rlo<<64-d);
				rhi = rhi>>d;
			} else {
				rlo = 1;
				rhi = 0;
			}
		}
	} else {
		zhi = 0;
		d = -d;
		if (d == 0) {
			zlo = nz.m;
		} else if (d < 64) {
			zlo = nz.m>>d | !!(nz.m<<64-d);
		} else {
			zlo = 1;
		}
	}

	/* add */
	int sign = nx.sign^ny.sign;
	int samesign = !(sign^nz.sign);
	int nonzero = 1;
	if (samesign) {
		/* r += z */
		rlo += zlo;
		rhi += zhi + (rlo < zlo);
	} else {
		/* r -= z */
		uint64_t t = rlo;
		rlo -= zlo;
		rhi = rhi - zhi - (t < rlo);
		if (rhi>>63) {
			rlo = -rlo;
			rhi = -rhi-!!rlo;
			sign = !sign;
		}
		nonzero = !!rhi;
	}

	/* set rhi to top 63bit of the result (last bit is sticky) */
	if (nonzero) {
		e += 64;
		d = a_clz_64(rhi)-1;
		/* note: d > 0 */
		rhi = rhi<<d | rlo>>64-d | !!(rlo<<d);
	} else if (rlo) {
		d = a_clz_64(rlo)-1;
		if (d < 0)
			rhi = rlo>>1 | (rlo&1);
		else
			rhi = rlo<<d;
	} else {
		/* exact +-0 */
		return x*y + z;
	}
	e -= d;

	/* convert to double */
	int64_t i = rhi; /* i is in [1<<62,(1<<63)-1] */
	if (sign)
		i = -i;
	double r = i; /* |r| is in [0x1p62,0x1p63] */

	if (e < -1022-62) {
		/* result is subnormal before rounding */
		if (e == -1022-63) {
			double c = 0x1p63;
			if (sign)
				c = -c;
			if (r == c) {
				/* min normal after rounding, underflow depends
				   on arch behaviour which can be imitated by
				   a double to float conversion */
				float fltmin = 0x0.ffffff8p-63*FLT_MIN * r;
				return DBL_MIN/FLT_MIN * fltmin;
			}
			/* one bit is lost when scaled, add another top bit to
			   only round once at conversion if it is inexact */
			if (rhi << 53) {
				i = rhi>>1 | (rhi&1) | 1ull<<62;
				if (sign)
					i = -i;
				r = i;
				r = 2*r - c; /* remove top bit */
				volatile double uflow = DBL_MIN/FLT_MIN;
				uflow *= uflow;
			}
		} else {
			/* only round once when scaled */
			d = 10;
			i = ( rhi>>d | !!(rhi<<64-d) ) << d;
			if (sign)
				i = -i;
			r = i;
		}
	}
	return scalbn(r, e);
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] math: rewrite fma with mostly int arithmetics
  2017-04-23 22:35         ` Szabolcs Nagy
@ 2017-08-30  2:04           ` Rich Felker
  0 siblings, 0 replies; 7+ messages in thread
From: Rich Felker @ 2017-08-30  2:04 UTC (permalink / raw)
  To: musl

On Mon, Apr 24, 2017 at 12:35:33AM +0200, Szabolcs Nagy wrote:
> 			/* one bit is lost when scaled, add another top bit to
> 			   only round once at conversion if it is inexact */
> 			if (rhi << 53) {
> 				i = rhi>>1 | (rhi&1) | 1ull<<62;
> 				if (sign)
> 					i = -i;
> 				r = i;
> 				r = 2*r - c; /* remove top bit */
> 				volatile double uflow = DBL_MIN/FLT_MIN;
> 				uflow *= uflow;
> 			}

Is there any way to get rid of the volatile hack here?

Rich


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-08-30  2:04 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-18 22:41 [PATCH] math: rewrite fma with mostly int arithmetics Szabolcs Nagy
2017-04-22 22:24 ` Rich Felker
2017-04-23 11:00   ` Szabolcs Nagy
2017-04-23 15:15     ` Rich Felker
2017-04-23 22:34       ` Szabolcs Nagy
2017-04-23 22:35         ` Szabolcs Nagy
2017-08-30  2:04           ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).