mailing list of musl libc
 help / color / mirror / code / Atom feed
* correctly rounded sqrt
@ 2012-03-14 16:56 Szabolcs Nagy
  2012-03-14 16:57 ` Szabolcs Nagy
  2012-03-14 19:01 ` Szabolcs Nagy
  0 siblings, 2 replies; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-14 16:56 UTC (permalink / raw)
  To: musl

i tried to solve the sqrt issue with a wrapper
as discussed on irc, but i couldn't find a nice
solution

if we check |x - y*y| and try to figure out
the rounding from that, then there are two
approaches

1) use the veltkamp-dekker exact multiplication formulas
(sq1 algorithm in my code)

or

2) use simpler method for y*y and use argumentum reduction
to handle large/small numbers (x*=2^2n, y*=2^-n)


i tried both: 1) only works correctly for subnormals
with double precision evaluation so it needs arg reduction
2) does not work in all cases (|x-y*y| and |x-y1*y1|
might be the same and resolving the tie is nontrivial)

so 1) with arg reduction is my current solution
and i don't know if it works in all cases
..and the code might be slow



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-14 16:56 correctly rounded sqrt Szabolcs Nagy
@ 2012-03-14 16:57 ` Szabolcs Nagy
  2012-03-14 19:01 ` Szabolcs Nagy
  1 sibling, 0 replies; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-14 16:57 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 111 bytes --]

* Szabolcs Nagy <nsz@port70.net> [2012-03-14 17:56:05 +0100]:
> ..and the code might be slow

forgot to attach

[-- Attachment #2: sqrt.c --]
[-- Type: text/x-csrc, Size: 5142 bytes --]

#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <float.h>

#include <mpfr.h>
#include <gmp.h>

/*
int read(double *x, double *y) {
	char buf[256];

	if (!fgets(buf, sizeof buf, stdin))
		return 0;
	if (sscanf(buf, "%la %la", x, y) != 2)
		return 0;
	return 1;
}
*/

struct {
	double x,y;
} testdata[] = {
//	#include "sqrt.data"
0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+511,
0x1.ffffffffffffbp+1023, 0x1.ffffffffffffdp+511,
0x1.ffffffffffff7p+1023, 0x1.ffffffffffffbp+511,
0x1.ffffffffffff3p+1023, 0x1.ffffffffffff9p+511,
0x1.fffffffffffefp+1023, 0x1.ffffffffffff7p+511,
0x1.fffffffffffebp+1023, 0x1.ffffffffffff5p+511,
0x1.fffffffffffe7p+1023, 0x1.ffffffffffff3p+511,
0x1.fffffffffffe3p+1023, 0x1.ffffffffffff1p+511,
0x1.fffffffffffdfp+1023, 0x1.fffffffffffefp+511,
0x1.fffffffffffdbp+1023, 0x1.fffffffffffedp+511,
0x1.fffffffffffd7p+1023, 0x1.fffffffffffebp+511,
0x1.0000000000003p-1022, 0x1.0000000000001p-511,
0x1.0000000000007p-1022, 0x1.0000000000003p-511,
0x1.000000000000bp-1022, 0x1.0000000000005p-511,
0x1.000000000000fp-1022, 0x1.0000000000007p-511,
0x1.0000000000013p-1022, 0x1.0000000000009p-511,
0x1.0000000000017p-1022, 0x1.000000000000bp-511,
0x1.000000000001bp-1022, 0x1.000000000000dp-511,
0x1.000000000001fp-1022, 0x1.000000000000fp-511,
0x1.0000000000023p-1022, 0x1.0000000000011p-511,
0x1.0000000000027p-1022, 0x1.0000000000013p-511,
0x1.000000000002bp-1022, 0x1.0000000000015p-511,
0x1.000000000002fp-1022, 0x1.0000000000017p-511,
0x1.0000000000033p-1022, 0x1.0000000000019p-511,
0x1.0000000000037p-1022, 0x1.000000000001bp-511,
0x1.7167bc36eaa3bp+6, 0x1.3384c7db650cdp+3,
0x1.7570994273ad7p+6, 0x1.353186e89b8ffp+3,
0x1.7dae969442fe6p+6, 0x1.389640fb18b75p+3,
0x1.7f8444fcf67e5p+6, 0x1.395659e94669fp+3,
0x1.8364650e63a54p+6, 0x1.3aea9efe1a3d7p+3,
0x1.85bedd274edd8p+6, 0x1.3bdf20c867057p+3,
0x1.8609cf496ab77p+6, 0x1.3bfd7e14b5eabp+3,
0x1.873849c70a375p+6, 0x1.3c77ed341d27fp+3,
0x1.8919c962cbaaep+6, 0x1.3d3a7113ee82fp+3,
0x1.8de4493e22dc6p+6, 0x1.3f27d448220c3p+3,
0x1.924829a17a288p+6, 0x1.40e9552eec28fp+3,
0x1.92702cd992f12p+6, 0x1.40f94a6fdfddfp+3,
0x1.92b763a8311fdp+6, 0x1.4115af614695fp+3,
0x1.947da013c7293p+6, 0x1.41ca91102940fp+3,
0x1.9536091c494d2p+6, 0x1.4213e334c77adp+3,
0x1.61b04c6p-1019, 0x1.a98b88f18b46dp-510,
0x1.93789f1p-1018, 0x1.4162ae43d5821p-509,
0x1.a1989b4p-1018, 0x1.46f6736eb44bbp-509,
0x1.f93bc9p-1018, 0x1.67a36ec403bafp-509,
0x1.2f675e3p-1017, 0x1.8a22ab6dcfee1p-509,
0x1.a158508p-1017, 0x1.ce418a96cf589p-509,
0x1.cd31f078p-1017, 0x1.e5ef1c65dccebp-509,
0x1.33b43b08p-1016, 0x1.18a9f607e1701p-508,
0x1.6e66a858p-1016, 0x1.324402a00b45fp-508,
0x1.8661cbf8p-1016, 0x1.3c212046bfdffp-508,
0x1.bbb221b4p-1016, 0x1.510681b939931p-508,
0x1.c4942f3cp-1016, 0x1.5461e59227ab5p-508,
0x1.dbb258c8p-1016, 0x1.5cf7b0f78d3afp-508,
0x1.57103ea4p-1015, 0x1.a31ab946d340bp-508,
0x1.9b294f88p-1015, 0x1.cad197e28e85bp-508,
0x1.0000000000001p+0, 0x1p+0,
0x1.fffffffffffffp-1, 0x1.fffffffffffffp-1,
};

void sq1(double *hi, double *lo, double u)
{
	static const double c = 1.0 + 0x1p27;
	double cu, u1, u2, r;

	cu = c*u;
	u1 = u - cu;
	u1 += cu;
	u2 = u - u1;
	*hi = u*u;
	r = u1*u1;
	r -= *hi;
	r += u1*u2;
	r += u2*u1;
	r += u2*u2;
	*lo = r;
}

void sq2(double *hi, double *lo, double u)
{
	double u1;
	double u2;
	double c;

	c = scalbn(1, ilogb(u)+27);
	u1 = u + c;
	u1 -= c;
	u2 = u - u1;
	*hi = u1*u1;
	*lo = u1*u2 + u1*u2 + u2*u2;
}

void sq3(double *hi, double *lo, double u)
{
	float u1;
	double u2;

	u1 = u;
	u2 = u - u1;
	*hi = (double)u1*u1;
	*lo = u1*u2 + u1*u2 + u2*u2;
}

double d_sqrt(double x)
{
	double y;
	double hi, lo, r, y1, r1;
	union {double x; uint64_t u;} u = {x};
	int e = u.u >> 52;
	int k;

	/* +-inf or nan */
	if ((e&0x7ff) == 0x7ff)
		return x + INFINITY;
	/* +-0 */
	if ((u.u & (uint64_t)-1>>1) == 0)
		return 0;
	/* x < 0 */
	if (e&0x800)
		return (x-x)/(x-x);
	if (!e) {
		/* subnormal */
		x *= 0x1p1022;
		k = -511;
	} else {
		/* normal */
		k = (e - 0x3ff)/2;
		e -= 2*k;
		u.u = (uint64_t)e<<52 | (u.u & (uint64_t)-1>>12);
		x = u.x;
	}
	y = sqrt(x);
	sq1(&hi, &lo, y);
	r = x - hi;
	r -= lo;
	if (r == 0)
		return scalbn(y, k);
	if (r < 0) {
		y1 = nextafter(y, 0);
		r = -r;
	} else
		y1 = nextafter(y, DBL_MAX);
	sq1(&hi, &lo, y1);
	r1 = x - hi;
	r1 -= lo;
	if (r1 < 0)
		r1 = -r1;
	y1 = scalbn(y1, k);
	y = scalbn(y, k);
	if (r1 < r)
		return y1;
	if (r1 > r)
		return y;
	printf("tie x %a y %a %a\n", scalbn(x, 2*k), y, y1);
	return y1;
}

static mpfr_t mx, my;

void mp_init()
{
	mpfr_init2(mx, 256);
	mpfr_init2(my, 256);
}

double mp_sqrt(double x) {
	mpfr_set_d(mx, x, GMP_RNDN);
	mpfr_sqrt(my, mx, GMP_RNDN);
	return mpfr_get_d(my, GMP_RNDN);
}

int main()
{
	int i;
	int c;
	double x, my, dy;

	c = 0;
/*
	mp_init();
	x = 1.0;
	for (i = 0; i < 1000000; i++) {
		my = mp_sqrt(x);
		dy = d_sqrt(x);

		if (my != dy) {
			printf("fail %a %a %a\n", x, my, dy);
			c++;
		}

		x = nextafter(x, 0);
	}
*/

	for (i = 0; i < sizeof testdata / sizeof *testdata; i++) {
		x = testdata[i].x;
		my = testdata[i].y;
		dy = d_sqrt(x);
		if (my != dy) {
			printf("fail %a %a %a\n", x, my, dy);
			c++;
		}
	}

	printf("%d\n", c);
	return 0;
}

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-14 16:56 correctly rounded sqrt Szabolcs Nagy
  2012-03-14 16:57 ` Szabolcs Nagy
@ 2012-03-14 19:01 ` Szabolcs Nagy
  2012-03-15  1:46   ` Szabolcs Nagy
  1 sibling, 1 reply; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-14 19:01 UTC (permalink / raw)
  To: musl

* Szabolcs Nagy <nsz@port70.net> [2012-03-14 17:56:05 +0100]:
> i tried to solve the sqrt issue with a wrapper
> as discussed on irc, but i couldn't find a nice
> solution
> 

i did some benchmarks:
(1x == single sqrt instruction)

wrapped sqrt is about 3.5x
the fdlibm software sqrt is about 10x
fp precision setting is about 1.2x

it might be possible to do a bit better wrapping
but not much better, so fp prec setting wins if
that's feasible..


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-14 19:01 ` Szabolcs Nagy
@ 2012-03-15  1:46   ` Szabolcs Nagy
  2012-03-15  3:23     ` Szabolcs Nagy
  0 siblings, 1 reply; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-15  1:46 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 259 bytes --]

* Szabolcs Nagy <nsz@port70.net> [2012-03-14 20:01:23 +0100]:
> * Szabolcs Nagy <nsz@port70.net> [2012-03-14 17:56:05 +0100]:
> > i tried to solve the sqrt issue with a wrapper

ok as discussed a simpler and better solution
without precision setting etc in c

[-- Attachment #2: x_sqrt.c --]
[-- Type: text/x-csrc, Size: 766 bytes --]

#include <math.h>
#include <stdint.h>

/* dekker exact mult u*u == hi + lo */
static void sq(long double *hi, long double *lo, long double u)
{
	static const long double c = 1.0 + 0x1p33;
	long double cu, u1, u2;

	cu = c*u;
	u1 = (u - cu) + cu;
	u2 = u - u1;
	*hi = u*u;
	*lo = (u1*u1 - *hi) + u1*u2*2.0 + u2*u2;
}

/* correctly rounded double sqrt using long double arithmetics for x87 */
double x_sqrt(double x)
{
	union {
		long double x;
		struct{
			uint64_t m;
			uint16_t es;
			uint16_t pad;
		} bits;
	} u;
	long double r, hi, lo;

	u.x = sqrtl(x);
	if ((u.bits.m&0x7ff) == 0x400) {
		if ((u.bits.es&0x7fff) == 0x7fff)
			return u.x;
		sq(&hi, &lo, u.x);
		r = x - hi - lo;
		if (r > 0)
			u.bits.m++;
		else
			u.bits.m--;
		return u.x;
	}
	return u.x;
}

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15  1:46   ` Szabolcs Nagy
@ 2012-03-15  3:23     ` Szabolcs Nagy
  2012-03-15  5:07       ` Rich Felker
  0 siblings, 1 reply; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-15  3:23 UTC (permalink / raw)
  To: musl

* Szabolcs Nagy <nsz@port70.net> [2012-03-15 02:46:48 +0100]:
> /* dekker exact mult u*u == hi + lo */
> static void sq(long double *hi, long double *lo, long double u)
> {
> 	static const long double c = 1.0 + 0x1p33;
this was wrong, c = 1.0 + 0x1p32; the correct value

and probably
*hi = u1*u1;
*lo = u1*u2*2.0 + u2*u2;
is enough instead of
> 	*hi = u*u;
> 	*lo = (u1*u1 - *hi) + u1*u2*2.0 + u2*u2;

but now that we figured out the fpu status register based
solution this does not matter..


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15  3:23     ` Szabolcs Nagy
@ 2012-03-15  5:07       ` Rich Felker
  2012-03-15  5:30         ` Szabolcs Nagy
  0 siblings, 1 reply; 11+ messages in thread
From: Rich Felker @ 2012-03-15  5:07 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 307 bytes --]

On Thu, Mar 15, 2012 at 04:23:04AM +0100, Szabolcs Nagy wrote:
> but now that we figured out the fpu status register based
> solution this does not matter..

Here's the asm version based on fpu status word. I'm not sure if it's
safe for non-default rounding modes but otherwise it should be
correct.

Rich


[-- Attachment #2: sqrt.s --]
[-- Type: text/plain, Size: 297 bytes --]

.global sqrt
.type sqrt,@function
sqrt:	fldl 4(%esp)
	fsqrt
	fstsw %ax
	sub $12,%esp
	fld %st(0)
	fstpt (%esp)
	mov (%esp),%ecx
	and $0x7ff,%ecx
	cmp $0x400,%ecx
	jnz 1f
	and $0x200,%eax
	sub $0x100,%eax
	sub %eax,(%esp)
	fstp %st(0)
	fldt (%esp)
1:	add $12,%esp
	fstpl 4(%esp)
	fldl 4(%esp)
	ret

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15  5:07       ` Rich Felker
@ 2012-03-15  5:30         ` Szabolcs Nagy
  2012-03-15  8:02           ` Szabolcs Nagy
  0 siblings, 1 reply; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-15  5:30 UTC (permalink / raw)
  To: musl

* Rich Felker <dalias@aerifal.cx> [2012-03-15 01:07:48 -0400]:
> Here's the asm version based on fpu status word. I'm not sure if it's
> safe for non-default rounding modes but otherwise it should be
> correct.

up,down,zero rounding should be ok
we change
..xxx|10000..
into one of
..xxx|10100..
..xxx|01100..
and both of them round the same way with ru,rd,rz

> .global sqrt
> .type sqrt,@function
> sqrt:	fldl 4(%esp)
> 	fsqrt
> 	fstsw %ax
> 	sub $12,%esp
> 	fld %st(0)
> 	fstpt (%esp)
> 	mov (%esp),%ecx
> 	and $0x7ff,%ecx
> 	cmp $0x400,%ecx
> 	jnz 1f
> 	and $0x200,%eax
> 	sub $0x100,%eax
> 	sub %eax,(%esp)
here you modify the return value even if it was nan
eg c99 F.9 recommends no nan modifications
if ((y & 0x7fff<<64) == 0x7fff<<64) ...

> 	fstp %st(0)
> 	fldt (%esp)
> 1:	add $12,%esp
> 	fstpl 4(%esp)
> 	fldl 4(%esp)
> 	ret



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15  5:30         ` Szabolcs Nagy
@ 2012-03-15  8:02           ` Szabolcs Nagy
  2012-03-15 16:12             ` Rich Felker
  0 siblings, 1 reply; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-15  8:02 UTC (permalink / raw)
  To: musl

* Szabolcs Nagy <nsz@port70.net> [2012-03-15 06:30:11 +0100]:
> * Rich Felker <dalias@aerifal.cx> [2012-03-15 01:07:48 -0400]:
> > .global sqrt
> > .type sqrt,@function
> > sqrt:	fldl 4(%esp)
> > 	fsqrt
> > 	fstsw %ax
> > 	sub $12,%esp
> > 	fld %st(0)
> > 	fstpt (%esp)
> > 	mov (%esp),%ecx
> > 	and $0x7ff,%ecx
> > 	cmp $0x400,%ecx
> > 	jnz 1f
> > 	and $0x200,%eax
> > 	sub $0x100,%eax
> > 	sub %eax,(%esp)
> here you modify the return value even if it was nan
> eg c99 F.9 recommends no nan modifications
> if ((y & 0x7fff<<64) == 0x7fff<<64) ...
> 
posix requires this as well
http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_20

if (signexp >= 0x7fff) is probably a better check


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15  8:02           ` Szabolcs Nagy
@ 2012-03-15 16:12             ` Rich Felker
  2012-03-15 18:14               ` Pascal Cuoq
  0 siblings, 1 reply; 11+ messages in thread
From: Rich Felker @ 2012-03-15 16:12 UTC (permalink / raw)
  To: musl

On Thu, Mar 15, 2012 at 09:02:44AM +0100, Szabolcs Nagy wrote:
> * Szabolcs Nagy <nsz@port70.net> [2012-03-15 06:30:11 +0100]:
> > * Rich Felker <dalias@aerifal.cx> [2012-03-15 01:07:48 -0400]:
> > > .global sqrt
> > > .type sqrt,@function
> > > sqrt:	fldl 4(%esp)
> > > 	fsqrt
> > > 	fstsw %ax
> > > 	sub $12,%esp
> > > 	fld %st(0)
> > > 	fstpt (%esp)
> > > 	mov (%esp),%ecx
> > > 	and $0x7ff,%ecx
> > > 	cmp $0x400,%ecx
> > > 	jnz 1f
> > > 	and $0x200,%eax
> > > 	sub $0x100,%eax
> > > 	sub %eax,(%esp)
> > here you modify the return value even if it was nan
> > eg c99 F.9 recommends no nan modifications
> > if ((y & 0x7fff<<64) == 0x7fff<<64) ...
> > 
> posix requires this as well
> http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_20
> 
> if (signexp >= 0x7fff) is probably a better check

Haha, this is the second time I stupidly forget sqrt(x) is
nonnegative... :-) Fixing it..

Rich


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15 16:12             ` Rich Felker
@ 2012-03-15 18:14               ` Pascal Cuoq
  2012-03-15 18:22                 ` Szabolcs Nagy
  0 siblings, 1 reply; 11+ messages in thread
From: Pascal Cuoq @ 2012-03-15 18:14 UTC (permalink / raw)
  To: musl; +Cc: musl


>> 
>> if (signexp >= 0x7fff) is probably a better check
> 
> Haha, this is the second time I stupidly forget sqrt(x) is
> nonnegative... :-) Fixing it..

I apologize if this is not relevant —I admit I followed the discussion with only half my attention— but note that sqrt(-0.) is -0., so the sign bit may be set in both argument and result of sqrt().
> 

Pascal

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: correctly rounded sqrt
  2012-03-15 18:14               ` Pascal Cuoq
@ 2012-03-15 18:22                 ` Szabolcs Nagy
  0 siblings, 0 replies; 11+ messages in thread
From: Szabolcs Nagy @ 2012-03-15 18:22 UTC (permalink / raw)
  To: musl

* Pascal Cuoq <pascal.cuoq@gmail.com> [2012-03-15 19:14:09 +0100]:
> >> if (signexp >= 0x7fff) is probably a better check
> > 
> > Haha, this is the second time I stupidly forget sqrt(x) is
> > nonnegative... :-) Fixing it..
> 
> I apologize if this is not relevant ???I admit I followed the discussion with only half my attention??? but note that sqrt(-0.) is -0., so the sign bit may be set in both argument and result of sqrt().

yes, that's handled correctly
we were talking only about the 'hard to round case'
(long double square root significand ends in ..xxx10000000000)

dalias:
meanwhile i realized that a double nan cannot end in 0x400
when converted to long double
so i was wrong about the nan check, it's not needed sorry

(maybe it makes sense to add a small comment to that sqrt.s
because it's not trivial for newcommers)



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2012-03-15 18:22 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-14 16:56 correctly rounded sqrt Szabolcs Nagy
2012-03-14 16:57 ` Szabolcs Nagy
2012-03-14 19:01 ` Szabolcs Nagy
2012-03-15  1:46   ` Szabolcs Nagy
2012-03-15  3:23     ` Szabolcs Nagy
2012-03-15  5:07       ` Rich Felker
2012-03-15  5:30         ` Szabolcs Nagy
2012-03-15  8:02           ` Szabolcs Nagy
2012-03-15 16:12             ` Rich Felker
2012-03-15 18:14               ` Pascal Cuoq
2012-03-15 18:22                 ` Szabolcs Nagy

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).