From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <musl-return-15243-ml=inbox.vuxu.org@lists.openwall.com>
X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on inbox.vuxu.org
X-Spam-Level: 
X-Spam-Status: No, score=-3.0 required=5.0 tests=HEADER_FROM_DIFFERENT_DOMAINS,
	MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,RCVD_IN_MSPIKE_H3,
	RCVD_IN_MSPIKE_WL autolearn=ham autolearn_force=no version=3.4.2
Received: from mother.openwall.net (mother.openwall.net [195.42.179.200])
	by inbox.vuxu.org (OpenSMTPD) with SMTP id 52dc16cf
	for <ml@inbox.vuxu.org>;
	Mon, 20 Jan 2020 22:37:40 +0000 (UTC)
Received: (qmail 26013 invoked by uid 550); 20 Jan 2020 22:37:38 -0000
Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm
Precedence: bulk
List-Post: <mailto:musl@lists.openwall.com>
List-Help: <mailto:musl-help@lists.openwall.com>
List-Unsubscribe: <mailto:musl-unsubscribe@lists.openwall.com>
List-Subscribe: <mailto:musl-subscribe@lists.openwall.com>
List-ID: <musl.lists.openwall.com>
Reply-To: musl@lists.openwall.com
Received: (qmail 25979 invoked from network); 20 Jan 2020 22:37:38 -0000
Date: Mon, 20 Jan 2020 23:37:26 +0100
From: Szabolcs Nagy <nsz@port70.net>
To: musl@lists.openwall.com
Message-ID: <20200120223726.GQ23985@port70.net>
Mail-Followup-To: musl@lists.openwall.com
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="b5gNqxB1S1yM7hjW"
Content-Disposition: inline
User-Agent: Mutt/1.10.1 (2018-07-13)
Subject: [musl] [PATCH 0/2] math: fix known directed rounding problems


--b5gNqxB1S1yM7hjW
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

fix the two known directed rounding bugs in current math code.
(there are still large ulp errors in j0, y0, jn, yn functions,
but all other functions should have small worst case ulp error now)

Szabolcs Nagy (2):
  math: fix __rem_pio2 in non-nearest rounding modes
  math: fix sinh overflows in non-nearest rounding

 src/internal/libm.h    |  4 ++--
 src/math/__expo2.c     |  5 +++--
 src/math/__expo2f.c    |  5 +++--
 src/math/__rem_pio2.c  | 15 ++++++++++++++-
 src/math/__rem_pio2f.c | 13 ++++++++++++-
 src/math/__rem_pio2l.c | 16 +++++++++++++++-
 src/math/cosh.c        |  2 +-
 src/math/coshf.c       |  2 +-
 src/math/sinh.c        |  2 +-
 src/math/sinhf.c       |  2 +-
 10 files changed, 53 insertions(+), 13 deletions(-)

-- 
2.24.1

--b5gNqxB1S1yM7hjW
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="0001-math-fix-__rem_pio2-in-non-nearest-rounding-modes.patch"

>From 7d2aed5f8f699b7e644be030cc826d0152009ef8 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sat, 18 Jan 2020 17:55:25 +0000
Subject: [PATCH 1/2] math: fix __rem_pio2 in non-nearest rounding modes

Handle when after reduction |y| > pi/4+tiny. This happens in directed
rounding modes because the fast round to int code does not give the
nearest integer. In such cases the reduction may not be symmetric
between x and -x so e.g. cos(x)==cos(-x) may not hold (but polynomial
evaluation is not symmetric either with directed rounding so fixing
that would require more changes with bigger performance impact).

The fix only adds two predictable branches in nearest rounding mode,
simple ubenchmark does not show relevant performance regression in
nearest rounding mode.

The code could be improved: e.g reducing the medium size threshold
such that two step reduction is enough instead of three, and the
single precision case can avoid the issue by doing the round to int
differently, but this fix was kept minimal.
---
 src/math/__rem_pio2.c  | 15 ++++++++++++++-
 src/math/__rem_pio2f.c | 13 ++++++++++++-
 src/math/__rem_pio2l.c | 16 +++++++++++++++-
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/math/__rem_pio2.c b/src/math/__rem_pio2.c
index d403f81c..dcf672fb 100644
--- a/src/math/__rem_pio2.c
+++ b/src/math/__rem_pio2.c
@@ -36,6 +36,7 @@
  */
 static const double
 toint   = 1.5/EPS,
+pio4    = 0x1.921fb54442d18p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
@@ -117,11 +118,23 @@ int __rem_pio2(double x, double *y)
 	}
 	if (ix < 0x413921fb) {  /* |x| ~< 2^20*(pi/2), medium size */
 medium:
-		/* rint(x/(pi/2)), Assume round-to-nearest. */
+		/* rint(x/(pi/2)) */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n = (int32_t)fn;
 		r = x - fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round, good to 85 bits */
+		/* Matters with directed rounding. */
+		if (predict_false(r - w < -pio4)) {
+			n--;
+			fn--;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		} else if (predict_false(r - w > pio4)) {
+			n++;
+			fn++;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		}
 		y[0] = r - w;
 		u.f = y[0];
 		ey = u.i>>52 & 0x7ff;
diff --git a/src/math/__rem_pio2f.c b/src/math/__rem_pio2f.c
index 4473c1c4..e6765643 100644
--- a/src/math/__rem_pio2f.c
+++ b/src/math/__rem_pio2f.c
@@ -35,6 +35,7 @@
  */
 static const double
 toint   = 1.5/EPS,
+pio4    = 0x1.921fb6p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079631090164184570e+00, /* 0x3FF921FB, 0x50000000 */
 pio2_1t = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */
@@ -50,10 +51,20 @@ int __rem_pio2f(float x, double *y)
 	ix = u.i & 0x7fffffff;
 	/* 25+53 bit pi is good enough for medium size */
 	if (ix < 0x4dc90fdb) {  /* |x| ~< 2^28*(pi/2), medium size */
-		/* Use a specialized rint() to get fn.  Assume round-to-nearest. */
+		/* Use a specialized rint() to get fn. */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n  = (int32_t)fn;
 		*y = x - fn*pio2_1 - fn*pio2_1t;
+		/* Matters with directed rounding. */
+		if (predict_false(*y < -pio4)) {
+			n--;
+			fn--;
+			*y = x - fn*pio2_1 - fn*pio2_1t;
+		} else if (predict_false(*y > pio4)) {
+			n++;
+			fn++;
+			*y = x - fn*pio2_1 - fn*pio2_1t;
+		}
 		return n;
 	}
 	if(ix>=0x7f800000) {  /* x is inf or NaN */
diff --git a/src/math/__rem_pio2l.c b/src/math/__rem_pio2l.c
index 77255bd8..236b2def 100644
--- a/src/math/__rem_pio2l.c
+++ b/src/math/__rem_pio2l.c
@@ -44,6 +44,7 @@ pio2_1 =  1.57079632679597125389e+00, /* 0x3FF921FB, 0x54444000 */
 pio2_2 = -1.07463465549783099519e-12, /* -0x12e7b967674000.0p-92 */
 pio2_3 =  6.36831716351370313614e-25; /*  0x18a2e037074000.0p-133 */
 static const long double
+pio4    =  0x1.921fb54442d1846ap-1L,
 invpio2 =  6.36619772367581343076e-01L, /*  0xa2f9836e4e44152a.0p-64 */
 pio2_1t = -1.07463465549719416346e-12L, /* -0x973dcb3b399d747f.0p-103 */
 pio2_2t =  6.36831716351095013979e-25L, /*  0xc51701b839a25205.0p-144 */
@@ -57,6 +58,7 @@ pio2_3t = -2.75299651904407171810e-37L; /* -0xbb5bf6c7ddd660ce.0p-185 */
 #define NX 5
 #define NY 3
 static const long double
+pio4    =  0x1.921fb54442d18469898cc51701b8p-1L,
 invpio2 =  6.3661977236758134307553505349005747e-01L,	/*  0x145f306dc9c882a53f84eafa3ea6a.0p-113 */
 pio2_1  =  1.5707963267948966192292994253909555e+00L,	/*  0x1921fb54442d18469800000000000.0p-112 */
 pio2_1t =  2.0222662487959507323996846200947577e-21L,	/*  0x13198a2e03707344a4093822299f3.0p-181 */
@@ -76,11 +78,23 @@ int __rem_pio2l(long double x, long double *y)
 	u.f = x;
 	ex = u.i.se & 0x7fff;
 	if (SMALL(u)) {
-		/* rint(x/(pi/2)), Assume round-to-nearest. */
+		/* rint(x/(pi/2)) */
 		fn = x*invpio2 + toint - toint;
 		n = QUOBITS(fn);
 		r = x-fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round good to 102/180 bits (ld80/ld128) */
+		/* Matters with directed rounding. */
+		if (predict_false(r - w < -pio4)) {
+			n--;
+			fn--;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		} else if (predict_false(r - w > pio4)) {
+			n++;
+			fn++;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		}
 		y[0] = r-w;
 		u.f = y[0];
 		ey = u.i.se & 0x7fff;
-- 
2.24.1


--b5gNqxB1S1yM7hjW
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="0002-math-fix-sinh-overflows-in-non-nearest-rounding.patch"

>From f8c5704213b09821d37dd95872b7e0f89906375c Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Mon, 20 Jan 2020 20:38:45 +0000
Subject: [PATCH 2/2] math: fix sinh overflows in non-nearest rounding

The final roundig operation should be done with the correct sign
otherwise huge results may incorrectly get rounded to or away from
infinity in upward or downward rounding modes.

This affected sinh and sinhf which set the sign on the result after
a potentially overflowing mul. There may be other non-nearest rounding
issues, but this was a known long standing issue with large ulp error
(depending on how ulp is defined near infinity).

The fix should have no effect on sinh and sinhf performance but may
have a tiny effect on cosh and coshf.
---
 src/internal/libm.h | 4 ++--
 src/math/__expo2.c  | 5 +++--
 src/math/__expo2f.c | 5 +++--
 src/math/cosh.c     | 2 +-
 src/math/coshf.c    | 2 +-
 src/math/sinh.c     | 2 +-
 src/math/sinhf.c    | 2 +-
 7 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/internal/libm.h b/src/internal/libm.h
index b5bd26b8..7533f6ba 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -236,13 +236,13 @@ hidden int    __rem_pio2(double,double*);
 hidden double __sin(double,double,int);
 hidden double __cos(double,double);
 hidden double __tan(double,double,int);
-hidden double __expo2(double);
+hidden double __expo2(double,double);
 
 hidden int    __rem_pio2f(float,double*);
 hidden float  __sindf(double);
 hidden float  __cosdf(double);
 hidden float  __tandf(double,int);
-hidden float  __expo2f(float);
+hidden float  __expo2f(float,float);
 
 hidden int __rem_pio2l(long double, long double *);
 hidden long double __sinl(long double, long double, int);
diff --git a/src/math/__expo2.c b/src/math/__expo2.c
index 740ac680..248f052b 100644
--- a/src/math/__expo2.c
+++ b/src/math/__expo2.c
@@ -5,12 +5,13 @@ static const int k = 2043;
 static const double kln2 = 0x1.62066151add8bp+10;
 
 /* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */
-double __expo2(double x)
+double __expo2(double x, double sign)
 {
 	double scale;
 
 	/* note that k is odd and scale*scale overflows */
 	INSERT_WORDS(scale, (uint32_t)(0x3ff + k/2) << 20, 0);
 	/* exp(x - k ln2) * 2**(k-1) */
-	return exp(x - kln2) * scale * scale;
+	/* in directed rounding correct sign before rounding or overflow is important */
+	return exp(x - kln2) * (sign * scale) * scale;
 }
diff --git a/src/math/__expo2f.c b/src/math/__expo2f.c
index 5163e418..538eb09c 100644
--- a/src/math/__expo2f.c
+++ b/src/math/__expo2f.c
@@ -5,12 +5,13 @@ static const int k = 235;
 static const float kln2 = 0x1.45c778p+7f;
 
 /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */
-float __expo2f(float x)
+float __expo2f(float x, float sign)
 {
 	float scale;
 
 	/* note that k is odd and scale*scale overflows */
 	SET_FLOAT_WORD(scale, (uint32_t)(0x7f + k/2) << 23);
 	/* exp(x - k ln2) * 2**(k-1) */
-	return expf(x - kln2) * scale * scale;
+	/* in directed rounding correct sign before rounding or overflow is important */
+	return expf(x - kln2) * (sign * scale) * scale;
 }
diff --git a/src/math/cosh.c b/src/math/cosh.c
index 100f8231..490c15fb 100644
--- a/src/math/cosh.c
+++ b/src/math/cosh.c
@@ -35,6 +35,6 @@ double cosh(double x)
 
 	/* |x| > log(DBL_MAX) or nan */
 	/* note: the result is stored to handle overflow */
-	t = __expo2(x);
+	t = __expo2(x, 1.0);
 	return t;
 }
diff --git a/src/math/coshf.c b/src/math/coshf.c
index b09f2ee5..e739cff9 100644
--- a/src/math/coshf.c
+++ b/src/math/coshf.c
@@ -28,6 +28,6 @@ float coshf(float x)
 	}
 
 	/* |x| > log(FLT_MAX) or nan */
-	t = __expo2f(x);
+	t = __expo2f(x, 1.0f);
 	return t;
 }
diff --git a/src/math/sinh.c b/src/math/sinh.c
index 00022c4e..a01951ae 100644
--- a/src/math/sinh.c
+++ b/src/math/sinh.c
@@ -34,6 +34,6 @@ double sinh(double x)
 
 	/* |x| > log(DBL_MAX) or nan */
 	/* note: the result is stored to handle overflow */
-	t = 2*h*__expo2(absx);
+	t = __expo2(absx, 2*h);
 	return t;
 }
diff --git a/src/math/sinhf.c b/src/math/sinhf.c
index 6ad19ea2..b9caa793 100644
--- a/src/math/sinhf.c
+++ b/src/math/sinhf.c
@@ -26,6 +26,6 @@ float sinhf(float x)
 	}
 
 	/* |x| > logf(FLT_MAX) or nan */
-	t = 2*h*__expo2f(absx);
+	t = __expo2f(absx, 2*h);
 	return t;
 }
-- 
2.24.1


--b5gNqxB1S1yM7hjW--