[-- Attachment #1: Type: text/plain, Size: 1313 bytes --]

* Szabolcs Nagy <nsz@port70.net> [2020-07-08 00:38:14 +0200]:
> +
> +/* returns a*b*2^-64 - e, with error 0 <= e < 3. */
> +static inline uint64_t mul64(uint64_t a, uint64_t b)
> +{
> +	uint64_t ahi = a>>32;
> +	uint64_t alo = a&0xffffffff;
> +	uint64_t bhi = b>>32;
> +	uint64_t blo = b&0xffffffff;
> +	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
> +}
...
> +/* returns a*b exactly. */
> +static inline u128 mul64_128(uint64_t a, uint64_t b)
> +{
> +	u128 r;
> +	uint64_t ahi = a>>32;
> +	uint64_t alo = a&0xffffffff;
> +	uint64_t bhi = b>>32;
> +	uint64_t blo = b&0xffffffff;
> +	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
> +	uint64_t lo2 = (alo*blo)&0xffffffff;
> +	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
> +	r.lo = (lo1<<32) + lo2;
> +	return r;
> +}
> +
> +/* returns a*b*2^-128 - e, with error 0 <= e < 3. */
> +static inline u128 mul128(u128 a, u128 b)
> +{
> +	u128 hi = mul64_128(a.hi, b.hi);
> +	u128 m1 = mul64_128(a.hi, b.lo);
> +	u128 m2 = mul64_128(a.lo, b.hi);
> +	return add64(add64(hi, m1.hi), m2.hi);
> +}

this does not have to be this precise, i should have spotted
it earlier.

attached a fixed version, should be a bit faster (and smaller
unless -Os is used, which prevents inlining)

reran the tests, they still pass.

[-- Attachment #2: 0004-math-new-software-sqrtl.patch --]
[-- Type: text/x-diff, Size: 6826 bytes --]

From f1f166cdb9616959f7055ce0477e3ef1bd0c6daf Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sun, 14 Jun 2020 13:41:21 +0000
Subject: [PATCH 4/4] math: new software sqrtl

same approach as in sqrt.

sqrtl was broken on aarch64, riscv64 and s390x targets because
of missing quad precision support and on m68k-sf because of
missing ld80 sqrtl.

this implementation is written for quad precision and then
edited to make it work for both m68k and x86 style ld80 formats
too, but it is not expected to be optimal for them.

note: using fp instructions for the initial estimate when such
instructions are available (e.g. double prec sqrt or rsqrt) is
avoided because of fenv correctness.
---
 src/math/sqrtl.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 253 insertions(+), 1 deletion(-)

diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c
index 83a8f80c..1b9f19c7 100644
--- a/src/math/sqrtl.c
+++ b/src/math/sqrtl.c
@@ -1,7 +1,259 @@
+#include <stdint.h>
 #include <math.h>
+#include <float.h>
+#include "libm.h"
 
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
 long double sqrtl(long double x)
 {
-	/* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */
 	return sqrt(x);
 }
+#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+typedef struct {
+	uint64_t hi;
+	uint64_t lo;
+} u128;
+
+/* top: 16 bit sign+exponent, x: significand. */
+static inline long double mkldbl(uint64_t top, u128 x)
+{
+	union ldshape u;
+#if LDBL_MANT_DIG == 113
+	u.i2.hi = x.hi;
+	u.i2.lo = x.lo;
+	u.i2.hi &= 0x0000ffffffffffff;
+	u.i2.hi |= top << 48;
+#elif LDBL_MANT_DIG == 64
+	u.i.se = top;
+	u.i.m = x.lo;
+	/* force the top bit on non-zero (and non-subnormal) results. */
+	if (top & 0x7fff)
+		u.i.m |= 0x8000000000000000;
+#endif
+	return u.f;
+}
+
+/* return: top 16 bit is sign+exp and following bits are the significand. */
+static inline u128 asu128(long double x)
+{
+	union ldshape u = {.f=x};
+	u128 r;
+#if LDBL_MANT_DIG == 113
+	r.hi = u.i2.hi;
+	r.lo = u.i2.lo;
+#elif LDBL_MANT_DIG == 64
+	r.lo = u.i.m<<49;
+	/* ignore the top bit: pseudo numbers are not handled. */
+	r.hi = u.i.m>>15;
+	r.hi &= 0x0000ffffffffffff;
+	r.hi |= (uint64_t)u.i.se << 48;
+#endif
+	return r;
+}
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1. */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3. */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+static inline u128 add64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo + b;
+	r.hi = a.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 add128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo + b.lo;
+	r.hi = a.hi + b.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 sub64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo - b;
+	r.hi = a.hi;
+	if (a.lo < b)
+		r.hi--;
+	return r;
+}
+
+static inline u128 sub128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo - b.lo;
+	r.hi = a.hi - b.hi;
+	if (a.lo < b.lo)
+		r.hi--;
+	return r;
+}
+
+/* a<<n, 0 <= n <= 127 */
+static inline u128 lsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.hi = a.lo<<(n-64);
+		a.lo = 0;
+	} else {
+		a.hi = (a.hi<<n) | (a.lo>>(64-n));
+		a.lo = a.lo<<n;
+	}
+	return a;
+}
+
+/* a>>n, 0 <= n <= 127 */
+static inline u128 rsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.lo = a.hi>>(n-64);
+		a.hi = 0;
+	} else {
+		a.lo = (a.lo>>n) | (a.hi<<(64-n));
+		a.hi = a.hi>>n;
+	}
+	return a;
+}
+
+/* returns a*b exactly. */
+static inline u128 mul64_128(uint64_t a, uint64_t b)
+{
+	u128 r;
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
+	uint64_t lo2 = (alo*blo)&0xffffffff;
+	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
+	r.lo = (lo1<<32) + lo2;
+	return r;
+}
+
+/* returns a*b*2^-128 - e, with error 0 <= e < 7. */
+static inline u128 mul128(u128 a, u128 b)
+{
+	u128 hi = mul64_128(a.hi, b.hi);
+	uint64_t m1 = mul64(a.hi, b.lo);
+	uint64_t m2 = mul64(a.lo, b.hi);
+	return add64(add64(hi, m1), m2);
+}
+
+/* returns a*b % 2^128. */
+static inline u128 mul128_tail(u128 a, u128 b)
+{
+	u128 lo = mul64_128(a.lo, b.lo);
+	lo.hi += a.hi*b.lo + a.lo*b.hi;
+	return lo;
+}
+
+
+/* see sqrt.c for detailed comments. */
+
+long double sqrtl(long double x)
+{
+	u128 ix, ml;
+	uint64_t top;
+
+	ix = asu128(x);
+	top = ix.hi >> 48;
+	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) {
+		/* x < 0x1p-16382 or inf or nan. */
+		if (2*ix.hi == 0 && ix.lo == 0)
+			return x;
+		if (ix.hi == 0x7fff000000000000 && ix.lo == 0)
+			return x;
+		if (top >= 0x7fff)
+			return __math_invalidl(x);
+		/* x is subnormal, normalize it. */
+		ix = asu128(x * 0x1p112);
+		top = ix.hi >> 48;
+		top -= 112;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4) */
+	int even = top & 1;
+	ml = lsh(ix, 15);
+	ml.hi |= 0x8000000000000000;
+	if (even) ml = rsh(ml, 1);
+	top = (top + 0x3fff) >> 1;
+
+	/* r ~ 1/sqrt(m) */
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+	i = (ix.hi >> 42) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(ml.hi>>32, r);
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */
+	r = r<<32;
+	s = mul64(ml.hi, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.a5p-31 */
+	s = mul64(u, s) << 1;
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */
+
+	static const u128 threel = {.hi=three<<32, .lo=0};
+	u128 rl, sl, dl, ul;
+	rl.hi = r;
+	rl.lo = 0;
+	sl = mul128(ml, rl);
+	dl = mul128(sl, rl);
+	ul = sub128(threel, dl);
+	sl = mul128(ul, sl); /* repr: 3.125 */
+	/* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
+	sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1));
+	/* s < sqrt(m) < s + 1 ULP + tiny */
+
+	long double y;
+	u128 d2, d1, d0;
+	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl));
+	d1 = sub128(sl, d0);
+	d2 = add128(add64(sl, 1), d1);
+	sl = add64(sl, d1.hi >> 63);
+	y = mkldbl(top, sl);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception. */
+		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1;
+		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48;
+		y += mkldbl(top, (u128){0});
+	}
+	return y;
+}
+#else
+#error unsupported long double format
+#endif
-- 
2.27.0
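
A quick way to sanity-check the stated mul64 error bound (a minimal sketch, assuming a compiler with the unsigned __int128 extension; the LCG constants are arbitrary test-input generators, not from the patch):

```c
#include <assert.h>
#include <stdint.h>

/* same helper as in the patch above */
static inline uint64_t mul64(uint64_t a, uint64_t b)
{
	uint64_t ahi = a>>32, alo = a&0xffffffff;
	uint64_t bhi = b>>32, blo = b&0xffffffff;
	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
}

int main(void)
{
	uint64_t a = 1, b = 2;
	for (long i = 0; i < 10000000; i++) {
		/* arbitrary 64-bit inputs from two LCGs */
		a = a*6364136223846793005u + 1442695040888963407u;
		b = b*2862933555777941757u + 3037000493u;
		/* exact high half of the 128-bit product */
		uint64_t hi = (unsigned __int128)a*b >> 64;
		/* mul64 only drops low halves, so 0 <= hi - mul64(a,b) < 3 */
		assert(hi - mul64(a, b) < 3);
	}
	return 0;
}
```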

```
* Alexander Scherbatiy:
> The pthread_getspecific(3) specification describes:
> "The pthread_getspecific() function shall return the thread-specific
> data value associated with the given key. If no thread-specific data
> value is associated with key, then the value NULL shall be returned. "
POSIX says this:
| The effect of calling pthread_getspecific() or pthread_setspecific()
| with a key value not obtained from pthread_key_create() or after key
| has been deleted with pthread_key_delete() is undefined.
So no check for a valid key is required.
Thanks,
Florian
```

Hello,

The pthread_getspecific(3) specification describes:
"The pthread_getspecific() function shall return the thread-specific
data value associated with the given key. If no thread-specific data
value is associated with key, then the value NULL shall be returned."

The glibc version of pthread_getspecific(3) returns NULL for negative
key values. The musl version does not check key bounds [1].

Is this intentional behavior?

[1] https://git.musl-libc.org/cgit/musl/tree/src/thread/pthread_getspecific.c#n7

Thanks,
Alexander.
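
To make the two behaviors concrete, a toy sketch (the flat tsd array and KEYS_MAX are illustrative stand-ins, not either library's real thread-data layout):

```c
#include <stddef.h>

#define KEYS_MAX 128

static void *tsd[KEYS_MAX];	/* stand-in for per-thread slot storage */

/* musl-style: the key is trusted, so an out-of-range key (e.g. a
   negative int converted to a huge unsigned) indexes out of bounds,
   which is undefined behavior -- and POSIX permits that for keys not
   obtained from pthread_key_create */
void *getspecific_unchecked(unsigned k)
{
	return tsd[k];
}

/* glibc-style: an out-of-range key is caught and reported as "no value" */
void *getspecific_checked(unsigned k)
{
	if (k >= KEYS_MAX)
		return NULL;
	return tsd[k];
}
```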

[-- Attachment #1: Type: text/plain, Size: 990 bytes --]

Faster than the old implementation and fixes the missing quad
precision sqrt. sqrtf was tested on all inputs, sqrt and sqrtl
were tested on random inputs and on near half way cases.

code size should be similar to the old implementation, but
rodata is increased by a 256 byte lookup table (shared between
sqrt, sqrtf and sqrtl).

Szabolcs Nagy (4):
  math: new software sqrt
  math: new software sqrtf
  math: add __math_invalidl
  math: new software sqrtl

 src/internal/libm.h        |   3 +
 src/math/__math_invalidl.c |   9 ++
 src/math/sqrt.c            | 320 +++++++++++++++++--------------------
 src/math/sqrt_data.c       |  19 +++
 src/math/sqrt_data.h       |  13 ++
 src/math/sqrtf.c           | 140 ++++++++--------
 src/math/sqrtl.c           | 254 ++++++++++++++++++++++++++++-
 7 files changed, 514 insertions(+), 244 deletions(-)
 create mode 100644 src/math/__math_invalidl.c
 create mode 100644 src/math/sqrt_data.c
 create mode 100644 src/math/sqrt_data.h

-- 
2.27.0

[-- Attachment #2: 0001-math-new-software-sqrt.patch --]
[-- Type: text/x-diff, Size: 13290 bytes --]

From ad4f091c5d19203de94941017d1a58bdcd04216a Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sat, 13 Jun 2020 22:03:13 +0000
Subject: [PATCH 1/4] math: new software sqrt

approximate 1/sqrt(x) and sqrt(x) with goldschmidt iterations.
this is known to be a fast method for computing sqrt, but it is
tricky to get right, so added detailed comments.

use a lookup table for the initial estimate, this adds 256bytes
rodata but it can be shared between sqrt, sqrtf and sqrtl.
this saves one iteration compared to a linear estimate.

this is for soft float targets, but it supports fenv by using a
floating-point operation to get the final result.

the result is correctly rounded in all rounding modes. if fenv
support is turned off then the nearest rounded result is
computed and inexact exception is not signaled.

assumes fast 32bit integer arithmetics and 32 to 64bit mul.
---
 src/math/sqrt.c      | 320 ++++++++++++++++++++-----------------------
 src/math/sqrt_data.c |  19 +++
 src/math/sqrt_data.h |  13 ++
 3 files changed, 179 insertions(+), 173 deletions(-)
 create mode 100644 src/math/sqrt_data.c
 create mode 100644 src/math/sqrt_data.h

diff --git a/src/math/sqrt.c b/src/math/sqrt.c
index f1f6d76c..5ba26559 100644
--- a/src/math/sqrt.c
+++ b/src/math/sqrt.c
@@ -1,184 +1,158 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* sqrt(x)
- * Return correctly rounded sqrt.
- *           ------------------------------------------
- *           | Use the hardware sqrt if you have one |
- *           ------------------------------------------
- * Method:
- *   Bit by bit method using integer arithmetic. (Slow, but portable)
- *   1. Normalization
- *      Scale x to y in [1,4) with even powers of 2:
- *      find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
- *              sqrt(x) = 2^k * sqrt(y)
- *   2. Bit by bit computation
- *      Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
- *           i                                                   0
- *                                     i+1         2
- *          s  = 2*q , and      y  =  2   * ( y - q  ).          (1)
- *           i      i            i              i
- *
- *      To compute q    from q , one checks whether
- *                  i+1       i
- *
- *                            -(i+1) 2
- *                      (q + 2      )  <= y.                     (2)
- *                        i
- *                                                            -(i+1)
- *      If (2) is false, then q   = q ; otherwise q   = q  + 2      .
- *                             i+1   i             i+1   i
- *
- *      With some algebric manipulation, it is not difficult to see
- *      that (2) is equivalent to
- *                             -(i+1)
- *                      s  +  2       <= y                       (3)
- *                       i              i
- *
- *      The advantage of (3) is that s  and y  can be computed by
- *                                    i      i
- *      the following recurrence formula:
- *          if (3) is false
- *
- *          s     =  s  ,       y    = y   ;                     (4)
- *           i+1      i          i+1    i
- *
- *          otherwise,
- *                         -i                     -(i+1)
- *          s     =  s  + 2  ,  y    = y  -  s  - 2              (5)
- *           i+1      i          i+1    i     i
- *
- *      One may easily use induction to prove (4) and (5).
- *      Note. Since the left hand side of (3) contain only i+2 bits,
- *            it does not necessary to do a full (53-bit) comparison
- *            in (3).
- *   3. Final rounding
- *      After generating the 53 bits result, we compute one more bit.
- *      Together with the remainder, we can decide whether the
- *      result is exact, bigger than 1/2ulp, or less than 1/2ulp
- *      (it will never equal to 1/2ulp).
- *      The rounding mode can be detected by checking whether
- *      huge + tiny is equal to huge, and whether huge - tiny is
- *      equal to huge for some floating point number "huge" and "tiny".
- *
- * Special cases:
- *      sqrt(+-0) = +-0         ... exact
- *      sqrt(inf) = inf
- *      sqrt(-ve) = NaN         ... with invalid signal
- *      sqrt(NaN) = NaN         ... with invalid signal for signaling NaN
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const double tiny = 1.0e-300;
+#define FENV_SUPPORT 1
 
-double sqrt(double x)
+/* returns a*b*2^-32 - e, with error 0 <= e < 1. */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	double z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix0,s0,q,m,t,i;
-	uint32_t r,t1,s1,ix1,q1;
+	return (uint64_t)a*b >> 32;
+}
 
-	EXTRACT_WORDS(ix0, ix1, x);
+/* returns a*b*2^-64 - e, with error 0 <= e < 3. */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
 
-	/* take care of Inf and NaN */
-	if ((ix0&0x7ff00000) == 0x7ff00000) {
-		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
-	}
-	/* take care of zero */
-	if (ix0 <= 0) {
-		if (((ix0&~sign)|ix1) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix0 < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix0>>20;
-	if (m == 0) {  /* subnormal x */
-		while (ix0 == 0) {
-			m -= 21;
-			ix0 |= (ix1>>11);
-			ix1 <<= 21;
-		}
-		for (i=0; (ix0&0x00100000) == 0; i++)
-			ix0<<=1;
-		m -= i - 1;
-		ix0 |= ix1>>(32-i);
-		ix1 <<= i;
-	}
-	m -= 1023;  /* unbias exponent */
-	ix0 = (ix0&0x000fffff)|0x00100000;
-	if (m & 1) {  /* odd m, double x to make it even */
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-	}
-	m >>= 1;  /* m = [m/2] */
-
-	/* generate sqrt(x) bit by bit */
-	ix0 += ix0 + ((ix1&sign)>>31);
-	ix1 += ix1;
-	q = q1 = s0 = s1 = 0;  /* [q,q1] = sqrt(x) */
-	r = 0x00200000;        /* r = moving bit from right to left */
-
-	while (r != 0) {
-		t = s0 + r;
-		if (t <= ix0) {
-			s0 = t + r;
-			ix0 -= t;
-			q += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
-	}
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
 
-	r = sign;
-	while (r != 0) {
-		t1 = s1 + r;
-		t = s0;
-		if (t < ix0 || (t == ix0 && t1 <= ix1)) {
-			s1 = t1 + r;
-			if ((t1&sign) == sign && (s1&sign) == 0)
-				s0++;
-			ix0 -= t;
-			if (ix1 < t1)
-				ix0--;
-			ix1 -= t1;
-			q1 += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
+	/* special case handling. */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan. */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it. */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
 	}
 
-	/* use floating add to find out rounding direction */
-	if ((ix0|ix1) != 0) {
-		z = 1.0 - tiny; /* raise inexact flag */
-		if (z >= 1.0) {
-			z = 1.0 + tiny;
-			if (q1 == (uint32_t)0xffffffff) {
-				q1 = 0;
-				q++;
-			} else if (z > 1.0) {
-				if (q1 == (uint32_t)0xfffffffe)
-					q++;
-				q1 += 2;
-			} else
-				q1 += q1 & 1;
-		}
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result. */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects. */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
 	}
-	ix0 = (q>>1) + 0x3fe00000;
-	ix1 = q1>>1;
-	if (q&1)
-		ix1 |= sign;
-	INSERT_WORDS(z, ix0 + ((uint32_t)m << 20), ix1);
-	return z;
+	return y;
 }
diff --git a/src/math/sqrt_data.c b/src/math/sqrt_data.c
new file mode 100644
index 00000000..61bc22f4
--- /dev/null
+++ b/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/src/math/sqrt_data.h b/src/math/sqrt_data.h
new file mode 100644
index 00000000..260c7f9c
--- /dev/null
+++ b/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
-- 
2.27.0

[-- Attachment #3: 0002-math-new-software-sqrtf.patch --]
[-- Type: text/x-diff, Size: 4922 bytes --]

From a6f8fedb318c06f99909b8cd7f7d356640a40217 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Fri, 12 Jun 2020 17:34:28 +0000
Subject: [PATCH 2/4] math: new software sqrtf

same method as in sqrt, this was tested on all inputs against
an sqrtf instruction. (the only difference found was that x86
sqrtf does not signal the x86 specific input-denormal exception
on negative subnormal inputs while the software sqrtf does, this
is fine as it was designed for ieee754 exceptions only.)

there is a known faster method:
"Computing Floating-Point Square Roots via Bivariate Polynomial
Evaluation" that computes sqrtf directly via pipelined polynomial
evaluation which allows more parallelism, but the design does
not generalize easily to higher precisions.
---
 src/math/sqrtf.c | 140 +++++++++++++++++++++++------------------------
 1 file changed, 70 insertions(+), 70 deletions(-)

diff --git a/src/math/sqrtf.c b/src/math/sqrtf.c
index d6ace38a..740d81cb 100644
--- a/src/math/sqrtf.c
+++ b/src/math/sqrtf.c
@@ -1,83 +1,83 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const float tiny = 1.0e-30;
+#define FENV_SUPPORT 1
 
-float sqrtf(float x)
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	float z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix,s,q,m,t,i;
-	uint32_t r;
+	return (uint64_t)a*b >> 32;
+}
 
-	GET_FLOAT_WORD(ix, x);
+/* see sqrt.c for more detailed comments. */
 
-	/* take care of Inf and NaN */
-	if ((ix&0x7f800000) == 0x7f800000)
-		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;
 
-	/* take care of zero */
-	if (ix <= 0) {
-		if ((ix&~sign) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix < 0)
-			return (x-x)/(x-x); /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix>>23;
-	if (m == 0) {  /* subnormal x */
-		for (i = 0; (ix&0x00800000) == 0; i++)
-			ix<<=1;
-		m -= i - 1;
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan. */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it. */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
 	}
-	m -= 127;  /* unbias exponent */
-	ix = (ix&0x007fffff)|0x00800000;
-	if (m&1)  /* odd m, double x to make it even */
-		ix += ix;
-	m >>= 1;  /* m = [m/2] */
 
-	/* generate sqrt(x) bit by bit */
-	ix += ix;
-	q = s = 0;       /* q = sqrt(x) */
-	r = 0x01000000;  /* r = moving bit from right to left */
+	/* x = 4^e m; with int e and m in [1, 4). */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;
 
-	while (r != 0) {
-		t = s + r;
-		if (t <= ix) {
-			s = t+r;
-			ix -= t;
-			q += r;
-		}
-		ix += ix;
-		r >>= 1;
-	}
+	/* 2^e is the exponent part of the return value. */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations. */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */
 
-	/* use floating add to find out rounding direction */
-	if (ix != 0) {
-		z = 1.0f - tiny; /* raise inexact flag */
-		if (z >= 1.0f) {
-			z = 1.0f + tiny;
-			if (z > 1.0f)
-				q += 2;
-			else
-				q += q & 1;
-		}
+	/* compute nearest rounded result. */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
 	}
-	ix = (q>>1) + 0x3f000000;
-	SET_FLOAT_WORD(z, ix + ((uint32_t)m << 23));
-	return z;
+	return y;
 }
-- 
2.27.0

[-- Attachment #4: 0003-math-add-__math_invalidl.patch --]
[-- Type: text/x-diff, Size: 1186 bytes --]

From 1ad9a6ccb441b444d8bd9e599298d58e0548f38b Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Mon, 29 Jun 2020 17:14:42 +0000
Subject: [PATCH 3/4] math: add __math_invalidl

for targets where long double is different from double.
---
 src/internal/libm.h        | 3 +++
 src/math/__math_invalidl.c | 9 +++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 src/math/__math_invalidl.c

diff --git a/src/internal/libm.h b/src/internal/libm.h
index 7533f6ba..72ad17d8 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -267,5 +267,8 @@ hidden double __math_uflow(uint32_t);
 hidden double __math_oflow(uint32_t);
 hidden double __math_divzero(uint32_t);
 hidden double __math_invalid(double);
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+hidden long double __math_invalidl(long double);
+#endif
 
 #endif
diff --git a/src/math/__math_invalidl.c b/src/math/__math_invalidl.c
new file mode 100644
index 00000000..1fca99de
--- /dev/null
+++ b/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+	return (x - x) / (x - x);
+}
+#endif
-- 
2.27.0

[-- Attachment #5: 0004-math-new-software-sqrtl.patch --]
[-- Type: text/x-diff, Size: 6832 bytes --]

From 1995cc96f1f0480505d1f4edc5d43da7ab370f59 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sun, 14 Jun 2020 13:41:21 +0000
Subject: [PATCH 4/4] math: new software sqrtl

same approach as in sqrt.

sqrtl was broken on aarch64, riscv64 and s390x targets because
of missing quad precision support and on m68k-sf because of
missing ld80 sqrtl.

this implementation is written for quad precision and then
edited to make it work for both m68k and x86 style ld80 formats
too, but it is not expected to be optimal for them.

note: using fp instructions for the initial estimate when such
instructions are available (e.g. double prec sqrt or rsqrt) is
avoided because of fenv correctness.
---
 src/math/sqrtl.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 253 insertions(+), 1 deletion(-)

diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c
index 83a8f80c..70280c56 100644
--- a/src/math/sqrtl.c
+++ b/src/math/sqrtl.c
@@ -1,7 +1,259 @@
+#include <stdint.h>
 #include <math.h>
+#include <float.h>
+#include "libm.h"
 
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
 long double sqrtl(long double x)
 {
-	/* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */
 	return sqrt(x);
 }
+#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+typedef struct {
+	uint64_t hi;
+	uint64_t lo;
+} u128;
+
+/* top: 16 bit sign+exponent, x: significand. */
+static inline long double mkldbl(uint64_t top, u128 x)
+{
+	union ldshape u;
+#if LDBL_MANT_DIG == 113
+	u.i2.hi = x.hi;
+	u.i2.lo = x.lo;
+	u.i2.hi &= 0x0000ffffffffffff;
+	u.i2.hi |= top << 48;
+#elif LDBL_MANT_DIG == 64
+	u.i.se = top;
+	u.i.m = x.lo;
+	/* force the top bit on non-zero (and non-subnormal) results. */
+	if (top & 0x7fff)
+		u.i.m |= 0x8000000000000000;
+#endif
+	return u.f;
+}
+
+/* return: top 16 bit is sign+exp and following bits are the significand. */
+static inline u128 asu128(long double x)
+{
+	union ldshape u = {.f=x};
+	u128 r;
+#if LDBL_MANT_DIG == 113
+	r.hi = u.i2.hi;
+	r.lo = u.i2.lo;
+#elif LDBL_MANT_DIG == 64
+	r.lo = u.i.m<<49;
+	/* ignore the top bit: pseudo numbers are not handled. */
+	r.hi = u.i.m>>15;
+	r.hi &= 0x0000ffffffffffff;
+	r.hi |= (uint64_t)u.i.se << 48;
+#endif
+	return r;
+}
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1. */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3. */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+static inline u128 add64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo + b;
+	r.hi = a.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 add128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo + b.lo;
+	r.hi = a.hi + b.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 sub64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo - b;
+	r.hi = a.hi;
+	if (a.lo < b)
+		r.hi--;
+	return r;
+}
+
+static inline u128 sub128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo - b.lo;
+	r.hi = a.hi - b.hi;
+	if (a.lo < b.lo)
+		r.hi--;
+	return r;
+}
+
+/* a<<n, 0 <= n <= 127 */
+static inline u128 lsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.hi = a.lo<<(n-64);
+		a.lo = 0;
+	} else {
+		a.hi = (a.hi<<n) | (a.lo>>(64-n));
+		a.lo = a.lo<<n;
+	}
+	return a;
+}
+
+/* a>>n, 0 <= n <= 127 */
+static inline u128 rsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.lo = a.hi>>(n-64);
+		a.hi = 0;
+	} else {
+		a.lo = (a.lo>>n) | (a.hi<<(64-n));
+		a.hi = a.hi>>n;
+	}
+	return a;
+}
+
+/* returns a*b exactly. */
+static inline u128 mul64_128(uint64_t a, uint64_t b)
+{
+	u128 r;
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
+	uint64_t lo2 = (alo*blo)&0xffffffff;
+	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
+	r.lo = (lo1<<32) + lo2;
+	return r;
+}
+
+/* returns a*b*2^-128 - e, with error 0 <= e < 3. */
+static inline u128 mul128(u128 a, u128 b)
+{
+	u128 hi = mul64_128(a.hi, b.hi);
+	u128 m1 = mul64_128(a.hi, b.lo);
+	u128 m2 = mul64_128(a.lo, b.hi);
+	return add64(add64(hi, m1.hi), m2.hi);
+}
+
+/* returns a*b % 2^128. */
+static inline u128 mul128_tail(u128 a, u128 b)
+{
+	u128 lo = mul64_128(a.lo, b.lo);
+	lo.hi += a.hi*b.lo + a.lo*b.hi;
+	return lo;
+}
+
+
+/* see sqrt.c for detailed comments. */
+
+long double sqrtl(long double x)
+{
+	u128 ix, ml;
+	uint64_t top;
+
+	ix = asu128(x);
+	top = ix.hi >> 48;
+	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) {
+		/* x < 0x1p-16382 or inf or nan. */
+		if (2*ix.hi == 0 && ix.lo == 0)
+			return x;
+		if (ix.hi == 0x7fff000000000000 && ix.lo == 0)
+			return x;
+		if (top >= 0x7fff)
+			return __math_invalidl(x);
+		/* x is subnormal, normalize it. */
+		ix = asu128(x * 0x1p112);
+		top = ix.hi >> 48;
+		top -= 112;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4) */
+	int even = top & 1;
+	ml = lsh(ix, 15);
+	ml.hi |= 0x8000000000000000;
+	if (even) ml = rsh(ml, 1);
+	top = (top + 0x3fff) >> 1;
+
+	/* r ~ 1/sqrt(m) */
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+	i = (ix.hi >> 42) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(ml.hi>>32, r);
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */
+	r = r<<32;
+	s = mul64(ml.hi, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.a5p-31 */
+	s = mul64(u, s) << 1;
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */
+
+	static const u128 threel = {.hi=three<<32, .lo=0};
+	u128 rl, sl, dl, ul;
+	rl.hi = r;
+	rl.lo = 0;
+	sl = mul128(ml, rl);
+	dl = mul128(sl, rl);
+	ul = sub128(threel, dl);
+	sl = mul128(ul, sl); /* repr: 3.125 */
+	/* -0x1p-116 < s - sqrt(m) < 0x1.8001p-125 */
+	sl = rsh(sub64(sl, 2), 125-(LDBL_MANT_DIG-1));
+	/* s < sqrt(m) < s + 1 ULP + tiny */
+
+	long double y;
+	u128 d2, d1, d0;
+	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl));
+	d1 = sub128(sl, d0);
+	d2 = add128(add64(sl, 1), d1);
+	sl = add64(sl, d1.hi >> 63);
+	y = mkldbl(top, sl);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception. */
+		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1;
+		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48;
+		y += mkldbl(top, (u128){0});
+	}
+	return y;
+}
+#else
+#error unsupported long double format
+#endif
-- 
2.27.0
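
The d0/d1/d2 rounding step in sqrt.c above has a plain-integer analogue that can be tested directly. A minimal sketch (the construction of s strictly below sqrt(m) mirrors the patch; everything else is illustrative):

```c
#include <assert.h>
#include <math.h>
#include <stdint.h>

int main(void)
{
	for (uint64_t m = 2; m < (1<<20); m++) {
		/* s chosen strictly below sqrt(m), as in the patch */
		uint64_t s = (uint64_t)sqrt((double)m);
		while (s*s >= m) s--;
		uint64_t d0 = m - s*s;     /* m - s^2 > 0 */
		uint64_t d1 = s - d0;      /* sign bit set iff sqrt(m) > s + 1/2 */
		uint64_t d2 = d1 + s + 1;  /* (s+1)^2 - m, zero iff exact */
		uint64_t rounded = s + (d1 >> 63);
		/* matches nearest rounding (ties are impossible here,
		   since (s+1/2)^2 is never an integer) */
		assert(rounded == (uint64_t)(sqrt((double)m) + 0.5));
		assert((d2 == 0) == (rounded*rounded == m));
	}
	return 0;
}
```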

```
* Rich Felker <dalias@libc.org> [2020-07-07 13:22:57 -0400]:
> On Tue, Jul 07, 2020 at 05:00:20PM +0200, Szabolcs Nagy wrote:
> > * Rich Felker <dalias@libc.org> [2020-07-06 18:12:43 -0400]:
> > > I think you saw already, but just to make it clear on the list too,
> > > it's upstream now. I'm open to further improvements like doing
> > > memmove (either as a separate copy of the full implementation or some
> > > minimal branch-to-__memcpy_fwd approach) but I think what's already
> > > there is sufficient to solve the main practical performance issues
> > > users were hitting that made aarch64 look bad in relation to x86_64.
> > >
> > > I'd still like to revisit the topic of minimizing the per-arch code
> > > needed for this so that all archs can benefit from the basic logic,
> > > too.
> >
> > thanks.
> >
> > note that the code has some internal .p2align
> > directives that assume the entry is aligned to
> > some large alignment (.p2align 6 in orig code)
> >
> > i think it would be better to keep the entry
> > aligned (but i don't know if it makes a big
> > difference on some existing core, it's more
> > for consistency with upstream).
> >
> > musl normally does not align function entries
> > but for a few select functions it is probably
> > not too much overhead?
>
> I was under the impression that any .p2align N in the section
> inherently aligns the whole section as if it started with .p2align N,
> in which case not writing it explicitly just avoids redundancy and
> makes sure you don't actually have an initial alignment that's larger
> than any alignment actually wanted later. Is this incorrect?
>
> (To be incorrect I think it would have to do some fancy
> elastic-section-contents hack, but maybe aarch64 ELF object ABI has
> that..?)
ah you are right, then everything is fine i guess.
```

```
On Tue, Jul 07, 2020 at 05:00:20PM +0200, Szabolcs Nagy wrote:
> * Rich Felker <dalias@libc.org> [2020-07-06 18:12:43 -0400]:
> > I think you saw already, but just to make it clear on the list too,
> > it's upstream now. I'm open to further improvements like doing
> > memmove (either as a separate copy of the full implementation or some
> > minimal branch-to-__memcpy_fwd approach) but I think what's already
> > there is sufficient to solve the main practical performance issues
> > users were hitting that made aarch64 look bad in relation to x86_64.
> >
> > I'd still like to revisit the topic of minimizing the per-arch code
> > needed for this so that all archs can benefit from the basic logic,
> > too.
>
> thanks.
>
> note that the code has some internal .p2align
> directives that assume the entry is aligned to
> some large alignment (.p2align 6 in orig code)
>
> i think it would be better to keep the entry
> aligned (but i don't know if it makes a big
> difference on some existing core, it's more
> for consistency with upstream).
>
> musl normally does not align function entries
> but for a few select functions it is probably
> not too much overhead?
I was under the impression that any .p2align N in the section
inherently aligns the whole section as if it started with .p2align N,
in which case not writing it explicitly just avoids redundancy and
makes sure you don't actually have an initial alignment that's larger
than any alignment actually wanted later. Is this incorrect?
(To be incorrect I think it would have to do some fancy
elastic-section-contents hack, but maybe aarch64 ELF object ABI has
that..?)
Rich
```

```
* Rich Felker <dalias@libc.org> [2020-07-06 18:12:43 -0400]:
> I think you saw already, but just to make it clear on the list too,
> it's upstream now. I'm open to further improvements like doing
> memmove (either as a separate copy of the full implementation or some
> minimal branch-to-__memcpy_fwd approach) but I think what's already
> there is sufficient to solve the main practical performance issues
> users were hitting that made aarch64 look bad in relation to x86_64.
>
> I'd still like to revisit the topic of minimizing the per-arch code
> needed for this so that all archs can benefit from the basic logic,
> too.
thanks.
note that the code has some internal .p2align
directives that assume the entry is aligned to
some large alignment (.p2align 6 in orig code)
i think it would be better to keep the entry
aligned (but i don't know if it makes a big
difference on some existing core, it's more
for consistency with upstream).
musl normally does not align function entries
but for a few select functions it is probably
not too much overhead?
```

```
On 2020-07-06 15:22, Rich Felker wrote:
> On Mon, Jul 06, 2020 at 03:14:43PM -0700, Hydro Flask wrote:
>> On 2020-07-06 15:00, Rich Felker wrote:
>> >Yes, I see it clearly now. Sorry it took a while. I have prepared the
>> >attached patch which I'll push soon if there are no problems.
>>
>> Needs one more tiny tweak. I noticed that pthread_cancel() calls
>> pthread_kill(). That means pthread_kill() must be async-cancel-safe.
>> If an asynchronous cancellation happens after pthread_kill() grabs
>> the killlock, then it will deadlock because the asynchronous
>> pthread_exit(PTHREAD_CANCELED) call will then recursively try to
>> grab killlock.
>>
>> The solution as far as I can tell is to not just block app signals
>> when grabbing killlock, but all signals.
>
> Are you in agreement that it suffices for only pthread_kill to block
> all signals? (Still blocking just app signals everywhere else) The
> scheduling functions could be changed too, but I'm hesitant to change
> pthread_exit without thinking about it further since it has a lot more
> subtleties. And I think only pthread_kill needs it since it's the only
> one that needs to be AC-safe.
Yes I think that's right, since every other function that grabs the
killlock can assume that async cancellation won't happen.
```

```
On Mon, Jul 06, 2020 at 03:14:43PM -0700, Hydro Flask wrote:
> On 2020-07-06 15:00, Rich Felker wrote:
> >Yes, I see it clearly now. Sorry it took a while. I have prepared the
> >attached patch which I'll push soon if there are no problems.
>
> Needs one more tiny tweak. I noticed that pthread_cancel() calls
> pthread_kill(). That means pthread_kill() must be async-cancel-safe.
> If an asynchronous cancellation happens after pthread_kill() grabs
> the killlock, then it will deadlock because the asynchronous
> pthread_exit(PTHREAD_CANCELED) call will then recursively try to
> grab killlock.
>
> The solution as far as I can tell is to not just block app signals
> when grabbing killlock, but all signals.
Indeed. It'd also work to disable async cancellation for the duration
of the pthread_cancel call, but that's almost surely more work.
Are you in agreement that it suffices for only pthread_kill to block
all signals? (Still blocking just app signals everywhere else) The
scheduling functions could be changed too, but I'm hesitant to change
pthread_exit without thinking about it further since it has a lot more
subtleties. And I think only pthread_kill needs it since it's the only
one that needs to be AC-safe.
Rich
```

```
On 2020-07-06 15:00, Rich Felker wrote:
> Yes, I see it clearly now. Sorry it took a while. I have prepared the
> attached patch which I'll push soon if there are no problems.
Needs one more tiny tweak. I noticed that pthread_cancel() calls
pthread_kill(). That means pthread_kill() must be async-cancel-safe. If
an asynchronous cancellation happens after pthread_kill() grabs the
killlock, then it will deadlock because the asynchronous
pthread_exit(PTHREAD_CANCELED) call will then recursively try to grab
killlock.
The solution as far as I can tell is to not just block app signals when
grabbing killlock, but all signals.
```
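
The fix that later came out of this thread (see the patch further down in this digest) blocks signals around every killlock use. A minimal sketch of the pattern with portable calls, where pthread_sigmask stands in for musl's internal __block_app_sigs/__restore_sigs and a plain mutex stands in for killlock:

```c
#include <pthread.h>
#include <signal.h>

static pthread_mutex_t killlock = PTHREAD_MUTEX_INITIALIZER; /* stand-in */

void with_killlock(void (*op)(void))
{
	sigset_t all, old;

	/* block everything first: no app signal handler (and no
	   async-cancellation signal) can run while the lock is held */
	sigfillset(&all);
	pthread_sigmask(SIG_BLOCK, &all, &old);
	pthread_mutex_lock(&killlock);
	op();
	pthread_mutex_unlock(&killlock);
	/* restore only after unlocking, mirroring the pthread_exit reorder */
	pthread_sigmask(SIG_SETMASK, &old, NULL);
}
```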

```
On Fri, Jun 26, 2020 at 10:40:49AM +0200, Szabolcs Nagy wrote:
> * Rich Felker <dalias@libc.org> [2020-06-25 21:20:06 -0400]:
> > On Thu, Jun 25, 2020 at 05:15:42PM -0400, Rich Felker wrote:
> > > On Thu, Jun 25, 2020 at 04:50:24PM -0400, Rich Felker wrote:
> > > > > > > but it would be nice if we could get the aarch64
> > > > > > > memcpy patch in (the c implementation is really
> > > > > > > slow and i've seen ppl compare aarch64 vs x86
> > > > > > > server performance with some benchmark on alpine..)
> > > > > >
> > > > > > OK, I'll look again.
> > > > >
> > > > > thanks.
> > > > >
> > > > > (there are more aarch64 string functions in the
> > > > > optimized-routines github repo but i think they
> > > > > are not as important as memcpy/memmove/memset)
> > > >
> > > > I found the code. Can you comment on performance and whether memset is
> > > > needed? (The C memset should be rather good already, moreso than
> > > > memcpy.)
>
> the asm seems faster in all measurements but there is
> a lot of variance with different size/alignment cases.
>
> the avg improvement on typical workload and the possible
> improvements across various cases and cores i'd expect:
>
> memcpy typical: 1.6x-1.7x
> memcpy possible: 1.2x-3.1x
>
> memset typical: 1.1x-1.4x
> memset possible: 1.0x-2.6x
>
> > > Are the assumptions (v8-a, unaligned access) documented in memcpy.S
> > > valid for all presently supportable aarch64?
>
> yes, unaligned access on normal memory in userspace
> is valid (part of the base abi on linux).
>
> iirc a core can be configured to trap unaligned access
> and it is not valid on device memory so e.g. such
> memcpy would not work in the kernel. but avoiding
> unaligned access in memcpy is not enough to fix that,
> the compiler will generate unaligned load for
>
> int f(char *p)
> {
> int i;
> __builtin_memcpy(&i,p,sizeof i);
> return i;
> }
>
> > >
> > > A couple comments for merging if we do, that aren't hard requirements
> > > but preferences:
> > >
> > > - I'd like to expand out the macros from ../asmdefs.h since that won't
> > > be available and they just hide things (I guess they're attractive
> > > for Apple/macho users or something but not relevant to musl) and
> > > since the symbol name lines need to be changed anyway to public
> > > name. "Local var name" macros are ok to leave; changing them would
> > > be too error-prone and they make the code more readable anyway.
>
> the weird macros are there so the code is similar to glibc
> asm code (which adds cfi annotation and optionally adds
> profile hooks to entry etc)
>
> > >
> > > - I'd prefer not to have memmove logic in memcpy since it makes it
> > > larger and implies that misuse of memcpy when you mean memmove is
> > > supported usage. I'd be happy with an approach like x86 though,
> > > defining an __memcpy_fwd alias and having memmove tail call to that
> > > unless len>128 and reverse is needed, or just leaving memmove.c.
>
> in principle the code should be called memmove, not memcpy,
> since it satisfies the memmove contract, which of course
> works for memcpy too. so tail calling memmove from memcpy
> makes more sense but memcpy is more performance critical
> than memmove, so we probably should not add extra branches
> there..
>
> >
> > Something like the attached.
>
> looks good to me.
I think you saw already, but just to make it clear on the list too,
it's upstream now. I'm open to further improvements like doing
memmove (either as a separate copy of the full implementation or some
minimal branch-to-__memcpy_fwd approach) but I think what's already
there is sufficient to solve the main practical performance issues
users were hitting that made aarch64 look bad in relation to x86_64.
I'd still like to revisit the topic of minimizing the per-arch code
needed for this so that all archs can benefit from the basic logic,
too.
Rich
```
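
A sketch of the branch-to-forward-copy idea under discussion (memcpy_fwd here is a hypothetical stand-in for a __memcpy_fwd alias, and the byte loops stand in for the optimized copies):

```c
#include <stddef.h>
#include <stdint.h>

/* hypothetical forward-only copy, i.e. what a __memcpy_fwd alias would name */
static void *memcpy_fwd(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	while (n--) *d++ = *s++;
	return dest;
}

void *memmove_sketch(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if (d == s || n == 0) return dest;
	/* if dest does not start inside (src, src+n), a forward copy is
	   safe; the unsigned subtraction wraps for dest below src */
	if ((uintptr_t)d - (uintptr_t)s >= n)
		return memcpy_fwd(dest, src, n);
	/* overlapping with dest above src: copy backwards */
	while (n--) d[n] = s[n];
	return dest;
}
```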

```
On Thu, Jul 02, 2020 at 04:44:47PM +0200, Markus Wichmann wrote:
> On Wed, Jul 01, 2020 at 11:23:09PM +0200, Valentin Ochs wrote:
> > On Wed, Jul 01, 2020 at 04:44:56PM -0400, Rich Felker wrote:
> > > On Wed, Jul 01, 2020 at 08:50:26PM +0200, Markus Wichmann wrote:
> > > > Hi all,
> > > >
> > > > I noticed something while reading code today: Near the end of qsort(),
> > > > we have this gem:
> > > >
> > > > shl(p, 2);
> > > > pshift -= 2;
> > > > p[0] ^= 7;
> > > > shr(p, 1);
> > > >
> > > > Now, I don't know if I am missing something, but don't the shl and the
> > > > shr partially cancel out? Isn't this the same as
> > > >
> > > > shl(p, 1);
> > > > p[0] ^= 3;
> > > >
> > > > As it is, it isn't wrong, just weird.
> > >
> > > Assuming non-overflow, I think they're equivalent (also assuming you
> > > keep the pshift-=2).
> >
> > Yes, it looks that way. I'm afraid I don't have any further insight -
> > it's been quite a while since I thought about the qsort code, and I've
> > not been doing much work on algorithms over the last couple of years.
> > The only thing I can think of is that one could figure out which
> > behaviour with regard to overflow in shl() should be the valid one. I
> > suspect that replacing it would be valid and this is some transformation
> > I did while implementing smoothsort without realizing that this can be
> > simplified.
> >
> > Cheers,
> > Valentin
> >
>
> Overflow on shl() is completely impossible. To overflow a shl(p, 2), we
> would need the penultimate bit in p to be set. Every bit in p stands in for
> a tree of that order, so if bit n is set, the heap contains a tree with
> a number of elements equal to the n'th Leonardo number.
>
> I don't know how big the Leonardo number corresponding to the
> penultimate bit is, but I do know that halfway through the upper word
> (wasn't the factor something like 1.44 or so?), the Leonardo numbers get
> too big to be contained in a machine word. Meaning that tree would be
> way too large to be addressed.
>
> I concur that this looks like a missed optimization opportunity, and not
> a deliberate design decision.
Indeed, I don't believe overflow is possible here; I just mentioned it
for completeness. I think the change proposed here is correct but I'll
hold off on touching it til after release since it's not fixing a bug,
just a minor missed simplification.
Rich
```
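
The bit identity itself can be checked quickly in one machine word. A sketch (clearing the top two bits enforces the non-overflow assumption discussed above; the pshift bookkeeping is left aside):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t p = 12345;
	for (long i = 0; i < 10000000; i++) {
		p = p*6364136223846793005u + 1442695040888963407u;
		uint64_t q = p >> 2;	/* keep bits 62-63 clear: no overflow */
		/* shl 2, xor 7, shr 1  ==  shl 1, xor 3 */
		assert((((q << 2) ^ 7) >> 1) == ((q << 1) ^ 3));
	}
	return 0;
}
```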

```
On Wed, Jul 01, 2020 at 03:12:14PM +0200, Julien Ramseier wrote:
> vfscanf() may use the variable 'alloc' uninitialized when taking the branch
> introduced by recent commit b287cd745c2243f8e5114331763a5a9813b5f6ee.
>
> Spotted by clang:
>
> .../lib/libc/src/stdio/vfscanf.c:80:6: warning: variable 'alloc' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
> if (!f->rpos) goto input_fail;
> ^~~~~~~~
> .../lib/libc/src/stdio/vfscanf.c:330:7: note: uninitialized use occurs here
> if (alloc) {
> ^~~~~
>
> ---
> src/stdio/vfscanf.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/src/stdio/vfscanf.c b/src/stdio/vfscanf.c
> index b5ebc16e..b78a374d 100644
> --- a/src/stdio/vfscanf.c
> +++ b/src/stdio/vfscanf.c
> @@ -57,7 +57,7 @@ int vfscanf(FILE *restrict f, const char *restrict fmt, va_list ap)
> {
> int width;
> int size;
> - int alloc;
> + int alloc = 0;
> int base;
> const unsigned char *p;
> int c, t;
> --
> 2.23.0
Thanks, applied.
Rich
```
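
For reference, the shape of the warning in isolation, as a minimal reproducer (not the vfscanf logic): a goto can skip the initialization yet still reach the use.

```c
/* compile with: clang -Wsometimes-uninitialized -c example.c */
int f(int fail)
{
	int alloc;               /* int alloc = 0; silences the warning */
	if (fail) goto input_fail;
	alloc = 1;
input_fail:
	return alloc;            /* reached uninitialized when fail is true */
}
```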

[-- Attachment #1: Type: text/plain, Size: 967 bytes --]

On Tue, Jun 30, 2020 at 02:00:13PM -0700, Hydro Flask wrote:
> On 2020-06-30 12:54, Rich Felker wrote:
> >Note that for fixing this issue, it won't suffice just to make
> >pthread_kill block signals. The other places that use the killlock
> >also need to block signals, to make the lock fully AS-safe.
>
> Rich did you see the issue I was talking about in the example in my
> previous email? Does it make sense?
>
> I do not mind submitting a patch for this issue if you don't
> currently have the spare cycles, just let me know if we are in
> agreement in terms of understanding the issue and I can submit a
> patch that blocks "app signals" in every place that uses the
> killlock.
>
> Probably could also remove the pthread_self() special case that you
> referenced earlier in the thread but that would be a second patch.

Yes, I see it clearly now. Sorry it took a while. I have prepared the
attached patch which I'll push soon if there are no problems.

Rich

[-- Attachment #2: 0001-make-thread-killlock-async-signal-safe-for-pthread_k.patch --]
[-- Type: text/plain, Size: 4223 bytes --]

From 7cc9496a18c3fa665c286b8be41d790c795e0171 Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@aerifal.cx>
Date: Mon, 6 Jul 2020 17:56:19 -0400
Subject: [PATCH] make thread killlock async-signal-safe for pthread_kill

pthread_kill is required to be AS-safe. that requirement can't be met
if the target thread's killlock can be taken in contexts where
application-installed signal handlers can run.

block signals around use of this lock in all pthread_* functions which
target a tid, and reorder blocking/unblocking of signals in
pthread_exit so that they're blocked whenever the killlock is held.
---
 src/thread/pthread_create.c        | 11 ++++++-----
 src/thread/pthread_getschedparam.c |  3 +++
 src/thread/pthread_kill.c          |  3 +++
 src/thread/pthread_setschedparam.c |  3 +++
 src/thread/pthread_setschedprio.c  |  3 +++
 5 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index 6bdfb44f..10f1b7d8 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -72,12 +72,13 @@ _Noreturn void __pthread_exit(void *result)
 	/* Access to target the exiting thread with syscalls that use
 	 * its kernel tid is controlled by killlock. For detached threads,
 	 * any use past this point would have undefined behavior, but for
-	 * joinable threads it's a valid usage that must be handled. */
+	 * joinable threads it's a valid usage that must be handled.
+	 * Signals must be blocked since pthread_kill must be AS-safe. */
+	__block_app_sigs(&set);
 	LOCK(self->killlock);
 
-	/* The thread list lock must be AS-safe, and thus requires
-	 * application signals to be blocked before it can be taken. */
-	__block_app_sigs(&set);
+	/* The thread list lock must be AS-safe, and thus depends on
+	 * application signals being blocked above. */
 	__tl_lock();
 
 	/* If this is the only thread in the list, don't proceed with
 	 * signal state to prepare for exit to call atexit handlers. */
 	if (self->next == self) {
 		__tl_unlock();
-		__restore_sigs(&set);
 		UNLOCK(self->killlock);
+		__restore_sigs(&set);
 		exit(0);
 	}
 
diff --git a/src/thread/pthread_getschedparam.c b/src/thread/pthread_getschedparam.c
index 1cba073d..c098befb 100644
--- a/src/thread/pthread_getschedparam.c
+++ b/src/thread/pthread_getschedparam.c
@@ -4,6 +4,8 @@
 int pthread_getschedparam(pthread_t t, int *restrict policy, struct sched_param *restrict param)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	if (!t->tid) {
 		r = ESRCH;
@@ -14,5 +16,6 @@ int pthread_getschedparam(pthread_t t, int *restrict policy, struct sched_param
 		}
 	}
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/pthread_kill.c b/src/thread/pthread_kill.c
index 3d9395cb..446254b6 100644
--- a/src/thread/pthread_kill.c
+++ b/src/thread/pthread_kill.c
@@ -4,9 +4,12 @@
 int pthread_kill(pthread_t t, int sig)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	r = t->tid ? -__syscall(SYS_tkill, t->tid, sig)
 		: (sig+0U >= _NSIG ? EINVAL : 0);
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/pthread_setschedparam.c b/src/thread/pthread_setschedparam.c
index 038d13d8..76d4d45a 100644
--- a/src/thread/pthread_setschedparam.c
+++ b/src/thread/pthread_setschedparam.c
@@ -4,8 +4,11 @@
 int pthread_setschedparam(pthread_t t, int policy, const struct sched_param *param)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	r = !t->tid ? ESRCH : -__syscall(SYS_sched_setscheduler, t->tid, policy, param);
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/pthread_setschedprio.c b/src/thread/pthread_setschedprio.c
index 5bf4a019..fc2e13dd 100644
--- a/src/thread/pthread_setschedprio.c
+++ b/src/thread/pthread_setschedprio.c
@@ -4,8 +4,11 @@
 int pthread_setschedprio(pthread_t t, int prio)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	r = !t->tid ? ESRCH : -__syscall(SYS_sched_setparam, t->tid, &prio);
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
-- 
2.21.0

```
Yes, it is a standard C library implementation for multiple architectures.
Daniel
On 6/29/20 5:39 AM, mayuresh@kathe.in wrote:
> Hello,
>
> New here, so please excuse the naive question; is the "musl" project only about developing an alternative libc implementation? If not, is there any other sub-project website I could visit to read more?
>
> Thank you,
>
> ~Mayuresh
>
```

Sorry for my late response. I didn't realize musl was "reply-to-list
only" and my filters didn't flag your reply without the CC. (Filters
updated now!)

On 6/24/20 7:45 PM, Rich Felker wrote:
> On Wed, Jun 24, 2020 at 06:35:16PM -0500, Daniel Santos wrote:
>> This is the function called for interruptable / repeatable syscalls like
>> nanosleep. Without this patch, attaching a debugger to a program making
>> such a syscall results in the debugger being completely unable to
>> perform a backtrace.
>
> On other archs this is handled by the tools/add-cfi* scripts. It might
> be worth revisiting whether that's the right choice; for most asm
> we're actually trying to get rid of the asm source files and use
> inline asm so it's all up to the compiler, but that can't be done for
> syscalls because we need to be able to adjust the stack for the
> outgoing syscall. (Note: this is probably also an issue for inline,
> non-cancellable syscalls and maybe not so easy to fix there.)

Yes, there are many things that just cannot be done with inline asm. I
thought we had __attribute__((naked)) for all archs in GCC now, but I
guess that doesn't include MIPS (currently looks like ARM, AVR, C-SKY,
MCORE, MSP430, NDS32, RISC-V, RL78, RX, SPU, and x86). I'll take a look
at the scripts, but I'm not strong with awk.

> Anyway some comments:
>
>> Co-Authored-By: Daniele Tamino <dtamino@irobot.com>
>> Signed-off-by: Daniel Santos <daniel.santos@pobox.com>
>> ---
>>  src/thread/mips/syscall_cp.s | 41 +++++++++++++++++++++++++++++++++++-
>>  1 file changed, 40 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/thread/mips/syscall_cp.s b/src/thread/mips/syscall_cp.s
>> index d2846264..d39bff59 100644
>> --- a/src/thread/mips/syscall_cp.s
>> +++ b/src/thread/mips/syscall_cp.s
>> @@ -1,4 +1,14 @@
>> +.section .mdebug.abi32
>> +.previous
>
> What does this do?

It's a little bit trashy. I guess that the ".previous" is useless since
it's followed by a ".text" later. But the empty .mdebug.<abi> section is
there to help gdb determine the ABI. iiuc, newer versions of binutils
generate this automatically. I'm not an expert in this area, or even
MIPS yet; I mostly modeled what GCC generated on this arch and then
looked up why it did. According to GCC, it's there for compatibility
with older versions of binutils (and thus, gdb).
https://github.com/gcc-mirror/gcc/blob/98fcd2513add205dcdd134eb29a2505ea9f81495/gcc/config/mips/mips.c#L9889
Seeing how this was added to gcc in 2007, it would probably be safe to
omit it, but if you want to keep it then we should remove either the
".previous" or ".text".

>>  .set noreorder
>> +.cfi_sections .debug_frame
>> +.abicalls
>> +#ifdef __PIC__
>> +	.option pic2
>> +#else
>> +	.option pic0
>> +#endif
>
> I don't think pic0 corresponds to not having -fPIC; it's a weird thing
> that's not used or supported. There's no reason to override any of
> these (or .abicalls, which is also default and equivalent to pic2
> AIUI), and the change does not seem to be at all related to adding
> CFI.

Good point -- keep the patch only about debug info.

> I'm not sure if .cfi_sections .debug_frame is needed -- is it?
>
>> +.text
>>
>>  .global __cp_begin
>>  .hidden __cp_begin
>> @@ -9,12 +19,32 @@
>>  .global __cp_cancel
>>  .hidden __cp_cancel
>>  .type __cp_cancel,@function
>> -.hidden __cancel
>> +.hidden __cancel /* long __cancel() in src/thread/pthread_cancel.c */
>>  .global __syscall_cp_asm
>>  .hidden __syscall_cp_asm
>>  .type __syscall_cp_asm,@function
>> +
>> +/*
>> +long __syscall_cp_asm(
>> +	volatile int *cancel,
>> +	syscall_arg_t nr,
>> +	syscall_arg_t u,
>> +	syscall_arg_t v,
>> +	syscall_arg_t w,
>> +	syscall_arg_t x,
>> +	syscall_arg_t y,
>> +	syscall_arg_t z)
>> +*/
>
> These changes are also unrelated to CFI.

Ah yes. Just comments when I was analyzing the function.

>> +
>> +	.ent __syscall_cp_asm
>> +	.frame $sp, 32, $ra
>> +	.mask 0x00000000, 0
>> +	.fmask 0x00000000, 0
>> +	.cfi_startproc
>> +	.cfi_return_column $ra
>
> And I'm not sure about all of these.

To my knowledge, .ent, .frame, .mask and .fmask are ignored by gas, but
used by MIPSpro for generating debug information -- the exception being
that gas expects .ent if you use .end. If you're only interested in gas,
then we should probably omit these. GCC emits them by default and
finding the documentation for them was a fking pain.
http://csweb.cs.wfu.edu/~torgerse/Kokua/Irix_6.5.21_doc_cd/usr/share/Insight/library/SGI_bookshelves/SGI_Developer/books/MProAsLg_PG/sgi_html/ch08.html

".cfi_startproc" is necessary, but ".cfi_return_column $ra" may be
superfluous since it's probably the default after a
jump/branch-and-link. I'll test without it and see.

>>  __syscall_cp_asm:
>>  	subu $sp, $sp, 32
>> +	.cfi_adjust_cfa_offset 32
>>  __cp_begin:
>>  	lw $4, 0($4)
>>  	bne $4, $0, __cp_cancel
>> @@ -35,14 +65,17 @@ __cp_begin:
>>  __cp_end:
>>  	beq $7, $0, 1f
>>  	addu $sp, $sp, 32
>> +	.cfi_adjust_cfa_offset -32
>>  	subu $2, $0, $2
>>  1:	jr $ra
>>  	nop
>>
>>  __cp_cancel:
>>  	move $2, $ra
>> +	.cfi_register $ra, $2
>>  	bal 1f
>>  	addu $sp, $sp, 32
>> +	.cfi_adjust_cfa_offset -32
>>  	.gpword .
>>  	.gpword __cancel
>> 1:	lw $3, ($ra)
>> @@ -51,3 +84,9 @@ __cp_cancel:
>>  	addu $25, $25, $3
>>  	jr $25
>>  	move $ra, $2
>
> I think this part looks ok, either to be generated like on the other
> archs or included.

I'm not certain the ".cfi_register $ra, $2" can be safely added via a
script and it could probably even be moved to after the bal delay slot,
along with the ".cfi_adjust_cfa_offset -32". In fact, I seem to recall
that "stepi" in gdb used to step "over" branch instructions and then
branch after using "stepi" on the delay slot -- I've noticed recently
that it now executes both at once.

>> +	.cfi_restore $ra
>> +#ifdef __ELF__
>> +	.size __syscall_cp_asm,.-__syscall_cp_asm
>> +#endif
>
> This is not needed. musl is always ELF.

Fantastic! So if we also don't need __PIC__ then we can omit the rename
to *.S.

> Rich

I would also like to do the same thing for clone and dlsym. I'll fiddle
with the awk scripts and see what I can get them to do when I get some
more time in the next week.

Thanks,
Daniel

```
On 6/26/20 1:26 AM, Jeffrey Walton wrote:
> On Fri, Jun 26, 2020 at 2:20 AM Daniel Santos <daniel@gsat.us> wrote:
>> ...
>>>> if (at) {
>>>> - if (at->tv_nsec >= 1000000000UL) return EINVAL;
>>>> + if ((unsigned long)at->tv_nsec >= 1000000000UL) return EINVAL;
>>>> if (__clock_gettime(clk, &to)) return EINVAL;
>>>> to.tv_sec = at->tv_sec - to.tv_sec;
>>>> if ((to.tv_nsec = at->tv_nsec - to.tv_nsec) < 0) {
>>>>
>>> may be use < 0 || >= 1000000000L and avoid the cast.
>>> there is a similar issue in src/thread/pthread_cond_timedwait.c as well
>> Thank you for that. I'll resubmit changing both instances.
>>
>> In this case, the POSIX spec requires tv_nsec to be a long (
>> https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/time.h.html
>> ). Either way, a good optimizer should convert this into an unsigned
> I believe the C language says the signed value gets promoted to an
> unsigned value. I don't believe the optimizer has anything to do with
> it.
>
> That's why -1 is greater than 1 in C:
>
> int x = -1;
> unsigned int y = 1;
> if (x > y)
> printf("WTF???\n");
>
> Jeff
Yes, I was referring to Khem's suggestion:
may be use < 0 || >= 1000000000L and avoid the cast.
The optimizer should convert this into a single unsigned compare on just
about any modern (i.e., two's-complement) processor.
I suppose the real solution is to not add -Wextra to CFLAGS unless you
add -Wno-sign-compare, as musl intentionally uses this promotion rule.
Thanks!
Daniel
```
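The two range checks discussed above really are interchangeable: casting to unsigned folds the negative case into the too-large case, since every negative long converts to a value of at least 2^31, well above 10^9. A minimal sketch demonstrating the equivalence (the helper names are made up for illustration, not musl code):

```c
#include <stdio.h>

/* Both forms reject tv_nsec values outside [0, 999999999]. */
static int reject_cast(long tv_nsec)
{
	/* negatives wrap to values >= 2^31 > 10^9, so one compare suffices */
	return (unsigned long)tv_nsec >= 1000000000UL;
}

static int reject_range(long tv_nsec)
{
	/* the cast-free form suggested by Khem */
	return tv_nsec < 0 || tv_nsec >= 1000000000L;
}

int main(void)
{
	long t[] = { -1, 0, 999999999, 1000000000 };
	for (int i = 0; i < 4; i++)
		printf("%11ld: cast=%d range=%d\n",
		       t[i], reject_cast(t[i]), reject_range(t[i]));
	return 0;
}
```

With optimization enabled, GCC and clang typically lower both forms to the same single unsigned comparison, which is Daniel's point about the optimizer.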

```
On Wed, Jul 01, 2020 at 11:23:09PM +0200, Valentin Ochs wrote:
> On Wed, Jul 01, 2020 at 04:44:56PM -0400, Rich Felker wrote:
> > On Wed, Jul 01, 2020 at 08:50:26PM +0200, Markus Wichmann wrote:
> > > Hi all,
> > >
> > > I noticed something while reading code today: Near the end of qsort(),
> > > we have this gem:
> > >
> > > shl(p, 2);
> > > pshift -= 2;
> > > p[0] ^= 7;
> > > shr(p, 1);
> > >
> > > Now, I don't know if I am missing something, but don't the shl and the
> > > shr partially cancel out? Isn't this the same as
> > >
> > > shl(p, 1);
> > > p[0] ^= 3;
> > >
> > > As it is, it isn't wrong, just weird.
> >
> > Assuming non-overflow, I think they're equivalent (also assuming you
> > keep the pshift-=2).
>
> Yes, it looks that way. I'm afraid I don't have any further insight -
> it's been quite a while since I thought about the qsort code, and I've
> not been doing much work on algorithms over the last couple of years.
> The only thing I can think of is that one could figure out which
> behaviour with regard to overflow in shl() should be the valid one. I
> suspect that replacing it would be valid and this is some transformation
> I did while implementing smoothsort without realizing that this can be
> simplified.
>
> Cheers,
> Valentin
>
Overflow on shl() is completely impossible. To overflow a shl(p, 2), we
would need the penultimate bit in p to be set. Every bit in p stands in for
a tree of that order, so if bit n is set, the heap contains a tree with
a number of elements equal to the n'th Leonardo number.
I don't know how big the Leonardo number corresponding to the
penultimate bit is, but I do know that halfway through the upper word
(wasn't the factor something like 1.44 or so?), the Leonardo numbers get
too big to be contained in a machine word, meaning such a tree would be
way too large to be addressed.
I concur that this looks like a missed optimization opportunity, and not
a deliberate design decision.
Ciao,
Markus
```
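Markus's estimate is easy to check directly. A minimal sketch (not musl code; it assumes 64-bit words) that counts how many Leonardo numbers fit in a word:

```c
#include <stdio.h>
#include <stdint.h>

/* Leonardo numbers: L(0) = L(1) = 1, L(n) = L(n-1) + L(n-2) + 1.
   A smoothsort tree of order n holds L(n) elements, so bit n of the
   two-word bitmap p can only be set if L(n) elements are addressable. */
int main(void)
{
	uint64_t a = 1, b = 1; /* L(n-1), L(n) */
	int n = 1;
	while (b <= UINT64_MAX - a - 1) { /* while a + b + 1 still fits */
		uint64_t c = a + b + 1;
		a = b;
		b = c;
		n++;
	}
	printf("L(%d) = %llu is the largest Leonardo number below 2^64\n",
	       n, (unsigned long long)b);
	return 0;
}
```

The loop stops around index 91, matching the recollected factor (1.44 * 64 is roughly 92): the bits near the top of p's second word, the only ones that could make shl(p, 2) carry out, can never be set.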

```
On Wed, Jul 01, 2020 at 04:44:56PM -0400, Rich Felker wrote:
> On Wed, Jul 01, 2020 at 08:50:26PM +0200, Markus Wichmann wrote:
> > Hi all,
> >
> > I noticed something while reading code today: Near the end of qsort(),
> > we have this gem:
> >
> > shl(p, 2);
> > pshift -= 2;
> > p[0] ^= 7;
> > shr(p, 1);
> >
> > Now, I don't know if I am missing something, but don't the shl and the
> > shr partially cancel out? Isn't this the same as
> >
> > shl(p, 1);
> > p[0] ^= 3;
> >
> > As it is, it isn't wrong, just weird.
>
> Assuming non-overflow, I think they're equivalent (also assuming you
> keep the pshift-=2).
Yes, it looks that way. I'm afraid I don't have any further insight -
it's been quite a while since I thought about the qsort code, and I've
not been doing much work on algorithms over the last couple of years.
The only thing I can think of is that one could figure out which
behaviour with regard to overflow in shl() should be the valid one. I
suspect that replacing it would be valid and this is some transformation
I did while implementing smoothsort without realizing that this can be
simplified.
Cheers,
Valentin
```

```
On Wed, Jul 01, 2020 at 08:50:26PM +0200, Markus Wichmann wrote:
> Hi all,
>
> I noticed something while reading code today: Near the end of qsort(),
> we have this gem:
>
> shl(p, 2);
> pshift -= 2;
> p[0] ^= 7;
> shr(p, 1);
>
> Now, I don't know if I am missing something, but don't the shl and the
> shr partially cancel out? Isn't this the same as
>
> shl(p, 1);
> p[0] ^= 3;
>
> As it is, it isn't wrong, just weird.
Assuming non-overflow, I think they're equivalent (also assuming you
keep the pshift-=2). I've CC'd the original author but we've not been
in touch for a long time so I don't know whether to expect a response.
I don't have any insight on why it was done this way.
Rich
```

```
Hi all,

I noticed something while reading code today: Near the end of qsort(),
we have this gem:

shl(p, 2);
pshift -= 2;
p[0] ^= 7;
shr(p, 1);

Now, I don't know if I am missing something, but don't the shl and the
shr partially cancel out? Isn't this the same as

shl(p, 1);
p[0] ^= 3;

As it is, it isn't wrong, just weird.

Ciao,
Markus
```
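The claimed equivalence can also be checked mechanically. A minimal sketch, with two-word shift helpers re-created in the spirit of qsort.c's (an assumption, not the actual musl code; 64-bit words and shift counts strictly between 0 and 64, which is all this test needs):

```c
#include <stdio.h>
#include <stdint.h>

/* Two-word left/right shifts over the bit pattern p[1]:p[0]. */
static void shl(uint64_t p[2], int n)
{
	p[1] = (p[1] << n) | (p[0] >> (64-n));
	p[0] <<= n;
}

static void shr(uint64_t p[2], int n)
{
	p[0] = (p[0] >> n) | (p[1] << (64-n));
	p[1] >>= n;
}

int main(void)
{
	uint64_t a[2] = { 0x123456789abcdef0, 0x1 };
	uint64_t b[2] = { a[0], a[1] };

	shl(a, 2); a[0] ^= 7; shr(a, 1); /* form used in qsort() */
	shl(b, 1); b[0] ^= 3;            /* proposed simplification */

	printf("%s\n", a[0] == b[0] && a[1] == b[1] ? "equal" : "differ");
	return 0;
}
```

This prints "equal", and the two forms can only diverge if shl(p, 2) carries out of the top word, which the Leonardo-number argument above rules out.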

```
vfscanf() may use the variable 'alloc' uninitialized when taking the
branch introduced by recent commit
b287cd745c2243f8e5114331763a5a9813b5f6ee.

Spotted by clang:

../lib/libc/src/stdio/vfscanf.c:80:6: warning: variable 'alloc' is used
uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
        if (!f->rpos) goto input_fail;
            ^~~~~~~~
../lib/libc/src/stdio/vfscanf.c:330:7: note: uninitialized use occurs here
                if (alloc) {
                    ^~~~~
---
 src/stdio/vfscanf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stdio/vfscanf.c b/src/stdio/vfscanf.c
index b5ebc16e..b78a374d 100644
--- a/src/stdio/vfscanf.c
+++ b/src/stdio/vfscanf.c
@@ -57,7 +57,7 @@ int vfscanf(FILE *restrict f, const char *restrict fmt, va_list ap)
 {
 	int width;
 	int size;
-	int alloc;
+	int alloc = 0;
 	int base;
 	const unsigned char *p;
 	int c, t;
--
2.23.0
```
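The control-flow shape clang is complaining about is easy to see in isolation. A minimal sketch (hypothetical code, heavily simplified from vfscanf's structure) showing the skipped assignment and the one-line fix:

```c
#include <stdlib.h>

/* Without "= 0", the early goto skips the only assignment to alloc,
   yet the cleanup label reads it on every path -- exactly what
   -Wsometimes-uninitialized flags. Initializing it covers all paths. */
static int sketch(int input_fails)
{
	int alloc = 0;           /* the fix from the patch above */
	char *dest = 0;

	if (input_fails)
		goto input_fail; /* jumps past the assignment below */

	alloc = 1;               /* normally set when a conversion allocates */
	dest = malloc(16);

input_fail:
	if (alloc)               /* reads alloc on every path */
		free(dest);
	return input_fails ? -1 : 0;
}

int main(void)
{
	return sketch(0) + sketch(1) + 1; /* 0 + -1 + 1 == 0 */
}
```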

```
On 2020-06-30 12:54, Rich Felker wrote:
> Note that for fixing this issue, it won't suffice just to make
> pthread_kill block signals. The other places that use the killlock
> also need to block signals, to make the lock fully AS-safe.
>
> Rich
Rich, did you see the issue I was talking about in the example in my
previous email? Does it make sense?
I do not mind submitting a patch for this issue if you don't currently
have the spare cycles; just let me know if we are in agreement in terms
of understanding the issue, and I can submit a patch that blocks "app
signals" in every place that uses the killlock.
We could probably also remove the pthread_self() special case that you
referenced earlier in the thread, but that would be a second patch.
Thank you,
Hydro
```

```
On Tue, Jun 30, 2020 at 10:28:50PM +0300, Dmitry Samersoff wrote:
> Hello all,
>
> Does it make sense to trylock and immediately return ESRCH if
> pthread_kill is already in progress?
No, ESRCH is not a valid result for this condition. Moreover, killlock
being taken does not tell you that pthread_kill is already in progress
targeting the same thread, just that something that needs the kernel
tid to be valid (kill, cancel, or scheduling changes) or that needs to
change its validity (exiting) is in progress.
Note that for fixing this issue, it won't suffice just to make
pthread_kill block signals. The other places that use the killlock
also need to block signals, to make the lock fully AS-safe.
Rich
```
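Concretely, the fix Rich describes is the pattern the patch at the top of this digest applies at every killlock site. A portable sketch of the same idea, with pthread_sigmask standing in for musl's internal __block_app_sigs/__restore_sigs (the helper names here are made up):

```c
#include <pthread.h>
#include <signal.h>

static pthread_mutex_t killlock = PTHREAD_MUTEX_INITIALIZER;

/* Block all blockable signals before taking the lock, so no signal
   handler can run in this thread while the lock is held and try to
   retake it. */
static void killlock_take(sigset_t *old)
{
	sigset_t all;
	sigfillset(&all);
	pthread_sigmask(SIG_BLOCK, &all, old);
	pthread_mutex_lock(&killlock);
}

/* Drop the lock first, then restore the mask -- the same ordering the
   pthread_exit hunk above fixes. */
static void killlock_release(const sigset_t *old)
{
	pthread_mutex_unlock(&killlock);
	pthread_sigmask(SIG_SETMASK, old, 0);
}

int main(void)
{
	sigset_t old;
	killlock_take(&old);
	/* critical section: inspect tid, tkill, change scheduling, ... */
	killlock_release(&old);
	return 0;
}
```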

```
Hello all,
Does it make sense to trylock and immediately return ESRCH if
pthread_kill is already in progress?
-Dmitry
On 30.06.2020 19:28, Hydro Flask wrote:
> On 2020-06-30 07:58, Markus Wichmann wrote:
>> On Tue, Jun 30, 2020 at 05:26:46AM -0400, Rich Felker wrote:
>>> On Mon, Jun 29, 2020 at 11:19:39PM -0700, Hydro Flask wrote:
>>> >
>>> > Just to be clear, this doesn't only occur when calling
>>> > pthread_kill() and using pthread_self() as the target, it can be any
>>> > target thread, as long as it's the same target thread is used in the
>>> > signal handler and in the synchronous context.
>>>
>>> How so? If the target is different, the rest of the pthread_kill,
>>> including the unlock, will proceed concurrently with the signal
>>> handler. However you may be able to construct mutual-signaling
>>> deadlock cases.
>>>
>>
>> Thread A calls pthread_kill(thread_c, ...). Thread B calls
>> (concurrently) pthread_kill(thread_a, ...). Thread B's signal arrives
>> while thread A holds the killlock. Signal handler calls
>> pthread_kill(thread_c, ...).
>
> You can trigger it much more simply than that. Doesn't require multiple
> threads:
>
> 1. pthread_kill(th)
> 2. LOCK(killlock)
> 3. <signal arrives after lock>
> 4. signal handler: pthread_kill(th)
> 5. signal handler: LOCK(killlock)
>
> In the preceding example "th" can be any pthread_t value, the only
> requirement is that it's the same pthread_t in both the synchronous and
> signal handler context.
```
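Hydro Flask's five-step sequence can be turned into a stress test. A hypothetical reproducer (not musl code) that, on a libc whose pthread_kill takes the killlock without blocking signals, will eventually hang at step 5:

```c
#include <pthread.h>
#include <signal.h>

static pthread_t target;

/* Steps 4-5: the handler re-enters pthread_kill on the same target.
   If the interrupted pthread_kill still holds target's killlock, this
   blocks forever on a lock owned by the current thread. */
static void handler(int sig)
{
	(void)sig;
	pthread_kill(target, 0); /* sig 0: error checking only, still locks */
}

int main(void)
{
	struct sigaction sa;
	sa.sa_handler = handler;
	sa.sa_flags = 0;
	sigemptyset(&sa.sa_mask);

	target = pthread_self();
	sigaction(SIGUSR1, &sa, 0);

	/* Steps 1-3: keep taking the killlock; sooner or later SIGUSR1
	   is delivered inside the LOCK/UNLOCK window. */
	for (;;)
		pthread_kill(target, SIGUSR1);
}
```

POSIX.1-2008 lists pthread_kill as async-signal-safe, so the handler's call is legal and any resulting hang is a libc bug, which is the point of this thread.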