Still more optimisations/simplifications in the math subtree. JFTR: I'm NOT subscribed to your mailing list, so CC: me in replies! --- -/src/math/fmax.c +++ +/src/math/fmax.c @@ -3,11 +3,9 @@ double fmax(double x, double y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) - return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? y : x; return x < y ? y : x; } --- -/src/math/fmaxf.c +++ +/src/math/fmaxf.c @@ -3,11 +3,9 @@ float fmaxf(float x, float y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) - return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? y : x; return x < y ? y : x; } --- -/src/math/fmaxl.c +++ +/src/math/fmaxl.c @@ -10,11 +10,9 @@ long double fmaxl(long double x, long double y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) - return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? y : x; return x < y ? y : x; } --- -/src/math/fmin.c +++ +/src/math/fmin.c @@ -3,11 +3,9 @@ double fmin(double x, double y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) - return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? x : y; - return x < y ? x : y; + return x > y ? y : x; } --- -/src/math/fminf.c +++ +/src/math/fminf.c @@ -3,11 +3,9 @@ float fminf(float x, float y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) - return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? x : y; - return x < y ? x : y; + return x > y ? 
y : x; } --- -/src/math/fminl.c +++ +/src/math/fminl.c @@ -10,11 +10,9 @@ long double fminl(long double x, long double y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) - return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? x : y; - return x < y ? x : y; + return x > y ? y : x; }

* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 10:55:29 +0100]: > Still more optimisations/simplifications in the math subtree. > > JFTR: I'm NOT subscribed to your mailing list, so CC: me in replies! > > --- -/src/math/fmax.c > +++ +/src/math/fmax.c > @@ -3,11 +3,9 @@ > double fmax(double x, double y) > { > - if (isnan(x)) > + if (x != x) these two are not equivalent for snan input, but we don't care about snan, nor the compiler by default, so the compiler can optimize one to the other (although musl uses explicit int arithmetic instead of __builtin_isnan so it's a bit harder). in any case the two are equivalent for practical purposes and using isnan better documents the intention, you should change the isnan definition if you think it's not efficient. > return y; > - if (isnan(y)) > - return x; > /* handle signed zeros, see C99 Annex F.9.9.2 */ > - if (signbit(x) != signbit(y)) > + if (x == y) > return signbit(x) ? y : x; > return x < y ? y : x; nice trick, but the fenv behaviour is not right. you should run any such change through libc-test git://repo.or.cz/libc-test and look for regressions.

"Szabolcs Nagy" <nsz@port70.net> wrote: >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 10:55:29 +0100]: >> Still more optimisations/simplifications in the math subtree. >> >> JFTR: I'm NOT subscribed to your mailing list, so CC: me in replies! >> >> --- -/src/math/fmax.c >> +++ +/src/math/fmax.c >> @@ -3,11 +3,9 @@ >> double fmax(double x, double y) >> { >> - if (isnan(x)) >> + if (x != x) > > these two are not equivalent for snan input, but we dont care > about snan, nor the compiler by default, so the compiler can > optimize one to the other (although musl uses explicit int > arithmetics instead of __builtin_isnan so it's a bit harder). The latter behaviour was my reason to use (x != x) here: I attempt to replace as many function calls as possible with "normal" code, and also try to avoid transfers to/from FPU/SSE registers to/from integer registers if that does not result in faster/shorter code. > in any case the two are equivalent for practical purposes and > using isnan better documents the intention, you should change > the isnan definition if you think it's not efficient. > >> return y; >> - if (isnan(y)) >> - return x; >> /* handle signed zeros, see C99 Annex F.9.9.2 */ >> - if (signbit(x) != signbit(y)) >> + if (x == y) >> return signbit(x) ? y : x; >> return x < y ? y : x; > > nice trick, but the fenv behaviour is not right. --- -/src/math/fmax.c +++ +/src/math/fmax.c @@ -3,11 +3,9 @@ double fmax(double x, double y) { - if (isnan(x)) + if (x != x) return y; - if (isnan(y)) + if (y != y) return x; /* handle signed zeros, see C99 Annex F.9.9.2 */ - if (signbit(x) != signbit(y)) + if (x == y) return signbit(x) ? y : x; return x < y ? y : x; } > you should run any such change through libc-test > git://repo.or.cz/libc-test and look for regressions. I already told Rich that I neither use an OS nor a compiler/assembler where musl or libc-test can be built. Stefan

* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 13:33:44 +0100]: > "Szabolcs Nagy" <nsz@port70.net> wrote: > >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 10:55:29 +0100]: > > these two are not equivalent for snan input, but we dont care > > about snan, nor the compiler by default, so the compiler can > > optimize one to the other (although musl uses explicit int > > arithmetics instead of __builtin_isnan so it's a bit harder). > > The latter behaviour was my reason to use (x != x) here: I attempt > to replace as many function calls as possible with "normal" code, > and also try to avoid transfers to/from FPU/SSE registers to/from > integer registers if that does not result in faster/shorter code. why not just change the definition of isnan then? #if __GNUC__ > xxx #define isnan(x) sizeof(x)==sizeof(float) ? __builtin_isnanf(x) : ... > > you should run any such change through libc-test > > git://repo.or.cz/libc-test and look for regressions. > > I already told Rich that I neither use an OS nor a compiler/assembler > where musl or libc-test can be built. it does not matter where you use musl, if you want to submit patches you have to test on supported targets (since it's not realistic to test on all configurations, at least one relevant configuration is enough)

On Wed, Dec 11, 2019 at 02:16:59PM +0100, Szabolcs Nagy wrote: > * Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 13:33:44 +0100]: > > "Szabolcs Nagy" <nsz@port70.net> wrote: > > >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 10:55:29 +0100]: > > > these two are not equivalent for snan input, but we dont care > > > about snan, nor the compiler by default, so the compiler can > > > optimize one to the other (although musl uses explicit int > > > arithmetics instead of __builtin_isnan so it's a bit harder). > > > > The latter behaviour was my reason to use (x != x) here: I attempt > > to replace as many function calls as possible with "normal" code, > > and also try to avoid transfers to/from FPU/SSE registers to/from > > integer registers if that does not result in faster/shorter code. > > why not just change the definition of isnan then? > > #if __GNUC__ > xxx > #define isnan(x) sizeof(x)==sizeof(float) ? __builtin_isnanf(x) : ... Yes, I think having this conditioned on GNUC would be acceptable, provided the builtin is supported in all versions of GNUC going back conceivably far. > > > you should run any such change through libc-test > > > git://repo.or.cz/libc-test and look for regressions. > > > > I already told Rich that I neither use an OS nor a compiler/assembler > > where musl or libc-test can be built. > > it does not matter where you use musl, if you want > to submit patches you have to test on supported > targets (since it's not realistic to test on all > configurations, at least one relevant configuration > is enough) I think it's "ok" to submit patches without having run tests, with the caveat that it's going to impact how willing folks are to review them, and if so how quickly it gets done. If submitting patches you haven't tested, text explaining the reasoning for why you think they're correct (e.g. 
"nan behavior is still correct because line N produces a nan and the function is allowed to raise invalid under this condition") would help a lot since thought processes like that are hard to reverse-engineer. Rich

"Szabolcs Nagy" <nsz@port70.net> wrote: >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 13:33:44 +0100]: >> "Szabolcs Nagy" <nsz@port70.net> wrote: >> >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 10:55:29 +0100]: >> > these two are not equivalent for snan input, but we dont care >> > about snan, nor the compiler by default, so the compiler can >> > optimize one to the other (although musl uses explicit int >> > arithmetics instead of __builtin_isnan so it's a bit harder). >> >> The latter behaviour was my reason to use (x != x) here: I attempt >> to replace as many function calls as possible with "normal" code, >> and also try to avoid transfers to/from FPU/SSE registers to/from >> integer registers if that does not result in faster/shorter code. > > why not just change the definition of isnan then? Because I did not want to introduce such a global change; until now my patches are just local (peephole) optimisations. > #if __GNUC__ > xxx > #define isnan(x) sizeof(x)==sizeof(float) ? __builtin_isnanf(x) : ... This is better than my proposed change, as it also avoids the side-effect of (x != x) which can raise exceptions, and gets rid of the explicit transfer to integer registers, which can hurt performance. The macros isinf(), isnormal(), isfinite(), signbit() should of course be implemented in a similar way too, and the (internal only?) functions __FLOAT_BITS() and __DOUBLE_BITS() removed completely! regards Stefan PS: the following is just a "Gedankenspiel", extending the idea to avoid transfers from/to SSE registers. On x86-64, functions like isunordered(), copysign() etc. 
may be implemented using SSE intrinsics _mm_*() as follows: #include <immintrin.h> int signbit(double argument) { return /* 1 & */ _mm_movemask_pd(_mm_set_sd(argument)); } int isunordered(double a, double b) { #if 0 return _mm_comieq_sd(_mm_cmp_sd(_mm_set_sd(a), _mm_set_sd(b), _CMP_ORD_Q), _mm_set_sd(0.0)); #elif 0 return _mm_comineq_sd(_mm_set_sd(a), _mm_set_sd(a)) || _mm_comineq_sd(_mm_set_sd(b), _mm_set_sd(b)); #else return /* 1 & */ _mm_movemask_pd(_mm_cmp_sd(_mm_set_sd(a), _mm_set_sd(b), _CMP_UNORD_Q)); #endif } uint32_t lrint(double argument) { return _mm_cvtsd_si32(_mm_set_sd(argument)); } uint64_t llrint(double argument) { return _mm_cvtsd_si64(_mm_set_sd(argument)); } double copysign(double magnitude, double sign) { return _mm_cvtsd_f64(_mm_or_pd(_mm_and_pd(_mm_set_sd(-0.0), _mm_set_sd(sign)), _mm_andnot_pd(_mm_set_sd(-0.0), _mm_set_sd(magnitude)))); } double fdim(double x, double y) { return _mm_cvtsd_f64(_mm_and_pd(_mm_cmp_sd(_mm_set_sd(x), _mm_set_sd(y), _CMP_NLE_US), _mm_sub_sd(_mm_set_sd(x), _mm_set_sd(y)))); } double fmax(double x, double y) { __m128d mask = _mm_cmp_sd(_mm_set_sd(x), _mm_set_sd(x), _CMP_ORD_Q); return _mm_cvtsd_f64(_mm_or_pd(_mm_and_pd(mask, _mm_max_sd(_mm_set_sd(y), _mm_set_sd(x))), _mm_andnot_pd(mask, _mm_set_sd(y)))); } double fmin(double x, double y) { __m128d mask = _mm_cmp_sd(_mm_set_sd(x), _mm_set_sd(x), _CMP_ORD_Q); return _mm_cvtsd_f64(_mm_or_pd(_mm_and_pd(mask, _mm_min_sd(_mm_set_sd(y), _mm_set_sd(x))), _mm_andnot_pd(mask, _mm_set_sd(y)))); } Although the arguments and results are all held in SSE registers, there's no way to use them directly; it's but necessary to transfer them using _mm_set_sd() and _mm_cvtsd_f64(), which may result in superfluous instructions emitted by the compiler. 
If you but cheat and "hide" these functions from the compiler by placing them in a library, you can implement them as follows: __m128d fmin(__m128d x, __m128d y) { __m128d mask = _mm_cmp_sd(x, x, _CMP_ORD_Q); return _mm_or_pd(_mm_and_pd(mask, _mm_min_sd(y, x)), _mm_andnot_pd(mask, y)); } .code ; Intel syntax fmin proc public movsd xmm2, xmm0 ; xmm2 = x cmpsd xmm2, xmm0, 7 ; xmm2 = (x != NAN) ? -1 : 0 movsd xmm3, xmm2 andnpd xmm3, xmm1 ; xmm3 = (x != NAN) ? 0.0 : y minsd xmm1, xmm0 ; xmm1 = (x < y) ? x : y ; = min(x, y) andpd xmm2, xmm1 ; xmm2 = (x != NAN) ? min(x, y) : 0.0 orpd xmm2, xmm3 ; xmm2 = (x != NAN) ? min(x, y) : y movsd xmm0, xmm2 ; xmm0 = fmin(x, y) ret fmin endp

On Wed, Dec 11, 2019 at 10:17:09PM +0100, Stefan Kanthak wrote: > "Szabolcs Nagy" <nsz@port70.net> wrote: > > > >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 13:33:44 +0100]: > >> "Szabolcs Nagy" <nsz@port70.net> wrote: > >> >* Stefan Kanthak <stefan.kanthak@nexgo.de> [2019-12-11 10:55:29 +0100]: > >> > these two are not equivalent for snan input, but we dont care > >> > about snan, nor the compiler by default, so the compiler can > >> > optimize one to the other (although musl uses explicit int > >> > arithmetics instead of __builtin_isnan so it's a bit harder). > >> > >> The latter behaviour was my reason to use (x != x) here: I attempt > >> to replace as many function calls as possible with "normal" code, > >> and also try to avoid transfers to/from FPU/SSE registers to/from > >> integer registers if that does not result in faster/shorter code. > > > > why not just change the definition of isnan then? > > Because I did not want to introduce such a global change; until now my > patches are just local (peephole) optimisations. > > > #if __GNUC__ > xxx > > #define isnan(x) sizeof(x)==sizeof(float) ? __builtin_isnanf(x) : ... > > This is better than my proposed change, as it also avoids the side- > effect of (x != x) which can raise exceptions, and gets rid of the > explicit transfer to integer registers, which can hurt performance. > > The macros isinf(), isnormal(), isfinite(), signbit() should of > course be implemented in a similar way too, and the (internal only?) > functions __FLOAT_BITS() and __DOUBLE_BITS() removed completely! Not removed because the public headers support non-GNUC (or older GCC? I forget when these were introduced) compilers that may not provide these. Having the portable definitions present as the fallback case is still desirable. > PS: the following is just a "Gedankenspiel", extending the idea to > avoid transfers from/to SSE registers. > On x86-64, functions like isunordered(), copysign() etc. 
may be > implemented using SSE intrinsics _mm_*() as follows: > > #include <immintrin.h> > > int signbit(double argument) > { > return /* 1 & */ _mm_movemask_pd(_mm_set_sd(argument)); > } This is just a missed optimization the compiler should be able to do without intrinsics, on any arch where floating point types are kept in vector registers that can also do integer/bitmask operations. > uint32_t lrint(double argument) > { > return _mm_cvtsd_si32(_mm_set_sd(argument)); > } This is already done (on x86_64 where it's valid). It's in an asm source file but should be converted to a C source file with __asm__ and proper constraint, not intrinsics, because __asm__ is a compiler feature we require support for and intrinsics aren't (and also they have some really weird semantics with respect to how they interface with C aliasing rules). > double copysign(double magnitude, double sign) > { > return _mm_cvtsd_f64(_mm_or_pd(_mm_and_pd(_mm_set_sd(-0.0), _mm_set_sd(sign)), > _mm_andnot_pd(_mm_set_sd(-0.0), _mm_set_sd(magnitude)))); > } I don't think we have one like this for x86_64, but ideally the C would compile to something like it. (See above about missed optimization.) > Although the arguments and results are all held in SSE registers, > there's no way to use them directly; it's but necessary to > transfer them using _mm_set_sd() and _mm_cvtsd_f64(), which may > result in superfluous instructions emitted by the compiler. I don't see why you say that. They should be used in-place if possible just by virtue of how the compiler's IR works. Certainly for the __asm__ form they will be used in-place. 
> If you but cheat and "hide" these functions from the compiler > by placing them in a library, you can implement them as follows: > > __m128d fmin(__m128d x, __m128d y) > { > __m128d mask = _mm_cmp_sd(x, x, _CMP_ORD_Q); > > return _mm_or_pd(_mm_and_pd(mask, _mm_min_sd(y, x)), > _mm_andnot_pd(mask, y)); > } Yes, this kind of thing (hacks with declaring functions with wrong type to achieve an ABI result) is not something we really do in musl. But it shouldn't be needed here. Rich

```
On Wed, 11 Dec 2019, Stefan Kanthak wrote:
> This is better than my proposed change, as it also avoids the side-
> effect of (x != x) which can raise exceptions, and gets rid of the
> explicit transfer to integer registers, which can hurt performance.
I thought that
x != x
should NOT raise an exception, at least as I read Kahan's lecture notes.
Regards - Damian
Pacific Engineering Systems International, 277-279 Broadway, Glebe NSW 2037
Ph:+61-2-8571-0847 .. Fx:+61-2-9692-9623 | unsolicited email not wanted here
Views & opinions here are mine and not those of any past or present employer
```

"Rich Felker" <dalias@libc.org> wrote: > On Wed, Dec 11, 2019 at 10:17:09PM +0100, Stefan Kanthak wrote: [...] >> PS: the following is just a "Gedankenspiel", extending the idea to >> avoid transfers from/to SSE registers. >> On x86-64, functions like isunordered(), copysign() etc. may be >> implemented using SSE intrinsics _mm_*() as follows: >> >> #include <immintrin.h> >> >> int signbit(double argument) >> { >> return /* 1 & */ _mm_movemask_pd(_mm_set_sd(argument)); >> } > > This is just a missed optimization the compiler should be able to do > without intrinsics, on any arch where floating point types are kept in > vector registers that can also do integer/bitmask operations. The catch here is but that the MOVMSKPD instruction generated from _mm_movemask_pd() intrinsic yields its result in an integer register, so there's no need to do integer/bitmask operations on vector registers (and transfer them to an integer register afterwards). >> uint32_t lrint(double argument) >> { >> return _mm_cvtsd_si32(_mm_set_sd(argument)); >> } > > This is already done (on x86_64 where it's valid). It's in an asm > source file This is exactly the cheating I address below: the prototype of the assembler function matches the ABI, but not the C declaration. > but should be converted to a C source file with __asm__ > and proper constraint, not intrinsics, because __asm__ is a compiler > feature we require support for and intrinsics aren't (and also they > have some really weird semantics with respect to how they interface > with C aliasing rules). That's why I introduced this only as a "Gedankenspiel"! >> double copysign(double magnitude, double sign) >> { >> return _mm_cvtsd_f64(_mm_or_pd(_mm_and_pd(_mm_set_sd(-0.0), _mm_set_sd(sign)), >> _mm_andnot_pd(_mm_set_sd(-0.0), _mm_set_sd(magnitude)))); >> } > > I don't think we have one like this for x86_64, but ideally the C > would compile to something like it. (See above about missed > optimization.) 
Compilers typically emit superfluous PXOR/XORPD instructions here to clear the upper lane(s) of the vector registers, although _mm_*_sd() and _mm_*_ss() don't touch the upper lanes (so invalid values can't raise exceptions), and the bitmask operations _mm_*_pd() don't raise exceptions on SNANs, subnormals etc. >> Although the arguments and results are all held in SSE registers, >> there's no way to use them directly; it's but necessary to >> transfer them using _mm_set_sd() and _mm_cvtsd_f64(), which may >> result in superfluous instructions emitted by the compiler. > > I don't see why you say that. Just insert "in plain C" after "there's no way to use them directly" > They should be used in-place if possible just by virtue of how the > compiler's IR works. See above: most often XORPD or another instruction to clear/set the upper lane(s) is emitted. > Certainly for the __asm__ form they will be used in-place. Right. But that's the inline form of cheating.-) >> If you but cheat and "hide" these functions from the compiler >> by placing them in a library, you can implement them as follows: >> >> __m128d fmin(__m128d x, __m128d y) >> { >> __m128d mask = _mm_cmp_sd(x, x, _CMP_ORD_Q); >> >> return _mm_or_pd(_mm_and_pd(mask, _mm_min_sd(y, x)), >> _mm_andnot_pd(mask, y)); >> } > > Yes, this kind of thing (hacks with declaring functions with wrong > type to achieve an ABI result) is not something we really do in musl. > But it shouldn't be needed here. Remember that this is just a "Gedankenspiel". Stefan