mailing list of musl libc
 help / color / mirror / code / Atom feed
From: "Stefan Kanthak" <stefan.kanthak@nexgo.de>
To: "Szabolcs Nagy" <nsz@port70.net>
Cc: <musl@lists.openwall.com>
Subject: Re: [musl] [PATCH #2] Properly simplified nextafter()
Date: Sun, 15 Aug 2021 09:04:55 +0200	[thread overview]
Message-ID: <367A4018B58A4E308E2A95404362CBFB@H270> (raw)
In-Reply-To: <20210814234612.GH37904@port70.net>

Szabolcs Nagy <nsz@port70.net> wrote:

>* Stefan Kanthak <stefan.kanthak@nexgo.de> [2021-08-13 14:04:51 +0200]:
>> Szabolcs Nagy <nsz@port70.net> wrote on 2021-08-10 at 23:34:

>> > (the i386 machine where i originally tested this preferred int
>> > cmp and float cmp was very slow in the subnormal range
>>
>> This also and still holds for i386 FPU fadd/fmul as well as SSE
>> addsd/addss/mulss/mulsd additions/multiplies!
>
> they are avoided in the common case, and only used to create
> fenv side-effects.

Unfortunately but for hard & SOFT-float, where no fenv exists, as
Rich wrote.

>> --- -/src/math/nextafter.c
>> +++ +/src/math/nextafter.c
>> @@ -10,13 +10,13 @@
>>                 return x + y;
>>         if (ux.i == uy.i)
>>                 return y;
>> -       ax = ux.i & -1ULL/2;
>> -       ay = uy.i & -1ULL/2;
>> +       ax = ux.i << 2;
>> +       ay = uy.i << 2;
>
> the << 2 looks wrong, the top bit of the exponent is lost.

It IS wrong, but only in the post, not in the code I tested.

>>         if (ax == 0) {
>>                 if (ay == 0)
>>                         return y;
>>                 ux.i = (uy.i & 1ULL<<63) | 1;
>> -       } else if (ax > ay || ((ux.i ^ uy.i) & 1ULL<<63))
>> +       } else if ((ax < ay) == ((int64_t) ux.i < 0))
>>                 ux.i--;
>>         else
>>                 ux.i++;
> ...
>> How do you compare these 60 instructions/252 bytes to the code I posted
>> (23 instructions/72 bytes)?
>
> you should benchmark, but the second best is to look
> at the longest dependency chain in the hot path and
> add up the instruction latencies.

1 billion calls to nextafter(), with random from, and to either 0 or +INF:
run 1 against glibc,                         8.58 ns/call
run 2 against musl original,                 3.59
run 3 against musl patched,                  0.52
run 4 the pure floating-point variant from   0.72
      my initial post in this thread,
run 5 the assembly variant I posted.         0.28 ns/call

Now hurry up and patch your slowmotion code!

Stefan

PS: I cheated a very tiny little bit: the isnan() macro of musl patched is

#ifdef PATCH
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) << 1) > 0xff00000U : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) << 1) > 0xffe0000000000000ULL : \
__fpclassifyl(x) == FP_NAN)
#else
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \
__fpclassifyl(x) == FP_NAN)
#endif // PATCH

PPS: and of course the log from the benchmarks...

[stefan@rome ~]$ lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                16
On-line CPU(s) list:   0-15
Thread(s) per core:    2
Core(s) per socket:    8
Socket(s):             1
NUMA node(s):          1
Vendor ID:             AuthenticAMD
CPU family:            23
Model:                 49
Model name:            AMD EPYC 7262 8-Core Processor
Stepping:              0
CPU MHz:               3194.323
BogoMIPS:              6388.64
Virtualization:        AMD-V
L1d cache:             32K
L1i cache:             32K
L2 cache:              512K
L3 cache:              16384K
...
[stefan@rome ~]$ gcc --version
gcc (GCC) 8.3.1 20190311 (Red Hat 8.3.1-3)
Copyright (C) 2018 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[stefan@rome ~]$ cat patch.c
#include <math.h>

static __inline unsigned __FLOAT_BITS(float __f)
{
 union {float __f; unsigned __i;} __u;
 __u.__f = __f;
 return __u.__i;
}

static __inline unsigned long long __DOUBLE_BITS(double __f)
{
 union {double __f; unsigned long long __i;} __u;
 __u.__f = __f;
 return __u.__i;
}

#ifdef PATCH
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) << 1) > 0xff00000U : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) << 1) > 0xffe0000000000000ULL : \
__fpclassifyl(x) == FP_NAN)
#else
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \
__fpclassifyl(x) == FP_NAN)
#endif // PATCH

__attribute__((noinline))
double nextafter(double x, double y)
{
 union {double f; unsigned long long i;} ux={x}, uy={y};
 unsigned long long ax, ay;
 int e;

 if (isnan(x) || isnan(y))
  return x + y;
 if (ux.i == uy.i)
  return y;
#ifdef PATCH
 ax = ux.i << 1;
 ay = uy.i << 1;
#else
 ax = ux.i & -1ULL/2;
 ay = uy.i & -1ULL/2;
#endif
 if (ax == 0) {
  if (ay == 0)
   return y;
  ux.i = (uy.i & 1ULL<<63) | 1;
#ifdef PATCH
 } else if (ax < ay == (long long) ux.i < 0)
#else
 } else if (ax > ay || ((ux.i ^ uy.i) & 1ULL<<63))
#endif
  ux.i--;
 else
  ux.i++;
 e = ux.i >> 52 & 0x7ff;
 /* raise overflow if ux.f is infinite and x is finite */
 if (e == 0x7ff)
  FORCE_EVAL(x + x);
 /* raise underflow if ux.f is subnormal or zero */
 if (e == 0)
  FORCE_EVAL(x*x + ux.f*ux.f);
 return ux.f;
}
[stefan@rome ~]$ cat bench.c
// Copyright © 2005-2021, Stefan Kanthak <stefan.kanthak@nexgo.de>

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

__attribute__((noinline))
double nop(double foo, double bar)
{
    return foo + bar;
}

inline static
double lfsr64(void)
{
    // 64-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
    //   initialised with 2**64 / golden ratio

    static uint64_t lfsr = 0x9E3779B97F4A7C15;
    const  uint64_t sign = (int64_t) lfsr >> 63;

    lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);

    return *(double *) &lfsr;
}

inline static
double random64(void)
{
    static uint64_t seed = 0x0123456789ABCDEF;

    seed = seed * 6364136223846793005 + 1442695040888963407;

    return *(double *) &seed;
}

int main(void)
{
    clock_t t0, t1, t2, tt;
    uint32_t n;
    volatile double result;

    t0 = clock();

    for (n = 500000000u; n > 0u; n--) {
        result = nop(lfsr64(), 0.0);
        result = nop(random64(), 1.0 / 0.0);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; n--) {
        result = nextafter(lfsr64(), 0.0);
        result = nextafter(random64(), 1.0 / 0.0);
    }

    t2 = clock();
    tt = t2 - t0; t2 -= t1; t1 -= t0; t0 = t2 - t1;

    printf("\n"
           "nop()         %4lu.%06lu       0\n"
           "nextafter()   %4lu.%06lu    %4lu.%06lu\n"
           "              %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
}
[stefan@rome ~]$ gcc -O3 -lm bench.c
[stefan@rome ~]$ perf stat ./a.out

nop()            1.480000       0
nextafter()     10.060000       8.580000
                11.540000 nano-seconds

 Performance counter stats for './a.out':

         11,548.78 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               145      page-faults:u             #    0.013 K/sec
    38,917,213,536      cycles:u                  #    3.370 GHz                      (83.33%)
    15,647,656,615      stalled-cycles-frontend:u #   40.21% frontend cycles idle     (83.33%)
    10,746,044,422      stalled-cycles-backend:u  #   27.61% backend cycles idle      (83.33%)
    69,739,403,870      instructions:u            #    1.79  insn per cycle
                                                  #    0.22  stalled cycles per insn  (83.33%)
    16,495,748,110      branches:u                # 1428.354 M/sec                    (83.33%)
       500,701,246      branch-misses:u           #    3.04% of all branches          (83.33%)

      11.550414029 seconds time elapsed

      11.548265000 seconds user
       0.000999000 seconds sys


[stefan@rome ~]$ gcc -O3 bench.c patch.c
patch.c:23: warning: "isnan" redefined
 #define isnan(x) ( \

In file included from patch.c:1:
/usr/include/math.h:254: note: this is the location of the previous definition
 #  define isnan(x) \

[stefan@rome ~]$ perf stat ./a.out

nop()            1.480000       0
nextafter()      5.070000       3.590000
                 6.550000 nano-seconds

 Performance counter stats for './a.out':

          6,558.44 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               122      page-faults:u             #    0.019 K/sec
    22,038,054,931      cycles:u                  #    3.360 GHz                      (83.33%)
           204,849      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.34%)
     5,497,340,276      stalled-cycles-backend:u  #   24.94% backend cycles idle      (83.34%)
    39,751,746,482      instructions:u            #    1.80  insn per cycle
                                                  #    0.14  stalled cycles per insn  (83.34%)
     9,747,428,086      branches:u                # 1486.242 M/sec                    (83.34%)
       250,407,409      branch-misses:u           #    2.57% of all branches          (83.33%)

       6.559550924 seconds time elapsed

       6.558918000 seconds user
       0.000000000 seconds sys


[stefan@rome ~]$ gcc -O3 bench.c -DPATCH patch.c
patch.c:18: warning: "isnan" redefined
 #define isnan(x) ( \

In file included from patch.c:1:
/usr/include/math.h:254: note: this is the location of the previous definition
 #  define isnan(x) \

[stefan@rome ~]$ perf stat ./a.out

nop()            1.480000       0
nextafter()      2.000000       0.520000
                 3.480000 nano-seconds

 Performance counter stats for './a.out':

          3,489.59 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               123      page-faults:u             #    0.035 K/sec
    11,721,075,764      cycles:u                  #    3.359 GHz                      (83.32%)
           132,680      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.32%)
     4,500,051,993      stalled-cycles-backend:u  #   38.39% backend cycles idle      (83.32%)
    40,501,721,908      instructions:u            #    3.46  insn per cycle
                                                  #    0.11  stalled cycles per insn  (83.33%)
     8,494,571,027      branches:u                # 2434.258 M/sec                    (83.35%)
           497,230      branch-misses:u           #    0.01% of all branches          (83.34%)

       3.490437579 seconds time elapsed

       3.490053000 seconds user
       0.000000000 seconds sys


[stefan@rome ~]$ gcc -O3 patch.c nextafter-fp.c
[stefan@rome ~]$ perf stat ./a.out

nop()            1.490000       0
nextafter()      2.210000       0.720000
                 3.700000 nano-seconds

 Performance counter stats for './a.out':

          3,702.89 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               122      page-faults:u             #    0.033 K/sec
    12,407,345,183      cycles:u                  #    3.351 GHz                      (83.32%)
           135,817      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.34%)
     5,498,895,906      stalled-cycles-backend:u  #   44.32% backend cycles idle      (83.34%)
    38,002,430,460      instructions:u            #    3.06  insn per cycle
                                                  #    0.14  stalled cycles per insn  (83.34%)
     7,497,381,393      branches:u                # 2024.735 M/sec                    (83.34%)
           497,462      branch-misses:u           #    0.01% of all branches          (83.32%)

       3.703648875 seconds time elapsed

       3.703294000 seconds user
       0.000000000 seconds sys


[stefan@rome ~]$ gcc -O3 bench.c nextafter.s
[stefan@rome ~]$ perf stat ./a.out

nop()            1.630000       0
nextafter()      1.910000       0.280000
                 3.540000 nano-seconds

 Performance counter stats for './a.out':

          3,547.12 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               123      page-faults:u             #    0.035 K/sec
    11,949,840,797      cycles:u                  #    3.369 GHz                      (83.32%)
           129,627      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.34%)
     3,998,960,716      stalled-cycles-backend:u  #   33.46% backend cycles idle      (83.34%)
    37,493,523,285      instructions:u            #    3.14  insn per cycle
                                                  #    0.11  stalled cycles per insn  (83.34%)
     7,998,559,192      branches:u                # 2254.945 M/sec                    (83.34%)
           497,565      branch-misses:u           #    0.01% of all branches          (83.32%)

       3.547999008 seconds time elapsed

       3.546671000 seconds user
       0.000999000 seconds sys


[stefan@rome ~]$


  reply	other threads:[~2021-08-15  7:13 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-10  6:23 [musl] [PATCH] " Stefan Kanthak
2021-08-10 21:34 ` Szabolcs Nagy
2021-08-10 22:53   ` Stefan Kanthak
2021-08-11  2:40     ` Rich Felker
2021-08-11 15:44       ` Stefan Kanthak
2021-08-11 16:09         ` Rich Felker
2021-08-11 16:50           ` Stefan Kanthak
2021-08-11 17:57             ` Rich Felker
2021-08-11 22:16               ` Szabolcs Nagy
2021-08-11 22:43                 ` Stefan Kanthak
2021-08-12  0:59                   ` Rich Felker
2021-08-11  8:23     ` Szabolcs Nagy
2021-08-13 12:04   ` [musl] [PATCH #2] " Stefan Kanthak
2021-08-13 15:59     ` Rich Felker
2021-08-13 18:30       ` Stefan Kanthak
2021-08-14  4:07         ` Damian McGuckin
2021-08-14 22:45           ` Szabolcs Nagy
2021-08-14 23:46     ` Szabolcs Nagy
2021-08-15  7:04       ` Stefan Kanthak [this message]
2021-08-15  7:46         ` Ariadne Conill
2021-08-15 13:59           ` Rich Felker
2021-08-15 14:57             ` Ariadne Conill
2021-08-15  8:24         ` Damian McGuckin
2021-08-15 14:03           ` Rich Felker
2021-08-15 15:10             ` Damian McGuckin
2021-08-15 14:56         ` Szabolcs Nagy
2021-08-15 15:19           ` Stefan Kanthak
2021-08-15 15:48             ` Rich Felker
2021-08-15 16:29               ` Stefan Kanthak
2021-08-15 16:49                 ` Rich Felker
2021-08-15 20:52                   ` Stefan Kanthak
2021-08-15 21:48                     ` Rich Felker
2021-08-15 15:52             ` Ariadne Conill
2021-08-15 16:09               ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=367A4018B58A4E308E2A95404362CBFB@H270 \
    --to=stefan.kanthak@nexgo.de \
    --cc=musl@lists.openwall.com \
    --cc=nsz@port70.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).