* Szabolcs Nagy [2020-07-08 00:38:14 +0200]: > + > +/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ > +static inline uint64_t mul64(uint64_t a, uint64_t b) > +{ > + uint64_t ahi = a>>32; > + uint64_t alo = a&0xffffffff; > + uint64_t bhi = b>>32; > + uint64_t blo = b&0xffffffff; > + return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); > +} ... > +/* returns a*b exactly. */ > +static inline u128 mul64_128(uint64_t a, uint64_t b) > +{ > + u128 r; > + uint64_t ahi = a>>32; > + uint64_t alo = a&0xffffffff; > + uint64_t bhi = b>>32; > + uint64_t blo = b&0xffffffff; > + uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32); > + uint64_t lo2 = (alo*blo)&0xffffffff; > + r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32); > + r.lo = (lo1<<32) + lo2; > + return r; > +} > + > +/* returns a*b*2^-128 - e, with error 0 <= e < 3. */ > +static inline u128 mul128(u128 a, u128 b) > +{ > + u128 hi = mul64_128(a.hi, b.hi); > + u128 m1 = mul64_128(a.hi, b.lo); > + u128 m2 = mul64_128(a.lo, b.hi); > + return add64(add64(hi, m1.hi), m2.hi); > +} This does not have to be this precise; I should have spotted it earlier. Attached a fixed version — it should be a bit faster (and smaller, unless -Os is used, which prevents inlining). Reran the tests; they still pass.