Deduplicating atomics written in terms of CAS

mailing list of musl libc
 help / color / mirror / code / Atom feed

* Deduplicating atomics written in terms of CAS
@ 2015-05-17  4:55 Rich Felker
  2015-05-17  6:00 ` Alexander Monakov
  2015-05-17  6:49 ` Jens Gustedt
  0 siblings, 2 replies; 15+ messages in thread
From: Rich Felker @ 2015-05-17  4:55 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1293 bytes --]

Lots of archs define most or all of their atomics except a_cas in
terms of a_cas. The attached atomic.h is a proposed replacement for
arch-specific atomic.h that would live in src/internal. The arch
atomic.h files would be replaced with atomic_arch.h, which could opt
to define nothing but a_cas, or could define more primitives itself if
it can do so more efficiently.

The second attachment, atomic_generic.h, is an implementation of the
atomics (and other non-atomic ops we've traditionally had in atomic.h)
using GNU C builtins. This file can be used as-is for any new archs
that satisfy the following conditions:

- They're not supported by compilers too old to have the __sync_*
  builtins.

- They don't need runtime switching/detection of atomic
  implementations.

- GCC doesn't generate pathologically bad code for the builtins.

Even if some of these conditions aren't met, atomic_generic.h may be
useful when porting to help determine the right way to implement
atomics by watching what the compiler does, and it may be suitable for
early porting stages before a proper atomic.h is written.

I'm not sure if I'll try to integrate this stuff right away or as part
of the bits deduplication and build system overhaul that's been
coming-soon for a long time.

Comments welcome.

Rich

[-- Attachment #2: atomic.h --]
[-- Type: text/plain, Size: 3009 bytes --]

#ifndef _ATOMIC_H
#define _ATOMIC_H

#include "atomic_arch.h"

#ifndef a_swap
static inline int a_swap(volatile int *p, int v)
{
	int old;
	do old = *p;
	while (a_cas(p, old, v) != old);
	return old;
}
#endif

#ifndef a_fetch_add
static inline int a_fetch_add(volatile int *p, int v)
{
	int old;
	do old = *p;
	while (a_cas(p, old, old+v) != old);
	return old;
}
#endif

#ifndef a_and
static inline void a_and(volatile int *p, int v)
{
	int old;
	do old = *p;
	while (a_cas(p, old, old&v) != old);
}
#endif

#ifndef a_or
static inline void a_or(volatile int *p, int v)
{
	int old;
	do old = *p;
	while (a_cas(p, old, old|v) != old);
}
#endif

#ifndef a_inc
static inline void a_inc(volatile int *p)
{
	a_fetch_add(p, 1);
}
#endif

#ifndef a_dec
static inline void a_dec(volatile int *p)
{
	a_fetch_add(p, -1);
}
#endif

#ifndef a_store
static inline void a_store(volatile int *p, int v)
{
#ifdef a_barrier
	a_barrier();
	*p = v;
	a_barrier();
#else
	a_swap(p, v);
#endif
}
#endif

#ifndef a_barrier
static void a_barrier()
{
	volatile int tmp = 0;
	a_cas(&tmp, 0, 0);
}
#endif

#ifndef a_spin
#define a_spin a_barrier
#endif

#ifndef a_and_64
static inline void a_and_64(volatile uint64_t *p, uint64_t v)
{
	union { uint64_t v; uint32_t r[2]; } u = { v };
	if (u.r[0]+1) a_and((int *)p, u.r[0]);
	if (u.r[1]+1) a_and((int *)p+1, u.r[1]);
}
#endif

#ifndef a_or_64
static inline void a_or_64(volatile uint64_t *p, uint64_t v)
{
	union { uint64_t v; uint32_t r[2]; } u = { v };
	if (u.r[0]) a_or((int *)p, u.r[0]);
	if (u.r[1]) a_or((int *)p+1, u.r[1]);
}
#endif

#ifndef a_cas_p
static inline void *a_cas_p(volatile void *p, void *t, void *s)
{
	return (void *)a_cas(p, (int)t, (int)s);
}
#endif

#ifndef a_or_l
static inline void a_or_l(volatile void *p, long v)
{
	if (sizeof(long) == sizeof(int)) a_or(p, v);
	else a_or_64(p, v);
}
#endif

#ifndef a_crash
static inline void a_crash()
{
	*(volatile char *)0=0;
}
#endif

#ifndef a_ctz_64
static inline int a_ctz_64(uint64_t x)
{
	static const char debruijn64[64] = {
		0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
		62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
		63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
		51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
	};
	static const char debruijn32[32] = {
		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
	};
	if (sizeof(long) < 8) {
		uint32_t y = x;
		if (!y) {
			y = x>>32;
			return 32 + debruijn32[(y&-y)*0x076be629 >> 27];
		}
		return debruijn32[(y&-y)*0x076be629 >> 27];
	}
	return debruijn64[(x&-x)*0x022fdd63cc95386dull >> 58];
}
#endif

#ifndef a_ctz_l
static inline int a_ctz_l(unsigned long x)
{
	static const char debruijn32[32] = {
		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
	};
	if (sizeof(long) == 8) return a_ctz_64(x);
	return debruijn32[(x&-x)*0x076be629 >> 27];
}
#endif

#endif

[-- Attachment #3: atomic_generic.h --]
[-- Type: text/plain, Size: 1063 bytes --]

static inline int a_cas(volatile int *p, int t, int s)
{
	return __sync_val_compare_and_swap(p, t, s);
}

static inline void *a_cas_p(volatile void **p, void *t, void *s)
{
	return __sync_val_compare_and_swap(p, t, s);
}

static inline int a_swap(volatile int *p, int v)
{
	int old;
	do old = *p;
	while (!__sync_bool_compare_and_swap(p, old, v));
	return old;
}

static inline int a_fetch_add(volatile int *p, int v)
{
	return __sync_fetch_and_add(p, v);
}

static inline void a_and(volatile int *p, int v)
{
	__sync_fetch_and_and(p, v);
}

static inline void a_or(volatile int *p, int v)
{
	__sync_fetch_and_or(p, v);
}

static void a_barrier()
{
	__sync_synchronize();
}

static inline void a_crash()
{
	__builtin_trap();
}

static inline int a_ctz_l(unsigned long x)
{
	return __builtin_ctzl(x);
}

static inline int a_ctz_64(uint64_t x)
{
	return __builtin_ctzll(x);
}

#define a_cas a_cas
#define a_cas_p a_cas_p
#define a_swap a_swap
#define a_fetch_add
#define a_and
#define a_or
#define a_barrier
#define a_crash
#define a_ctz_l
#define a_ctz_64

#endif

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17  4:55 Deduplicating atomics written in terms of CAS Rich Felker
@ 2015-05-17  6:00 ` Alexander Monakov
  2015-05-17  6:14   ` Rich Felker
  2015-05-17  6:49 ` Jens Gustedt
  1 sibling, 1 reply; 15+ messages in thread
From: Alexander Monakov @ 2015-05-17  6:00 UTC (permalink / raw)
  To: musl

It seems the prototype of a_cas_p has changed:

    void *a_cas_p(volatile void **p, void *t, void *s)

Shouldn't the type of the first argument be 'void * volatile *'?

Alexander


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17  6:00 ` Alexander Monakov
@ 2015-05-17  6:14   ` Rich Felker
  2015-05-17  7:37     ` Jens Gustedt
  0 siblings, 1 reply; 15+ messages in thread
From: Rich Felker @ 2015-05-17  6:14 UTC (permalink / raw)
  To: musl

On Sun, May 17, 2015 at 09:00:15AM +0300, Alexander Monakov wrote:
> It seems the prototype of a_cas_p has changed:
> 
>     void *a_cas_p(volatile void **p, void *t, void *s)
> 
> Shouldn't the type of the first argument be 'void * volatile *'?

Yes, thanks for catching it. BTW changing from the current void* to
this will probably expose minor aliasing bugs I want to fix. I might
just change all pointers used with a_cas_p to uintptr_t, and have
a_cas_p work on uintptr_t, and force the code using them to cast back
and forth to real pointers. Alternatively we could try to get rid of
a_cas_p entirely. I'd like to trim down both the set of atomic
primitives we're using and the number of direct uses of atomics
(versus higher-level primitives like locks). Some candidates to
remove:

- a_cas_p
- a_or_l (only used in sigaction; could be replaced by a_or)
- a_and (not used at all)
- a_and_64/a_or_64 (malloc only; these are misnamed too)

Actually a_cas_p is the hardest to remove; while none of the users of
it are performance-critical themselves, they are using it as a means
of avoiding locking where the consumer of the data being written can't
or doesn't want to require a lock.

Rich

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17  4:55 Deduplicating atomics written in terms of CAS Rich Felker
  2015-05-17  6:00 ` Alexander Monakov
@ 2015-05-17  6:49 ` Jens Gustedt
  2015-05-17 16:22   ` Rich Felker
  1 sibling, 1 reply; 15+ messages in thread
From: Jens Gustedt @ 2015-05-17  6:49 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 2858 bytes --]

Hello,

Am Sonntag, den 17.05.2015, 00:55 -0400 schrieb Rich Felker:
> Lots of archs define most or all of their atomics except a_cas in
> terms of a_cas. The attached atomic.h is a proposed replacement for
> arch-specific atomic.h that would live in src/internal. The arch
> atomic.h files would be replaced with atomic_arch.h, which could opt
> to define nothing but a_cas, or could define more primitives itself if
> it can do so more efficiently.

I like the approach

> The second attachment, atomic_generic.h, is an implementation of the
> atomics (and other non-atomic ops we've traditionally had in atomic.h)
> using GNU C builtins. This file can be used as-is for any new archs
> that satisfy the following conditions:
>
> - They're not supported by compilers too old to have the __sync_*
>   builtins.
> 
> - They don't need runtime switching/detection of atomic
>   implementations.
> 
> - GCC doesn't generate pathologically bad code for the builtins.

shouldn't this file then define or macros such as a_swap, too ?

On quick inspection I found issues with the two 64 bit functions:

#ifndef a_and_64
static inline void a_and_64(volatile uint64_t *p, uint64_t v)
{
        union { uint64_t v; uint32_t r[2]; } u = { v };
        if (u.r[0]+1) a_and((int *)p, u.r[0]);
        if (u.r[1]+1) a_and((int *)p+1, u.r[1]);
}
#endif

#ifndef a_or_64
static inline void a_or_64(volatile uint64_t *p, uint64_t v)
{
        union { uint64_t v; uint32_t r[2]; } u = { v };
        if (u.r[0]) a_or((int *)p, u.r[0]);
        if (u.r[1]) a_or((int *)p+1, u.r[1]);
}
#endif

First I don't get it how we can expect these to be be atomic. It looks
to me that the two 32 bit words can be updated with quite a laps of
time between them if the thread is delayed. I didn't check this, do we
really need 64bit atomics?

Then, the mix of uint32_t and int is unfortunate. This code is in
header files and thus visible to all compilation units, especially
user code that might use any optimization option that the compiler
offers. The cast to int* breaks aliasing rules, so compilers that are
used with aggressive optimization may produce wrong executables, in
pretending that *p didn't change.

I only recently learned that even cast to volatile doesn't help in
cases where the original object to which p points is not declared
volatile. The C standard states that only volatile *declared* objects
are subject to the rules of volatile. Accessing through a volatile
pointer doesn't help.

Jens

-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17  6:14   ` Rich Felker
@ 2015-05-17  7:37     ` Jens Gustedt
  2015-05-17 16:28       ` Rich Felker
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Gustedt @ 2015-05-17  7:37 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1386 bytes --]

Am Sonntag, den 17.05.2015, 02:14 -0400 schrieb Rich Felker:
> - a_and_64/a_or_64 (malloc only; these are misnamed too)

I should have checked the use before my last mail. They are
definitively misnamed.

Both uses of them look ok concerning atomicity, only one of the a_and
or a_or calls triggers.

The only object (mal.binmap) to which this is applied is in fact
volatile, so it must actually be reloaded all the time it is used.

But in line 352 the code uses another assumption, then, that 64 bit
loads always are atomic. I don't see why this should hold in general.

We already have a similar assumption for 32 bit int all over the
place, and I am not too happy with such "silent" assumption. For 64
bit, this assumption looks wrong to me.

I would be much happier by using explicit atomic types and atomic load
functions or macros everywhere. For normal builds these could be dummy
types made to resolve to the actual code that we have, now. But this
would allow to have hardening builds, that check for consistency of
all atomic accesses.

Jens

-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17  6:49 ` Jens Gustedt
@ 2015-05-17 16:22   ` Rich Felker
  2015-05-17 17:19     ` Jens Gustedt
  0 siblings, 1 reply; 15+ messages in thread
From: Rich Felker @ 2015-05-17 16:22 UTC (permalink / raw)
  To: musl

On Sun, May 17, 2015 at 08:49:04AM +0200, Jens Gustedt wrote:
> Hello,
> 
> Am Sonntag, den 17.05.2015, 00:55 -0400 schrieb Rich Felker:
> > Lots of archs define most or all of their atomics except a_cas in
> > terms of a_cas. The attached atomic.h is a proposed replacement for
> > arch-specific atomic.h that would live in src/internal. The arch
> > atomic.h files would be replaced with atomic_arch.h, which could opt
> > to define nothing but a_cas, or could define more primitives itself if
> > it can do so more efficiently.
> 
> I like the approach
> 
> > The second attachment, atomic_generic.h, is an implementation of the
> > atomics (and other non-atomic ops we've traditionally had in atomic.h)
> > using GNU C builtins. This file can be used as-is for any new archs
> > that satisfy the following conditions:
> >
> > - They're not supported by compilers too old to have the __sync_*
> >   builtins.
> > 
> > - They don't need runtime switching/detection of atomic
> >   implementations.
> > 
> > - GCC doesn't generate pathologically bad code for the builtins.
> 
> shouldn't this file then define or macros such as a_swap, too ?

Hm? I don't understand what you're asking.

> On quick inspection I found issues with the two 64 bit functions:
> 
> #ifndef a_and_64
> static inline void a_and_64(volatile uint64_t *p, uint64_t v)
> {
>         union { uint64_t v; uint32_t r[2]; } u = { v };
>         if (u.r[0]+1) a_and((int *)p, u.r[0]);
>         if (u.r[1]+1) a_and((int *)p+1, u.r[1]);
> }
> #endif
> 
> #ifndef a_or_64
> static inline void a_or_64(volatile uint64_t *p, uint64_t v)
> {
>         union { uint64_t v; uint32_t r[2]; } u = { v };
>         if (u.r[0]) a_or((int *)p, u.r[0]);
>         if (u.r[1]) a_or((int *)p+1, u.r[1]);
> }
> #endif
> 
> First I don't get it how we can expect these to be be atomic. It looks
> to me that the two 32 bit words can be updated with quite a laps of
> time between them if the thread is delayed. I didn't check this, do we
> really need 64bit atomics?

These are misnomers. They're only used/needed as atomic bit-set and
bit-clear. It would be nice to eliminate them completely, but malloc
is using them right now. It would be easy to put the above logic
directly in malloc and have the bitmasks be kept as a union of
uint64_t and int[], but that's mildly ugly too I think.

> Then, the mix of uint32_t and int is unfortunate. This code is in
> header files and thus visible to all compilation units, especially
> user code that might use any optimization option that the compiler
> offers. The cast to int* breaks aliasing rules, so compilers that are
> used with aggressive optimization may produce wrong executables, in
> pretending that *p didn't change.

The cast itself doesn't break aliasing rules. Only accessing the
memory as int does that. The intent was that a_or would only access
the object via asm, so the C type rules would not apply -- that's how
things originally worked when we only had i386 and x86_64. But now
that a_or is a C wrapper for a_cas on many/most archs, we do have an
aliasing problem, I think. That makes me more eagar to remove these.

> I only recently learned that even cast to volatile doesn't help in
> cases where the original object to which p points is not declared
> volatile. The C standard states that only volatile *declared* objects
> are subject to the rules of volatile. Accessing through a volatile
> pointer doesn't help.

I'm not so sure about that. See this question on SO, which has two
conflicting and both reasonable-sounding answers:

http://stackoverflow.com/questions/28654418/requirements-for-behavior-of-pointer-to-volatile-pointing-to-non-volatile-object

In any case, all objects used with atomics in musl are declared
volatile now, or that is the intent anyway. If I missed any please let
me know.

Rich


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17  7:37     ` Jens Gustedt
@ 2015-05-17 16:28       ` Rich Felker
  2015-05-17 16:59         ` Jens Gustedt
  0 siblings, 1 reply; 15+ messages in thread
From: Rich Felker @ 2015-05-17 16:28 UTC (permalink / raw)
  To: musl

On Sun, May 17, 2015 at 09:37:19AM +0200, Jens Gustedt wrote:
> Am Sonntag, den 17.05.2015, 02:14 -0400 schrieb Rich Felker:
> > - a_and_64/a_or_64 (malloc only; these are misnamed too)
> 
> I should have checked the use before my last mail. They are
> definitively misnamed.
> 
> Both uses of them look ok concerning atomicity, only one of the a_and
> or a_or calls triggers.
> 
> The only object (mal.binmap) to which this is applied is in fact
> volatile, so it must actually be reloaded all the time it is used.
> 
> But in line 352 the code uses another assumption, then, that 64 bit
> loads always are atomic. I don't see why this should hold in general.

I don't think there's such an assumption. The only assumption is that
each bit is read exactly the number of times it would be on the
abstract machine, so that we can't observe inconsistent values for the
same object. Lack of any heavy synchronization around reading the mask
may result in failure to see some changes or seeing them out of order,
but it doesn't matter: If a bin is wrongly seen as non-empty, locking
and attempting to unbin from it will fail. If it is wrongly seen as
empty, the worst that can happen is a less-optimal (but would have
been optimal an instant earlier) larger chunk gets split instead of
using a smaller one to satisfy the allocation.

Of course it's an open question whether the complex atomics and
fine-grained locking in malloc help or hurt performance more on
average. I'd really like to measure this at some point. Overhauling
malloc to try to get significantly better multi-threaded performance
without the fragmentation-optimality sacrifices other mallocs make is
a long-term goal I have open.

> We already have a similar assumption for 32 bit int all over the
> place, and I am not too happy with such "silent" assumption. For 64
> bit, this assumption looks wrong to me.

I agree I wouldn't be happy with such an assumption, but I don't think
it's being made here.

> I would be much happier by using explicit atomic types and atomic load
> functions or macros everywhere. For normal builds these could be dummy
> types made to resolve to the actual code that we have, now. But this
> would allow to have hardening builds, that check for consistency of
> all atomic accesses.

There is no way to do an atomic 64-bit load on most of the archs we
support. So trying to make it explicit wouldn't help.

Rich

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 16:28       ` Rich Felker
@ 2015-05-17 16:59         ` Jens Gustedt
  2015-05-17 17:59           ` Rich Felker
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Gustedt @ 2015-05-17 16:59 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 3546 bytes --]

Am Sonntag, den 17.05.2015, 12:28 -0400 schrieb Rich Felker:
> On Sun, May 17, 2015 at 09:37:19AM +0200, Jens Gustedt wrote:
> > Am Sonntag, den 17.05.2015, 02:14 -0400 schrieb Rich Felker:
> > > - a_and_64/a_or_64 (malloc only; these are misnamed too)
> > 
> > I should have checked the use before my last mail. They are
> > definitively misnamed.
> > 
> > Both uses of them look ok concerning atomicity, only one of the a_and
> > or a_or calls triggers.
> > 
> > The only object (mal.binmap) to which this is applied is in fact
> > volatile, so it must actually be reloaded all the time it is used.
> > 
> > But in line 352 the code uses another assumption, then, that 64 bit
> > loads always are atomic. I don't see why this should hold in general.
> 
> I don't think there's such an assumption. The only assumption is that
> each bit is read exactly the number of times it would be on the
> abstract machine, so that we can't observe inconsistent values for the
> same object. Lack of any heavy synchronization around reading the mask
> may result in failure to see some changes or seeing them out of order,
> but it doesn't matter: If a bin is wrongly seen as non-empty, locking
> and attempting to unbin from it will fail. If it is wrongly seen as
> empty, the worst that can happen is a less-optimal (but would have
> been optimal an instant earlier) larger chunk gets split instead of
> using a smaller one to satisfy the allocation.

So to summarize what you are saying that in this special context, an
out-of-sync load of one of the sub-words of a 64 bit word, would only
impact on performance and not correctness. Nice.

A maybe stupid question, then: why do atomics at all, here? You could
perhaps remove all that 64 bit pseudo atomic stuff then.

> Of course it's an open question whether the complex atomics and
> fine-grained locking in malloc help or hurt performance more on
> average. I'd really like to measure this at some point. Overhauling
> malloc to try to get significantly better multi-threaded performance
> without the fragmentation-optimality sacrifices other mallocs make is
> a long-term goal I have open.
> 
> > We already have a similar assumption for 32 bit int all over the
> > place, and I am not too happy with such "silent" assumption. For 64
> > bit, this assumption looks wrong to me.
> 
> I agree I wouldn't be happy with such an assumption, but I don't think
> it's being made here.
> 
> > I would be much happier by using explicit atomic types and atomic load
> > functions or macros everywhere. For normal builds these could be dummy
> > types made to resolve to the actual code that we have, now. But this
> > would allow to have hardening builds, that check for consistency of
> > all atomic accesses.
> 
> There is no way to do an atomic 64-bit load on most of the archs we
> support. So trying to make it explicit wouldn't help.

Ah sorry, I probably went too fast. My last paragraph would be for all
atomic operations, so in particular 32 bit. A macro "a_load" would
make intentions clearer and would perhaps allow to implement an
optional compile time check to see if we use any object consistently
as atomic or not.

Jens

-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::




[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 16:22   ` Rich Felker
@ 2015-05-17 17:19     ` Jens Gustedt
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Gustedt @ 2015-05-17 17:19 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 5056 bytes --]

Am Sonntag, den 17.05.2015, 12:22 -0400 schrieb Rich Felker:
> On Sun, May 17, 2015 at 08:49:04AM +0200, Jens Gustedt wrote:
> > Hello,
> > 
> > Am Sonntag, den 17.05.2015, 00:55 -0400 schrieb Rich Felker:
> > > Lots of archs define most or all of their atomics except a_cas in
> > > terms of a_cas. The attached atomic.h is a proposed replacement for
> > > arch-specific atomic.h that would live in src/internal. The arch
> > > atomic.h files would be replaced with atomic_arch.h, which could opt
> > > to define nothing but a_cas, or could define more primitives itself if
> > > it can do so more efficiently.
> > 
> > I like the approach
> > 
> > > The second attachment, atomic_generic.h, is an implementation of the
> > > atomics (and other non-atomic ops we've traditionally had in atomic.h)
> > > using GNU C builtins. This file can be used as-is for any new archs
> > > that satisfy the following conditions:
> > >
> > > - They're not supported by compilers too old to have the __sync_*
> > >   builtins.
> > > 
> > > - They don't need runtime switching/detection of atomic
> > >   implementations.
> > > 
> > > - GCC doesn't generate pathologically bad code for the builtins.
> > 
> > shouldn't this file then define or macros such as a_swap, too ?
> 
> Hm? I don't understand what you're asking.

Ah, I missed the list of defines that are actually all hidden at the
end of the file. I would have preference to put them in the beginning
of the file.

> > On quick inspection I found issues with the two 64 bit functions:
> > 
> > #ifndef a_and_64
> > static inline void a_and_64(volatile uint64_t *p, uint64_t v)
> > {
> >         union { uint64_t v; uint32_t r[2]; } u = { v };
> >         if (u.r[0]+1) a_and((int *)p, u.r[0]);
> >         if (u.r[1]+1) a_and((int *)p+1, u.r[1]);
> > }
> > #endif
> > 
> > #ifndef a_or_64
> > static inline void a_or_64(volatile uint64_t *p, uint64_t v)
> > {
> >         union { uint64_t v; uint32_t r[2]; } u = { v };
> >         if (u.r[0]) a_or((int *)p, u.r[0]);
> >         if (u.r[1]) a_or((int *)p+1, u.r[1]);
> > }
> > #endif
> > 
> > First I don't get it how we can expect these to be be atomic. It looks
> > to me that the two 32 bit words can be updated with quite a laps of
> > time between them if the thread is delayed. I didn't check this, do we
> > really need 64bit atomics?
> 
> These are misnomers. They're only used/needed as atomic bit-set and
> bit-clear. It would be nice to eliminate them completely, but malloc
> is using them right now. It would be easy to put the above logic
> directly in malloc and have the bitmasks be kept as a union of
> uint64_t and int[], but that's mildly ugly too I think.
> 
> > Then, the mix of uint32_t and int is unfortunate. This code is in
> > header files and thus visible to all compilation units, especially
> > user code that might use any optimization option that the compiler
> > offers. The cast to int* breaks aliasing rules, so compilers that are
> > used with aggressive optimization may produce wrong executables, in
> > pretending that *p didn't change.
> 
> The cast itself doesn't break aliasing rules. Only accessing the
> memory as int does that.

yes, sure, that's what I meant :)

> The intent was that a_or would only access
> the object via asm, so the C type rules would not apply -- that's how
> things originally worked when we only had i386 and x86_64. But now
> that a_or is a C wrapper for a_cas on many/most archs, we do have an
> aliasing problem, I think. That makes me more eagar to remove these.

yes, see my other answer about malloc

> > I only recently learned that even cast to volatile doesn't help in
> > cases where the original object to which p points is not declared
> > volatile. The C standard states that only volatile *declared* objects
> > are subject to the rules of volatile. Accessing through a volatile
> > pointer doesn't help.
> 
> I'm not so sure about that.

I am quite sure. We recently had a discussion on that in the
committee, and the outcome was basically what I was stating above.

> See this question on SO, which has two
> conflicting and both reasonable-sounding answers:
> 
> http://stackoverflow.com/questions/28654418/requirements-for-behavior-of-pointer-to-volatile-pointing-to-non-volatile-object

thanks for the pointer, I didn't knew about the text in the rationale.

This could be an indication that the text as it is in the standard is
a defect.

> In any case, all objects used with atomics in musl are declared
> volatile now, or that is the intent anyway. If I missed any please let
> me know.

I don't know of any, but I will look around a bit.

Jens


-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::




[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 16:59         ` Jens Gustedt
@ 2015-05-17 17:59           ` Rich Felker
  2015-05-17 22:23             ` Jens Gustedt
  0 siblings, 1 reply; 15+ messages in thread
From: Rich Felker @ 2015-05-17 17:59 UTC (permalink / raw)
  To: musl

On Sun, May 17, 2015 at 06:59:53PM +0200, Jens Gustedt wrote:
> Am Sonntag, den 17.05.2015, 12:28 -0400 schrieb Rich Felker:
> > On Sun, May 17, 2015 at 09:37:19AM +0200, Jens Gustedt wrote:
> > > Am Sonntag, den 17.05.2015, 02:14 -0400 schrieb Rich Felker:
> > > > - a_and_64/a_or_64 (malloc only; these are misnamed too)
> > > 
> > > I should have checked the use before my last mail. They are
> > > definitively misnamed.
> > > 
> > > Both uses of them look ok concerning atomicity, only one of the a_and
> > > or a_or calls triggers.
> > > 
> > > The only object (mal.binmap) to which this is applied is in fact
> > > volatile, so it must actually be reloaded all the time it is used.
> > > 
> > > But in line 352 the code uses another assumption, then, that 64 bit
> > > loads always are atomic. I don't see why this should hold in general.
> > 
> > I don't think there's such an assumption. The only assumption is that
> > each bit is read exactly the number of times it would be on the
> > abstract machine, so that we can't observe inconsistent values for the
> > same object. Lack of any heavy synchronization around reading the mask
> > may result in failure to see some changes or seeing them out of order,
> > but it doesn't matter: If a bin is wrongly seen as non-empty, locking
> > and attempting to unbin from it will fail. If it is wrongly seen as
> > empty, the worst that can happen is a less-optimal (but would have
> > been optimal an instant earlier) larger chunk gets split instead of
> > using a smaller one to satisfy the allocation.
> 
> So to summarize what you are saying that in this special context, an
> out-of-sync load of one of the sub-words of a 64 bit word, would only
> impact on performance and not correctness. Nice.

Right. Technically it may also impact fragmentation-optimality, but
only in a way that would also be affected by timing/scheduling
differences, so I don't think it makes sense to insist on an
optimality condition there.

> A maybe stupid question, then: why do atomics at all, here? You could
> perhaps remove all that 64 bit pseudo atomic stuff then.

We absolutely would not want two concurrent attempts to set a bit to
clobber each others result, so that the other result would _never_ be
seen. This would result in free memory being lost -- the bin would
appear to be empty until something was added to it again. That's the
motivation for the atomics.

> > Of course it's an open question whether the complex atomics and
> > fine-grained locking in malloc help or hurt performance more on
> > average. I'd really like to measure this at some point. Overhauling
> > malloc to try to get significantly better multi-threaded performance
> > without the fragmentation-optimality sacrifices other mallocs make is
> > a long-term goal I have open.
> > 
> > > We already have a similar assumption for 32 bit int all over the
> > > place, and I am not too happy with such "silent" assumption. For 64
> > > bit, this assumption looks wrong to me.
> > 
> > I agree I wouldn't be happy with such an assumption, but I don't think
> > it's being made here.
> > 
> > > I would be much happier by using explicit atomic types and atomic load
> > > functions or macros everywhere. For normal builds these could be dummy
> > > types made to resolve to the actual code that we have, now. But this
> > > would allow to have hardening builds, that check for consistency of
> > > all atomic accesses.
> > 
> > There is no way to do an atomic 64-bit load on most of the archs we
> > support. So trying to make it explicit wouldn't help.
> 
> Ah sorry, I probably went too fast. My last paragraph would be for all
> atomic operations, so in particular 32 bit. A macro "a_load" would
> make intentions clearer and would perhaps allow to implement an
> optional compile time check to see if we use any object consistently
> as atomic or not.

The reason I'm mildly against this is that all current reads of
atomics, except via the return value of a_cas or a_fetch_add, are
relaxed-order. We don't care if we see a stale value; if staleness
could be a problem, the caller takes care of that in an efficient way.
Having a_load that's relaxed-order whereas all the existing atomics
are seq_cst order would be an inconsistent API design.

Adding a_load_relaxed or something would be an option, but I'm still
not really a fan. Compilers always aim to do volatile ops as a single
load/store when possible (this matters for some MMIO-type uses that
volatile is intended for; some hardware treats 4 byte writes
differently from one word write, and of course the read direction
matters when reading MMIO back from hardware) so there's no reason to
expect the loads to break. I think it was more of an issue before we
moved to using volatile to model atomics.

Rich


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 17:59           ` Rich Felker
@ 2015-05-17 22:23             ` Jens Gustedt
  2015-05-17 22:33               ` Rich Felker
  2015-05-18 10:19               ` Szabolcs Nagy
  0 siblings, 2 replies; 15+ messages in thread
From: Jens Gustedt @ 2015-05-17 22:23 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1733 bytes --]

Am Sonntag, den 17.05.2015, 13:59 -0400 schrieb Rich Felker:
> > Ah sorry, I probably went too fast. My last paragraph would be for all
> > atomic operations, so in particular 32 bit. A macro "a_load" would
> > make intentions clearer and would perhaps allow to implement an
> > optional compile time check to see if we use any object consistently
> > as atomic or not.
> 
> The reason I'm mildly against this is that all current reads of
> atomics, except via the return value of a_cas or a_fetch_add, are
> relaxed-order. We don't care if we see a stale value; if staleness
> could be a problem, the caller takes care of that in an efficient way.
> Having a_load that's relaxed-order whereas all the existing atomics
> are seq_cst order would be an inconsistent API design.

I still wasn't clear enough, sorry. My idea was not that such a
function or macro should change anything on the binary code that is
produced, at least for production builds. I just thought to
encapsulate all atomic accesses into a type and functions that allow
to have a compile check. With that we could ensure that actually all
accesses to such an object are through these functions.

The advantage of C11's model for atomic is that this a qualifier, and
then the compiler automatically checks (or ensures) that all accesses
are atomic. We don't have that luxury, here, but we could get a bit
closer to it.

Jens



-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::





[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 22:23             ` Jens Gustedt
@ 2015-05-17 22:33               ` Rich Felker
  2015-05-17 23:22                 ` Jens Gustedt
  2015-05-18 10:19               ` Szabolcs Nagy
  1 sibling, 1 reply; 15+ messages in thread
From: Rich Felker @ 2015-05-17 22:33 UTC (permalink / raw)
  To: musl

On Mon, May 18, 2015 at 12:23:07AM +0200, Jens Gustedt wrote:
> Am Sonntag, den 17.05.2015, 13:59 -0400 schrieb Rich Felker:
> > > Ah sorry, I probably went too fast. My last paragraph would be for all
> > > atomic operations, so in particular 32 bit. A macro "a_load" would
> > > make intentions clearer and would perhaps allow to implement an
> > > optional compile time check to see if we use any object consistently
> > > as atomic or not.
> > 
> > The reason I'm mildly against this is that all current reads of
> > atomics, except via the return value of a_cas or a_fetch_add, are
> > relaxed-order. We don't care if we see a stale value; if staleness
> > could be a problem, the caller takes care of that in an efficient way.
> > Having a_load that's relaxed-order whereas all the existing atomics
> > are seq_cst order would be an inconsistent API design.
> 
> I still wasn't clear enough, sorry. My idea was not that such a
> function or macro should change anything on the binary code that is
> produced, at least for production builds. I just thought to
> encapsulate all atomic accesses into a type and functions that allow
> to have a compile check.

I understand that. But if it were called a_load, its semantics (no
synchronization/relaxed order) would be inconsistent with all other
a_* atomics which are seq_cst. That's what I don't like.

Rich


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 22:33               ` Rich Felker
@ 2015-05-17 23:22                 ` Jens Gustedt
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Gustedt @ 2015-05-17 23:22 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1861 bytes --]

Am Sonntag, den 17.05.2015, 18:33 -0400 schrieb Rich Felker:
> On Mon, May 18, 2015 at 12:23:07AM +0200, Jens Gustedt wrote:
> > Am Sonntag, den 17.05.2015, 13:59 -0400 schrieb Rich Felker:
> > > > Ah sorry, I probably went too fast. My last paragraph would be for all
> > > > atomic operations, so in particular 32 bit. A macro "a_load" would
> > > > make intentions clearer and would perhaps allow to implement an
> > > > optional compile time check to see if we use any object consistently
> > > > as atomic or not.
> > > 
> > > The reason I'm mildly against this is that all current reads of
> > > atomics, except via the return value of a_cas or a_fetch_add, are
> > > relaxed-order. We don't care if we see a stale value; if staleness
> > > could be a problem, the caller takes care of that in an efficient way.
> > > Having a_load that's relaxed-order whereas all the existing atomics
> > > are seq_cst order would be an inconsistent API design.
> > 
> > I still wasn't clear enough, sorry. My idea was not that such a
> > function or macro should change anything on the binary code that is
> > produced, at least for production builds. I just thought to
> > encapsulate all atomic accesses into a type and functions that allow
> > to have a compile check.
> 
> I understand that. But if it were called a_load, its semantics (no
> synchronization/relaxed order) would be inconsistent with all other
> a_* atomics which are seq_cst. That's what I don't like.

Right. So call it a_load_rel, or something similar?

Jens

-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::




[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-17 22:23             ` Jens Gustedt
  2015-05-17 22:33               ` Rich Felker
@ 2015-05-18 10:19               ` Szabolcs Nagy
  2015-05-18 11:03                 ` Jens Gustedt
  1 sibling, 1 reply; 15+ messages in thread
From: Szabolcs Nagy @ 2015-05-18 10:19 UTC (permalink / raw)
  To: musl

* Jens Gustedt <jens.gustedt@inria.fr> [2015-05-18 00:23:07 +0200]:
> 
> The advantage of C11's model for atomic is that this a qualifier, and
> then the compiler automatically checks (or ensures) that all accesses
> are atomic. We don't have that luxury, here, but we could get a bit
> closer to it.
> 

the qualifierness of atomic is a bit confusing

differently qualified types had the same alignment and representation
so far and in some cases pointers to differently qualified types are
implicitly convertible (eg assigning char* to const char* is ok).

atomic is special: when the standard says 'qualified or unqualified type'
it does not include _Atomic even though it is called a qualifier.
(atomic is always mentioned explicitly, it can have different representation
and alignment to allow implementation with locks and thus the pointers
cannot be convertible)

annotating everything with _Atomic in musl is problematic because the
atomic bits are publicly visible in pthread types (we could use _Atomic
only when building musl, but i think its usefulness should be demonstrated
with examples etc before doing something that ugly.. having a_load_relaxed
without typesystem help does not sound very useful)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Deduplicating atomics written in terms of CAS
  2015-05-18 10:19               ` Szabolcs Nagy
@ 2015-05-18 11:03                 ` Jens Gustedt
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Gustedt @ 2015-05-18 11:03 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1951 bytes --]

Am Montag, den 18.05.2015, 12:19 +0200 schrieb Szabolcs Nagy:
> * Jens Gustedt <jens.gustedt@inria.fr> [2015-05-18 00:23:07 +0200]:
> > 
> > The advantage of C11's model for atomic is that this a qualifier, and
> > then the compiler automatically checks (or ensures) that all accesses
> > are atomic. We don't have that luxury, here, but we could get a bit
> > closer to it.
> > 
> 
> the qualifierness of atomic is a bit confusing
> 
> differently qualified types had the same alignment and representation
> so far and in some cases pointers to differently qualified types are
> implicitly convertible (eg assigning char* to const char* is ok).
> 
> atomic is special: when the standard says 'qualified or unqualified type'
> it does not include _Atomic even though it is called a qualifier.
> (atomic is always mentioned explicitly, it can have different representation
> and alignment to allow implementation with locks and thus the pointers
> cannot be convertible)
> 
> annotating everything with _Atomic in musl is problematic because the
> atomic bits are publicly visible in pthread types (we could use _Atomic
> only when building musl, but i think its usefulness should be demonstrated
> with examples etc before doing something that ugly.. having a_load_relaxed
> without typesystem help does not sound very useful)

I didn't say that I wanted to use _Atomic, we can't do that for
obvious reasons. I just want to hide the atomic objects behind a
typedef, such that production compiles generate exactly the same
binary, but that we also can use the compiler to have a check for
consistency.

Jens


-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::




[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2015-05-18 11:03 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-17  4:55 Deduplicating atomics written in terms of CAS Rich Felker
2015-05-17  6:00 ` Alexander Monakov
2015-05-17  6:14   ` Rich Felker
2015-05-17  7:37     ` Jens Gustedt
2015-05-17 16:28       ` Rich Felker
2015-05-17 16:59         ` Jens Gustedt
2015-05-17 17:59           ` Rich Felker
2015-05-17 22:23             ` Jens Gustedt
2015-05-17 22:33               ` Rich Felker
2015-05-17 23:22                 ` Jens Gustedt
2015-05-18 10:19               ` Szabolcs Nagy
2015-05-18 11:03                 ` Jens Gustedt
2015-05-17  6:49 ` Jens Gustedt
2015-05-17 16:22   ` Rich Felker
2015-05-17 17:19     ` Jens Gustedt

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).