mailing list of musl libc
* [PATCH 5/7] use the new lock algorithm for malloc
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
                   ` (2 preceding siblings ...)
  2018-01-03 13:17 ` [PATCH 2/7] consistently use the LOCK and UNLOCK macros Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  2018-01-09 17:42   ` Rich Felker
  2018-01-03 13:17 ` [PATCH 3/7] revise the definition of multiple basic locks in the code Jens Gustedt
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

Malloc used a specialized lock implementation in many places. Now that we
have a generic lock that has the desired properties, we should just use
this, instead of this multitude of very similar lock mechanisms.
---
 src/malloc/malloc.c | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
index 9e05e1d6..6c667a5a 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/malloc.c
@@ -13,6 +13,8 @@
 #define inline inline __attribute__((always_inline))
 #endif
 
+#include "__lock.h"
+
 void *__mmap(void *, size_t, int, int, int, off_t);
 int __munmap(void *, size_t);
 void *__mremap(void *, size_t, size_t, int, ...);
@@ -24,7 +26,7 @@ struct chunk {
 };
 
 struct bin {
-	volatile int lock[2];
+	volatile int lock[1];
 	struct chunk *head;
 	struct chunk *tail;
 };
@@ -32,7 +34,7 @@ struct bin {
 static struct {
 	volatile uint64_t binmap;
 	struct bin bins[64];
-	volatile int free_lock[2];
+	volatile int free_lock[1];
 } mal;
 
 
@@ -58,30 +60,16 @@ static struct {
 
 /* Synchronization tools */
 
-static inline void lock(volatile int *lk)
-{
-	if (libc.threads_minus_1)
-		while(a_swap(lk, 1)) __wait(lk, lk+1, 1, 1);
-}
-
-static inline void unlock(volatile int *lk)
-{
-	if (lk[0]) {
-		a_store(lk, 0);
-		if (lk[1]) __wake(lk, 1, 1);
-	}
-}
-
 static inline void lock_bin(int i)
 {
-	lock(mal.bins[i].lock);
+	__lock_fast(mal.bins[i].lock);
 	if (!mal.bins[i].head)
 		mal.bins[i].head = mal.bins[i].tail = BIN_TO_CHUNK(i);
 }
 
 static inline void unlock_bin(int i)
 {
-	unlock(mal.bins[i].lock);
+	__unlock_fast(mal.bins[i].lock);
 }
 
 static int first_set(uint64_t x)
@@ -161,7 +149,7 @@ void *__expand_heap(size_t *);
 
 static struct chunk *expand_heap(size_t n)
 {
-	static int heap_lock[2];
+	static volatile int heap_lock[1];
 	static void *end;
 	void *p;
 	struct chunk *w;
@@ -171,11 +159,11 @@ static struct chunk *expand_heap(size_t n)
 	 * we need room for an extra zero-sized sentinel chunk. */
 	n += SIZE_ALIGN;
 
-	lock(heap_lock);
+	__lock_fast(heap_lock);
 
 	p = __expand_heap(&n);
 	if (!p) {
-		unlock(heap_lock);
+		__unlock_fast(heap_lock);
 		return 0;
 	}
 
@@ -200,7 +188,7 @@ static struct chunk *expand_heap(size_t n)
 	w = MEM_TO_CHUNK(p);
 	w->csize = n | C_INUSE;
 
-	unlock(heap_lock);
+	__unlock_fast(heap_lock);
 
 	return w;
 }
@@ -481,10 +469,10 @@ void free(void *p)
 			next->psize = final_size | C_INUSE;
 			i = bin_index(final_size);
 			lock_bin(i);
-			lock(mal.free_lock);
+			__lock_fast(mal.free_lock);
 			if (self->psize & next->csize & C_INUSE)
 				break;
-			unlock(mal.free_lock);
+			__unlock_fast(mal.free_lock);
 			unlock_bin(i);
 		}
 
@@ -510,7 +498,7 @@ void free(void *p)
 
 	self->csize = final_size;
 	next->psize = final_size;
-	unlock(mal.free_lock);
+	__unlock_fast(mal.free_lock);
 
 	self->next = BIN_TO_CHUNK(i);
 	self->prev = mal.bins[i].tail;
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 7/7] implement the local lock for condition variables with the new lock feature
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
  2018-01-03 13:17 ` [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  2018-01-03 13:17 ` [PATCH 2/7] consistently use the LOCK and UNLOCK macros Jens Gustedt
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

Condition variables (pthread and C11) had a specialized lock
implementation that had already been designed such that no access to
the lock variable is made after an unlock. Here this is mandatory
because the implementation uses variables on the waiting threads'
stacks to chain the accesses.

The previous implementation had only two distinct internal states, one
for the case that there is no congestion and one for the case that
there is. The latter is then used to wake up other threads if
necessary.

The new lock structure deals with such situations more adequately
because it keeps track of the exact congestion and allows us to react
appropriately.
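
A hedged sketch (simplified, with hypothetical names) of why the
no-access-after-unlock property is mandatory here: the lock word lives
inside a waiter node on the stack of the waiting thread, so its
lifetime ends as soon as that thread is allowed to return.

struct waiter_sketch {
	volatile int barrier;             /* lock word on the waiter's stack */
	struct waiter_sketch *prev, *next;
};

void waiter_side(void)
{
	struct waiter_sketch node = { .barrier = 2 }; /* starts locked */
	/* ... enqueue node, then sleep until node.barrier is unlocked ... */
}	/* node.barrier ceases to exist here; an unlock that touched the
	 * word again after its releasing operation could hit the dead
	 * stack frame of a thread that has already returned. */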
---
 src/thread/pthread_cond_timedwait.c | 53 ++++++++++++-------------------------
 1 file changed, 17 insertions(+), 36 deletions(-)

diff --git a/src/thread/pthread_cond_timedwait.c b/src/thread/pthread_cond_timedwait.c
index ed8569c2..659e9b5a 100644
--- a/src/thread/pthread_cond_timedwait.c
+++ b/src/thread/pthread_cond_timedwait.c
@@ -1,5 +1,11 @@
 #include "pthread_impl.h"
 
+#if defined(__GNUC__) && defined(__PIC__)
+#define inline inline __attribute__((always_inline))
+#endif
+
+#include "__lock.h"
+
 void __pthread_testcancel(void);
 int __pthread_mutex_lock(pthread_mutex_t *);
 int __pthread_mutex_unlock(pthread_mutex_t *);
@@ -33,31 +39,6 @@ struct waiter {
 	volatile int *notify;
 };
 
-/* Self-synchronized-destruction-safe lock functions */
-
-static inline void lock(volatile int *l)
-{
-	if (a_cas(l, 0, 1)) {
-		a_cas(l, 1, 2);
-		do __wait(l, 0, 2, 1);
-		while (a_cas(l, 0, 2));
-	}
-}
-
-static inline void unlock(volatile int *l)
-{
-	if (a_swap(l, 0)==2)
-		__wake(l, 1, 1);
-}
-
-static inline void unlock_requeue(volatile int *l, volatile int *r, int w)
-{
-	a_store(l, 0);
-	if (w) __wake(l, 1, 1);
-	else __syscall(SYS_futex, l, FUTEX_REQUEUE|FUTEX_PRIVATE, 0, 1, r) != -ENOSYS
-		|| __syscall(SYS_futex, l, FUTEX_REQUEUE, 0, 1, r);
-}
-
 enum {
 	WAITING,
 	SIGNALED,
@@ -84,7 +65,7 @@ int __pthread_cond_timedwait(pthread_cond_t *restrict c, pthread_mutex_t *restri
 		seq = c->_c_seq;
 		a_inc(&c->_c_waiters);
 	} else {
-		lock(&c->_c_lock);
+		__lock_fast(&c->_c_lock);
 
 		seq = node.barrier = 2;
 		fut = &node.barrier;
@@ -94,7 +75,7 @@ int __pthread_cond_timedwait(pthread_cond_t *restrict c, pthread_mutex_t *restri
 		if (!c->_c_tail) c->_c_tail = &node;
 		else node.next->prev = &node;
 
-		unlock(&c->_c_lock);
+		__unlock_fast(&c->_c_lock);
 	}
 
 	__pthread_mutex_unlock(m);
@@ -125,14 +106,14 @@ int __pthread_cond_timedwait(pthread_cond_t *restrict c, pthread_mutex_t *restri
 		 * after seeing a LEAVING waiter without getting notified
 		 * via the futex notify below. */
 
-		lock(&c->_c_lock);
-		
+		__lock_fast(&c->_c_lock);
+
 		if (c->_c_head == &node) c->_c_head = node.next;
 		else if (node.prev) node.prev->next = node.next;
 		if (c->_c_tail == &node) c->_c_tail = node.prev;
 		else if (node.next) node.next->prev = node.prev;
-		
-		unlock(&c->_c_lock);
+
+		__unlock_fast(&c->_c_lock);
 
 		if (node.notify) {
 			if (a_fetch_add(node.notify, -1)==1)
@@ -140,7 +121,7 @@ int __pthread_cond_timedwait(pthread_cond_t *restrict c, pthread_mutex_t *restri
 		}
 	} else {
 		/* Lock barrier first to control wake order. */
-		lock(&node.barrier);
+		__lock_fast(&node.barrier);
 	}
 
 relock:
@@ -156,7 +137,7 @@ relock:
 	/* Unlock the barrier that's holding back the next waiter, and
 	 * either wake it or requeue it to the mutex. */
 	if (node.prev)
-		unlock_requeue(&node.prev->barrier, &m->_m_lock, m->_m_type & 128);
+		__unlock_requeue_fast(&node.prev->barrier, &m->_m_lock, m->_m_type & 128);
 	else
 		a_dec(&m->_m_waiters);
 
@@ -180,7 +161,7 @@ int __private_cond_signal(pthread_cond_t *c, int n)
 	volatile int ref = 0;
 	int cur;
 
-	lock(&c->_c_lock);
+	__lock_fast(&c->_c_lock);
 	for (p=c->_c_tail; n && p; p=p->prev) {
 		if (a_cas(&p->state, WAITING, SIGNALED) != WAITING) {
 			ref++;
@@ -198,7 +179,7 @@ int __private_cond_signal(pthread_cond_t *c, int n)
 		c->_c_head = 0;
 	}
 	c->_c_tail = p;
-	unlock(&c->_c_lock);
+	__unlock_fast(&c->_c_lock);
 
 	/* Wait for any waiters in the LEAVING state to remove
 	 * themselves from the list before returning or allowing
@@ -206,7 +187,7 @@ int __private_cond_signal(pthread_cond_t *c, int n)
 	while ((cur = ref)) __wait(&ref, 0, cur, 1);
 
 	/* Allow first signaled waiter, if any, to proceed. */
-	if (first) unlock(&first->barrier);
+	if (first) __unlock_fast(&first->barrier);
 
 	return 0;
 }
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 6/7] implement __unlock_requeue
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
                   ` (5 preceding siblings ...)
  2018-01-03 13:17 ` [PATCH 1/7] a new lock algorithm with lock value and congestion count in the same atomic int Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  6 siblings, 0 replies; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

We need a special variant of unlock for the case where we may take
advantage of the futex requeue feature to move a thread from a
condition variable's queue to the queue of the corresponding mutex.
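
As a hedged illustration of the underlying futex(2) semantics (a
standalone sketch, not musl-internal code): FUTEX_REQUEUE can move a
sleeper from one futex word to another without waking it.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wake none (val = 0) and move at most one sleeping waiter from the
 * condition futex l onto the mutex futex r; the thread stays asleep
 * and is woken later by the mutex unlock path. */
static long requeue_one(volatile int *l, volatile int *r)
{
	return syscall(SYS_futex, l, FUTEX_REQUEUE, 0, 1, r);
}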
---
 src/internal/__lock.h |  8 ++++++++
 src/internal/libc.h   |  2 ++
 src/thread/__lock.c   | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/src/internal/__lock.h b/src/internal/__lock.h
index f16d6176..0c2e9bfb 100644
--- a/src/internal/__lock.h
+++ b/src/internal/__lock.h
@@ -19,3 +19,11 @@ static inline void __unlock_fast(volatile int *l)
 		}
 	}
 }
+
+static inline void __unlock_requeue_fast(volatile int *l, volatile int *r, int shared)
+{
+	extern void __unlock_requeue_slow(volatile int*, volatile int*, int);
+	/* We have to check whether l[0] has been touched at all. */
+	if (l[0] < 0 && (a_fetch_add(l, -(INT_MIN + 1)) != (INT_MIN + 1)))
+		__unlock_requeue_slow(l, r, shared);
+}
diff --git a/src/internal/libc.h b/src/internal/libc.h
index a594d0c5..999b4cba 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -50,6 +50,8 @@ extern char *__progname, *__progname_full;
 void __lock_slow(volatile int *, int) ATTR_LIBC_VISIBILITY;
 void __lock(volatile int *) ATTR_LIBC_VISIBILITY;
 void __unlock(volatile int *) ATTR_LIBC_VISIBILITY;
+void __unlock_requeue(volatile int *, volatile int *, int) ATTR_LIBC_VISIBILITY;
+void __unlock_requeue_slow(volatile int *, volatile int *, int) ATTR_LIBC_VISIBILITY;
 int __lockfile(FILE *) ATTR_LIBC_VISIBILITY;
 void __unlockfile(FILE *) ATTR_LIBC_VISIBILITY;
 #define LOCK(x) __lock(x)
diff --git a/src/thread/__lock.c b/src/thread/__lock.c
index a3d8d4d0..b0b0c67c 100644
--- a/src/thread/__lock.c
+++ b/src/thread/__lock.c
@@ -19,6 +19,23 @@
 
 weak_alias(__lock_fast, __lock);
 weak_alias(__unlock_fast, __unlock);
+weak_alias(__unlock_requeue_fast, __unlock_requeue);
+
+/* __unlock_requeue handles the case where we have a second futex
+ * queue to which a waiting thread should be requeued when shared is
+ * false. This is e.g. the case when l is the futex of a condition
+ * variable and r is the one of a private mutex: if the mutex is
+ * private, it is more efficient to just insert the thread into the
+ * queue of that mutex, instead of waking it up only to put it back
+ * to sleep immediately when it tries to lock the mutex. */
+
+void __unlock_requeue_slow(volatile int *l, volatile int *r, int shared)
+{
+	if (shared) __wake(l, 1, 1);
+	else __syscall(SYS_futex, l, FUTEX_REQUEUE|FUTEX_PRIVATE, 0, 1, r) != -ENOSYS
+		|| __syscall(SYS_futex, l, FUTEX_REQUEUE, 0, 1, r);
+}
+
 
 void __lock_slow(volatile int *l, int current)
 {
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 2/7] consistently use the LOCK and UNLOCK macros
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
  2018-01-03 13:17 ` [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU Jens Gustedt
  2018-01-03 13:17 ` [PATCH 7/7] implement the local lock for condition variables with the new lock feature Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  2018-01-03 13:17 ` [PATCH 5/7] use the new lock algorithm for malloc Jens Gustedt
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

In some places the functions were used directly. Use the macros
consistently everywhere, so that it becomes easier later on to capture
the fast path directly inside the macro and to have the call overhead
only on the slow path.
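
A hedged sketch of where this could lead (hypothetical macro body,
mirroring what patch 4 later does with __lock_fast): LOCK would inline
the uncontended CAS and call out of line only under contention.

#define LOCK(x) do { \
	volatile int *_l = (x); \
	int _cur; \
	if (libc.threads_minus_1 && (_cur = a_cas(_l, 0, INT_MIN + 1))) \
		__lock_slow(_l, _cur); \
} while (0)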
---
 src/thread/pthread_create.c        | 6 +++---
 src/thread/pthread_detach.c        | 2 +-
 src/thread/pthread_getschedparam.c | 4 ++--
 src/thread/pthread_kill.c          | 4 ++--
 src/thread/pthread_setschedparam.c | 4 ++--
 src/thread/pthread_setschedprio.c  | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index 6cbf85b3..34cd9936 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -37,10 +37,10 @@ _Noreturn void __pthread_exit(void *result)
 
 	__pthread_tsd_run_dtors();
 
-	__lock(self->exitlock);
+	LOCK(self->exitlock);
 
 	/* Mark this thread dead before decrementing count */
-	__lock(self->killlock);
+	LOCK(self->killlock);
 	self->dead = 1;
 
 	/* Block all signals before decrementing the live thread count.
@@ -54,7 +54,7 @@ _Noreturn void __pthread_exit(void *result)
 	 * been blocked. This precludes observation of the thread id
 	 * as a live thread (with application code running in it) after
 	 * the thread was reported dead by ESRCH being returned. */
-	__unlock(self->killlock);
+	UNLOCK(self->killlock);
 
 	/* It's impossible to determine whether this is "the last thread"
 	 * until performing the atomic decrement, since multiple threads
diff --git a/src/thread/pthread_detach.c b/src/thread/pthread_detach.c
index d9c90d1c..692bbaf9 100644
--- a/src/thread/pthread_detach.c
+++ b/src/thread/pthread_detach.c
@@ -9,7 +9,7 @@ static int __pthread_detach(pthread_t t)
 	if (a_cas(t->exitlock, 0, INT_MIN + 1))
 		return __pthread_join(t, 0);
 	t->detached = 2;
-	__unlock(t->exitlock);
+	UNLOCK(t->exitlock);
 	return 0;
 }
 
diff --git a/src/thread/pthread_getschedparam.c b/src/thread/pthread_getschedparam.c
index 3053c186..a994b637 100644
--- a/src/thread/pthread_getschedparam.c
+++ b/src/thread/pthread_getschedparam.c
@@ -3,7 +3,7 @@
 int pthread_getschedparam(pthread_t t, int *restrict policy, struct sched_param *restrict param)
 {
 	int r;
-	__lock(t->killlock);
+	LOCK(t->killlock);
 	if (t->dead) {
 		r = ESRCH;
 	} else {
@@ -12,6 +12,6 @@ int pthread_getschedparam(pthread_t t, int *restrict policy, struct sched_param
 			*policy = __syscall(SYS_sched_getscheduler, t->tid);
 		}
 	}
-	__unlock(t->killlock);
+	UNLOCK(t->killlock);
 	return r;
 }
diff --git a/src/thread/pthread_kill.c b/src/thread/pthread_kill.c
index acdb1ea6..f0903420 100644
--- a/src/thread/pthread_kill.c
+++ b/src/thread/pthread_kill.c
@@ -3,8 +3,8 @@
 int pthread_kill(pthread_t t, int sig)
 {
 	int r;
-	__lock(t->killlock);
+	LOCK(t->killlock);
 	r = t->dead ? ESRCH : -__syscall(SYS_tkill, t->tid, sig);
-	__unlock(t->killlock);
+	UNLOCK(t->killlock);
 	return r;
 }
diff --git a/src/thread/pthread_setschedparam.c b/src/thread/pthread_setschedparam.c
index c4738d64..9e2fa456 100644
--- a/src/thread/pthread_setschedparam.c
+++ b/src/thread/pthread_setschedparam.c
@@ -3,8 +3,8 @@
 int pthread_setschedparam(pthread_t t, int policy, const struct sched_param *param)
 {
 	int r;
-	__lock(t->killlock);
+	LOCK(t->killlock);
 	r = t->dead ? ESRCH : -__syscall(SYS_sched_setscheduler, t->tid, policy, param);
-	__unlock(t->killlock);
+	UNLOCK(t->killlock);
 	return r;
 }
diff --git a/src/thread/pthread_setschedprio.c b/src/thread/pthread_setschedprio.c
index e0bdc03b..dc745b42 100644
--- a/src/thread/pthread_setschedprio.c
+++ b/src/thread/pthread_setschedprio.c
@@ -3,8 +3,8 @@
 int pthread_setschedprio(pthread_t t, int prio)
 {
 	int r;
-	__lock(t->killlock);
+	LOCK(t->killlock);
 	r = t->dead ? ESRCH : -__syscall(SYS_sched_setparam, t->tid, &prio);
-	__unlock(t->killlock);
+	UNLOCK(t->killlock);
 	return r;
 }
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/7] a new lock algorithm with lock value and congestion count in the same atomic int
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
                   ` (4 preceding siblings ...)
  2018-01-03 13:17 ` [PATCH 3/7] revise the definition of multiple basic locks in the code Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  2018-01-03 13:17 ` [PATCH 6/7] implement __unlock_requeue Jens Gustedt
  6 siblings, 0 replies; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

A variant of this new lock algorithm has been presented at SAC'16, see
https://hal.inria.fr/hal-01304108. A full version of that paper is
available at https://hal.inria.fr/hal-01236734.

The main motivation of this is to improve the safety of the basic lock
implementation in musl. This is achieved by squeezing a lock flag and a
congestion count (= the number of threads inside the critical section)
into a single int. Thereby an unlock operation does exactly one memory
transfer (a_fetch_add) and never touches the value again, but still
detects whether a waiter has to be woken up.
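
A hedged sketch of this encoding (the decoding helpers are
illustrative only, not part of the patch):

#include <limits.h>

/* The sign bit is the lock flag; the remainder is the congestion. */
static int is_locked(int x)  { return x < 0; }
static int congestion(int x) { return x < 0 ? x - INT_MIN : x; }

/* Examples:
 *   x == 0           -> unlocked, nobody in the critical section
 *   x == INT_MIN + 1 -> locked, only the lock holder inside
 *   x == INT_MIN + 3 -> locked, holder plus two more contenders
 *   x == 2           -> unlocked, two threads racing for the lock */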

This fixes a use-after-free bug in pthread_detach that had only been
patched over temporarily. Therefore this patch also reverts

         c1e27367a9b26b9baac0f37a12349fc36567c8b6

This is also the only place where internal knowledge of the lock
algorithm is used.

The main price for the improved safety is slightly larger code.

Under high congestion, the scheduling behavior will be different
compared to the previous algorithm. In that case, a successful
put-to-sleep may appear out of order compared to the arrival in the
critical section.

This is version 3 of the patch that takes remarks of Alexander and Rich
into account:

 - all lock values are now formulated with respect to INT_MIN
 - unlock now uses __wake to wake up waiters
 - a new inline function __futexwait with the same complexity as __wake
   is used to put threads to sleep
 - __unlock covers the case that no lock had been recorded
 - the fast path of __lock is moved outside the loop in a prominent position
 - revert c1e27367
---
 src/internal/pthread_impl.h |  6 +++++
 src/thread/__lock.c         | 55 ++++++++++++++++++++++++++++++++++++++++-----
 src/thread/pthread_detach.c |  5 ++---
 3 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index 56e19348..602d6f56 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -136,6 +136,12 @@ static inline void __wake(volatile void *addr, int cnt, int priv)
 	__syscall(SYS_futex, addr, FUTEX_WAKE|priv, cnt) != -ENOSYS ||
 	__syscall(SYS_futex, addr, FUTEX_WAKE, cnt);
 }
+static inline void __futexwait(volatile void *addr, int val, int priv)
+{
+	if (priv) priv = FUTEX_PRIVATE;
+	__syscall(SYS_futex, addr, FUTEX_WAIT|priv, val) != -ENOSYS ||
+	__syscall(SYS_futex, addr, FUTEX_WAIT, val);
+}
 
 void __acquire_ptc(void);
 void __release_ptc(void);
diff --git a/src/thread/__lock.c b/src/thread/__lock.c
index 0874c04a..45557c88 100644
--- a/src/thread/__lock.c
+++ b/src/thread/__lock.c
@@ -1,15 +1,60 @@
 #include "pthread_impl.h"
 
+/* This lock primitive combines a flag (in the sign bit) and a
+ * congestion count (= threads inside the critical section, CS) in a
+ * single int that is accessed through atomic operations. The states
+ * of the int for value x are:
+ *
+ * x == 0: unlocked and no thread inside the critical section
+ *
+ * x < 0: locked with a congestion of x-INT_MIN, including the thread
+ * that holds the lock
+ *
+ * x > 0: unlocked with a congestion of x
+ *
+ * or in an equivalent formulation x is the congestion count or'ed
+ * with INT_MIN as a lock flag.
+ */
+
 void __lock(volatile int *l)
 {
-	if (libc.threads_minus_1)
-		while (a_swap(l, 1)) __wait(l, l+1, 1, 1);
+	if (!libc.threads_minus_1) return;
+	/* fast path: INT_MIN for the lock, +1 for the congestion */
+	int current = a_cas(l, 0, INT_MIN + 1);
+	if (!current) return;
+	/* A first spin loop, for medium congestion. */
+	for (unsigned i = 0; i < 10; ++i) {
+		if (current < 0) current -= INT_MIN + 1;
+		// assertion: current >= 0
+		int val = a_cas(l, current, INT_MIN + (current + 1));
+		if (val == current) return;
+		current = val;
+	}
+	// Spinning failed, so mark ourselves as being inside the CS.
+	current = a_fetch_add(l, 1) + 1;
+	/* The main lock acquisition loop for heavy congestion. The only
+	 * change to the value performed inside that loop is a successful
+	 * lock via the CAS that acquires the lock. */
+	for (;;) {
+		/* We can only go into wait if we know that somebody holds the
+		 * lock and will eventually wake us up again. */
+		if (current < 0) {
+			__futexwait(l, current, 1);
+			current -= INT_MIN + 1;
+		}
+		/* assertion: current > 0, the count includes us already. */
+		int val = a_cas(l, current, INT_MIN + current);
+		if (val == current) return;
+		current = val;
+	}
 }
 
 void __unlock(volatile int *l)
 {
-	if (l[0]) {
-		a_store(l, 0);
-		if (l[1]) __wake(l, 1, 1);
+	/* Check l[0] to see if we are multi-threaded. */
+	if (l[0] < 0) {
+		if (a_fetch_add(l, -(INT_MIN + 1)) != (INT_MIN + 1)) {
+			__wake(l, 1, 1);
+		}
 	}
 }
diff --git a/src/thread/pthread_detach.c b/src/thread/pthread_detach.c
index 13482607..d9c90d1c 100644
--- a/src/thread/pthread_detach.c
+++ b/src/thread/pthread_detach.c
@@ -6,11 +6,10 @@ int __pthread_join(pthread_t, void **);
 static int __pthread_detach(pthread_t t)
 {
 	/* Cannot detach a thread that's already exiting */
-	if (a_swap(t->exitlock, 1))
+	if (a_cas(t->exitlock, 0, INT_MIN + 1))
 		return __pthread_join(t, 0);
 	t->detached = 2;
-	a_store(t->exitlock, 0);
-	__wake(t->exitlock, 1, 1);
+	__unlock(t->exitlock);
 	return 0;
 }
 
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  2018-01-09 17:41   ` Rich Felker
  2018-01-03 13:17 ` [PATCH 7/7] implement the local lock for condition variables with the new lock feature Jens Gustedt
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

This provides two interfaces, __lock_fast and __unlock_fast, that are
both "static inline" and that should result in a better integration of
the lock at its point of use. The slow path of the lock algorithm
remains centralized; here, adding the overhead of a function call is
not a big deal.

This must only be used directly by a TU that encapsulates all LOCK and
UNLOCK calls of a particular lock object.
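
A hedged usage sketch for such a TU (the lock object and the function
are hypothetical):

#include "__lock.h"

static volatile int foo_lock[1]; /* all LOCK/UNLOCK of this object
                                    happen in this TU */

void foo_operation(void)
{
	__lock_fast(foo_lock);
	/* ... critical section ... */
	__unlock_fast(foo_lock);
}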
---
 src/internal/__lock.h | 21 +++++++++++++++++++++
 src/internal/libc.h   |  1 +
 src/thread/__lock.c   | 20 +++++---------------
 3 files changed, 27 insertions(+), 15 deletions(-)
 create mode 100644 src/internal/__lock.h

diff --git a/src/internal/__lock.h b/src/internal/__lock.h
new file mode 100644
index 00000000..f16d6176
--- /dev/null
+++ b/src/internal/__lock.h
@@ -0,0 +1,21 @@
+#include "pthread_impl.h"
+
+static inline void __lock_fast(volatile int *l)
+{
+	extern void __lock_slow(volatile int*, int);
+	if (!libc.threads_minus_1) return;
+	/* fast path: INT_MIN for the lock, +1 for the congestion */
+	int current = a_cas(l, 0, INT_MIN + 1);
+	if (!current) return;
+	__lock_slow(l, current);
+}
+
+static inline void __unlock_fast(volatile int *l)
+{
+	/* Check l[0] to see if we are multi-threaded. */
+	if (l[0] < 0) {
+		if (a_fetch_add(l, -(INT_MIN + 1)) != (INT_MIN + 1)) {
+			__wake(l, 1, 1);
+		}
+	}
+}
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 5e145183..a594d0c5 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -47,6 +47,7 @@ extern size_t __sysinfo ATTR_LIBC_VISIBILITY;
 extern char *__progname, *__progname_full;
 
 /* Designed to avoid any overhead in non-threaded processes */
+void __lock_slow(volatile int *, int) ATTR_LIBC_VISIBILITY;
 void __lock(volatile int *) ATTR_LIBC_VISIBILITY;
 void __unlock(volatile int *) ATTR_LIBC_VISIBILITY;
 int __lockfile(FILE *) ATTR_LIBC_VISIBILITY;
diff --git a/src/thread/__lock.c b/src/thread/__lock.c
index 45557c88..a3d8d4d0 100644
--- a/src/thread/__lock.c
+++ b/src/thread/__lock.c
@@ -1,4 +1,5 @@
 #include "pthread_impl.h"
+#include "__lock.h"
 
 /* This lock primitive combines a flag (in the sign bit) and a
  * congestion count (= threads inside the critical section, CS) in a
@@ -16,12 +17,11 @@
  * with INT_MIN as a lock flag.
  */
 
-void __lock(volatile int *l)
+weak_alias(__lock_fast, __lock);
+weak_alias(__unlock_fast, __unlock);
+
+void __lock_slow(volatile int *l, int current)
 {
-	if (!libc.threads_minus_1) return;
-	/* fast path: INT_MIN for the lock, +1 for the congestion */
-	int current = a_cas(l, 0, INT_MIN + 1);
-	if (!current) return;
 	/* A first spin loop, for medium congestion. */
 	for (unsigned i = 0; i < 10; ++i) {
 		if (current < 0) current -= INT_MIN + 1;
@@ -48,13 +48,3 @@ void __lock(volatile int *l)
 		current = val;
 	}
 }
-
-void __unlock(volatile int *l)
-{
-	/* Check l[0] to see if we are multi-threaded. */
-	if (l[0] < 0) {
-		if (a_fetch_add(l, -(INT_MIN + 1)) != (INT_MIN + 1)) {
-			__wake(l, 1, 1);
-		}
-	}
-}
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 3/7] revise the definition of multiple basic locks in the code
  2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
                   ` (3 preceding siblings ...)
  2018-01-03 13:17 ` [PATCH 5/7] use the new lock algorithm for malloc Jens Gustedt
@ 2018-01-03 13:17 ` Jens Gustedt
  2018-01-03 13:17 ` [PATCH 1/7] a new lock algorithm with lock value and congestion count in the same atomic int Jens Gustedt
  2018-01-03 13:17 ` [PATCH 6/7] implement __unlock_requeue Jens Gustedt
  6 siblings, 0 replies; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:17 UTC (permalink / raw)
  To: musl

In all cases this is just a change from two volatile ints to one.
---
 src/dirent/__dirent.h       | 2 +-
 src/exit/at_quick_exit.c    | 2 +-
 src/exit/atexit.c           | 2 +-
 src/internal/pthread_impl.h | 4 ++--
 src/locale/dcngettext.c     | 2 +-
 src/locale/locale_map.c     | 2 +-
 src/locale/setlocale.c      | 2 +-
 src/malloc/lite_malloc.c    | 2 +-
 src/misc/syslog.c           | 2 +-
 src/prng/random.c           | 2 +-
 src/stdio/ofl.c             | 2 +-
 src/thread/pthread_atfork.c | 2 +-
 src/thread/sem_open.c       | 2 +-
 src/thread/synccall.c       | 2 +-
 src/time/__tz.c             | 2 +-
 15 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/dirent/__dirent.h b/src/dirent/__dirent.h
index 32871baf..101b0368 100644
--- a/src/dirent/__dirent.h
+++ b/src/dirent/__dirent.h
@@ -4,6 +4,6 @@ struct __dirstream
 	off_t tell;
 	int buf_pos;
 	int buf_end;
-	volatile int lock[2];
+	volatile int lock[1];
 	char buf[2048];
 };
diff --git a/src/exit/at_quick_exit.c b/src/exit/at_quick_exit.c
index ac28dfd9..4079b242 100644
--- a/src/exit/at_quick_exit.c
+++ b/src/exit/at_quick_exit.c
@@ -5,7 +5,7 @@
 
 static void (*funcs[COUNT])(void);
 static int count;
-static volatile int lock[2];
+static volatile int lock[1];
 
 void __funcs_on_quick_exit()
 {
diff --git a/src/exit/atexit.c b/src/exit/atexit.c
index 2b58b8bb..cd3b0a64 100644
--- a/src/exit/atexit.c
+++ b/src/exit/atexit.c
@@ -13,7 +13,7 @@ static struct fl
 } builtin, *head;
 
 static int slot;
-static volatile int lock[2];
+static volatile int lock[1];
 
 void __funcs_on_exit()
 {
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index 602d6f56..f0b2c20c 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -39,8 +39,8 @@ struct pthread {
 	int unblock_cancel;
 	volatile int timer_id;
 	locale_t locale;
-	volatile int killlock[2];
-	volatile int exitlock[2];
+	volatile int killlock[1];
+	volatile int exitlock[1];
 	volatile int startlock[2];
 	unsigned long sigmask[_NSIG/8/sizeof(long)];
 	char *dlerror_buf;
diff --git a/src/locale/dcngettext.c b/src/locale/dcngettext.c
index b79b7010..d8af9603 100644
--- a/src/locale/dcngettext.c
+++ b/src/locale/dcngettext.c
@@ -34,7 +34,7 @@ static char *gettextdir(const char *domainname, size_t *dirlen)
 
 char *bindtextdomain(const char *domainname, const char *dirname)
 {
-	static volatile int lock[2];
+	static volatile int lock[1];
 	struct binding *p, *q;
 
 	if (!domainname) return 0;
diff --git a/src/locale/locale_map.c b/src/locale/locale_map.c
index 188fcf39..79542310 100644
--- a/src/locale/locale_map.c
+++ b/src/locale/locale_map.c
@@ -26,7 +26,7 @@ static const char envvars[][12] = {
 
 const struct __locale_map *__get_locale(int cat, const char *val)
 {
-	static volatile int lock[2];
+	static volatile int lock[1];
 	static void *volatile loc_head;
 	const struct __locale_map *p;
 	struct __locale_map *new = 0;
diff --git a/src/locale/setlocale.c b/src/locale/setlocale.c
index 623660cc..40bc7ece 100644
--- a/src/locale/setlocale.c
+++ b/src/locale/setlocale.c
@@ -21,7 +21,7 @@ char *__strchrnul(const char *, int);
 
 char *setlocale(int cat, const char *name)
 {
-	static volatile int lock[2];
+	static volatile int lock[1];
 
 	if ((unsigned)cat > LC_ALL) return 0;
 
diff --git a/src/malloc/lite_malloc.c b/src/malloc/lite_malloc.c
index a7e4a9f7..701f60b4 100644
--- a/src/malloc/lite_malloc.c
+++ b/src/malloc/lite_malloc.c
@@ -11,7 +11,7 @@ void *__expand_heap(size_t *);
 static void *__simple_malloc(size_t n)
 {
 	static char *cur, *end;
-	static volatile int lock[2];
+	static volatile int lock[1];
 	size_t align=1, pad;
 	void *p;
 
diff --git a/src/misc/syslog.c b/src/misc/syslog.c
index 9dd1ddb5..60e9020f 100644
--- a/src/misc/syslog.c
+++ b/src/misc/syslog.c
@@ -11,7 +11,7 @@
 #include <fcntl.h>
 #include "libc.h"
 
-static volatile int lock[2];
+static volatile int lock[1];
 static char log_ident[32];
 static int log_opt;
 static int log_facility = LOG_USER;
diff --git a/src/prng/random.c b/src/prng/random.c
index 7d557d70..13a5e6df 100644
--- a/src/prng/random.c
+++ b/src/prng/random.c
@@ -22,7 +22,7 @@ static int n = 31;
 static int i = 3;
 static int j = 0;
 static uint32_t *x = init+1;
-static volatile int lock[2];
+static volatile int lock[1];
 
 static uint32_t lcg31(uint32_t x) {
 	return (1103515245*x + 12345) & 0x7fffffff;
diff --git a/src/stdio/ofl.c b/src/stdio/ofl.c
index b143999c..0e3602aa 100644
--- a/src/stdio/ofl.c
+++ b/src/stdio/ofl.c
@@ -2,7 +2,7 @@
 #include "libc.h"
 
 static FILE *ofl_head;
-static volatile int ofl_lock[2];
+static volatile int ofl_lock[1];
 
 FILE **__ofl_lock()
 {
diff --git a/src/thread/pthread_atfork.c b/src/thread/pthread_atfork.c
index a40d7f63..c6f77b3f 100644
--- a/src/thread/pthread_atfork.c
+++ b/src/thread/pthread_atfork.c
@@ -8,7 +8,7 @@ static struct atfork_funcs {
 	struct atfork_funcs *prev, *next;
 } *funcs;
 
-static volatile int lock[2];
+static volatile int lock[1];
 
 void __fork_handler(int who)
 {
diff --git a/src/thread/sem_open.c b/src/thread/sem_open.c
index fda0acd3..dc0279e8 100644
--- a/src/thread/sem_open.c
+++ b/src/thread/sem_open.c
@@ -20,7 +20,7 @@ static struct {
 	sem_t *sem;
 	int refcnt;
 } *semtab;
-static volatile int lock[2];
+static volatile int lock[1];
 
 #define FLAGS (O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NONBLOCK)
 
diff --git a/src/thread/synccall.c b/src/thread/synccall.c
index f6813576..ba2f258e 100644
--- a/src/thread/synccall.c
+++ b/src/thread/synccall.c
@@ -14,7 +14,7 @@ static struct chain {
 	sem_t target_sem, caller_sem;
 } *volatile head;
 
-static volatile int synccall_lock[2];
+static volatile int synccall_lock[1];
 static volatile int target_tid;
 static void (*callback)(void *), *context;
 static volatile int dummy = 0;
diff --git a/src/time/__tz.c b/src/time/__tz.c
index 8cc96032..1dbb0b8f 100644
--- a/src/time/__tz.c
+++ b/src/time/__tz.c
@@ -27,7 +27,7 @@ static char old_tz_buf[32];
 static char *old_tz = old_tz_buf;
 static size_t old_tz_size = sizeof old_tz_buf;
 
-static volatile int lock[2];
+static volatile int lock[1];
 
 static int getint(const char **p)
 {
-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 0/7] V3 of the new lock algorithm
@ 2018-01-03 13:20 Jens Gustedt
  2018-01-03 13:17 ` [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU Jens Gustedt
                   ` (6 more replies)
  0 siblings, 7 replies; 13+ messages in thread
From: Jens Gustedt @ 2018-01-03 13:20 UTC (permalink / raw)
  To: musl

This patch series implements and applies the new internal lock
algorithm.

Patch 1 is the lock implementation itself. I hope it is now fit for
1.19.

Patches 2-7 do some cleanup and apply this lock algorithm to two areas
where we previously had specialized lock implementations: condition
variables and malloc.

In all of this I am still a bit uncomfortable with the double syscalls
to the futex functions. We only do that because some older Linux
versions might not implement private futexes. I'd prefer to replace the
second (conditional) syscall with a read of a central variable that
records whether private futexes work. Such a variable could be
initialized during thread-library initialization, I think.
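
A hedged sketch of that idea (all names hypothetical, not proposed
code):

static int futex_private_broken; /* probed once during thread-library
                                    initialization */

static inline void futexwait(volatile void *addr, int val, int priv)
{
	if (priv && !futex_private_broken)
		__syscall(SYS_futex, addr, FUTEX_WAIT|FUTEX_PRIVATE, val);
	else
		__syscall(SYS_futex, addr, FUTEX_WAIT, val);
}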

Jens Gustedt (7):
  a new lock algorithm with lock value and congestion count in the same
    atomic int
  consistently use the LOCK and UNLOCK macros
  revise the definition of multiple basic locks in the code
  separate the fast parts of __lock and __unlock into a .h file that may
    be used by other TU
  use the new lock algorithm for malloc
  implement __unlock_requeue
  implement the local lock for condition variables with the new lock
    feature

 src/dirent/__dirent.h               |  2 +-
 src/exit/at_quick_exit.c            |  2 +-
 src/exit/atexit.c                   |  2 +-
 src/internal/__lock.h               | 29 ++++++++++++++++
 src/internal/libc.h                 |  3 ++
 src/internal/pthread_impl.h         | 10 ++++--
 src/locale/dcngettext.c             |  2 +-
 src/locale/locale_map.c             |  2 +-
 src/locale/setlocale.c              |  2 +-
 src/malloc/lite_malloc.c            |  2 +-
 src/malloc/malloc.c                 | 38 ++++++++-------------
 src/misc/syslog.c                   |  2 +-
 src/prng/random.c                   |  2 +-
 src/stdio/ofl.c                     |  2 +-
 src/thread/__lock.c                 | 66 +++++++++++++++++++++++++++++++++----
 src/thread/pthread_atfork.c         |  2 +-
 src/thread/pthread_cond_timedwait.c | 53 ++++++++++-------------------
 src/thread/pthread_create.c         |  6 ++--
 src/thread/pthread_detach.c         |  5 ++-
 src/thread/pthread_getschedparam.c  |  4 +--
 src/thread/pthread_kill.c           |  4 +--
 src/thread/pthread_setschedparam.c  |  4 +--
 src/thread/pthread_setschedprio.c   |  4 +--
 src/thread/sem_open.c               |  2 +-
 src/thread/synccall.c               |  2 +-
 src/time/__tz.c                     |  2 +-
 26 files changed, 156 insertions(+), 98 deletions(-)
 create mode 100644 src/internal/__lock.h

-- 
2.15.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU
  2018-01-03 13:17 ` [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU Jens Gustedt
@ 2018-01-09 17:41   ` Rich Felker
  0 siblings, 0 replies; 13+ messages in thread
From: Rich Felker @ 2018-01-09 17:41 UTC (permalink / raw)
  To: musl

On Wed, Jan 03, 2018 at 02:17:12PM +0100, Jens Gustedt wrote:
> This provides two interfaces __lock_fast and __unlock_fast that are both
> "static inline" and that should result in a better integration of the
> lock in place. The slow path of the lock algorithm remains centralized,
> here adding the overhead of a function call is not a big deal.
> 
> This must only be used directly by a TU that encapsulates all LOCK and
> UNLOCK calls of a particular lock object.
> ---
>  src/internal/__lock.h | 21 +++++++++++++++++++++
>  src/internal/libc.h   |  1 +
>  src/thread/__lock.c   | 20 +++++---------------
>  3 files changed, 27 insertions(+), 15 deletions(-)
>  create mode 100644 src/internal/__lock.h
> 
> diff --git a/src/internal/__lock.h b/src/internal/__lock.h
> new file mode 100644
> index 00000000..f16d6176
> --- /dev/null
> +++ b/src/internal/__lock.h
> @@ -0,0 +1,21 @@
> +#include "pthread_impl.h"
> +
> +static inline void __lock_fast(volatile int *l)
> +{
> +	extern void __lock_slow(volatile int*, int);
> +	if (!libc.threads_minus_1) return;
> +	/* fast path: INT_MIN for the lock, +1 for the congestion */
> +	int current = a_cas(l, 0, INT_MIN + 1);
> +	if (!current) return;
> +	__lock_slow(l, current);
> +}

I think I'd like to hold off on this one for now until we study the
impact. Burying the libc.threads_minus_1 check inside an external
function was actually something of a trick to prevent the caller from
needing a GOT pointer; on archs where loading the GOT pointer is a
pain, putting the load in the caller may badly un-shrinkwrap the
function.

In practice, that probably doesn't work correctly right now because
too many internal functions are missing the hidden attribute, and too
many of the affected functions call public functions (which the
compiler was able to assume didn't go through the PLT when we had the
vis.h hack, but that's off by default now because it was a bad idea).
So figuring out whether this sort of shrinkwrap assistance is still
salvageable or just needs to be thrown out is an open question.

> +static inline void __unlock_fast(volatile int *l)
> +{
> +	/* Check l[0] to see if we are multi-threaded. */
> +	if (l[0] < 0) {
> +		if (a_fetch_add(l, -(INT_MIN + 1)) != (INT_MIN + 1)) {
> +			__wake(l, 1, 1);
> +		}
> +	}
> +}

This one probably already makes sense to inline.

> diff --git a/src/internal/libc.h b/src/internal/libc.h
> index 5e145183..a594d0c5 100644
> --- a/src/internal/libc.h
> +++ b/src/internal/libc.h
> @@ -47,6 +47,7 @@ extern size_t __sysinfo ATTR_LIBC_VISIBILITY;
>  extern char *__progname, *__progname_full;
>  
>  /* Designed to avoid any overhead in non-threaded processes */
> +void __lock_slow(volatile int *, int) ATTR_LIBC_VISIBILITY;
>  void __lock(volatile int *) ATTR_LIBC_VISIBILITY;
>  void __unlock(volatile int *) ATTR_LIBC_VISIBILITY;
>  int __lockfile(FILE *) ATTR_LIBC_VISIBILITY;
> diff --git a/src/thread/__lock.c b/src/thread/__lock.c
> index 45557c88..a3d8d4d0 100644
> --- a/src/thread/__lock.c
> +++ b/src/thread/__lock.c
> @@ -1,4 +1,5 @@
>  #include "pthread_impl.h"
> +#include "__lock.h"
>  
>  /* This lock primitive combines a flag (in the sign bit) and a
>   * congestion count (= threads inside the critical section, CS) in a
> @@ -16,12 +17,11 @@
>   * with INT_MIN as a lock flag.
>   */
>  
> -void __lock(volatile int *l)
> +weak_alias(__lock_fast, __lock);
> +weak_alias(__unlock_fast, __unlock);
> +
> +void __lock_slow(volatile int *l, int current)
>  {
> -	if (!libc.threads_minus_1) return;
> -	/* fast path: INT_MIN for the lock, +1 for the congestion */
> -	int current = a_cas(l, 0, INT_MIN + 1);
> -	if (!current) return;
>  	/* A first spin loop, for medium congestion. */
>  	for (unsigned i = 0; i < 10; ++i) {
>  		if (current < 0) current -= INT_MIN + 1;
> @@ -48,13 +48,3 @@ void __lock(volatile int *l)
>  		current = val;
>  	}
>  }
> -
> -void __unlock(volatile int *l)
> -{
> -	/* Check l[0] to see if we are multi-threaded. */
> -	if (l[0] < 0) {
> -		if (a_fetch_add(l, -(INT_MIN + 1)) != (INT_MIN + 1)) {
> -			__wake(l, 1, 1);
> -		}
> -	}
> -}
> -- 
> 2.15.1

I'm missing the mechanism by which the new __lock_fast and
__unlock_fast ever get inlined. __lock.h does not seem to be included
anywhere. In order to make this work right, some refactoring of what
should be exposed by libc.h vs pthread_impl.h probably needs to be
discussed. I think the factoring right now is fairly wrong; lock and
futex primitives aren't really part of the "pthread implementation" but
general stuff libc needs internally.

Rich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/7] use the new lock algorithm for malloc
  2018-01-03 13:17 ` [PATCH 5/7] use the new lock algorithm for malloc Jens Gustedt
@ 2018-01-09 17:42   ` Rich Felker
  2018-01-09 18:58     ` Jens Gustedt
  0 siblings, 1 reply; 13+ messages in thread
From: Rich Felker @ 2018-01-09 17:42 UTC (permalink / raw)
  To: musl

On Wed, Jan 03, 2018 at 02:17:12PM +0100, Jens Gustedt wrote:
> Malloc used a specialized lock implementation in many places. Now that we
> have a generic lock that has the desired properties, we should just use
> this, instead of this multitude of very similar lock mechanisms.
> ---
>  src/malloc/malloc.c | 38 +++++++++++++-------------------------
>  1 file changed, 13 insertions(+), 25 deletions(-)
> 
> diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
> index 9e05e1d6..6c667a5a 100644
> --- a/src/malloc/malloc.c
> +++ b/src/malloc/malloc.c
> @@ -13,6 +13,8 @@
>  #define inline inline __attribute__((always_inline))
>  #endif
>  
> +#include "__lock.h"
> +

Ah, I see -- maybe you deemed malloc to be the only place where
inlining for the sake of speed made sense? That's probably true.

Rich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/7] use the new lock algorithm for malloc
  2018-01-09 17:42   ` Rich Felker
@ 2018-01-09 18:58     ` Jens Gustedt
  2018-01-09 19:26       ` Rich Felker
  0 siblings, 1 reply; 13+ messages in thread
From: Jens Gustedt @ 2018-01-09 18:58 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1432 bytes --]

Hello Rich,

On Tue, 9 Jan 2018 12:42:34 -0500 Rich Felker <dalias@libc.org> wrote:

> On Wed, Jan 03, 2018 at 02:17:12PM +0100, Jens Gustedt wrote:
> > Malloc used a specialized lock implementation in many places. Now
> > that we have a generic lock that has the desired properties, we
> > should just use this, instead of this multitude of very similar
> > lock mechanisms. ---
> >  src/malloc/malloc.c | 38 +++++++++++++-------------------------
> >  1 file changed, 13 insertions(+), 25 deletions(-)
> > 
> > diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
> > index 9e05e1d6..6c667a5a 100644
> > --- a/src/malloc/malloc.c
> > +++ b/src/malloc/malloc.c
> > @@ -13,6 +13,8 @@
> >  #define inline inline __attribute__((always_inline))
> >  #endif
> >  
> > +#include "__lock.h"
> > +  
> 
> Ah, I see -- maybe you deemed malloc to be the only place where
> inlining for the sake of speed made sense? That's probably true.

Yes, and also I was trying to be conservative. Previously, the lock
functions for malloc resided in the same TU, so they were probably
inlined most of the time.

Jens

-- 
:: INRIA Nancy Grand Est ::: Camus ::::::: ICube/ICPS :::
:: ::::::::::::::: office Strasbourg : +33 368854536   ::
:: :::::::::::::::::::::: gsm France : +33 651400183   ::
:: ::::::::::::::: gsm international : +49 15737185122 ::
:: http://icube-icps.unistra.fr/index.php/Jens_Gustedt ::

[-- Attachment #2: Digitale Signatur von OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/7] use the new lock algorithm for malloc
  2018-01-09 18:58     ` Jens Gustedt
@ 2018-01-09 19:26       ` Rich Felker
  2018-09-18 19:23         ` Rich Felker
  0 siblings, 1 reply; 13+ messages in thread
From: Rich Felker @ 2018-01-09 19:26 UTC (permalink / raw)
  To: musl

On Tue, Jan 09, 2018 at 07:58:51PM +0100, Jens Gustedt wrote:
> Hello Rich,
> 
> On Tue, 9 Jan 2018 12:42:34 -0500 Rich Felker <dalias@libc.org> wrote:
> 
> > On Wed, Jan 03, 2018 at 02:17:12PM +0100, Jens Gustedt wrote:
> > > Malloc used a specialized lock implementation in many places. Now
> > > that we have a generic lock that has the desired properties, we
> > > should just use this, instead of this multitude of very similar
> > > lock mechanisms. ---
> > >  src/malloc/malloc.c | 38 +++++++++++++-------------------------
> > >  1 file changed, 13 insertions(+), 25 deletions(-)
> > > 
> > > diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
> > > index 9e05e1d6..6c667a5a 100644
> > > --- a/src/malloc/malloc.c
> > > +++ b/src/malloc/malloc.c
> > > @@ -13,6 +13,8 @@
> > >  #define inline inline __attribute__((always_inline))
> > >  #endif
> > >  
> > > +#include "__lock.h"
> > > +  
> > 
> > Ah, I see -- maybe you deemed malloc to be the only place where
> > inlining for the sake of speed made sense? That's probably true.
> 
> Yes, and also I was trying to be conservative. Previously, the lock
> functions for malloc resided in the same TU, so they were probably
> inlined most of the time.

Yes, and that was done because (at least at the time) it made a
significant empirical difference. So I suspect it makes sense to do
the same still. I've queued your patches 1-3 for inclusion in my next
push unless I see any major problem. I might try to get the rest
included too, but since I'm behind on this release cycle, we'll
see.

Thanks for all your work on this and patience. :)

Rich



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/7] use the new lock algorithm for malloc
  2018-01-09 19:26       ` Rich Felker
@ 2018-09-18 19:23         ` Rich Felker
  0 siblings, 0 replies; 13+ messages in thread
From: Rich Felker @ 2018-09-18 19:23 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 2591 bytes --]

On Tue, Jan 09, 2018 at 02:26:44PM -0500, Rich Felker wrote:
> On Tue, Jan 09, 2018 at 07:58:51PM +0100, Jens Gustedt wrote:
> > Hello Rich,
> > 
> > On Tue, 9 Jan 2018 12:42:34 -0500 Rich Felker <dalias@libc.org> wrote:
> > 
> > > On Wed, Jan 03, 2018 at 02:17:12PM +0100, Jens Gustedt wrote:
> > > > Malloc used a specialized lock implementation in many places. Now
> > > > that we have a generic lock that has the desired properties, we
> > > > should just use this, instead of this multitude of very similar
> > > > lock mechanisms. ---
> > > >  src/malloc/malloc.c | 38 +++++++++++++-------------------------
> > > >  1 file changed, 13 insertions(+), 25 deletions(-)
> > > > 
> > > > diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
> > > > index 9e05e1d6..6c667a5a 100644
> > > > --- a/src/malloc/malloc.c
> > > > +++ b/src/malloc/malloc.c
> > > > @@ -13,6 +13,8 @@
> > > >  #define inline inline __attribute__((always_inline))
> > > >  #endif
> > > >  
> > > > +#include "__lock.h"
> > > > +  
> > > 
> > > Ah, I see -- maybe you deemed malloc to be the only place where
> > > inlining for the sake of speed made sense? That's probably true.
> > 
> > Yes, and also I was trying to be conservative. Previously, the lock
> > functions for malloc resided in the same TU, so they were probably
> > inlined most of the time.
> 
> Yes, and that was done because (at least at the time) it made a
> significant empirical difference. So I suspect it makes sense to do
> the same still. I've queued your patches 1-3 for inclusion in my next
> push unless I see any major problem. I might try to get the rest
> included too but being that I'm behind on this release cycle we'll
> see..
> 
> Thanks for all your work on this and patience. :)

I'm just coming back to look at this, and I can't get the new lock to
perform comparably well to the current one, much less better, in
malloc. I suspect the benefit of just being able to do a store and
relaxed read on x86 for the unlock is too great to beat. Note that I
just fixed a bug related to this on powerpc64 in commit
12817793301398241b6cb00c740f0d3ca41076e9 and I expect the performance
properties might be reversed on non-x86 archs.

I did have to hack it in since the patch from this series no longer
directly applies, and I just did it inline as a test, but I don't
think I did anything wrong there; it's attached for reference.

I'm also attaching the (very old) malloc_stress.c I used to measure. I
noticed the greatest differences running it with test #3 and 4 threads
(./malloc_stress 3 4), where 4 is the number of cores.

Rich

[-- Attachment #2: malloc-newlock.diff --]
[-- Type: text/plain, Size: 783 bytes --]

diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
index 9698259..0ad540f 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/malloc.c
@@ -6,6 +6,7 @@
 #include <errno.h>
 #include <sys/mman.h>
 #include "libc.h"
+#include "lock.h"
 #include "atomic.h"
 #include "pthread_impl.h"
 #include "malloc_impl.h"
@@ -26,12 +27,18 @@ int __malloc_replaced;
 
 static inline void lock(volatile int *lk)
 {
+	if (!libc.threads_minus_1 || !a_cas(lk, 0, INT_MIN + 1)) return;
+	LOCK(lk);
+	return;
 	if (libc.threads_minus_1)
 		while(a_swap(lk, 1)) __wait(lk, lk+1, 1, 1);
 }
 
 static inline void unlock(volatile int *lk)
 {
+	if (lk[0] < 0 && a_fetch_add(lk, -(INT_MIN + 1)) != (INT_MIN + 1))
+		__wake(lk, 1, 1);
+	return;
 	if (lk[0]) {
 		a_store(lk, 0);
 		if (lk[1]) __wake(lk, 1, 1);

[-- Attachment #3: malloc_stress.c --]
[-- Type: text/plain, Size: 3769 bytes --]

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <pthread.h>

#define MAX_NTH 160
#define LOOPS 100000
#define SH_COUNT 300
#define PV_COUNT 300
#define MAX_SZ 500//00
#define DEF_SZ 40//00
#define TEST test4

struct {
	pthread_mutex_t lock;
	void *mem;
} foo[SH_COUNT];

static unsigned rng(unsigned *r)
{
	return *r = *r * 1103515245 + 12345;
}

static void *test1(void *unused)
{
	void *p[PV_COUNT] = { 0 };
	unsigned sz[PV_COUNT] = { 0 };
	unsigned r = (unsigned)pthread_self();
	int i, j;
	size_t k;

	for (i=0; i<LOOPS; i++) {
		j = rng(&r) % PV_COUNT;
		k = rng(&r) % MAX_SZ;
		if (p[j]) free(p[j]), p[j]=0, sz[j]=0;
		else p[j] = malloc(sz[j]=k);
	}
	for (j=0, k=0; j<PV_COUNT; j++) k+=sz[j];
	return (void *)k;
}

static void *test2(void *unused)
{
	void *p[PV_COUNT] = { 0 };
	unsigned r = (unsigned)pthread_self();
	int i, j;
	size_t k;

	for (i=0; i<LOOPS; i++) {
		j = rng(&r) % PV_COUNT;
		if (p[j]) free(p[j]), p[j]=0;
		else p[j] = malloc(DEF_SZ);
	}
	for (j=0, k=0; j<PV_COUNT; j++) if (p[j]) k+=DEF_SZ;
	return (void *)k;
}

static void *test3(void *unused)
{
	unsigned r = (unsigned)pthread_self();
	int i, j;
	size_t sz;
	void *p;

	for (i=0; i<LOOPS; i++) {
		j = rng(&r) % SH_COUNT;
		sz = rng(&r) % MAX_SZ;
		pthread_mutex_lock(&foo[j].lock);
		p = foo[j].mem;
		foo[j].mem = 0;
		pthread_mutex_unlock(&foo[j].lock);
		free(p);
		if (!p) {
			p = malloc(sz);
			pthread_mutex_lock(&foo[j].lock);
			if (!foo[j].mem) foo[j].mem = p, p = 0;
			pthread_mutex_unlock(&foo[j].lock);
			free(p);
		}
	}
	return 0;
}

static void *test4(void *unused)
{
	void *p[PV_COUNT];
	size_t i;
	for (i=0; i<PV_COUNT; i++) p[i] = malloc(DEF_SZ);
	for (i=0; i<PV_COUNT-1; i++) free(p[i]);
	return (void *)(DEF_SZ);
}

static void *test5(void *unused)
{
	void *p[PV_COUNT];
	size_t i;
	for (i=0; i<PV_COUNT; i++) p[i] = malloc(DEF_SZ);
	return (void *)(DEF_SZ*PV_COUNT);
}

static void *(*const tests[])(void *) = { 0, test1, test2, test3, test4, test5 };

int main(int argc, char **argv)
{
	pthread_t t[MAX_NTH];
	void *res;
	int i;
	int maj, min, in_heap=0;
	FILE *f;
	char buf[128];
	struct timespec tv_init, tv;
	unsigned long l;
	size_t vm_size=0, vm_rss=0, vm_priv_clean=0, vm_priv_dirty=0;
	size_t l_size=0;
	int testid = 1;
	int nth = 1;
	void *p[1024];

	if (argc>1) testid = atoi(argv[1]);
	if (argc>2) nth = atoi(argv[2]);

#if 0
	for (i=0; i<1024; i++)
		p[i] = malloc(32768);
	for (i=0; i<1024; i++)
		free(p[i]);
#endif

	clock_gettime(CLOCK_REALTIME, &tv_init);

	for (i=0; i<nth-1; i++)
		if (pthread_create(t+i, 0, tests[testid], 0)) {
			perror("pthread_create");
			exit(1);
		}
	res = tests[testid](0);
	l_size += (size_t)res;
	for (i=0; i<nth-1; i++) {
		pthread_join(t[i], &res);
		l_size += (size_t)res;
	}
	l_size += 1023;
	l_size >>= 10;

	clock_gettime(CLOCK_REALTIME, &tv);
	tv.tv_sec -= tv_init.tv_sec;
	if ((tv.tv_nsec -= tv_init.tv_nsec) < 0) {
		tv.tv_nsec += 1000000000;
		tv.tv_sec--;
	}
	f = fopen("/proc/self/smaps", "rb");
	if (f) while (fgets(buf, sizeof buf, f)) {
		if (sscanf(buf, "%*lx-%*lx %*s %*lx %x:%x %*lu %*s", &maj, &min)==2)
			in_heap = (!maj && !min && !strstr(buf, "---p") && (strstr(buf, "[heap]") || !strchr(buf, '[')));
		if (in_heap) {
			if (sscanf(buf, "Size: %lu", &l)==1) vm_size += l;
			else if (sscanf(buf, "Rss: %lu", &l)==1) vm_rss += l;
			else if (sscanf(buf, "Private_Clean: %lu", &l)==1) vm_priv_clean += l;
			else if (sscanf(buf, "Private_Dirty: %lu", &l)==1) vm_priv_dirty += l;
		}
	}
	if (f) fclose(f);
	printf("%ld.%.9ld seconds elapsed\n", tv.tv_sec, tv.tv_nsec);
	printf("logical: %zu, vss: %zu, rss: %zu, clean: %zu, dirty: %zu\n",
		l_size, vm_size, vm_rss, vm_priv_clean, vm_priv_dirty);
	//dump_heap();
	return 0;
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-09-18 19:23 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-03 13:20 [PATCH 0/7] V3 of the new lock algorithm Jens Gustedt
2018-01-03 13:17 ` [PATCH 4/7] separate the fast parts of __lock and __unlock into a .h file that may be used by other TU Jens Gustedt
2018-01-09 17:41   ` Rich Felker
2018-01-03 13:17 ` [PATCH 7/7] implement the local lock for condition variables with the new lock feature Jens Gustedt
2018-01-03 13:17 ` [PATCH 2/7] consistently use the LOCK and UNLOCK macros Jens Gustedt
2018-01-03 13:17 ` [PATCH 5/7] use the new lock algorithm for malloc Jens Gustedt
2018-01-09 17:42   ` Rich Felker
2018-01-09 18:58     ` Jens Gustedt
2018-01-09 19:26       ` Rich Felker
2018-09-18 19:23         ` Rich Felker
2018-01-03 13:17 ` [PATCH 3/7] revise the definition of multiple basic locks in the code Jens Gustedt
2018-01-03 13:17 ` [PATCH 1/7] a new lock algorithm with lock value and congestion count in the same atomic int Jens Gustedt
2018-01-03 13:17 ` [PATCH 6/7] implement __unlock_requeue Jens Gustedt

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/
