* Re: Re: 答复: [musl] musl: about malloc 'expand heap' issue
2018-10-31 3:44 ` Rich Felker
@ 2019-04-12 22:51 ` Rich Felker
0 siblings, 0 replies; 5+ messages in thread
From: Rich Felker @ 2019-04-12 22:51 UTC (permalink / raw)
To: musl
Cc: zhangwentao (M), Jianing (OS-LAB), Huangqiang (H),
leijitang, wanghaozhan
[-- Attachment #1: Type: text/plain, Size: 2557 bytes --]
On Tue, Oct 30, 2018 at 11:44:36PM -0400, Rich Felker wrote:
> On Wed, Oct 31, 2018 at 01:19:11AM +0000, zhangwentao (M) wrote:
> > Hi
> >
> > Now we are using a global lock to solve the issue. And as you said the performance maybe cost too much.
> > And if you have solution about this and the fix is not that expensive, that's great.
> > If you finish it, would you send the patch to me ?
>
> If I get a solution that's acceptable for upstream, it will be
> committed in git master and I'll follow up on this thread to mention
> it.
>
> Unfortunately, looking again at where the spurious empty-bin situation
> happens (as a result of alloc_rev/_fwd during free), I don't see an
> easy fix that's not costly. The best we can do might be something like
> this:
>
> In malloc, only use the per-bin lock to obtain a chunk if the
> exact-sized bin is non-empty. If it's empty, take a global lock
> (free_lock, but its semantics might need to be adjusted somewhat) to
> ensure a consistent view of what larger free chunks are available for
> splitting (which might end up revealing that a chunk of the exact
> desired bin was actually available). Ideally this will not have much
> impact on malloc performance under conditions where lots of free
> chunks are available (heavy load).
Here's a (WIP, with some debug mess left around) patch along the lines
I described above, to mitigate the badness of the current malloc. My
old malloc stress test, which is probably not representative of
real-world usage patterns, shows it being somewhat costly, but that's
expected, and is mainly an indication that the overall design is not
good. I'm attaching it here for initial review/comments, on the
existing thread since I know the ppl cc'd are interested. I'll follow
up in another thread later too.
In the process of doing this patch, I realized it probably is possible
to do the original dlmalloc-variant idea I had in mind before musl was
even released: putting locks in each chunk header/footer, and rather
than having a global split/merge lock, locking the header/footer
whenever we want to split or merge (change the sizes of) a chunk. I'm
still not 100% sure -- there's some locking order issue between the
ideal order for freeing/merging vs allocating/splitting. But it may be
a way to get some more life out of the old malloc, if it turns out the
attached patch is deemed too costly to apply as-is. It would also be a
good bit of work redesigning header structures, etc., at which point
it might make more sense to work on the new design instead.
Rich
[-- Attachment #2: malloc_mitigations.diff --]
[-- Type: text/plain, Size: 9740 bytes --]
diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
index 9698259..6b0dac3 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/malloc.c
@@ -17,7 +17,7 @@
static struct {
volatile uint64_t binmap;
struct bin bins[64];
- volatile int free_lock[2];
+ volatile int split_merge_lock[2];
} mal;
int __malloc_replaced;
@@ -102,16 +102,21 @@ static int bin_index_up(size_t x)
return bin_tab[x/128-4] + 17;
}
-#if 0
-void __dump_heap(int x)
+static void *heap_base;
+#if 1
+void __dump_heap()
{
struct chunk *c;
int i;
- for (c = (void *)mal.heap; CHUNK_SIZE(c); c = NEXT_CHUNK(c))
+ for (c = (void *)heap_base; CHUNK_SIZE(c); c = NEXT_CHUNK(c)) {
fprintf(stderr, "base %p size %zu (%d) flags %d/%d\n",
c, CHUNK_SIZE(c), bin_index(CHUNK_SIZE(c)),
c->csize & 15,
NEXT_CHUNK(c)->psize & 15);
+ if (CHUNK_PSIZE(NEXT_CHUNK(c)) != CHUNK_SIZE(c))
+ fprintf(stderr, "!! next chunk psize is wrong: %zu\n",
+ CHUNK_PSIZE(NEXT_CHUNK(c)));
+ }
for (i=0; i<64; i++) {
if (mal.bins[i].head != BIN_TO_CHUNK(i) && mal.bins[i].head) {
fprintf(stderr, "bin %d: %p\n", i, mal.bins[i].head);
@@ -125,7 +130,6 @@ void __dump_heap(int x)
static struct chunk *expand_heap(size_t n)
{
- static int heap_lock[2];
static void *end;
void *p;
struct chunk *w;
@@ -135,13 +139,8 @@ static struct chunk *expand_heap(size_t n)
* we need room for an extra zero-sized sentinel chunk. */
n += SIZE_ALIGN;
- lock(heap_lock);
-
p = __expand_heap(&n);
- if (!p) {
- unlock(heap_lock);
- return 0;
- }
+ if (!p) return 0;
/* If not just expanding existing space, we need to make a
* new sentinel chunk below the allocated space. */
@@ -164,7 +163,7 @@ static struct chunk *expand_heap(size_t n)
w = MEM_TO_CHUNK(p);
w->csize = n | C_INUSE;
- unlock(heap_lock);
+ if (!heap_base) heap_base = w;
return w;
}
@@ -195,96 +194,44 @@ static void unbin(struct chunk *c, int i)
NEXT_CHUNK(c)->psize |= C_INUSE;
}
-static int alloc_fwd(struct chunk *c)
-{
- int i;
- size_t k;
- while (!((k=c->csize) & C_INUSE)) {
- i = bin_index(k);
- lock_bin(i);
- if (c->csize == k) {
- unbin(c, i);
- unlock_bin(i);
- return 1;
- }
- unlock_bin(i);
- }
- return 0;
-}
-
-static int alloc_rev(struct chunk *c)
+static void bin_chunk(struct chunk *self, int i)
{
- int i;
- size_t k;
- while (!((k=c->psize) & C_INUSE)) {
- i = bin_index(k);
- lock_bin(i);
- if (c->psize == k) {
- unbin(PREV_CHUNK(c), i);
- unlock_bin(i);
- return 1;
- }
- unlock_bin(i);
- }
- return 0;
+ self->next = BIN_TO_CHUNK(i);
+ self->prev = mal.bins[i].tail;
+ self->next->prev = self;
+ self->prev->next = self;
+ if (self->prev == BIN_TO_CHUNK(i))
+ a_or_64(&mal.binmap, 1ULL<<i);
}
-
-/* pretrim - trims a chunk _prior_ to removing it from its bin.
- * Must be called with i as the ideal bin for size n, j the bin
- * for the _free_ chunk self, and bin j locked. */
-static int pretrim(struct chunk *self, size_t n, int i, int j)
+static void trim(struct chunk *self, size_t n)
{
- size_t n1;
+ size_t n1 = CHUNK_SIZE(self);
struct chunk *next, *split;
- /* We cannot pretrim if it would require re-binning. */
- if (j < 40) return 0;
- if (j < i+3) {
- if (j != 63) return 0;
- n1 = CHUNK_SIZE(self);
- if (n1-n <= MMAP_THRESHOLD) return 0;
- } else {
- n1 = CHUNK_SIZE(self);
- }
- if (bin_index(n1-n) != j) return 0;
+ if (n >= n1 - DONTCARE) return;
next = NEXT_CHUNK(self);
split = (void *)((char *)self + n);
- split->prev = self->prev;
- split->next = self->next;
- split->prev->next = split;
- split->next->prev = split;
split->psize = n | C_INUSE;
split->csize = n1-n;
next->psize = n1-n;
self->csize = n | C_INUSE;
- return 1;
-}
-static void trim(struct chunk *self, size_t n)
-{
- size_t n1 = CHUNK_SIZE(self);
- struct chunk *next, *split;
+ int i = bin_index(n1-n);
+ lock_bin(i);
- if (n >= n1 - DONTCARE) return;
+ bin_chunk(split, i);
- next = NEXT_CHUNK(self);
- split = (void *)((char *)self + n);
-
- split->psize = n | C_INUSE;
- split->csize = n1-n | C_INUSE;
- next->psize = n1-n | C_INUSE;
- self->csize = n | C_INUSE;
-
- __bin_chunk(split);
+ unlock_bin(i);
}
void *malloc(size_t n)
{
struct chunk *c;
int i, j;
+ uint64_t mask;
if (adjust_size(&n) < 0) return 0;
@@ -300,33 +247,38 @@ void *malloc(size_t n)
}
i = bin_index_up(n);
- for (;;) {
- uint64_t mask = mal.binmap & -(1ULL<<i);
- if (!mask) {
- c = expand_heap(n);
- if (!c) return 0;
- if (alloc_rev(c)) {
- struct chunk *x = c;
- c = PREV_CHUNK(c);
- NEXT_CHUNK(x)->psize = c->csize =
- x->csize + CHUNK_SIZE(c);
- }
- break;
+ if (i<63 && (mal.binmap & (1ULL<<i))) {
+ lock_bin(i);
+ c = mal.bins[i].head;
+ if (c != BIN_TO_CHUNK(i)) {
+ unbin(c, i);
+ unlock_bin(i);
+ return CHUNK_TO_MEM(c);
}
+ unlock_bin(i);
+ }
+ lock(mal.split_merge_lock);
+ for (mask = mal.binmap & -(1ULL<<i); mask; mask -= (mask&-mask)) {
j = first_set(mask);
lock_bin(j);
c = mal.bins[j].head;
if (c != BIN_TO_CHUNK(j)) {
- if (!pretrim(c, n, i, j)) unbin(c, j);
+ unbin(c, j);
unlock_bin(j);
break;
}
unlock_bin(j);
}
-
- /* Now patch up in case we over-allocated */
+ if (!mask) {
+ c = expand_heap(n);
+ if (!c) {
+ unlock(mal.split_merge_lock);
+ return 0;
+ }
+ j = 63;
+ }
trim(c, n);
-
+ unlock(mal.split_merge_lock);
return CHUNK_TO_MEM(c);
}
@@ -379,6 +331,8 @@ void *realloc(void *p, size_t n)
self = MEM_TO_CHUNK(p);
n1 = n0 = CHUNK_SIZE(self);
+ if (n<=n0 && n0-n<=DONTCARE) return p;
+
if (IS_MMAPPED(self)) {
size_t extra = self->psize;
char *base = (char *)self - extra;
@@ -405,27 +359,24 @@ void *realloc(void *p, size_t n)
/* Crash on corrupted footer (likely from buffer overflow) */
if (next->psize != self->csize) a_crash();
- /* Merge adjacent chunks if we need more space. This is not
- * a waste of time even if we fail to get enough space, because our
- * subsequent call to free would otherwise have to do the merge. */
- if (n > n1 && alloc_fwd(next)) {
- n1 += CHUNK_SIZE(next);
- next = NEXT_CHUNK(next);
- }
- /* FIXME: find what's wrong here and reenable it..? */
- if (0 && n > n1 && alloc_rev(self)) {
- self = PREV_CHUNK(self);
- n1 += CHUNK_SIZE(self);
- }
- self->csize = n1 | C_INUSE;
- next->psize = n1 | C_INUSE;
+ lock(mal.split_merge_lock);
- /* If we got enough space, split off the excess and return */
- if (n <= n1) {
- //memmove(CHUNK_TO_MEM(self), p, n0-OVERHEAD);
- trim(self, n);
- return CHUNK_TO_MEM(self);
+ size_t nsize = next->csize & C_INUSE ? 0 : CHUNK_SIZE(next);
+ if (n0+nsize >= n) {
+ int i = bin_index(nsize);
+ lock_bin(i);
+ if (!(next->csize & C_INUSE)) {
+ unbin(next, i);
+ unlock_bin(i);
+ next = NEXT_CHUNK(next);
+ self->csize = next->psize = n0+nsize | C_INUSE;
+ trim(self, n);
+ unlock(mal.split_merge_lock);
+ return CHUNK_TO_MEM(self);
+ }
+ unlock_bin(i);
}
+ unlock(mal.split_merge_lock);
copy_realloc:
/* As a last resort, allocate a new chunk and copy to it. */
@@ -440,70 +391,58 @@ copy_free_ret:
void __bin_chunk(struct chunk *self)
{
struct chunk *next = NEXT_CHUNK(self);
- size_t final_size, new_size, size;
- int reclaim=0;
- int i;
-
- final_size = new_size = CHUNK_SIZE(self);
/* Crash on corrupted footer (likely from buffer overflow) */
if (next->psize != self->csize) a_crash();
- for (;;) {
- if (self->psize & next->csize & C_INUSE) {
- self->csize = final_size | C_INUSE;
- next->psize = final_size | C_INUSE;
- i = bin_index(final_size);
- lock_bin(i);
- lock(mal.free_lock);
- if (self->psize & next->csize & C_INUSE)
- break;
- unlock(mal.free_lock);
- unlock_bin(i);
- }
+ lock(mal.split_merge_lock);
- if (alloc_rev(self)) {
- self = PREV_CHUNK(self);
- size = CHUNK_SIZE(self);
- final_size += size;
- if (new_size+size > RECLAIM && (new_size+size^size) > size)
- reclaim = 1;
- }
+ size_t osize = CHUNK_SIZE(self), size = osize;
+
+ /* Since we hold split_merge_lock, only transition from free to
+ * in-use can race; in-use to free is impossible */
+ size_t psize = self->psize & C_INUSE ? 0 : CHUNK_PSIZE(self);
+ size_t nsize = next->csize & C_INUSE ? 0 : CHUNK_SIZE(next);
- if (alloc_fwd(next)) {
- size = CHUNK_SIZE(next);
- final_size += size;
- if (new_size+size > RECLAIM && (new_size+size^size) > size)
- reclaim = 1;
+ if (psize) {
+ int i = bin_index(psize);
+ lock_bin(i);
+ if (!(self->psize & C_INUSE)) {
+ struct chunk *prev = PREV_CHUNK(self);
+ unbin(prev, i);
+ self = prev;
+ size += psize;
+ }
+ unlock_bin(i);
+ }
+ if (nsize) {
+ int i = bin_index(nsize);
+ lock_bin(i);
+ if (!(next->csize & C_INUSE)) {
+ unbin(next, i);
next = NEXT_CHUNK(next);
+ size += nsize;
}
+ unlock_bin(i);
}
- if (!(mal.binmap & 1ULL<<i))
- a_or_64(&mal.binmap, 1ULL<<i);
+ int j = bin_index(size);
+ lock_bin(j);
- self->csize = final_size;
- next->psize = final_size;
- unlock(mal.free_lock);
+ self->csize = size;
+ next->psize = size;
+ bin_chunk(self, j);
+ unlock(mal.split_merge_lock);
- self->next = BIN_TO_CHUNK(i);
- self->prev = mal.bins[i].tail;
- self->next->prev = self;
- self->prev->next = self;
-
- /* Replace middle of large chunks with fresh zero pages */
- if (reclaim) {
+#if 0
+ if (size > RECLAIM && osize+size^osize > size) {
uintptr_t a = (uintptr_t)self + SIZE_ALIGN+PAGE_SIZE-1 & -PAGE_SIZE;
uintptr_t b = (uintptr_t)next - SIZE_ALIGN & -PAGE_SIZE;
-#if 1
__madvise((void *)a, b-a, MADV_DONTNEED);
-#else
- __mmap((void *)a, b-a, PROT_READ|PROT_WRITE,
- MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
-#endif
}
+#endif
- unlock_bin(i);
+ unlock_bin(j);
}
static void unmap_chunk(struct chunk *self)
^ permalink raw reply [flat|nested] 5+ messages in thread