From 67b1c68f1d7bd7f343e9778f3a19c1495e15116b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gonzalo=20Tornar=C3=ADa?= Date: Tue, 26 Dec 2023 19:28:39 -0300 Subject: [PATCH] glibc: fix memalign performance regression --- ...1105852568c3ebc712225ae78b8c8ba31a78.patch | 296 ++++++++++++++++++ ...fc1cf094406a138e4d1bcf9553e59edcf89d.patch | 252 +++++++++++++++ srcpkgs/glibc/template | 2 +- 3 files changed, 549 insertions(+), 1 deletion(-) create mode 100644 srcpkgs/glibc/patches/30723-1-542b1105852568c3ebc712225ae78b8c8ba31a78.patch create mode 100644 srcpkgs/glibc/patches/30723-2-0dc7fc1cf094406a138e4d1bcf9553e59edcf89d.patch diff --git a/srcpkgs/glibc/patches/30723-1-542b1105852568c3ebc712225ae78b8c8ba31a78.patch b/srcpkgs/glibc/patches/30723-1-542b1105852568c3ebc712225ae78b8c8ba31a78.patch new file mode 100644 index 0000000000000..56d5d47c031a0 --- /dev/null +++ b/srcpkgs/glibc/patches/30723-1-542b1105852568c3ebc712225ae78b8c8ba31a78.patch @@ -0,0 +1,296 @@ +From 542b1105852568c3ebc712225ae78b8c8ba31a78 Mon Sep 17 00:00:00 2001 +From: Florian Weimer +Date: Fri, 11 Aug 2023 11:18:17 +0200 +Subject: [PATCH] malloc: Enable merging of remainders in memalign (bug 30723) + +Previously, calling _int_free from _int_memalign could put remainders +into the tcache or into fastbins, where they are invisible to the +low-level allocator. This results in missed merge opportunities +because once these freed chunks become available to the low-level +allocator, further memalign allocations (even of the same size are) +likely obstructing merges. + +Furthermore, during forwards merging in _int_memalign, do not +completely give up when the remainder is too small to serve as a +chunk on its own. We can still give it back if it can be merged +with the following unused chunk. This makes it more likely that +memalign calls in a loop achieve a compact memory layout, +independently of initial heap layout. + +Drop some useless (unsigned long) casts along the way, and tweak +the style to more closely match GNU on changed lines. + +Reviewed-by: DJ Delorie +--- + malloc/malloc.c | 197 +++++++++++++++++++++++++++++------------------- + 1 file changed, 121 insertions(+), 76 deletions(-) + +diff --git a/malloc/malloc.c b/malloc/malloc.c +index e2f1a615a4..948f9759af 100644 +--- a/malloc/malloc.c ++++ b/malloc/malloc.c +@@ -1086,6 +1086,11 @@ typedef struct malloc_chunk* mchunkptr; + + static void* _int_malloc(mstate, size_t); + static void _int_free(mstate, mchunkptr, int); ++static void _int_free_merge_chunk (mstate, mchunkptr, INTERNAL_SIZE_T); ++static INTERNAL_SIZE_T _int_free_create_chunk (mstate, ++ mchunkptr, INTERNAL_SIZE_T, ++ mchunkptr, INTERNAL_SIZE_T); ++static void _int_free_maybe_consolidate (mstate, INTERNAL_SIZE_T); + static void* _int_realloc(mstate, mchunkptr, INTERNAL_SIZE_T, + INTERNAL_SIZE_T); + static void* _int_memalign(mstate, size_t, size_t); +@@ -4637,31 +4642,52 @@ _int_free (mstate av, mchunkptr p, int have_lock) + if (!have_lock) + __libc_lock_lock (av->mutex); + +- nextchunk = chunk_at_offset(p, size); +- +- /* Lightweight tests: check whether the block is already the +- top block. */ +- if (__glibc_unlikely (p == av->top)) +- malloc_printerr ("double free or corruption (top)"); +- /* Or whether the next chunk is beyond the boundaries of the arena. */ +- if (__builtin_expect (contiguous (av) +- && (char *) nextchunk +- >= ((char *) av->top + chunksize(av->top)), 0)) +- malloc_printerr ("double free or corruption (out)"); +- /* Or whether the block is actually not marked used. */ +- if (__glibc_unlikely (!prev_inuse(nextchunk))) +- malloc_printerr ("double free or corruption (!prev)"); +- +- nextsize = chunksize(nextchunk); +- if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0) +- || __builtin_expect (nextsize >= av->system_mem, 0)) +- malloc_printerr ("free(): invalid next size (normal)"); ++ _int_free_merge_chunk (av, p, size); + +- free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ); ++ if (!have_lock) ++ __libc_lock_unlock (av->mutex); ++ } ++ /* ++ If the chunk was allocated via mmap, release via munmap(). ++ */ ++ ++ else { ++ munmap_chunk (p); ++ } ++} ++ ++/* Try to merge chunk P of SIZE bytes with its neighbors. Put the ++ resulting chunk on the appropriate bin list. P must not be on a ++ bin list yet, and it can be in use. */ ++static void ++_int_free_merge_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size) ++{ ++ mchunkptr nextchunk = chunk_at_offset(p, size); ++ ++ /* Lightweight tests: check whether the block is already the ++ top block. */ ++ if (__glibc_unlikely (p == av->top)) ++ malloc_printerr ("double free or corruption (top)"); ++ /* Or whether the next chunk is beyond the boundaries of the arena. */ ++ if (__builtin_expect (contiguous (av) ++ && (char *) nextchunk ++ >= ((char *) av->top + chunksize(av->top)), 0)) ++ malloc_printerr ("double free or corruption (out)"); ++ /* Or whether the block is actually not marked used. */ ++ if (__glibc_unlikely (!prev_inuse(nextchunk))) ++ malloc_printerr ("double free or corruption (!prev)"); ++ ++ INTERNAL_SIZE_T nextsize = chunksize(nextchunk); ++ if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0) ++ || __builtin_expect (nextsize >= av->system_mem, 0)) ++ malloc_printerr ("free(): invalid next size (normal)"); ++ ++ free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ); + +- /* consolidate backward */ +- if (!prev_inuse(p)) { +- prevsize = prev_size (p); ++ /* Consolidate backward. */ ++ if (!prev_inuse(p)) ++ { ++ INTERNAL_SIZE_T prevsize = prev_size (p); + size += prevsize; + p = chunk_at_offset(p, -((long) prevsize)); + if (__glibc_unlikely (chunksize(p) != prevsize)) +@@ -4669,9 +4695,25 @@ _int_free (mstate av, mchunkptr p, int have_lock) + unlink_chunk (av, p); + } + +- if (nextchunk != av->top) { ++ /* Write the chunk header, maybe after merging with the following chunk. */ ++ size = _int_free_create_chunk (av, p, size, nextchunk, nextsize); ++ _int_free_maybe_consolidate (av, size); ++} ++ ++/* Create a chunk at P of SIZE bytes, with SIZE potentially increased ++ to cover the immediately following chunk NEXTCHUNK of NEXTSIZE ++ bytes (if NEXTCHUNK is unused). The chunk at P is not actually ++ read and does not have to be initialized. After creation, it is ++ placed on the appropriate bin list. The function returns the size ++ of the new chunk. */ ++static INTERNAL_SIZE_T ++_int_free_create_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, ++ mchunkptr nextchunk, INTERNAL_SIZE_T nextsize) ++{ ++ if (nextchunk != av->top) ++ { + /* get and clear inuse bit */ +- nextinuse = inuse_bit_at_offset(nextchunk, nextsize); ++ bool nextinuse = inuse_bit_at_offset (nextchunk, nextsize); + + /* consolidate forward */ + if (!nextinuse) { +@@ -4686,8 +4728,8 @@ _int_free (mstate av, mchunkptr p, int have_lock) + been given one chance to be used in malloc. + */ + +- bck = unsorted_chunks(av); +- fwd = bck->fd; ++ mchunkptr bck = unsorted_chunks (av); ++ mchunkptr fwd = bck->fd; + if (__glibc_unlikely (fwd->bk != bck)) + malloc_printerr ("free(): corrupted unsorted chunks"); + p->fd = fwd; +@@ -4706,61 +4748,52 @@ _int_free (mstate av, mchunkptr p, int have_lock) + check_free_chunk(av, p); + } + +- /* +- If the chunk borders the current high end of memory, +- consolidate into top +- */ +- +- else { ++ else ++ { ++ /* If the chunk borders the current high end of memory, ++ consolidate into top. */ + size += nextsize; + set_head(p, size | PREV_INUSE); + av->top = p; + check_chunk(av, p); + } + +- /* +- If freeing a large space, consolidate possibly-surrounding +- chunks. Then, if the total unused topmost memory exceeds trim +- threshold, ask malloc_trim to reduce top. +- +- Unless max_fast is 0, we don't know if there are fastbins +- bordering top, so we cannot tell for sure whether threshold +- has been reached unless fastbins are consolidated. But we +- don't want to consolidate on each free. As a compromise, +- consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD +- is reached. +- */ ++ return size; ++} + +- if ((unsigned long)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) { ++/* If freeing a large space, consolidate possibly-surrounding ++ chunks. Then, if the total unused topmost memory exceeds trim ++ threshold, ask malloc_trim to reduce top. */ ++static void ++_int_free_maybe_consolidate (mstate av, INTERNAL_SIZE_T size) ++{ ++ /* Unless max_fast is 0, we don't know if there are fastbins ++ bordering top, so we cannot tell for sure whether threshold has ++ been reached unless fastbins are consolidated. But we don't want ++ to consolidate on each free. As a compromise, consolidation is ++ performed if FASTBIN_CONSOLIDATION_THRESHOLD is reached. */ ++ if (size >= FASTBIN_CONSOLIDATION_THRESHOLD) ++ { + if (atomic_load_relaxed (&av->have_fastchunks)) + malloc_consolidate(av); + +- if (av == &main_arena) { ++ if (av == &main_arena) ++ { + #ifndef MORECORE_CANNOT_TRIM +- if ((unsigned long)(chunksize(av->top)) >= +- (unsigned long)(mp_.trim_threshold)) +- systrim(mp_.top_pad, av); ++ if (chunksize (av->top) >= mp_.trim_threshold) ++ systrim (mp_.top_pad, av); + #endif +- } else { +- /* Always try heap_trim(), even if the top chunk is not +- large, because the corresponding heap might go away. */ +- heap_info *heap = heap_for_ptr(top(av)); ++ } ++ else ++ { ++ /* Always try heap_trim, even if the top chunk is not large, ++ because the corresponding heap might go away. */ ++ heap_info *heap = heap_for_ptr (top (av)); + +- assert(heap->ar_ptr == av); +- heap_trim(heap, mp_.top_pad); +- } ++ assert (heap->ar_ptr == av); ++ heap_trim (heap, mp_.top_pad); ++ } + } +- +- if (!have_lock) +- __libc_lock_unlock (av->mutex); +- } +- /* +- If the chunk was allocated via mmap, release via munmap(). +- */ +- +- else { +- munmap_chunk (p); +- } + } + + /* +@@ -5221,7 +5254,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) + (av != &main_arena ? NON_MAIN_ARENA : 0)); + set_inuse_bit_at_offset (newp, newsize); + set_head_size (p, leadsize | (av != &main_arena ? NON_MAIN_ARENA : 0)); +- _int_free (av, p, 1); ++ _int_free_merge_chunk (av, p, leadsize); + p = newp; + + assert (newsize >= nb && +@@ -5232,15 +5265,27 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) + if (!chunk_is_mmapped (p)) + { + size = chunksize (p); +- if ((unsigned long) (size) > (unsigned long) (nb + MINSIZE)) ++ mchunkptr nextchunk = chunk_at_offset(p, size); ++ INTERNAL_SIZE_T nextsize = chunksize(nextchunk); ++ if (size > nb) + { + remainder_size = size - nb; +- remainder = chunk_at_offset (p, nb); +- set_head (remainder, remainder_size | PREV_INUSE | +- (av != &main_arena ? NON_MAIN_ARENA : 0)); +- set_head_size (p, nb); +- _int_free (av, remainder, 1); +- } ++ if (remainder_size >= MINSIZE ++ || nextchunk == av->top ++ || !inuse_bit_at_offset (nextchunk, nextsize)) ++ { ++ /* We can only give back the tail if it is larger than ++ MINSIZE, or if the following chunk is unused (top ++ chunk or unused in-heap chunk). Otherwise we would ++ create a chunk that is smaller than MINSIZE. */ ++ remainder = chunk_at_offset (p, nb); ++ set_head_size (p, nb); ++ remainder_size = _int_free_create_chunk (av, remainder, ++ remainder_size, ++ nextchunk, nextsize); ++ _int_free_maybe_consolidate (av, remainder_size); ++ } ++ } + } + + check_inuse_chunk (av, p); diff --git a/srcpkgs/glibc/patches/30723-2-0dc7fc1cf094406a138e4d1bcf9553e59edcf89d.patch b/srcpkgs/glibc/patches/30723-2-0dc7fc1cf094406a138e4d1bcf9553e59edcf89d.patch new file mode 100644 index 0000000000000..4615c7b035cc7 --- /dev/null +++ b/srcpkgs/glibc/patches/30723-2-0dc7fc1cf094406a138e4d1bcf9553e59edcf89d.patch @@ -0,0 +1,252 @@ +From 0dc7fc1cf094406a138e4d1bcf9553e59edcf89d Mon Sep 17 00:00:00 2001 +From: Florian Weimer +Date: Thu, 10 Aug 2023 19:36:56 +0200 +Subject: [PATCH] malloc: Remove bin scanning from memalign (bug 30723) + +On the test workload (mpv --cache=yes with VP9 video decoding), the +bin scanning has a very poor success rate (less than 2%). The tcache +scanning has about 50% success rate, so keep that. + +Update comments in malloc/tst-memalign-2 to indicate the purpose +of the tests. Even with the scanning removed, the additional +merging opportunities since commit 542b1105852568c3ebc712225ae78b +("malloc: Enable merging of remainders in memalign (bug 30723)") +are sufficient to pass the existing large bins test. + +Remove leftover variables from _int_free from refactoring in the +same commit. + +Reviewed-by: DJ Delorie +--- + malloc/malloc.c | 169 ++-------------------------------------- + malloc/tst-memalign-2.c | 7 +- + 2 files changed, 10 insertions(+), 166 deletions(-) + +diff --git a/malloc/malloc.c b/malloc/malloc.c +index 948f9759af..d0bbbf3710 100644 +--- a/malloc/malloc.c ++++ b/malloc/malloc.c +@@ -4488,12 +4488,6 @@ _int_free (mstate av, mchunkptr p, int have_lock) + { + INTERNAL_SIZE_T size; /* its size */ + mfastbinptr *fb; /* associated fastbin */ +- mchunkptr nextchunk; /* next contiguous chunk */ +- INTERNAL_SIZE_T nextsize; /* its size */ +- int nextinuse; /* true if nextchunk is used */ +- INTERNAL_SIZE_T prevsize; /* size of previous contiguous chunk */ +- mchunkptr bck; /* misc temp for linking */ +- mchunkptr fwd; /* misc temp for linking */ + + size = chunksize (p); + +@@ -5032,42 +5026,6 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize, + ------------------------------ memalign ------------------------------ + */ + +-/* Returns 0 if the chunk is not and does not contain the requested +- aligned sub-chunk, else returns the amount of "waste" from +- trimming. NB is the *chunk* byte size, not the user byte +- size. */ +-static size_t +-chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t nb) +-{ +- void *m = chunk2mem (p); +- INTERNAL_SIZE_T size = chunksize (p); +- void *aligned_m = m; +- +- if (__glibc_unlikely (misaligned_chunk (p))) +- malloc_printerr ("_int_memalign(): unaligned chunk detected"); +- +- aligned_m = PTR_ALIGN_UP (m, alignment); +- +- INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m; +- +- /* We can't trim off the front as it's too small. */ +- if (front_extra > 0 && front_extra < MINSIZE) +- return 0; +- +- /* If it's a perfect fit, it's an exception to the return value rule +- (we would return zero waste, which looks like "not usable"), so +- handle it here by returning a small non-zero value instead. */ +- if (size == nb && front_extra == 0) +- return 1; +- +- /* If the block we need fits in the chunk, calculate total waste. */ +- if (size > nb + front_extra) +- return size - nb; +- +- /* Can't use this chunk. */ +- return 0; +-} +- + /* BYTES is user requested bytes, not requested chunksize bytes. */ + static void * + _int_memalign (mstate av, size_t alignment, size_t bytes) +@@ -5082,7 +5040,6 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) + mchunkptr remainder; /* spare room at end to split off */ + unsigned long remainder_size; /* its size */ + INTERNAL_SIZE_T size; +- mchunkptr victim; + + nb = checked_request2size (bytes); + if (nb == 0) +@@ -5101,129 +5058,13 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) + we don't find anything in those bins, the common malloc code will + scan starting at 2x. */ + +- /* This will be set if we found a candidate chunk. */ +- victim = NULL; +- +- /* Fast bins are singly-linked, hard to remove a chunk from the middle +- and unlikely to meet our alignment requirements. We have not done +- any experimentation with searching for aligned fastbins. */ +- +- if (av != NULL) +- { +- int first_bin_index; +- int first_largebin_index; +- int last_bin_index; +- +- if (in_smallbin_range (nb)) +- first_bin_index = smallbin_index (nb); +- else +- first_bin_index = largebin_index (nb); +- +- if (in_smallbin_range (nb * 2)) +- last_bin_index = smallbin_index (nb * 2); +- else +- last_bin_index = largebin_index (nb * 2); +- +- first_largebin_index = largebin_index (MIN_LARGE_SIZE); +- +- int victim_index; /* its bin index */ +- +- for (victim_index = first_bin_index; +- victim_index < last_bin_index; +- victim_index ++) +- { +- victim = NULL; +- +- if (victim_index < first_largebin_index) +- { +- /* Check small bins. Small bin chunks are doubly-linked despite +- being the same size. */ +- +- mchunkptr fwd; /* misc temp for linking */ +- mchunkptr bck; /* misc temp for linking */ +- +- bck = bin_at (av, victim_index); +- fwd = bck->fd; +- while (fwd != bck) +- { +- if (chunk_ok_for_memalign (fwd, alignment, nb) > 0) +- { +- victim = fwd; +- +- /* Unlink it */ +- victim->fd->bk = victim->bk; +- victim->bk->fd = victim->fd; +- break; +- } +- +- fwd = fwd->fd; +- } +- } +- else +- { +- /* Check large bins. */ +- mchunkptr fwd; /* misc temp for linking */ +- mchunkptr bck; /* misc temp for linking */ +- mchunkptr best = NULL; +- size_t best_size = 0; +- +- bck = bin_at (av, victim_index); +- fwd = bck->fd; ++ /* Call malloc with worst case padding to hit alignment. */ ++ m = (char *) (_int_malloc (av, nb + alignment + MINSIZE)); + +- while (fwd != bck) +- { +- int extra; +- +- if (chunksize (fwd) < nb) +- break; +- extra = chunk_ok_for_memalign (fwd, alignment, nb); +- if (extra > 0 +- && (extra <= best_size || best == NULL)) +- { +- best = fwd; +- best_size = extra; +- } ++ if (m == 0) ++ return 0; /* propagate failure */ + +- fwd = fwd->fd; +- } +- victim = best; +- +- if (victim != NULL) +- { +- unlink_chunk (av, victim); +- break; +- } +- } +- +- if (victim != NULL) +- break; +- } +- } +- +- /* Strategy: find a spot within that chunk that meets the alignment +- request, and then possibly free the leading and trailing space. +- This strategy is incredibly costly and can lead to external +- fragmentation if header and footer chunks are unused. */ +- +- if (victim != NULL) +- { +- p = victim; +- m = chunk2mem (p); +- set_inuse (p); +- if (av != &main_arena) +- set_non_main_arena (p); +- } +- else +- { +- /* Call malloc with worst case padding to hit alignment. */ +- +- m = (char *) (_int_malloc (av, nb + alignment + MINSIZE)); +- +- if (m == 0) +- return 0; /* propagate failure */ +- +- p = mem2chunk (m); +- } ++ p = mem2chunk (m); + + if ((((unsigned long) (m)) % alignment) != 0) /* misaligned */ + { +diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c +index f229283dbf..ecd6fa249e 100644 +--- a/malloc/tst-memalign-2.c ++++ b/malloc/tst-memalign-2.c +@@ -86,7 +86,8 @@ do_test (void) + TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2); + } + +- /* Test for non-head tcache hits. */ ++ /* Test for non-head tcache hits. This exercises the memalign ++ scanning code to find matching allocations. */ + for (i = 0; i < array_length (ptr); ++ i) + { + if (i == 4) +@@ -113,7 +114,9 @@ do_test (void) + free (p); + TEST_VERIFY (count > 0); + +- /* Large bins test. */ ++ /* Large bins test. This verifies that the over-allocated parts ++ that memalign releases for future allocations can be reused by ++ memalign itself at least in some cases. */ + + for (i = 0; i < LN; ++ i) + { diff --git a/srcpkgs/glibc/template b/srcpkgs/glibc/template index 20805fb52e816..cf7cd073a9500 100644 --- a/srcpkgs/glibc/template +++ b/srcpkgs/glibc/template @@ -1,7 +1,7 @@ # Template file for 'glibc' pkgname=glibc version=2.38 -revision=1 +revision=2 bootstrap=yes short_desc="GNU C library" maintainer="Enno Boland "