mailing list of musl libc
* [PATCH] optimize malloc0
@ 2017-06-26 21:43 Alexander Monakov
  2017-07-04 21:45 ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2017-06-26 21:43 UTC (permalink / raw)
  To: musl

The implementation of __malloc0 in malloc.c takes care to preserve zero
pages by overwriting only non-zero data. However, malloc must have
already modified auxiliary heap data just before and beyond the
allocated region, so we know that the edge pages need not be preserved.

For allocations smaller than one page, pass them immediately to memset.
Otherwise, use memset to handle the partial pages at the head and tail
of the allocation, and scan the complete pages in the interior. Optimize
the scanning loop by processing 16 bytes per iteration and handling the
rest of the page via memset as soon as a non-zero byte is found.
---
A follow-up to a recent IRC discussion. The code size cost on x86 is only
about 80 bytes (note e.g. how mal0_clear uses memset for two purposes
simultaneously: handling the partial page at the end, and clearing
interior non-zero pages).

On a Sandy Bridge CPU, the speed improvement for the potentially-zero-page
scanning loop is almost 2x on 64-bit and almost 3x on 32-bit.

Note that the existing implementation can over-clear by as much as
sizeof(size_t)-1 bytes beyond the end of the allocation (e.g. a 5-byte
allocation is cleared as one full 8-byte word on a 64-bit target); the
new implementation never does that. This may expose application bugs
that were hidden before.

Alexander

 src/malloc/malloc.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
index d5ee4280..720fa696 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/malloc.c
@@ -366,15 +366,28 @@ void *malloc(size_t n)
 	return CHUNK_TO_MEM(c);
 }
 
+static size_t mal0_clear(char *p, size_t pagesz, size_t n)
+{
+	typedef unsigned long long T;
+	char *pp = p + n;
+	size_t i = (uintptr_t)pp & (pagesz - 1);
+	for (;;) {
+		pp = memset(pp - i, 0, i);
+		if (pp - p < pagesz) return pp - p;
+		for (i = pagesz; i; i -= 2*sizeof(T), pp -= 2*sizeof(T))
+		        if (((T *)pp)[-1] | ((T *)pp)[-2])
+				break;
+	}
+}
+
 void *__malloc0(size_t n)
 {
 	void *p = malloc(n);
-	if (p && !IS_MMAPPED(MEM_TO_CHUNK(p))) {
-		size_t *z;
-		n = (n + sizeof *z - 1)/sizeof *z;
-		for (z=p; n; n--, z++) if (*z) *z=0;
-	}
-	return p;
+	if (!p || IS_MMAPPED(MEM_TO_CHUNK(p)))
+		return p;
+	if (n >= PAGE_SIZE)
+		n = mal0_clear(p, PAGE_SIZE, n);
+	return memset(p, 0, n);
 }
 
 void *realloc(void *p, size_t n)
-- 
2.11.0




* Re: [PATCH] optimize malloc0
  2017-06-26 21:43 [PATCH] optimize malloc0 Alexander Monakov
@ 2017-07-04 21:45 ` Rich Felker
  2017-07-04 23:09   ` Alexander Monakov
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2017-07-04 21:45 UTC (permalink / raw)
  To: musl

On Tue, Jun 27, 2017 at 12:43:39AM +0300, Alexander Monakov wrote:
> The implementation of __malloc0 in malloc.c takes care to preserve zero
> pages by overwriting only non-zero data. However, malloc must have
> already modified auxiliary heap data just before and beyond the
> allocated region, so we know that the edge pages need not be preserved.
> 
> For allocations smaller than one page, pass them immediately to memset.
> Otherwise, use memset to handle the partial pages at the head and tail
> of the allocation, and scan the complete pages in the interior. Optimize
> the scanning loop by processing 16 bytes per iteration and handling the
> rest of the page via memset as soon as a non-zero byte is found.
> ---
> A follow-up to a recent IRC discussion. The code size cost on x86 is only
> about 80 bytes (note e.g. how mal0_clear uses memset for two purposes
> simultaneously: handling the partial page at the end, and clearing
> interior non-zero pages).
> 
> On a Sandy Bridge CPU, the speed improvement for the potentially-zero-page
> scanning loop is almost 2x on 64-bit and almost 3x on 32-bit.
> 
> Note that the existing implementation can over-clear by as much as
> sizeof(size_t)-1 bytes beyond the end of the allocation (e.g. a 5-byte
> allocation is cleared as one full 8-byte word on a 64-bit target); the
> new implementation never does that. This may expose application bugs
> that were hidden before.
> 
> Alexander
> 
>  src/malloc/malloc.c | 25 +++++++++++++++++++------
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
> index d5ee4280..720fa696 100644
> --- a/src/malloc/malloc.c
> +++ b/src/malloc/malloc.c
> @@ -366,15 +366,28 @@ void *malloc(size_t n)
>  	return CHUNK_TO_MEM(c);
>  }
>  
> +static size_t mal0_clear(char *p, size_t pagesz, size_t n)
> +{
> +	typedef unsigned long long T;
> +	char *pp = p + n;
> +	size_t i = (uintptr_t)pp & (pagesz - 1);
> +	for (;;) {
> +		pp = memset(pp - i, 0, i);
> +		if (pp - p < pagesz) return pp - p;
> +		for (i = pagesz; i; i -= 2*sizeof(T), pp -= 2*sizeof(T))
> +		        if (((T *)pp)[-1] | ((T *)pp)[-2])
> +				break;
> +	}
> +}
> +
>  void *__malloc0(size_t n)
>  {
>  	void *p = malloc(n);
> -	if (p && !IS_MMAPPED(MEM_TO_CHUNK(p))) {
> -		size_t *z;
> -		n = (n + sizeof *z - 1)/sizeof *z;
> -		for (z=p; n; n--, z++) if (*z) *z=0;
> -	}
> -	return p;
> +	if (!p || IS_MMAPPED(MEM_TO_CHUNK(p)))
> +		return p;
> +	if (n >= PAGE_SIZE)
> +		n = mal0_clear(p, PAGE_SIZE, n);
> +	return memset(p, 0, n);
>  }
>  
>  void *realloc(void *p, size_t n)
> -- 
> 2.11.0

Overall I like this. Reviewing what was discussed on IRC, I called the
loop logic clever and nsz said maybe a bit too clever. On further
reading I think he's right. One additional concern was that the
reverse-scanning may be bad for performance. I suggested it might work
just as well to restructure the loop as "for each word, if nonzero,
memset to end of page and advance to that point". A cheap way to avoid
the scanning logic for the first and last partial page, while not
complicating the loop logic, would be just writing a nonzero value to
the first byte of each before the loop.

Rich



* Re: [PATCH] optimize malloc0
  2017-07-04 21:45 ` Rich Felker
@ 2017-07-04 23:09   ` Alexander Monakov
  2017-07-04 23:39     ` Rich Felker
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2017-07-04 23:09 UTC (permalink / raw)
  To: musl

On Tue, 4 Jul 2017, Rich Felker wrote:
> Overall I like this. Reviewing what was discussed on IRC, I called the
> loop logic clever and nsz said maybe a bit too clever. On further
> reading I think he's right.

Somehow raising this point in the context of the rest of src/malloc seems
even worse than common bikeshedding.

> One additional concern was that the reverse-scanning may be bad for
> performance.

Or it might be good for performance, because:

a) the caller is likely to use the lower addresses, in which case the
   reverse scan is more likely to leave relevant lines in L1$

b) switching directions corresponds to switching access patterns:
   reverse for reading, forward (in memset) for writing, and that
   may help hardware more than it hurts

c) at least on Intel CPUs the hardware prefetcher doesn't cross 4K
   boundaries anyway, so discontiguous access on memset->scan transitions
   shouldn't matter there

d) in practice the most frequent calls are probably less-than-pagesize,
   and the patch handles those in the most efficient way

> A cheap way to avoid the scanning logic for the first and last partial
> page, while not complicating the loop logic, would be just writing a
> nonzero value to the first byte of each before the loop.

Nonsense.

This patch handles the common case (less than 4K) in the most efficient
way, strikes a good size/speed tradeoff for the rest, and gives
mal0_clear an interface that allows it to be moved to a separate
translation unit (to assist non-'--gc-sections' static linking, if
desired) with minimal penalty.

I can rewrite it to scan fully forward without much trouble, but I
think it wouldn't be for the better.

Alexander



* Re: [PATCH] optimize malloc0
  2017-07-04 23:09   ` Alexander Monakov
@ 2017-07-04 23:39     ` Rich Felker
  2017-07-05  8:49       ` Szabolcs Nagy
  2017-07-05 13:28       ` [PATCH] " Alexander Monakov
  0 siblings, 2 replies; 9+ messages in thread
From: Rich Felker @ 2017-07-04 23:39 UTC (permalink / raw)
  To: musl

On Wed, Jul 05, 2017 at 02:09:21AM +0300, Alexander Monakov wrote:
> On Tue, 4 Jul 2017, Rich Felker wrote:
> > Overall I like this. Reviewing what was discussed on IRC, I called the
> > loop logic clever and nsz said maybe a bit too clever. On further
> > reading I think he's right.
> 
> Somehow raising this point in the context of the rest of src/malloc seems
> even worse than common bikeshedding.

I don't think so -- I actually had to re-read the code a few times
before I understood what it was doing. Yes, maybe there's some
confusing code in malloc.c, and I'd like to avoid repeating that when
it's rewritten, but I think concern about making it worse is valid.

> > One additional concern was that the reverse-scanning may be bad for
> > performance.
> 
> Or it might be good for performance, because:
> 
> a) the caller is likely to use the lower addresses, in which case the
>    reverse scan is more likely to leave relevant lines in L1$
> 
> b) switching directions corresponds to switching access patterns:
>    reverse for reading, forward (in memset) for writing, and that
>    may help hardware more than it hurts
> 
> c) at least on Intel CPUs the hardware prefetcher doesn't cross 4K
>    boundaries anyway, so discontiguous access on memset->scan transitions
>    shouldn't matter there
> 
> d) in practice the most frequent calls are probably less-than-pagesize,
>    and the patch handles those in the most efficient way

OK, these make sense.

> > A cheap way to avoid the scanning logic for the first and last partial
> > page, while not complicating the loop logic, would be just writing a
> > nonzero value to the first byte of each before the loop.
> 
> Nonsense.

I'm not sure what part is "nonsense". I was saying that, if we want to
do a simple, idiomatic forward loop like I described, the need for
special-casing the first and last partial pages could be avoided by
preloading nonzero data in 2 specific places, so that the same logic
that switches to memset for the interior pages would also work for the
boundary ones. However, I think you have good reasons to claim your
current approach is better.

> This patch handles the common case (less than 4K) in the most efficient
> way, strikes a good size/speed tradeoff for the rest, and gives
> mal0_clear an interface that allows it to be moved to a separate
> translation unit (to assist non-'--gc-sections' static linking, if
> desired) with minimal penalty.

That sounds nice, but do you have a proposal for how it would work?
Dummy weak mal0_clear in malloc.c with the working definition in
calloc.c? Just putting it in a separate TU wouldn't do anything to
help since malloc.c would still have a reference to it.
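
E.g. (rough sketch, untested; __mal0_clear is a made-up name, and the
scanning body from your patch would move to calloc.c as the strong
definition):

/* malloc.c: weak dummy, using the weak_alias macro from libc.h.
 * Returning n makes __malloc0 fall back to memset'ing the whole
 * allocation when the real definition isn't linked. */
static size_t dummy_mal0_clear(char *p, size_t pagesz, size_t n)
{
	return n;
}
weak_alias(dummy_mal0_clear, __mal0_clear);

Then calloc.c carries the strong __mal0_clear with the actual scanning
loop, so the extra code is pulled in only when calloc itself is.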

> I can rewrite it to scan fully forward without much trouble, but I
> think it wouldn't be for the better.

*nod* I'll re-check the version you submitted just to be sure I
understand and agree that it's doing what I think it is, with the plan
of going with it or something very similar. I might add some comments
as a follow-up patch.

Rich



* Re: [PATCH] optimize malloc0
  2017-07-04 23:39     ` Rich Felker
@ 2017-07-05  8:49       ` Szabolcs Nagy
  2017-07-05 12:45         ` Rich Felker
  2017-07-05 13:28       ` [PATCH] " Alexander Monakov
  1 sibling, 1 reply; 9+ messages in thread
From: Szabolcs Nagy @ 2017-07-05  8:49 UTC (permalink / raw)
  To: musl

* Rich Felker <dalias@libc.org> [2017-07-04 19:39:10 -0400]:
> On Wed, Jul 05, 2017 at 02:09:21AM +0300, Alexander Monakov wrote:
> > I can rewrite it to scan fully forward without much trouble, but I
> > think it wouldn't be for the better.
> 
> *nod* I'll re-check the version you submitted just to be sure I
> understand and agree that it's doing what I think it is, with the plan
> of going with it or something very similar. I might add some comments
> as a follow-up patch.

I think may_alias should be added and uint64_t used
instead of unsigned long long (which may have a size
that does not divide the page size)
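
E.g. (this is the shape I mean; the fallback typedef for compilers
without the attribute is just a sketch, and assumes stdint.h):

#ifdef __GNUC__
/* may_alias permits reading the char buffer as 64-bit words without
 * violating strict aliasing */
typedef uint64_t __attribute__((__may_alias__)) T;
#else
typedef unsigned char T;
#endif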



* Re: [PATCH] optimize malloc0
  2017-07-05  8:49       ` Szabolcs Nagy
@ 2017-07-05 12:45         ` Rich Felker
  2017-12-16 11:27           ` [PATCH v2] " Alexander Monakov
  0 siblings, 1 reply; 9+ messages in thread
From: Rich Felker @ 2017-07-05 12:45 UTC (permalink / raw)
  To: musl

On Wed, Jul 05, 2017 at 10:49:10AM +0200, Szabolcs Nagy wrote:
> * Rich Felker <dalias@libc.org> [2017-07-04 19:39:10 -0400]:
> > On Wed, Jul 05, 2017 at 02:09:21AM +0300, Alexander Monakov wrote:
> > > I can rewrite it to scan fully forward without much trouble, but I
> > > think it wouldn't be for the better.
> > 
> > *nod* I'll re-check the version you submitted just to be sure I
> > understand and agree that it's doing what I think it is, with the plan
> > of going with it or something very similar. I might add some comments
> > as a follow-up patch.
> 
> I think may_alias should be added and uint64_t used
> instead of unsigned long long (which may have a size
> that does not divide the page size)

I agree about may_alias (forgot to mention that; I believe we discussed
it on IRC before). I don't think ull is a practical issue (there are
probably other places we assume all integer types have power-of-two
sizes and this seems like a totally reasonable assumption) but I don't
mind changing it to uint64_t for obviousness/generality. Hidden
assumptions like this tend to bite future readers.

Rich



* Re: [PATCH] optimize malloc0
  2017-07-04 23:39     ` Rich Felker
  2017-07-05  8:49       ` Szabolcs Nagy
@ 2017-07-05 13:28       ` Alexander Monakov
  2017-07-05 16:13         ` Rich Felker
  1 sibling, 1 reply; 9+ messages in thread
From: Alexander Monakov @ 2017-07-05 13:28 UTC (permalink / raw)
  To: musl

On Tue, 4 Jul 2017, Rich Felker wrote:
> > > Overall I like this. Reviewing what was discussed on IRC, I called the
> > > loop logic clever and nsz said maybe a bit too clever. On further
> > > reading I think he's right.
> > 
> > Somehow raising this point in the context of the rest of src/malloc seems
> > even worse than common bikeshedding.
> 
> I don't think so -- I actually had to re-read the code a few times
> before I understood what it was doing. Yes, maybe there's some
> confusing code in malloc.c, and I'd like to avoid repeating that when
> it's rewritten, but I think concern about making it worse is valid.

My main gripe is with the way this feedback was offered. It shouldn't be
OK to say "nah, too clever" and just leave it at that; at least make an
effort to elaborate or suggest improvements. How should contributors
find a balance between "acceptably non-clever" code and code that lives
up to general expectations of efficiency and conciseness in musl?

That we seem to disagree on this code being simple enough is secondary.

> I was saying that, if we want to do a simple, idiomatic forward loop
> like I described, the need for special-casing the first and last
> partial pages could be avoided by preloading nonzero data in 2
> specific places, so that the same logic that switches to memset for
> the interior pages would also work for the boundary ones.

This doesn't address the need to treat loop/memset boundaries separately
for boundary pages. In fact, what you wrote in the previous email, if
interpreted literally, would clear the whole page at the end of the region.

> That sounds nice, but do you have a proposal for how it would work?
> Dummy weak mal0_clear in malloc.c with the working definition in
> calloc.c? Just putting it in a separate TU wouldn't do anything to
> help since malloc.c would still have a reference to it.

Hm, yes, probably something like that, perhaps also adding a weak
definition of calloc in lite_malloc.c. I haven't really looked into it.

Alexander



* Re: [PATCH] optimize malloc0
  2017-07-05 13:28       ` [PATCH] " Alexander Monakov
@ 2017-07-05 16:13         ` Rich Felker
  0 siblings, 0 replies; 9+ messages in thread
From: Rich Felker @ 2017-07-05 16:13 UTC (permalink / raw)
  To: musl

On Wed, Jul 05, 2017 at 04:28:28PM +0300, Alexander Monakov wrote:
> On Tue, 4 Jul 2017, Rich Felker wrote:
> > > > Overall I like this. Reviewing what was discussed on IRC, I called the
> > > > loop logic clever and nsz said maybe a bit too clever. On further
> > > > reading I think he's right.
> > > 
> > > Somehow raising this point in the context of the rest of src/malloc seems
> > > even worse than common bikeshedding.
> > 
> > I don't think so -- I actually had to re-read the code a few times
> > before I understood what it was doing. Yes, maybe there's some
> > confusing code in malloc.c, and I'd like to avoid repeating that when
> > it's rewritten, but I think concern about making it worse is valid.
> 
> My main gripe is with the way this feedback was offered. It shouldn't be
> OK to say "nah, too clever" and just leave it at that; at least make an
> effort to elaborate or suggest improvements. How should contributors
> find a balance between "acceptably non-clever" code and code that lives
> up to general expectations of efficiency and conciseness in musl?
> 
> That we seem to disagree on this code being simple enough is secondary.

OK. My aim was to discover whether the cleverness was justified, and
in the end I think it mostly was. Until I understood that, I don't
think I had a lot of constructive feedback on how it could be less
"overly clever". Maybe a brief high-level description of what the code
is doing and why, either in the email or a comment, would have helped.

> > I was saying that, if we want to do a simple, idiomatic forward loop
> > like I described, the need for special-casing the first and last
> > partial pages could be avoided by preloading nonzero data in 2
> > specific places, so that the same logic that switches to memset for
> > the interior pages would also work for the boundary ones.
> 
> This doesn't address the need to treat loop/memset boundaries separately
> for boundary pages. In fact, what you wrote in the previous email, if
> interpreted literally, would clear the whole page at the end of the region.

Sorry, I really should have just written what I meant as code. My
intent was to have the MIN(diff_to_end_of_page, diff_to_end_of_chunk)
logic in the conditional for "found a nonzero word".
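
Roughly like this (untested sketch; mal0_clear_fwd is a strawman name,
meant as a drop-in for mal0_clear in your patch, and it still needs
nsz's may_alias fix, omitted here):

static size_t mal0_clear_fwd(char *p, size_t pagesz, size_t n)
{
	typedef unsigned long long T;
	size_t i = 0;
	/* dirty one byte in the first and last pages so the
	 * nonzero->memset path below covers the partial head/tail
	 * like any other data; __malloc0 only calls this for
	 * n >= pagesz, and the edge pages are fair game anyway */
	p[0] = 1;
	p[n-1] = 1;
	while (i < n) {
		if (*(T *)(p + i)) {
			/* nonzero word: clear to the end of the page,
			 * but no further than the end of the chunk */
			size_t left = pagesz - ((uintptr_t)(p + i) & (pagesz - 1));
			if (left > n - i) left = n - i;
			memset(p + i, 0, left);
			i += left;
		} else {
			i += sizeof(T);
		}
	}
	/* everything is clear; the caller's final memset is a no-op */
	return 0;
}

The last word read can extend up to sizeof(T)-1 bytes past n, but it
stays within one aligned word and hence within the same (mapped) heap
page, so inside malloc that should be harmless.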

> > That sounds nice, but do you have a proposal for how it would work?
> > Dummy weak mal0_clear in malloc.c with the working definition in
> > calloc.c? Just putting it in a separate TU wouldn't do anything to
> > help since malloc.c would still have a reference to it.
> 
> Hm, yes, probably something like that, perhaps also adding a weak
> definition of calloc in lite_malloc.c. I haven't really looked into it.

That's okay, we can follow up on this later if anyone cares about the
size. I don't think it's a big deal.

Rich



* [PATCH v2] optimize malloc0
  2017-07-05 12:45         ` Rich Felker
@ 2017-12-16 11:27           ` Alexander Monakov
  0 siblings, 0 replies; 9+ messages in thread
From: Alexander Monakov @ 2017-12-16 11:27 UTC (permalink / raw)
  To: musl

The implementation of __malloc0 in malloc.c takes care to preserve zero
pages by overwriting only non-zero data. However, malloc must have
already modified auxiliary heap data just before and beyond the
allocated region, so we know that the edge pages need not be preserved.

For allocations smaller than one page, pass them immediately to memset.
Otherwise, use memset to handle the partial pages at the head and tail
of the allocation, and scan the complete pages in the interior. Optimize
the scanning loop by processing 16 bytes per iteration and handling the
rest of the page via memset as soon as a non-zero byte is found.
---

v2: use uint64_t with may_alias attribute

 src/malloc/malloc.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
index 9e05e1d6..0a7d5d85 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/malloc.c
@@ -366,15 +366,32 @@ void *malloc(size_t n)
 	return CHUNK_TO_MEM(c);
 }
 
+static size_t mal0_clear(char *p, size_t pagesz, size_t n)
+{
+#ifdef __GNUC__
+	typedef uint64_t __attribute__((__may_alias__)) T;
+#else
+	typedef unsigned char T;
+#endif
+	char *pp = p + n;
+	size_t i = (uintptr_t)pp & (pagesz - 1);
+	for (;;) {
+		pp = memset(pp - i, 0, i);
+		if (pp - p < pagesz) return pp - p;
+		for (i = pagesz; i; i -= 2*sizeof(T), pp -= 2*sizeof(T))
+		        if (((T *)pp)[-1] | ((T *)pp)[-2])
+				break;
+	}
+}
+
 void *__malloc0(size_t n)
 {
 	void *p = malloc(n);
-	if (p && !IS_MMAPPED(MEM_TO_CHUNK(p))) {
-		size_t *z;
-		n = (n + sizeof *z - 1)/sizeof *z;
-		for (z=p; n; n--, z++) if (*z) *z=0;
-	}
-	return p;
+	if (!p || IS_MMAPPED(MEM_TO_CHUNK(p)))
+		return p;
+	if (n >= PAGE_SIZE)
+		n = mal0_clear(p, PAGE_SIZE, n);
+	return memset(p, 0, n);
 }
 
 void *realloc(void *p, size_t n)
-- 
2.11.0




Thread overview: 9+ messages
2017-06-26 21:43 [PATCH] optimize malloc0 Alexander Monakov
2017-07-04 21:45 ` Rich Felker
2017-07-04 23:09   ` Alexander Monakov
2017-07-04 23:39     ` Rich Felker
2017-07-05  8:49       ` Szabolcs Nagy
2017-07-05 12:45         ` Rich Felker
2017-12-16 11:27           ` [PATCH v2] " Alexander Monakov
2017-07-05 13:28       ` [PATCH] " Alexander Monakov
2017-07-05 16:13         ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/
