* Re: [PATCH] x86_64/memset: simple optimizations
  [not found] ` <20150207130655.GW23507@brightrain.aerifal.cx>
@ 2015-02-10 20:27   ` Denys Vlasenko
  2015-02-10 20:43     ` Rich Felker
  0 siblings, 1 reply; 4+ messages in thread
From: Denys Vlasenko @ 2015-02-10 20:27 UTC (permalink / raw)
To: Rich Felker, musl

[-- Attachment #1: Type: text/plain, Size: 1724 bytes --]

On Sat, Feb 7, 2015 at 2:06 PM, Rich Felker <dalias@aerifal.cx> wrote:
> On Sat, Feb 07, 2015 at 01:49:43PM +0100, Denys Vlasenko wrote:
>> On Sat, Feb 7, 2015 at 1:35 AM, Rich Felker <dalias@aerifal.cx> wrote:
>> What speedups?
>> In particular:
>> - perform pre-alignment if dst is unaligned
>
> For the rep stosq path? Does it help? I don't recall the details but I
> seem to remember both docs and measurements showing no reliable
> benefit from alignment for this instruction, and we had people trying
> things on several different cpu models. I'm open to hearing evidence
> to the contrary though.

size:20k buf:0x7f38656e2100
stos:25978 ns (times 32), 25.227500 bytes/ns
stos+1:31395 ns (times 32), 20.874662 bytes/ns
stos+4:31396 ns (times 32), 20.873997 bytes/ns
stos+8:24446 ns (times 32), 26.808476 bytes/ns

size:50k buf:0x7fbca1dc9100
stos:68149 ns (times 32), 24.041439 bytes/ns
stos+1:85762 ns (times 32), 19.104032 bytes/ns
stos+4:85762 ns (times 32), 19.104032 bytes/ns
stos+8:68204 ns (times 32), 24.022051 bytes/ns

size:1024k buf:0x7fa3036a5100
stos:1632285 ns (times 32), 20.556724 bytes/ns
stos+1:1891092 ns (times 32), 17.743416 bytes/ns
stos+4:1891089 ns (times 32), 17.743444 bytes/ns
stos+8:1632181 ns (times 32), 20.558034 bytes/ns

size:5000k buf:0x7fdf5cd6b100
stos:15592138 ns (times 32), 10.558298 bytes/ns
stos+1:15501841 ns (times 32), 10.619799 bytes/ns
stos+4:15507773 ns (times 32), 10.615737 bytes/ns
stos+8:15589617 ns (times 32), 10.560005 bytes/ns

The source is attached.

This data shows that (on my CPU, Sandy Bridge with 4MB L2) 8-byte
alignment helps when stores fit into L1 or L2.
If the memset is larger than L2, memory throughput is too low and
there is no measurable difference.

[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 3216 bytes --]

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

/* Old glibc (< 2.3.4) does not provide this constant. We use syscall
 * directly so this definition is safe. */
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1
#endif

/* libc has incredibly messy way of doing this,
 * typically requiring -lrt. We just skip all this mess */
static void get_mono(struct timespec *ts)
{
	syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
}

/* Fill cnt 8-byte words at ptr with zeros using "rep stosq" */
void memset_rep_stosq(void *ptr, unsigned long cnt)
{
	unsigned long ax, cx, di;
	asm volatile(
		"rep stosq"
		: "=D" (di), "=c" (cx), "=a" (ax)
		: "0" (ptr), "1" (cnt), "2" (0)
		: "memory"
	);
}

/* Same, with non-temporal (cache-bypassing) stores */
void memset_movnti(void *ptr, unsigned long cnt)
{
	unsigned long ax, cx, di;
	asm volatile(
		"1:	movnti %%rax,(%%rdi)\n"
		"	add $8,%%rdi\n"
		"	dec %%rcx\n"
		"	jnz 1b\n"
		"	sfence\n"
		: "=D" (di), "=c" (cx), "=a" (ax)
		: "0" (ptr), "1" (cnt), "2" (0)
		: "memory"
	);
}

/* Non-temporal stores, unrolled 4x (cnt must be a multiple of 4) */
void memset_movnti_unroll(void *ptr, unsigned long cnt)
{
	unsigned long ax, cx, di;
	asm volatile(
		"1:\n"
		"	movnti %%rax,(%%rdi)\n"
		"	movnti %%rax,8(%%rdi)\n"
		"	movnti %%rax,16(%%rdi)\n"
		"	movnti %%rax,24(%%rdi)\n"
		"	add $32,%%rdi\n"
		"	dec %%rcx\n"
		"	jnz 1b\n"
		"	sfence\n"
		: "=D" (di), "=c" (cx), "=a" (ax)
		: "0" (ptr), "1" (cnt/4), "2" (0)
		: "memory"
	);
}

unsigned gett(void)
{
#if 0
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return tv.tv_usec;
#else
	struct timespec ts;
	get_mono(&ts);
	return ts.tv_nsec;
#endif
}

/* Difference of two tv_nsec readings, handling wraparound at 1 second */
unsigned difft(unsigned t2, unsigned t1)
{
	t2 -= t1;
	if ((int)t2 < 0)
		t2 += 1000000000;
	return t2;
}

#define BUF (50*1024)
#define BUF8 (BUF/8)

void measure(void *buf, void (*m)(void *ptr, unsigned long cnt), const char *name)
{
	unsigned t1, t2, cnt;

	sleep(1);
	m(buf, BUF8); /* warm up */
	t2 = -1U;
	cnt = 1000;
	while (--cnt) {
		t1 = gett();
#define REPEAT 32
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);m(buf, BUF8);
		t1 = difft(gett(), t1);
		if (t2 > t1) /* keep the best (minimum) time */
			t2 = t1;
		//printf("%s:%u ns %u\n", name, t1, t2);
	}
	printf("%s:%u ns (times %d), %.6f bytes/ns\n",
		name, t2, REPEAT, (double)(BUF) * REPEAT / t2);
}

int main()
{
	char *buf = malloc(8*BUF + 4096);

	/* round buf up to a 256-byte boundary within the allocation */
	buf += 0x100;
	buf = (char*)((long)buf & ~0xffL);
	printf("size:%uk buf:%p\n", BUF/1024, buf);

//	measure(buf, memset_movnti, "movnti");
//	measure(buf, memset_movnti_unroll, "movnti_unroll");
	measure(buf, memset_rep_stosq, "stos");
//	measure(buf+1, memset_movnti, "movnti+1");
//	measure(buf+1, memset_movnti_unroll, "movnti_unroll+1");
	measure(buf+1, memset_rep_stosq, "stos+1");
//	measure(buf+3, memset_movnti, "movnti+3");
//	measure(buf+3, memset_movnti_unroll, "movnti_unroll+3");
	measure(buf+4, memset_rep_stosq, "stos+4");
	measure(buf+8, memset_rep_stosq, "stos+8");
	return 0;
}

^ permalink raw reply	[flat|nested] 4+ messages in thread
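[As an aside on the timing helper in t.c above: on any reasonably current glibc, calling the clock_gettime function directly, rather than going through syscall(), is normally serviced by the vDSO and so avoids kernel entry; -lrt is only needed on old glibc (< 2.17). A minimal sketch of that variant (not part of the original test program):]

```c
#define _POSIX_C_SOURCE 199309L
#include <time.h>

/* Monotonic timestamp in nanoseconds via the libc wrapper.
 * On Linux/glibc this normally resolves through the vDSO,
 * so no actual syscall is made per call. */
unsigned long long gett_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ull
	     + (unsigned long long)ts.tv_nsec;
}
```

[Unlike the tv_nsec-only gett() in t.c, this returns a full 64-bit nanosecond count, so no wraparound fixup like difft() is needed.]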
* Re: Re: [PATCH] x86_64/memset: simple optimizations
  2015-02-10 20:27 ` [PATCH] x86_64/memset: simple optimizations Denys Vlasenko
@ 2015-02-10 20:43   ` Rich Felker
  2015-02-10 20:52     ` Denys Vlasenko
  0 siblings, 1 reply; 4+ messages in thread
From: Rich Felker @ 2015-02-10 20:43 UTC (permalink / raw)
To: musl

On Tue, Feb 10, 2015 at 09:27:17PM +0100, Denys Vlasenko wrote:
> On Sat, Feb 7, 2015 at 2:06 PM, Rich Felker <dalias@aerifal.cx> wrote:
> > On Sat, Feb 07, 2015 at 01:49:43PM +0100, Denys Vlasenko wrote:
> >> On Sat, Feb 7, 2015 at 1:35 AM, Rich Felker <dalias@aerifal.cx> wrote:
> >> What speedups?
> >> In particular:
> >> - perform pre-alignment if dst is unaligned
> >
> > For the rep stosq path? Does it help? I don't recall the details but I
> > seem to remember both docs and measurements showing no reliable
> > benefit from alignment for this instruction, and we had people trying
> > things on several different cpu models. I'm open to hearing evidence
> > to the contrary though.
>
> size:20k buf:0x7f38656e2100
> stos:25978 ns (times 32), 25.227500 bytes/ns
> stos+1:31395 ns (times 32), 20.874662 bytes/ns
> stos+4:31396 ns (times 32), 20.873997 bytes/ns
> stos+8:24446 ns (times 32), 26.808476 bytes/ns
>
> size:50k buf:0x7fbca1dc9100
> stos:68149 ns (times 32), 24.041439 bytes/ns
> stos+1:85762 ns (times 32), 19.104032 bytes/ns
> stos+4:85762 ns (times 32), 19.104032 bytes/ns
> stos+8:68204 ns (times 32), 24.022051 bytes/ns
>
> size:1024k buf:0x7fa3036a5100
> stos:1632285 ns (times 32), 20.556724 bytes/ns
> stos+1:1891092 ns (times 32), 17.743416 bytes/ns
> stos+4:1891089 ns (times 32), 17.743444 bytes/ns
> stos+8:1632181 ns (times 32), 20.558034 bytes/ns
>
> size:5000k buf:0x7fdf5cd6b100
> stos:15592138 ns (times 32), 10.558298 bytes/ns
> stos+1:15501841 ns (times 32), 10.619799 bytes/ns
> stos+4:15507773 ns (times 32), 10.615737 bytes/ns
> stos+8:15589617 ns (times 32), 10.560005 bytes/ns
>
> The source is attached.

OK. This looks sufficiently significant (despite unaligned memsets
being rare) that it would be nice to optimize it. Could we just write
an initial possibly-misaligned word, then increment the start address
and round it up before using rep stos?

> #define _GNU_SOURCE
> #include <sys/types.h>
> #include <sys/time.h>
> #include <sys/syscall.h>
> #include <time.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <string.h>
>
> /* Old glibc (< 2.3.4) does not provide this constant. We use syscall
>  * directly so this definition is safe. */
> #ifndef CLOCK_MONOTONIC
> #define CLOCK_MONOTONIC 1
> #endif
>
> /* libc has incredibly messy way of doing this,
>  * typically requiring -lrt. We just skip all this mess */
> static void get_mono(struct timespec *ts)
> {
>         syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
> }

FWIW, this is a bad idea; you get syscall overhead in your
measurements. If you just use clock_gettime (the function) you'll get
vdso results (no syscall). Using the syscall directly is also sketchy
in that x32 has an incorrect kernel-side definition for struct
timespec, but I think it will only matter if aarch64-ILP32 copies this
problem from x32 and you're using a big-endian system.

Rich

^ permalink raw reply	[flat|nested] 4+ messages in thread
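[The trick Rich describes — one unconditional, possibly misaligned word store, then rounding the pointer up so the bulk loop runs aligned — can be sketched in portable C. This is a hypothetical illustration only: memset64_aligned_head is not the actual musl code, and plain memset stands in for the "rep stosq" bulk loop.]

```c
#include <stdint.h>
#include <string.h>

/* Sketch of the proposed head-alignment trick: store one full
 * (possibly misaligned) 8-byte word at dst, then round dst up to the
 * next 8-byte boundary so the bulk loop runs aligned. Because the
 * head store overlaps the bulk region, no per-byte fixup loop or
 * branch on the misalignment amount is needed. */
static void *memset64_aligned_head(void *dst, int c, size_t n)
{
	unsigned char *d = dst;
	/* replicate the fill byte into all 8 lanes of a word */
	uint64_t fill = 0x0101010101010101ULL * (unsigned char)c;

	if (n >= 8) {
		memcpy(d, &fill, 8);              /* possibly misaligned head store */
		size_t skip = -(uintptr_t)d & 7;  /* 0..7 bytes up to alignment */
		d += skip;
		n -= skip;
		/* bulk fill from the aligned pointer; in the real thing
		 * this would be the "rep stosq" path plus tail handling */
		memset(d, c, n);
	} else {
		while (n--)                       /* tiny sizes: byte loop */
			*d++ = (unsigned char)c;
	}
	return dst;
}
```

[For an already-aligned dst the head store is redundant but harmless, which is what keeps the prologue branch-free.]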
* Re: Re: [PATCH] x86_64/memset: simple optimizations
  2015-02-10 20:43 ` Rich Felker
@ 2015-02-10 20:52   ` Denys Vlasenko
  2015-02-10 20:54     ` Rich Felker
  0 siblings, 1 reply; 4+ messages in thread
From: Denys Vlasenko @ 2015-02-10 20:52 UTC (permalink / raw)
To: musl

On Tue, Feb 10, 2015 at 9:43 PM, Rich Felker <dalias@aerifal.cx> wrote:
> On Tue, Feb 10, 2015 at 09:27:17PM +0100, Denys Vlasenko wrote:
>> On Sat, Feb 7, 2015 at 2:06 PM, Rich Felker <dalias@aerifal.cx> wrote:
>> /* libc has incredibly messy way of doing this,
>>  * typically requiring -lrt. We just skip all this mess */
>> static void get_mono(struct timespec *ts)
>> {
>>         syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
>> }
>
> FWIW, this is a bad idea; you get syscall overhead in your
> measurements. If you just use clock_gettime (the function) you'll get
> vdso results (no syscall).

I repeat the memset 32 times between reading timestamps. Thus, even
with the "small" 20kb memset test there are 640kb of writes to L1.
This is big enough to make the overhead insignificant.

^ permalink raw reply	[flat|nested] 4+ messages in thread
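[A rough sanity check of that argument; the ~100 ns syscall-path cost is an assumed round figure, not something measured in this thread:]

```python
# Two clock reads bracket each batch of 32 memsets, so the timer cost
# appears twice per measured interval.
SYSCALL_NS = 100           # assumed cost of one syscall-path clock_gettime
measured_ns = 25978        # best 20k "stos" interval from the results above

overhead_fraction = 2 * SYSCALL_NS / measured_ns
print(f"timer overhead: {overhead_fraction:.2%}")   # well under 1%
```

[Even with the smallest buffer and a pessimistic syscall cost, the timer contributes less than a percent of the measured interval, so the stos vs. stos+1 gaps above (around 20%) cannot be a timing artifact.]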
* Re: Re: [PATCH] x86_64/memset: simple optimizations
  2015-02-10 20:52 ` Denys Vlasenko
@ 2015-02-10 20:54   ` Rich Felker
  0 siblings, 0 replies; 4+ messages in thread
From: Rich Felker @ 2015-02-10 20:54 UTC (permalink / raw)
To: musl

On Tue, Feb 10, 2015 at 09:52:54PM +0100, Denys Vlasenko wrote:
> On Tue, Feb 10, 2015 at 9:43 PM, Rich Felker <dalias@aerifal.cx> wrote:
> > On Tue, Feb 10, 2015 at 09:27:17PM +0100, Denys Vlasenko wrote:
> >> On Sat, Feb 7, 2015 at 2:06 PM, Rich Felker <dalias@aerifal.cx> wrote:
> >> /* libc has incredibly messy way of doing this,
> >>  * typically requiring -lrt. We just skip all this mess */
> >> static void get_mono(struct timespec *ts)
> >> {
> >>         syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
> >> }
> >
> > FWIW, this is a bad idea; you get syscall overhead in your
> > measurements. If you just use clock_gettime (the function) you'll get
> > vdso results (no syscall).
>
> I repeat the memset 32 times between reading timestamps. Thus, even
> with the "small" 20kb memset test there are 640kb of writes to L1.
> This is big enough to make the overhead insignificant.

Yes, I agree it's probably okay the way you've structured the test
here; that's why I mentioned it as a "FWIW" rather than an objection
to the results. It was more an aside remark about how this technique
could be problematic in the future. Sorry for not being clear.

Rich

^ permalink raw reply	[flat|nested] 4+ messages in thread
end of thread, other threads:[~2015-02-10 20:54 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1423258814-9045-1-git-send-email-vda.linux@googlemail.com>
     [not found] ` <20150207003535.GS23507@brightrain.aerifal.cx>
     [not found]   ` <CAK1hOcP8zCo843y0VhqMr6Wc0JtaD4V4V=TicLnmJ6SynBhmGw@mail.gmail.com>
     [not found]     ` <20150207130655.GW23507@brightrain.aerifal.cx>
2015-02-10 20:27       ` [PATCH] x86_64/memset: simple optimizations Denys Vlasenko
2015-02-10 20:43         ` Rich Felker
2015-02-10 20:52           ` Denys Vlasenko
2015-02-10 20:54             ` Rich Felker
Code repositories for project(s) associated with this public inbox:
	https://git.vuxu.org/mirror/musl/