mailing list of musl libc
 help / color / mirror / code / Atom feed
From: Denys Vlasenko <vda.linux@googlemail.com>
To: Rich Felker <dalias@libc.org>
Cc: musl <musl@lists.openwall.com>
Subject: Re: [PATCH] x86_64/memset: use "small block" code for blocks up to 30 bytes long
Date: Tue, 17 Feb 2015 17:51:11 +0100	[thread overview]
Message-ID: <CAK1hOcOaN4SnpO2jMGib3tFEf+c8=Tu8Nwi2YnOhzefpSSqTng@mail.gmail.com> (raw)
In-Reply-To: <20150217161222.GF23507@brightrain.aerifal.cx>

[-- Attachment #1: Type: text/plain, Size: 2117 bytes --]

On Tue, Feb 17, 2015 at 5:12 PM, Rich Felker <dalias@libc.org> wrote:
> On Tue, Feb 17, 2015 at 02:08:52PM +0100, Denys Vlasenko wrote:
>> >> Please see attached file.
>> >
>> > I tried it and it's ~1 cycle slower for at least sizes 16-30;
>> > presumably we're seeing the cost of the extra compare/branch at these
>> > sizes but not at others. What does your timing test show?
>>
>> See below.
>> First column - result of my2.s
>> Second column - result of vda1.s
>>
>> Basically, the "rep stosq" code path got a bit faster, while
>> small memsets stayed the same.
>
> Can you post your test program for me to try out? Here's what I've
> been using, attached.

With your program I see similar results:

...
size 50: min=10, avg=10           min=10, avg=10
size 52: min=10, avg=10           min=10, avg=10
size 54: min=10, avg=11           min=10, avg=11
size 56: min=10, avg=11           min=10, avg=11
size 58: min=10, avg=11           min=10, avg=10
size 60: min=10, avg=10           min=10, avg=12
size 62: min=10, avg=10           min=10, avg=11
size 64: min=18, avg=18           min=18, avg=22
size 96: min=17, avg=17           min=18, avg=18
size 128: min=31, avg=32          min=32, avg=32
size 160: min=35, avg=37          min=33, avg=37
size 192: min=40, avg=40          min=36, avg=37
size 224: min=43, avg=43          min=40, avg=40
size 256: min=44, avg=47          min=43, avg=43
size 288: min=47, avg=48          min=46, avg=47
size 320: min=50, avg=52          min=52, avg=52
size 352: min=53, avg=54          min=52, avg=60
size 384: min=56, avg=57          min=55, avg=57
size 416: min=59, avg=60          min=62, avg=63
size 448: min=63, avg=65          min=66, avg=66
size 480: min=66, avg=71          min=69, avg=69
size 512: min=73, avg=74          min=73, avg=76
size 1024: min=127, avg=129       min=127, avg=129
size 2048: min=221, avg=236       min=221, avg=236
size 4096: min=425, avg=444       min=424, avg=450
size 8192: min=831, avg=881       min=830, avg=883
size 16384: min=1644, avg=1717    min=1643, avg=1748

My test program is attached, I use:

gcc -O2 -Wall memset-cycles.c FOO.s

[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 3388 bytes --]

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
/* Old glibc (< 2.3.4) does not provide this constant. We use syscall
 * directly so this definition is safe. */
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1
#endif

#define BUF (2*1024)
#define FILL 0

/* Read the monotonic clock into *ts using the raw clock_gettime syscall.
 * libc has an incredibly messy way of doing this, typically requiring
 * -lrt, so we skip the wrapper and call the kernel directly.
 *
 * A benchmark without a working clock is meaningless, so a failing
 * syscall is reported and terminates the program instead of leaving
 * *ts uninitialized for the caller to read. */
static void get_mono(struct timespec *ts)
{
	if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts) != 0) {
		perror("clock_gettime");
		exit(EXIT_FAILURE);
	}
}

//void *musl_memset(void *s, int c, size_t n);

/* memset() built on "rep stosq" (x86_64 only).
 *
 * ptr: start of the region to fill
 * c:   fill value; only its low byte is used, as with memset()
 * cnt: number of bytes to fill
 *
 * Returns ptr.
 *
 * The bulk of the region is written 8 bytes at a time; the remaining
 * cnt % 8 bytes are written with "rep stosb" so that the whole region
 * is filled even when cnt is not a multiple of 8. */
void *memset_rep_stosq(void *ptr, int c, size_t cnt)
{
	/* Replicate the fill byte into all 8 bytes of the store pattern. */
	uint64_t pattern = (unsigned char)c * 0x0101010101010101ULL;
	void *dst = ptr;
	size_t n = cnt / 8;

	/* __asm__ rather than asm: the latter is not a keyword under
	 * strict -std=c99/-std=c11. */
	__asm__ volatile(
		"rep stosq"
	: "+D" (dst), "+c" (n)
	: "a" (pattern)
	: "memory"
	);

	n = cnt % 8;
	if (n) {
		/* dst was advanced past the quadwords by the first asm. */
		__asm__ volatile(
			"rep stosb"
		: "+D" (dst), "+c" (n)
		: "a" (pattern)
		: "memory"
		);
	}
	return ptr;
}

/* memset() built on non-temporal 8-byte stores (movnti, x86_64 only).
 *
 * ptr: start of the region to fill
 * c:   fill value; only its low byte is used, as with memset()
 * cnt: number of bytes to fill
 *
 * Returns ptr.
 *
 * The asm loop is a do-while (dec/jnz), so it must not be entered with
 * a zero count: the counter would wrap and the loop would run off the
 * end of the buffer. The cnt % 8 tail is written with ordinary byte
 * stores. */
void *memset_movnti(void *ptr, int c, size_t cnt)
{
	/* Replicate the fill byte into all 8 bytes of the store pattern. */
	uint64_t pattern = (unsigned char)c * 0x0101010101010101ULL;
	unsigned char *dst = ptr;
	size_t words = cnt / 8;
	size_t tail = cnt % 8;

	if (words) {
		__asm__ volatile(
			"1: movnti %%rax,(%%rdi)\n"
			"add $8,%%rdi\n"
			"dec %%rcx\n"
			"jnz 1b\n"
			"sfence\n"
		: "+D" (dst), "+c" (words)
		: "a" (pattern)
		: "memory", "cc"
		);
	}

	/* dst now points just past the last quadword written. */
	while (tail--)
		*dst++ = (unsigned char)c;
	return ptr;
}

/* memset() built on non-temporal 8-byte stores (movnti, x86_64 only),
 * unrolled to write 32 bytes per loop iteration.
 *
 * ptr: start of the region to fill
 * c:   fill value; only its low byte is used, as with memset()
 * cnt: number of bytes to fill
 *
 * Returns ptr.
 *
 * The asm loop is a do-while (dec/jnz), so it is skipped entirely when
 * there is no full 32-byte chunk; otherwise the counter would wrap and
 * the loop would run off the end of the buffer. The cnt % 32 tail is
 * written with ordinary byte stores. */
void *memset_movnti_unroll(void *ptr, int c, size_t cnt)
{
	/* Replicate the fill byte into all 8 bytes of the store pattern. */
	uint64_t pattern = (unsigned char)c * 0x0101010101010101ULL;
	unsigned char *dst = ptr;
	size_t chunks = cnt / (8*4);
	size_t tail = cnt % (8*4);

	if (chunks) {
		__asm__ volatile(
			"1:\n"
			"movnti %%rax,(%%rdi)\n"
			"movnti %%rax,8(%%rdi)\n"
			"movnti %%rax,16(%%rdi)\n"
			"movnti %%rax,24(%%rdi)\n"
			"add $32,%%rdi\n"
			"dec %%rcx\n"
			"jnz 1b\n"
			"sfence\n"
		: "+D" (dst), "+c" (chunks)
		: "a" (pattern)
		: "memory", "cc"
		);
	}

	/* dst now points just past the last 32-byte chunk written. */
	while (tail--)
		*dst++ = (unsigned char)c;
	return ptr;
}

/* Return the nanosecond field (tv_nsec) of the current monotonic time.
 *
 * Only the sub-second part is returned, so the value wraps every
 * second; difft() compensates for that wrap when two readings are
 * subtracted. */
unsigned gett(void)
{
	struct timespec ts;

	get_mono(&ts);
	return (unsigned)ts.tv_nsec;
}

/* Elapsed time between two sub-second nanosecond readings.
 *
 * t2: later reading, t1: earlier reading.
 * If the plain difference comes out negative (when viewed as a signed
 * int), the readings straddled a second boundary, so one second's
 * worth of nanoseconds is added back. */
unsigned difft(unsigned t2, unsigned t1)
{
	unsigned delta = t2 - t1;

	return ((int)delta < 0) ? delta + 1000000000 : delta;
}

/* Time the fill routine m on an sz-byte block at buf and print the best
 * observed throughput in bytes per nanosecond.
 *
 * sz:   block size in bytes passed to m
 * buf:  destination buffer passed to m
 * m:    memset-compatible function under test
 * name: label for the routine; not used in the printed output
 *
 * Each timed round calls m a batch of times so that small blocks are
 * not dominated by timer overhead; the minimum over all rounds is
 * reported. */
void measure(unsigned sz, void *buf, void* (*m)(void *ptr, int c, size_t cnt), const char *name)
{
	/* About 256k bytes of work per round; (sz|1) avoids dividing by 0. */
	unsigned batch = (256*1024) / (sz|1);
	unsigned best = -1U;
	unsigned round;

	(void)name;
	if (batch == 0)
		batch = 1;

	/* Two untimed calls to warm up the caches. */
	m(buf, FILL, sz);
	m(buf, FILL, sz);

	for (round = 999; round != 0; round--) {
		unsigned left = batch;
		unsigned start = gett();
		unsigned elapsed;

		do {
			m(buf, FILL, sz);
		} while (--left);
		elapsed = difft(gett(), start);
		if (elapsed < best)
			best = elapsed;
	}
	printf("%u byte block: %.2f bytes/ns\n", sz, (double)(sz) * batch / best);
}

/* Benchmark driver: measures memset() on a 256-byte-aligned buffer for
 * every block size from BUF down to 0 and prints one line per size.
 *
 * To benchmark one of the alternative routines in this file
 * (memset_rep_stosq, memset_movnti, memset_movnti_unroll), pass it to
 * measure() in place of memset; add a small offset to buf to test
 * misaligned destinations.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated. */
int main(void)
{
	int sz;
	char *buf;
	/* 4096 bytes of slack leave room for the alignment adjustment. */
	char *raw = malloc(BUF + 4096);

	if (!raw) {
		perror("malloc");
		return 1;
	}

	/* Advance to the next 256-byte boundary strictly above raw. */
	buf = (char *)(((uintptr_t)raw + 0x100) & ~(uintptr_t)0xff);

	setlinebuf(stdout);
	printf("size:%u (%uk) buf:%p\n", (unsigned)BUF, (unsigned)(BUF/1024), (void *)buf);

	for (sz = BUF; sz >= 0; sz--)
		measure((unsigned)sz, buf, memset, "musl");

	/* Free the pointer malloc returned, not the aligned one. */
	free(raw);
	return 0;
}

  reply	other threads:[~2015-02-17 16:51 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-02-13 16:39 Denys Vlasenko
2015-02-14 19:35 ` Rich Felker
2015-02-15  4:06   ` Rich Felker
2015-02-15 14:07     ` Denys Vlasenko
2015-02-15 15:03       ` Rich Felker
2015-02-15 21:44         ` Denys Vlasenko
2015-02-15 22:55           ` Rich Felker
2015-02-16 10:09             ` Denys Vlasenko
2015-02-16 15:12               ` Rich Felker
2015-02-16 17:36           ` Rich Felker
2015-02-17 13:08             ` Denys Vlasenko
2015-02-17 16:12               ` Rich Felker
2015-02-17 16:51                 ` Denys Vlasenko [this message]
2015-02-17 17:30                   ` Denys Vlasenko
2015-02-17 17:40                   ` Rich Felker
2015-02-17 18:53                     ` Denys Vlasenko
2015-02-17 21:12                       ` Rich Felker
2015-02-18  9:05                         ` Denys Vlasenko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAK1hOcOaN4SnpO2jMGib3tFEf+c8=Tu8Nwi2YnOhzefpSSqTng@mail.gmail.com' \
    --to=vda.linux@googlemail.com \
    --cc=dalias@libc.org \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).