From: Denys Vlasenko <vda.linux@googlemail.com>
To: Rich Felker <dalias@libc.org>
Cc: musl <musl@lists.openwall.com>
Subject: Re: [PATCH] x86_64/memset: use "small block" code for blocks up to 30 bytes long
Date: Tue, 17 Feb 2015 17:51:11 +0100 [thread overview]
Message-ID: <CAK1hOcOaN4SnpO2jMGib3tFEf+c8=Tu8Nwi2YnOhzefpSSqTng@mail.gmail.com> (raw)
In-Reply-To: <20150217161222.GF23507@brightrain.aerifal.cx>
[-- Attachment #1: Type: text/plain, Size: 2117 bytes --]
On Tue, Feb 17, 2015 at 5:12 PM, Rich Felker <dalias@libc.org> wrote:
> On Tue, Feb 17, 2015 at 02:08:52PM +0100, Denys Vlasenko wrote:
>> >> Please see attached file.
>> >
>> > I tried it and it's ~1 cycle slower for at least sizes 16-30;
>> > presumably we're seeing the cost of the extra compare/branch at these
>> > sizes but not at others. What does your timing test show?
>>
>> See below.
>> First column - result of my2.s
>> Second column - result of vda1.s
>>
>> Basically, the "rep stosq" code path got a bit faster, while
>> small memsets stayed the same.
>
> Can you post your test program for me to try out? Here's what I've
> been using, attached.
With your program I see similar results:
...
size 50: min=10, avg=10 min=10, avg=10
size 52: min=10, avg=10 min=10, avg=10
size 54: min=10, avg=11 min=10, avg=11
size 56: min=10, avg=11 min=10, avg=11
size 58: min=10, avg=11 min=10, avg=10
size 60: min=10, avg=10 min=10, avg=12
size 62: min=10, avg=10 min=10, avg=11
size 64: min=18, avg=18 min=18, avg=22
size 96: min=17, avg=17 min=18, avg=18
size 128: min=31, avg=32 min=32, avg=32
size 160: min=35, avg=37 min=33, avg=37
size 192: min=40, avg=40 min=36, avg=37
size 224: min=43, avg=43 min=40, avg=40
size 256: min=44, avg=47 min=43, avg=43
size 288: min=47, avg=48 min=46, avg=47
size 320: min=50, avg=52 min=52, avg=52
size 352: min=53, avg=54 min=52, avg=60
size 384: min=56, avg=57 min=55, avg=57
size 416: min=59, avg=60 min=62, avg=63
size 448: min=63, avg=65 min=66, avg=66
size 480: min=66, avg=71 min=69, avg=69
size 512: min=73, avg=74 min=73, avg=76
size 1024: min=127, avg=129 min=127, avg=129
size 2048: min=221, avg=236 min=221, avg=236
size 4096: min=425, avg=444 min=424, avg=450
size 8192: min=831, avg=881 min=830, avg=883
size 16384: min=1644, avg=1717 min=1643, avg=1748
My test program is attached, I use:
gcc -O2 -Wall memset-cycles.c FOO.s
[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 3388 bytes --]
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
/* Old glibc (< 2.3.4) does not provide this constant. We use syscall
* directly so this definition is safe. */
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1
#endif
#define BUF (2*1024)
#define FILL 0
/*
 * Read the monotonic clock into *ts.
 *
 * The clock_gettime syscall is issued directly through syscall(), which
 * avoids going through libc's clock_gettime wrapper (historically that
 * could require linking with -lrt).
 *
 * If the syscall fails, *ts is set to zero so that callers never read an
 * uninitialized timespec.
 */
static void get_mono(struct timespec *ts)
{
	if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts) != 0) {
		ts->tv_sec = 0;
		ts->tv_nsec = 0;
	}
}
//void *musl_memset(void *s, int c, size_t n);
/*
 * memset() variant built on the x86-64 string-store instructions.
 *
 * ptr: start of the region to fill
 * c:   fill value; only its low 8 bits are used, as with memset()
 * cnt: number of bytes to fill
 *
 * Returns ptr.
 *
 * The bulk of the region is written 8 bytes at a time with "rep stosq";
 * the remaining cnt % 8 bytes (if any) are written with "rep stosb".
 * NOTE(review): relies on the direction flag being clear on entry, as
 * the x86-64 SysV ABI specifies -- confirm if used in another context.
 */
void *memset_rep_stosq(void *ptr, int c, size_t cnt)
{
	/* Replicate the fill byte into every byte of a 64-bit word */
	unsigned long pattern = (unsigned char)c * 0x0101010101010101UL;
	void *dst = ptr;
	size_t qwords = cnt / 8;
	size_t tail = cnt % 8;

	/* rdi and rcx are updated by the instruction, hence "+" operands */
	__asm__ __volatile__(
		"rep stosq"
		: "+D" (dst), "+c" (qwords)
		: "a" (pattern)
		: "memory"
	);
	if (tail) {
		/* dst already points just past the last full 8-byte word */
		__asm__ __volatile__(
			"rep stosb"
			: "+D" (dst), "+c" (tail)
			: "a" (pattern)
			: "memory"
		);
	}
	return ptr;
}
/*
 * memset() variant that writes 8 bytes per iteration with the
 * non-temporal store instruction "movnti".
 *
 * ptr: start of the region to fill
 * c:   fill value; only its low 8 bits are used, as with memset()
 * cnt: number of bytes to fill
 *
 * Returns ptr.
 *
 * The asm loop is a dec/jnz loop that always executes at least once, so
 * it is skipped entirely when there is no full 8-byte word to write;
 * entering it with a zero count would wrap the counter and run far past
 * the end of the buffer. The cnt % 8 trailing bytes are written with
 * ordinary byte stores.
 */
void *memset_movnti(void *ptr, int c, size_t cnt)
{
	/* Replicate the fill byte into every byte of a 64-bit word */
	unsigned long pattern = (unsigned char)c * 0x0101010101010101UL;
	unsigned char *dst = ptr;
	size_t qwords = cnt / 8;
	size_t tail = cnt % 8;

	if (qwords) {
		__asm__ __volatile__(
			"1: movnti %[val],(%[dst])\n"
			"add $8,%[dst]\n"
			"dec %[n]\n"
			"jnz 1b\n"
			"sfence\n"
			: [dst] "+r" (dst), [n] "+r" (qwords)
			: [val] "r" (pattern)
			: "memory", "cc"
		);
	}
	/* dst now points just past the last full word */
	while (tail--)
		*dst++ = (unsigned char)c;
	return ptr;
}
/*
 * memset() variant using the non-temporal store instruction "movnti",
 * unrolled to write 32 bytes (four 8-byte stores) per loop iteration.
 *
 * ptr: start of the region to fill
 * c:   fill value; only its low 8 bits are used, as with memset()
 * cnt: number of bytes to fill
 *
 * Returns ptr.
 *
 * The asm loop is a dec/jnz loop that always executes at least once, so
 * it is skipped entirely when there is no full 32-byte block to write;
 * entering it with a zero count would wrap the counter and run far past
 * the end of the buffer. The cnt % 32 trailing bytes are written with
 * ordinary byte stores.
 */
void *memset_movnti_unroll(void *ptr, int c, size_t cnt)
{
	/* Replicate the fill byte into every byte of a 64-bit word */
	unsigned long pattern = (unsigned char)c * 0x0101010101010101UL;
	unsigned char *dst = ptr;
	size_t blocks = cnt / (8*4);
	size_t tail = cnt % (8*4);

	if (blocks) {
		__asm__ __volatile__(
			"1:\n"
			"movnti %[val],(%[dst])\n"
			"movnti %[val],8(%[dst])\n"
			"movnti %[val],16(%[dst])\n"
			"movnti %[val],24(%[dst])\n"
			"add $32,%[dst]\n"
			"dec %[n]\n"
			"jnz 1b\n"
			"sfence\n"
			: [dst] "+r" (dst), [n] "+r" (blocks)
			: [val] "r" (pattern)
			: "memory", "cc"
		);
	}
	/* dst now points just past the last full 32-byte block */
	while (tail--)
		*dst++ = (unsigned char)c;
	return ptr;
}
/*
 * Return the current timestamp as an unsigned value.
 *
 * Only the sub-second field of the clock reading is returned (tv_nsec of
 * the monotonic clock in the enabled branch), so the value wraps around
 * once per second. The disabled branch is an alternative based on
 * gettimeofday() that returns the microsecond field instead.
 */
unsigned gett()
{
#if 1
	struct timespec now;

	get_mono(&now);
	return now.tv_nsec;
#else
	struct timeval now;

	gettimeofday(&now, NULL);
	return now.tv_usec;
#endif
}
/*
 * Return t2 - t1, compensating for a single wraparound of the timer.
 *
 * The subtraction is done in unsigned arithmetic; if the result is
 * negative when viewed as a signed int, 1000000000 is added to it.
 * NOTE(review): this matches timestamps that are nanosecond fields
 * wrapping once per second -- confirm against the timestamp source.
 */
unsigned difft(unsigned t2, unsigned t1)
{
	unsigned delta = t2 - t1;

	return ((int)delta < 0) ? delta + 1000000000 : delta;
}
/*
 * Benchmark one memset-like function on a block of sz bytes and print
 * the best observed throughput.
 *
 * sz:   block size in bytes passed to m
 * buf:  destination buffer passed to m
 * m:    function under test, with memset()'s signature
 * name: label for the function under test; not used by the output
 *       currently produced
 *
 * m is called with FILL as the fill value. Each timing sample covers a
 * batch of calls so that small blocks are not dominated by the cost of
 * reading the timer: the batch is (256*1024) / (sz|1) calls, and at
 * least one. The minimum time over 999 samples is kept, and the result
 * is printed as sz * batch / minimum.
 */
void measure(unsigned sz, void *buf, void* (*m)(void *ptr, int c, size_t cnt), const char *name)
{
	unsigned best = -1U;
	unsigned batch = (256*1024) / (sz|1);
	unsigned sample;

	if (batch == 0)
		batch = 1;

	/* Two untimed calls to warm up caches */
	m(buf, FILL, sz);
	m(buf, FILL, sz);

	for (sample = 999; sample != 0; sample--) {
		unsigned left = batch;
		unsigned start = gett();
		unsigned elapsed;

		do {
			m(buf, FILL, sz);
		} while (--left);
		elapsed = difft(gett(), start);
		if (elapsed < best)
			best = elapsed;
	}
	printf("%u byte block: %.2f bytes/ns\n", sz, (double)(sz) * batch / best);
}
/*
 * Benchmark driver: measures memset() for every block size from BUF
 * bytes down to 0 bytes, using a 256-byte-aligned destination buffer.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main()
{
	int sz;
	char *raw;
	char *buf;

	raw = malloc(BUF + 4096);
	if (!raw) {
		perror("malloc");
		return 1;
	}
	/*
	 * Advance to the next 256-byte boundary strictly above raw. The
	 * allocation has 4096 spare bytes, so BUF bytes still fit.
	 * NOTE(review): assumes a pointer fits in unsigned long (true on
	 * LP64 targets).
	 */
	buf = (char*)(((unsigned long)raw + 0x100) & ~0xffUL);
	setlinebuf(stdout);
	printf("size:%u (%uk) buf:%p\n", (unsigned)BUF, (unsigned)(BUF/1024), (void*)buf);
	/*
	 * To benchmark another implementation or a misaligned destination,
	 * pass them to measure() in place of memset/buf, for example:
	 *   measure(sz, buf+1, memset_rep_stosq, "stos+1");
	 */
	for (sz = BUF; sz >= 0; sz--)
		measure(sz, buf, memset, "musl");
	free(raw);
	return 0;
}
next prev parent reply other threads:[~2015-02-17 16:51 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-02-13 16:39 Denys Vlasenko
2015-02-14 19:35 ` Rich Felker
2015-02-15 4:06 ` Rich Felker
2015-02-15 14:07 ` Denys Vlasenko
2015-02-15 15:03 ` Rich Felker
2015-02-15 21:44 ` Denys Vlasenko
2015-02-15 22:55 ` Rich Felker
2015-02-16 10:09 ` Denys Vlasenko
2015-02-16 15:12 ` Rich Felker
2015-02-16 17:36 ` Rich Felker
2015-02-17 13:08 ` Denys Vlasenko
2015-02-17 16:12 ` Rich Felker
2015-02-17 16:51 ` Denys Vlasenko [this message]
2015-02-17 17:30 ` Denys Vlasenko
2015-02-17 17:40 ` Rich Felker
2015-02-17 18:53 ` Denys Vlasenko
2015-02-17 21:12 ` Rich Felker
2015-02-18 9:05 ` Denys Vlasenko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAK1hOcOaN4SnpO2jMGib3tFEf+c8=Tu8Nwi2YnOhzefpSSqTng@mail.gmail.com' \
--to=vda.linux@googlemail.com \
--cc=dalias@libc.org \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).