mailing list of musl libc
 help / color / mirror / code / Atom feed
From: Denys Vlasenko <vda.linux@googlemail.com>
To: Rich Felker <dalias@aerifal.cx>, musl@lists.openwall.com
Subject: Re: [PATCH] x86_64/memset: simple optimizations
Date: Tue, 10 Feb 2015 21:27:17 +0100	[thread overview]
Message-ID: <CAK1hOcN0icJUTsyic+g4Qq_Jgzkn8+FUQFbNjkyGRyTL4EbEGg@mail.gmail.com> (raw)
In-Reply-To: <20150207130655.GW23507@brightrain.aerifal.cx>

[-- Attachment #1: Type: text/plain, Size: 1724 bytes --]

On Sat, Feb 7, 2015 at 2:06 PM, Rich Felker <dalias@aerifal.cx> wrote:
> On Sat, Feb 07, 2015 at 01:49:43PM +0100, Denys Vlasenko wrote:
>> On Sat, Feb 7, 2015 at 1:35 AM, Rich Felker <dalias@aerifal.cx> wrote:
>> What speedups?
>> In particular:
>> - perform pre-alignment if dst is unaligned
>
> For the rep stosq path? Does it help? I don't recall the details but I
> seem to remember both docs and measurements showing no reliable
> benefit from alignment for this instruction, and we had people trying
> things on several different cpu models. I'm open to hearing evidence
> to the contrary though.

size:20k buf:0x7f38656e2100
stos:25978 ns (times 32), 25.227500 bytes/ns
stos+1:31395 ns (times 32), 20.874662 bytes/ns
stos+4:31396 ns (times 32), 20.873997 bytes/ns
stos+8:24446 ns (times 32), 26.808476 bytes/ns

size:50k buf:0x7fbca1dc9100
stos:68149 ns (times 32), 24.041439 bytes/ns
stos+1:85762 ns (times 32), 19.104032 bytes/ns
stos+4:85762 ns (times 32), 19.104032 bytes/ns
stos+8:68204 ns (times 32), 24.022051 bytes/ns

size:1024k buf:0x7fa3036a5100
stos:1632285 ns (times 32), 20.556724 bytes/ns
stos+1:1891092 ns (times 32), 17.743416 bytes/ns
stos+4:1891089 ns (times 32), 17.743444 bytes/ns
stos+8:1632181 ns (times 32), 20.558034 bytes/ns

size:5000k buf:0x7fdf5cd6b100
stos:15592138 ns (times 32), 10.558298 bytes/ns
stos+1:15501841 ns (times 32), 10.619799 bytes/ns
stos+4:15507773 ns (times 32), 10.615737 bytes/ns
stos+8:15589617 ns (times 32), 10.560005 bytes/ns

The source is attached.

This data shows that (on my CPU, a Sandy Bridge with 4MB L2)
8-byte alignment helps when the stores fit into L1 or L2.
If the memset is larger than L2, memory throughput is too low
and there is no measurable difference.

[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 3216 bytes --]

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
/* Old glibc (< 2.3.4) does not provide this constant. We use syscall
 * directly so this definition is safe. */
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1
#endif

/* Read CLOCK_MONOTONIC into *ts by invoking the clock_gettime syscall
 * directly. The libc route is deliberately avoided because it is messy
 * and typically requires linking with -lrt.
 *
 * If the syscall fails, *ts is set to zero so that callers never read
 * an uninitialized timespec. */
static void get_mono(struct timespec *ts)
{
	if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts) != 0) {
		ts->tv_sec = 0;
		ts->tv_nsec = 0;
	}
}

/*
 * Zero cnt consecutive 8-byte words starting at ptr with one "rep stosq".
 *
 * ptr: start of the region; it does not need to be 8-byte aligned.
 * cnt: number of 8-byte words (not bytes) to clear; 0 stores nothing.
 *
 * x86-64 only. Relies on the direction flag being clear on entry, as the
 * System V ABI requires at function boundaries.
 */
void memset_rep_stosq(void *ptr, unsigned long cnt)
{
	/*
	 * rep stosq advances %rdi and counts %rcx down to zero, so both are
	 * read-write operands; %rax only supplies the fill value and is left
	 * unchanged. __asm__ is used instead of asm so the code also builds
	 * in strict ISO modes (-std=c99, -std=c11).
	 */
	__asm__ volatile(
		"rep stosq"
	: "+D" (ptr), "+c" (cnt)
	: "a" (0UL)
	: "memory"
	);
}

/*
 * Zero cnt consecutive 8-byte words starting at ptr using one movnti
 * (non-temporal store) per word, followed by an sfence.
 *
 * ptr: start of the region.
 * cnt: number of 8-byte words (not bytes) to clear; 0 is a no-op.
 *
 * x86-64 only.
 */
void memset_movnti(void *ptr, unsigned long cnt)
{
	/*
	 * The loop decrements before it tests, so entering it with a zero
	 * count would wrap around and store 2^64 words.
	 */
	if (cnt == 0)
		return;

	/*
	 * %rdi and %rcx are modified by the loop and are therefore
	 * read-write operands; %rax holds the zero being stored.
	 */
	__asm__ volatile(
		"1: movnti %%rax,(%%rdi)\n"
		"add $8,%%rdi\n"
		"dec %%rcx\n"
		"jnz 1b\n"
		"sfence\n"
	: "+D" (ptr), "+c" (cnt)
	: "a" (0UL)
	: "memory", "cc"
	);
}

/*
 * Zero cnt consecutive 8-byte words starting at ptr using movnti
 * (non-temporal) stores, four words per loop iteration, followed by
 * an sfence.
 *
 * ptr: start of the region.
 * cnt: number of 8-byte words (not bytes) to clear. Any value is
 *      accepted: the cnt % 4 words left over after the unrolled loop
 *      are stored one at a time, and 0 is a no-op.
 *
 * x86-64 only.
 */
void memset_movnti_unroll(void *ptr, unsigned long cnt)
{
	unsigned long blocks = cnt / 4;	/* full 32-byte iterations */
	unsigned long tail = cnt % 4;	/* leftover single words */

	if (cnt == 0)
		return;

	/*
	 * Both loops decrement before testing, so each one is entered only
	 * with a non-zero count; a zero count would wrap around.
	 */
	if (blocks != 0) {
		__asm__ volatile(
			"1:\n"
			"movnti %%rax,(%%rdi)\n"
			"movnti %%rax,8(%%rdi)\n"
			"movnti %%rax,16(%%rdi)\n"
			"movnti %%rax,24(%%rdi)\n"
			"add $32,%%rdi\n"
			"dec %%rcx\n"
			"jnz 1b\n"
		: "+D" (ptr), "+c" (blocks)
		: "a" (0UL)
		: "memory", "cc"
		);
	}

	/* ptr was advanced past the last full block by the asm above. */
	if (tail != 0) {
		__asm__ volatile(
			"1: movnti %%rax,(%%rdi)\n"
			"add $8,%%rdi\n"
			"dec %%rcx\n"
			"jnz 1b\n"
		: "+D" (ptr), "+c" (tail)
		: "a" (0UL)
		: "memory", "cc"
		);
	}

	/* Single fence after all non-temporal stores have been issued. */
	__asm__ volatile("sfence" : : : "memory");
}

/*
 * Return the nanosecond field of the current monotonic time.
 *
 * Only tv_nsec is returned; the seconds field is discarded, so the
 * value starts again from zero every second. Pair it with difft() to
 * obtain an elapsed time.
 */
unsigned gett()
{
	struct timespec now;

	get_mono(&now);
	return (unsigned)now.tv_nsec;
}

/*
 * Return t2 - t1 for two nanosecond-within-second readings.
 *
 * If the raw difference is negative when viewed as a signed int (the
 * later reading t2 is numerically smaller than t1), one second's worth
 * of nanoseconds is added to compensate for the wrap.
 */
unsigned difft(unsigned t2, unsigned t1)
{
	unsigned delta = t2 - t1;

	return ((int)delta < 0) ? delta + 1000000000 : delta;
}

/* Number of bytes cleared by each benchmarked call; also used by
 * main() to size the allocation and to print the "size:" header. */
#define BUF (50*1024)
/* The same region expressed in 8-byte words, which is the unit of the
 * cnt argument passed to the fill routines. */
#define BUF8 (BUF/8)

/*
 * Benchmark one fill routine and print its best observed time.
 *
 * buf:  start of the region to clear; at least BUF bytes must be
 *       writable from here.
 * m:    routine under test, called as m(buf, BUF8).
 * name: label printed at the start of the result line.
 *
 * After a one-second sleep and one untimed call, 999 rounds are run.
 * Each round times REPEAT back-to-back calls, and the smallest round
 * time is reported together with the resulting bytes/ns throughput.
 */
void measure(void *buf, void (*m)(void *ptr, unsigned long cnt), const char *name)
{
/* Calls per timed round. The call loop and the throughput formula both
 * use this value, so the two cannot drift apart. */
#define REPEAT 32
	unsigned best = -1U;	/* smallest round time so far, in ns */
	unsigned elapsed, start;
	unsigned rounds_left = 999;
	int i;

	sleep(1);	/* NOTE(review): presumably lets the system settle - confirm */
	m(buf, BUF8);	/* untimed call before measuring starts */

	while (rounds_left--) {
		start = gett();
		for (i = 0; i < REPEAT; i++)
			m(buf, BUF8);
		elapsed = difft(gett(), start);
		if (best > elapsed)
			best = elapsed;
	}
	printf("%s:%u ns (times %d), %.6f bytes/ns\n", name, best, REPEAT, (double)(BUF) * REPEAT / best);
}

/*
 * Allocate a scratch buffer, align it to a 256-byte boundary, and time
 * memset_rep_stosq at offsets 0, 1, 4 and 8 from that boundary.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main()
{
	char *raw = malloc(8*BUF + 4096);
	char *buf;

	if (raw == NULL) {
		perror("malloc");
		return 1;
	}

	/*
	 * Move forward to the next 256-byte boundary strictly above raw
	 * (an advance of 1..256 bytes), so the low 8 address bits of buf
	 * are zero and the +1/+4/+8 offsets below are exact misalignments.
	 */
	buf = raw + (0x100 - ((unsigned long)raw & 0xffUL));

	printf("size:%dk buf:%p\n", BUF/1024, (void*)buf);
//	measure(buf, memset_movnti, "movnti");
//	measure(buf, memset_movnti_unroll, "movnti_unroll");
	measure(buf, memset_rep_stosq, "stos");
//	measure(buf+1, memset_movnti, "movnti+1");
//	measure(buf+1, memset_movnti_unroll, "movnti_unroll+1");
	measure(buf+1, memset_rep_stosq, "stos+1");
//	measure(buf+3, memset_movnti, "movnti+3");
//	measure(buf+3, memset_movnti_unroll, "movnti_unroll+3");
	measure(buf+4, memset_rep_stosq, "stos+4");
	measure(buf+8, memset_rep_stosq, "stos+8");

	free(raw);
	return 0;
}

       reply	other threads:[~2015-02-10 20:27 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <1423258814-9045-1-git-send-email-vda.linux@googlemail.com>
     [not found] ` <20150207003535.GS23507@brightrain.aerifal.cx>
     [not found]   ` <CAK1hOcP8zCo843y0VhqMr6Wc0JtaD4V4V=TicLnmJ6SynBhmGw@mail.gmail.com>
     [not found]     ` <20150207130655.GW23507@brightrain.aerifal.cx>
2015-02-10 20:27       ` Denys Vlasenko [this message]
2015-02-10 20:43         ` Rich Felker
2015-02-10 20:52           ` Denys Vlasenko
2015-02-10 20:54             ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAK1hOcN0icJUTsyic+g4Qq_Jgzkn8+FUQFbNjkyGRyTL4EbEGg@mail.gmail.com \
    --to=vda.linux@googlemail.com \
    --cc=dalias@aerifal.cx \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).