From: Denys Vlasenko <vda.linux@googlemail.com>
To: Rich Felker <dalias@libc.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>, musl@lists.openwall.com
Subject: [PATCH 3/3] i386/memset: move byte-extending IMUL up, drop one insn
Date: Mon, 12 Oct 2015 20:30:34 +0200 [thread overview]
Message-ID: <1444674635-25421-3-git-send-email-vda.linux@googlemail.com> (raw)
In-Reply-To: <1444674635-25421-1-git-send-email-vda.linux@googlemail.com>
"mov %dl,%dh" may have a register stall (even though rumor has it
newest intel chips do double accounting for low byte regs,
on such CPUs subsequent "mov %dl,(%eax)" won't wait for nex value
of %edx).
Anyway, if we move IMUL up, we can drop it altogether.
text data bss dec hex filename
177 0 0 177 b1 memset2.o
175 0 0 175 af memset3.o
Add the bench program. No need for everybody to reinvent it every time.
Comparison of memset before/after last three patches on Sandy Bridge CPU:
big blocks unaffected, small blocks:
Before After
50 byte block: 6.31 bytes/ns 6.56
49 byte block: 6.19 bytes/ns 6.43
48 byte block: 6.06 bytes/ns 6.30
47 byte block: 5.94 bytes/ns 6.17
46 byte block: 5.81 bytes/ns 6.04
45 byte block: 5.69 bytes/ns 5.91
44 byte block: 5.56 bytes/ns 5.78
43 byte block: 5.44 bytes/ns 5.65
42 byte block: 5.31 bytes/ns 5.52
41 byte block: 5.19 bytes/ns 5.39
40 byte block: 5.06 bytes/ns 5.26
39 byte block: 4.94 bytes/ns 5.13
38 byte block: 4.81 bytes/ns 5.00
37 byte block: 4.69 bytes/ns 4.87
36 byte block: 4.56 bytes/ns 4.74
35 byte block: 4.44 bytes/ns 4.61
34 byte block: 4.31 bytes/ns 4.48
33 byte block: 4.19 bytes/ns 4.35
32 byte block: 4.06 bytes/ns 4.22
31 byte block: 3.93 bytes/ns 4.09
30 byte block: 5.71 bytes/ns 6.23
29 byte block: 5.52 bytes/ns 6.03
28 byte block: 5.33 bytes/ns 5.82
27 byte block: 5.15 bytes/ns 5.62
26 byte block: 4.95 bytes/ns 5.41
25 byte block: 4.77 bytes/ns 5.21
24 byte block: 4.58 bytes/ns 5.00
23 byte block: 4.39 bytes/ns 4.79
22 byte block: 4.20 bytes/ns 4.59
21 byte block: 4.01 bytes/ns 4.38
20 byte block: 3.82 bytes/ns 4.17
19 byte block: 3.63 bytes/ns 3.97
18 byte block: 3.44 bytes/ns 3.76
17 byte block: 3.25 bytes/ns 3.55
16 byte block: 3.06 bytes/ns 3.34
15 byte block: 2.87 bytes/ns 3.14
14 byte block: 3.61 bytes/ns 4.05
13 byte block: 3.35 bytes/ns 3.76
12 byte block: 3.09 bytes/ns 3.47
11 byte block: 2.84 bytes/ns 3.19
10 byte block: 2.58 bytes/ns 2.90
9 byte block: 2.33 bytes/ns 2.61
8 byte block: 2.07 bytes/ns 2.32
7 byte block: 1.81 bytes/ns 2.03
6 byte block: 1.84 bytes/ns 1.84
5 byte block: 1.54 bytes/ns 1.54
4 byte block: 1.23 bytes/ns 1.23
3 byte block: 0.92 bytes/ns 0.92
2 byte block: 0.68 bytes/ns 0.62
1 byte block: 0.34 bytes/ns 0.31
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
CC: Rich Felker <dalias@libc.org>
CC: musl@lists.openwall.com
---
src/string/i386/memset.s | 4 +-
src/string/i386/memset_bench.c | 92 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 3 deletions(-)
create mode 100644 src/string/i386/memset_bench.c
diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
index cd13f41..622a1f0 100644
--- a/src/string/i386/memset.s
+++ b/src/string/i386/memset.s
@@ -10,16 +10,14 @@ memset:
test %ecx,%ecx
jz 1f
- mov %dl,%dh
-
mov %dl,(%eax)
mov %dl,-1(%eax,%ecx)
+ imul $0x1010101,%edx
cmp $2,%ecx
jbe 1f
mov %dx,1(%eax)
mov %dx,(-1-2)(%eax,%ecx)
- imul $0x10001,%edx
cmp $6,%ecx
jbe 1f
diff --git a/src/string/i386/memset_bench.c b/src/string/i386/memset_bench.c
new file mode 100644
index 0000000..2072bbe
--- /dev/null
+++ b/src/string/i386/memset_bench.c
@@ -0,0 +1,92 @@
+// Build a-la
+// gcc -m32 -static -O2 -Wall memset_bench.c memset.s
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#define FILL 0
+
+/* libc has incredibly messy way of doing this,
+ * typically requiring -lrt. We just skip all this mess */
+#ifndef CLOCK_MONOTONIC
+#define CLOCK_MONOTONIC 1
+#endif
+static void get_mono(struct timespec *ts)
+{
+ syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
+}
+
+unsigned gett()
+{
+#if 0
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ return tv.tv_usec;
+#else
+ struct timespec ts;
+ get_mono(&ts);
+ return ts.tv_nsec;
+#endif
+}
+
+unsigned difft(unsigned t2, unsigned t1)
+{
+ t2 -= t1;
+ if ((int)t2 < 0)
+ t2 += 1000000000;
+ return t2;
+}
+
+void measure(unsigned sz, void *buf, void* (*m)(void *ptr, int c, size_t cnt))
+{
+ unsigned t1, t2, cnt;
+ unsigned repeat = 1;
+
+ /* For small sizes, call m() many times before measuring time diff */
+ repeat = ((256*1024) / (sz|1)) ? : 1;
+
+ m(buf, FILL, sz); /* warm up caches */
+ m(buf, FILL, sz); /* warm up caches */
+
+ t2 = -1U;
+ cnt = 1000;
+ while (--cnt) {
+ unsigned rep = repeat;
+
+ t1 = gett();
+ do {
+ m(buf, FILL, sz);
+ } while (--rep);
+ t1 = difft(gett(), t1);
+ if (t2 > t1)
+ t2 = t1;
+ }
+ printf("%u byte block: %.2f bytes/ns\n", sz, (double)(sz) * repeat / t2);
+}
+
+int main(int argc, char **argv)
+{
+ int sz;
+ char *buf;
+
+ sz = argv[1] ? atoi(argv[1]) : 1024;
+ buf = malloc(sz + 4096);
+ buf += 0x100;
+ buf = (char*)((long)buf & ~0xffL);
+
+ setlinebuf(stdout);
+ printf("size:%u (%uk) buf:%p\n", sz, sz/1024, buf);
+
+ do {
+ measure(sz, buf, memset);
+ } while (--sz > 0);
+
+ return 0;
+}
--
1.8.1.4
next prev parent reply other threads:[~2015-10-12 18:30 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-10-12 18:30 [PATCH 1/3] i386/memset: argument load code need not be separate Denys Vlasenko
2015-10-12 18:30 ` [PATCH 2/3] i386/memset: do not fetch fill char from memory again Denys Vlasenko
2015-11-05 2:54 ` Rich Felker
2015-10-12 18:30 ` Denys Vlasenko [this message]
2015-10-14 23:20 ` [PATCH 1/3] i386/memset: argument load code need not be separate Rich Felker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1444674635-25421-3-git-send-email-vda.linux@googlemail.com \
--to=vda.linux@googlemail.com \
--cc=dalias@libc.org \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).