* [PATCH 2/3] i386/memset: do not fetch fill char from memory again
2015-10-12 18:30 [PATCH 1/3] i386/memset: argument load code need not be separate Denys Vlasenko
@ 2015-10-12 18:30 ` Denys Vlasenko
2015-11-05 2:54 ` Rich Felker
2015-10-12 18:30 ` [PATCH 3/3] i386/memset: move byte-extending IMUL up, drop one insn Denys Vlasenko
2015-10-14 23:20 ` [PATCH 1/3] i386/memset: argument load code need not be separate Rich Felker
2 siblings, 1 reply; 5+ messages in thread
From: Denys Vlasenko @ 2015-10-12 18:30 UTC (permalink / raw)
To: Rich Felker; +Cc: Denys Vlasenko, musl
shl $16,%edx
mov 8(%esp),%dl
mov 8(%esp),%dh
The above code has two register merge stalls, and it goes to load unit
to fetch the data. I don't know what's worse. Both are not pleasant.
Replace them with IMUL. It has ~3 cycle latency, but no stalls.
Move it a bit up to hide its latency.
text data bss dec hex filename
182 0 0 182 b6 memset1.o
177 0 0 177 b1 memset2.o
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
CC: Rich Felker <dalias@libc.org>
CC: musl@lists.openwall.com
---
src/string/i386/memset.s | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
index d6118c7..cd13f41 100644
--- a/src/string/i386/memset.s
+++ b/src/string/i386/memset.s
@@ -19,13 +19,10 @@ memset:
mov %dx,1(%eax)
mov %dx,(-1-2)(%eax,%ecx)
+ imul $0x10001,%edx
cmp $6,%ecx
jbe 1f
- shl $16,%edx
- mov 8(%esp),%dl
- mov 8(%esp),%dh
-
mov %edx,(1+2)(%eax)
mov %edx,(-1-2-4)(%eax,%ecx)
cmp $14,%ecx
--
1.8.1.4
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 3/3] i386/memset: move byte-extending IMUL up, drop one insn
2015-10-12 18:30 [PATCH 1/3] i386/memset: argument load code need not be separate Denys Vlasenko
2015-10-12 18:30 ` [PATCH 2/3] i386/memset: do not fetch fill char from memory again Denys Vlasenko
@ 2015-10-12 18:30 ` Denys Vlasenko
2015-10-14 23:20 ` [PATCH 1/3] i386/memset: argument load code need not be separate Rich Felker
2 siblings, 0 replies; 5+ messages in thread
From: Denys Vlasenko @ 2015-10-12 18:30 UTC (permalink / raw)
To: Rich Felker; +Cc: Denys Vlasenko, musl
"mov %dl,%dh" may have a register stall (even though rumor has it
newest intel chips do double accounting for low byte regs,
on such CPUs subsequent "mov %dl,(%eax)" won't wait for nex value
of %edx).
Anyway, if we move IMUL up, we can drop it altogether.
text data bss dec hex filename
177 0 0 177 b1 memset2.o
175 0 0 175 af memset3.o
Add the bench program. No need for everybody to reinvent it every time.
Comparison of memset before/after last three patches on Sandy Bridge CPU:
big blocks unaffected, small blocks:
Before After
50 byte block: 6.31 bytes/ns 6.56
49 byte block: 6.19 bytes/ns 6.43
48 byte block: 6.06 bytes/ns 6.30
47 byte block: 5.94 bytes/ns 6.17
46 byte block: 5.81 bytes/ns 6.04
45 byte block: 5.69 bytes/ns 5.91
44 byte block: 5.56 bytes/ns 5.78
43 byte block: 5.44 bytes/ns 5.65
42 byte block: 5.31 bytes/ns 5.52
41 byte block: 5.19 bytes/ns 5.39
40 byte block: 5.06 bytes/ns 5.26
39 byte block: 4.94 bytes/ns 5.13
38 byte block: 4.81 bytes/ns 5.00
37 byte block: 4.69 bytes/ns 4.87
36 byte block: 4.56 bytes/ns 4.74
35 byte block: 4.44 bytes/ns 4.61
34 byte block: 4.31 bytes/ns 4.48
33 byte block: 4.19 bytes/ns 4.35
32 byte block: 4.06 bytes/ns 4.22
31 byte block: 3.93 bytes/ns 4.09
30 byte block: 5.71 bytes/ns 6.23
29 byte block: 5.52 bytes/ns 6.03
28 byte block: 5.33 bytes/ns 5.82
27 byte block: 5.15 bytes/ns 5.62
26 byte block: 4.95 bytes/ns 5.41
25 byte block: 4.77 bytes/ns 5.21
24 byte block: 4.58 bytes/ns 5.00
23 byte block: 4.39 bytes/ns 4.79
22 byte block: 4.20 bytes/ns 4.59
21 byte block: 4.01 bytes/ns 4.38
20 byte block: 3.82 bytes/ns 4.17
19 byte block: 3.63 bytes/ns 3.97
18 byte block: 3.44 bytes/ns 3.76
17 byte block: 3.25 bytes/ns 3.55
16 byte block: 3.06 bytes/ns 3.34
15 byte block: 2.87 bytes/ns 3.14
14 byte block: 3.61 bytes/ns 4.05
13 byte block: 3.35 bytes/ns 3.76
12 byte block: 3.09 bytes/ns 3.47
11 byte block: 2.84 bytes/ns 3.19
10 byte block: 2.58 bytes/ns 2.90
9 byte block: 2.33 bytes/ns 2.61
8 byte block: 2.07 bytes/ns 2.32
7 byte block: 1.81 bytes/ns 2.03
6 byte block: 1.84 bytes/ns 1.84
5 byte block: 1.54 bytes/ns 1.54
4 byte block: 1.23 bytes/ns 1.23
3 byte block: 0.92 bytes/ns 0.92
2 byte block: 0.68 bytes/ns 0.62
1 byte block: 0.34 bytes/ns 0.31
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
CC: Rich Felker <dalias@libc.org>
CC: musl@lists.openwall.com
---
src/string/i386/memset.s | 4 +-
src/string/i386/memset_bench.c | 92 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 3 deletions(-)
create mode 100644 src/string/i386/memset_bench.c
diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
index cd13f41..622a1f0 100644
--- a/src/string/i386/memset.s
+++ b/src/string/i386/memset.s
@@ -10,16 +10,14 @@ memset:
test %ecx,%ecx
jz 1f
- mov %dl,%dh
-
mov %dl,(%eax)
mov %dl,-1(%eax,%ecx)
+ imul $0x1010101,%edx
cmp $2,%ecx
jbe 1f
mov %dx,1(%eax)
mov %dx,(-1-2)(%eax,%ecx)
- imul $0x10001,%edx
cmp $6,%ecx
jbe 1f
diff --git a/src/string/i386/memset_bench.c b/src/string/i386/memset_bench.c
new file mode 100644
index 0000000..2072bbe
--- /dev/null
+++ b/src/string/i386/memset_bench.c
@@ -0,0 +1,92 @@
+// Build a-la
+// gcc -m32 -static -O2 -Wall memset_bench.c memset.s
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#define FILL 0
+
+/* libc has incredibly messy way of doing this,
+ * typically requiring -lrt. We just skip all this mess */
+#ifndef CLOCK_MONOTONIC
+#define CLOCK_MONOTONIC 1
+#endif
+static void get_mono(struct timespec *ts)
+{
+ syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
+}
+
+unsigned gett()
+{
+#if 0
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ return tv.tv_usec;
+#else
+ struct timespec ts;
+ get_mono(&ts);
+ return ts.tv_nsec;
+#endif
+}
+
+unsigned difft(unsigned t2, unsigned t1)
+{
+ t2 -= t1;
+ if ((int)t2 < 0)
+ t2 += 1000000000;
+ return t2;
+}
+
+void measure(unsigned sz, void *buf, void* (*m)(void *ptr, int c, size_t cnt))
+{
+ unsigned t1, t2, cnt;
+ unsigned repeat = 1;
+
+ /* For small sizes, call m() many times before measuring time diff */
+ repeat = ((256*1024) / (sz|1)) ? : 1;
+
+ m(buf, FILL, sz); /* warm up caches */
+ m(buf, FILL, sz); /* warm up caches */
+
+ t2 = -1U;
+ cnt = 1000;
+ while (--cnt) {
+ unsigned rep = repeat;
+
+ t1 = gett();
+ do {
+ m(buf, FILL, sz);
+ } while (--rep);
+ t1 = difft(gett(), t1);
+ if (t2 > t1)
+ t2 = t1;
+ }
+ printf("%u byte block: %.2f bytes/ns\n", sz, (double)(sz) * repeat / t2);
+}
+
+int main(int argc, char **argv)
+{
+ int sz;
+ char *buf;
+
+ sz = argv[1] ? atoi(argv[1]) : 1024;
+ buf = malloc(sz + 4096);
+ buf += 0x100;
+ buf = (char*)((long)buf & ~0xffL);
+
+ setlinebuf(stdout);
+ printf("size:%u (%uk) buf:%p\n", sz, sz/1024, buf);
+
+ do {
+ measure(sz, buf, memset);
+ } while (--sz > 0);
+
+ return 0;
+}
--
1.8.1.4
^ permalink raw reply [flat|nested] 5+ messages in thread