From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/7028 Path: news.gmane.org!not-for-mail From: Denys Vlasenko Newsgroups: gmane.linux.lib.musl.general Subject: [PATCH] x86_64/memset: use "small block" code for blocks up to 30 bytes long Date: Fri, 13 Feb 2015 17:39:49 +0100 Message-ID: <1423845589-5920-1-git-send-email-vda.linux@googlemail.com> Reply-To: musl@lists.openwall.com NNTP-Posting-Host: plane.gmane.org X-Trace: ger.gmane.org 1423845616 16033 80.91.229.3 (13 Feb 2015 16:40:16 GMT) X-Complaints-To: usenet@ger.gmane.org NNTP-Posting-Date: Fri, 13 Feb 2015 16:40:16 +0000 (UTC) Cc: Denys Vlasenko To: musl@lists.openwall.com, Rich Felker Original-X-From: musl-return-7041-gllmg-musl=m.gmane.org@lists.openwall.com Fri Feb 13 17:40:15 2015 Return-path: Envelope-to: gllmg-musl@m.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by plane.gmane.org with smtp (Exim 4.69) (envelope-from ) id 1YMJHp-0007vJ-Am for gllmg-musl@m.gmane.org; Fri, 13 Feb 2015 17:40:13 +0100 Original-Received: (qmail 28275 invoked by uid 550); 13 Feb 2015 16:40:12 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: Original-Received: (qmail 28251 invoked from network); 13 Feb 2015 16:40:11 -0000 X-Scanned-By: MIMEDefang 2.68 on 10.5.11.27 Xref: news.gmane.org gmane.linux.lib.musl.general:7028 Archived-At: Before this change, the "small block" code was used only for blocks of 15 bytes and smaller. Measurements on a Sandy Bridge CPU show that the "rep stosq" setup time is high enough to dominate the speed of fills well above that size: 31 byte block: 3.279282 bytes/ns 30 byte block: 3.173499 bytes/ns .. 20 byte block: 2.116552 bytes/ns .. 
16 byte block: 1.799337 bytes/ns 15 byte block: 5.074332 bytes/ns 14 byte block: 4.736135 bytes/ns 13 byte block: 4.398852 bytes/ns 12 byte block: 4.060479 bytes/ns 11 byte block: 3.723065 bytes/ns 10 byte block: 3.384556 bytes/ns 9 byte block: 2.867677 bytes/ns 8 byte block: 2.257382 bytes/ns 7 byte block: 1.975605 bytes/ns 6 byte block: 1.693388 bytes/ns 5 byte block: 1.411434 bytes/ns 4 byte block: 1.129147 bytes/ns 3 byte block: 0.847030 bytes/ns 2 byte block: 0.616008 bytes/ns 1 byte block: 0.308069 bytes/ns The patch does not increase the number of branches, but is able to handle blocks up to 30 bytes. After the patch, timings are: 32 byte block: 3.384681 bytes/ns 31 byte block: 3.279118 bytes/ns 30 byte block: 10.128968 bytes/ns 29 byte block: 9.793798 bytes/ns 28 byte block: 9.456081 bytes/ns 27 byte block: 9.120555 bytes/ns 26 byte block: 8.782757 bytes/ns 25 byte block: 8.446654 bytes/ns 24 byte block: 8.109310 bytes/ns 23 byte block: 7.773063 bytes/ns 22 byte block: 7.434663 bytes/ns 21 byte block: 7.098760 bytes/ns 20 byte block: 6.760724 bytes/ns 19 byte block: 6.424286 bytes/ns 18 byte block: 6.086166 bytes/ns 17 byte block: 5.749441 bytes/ns 16 byte block: 5.411120 bytes/ns 15 byte block: 5.074234 bytes/ns 14 byte block: 3.947913 bytes/ns 13 byte block: 3.666643 bytes/ns 12 byte block: 3.384641 bytes/ns 11 byte block: 3.103178 bytes/ns 10 byte block: 2.821105 bytes/ns 9 byte block: 2.539481 bytes/ns 8 byte block: 2.257338 bytes/ns 7 byte block: 1.975530 bytes/ns 6 byte block: 1.693337 bytes/ns 5 byte block: 1.411388 bytes/ns 4 byte block: 1.129111 bytes/ns 3 byte block: 0.846994 bytes/ns 2 byte block: 0.615982 bytes/ns 1 byte block: 0.308056 bytes/ns Signed-off-by: Denys Vlasenko --- src/string/x86_64/memset.s | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s index ea61687..81adbb2 100644 --- a/src/string/x86_64/memset.s +++ 
b/src/string/x86_64/memset.s @@ -2,13 +2,13 @@ .type memset,@function memset: movzbq %sil,%rax - cmp $16,%rdx - jb .Less_than_16 - test %esi,%esi jnz .L_widen_rax # unlikely .L_widened: + cmp $31,%rdx + jb .Less_than_31 + mov %rdi,%r8 test $7,%dil @@ -43,7 +43,7 @@ memset: jmp .L_aligned -.Less_than_16: +.Less_than_31: test %edx,%edx jz .L_ret @@ -52,20 +52,18 @@ memset: cmp $2,%edx jbe .L_ret - mov %al,1(%rdi) - mov %al,-2(%rdi,%rdx) - # 32-bit imul has 3-4 cycles latency - imul $0x1010101,%eax - cmp $4,%edx + mov %ax,1(%rdi) + mov %ax,(-1-2)(%rdi,%rdx) + cmp $6,%edx jbe .L_ret - mov %eax,(%rdi) - mov %eax,-4(%rdi,%rdx) - cmp $8,%edx + mov %eax,(1+2)(%rdi) + mov %eax,(-1-2-4)(%rdi,%rdx) + cmp $14,%edx jbe .L_ret - mov %eax,4(%rdi) - mov %eax,-8(%rdi,%rdx) + mov %rax,(1+2+4)(%rdi) + mov %rax,(-1-2-4-8)(%rdi,%rdx) .L_ret: mov %rdi,%rax ret -- 1.8.1.4