From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/7008 Path: news.gmane.org!not-for-mail From: Denys Vlasenko Newsgroups: gmane.linux.lib.musl.general Subject: [PATCH 1/2] x86_64/memset: avoid multiply insn if possible Date: Thu, 12 Feb 2015 18:17:02 +0100 Message-ID: <1423761423-30050-1-git-send-email-vda.linux@googlemail.com> Reply-To: musl@lists.openwall.com NNTP-Posting-Host: plane.gmane.org X-Trace: ger.gmane.org 1423761452 17587 80.91.229.3 (12 Feb 2015 17:17:32 GMT) X-Complaints-To: usenet@ger.gmane.org NNTP-Posting-Date: Thu, 12 Feb 2015 17:17:32 +0000 (UTC) Cc: Denys Vlasenko To: musl@lists.openwall.com, Rich Felker Original-X-From: musl-return-7021-gllmg-musl=m.gmane.org@lists.openwall.com Thu Feb 12 18:17:32 2015 Return-path: Envelope-to: gllmg-musl@m.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by plane.gmane.org with smtp (Exim 4.69) (envelope-from ) id 1YLxOL-0006eP-6H for gllmg-musl@m.gmane.org; Thu, 12 Feb 2015 18:17:29 +0100 Original-Received: (qmail 14005 invoked by uid 550); 12 Feb 2015 17:17:27 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: Original-Received: (qmail 13984 invoked from network); 12 Feb 2015 17:17:27 -0000 X-Scanned-By: MIMEDefang 2.68 on 10.5.11.24 Xref: news.gmane.org gmane.linux.lib.musl.general:7008 Archived-At: memset is very, very often called with fill=0, and 64-bit imul is expensive on many CPUs. Avoid it if fill=0. Also avoid the multiply on the "short memset" codepath if possible, and when we do need it, use the 32-bit one, which is cheaper on many CPUs. 
Signed-off-by: Denys Vlasenko --- src/string/x86_64/memset.s | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s index 3cc8fcf..523caa0 100644 --- a/src/string/x86_64/memset.s +++ b/src/string/x86_64/memset.s @@ -1,13 +1,12 @@ .global memset .type memset,@function memset: - movzbl %sil,%esi - mov $0x101010101010101,%rax - # 64-bit imul has 3-7 cycles latency, launch early - imul %rsi,%rax - + movzbq %sil,%rax cmp $16,%rdx - jb 1f + jb .Less_than_16 + test %esi,%esi + jnz .L_widen_rax # unlikely +.L_widened: lea -1(%rdx),%rcx mov %rdi,%r8 @@ -18,26 +17,35 @@ memset: mov %r8,%rax ret -1: test %edx,%edx - jz 1f +.L_widen_rax: + # 64-bit imul has 3-7 cycles latency + mov $0x101010101010101,%rsi + imul %rsi,%rax + jmp .L_widened + +.Less_than_16: + test %edx,%edx + jz .L_ret mov %al,(%rdi) mov %al,-1(%rdi,%rdx) cmp $2,%edx - jbe 1f + jbe .L_ret mov %al,1(%rdi) mov %al,-2(%rdi,%rdx) + # 32-bit imul has 3-4 cycles latency + imul $0x1010101,%eax cmp $4,%edx - jbe 1f + jbe .L_ret mov %eax,(%rdi) mov %eax,-4(%rdi,%rdx) cmp $8,%edx - jbe 1f + jbe .L_ret mov %eax,4(%rdi) mov %eax,-8(%rdi,%rdx) - -1: mov %rdi,%rax +.L_ret: + mov %rdi,%rax ret -- 1.8.1.4