From: Denys Vlasenko
Newsgroups: gmane.linux.lib.musl.general
Subject: [PATCH 2/2] x86_64/memset: align destination to 8 byte boundary
Date: Thu, 12 Feb 2015 18:17:03 +0100
Message-ID: <1423761423-30050-2-git-send-email-vda.linux@googlemail.com>
References: <1423761423-30050-1-git-send-email-vda.linux@googlemail.com>
Reply-To: musl@lists.openwall.com
Cc: Denys Vlasenko
To: musl@lists.openwall.com, Rich Felker
In-Reply-To: <1423761423-30050-1-git-send-email-vda.linux@googlemail.com>

8-byte alignment gives ~25% speedup on "rep stosq" memsets to L1 cache,
compared to intentionally misaligned ones.
It is a smaller win of ~15% on larger memsets to L2 too.
Measured on Intel Sandy Bridge CPU (i7-2620M, 2.70GHz)

Signed-off-by: Denys Vlasenko
---
 src/string/x86_64/memset.s | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index 523caa0..5c9e333 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -4,16 +4,23 @@ memset:
 	movzbq %sil,%rax
 	cmp $16,%rdx
 	jb .Less_than_16
+
 	test %esi,%esi
 	jnz .L_widen_rax # unlikely
 .L_widened:
 
-	lea -1(%rdx),%rcx
 	mov %rdi,%r8
+
+	test $7,%dil
+	jnz .L_align # unlikely
+.L_aligned:
+
+	lea -1(%rdx),%rcx
 	shr $3,%rcx
 	mov %rax,-8(%rdi,%rdx)
 	rep stosq
+
 	mov %r8,%rax
 	ret
 
 
@@ -23,6 +30,19 @@ memset:
 	imul %rsi,%rax
 	jmp .L_widened
 
+# 8-byte alignment gives ~25% speedup on "rep stosq" memsets
+# to L1 cache, compared to intentionally misaligned ones.
+# It is a smaller win of ~15% on larger memsets to L2 too.
+# Measured on Intel Sandy Bridge CPU (i7-2620M, 2.70GHz)
+.L_align:
+	mov %rax,(%rdi)
+1:	inc %rdi
+	dec %rdx
+	test $7,%dil
+	jnz 1b
+	jmp .L_aligned
+
+
 .Less_than_16:
 	test %edx,%edx
 	jz .L_ret
-- 
1.8.1.4
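
[Editor's note] For readers who do not read AT&T assembly fluently, the following is a rough C rendition of what the patched n >= 16 fast path does. The name memset_large_sketch is made up for illustration; this is a sketch of the control flow above, not the actual musl source.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the patched large-count path (n >= 16); illustration only. */
static void *memset_large_sketch(void *dest, int c, size_t n)
{
	unsigned char *d = dest;
	/* widen the fill byte into all 8 bytes of a word (the imul trick) */
	uint64_t v = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

	if ((uintptr_t)d & 7) {
		/* .L_align: store 8 bytes at the (possibly misaligned) start,
		   then bump d to the next 8-byte boundary; the at most 7
		   bytes skipped are already covered by this store */
		memcpy(d, &v, 8);
		do { d++; n--; } while ((uintptr_t)d & 7);
	}

	/* mov %rax,-8(%rdi,%rdx): unconditionally fill the last 8 bytes of
	   the original region (d + n is unchanged by the alignment loop),
	   covering whatever tail the qword loop below does not reach */
	memcpy(d + n - 8, &v, 8);

	/* rep stosq with %rcx = (n-1)/8: aligned 8-byte stores */
	for (size_t q = (n - 1) >> 3; q; q--, d += 8)
		memcpy(d, &v, 8);

	return dest;   /* asm keeps the original %rdi in %r8 for this */
}

The point of the patch is the "if ((uintptr_t)d & 7)" branch: once d is 8-byte aligned, the rep stosq loop runs noticeably faster on the measured CPU.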
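
[Editor's note] The patch does not include the benchmark behind the ~25%/~15% figures. Below is a minimal sketch of how such a comparison might be run; fill_rate, the buffer sizes, and the iteration count are invented for illustration, not the author's harness, and the program measures whatever memset the libc it is linked against provides, so it must be built against the patched musl to reproduce the comparison.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static volatile unsigned char sink;   /* discourage the compiler from dropping the stores */

/* time memset() into dst and report an approximate fill rate in GB/s */
static double fill_rate(unsigned char *dst, size_t len, long iters)
{
	struct timespec t0, t1;
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (long i = 0; i < iters; i++) {
		memset(dst, 0x55, len);
		sink = dst[len - 1];
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);
	double s = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
	return (double)len * iters / s / 1e9;
}

int main(void)
{
	size_t len = 16 * 1024;     /* roughly L1-sized working set */
	long iters = 200000;
	unsigned char *buf = aligned_alloc(64, len + 64);
	if (!buf) return 1;

	printf("aligned dst:    %.2f GB/s\n", fill_rate(buf, len, iters));
	printf("misaligned dst: %.2f GB/s\n", fill_rate(buf + 1, len, iters));
	free(buf);
	return 0;
}

A larger len (several MB) would exercise the L2/L3 case mentioned in the commit message; real measurements need more care (warm-up, pinning, multiple runs) than this sketch takes.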