From: zhangfei <zhang_fei_0403@163.com>
To: dalias@libc.org, musl@lists.openwall.com
Cc: zhangfei <zhangfei@nj.iscas.ac.cn>
Subject: [musl] [PATCH 2/3] RISC-V: Optimize memcpy
Date: Wed, 7 Jun 2023 18:07:09 +0800 [thread overview]
Message-ID: <20230607100710.4286-3-zhang_fei_0403@163.com> (raw)
In-Reply-To: <20230607100710.4286-1-zhang_fei_0403@163.com>
From: zhangfei <zhangfei@nj.iscas.ac.cn>
This code is based on linux/arch/riscv/lib/memcpy.S. Removed macro definition to support
RISCV64.
The original implementation in the kernel uses byte-wise copy if src and dst are not
co-aligned.This approach is not efficient enough.Therefore, the patch linked below has
been used to modify this section.
https://lore.kernel.org/all/20210216225555.4976-1-gary@garyguo.net/
The link above has been optimized memcpy for misaligned cases.If they are not co-aligned,
then it will load two adjacent words from src and use shifts to assemble a full machine
word.
Signed-off-by: Zhang Fei<zhangfei@nj.iscas.ac.cn>
---
src/string/riscv64/memcpy.S | 159 ++++++++++++++++++++++++++++++++++++
1 file changed, 159 insertions(+)
create mode 100644 src/string/riscv64/memcpy.S
diff --git a/src/string/riscv64/memcpy.S b/src/string/riscv64/memcpy.S
new file mode 100644
index 0000000..ee59924
--- /dev/null
+++ b/src/string/riscv64/memcpy.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+#define REG_L ld
+
+.global memcpy
+.type memcpy,@function
+memcpy:
+ /* Save for return value */
+ mv t6, a0
+
+ /*
+ * Register allocation for code below:
+ * a0 - start of uncopied dst
+ * a1 - start of uncopied src
+ * t0 - end of uncopied dst
+ */
+ add t0, a0, a2
+
+ /*
+ * Use bytewise copy if too small.
+ *
+ * This threshold must be at least 2*SZREG to ensure at least one
+ * wordwise copy is performed. It is chosen to be 16 because it will
+ * save at least 7 iterations of bytewise copy, which pays off the
+ * fixed overhead.
+ */
+ li a3, 16
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /*
+ * Bytewise copy first to align a0 to word boundary.
+ */
+ addi a2, a0, SZREG-1
+ andi a2, a2, ~(SZREG-1)
+ beq a0, a2, 2f
+1:
+ lb a5, 0(a1)
+ addi a1, a1, 1
+ sb a5, 0(a0)
+ addi a0, a0, 1
+ bne a0, a2, 1b
+2:
+
+ /*
+ * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+ * aligned word-wise copy. Otherwise we need to perform misaligned
+ * word-wise copy.
+ */
+ andi a3, a1, SZREG-1
+ bnez a3, .Lmisaligned_word_copy
+
+ /* Unrolled wordwise copy */
+ addi t0, t0, -(16*SZREG-1)
+ bgeu a0, t0, 2f
+1:
+ REG_L a2, 0(a1)
+ REG_L a3, SZREG(a1)
+ REG_L a4, 2*SZREG(a1)
+ REG_L a5, 3*SZREG(a1)
+ REG_L a6, 4*SZREG(a1)
+ REG_L a7, 5*SZREG(a1)
+ REG_L t1, 6*SZREG(a1)
+ REG_L t2, 7*SZREG(a1)
+ REG_L t3, 8*SZREG(a1)
+ REG_L t4, 9*SZREG(a1)
+ REG_L t5, 10*SZREG(a1)
+ REG_S a2, 0(a0)
+ REG_S a3, SZREG(a0)
+ REG_S a4, 2*SZREG(a0)
+ REG_S a5, 3*SZREG(a0)
+ REG_S a6, 4*SZREG(a0)
+ REG_S a7, 5*SZREG(a0)
+ REG_S t1, 6*SZREG(a0)
+ REG_S t2, 7*SZREG(a0)
+ REG_S t3, 8*SZREG(a0)
+ REG_S t4, 9*SZREG(a0)
+ REG_S t5, 10*SZREG(a0)
+ REG_L a2, 11*SZREG(a1)
+ REG_L a3, 12*SZREG(a1)
+ REG_L a4, 13*SZREG(a1)
+ REG_L a5, 14*SZREG(a1)
+ REG_L a6, 15*SZREG(a1)
+ addi a1, a1, 16*SZREG
+ REG_S a2, 11*SZREG(a0)
+ REG_S a3, 12*SZREG(a0)
+ REG_S a4, 13*SZREG(a0)
+ REG_S a5, 14*SZREG(a0)
+ REG_S a6, 15*SZREG(a0)
+ addi a0, a0, 16*SZREG
+ bltu a0, t0, 1b
+2:
+ /* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+ addi t0, t0, 15*SZREG
+
+ /* Wordwise copy */
+ bgeu a0, t0, 2f
+1:
+ REG_L a5, 0(a1)
+ addi a1, a1, SZREG
+ REG_S a5, 0(a0)
+ addi a0, a0, SZREG
+ bltu a0, t0, 1b
+2:
+ addi t0, t0, SZREG-1
+
+.Lbyte_copy_tail:
+ /*
+ * Bytewise copy anything left.
+ */
+ beq a0, t0, 2f
+1:
+ lb a5, 0(a1)
+ addi a1, a1, 1
+ sb a5, 0(a0)
+ addi a0, a0, 1
+ bne a0, t0, 1b
+2:
+
+ mv a0, t6
+ ret
+
+.Lmisaligned_word_copy:
+ /*
+ * Misaligned word-wise copy.
+ * For misaligned copy we still perform word-wise copy, but we need to
+ * use the value fetched from the previous iteration and do some shifts.
+ * This is safe because we wouldn't access more words than necessary.
+ */
+
+ /* Calculate shifts */
+ slli t3, a3, 3
+ sub t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+ /* Load the initial value and align a1 */
+ andi a1, a1, ~(SZREG-1)
+ REG_L a5, 0(a1)
+
+ addi t0, t0, -(SZREG-1)
+ /* At least one iteration will be executed here, no check */
+1:
+ srl a4, a5, t3
+ REG_L a5, SZREG(a1)
+ addi a1, a1, SZREG
+ sll a2, a5, t4
+ or a2, a2, a4
+ REG_S a2, 0(a0)
+ addi a0, a0, SZREG
+ bltu a0, t0, 1b
+
+ /* Update pointers to correct value */
+ addi t0, t0, SZREG-1
+ add a1, a1, a3
+
+ j .Lbyte_copy_tail
--
2.34.1
next prev parent reply other threads:[~2023-06-07 10:08 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-07 10:07 [musl] [PATCH 0/3] RISC-V: Optimize memset, memcpy and memmove zhangfei
2023-06-07 10:07 ` [musl] [PATCH 1/3] RISC-V: Optimize memset zhangfei
2023-06-07 12:57 ` Rich Felker
2023-06-07 10:07 ` zhangfei [this message]
2023-06-07 10:07 ` [musl] [PATCH 3/3] RISC-V: Optimize memmove zhangfei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230607100710.4286-3-zhang_fei_0403@163.com \
--to=zhang_fei_0403@163.com \
--cc=dalias@libc.org \
--cc=musl@lists.openwall.com \
--cc=zhangfei@nj.iscas.ac.cn \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).