mailing list of musl libc
From: zhangfei <zhang_fei_0403@163.com>
To: dalias@libc.org, musl@lists.openwall.com
Cc: zhangfei <zhangfei@nj.iscas.ac.cn>
Subject: [musl] [PATCH 2/3] RISC-V: Optimize memcpy
Date: Wed,  7 Jun 2023 18:07:09 +0800	[thread overview]
Message-ID: <20230607100710.4286-3-zhang_fei_0403@163.com> (raw)
In-Reply-To: <20230607100710.4286-1-zhang_fei_0403@163.com>

From: zhangfei <zhangfei@nj.iscas.ac.cn>

This code is based on linux/arch/riscv/lib/memcpy.S, with the macro
definitions removed so that it builds for RISCV64.
The original kernel implementation falls back to a byte-wise copy when src
and dst are not co-aligned, which is not efficient enough. The patch linked
below has therefore been used to modify that path:

https://lore.kernel.org/all/20210216225555.4976-1-gary@garyguo.net/ 

The patch above optimizes memcpy for the misaligned case: when src and dst
are not co-aligned, it loads two adjacent words from src and uses shifts to
assemble a full machine word before storing it to dst.
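
For illustration only, a minimal C sketch of this shift-and-OR scheme,
assuming little-endian RV64 with 8-byte words; the function name
copy_misaligned_words() and its preconditions are hypothetical and not part
of the patch:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical sketch: dst is 8-byte aligned, src is not (off != 0),
     * n is a multiple of 8, and every aligned source word read below is
     * in bounds.  Strict-aliasing concerns are ignored for brevity. */
    static void copy_misaligned_words(unsigned char *dst,
                                      const unsigned char *src, size_t n)
    {
        size_t off = (uintptr_t)src % 8;       /* src misalignment, 1..7 */
        const uint64_t *s = (const uint64_t *)(src - off); /* align down */
        uint64_t *d = (uint64_t *)dst;
        unsigned shr = 8 * off;                /* bits taken from prev   */
        unsigned shl = 64 - shr;               /* bits taken from next   */
        uint64_t prev = *s++;                  /* word containing src[0] */

        for (size_t i = 0; i < n / 8; i++) {
            uint64_t next = *s++;
            /* Low bytes come from the previous word, high bytes from the
             * next one, mirroring the srl/sll/or sequence in the assembly. */
            d[i] = (prev >> shr) | (next << shl);
            prev = next;
        }
    }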

Signed-off-by: Zhang Fei <zhangfei@nj.iscas.ac.cn>
---
 src/string/riscv64/memcpy.S | 159 ++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 src/string/riscv64/memcpy.S
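
As a reviewer aid (commentary below the "---" marker, not applied as part of
the commit), the control flow of the assembly corresponds roughly to the C
outline below; all names are illustrative, and the misaligned path is only
stubbed here since it is sketched above:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void *memcpy_outline(void *restrict dest, const void *restrict src,
                                size_t n)
    {
        unsigned char *d = dest;
        const unsigned char *s = src;
        unsigned char *end = d + n;

        if (n >= 16) {                   /* small copies go straight to tail */
            while ((uintptr_t)d % 8)     /* byte-copy until dest is aligned  */
                *d++ = *s++;

            if ((uintptr_t)s % 8 == 0) {
                /* Unrolled loop: 16 words (128 bytes) per iteration. */
                while ((size_t)(end - d) >= 16 * 8) {
                    memcpy(d, s, 16 * 8);     /* stands in for 16 ld/sd pairs */
                    d += 16 * 8; s += 16 * 8;
                }
                /* One word at a time for the remaining full words. */
                while ((size_t)(end - d) >= 8) {
                    memcpy(d, s, 8);          /* one ld/sd pair */
                    d += 8; s += 8;
                }
            }
            /* else: the assembly takes .Lmisaligned_word_copy and assembles
             * each destination word from two adjacent source words with
             * shifts; this outline simply falls through to the byte tail. */
        }
        while (d != end)                 /* byte-copy whatever is left */
            *d++ = *s++;
        return dest;
    }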

diff --git a/src/string/riscv64/memcpy.S b/src/string/riscv64/memcpy.S
new file mode 100644
index 0000000..ee59924
--- /dev/null
+++ b/src/string/riscv64/memcpy.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+#define REG_L ld
+
+.global memcpy
+.type memcpy,@function
+memcpy:
+        /* Save for return value */
+        mv      t6, a0
+
+        /*
+         * Register allocation for code below:
+         * a0 - start of uncopied dst
+         * a1 - start of uncopied src
+         * t0 - end of uncopied dst
+         */
+        add     t0, a0, a2
+
+        /*
+         * Use bytewise copy if too small.
+         *
+         * This threshold must be at least 2*SZREG to ensure at least one
+         * wordwise copy is performed. It is chosen to be 16 because it will
+         * save at least 7 iterations of bytewise copy, which pays off the
+         * fixed overhead.
+         */
+        li      a3, 16
+        bltu    a2, a3, .Lbyte_copy_tail
+
+        /*
+         * Bytewise copy first to align a0 to word boundary.
+         */
+        addi    a2, a0, SZREG-1
+        andi    a2, a2, ~(SZREG-1)
+        beq     a0, a2, 2f
+1:
+        lb      a5, 0(a1)
+        addi    a1, a1, 1
+        sb      a5, 0(a0)
+        addi    a0, a0, 1
+        bne     a0, a2, 1b
+2:
+
+        /*
+         * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+         * aligned word-wise copy. Otherwise we need to perform misaligned
+         * word-wise copy.
+         */
+        andi    a3, a1, SZREG-1
+        bnez    a3, .Lmisaligned_word_copy
+
+        /* Unrolled wordwise copy */
+        addi    t0, t0, -(16*SZREG-1)
+        bgeu    a0, t0, 2f
+1:
+        REG_L   a2,        0(a1)
+        REG_L   a3,    SZREG(a1)
+        REG_L   a4,  2*SZREG(a1)
+        REG_L   a5,  3*SZREG(a1)
+        REG_L   a6,  4*SZREG(a1)
+        REG_L   a7,  5*SZREG(a1)
+        REG_L   t1,  6*SZREG(a1)
+        REG_L   t2,  7*SZREG(a1)
+        REG_L   t3,  8*SZREG(a1)
+        REG_L   t4,  9*SZREG(a1)
+        REG_L   t5, 10*SZREG(a1)
+        REG_S   a2,        0(a0)
+        REG_S   a3,    SZREG(a0)
+        REG_S   a4,  2*SZREG(a0)
+        REG_S   a5,  3*SZREG(a0)
+        REG_S   a6,  4*SZREG(a0)
+        REG_S   a7,  5*SZREG(a0)
+        REG_S   t1,  6*SZREG(a0)
+        REG_S   t2,  7*SZREG(a0)
+        REG_S   t3,  8*SZREG(a0)
+        REG_S   t4,  9*SZREG(a0)
+        REG_S   t5, 10*SZREG(a0)
+        REG_L   a2, 11*SZREG(a1)
+        REG_L   a3, 12*SZREG(a1)
+        REG_L   a4, 13*SZREG(a1)
+        REG_L   a5, 14*SZREG(a1)
+        REG_L   a6, 15*SZREG(a1)
+        addi    a1, a1, 16*SZREG
+        REG_S   a2, 11*SZREG(a0)
+        REG_S   a3, 12*SZREG(a0)
+        REG_S   a4, 13*SZREG(a0)
+        REG_S   a5, 14*SZREG(a0)
+        REG_S   a6, 15*SZREG(a0)
+        addi    a0, a0, 16*SZREG
+        bltu    a0, t0, 1b
+2:
+        /* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+        addi    t0, t0, 15*SZREG
+
+        /* Wordwise copy */
+        bgeu    a0, t0, 2f
+1:
+        REG_L   a5, 0(a1)
+        addi    a1, a1, SZREG
+        REG_S   a5, 0(a0)
+        addi    a0, a0, SZREG
+        bltu    a0, t0, 1b
+2:
+        addi    t0, t0, SZREG-1
+
+.Lbyte_copy_tail:
+        /*
+         * Bytewise copy anything left.
+         */
+        beq     a0, t0, 2f
+1:
+        lb      a5, 0(a1)
+        addi    a1, a1, 1
+        sb      a5, 0(a0)
+        addi    a0, a0, 1
+        bne     a0, t0, 1b
+2:
+
+        mv      a0, t6
+        ret
+
+.Lmisaligned_word_copy:
+        /*
+         * Misaligned word-wise copy.
+         * For misaligned copy we still perform word-wise copy, but we need to
+         * use the value fetched from the previous iteration and do some shifts.
+         * This is safe because we wouldn't access more words than necessary.
+         */
+
+        /* Calculate shifts */
+        slli    t3, a3, 3
+        sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+        /* Load the initial value and align a1 */
+        andi    a1, a1, ~(SZREG-1)
+        REG_L   a5, 0(a1)
+
+        addi    t0, t0, -(SZREG-1)
+        /* At least one iteration will be executed here, no check */
+1:
+        srl     a4, a5, t3
+        REG_L   a5, SZREG(a1)
+        addi    a1, a1, SZREG
+        sll     a2, a5, t4
+        or      a2, a2, a4
+        REG_S   a2, 0(a0)
+        addi    a0, a0, SZREG
+        bltu    a0, t0, 1b
+
+        /* Update pointers to correct value */
+        addi    t0, t0, SZREG-1
+        add     a1, a1, a3
+
+        j       .Lbyte_copy_tail
-- 
2.34.1



Thread overview: 5+ messages
2023-06-07 10:07 [musl] [PATCH 0/3] RISC-V: Optimize memset, memcpy and memmove zhangfei
2023-06-07 10:07 ` [musl] [PATCH 1/3] RISC-V: Optimize memset zhangfei
2023-06-07 12:57   ` Rich Felker
2023-06-07 10:07 ` zhangfei [this message]
2023-06-07 10:07 ` [musl] [PATCH 3/3] RISC-V: Optimize memmove zhangfei
