From: zhangfei
To: dalias@libc.org, musl@lists.openwall.com
Cc: zhangfei
Date: Wed, 7 Jun 2023 18:07:09 +0800
Message-Id: <20230607100710.4286-3-zhang_fei_0403@163.com>
In-Reply-To: <20230607100710.4286-1-zhang_fei_0403@163.com>
References: <20230607100710.4286-1-zhang_fei_0403@163.com>
Subject: [musl] [PATCH 2/3] RISC-V: Optimize memcpy

From: zhangfei

This code is based on linux/arch/riscv/lib/memcpy.S. The kernel's macro
indirection has been dropped and the register-width constants are defined
locally for RISCV64.

The original implementation in the kernel falls back to byte-wise copy
whenever src and dst are not co-aligned, which is not efficient enough.
Therefore, the patch linked below has been applied to that section:

https://lore.kernel.org/all/20210216225555.4976-1-gary@garyguo.net/

That patch optimizes memcpy for the misaligned case: when src and dst are
not co-aligned, it loads two adjacent aligned words from src and uses
shifts to assemble a full machine word before storing it. (A short C
sketch of this idea follows the patch, for illustration only.)

Signed-off-by: Zhang Fei
---
 src/string/riscv64/memcpy.S | 159 ++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 src/string/riscv64/memcpy.S

diff --git a/src/string/riscv64/memcpy.S b/src/string/riscv64/memcpy.S
new file mode 100644
index 0000000..ee59924
--- /dev/null
+++ b/src/string/riscv64/memcpy.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+#define REG_L ld
+
+.global memcpy
+.type memcpy,@function
+memcpy:
+	/* Save for return value */
+	mv	t6, a0
+
+	/*
+	 * Register allocation for code below:
+	 * a0 - start of uncopied dst
+	 * a1 - start of uncopied src
+	 * t0 - end of uncopied dst
+	 */
+	add	t0, a0, a2
+
+	/*
+	 * Use bytewise copy if too small.
+	 *
+	 * This threshold must be at least 2*SZREG to ensure at least one
+	 * wordwise copy is performed. It is chosen to be 16 because it will
+	 * save at least 7 iterations of bytewise copy, which pays off the
+	 * fixed overhead.
+	 */
+	li	a3, 16
+	bltu	a2, a3, .Lbyte_copy_tail
+
+	/*
+	 * Bytewise copy first to align a0 to word boundary.
+	 */
+	addi	a2, a0, SZREG-1
+	andi	a2, a2, ~(SZREG-1)
+	beq	a0, a2, 2f
+1:
+	lb	a5, 0(a1)
+	addi	a1, a1, 1
+	sb	a5, 0(a0)
+	addi	a0, a0, 1
+	bne	a0, a2, 1b
+2:
+
+	/*
+	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+	 * aligned word-wise copy. Otherwise we need to perform misaligned
+	 * word-wise copy.
+	 */
+	andi	a3, a1, SZREG-1
+	bnez	a3, .Lmisaligned_word_copy
+
+	/* Unrolled wordwise copy */
+	addi	t0, t0, -(16*SZREG-1)
+	bgeu	a0, t0, 2f
+1:
+	REG_L	a2, 0(a1)
+	REG_L	a3, SZREG(a1)
+	REG_L	a4, 2*SZREG(a1)
+	REG_L	a5, 3*SZREG(a1)
+	REG_L	a6, 4*SZREG(a1)
+	REG_L	a7, 5*SZREG(a1)
+	REG_L	t1, 6*SZREG(a1)
+	REG_L	t2, 7*SZREG(a1)
+	REG_L	t3, 8*SZREG(a1)
+	REG_L	t4, 9*SZREG(a1)
+	REG_L	t5, 10*SZREG(a1)
+	REG_S	a2, 0(a0)
+	REG_S	a3, SZREG(a0)
+	REG_S	a4, 2*SZREG(a0)
+	REG_S	a5, 3*SZREG(a0)
+	REG_S	a6, 4*SZREG(a0)
+	REG_S	a7, 5*SZREG(a0)
+	REG_S	t1, 6*SZREG(a0)
+	REG_S	t2, 7*SZREG(a0)
+	REG_S	t3, 8*SZREG(a0)
+	REG_S	t4, 9*SZREG(a0)
+	REG_S	t5, 10*SZREG(a0)
+	REG_L	a2, 11*SZREG(a1)
+	REG_L	a3, 12*SZREG(a1)
+	REG_L	a4, 13*SZREG(a1)
+	REG_L	a5, 14*SZREG(a1)
+	REG_L	a6, 15*SZREG(a1)
+	addi	a1, a1, 16*SZREG
+	REG_S	a2, 11*SZREG(a0)
+	REG_S	a3, 12*SZREG(a0)
+	REG_S	a4, 13*SZREG(a0)
+	REG_S	a5, 14*SZREG(a0)
+	REG_S	a6, 15*SZREG(a0)
+	addi	a0, a0, 16*SZREG
+	bltu	a0, t0, 1b
+2:
+	/* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+	addi	t0, t0, 15*SZREG
+
+	/* Wordwise copy */
+	bgeu	a0, t0, 2f
+1:
+	REG_L	a5, 0(a1)
+	addi	a1, a1, SZREG
+	REG_S	a5, 0(a0)
+	addi	a0, a0, SZREG
+	bltu	a0, t0, 1b
+2:
+	addi	t0, t0, SZREG-1
+
+.Lbyte_copy_tail:
+	/*
+	 * Bytewise copy anything left.
+	 */
+	beq	a0, t0, 2f
+1:
+	lb	a5, 0(a1)
+	addi	a1, a1, 1
+	sb	a5, 0(a0)
+	addi	a0, a0, 1
+	bne	a0, t0, 1b
+2:
+
+	mv	a0, t6
+	ret
+
+.Lmisaligned_word_copy:
+	/*
+	 * Misaligned word-wise copy.
+	 * For misaligned copy we still perform word-wise copy, but we need to
+	 * use the value fetched from the previous iteration and do some shifts.
+	 * This is safe because we wouldn't access more words than necessary.
+	 */
+
+	/* Calculate shifts */
+	slli	t3, a3, 3
+	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+	/* Load the initial value and align a1 */
+	andi	a1, a1, ~(SZREG-1)
+	REG_L	a5, 0(a1)
+
+	addi	t0, t0, -(SZREG-1)
+	/* At least one iteration will be executed here, no check */
+1:
+	srl	a4, a5, t3
+	REG_L	a5, SZREG(a1)
+	addi	a1, a1, SZREG
+	sll	a2, a5, t4
+	or	a2, a2, a4
+	REG_S	a2, 0(a0)
+	addi	a0, a0, SZREG
+	bltu	a0, t0, 1b
+
+	/* Update pointers to correct value */
+	addi	t0, t0, SZREG-1
+	add	a1, a1, a3
+
+	j	.Lbyte_copy_tail
-- 
2.34.1
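
For readers less familiar with the shift-and-merge trick described in the
commit message, here is a minimal C sketch of the idea behind the
.Lmisaligned_word_copy loop. It is illustration only, not part of the
patch: the helper name copy_misaligned_words is made up, it assumes a
little-endian 64-bit target, it assumes src is actually misaligned (the
aligned case takes the word-wise fast path in the assembly), and it omits
the head/tail byte copies the assembly performs. Reading whole aligned
words like this is not strictly conforming C; it merely mirrors what the
registers do.

#include <stdint.h>
#include <stddef.h>

/*
 * Illustration only: copy nwords 8-byte words to an aligned dst from a
 * misaligned src by reading aligned words and merging adjacent pairs
 * with shifts, mirroring the .Lmisaligned_word_copy loop.
 * Precondition: ((uintptr_t)src & 7) != 0.
 */
static void copy_misaligned_words(uint64_t *dst, const unsigned char *src,
                                  size_t nwords)
{
	size_t off = (uintptr_t)src & 7;                   /* src misalignment, 1..7 */
	const uint64_t *s = (const uint64_t *)(src - off); /* round src down to a word */
	unsigned shr = 8 * off;                            /* bits taken from cur  (srl amount, t3) */
	unsigned shl = 64 - shr;                           /* bits taken from next (sll amount, t4) */
	uint64_t cur = *s++;                               /* aligned word holding src[0] */

	for (size_t i = 0; i < nwords; i++) {
		uint64_t next = *s++;                      /* next aligned word */
		dst[i] = (cur >> shr) | (next << shl);     /* srl/sll/or in the assembly */
		cur = next;                                /* reuse the last load next iteration */
	}
}

For example, with off = 3, shr is 24 and shl is 40: the low five bytes of
each output word come from cur and the top three from next, matching the
srl/sll/or sequence the assembly computes from t3 and t4 (the negated
shift works because sll only looks at the low 6 bits).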