[musl] [PATCH 1/3] RISC-V: Optimize memset

mailing list of musl libc
 help / color / mirror / code / Atom feed

From: zhangfei <zhang_fei_0403@163.com>
To: dalias@libc.org, musl@lists.openwall.com
Cc: zhangfei <zhangfei@nj.iscas.ac.cn>
Subject: [musl] [PATCH 1/3] RISC-V: Optimize memset
Date: Wed,  7 Jun 2023 18:07:08 +0800	[thread overview]
Message-ID: <20230607100710.4286-2-zhang_fei_0403@163.com> (raw)
In-Reply-To: <20230607100710.4286-1-zhang_fei_0403@163.com>

From: zhangfei <zhangfei@nj.iscas.ac.cn>

This code is based on linux/arch/riscv/lib/memset.S, with the macro definitions removed and
the code modified to support RISCV64.
When the fill length is less than 16 bytes, and for the bytes left over after the word-store
loop, byte stores are used. For these we follow musl/src/string/memset.c and fill the head
and tail with minimal branching.

Signed-off-by: Zhang Fei <zhangfei@nj.iscas.ac.cn>
---
 src/string/riscv64/memset.S | 136 ++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 src/string/riscv64/memset.S

diff --git a/src/string/riscv64/memset.S b/src/string/riscv64/memset.S
new file mode 100644
index 0000000..f8663d7
--- /dev/null
+++ b/src/string/riscv64/memset.S
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+
+/*
+ * void *memset(void *dest, int c, size_t n)
+ * In:  a0 = dest, a1 = c (low 8 bits used), a2 = n.  Out: a0 = dest.
+ * Uses t0 (write cursor) and a1-a5 as scratch; no stack, no calls.
+ * n < 16 goes straight to the byte-store tail at 4:. Otherwise: byte-fill
+ * to an SZREG boundary, store whole words through the unrolled table at
+ * 3:, then let the tail at 4: write the last n % SZREG bytes.
+ */
+.global memset
+.type memset,@function
+memset:
+	mv	t0, a0			/* t0 = cursor; a0 stays as return value */
+
+	/* Defer to byte-oriented fill for small sizes */
+	sltiu	a3, a2, 16
+	bnez	a3, 4f
+
+	/* a3 = t0 rounded up to the next SZREG-aligned address */
+	addi	a3, t0, SZREG-1
+	andi	a3, a3, ~(SZREG-1)
+	beq	a3, t0, 2f		/* Skip if already aligned */
+	sub	a4, a3, t0		/* a4 = head bytes, 1..SZREG-1 */
+1:	/* Handle initial misalignment one byte at a time */
+	sb	a1, 0(t0)
+	addi	t0, t0, 1
+	bltu	t0, a3, 1b
+	sub	a2, a2, a4		/* Update count */
+
+2:	/* Replicate the fill byte into all 8 bytes of a1 */
+	andi	a1, a1, 0xff
+	slli	a3, a1, 8
+	or	a1, a3, a1
+	slli	a3, a1, 16
+	or	a1, a3, a1
+	slli	a3, a1, 32
+	or	a1, a3, a1
+
+	/* a4 = bytes to store as whole words, a3 = end of that region */
+	andi	a4, a2, ~(SZREG-1)
+	add	a3, t0, a4
+
+	andi	a4, a4, 31*SZREG	/* Bytes in a partial 32-word pass */
+	beqz	a4, 3f			/* None: only full passes */
+	neg	a4, a4
+	addi	a4, a4, 32*SZREG	/* a4 = bytes to skip in first pass */
+	sub	t0, t0, a4		/* Bias cursor so offsets still line up */
+
+	/* Jump into the store table. Each entry is one 4-byte instruction
+	 * that writes SZREG (8) bytes, so code offset = a4 / 2. lla is always
+	 * PC-relative, so a PIC build does not need a GOT load here. */
+	lla	a5, 3f
+	srli	a4, a4, 1
+	add	a5, a5, a4
+	jr	a5
+
+	.option push
+	.option norvc			/* Table entries must stay 4 bytes each */
+3:
+	REG_S	a1,        0(t0)
+	REG_S	a1,    SZREG(t0)
+	REG_S	a1,  2*SZREG(t0)
+	REG_S	a1,  3*SZREG(t0)
+	REG_S	a1,  4*SZREG(t0)
+	REG_S	a1,  5*SZREG(t0)
+	REG_S	a1,  6*SZREG(t0)
+	REG_S	a1,  7*SZREG(t0)
+	REG_S	a1,  8*SZREG(t0)
+	REG_S	a1,  9*SZREG(t0)
+	REG_S	a1, 10*SZREG(t0)
+	REG_S	a1, 11*SZREG(t0)
+	REG_S	a1, 12*SZREG(t0)
+	REG_S	a1, 13*SZREG(t0)
+	REG_S	a1, 14*SZREG(t0)
+	REG_S	a1, 15*SZREG(t0)
+	REG_S	a1, 16*SZREG(t0)
+	REG_S	a1, 17*SZREG(t0)
+	REG_S	a1, 18*SZREG(t0)
+	REG_S	a1, 19*SZREG(t0)
+	REG_S	a1, 20*SZREG(t0)
+	REG_S	a1, 21*SZREG(t0)
+	REG_S	a1, 22*SZREG(t0)
+	REG_S	a1, 23*SZREG(t0)
+	REG_S	a1, 24*SZREG(t0)
+	REG_S	a1, 25*SZREG(t0)
+	REG_S	a1, 26*SZREG(t0)
+	REG_S	a1, 27*SZREG(t0)
+	REG_S	a1, 28*SZREG(t0)
+	REG_S	a1, 29*SZREG(t0)
+	REG_S	a1, 30*SZREG(t0)
+	REG_S	a1, 31*SZREG(t0)
+	.option pop
+	addi	t0, t0, 32*SZREG
+	bltu	t0, a3, 3b
+	andi	a2, a2, SZREG-1		/* Update count */
+
+4:	/* Tail: a2 = bytes left (0..15), t0 = first byte to write */
+	beqz	a2, 6f
+	add	a3, t0, a2		/* a3 = one past the last byte */
+	/* Fill head and tail with minimal branching. Each conditional
+	 * ensures that all the subsequently used offsets are
+	 * well-defined and in the dest region. */
+	sb	a1, 0(t0)
+	sb	a1, -1(a3)
+	li	a4, 2
+	bgeu	a4, a2, 6f		/* Done when a2 <= 2 */
+	sb	a1, 1(t0)
+	sb	a1, 2(t0)
+	sb	a1, -2(a3)
+	sb	a1, -3(a3)
+	li	a4, 6
+	bgeu	a4, a2, 6f		/* Done when a2 <= 6 */
+	sb	a1, 3(t0)
+	sb	a1, -4(a3)
+	li	a4, 8
+	bgeu	a4, a2, 6f		/* Done when a2 <= 8 */
+	sb	a1, 4(t0)
+	sb	a1, 5(t0)
+	sb	a1, -5(a3)
+	li	a4, 11
+	bgeu	a4, a2, 6f		/* Done when a2 <= 11 */
+	sb	a1, 6(t0)
+	sb	a1, -6(a3)
+	sb	a1, -7(a3)
+	li	a4, 14
+	bgeu	a4, a2, 6f		/* Done when a2 <= 14 */
+	sb	a1, 7(t0)		/* Only a2 == 15 still needs byte 7 */
+6:
+	ret
-- 
2.34.1

next prev parent reply	other threads:[~2023-06-07 10:08 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-07 10:07 [musl] [PATCH 0/3] RISC-V: Optimize memset, memcpy and memmove zhangfei
2023-06-07 10:07 ` zhangfei [this message]
2023-06-07 12:57   ` [musl] [PATCH 1/3] RISC-V: Optimize memset Rich Felker
2023-06-07 10:07 ` [musl] [PATCH 2/3] RISC-V: Optimize memcpy zhangfei
2023-06-07 10:07 ` [musl] [PATCH 3/3] RISC-V: Optimize memmove zhangfei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230607100710.4286-2-zhang_fei_0403@163.com \
    --to=zhang_fei_0403@163.com \
    --cc=dalias@libc.org \
    --cc=musl@lists.openwall.com \
    --cc=zhangfei@nj.iscas.ac.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).