* [musl] [PATCH v2 0/1] riscv64: add optimized string functions
@ 2025-11-19 5:40 Pincheng Wang
2025-11-19 5:40 ` [musl] [PATCH v2 1/1] riscv64: add optimized memset, memcpy and memmove Pincheng Wang
0 siblings, 1 reply; 2+ messages in thread
From: Pincheng Wang @ 2025-11-19 5:40 UTC (permalink / raw)
To: musl; +Cc: pincheng.plct
Hi all,
This patch series supersedes my previous RVV-optimized memset patch
series, now extending support to memmove and memcpy as well. It provides
optimized versions of these string functions for RISC-V with vector
extension, dispatching at runtime based on CPU capability (RVV support).
Changes from v1:
- Change function pointers' attribute from hidden to static.
- Change the conditional-compilation guard from __riscv to __riscv &&
__riscv_xlen==64.
Implementation details:
- mem{set,cpy,move}.c are renamed internally to
__mem{set,cpy,move}_scalar via macro, preserving the generic C
implementation as the scalar fallback.
- mem{set,cpy,move}_vector.S provide the optimized
__mem{set,cpy,move}_vect symbols.
- string_dispatch.c exports the public mem{set,cpy,move}() symbols,
which dispatch via function pointers.
- __init_riscv_string_optimizations is called in __libc_start_main to
initialize function pointers based on AT_HWCAP.
- The vector implementation uses m8 register grouping for bulk fills.
Performance:
Function Size Improvement
memset 16B 0.06%
memset 64B 49.22%
memset 256B 127.81%
memset 1KB 58.12%
memset 4KB 47.95%
memset 64KB 2.56%
memcpy 16B 0.02%
memcpy 64B 35.94%
memcpy 256B 205.10%
memcpy 1KB 126.01%
memcpy 4KB 107.71%
memcpy 64KB 36.15%
memmove_bwd 16B -0.67%
memmove_bwd 64B 47.03%
memmove_bwd 256B 207.32%
memmove_bwd 1KB 125.33%
memmove_bwd 4KB 106.72%
memmove_bwd 64KB 41.46%
Benchmarks are conducted on a Spacemit X60 CPU.
Functional behavior matches generic functions.
Thanks,
Pincheng Wang
Pincheng Wang (1):
riscv64: add optimized memset, memcpy and memmove
src/env/__libc_start_main.c | 3 ++
src/internal/libc.h | 3 ++
src/string/riscv64/memcpy.c | 4 +++
src/string/riscv64/memcpy_vector.S | 28 +++++++++++++++
src/string/riscv64/memmove.c | 4 +++
src/string/riscv64/memmove_vector.S | 52 ++++++++++++++++++++++++++++
src/string/riscv64/memset.c | 4 +++
src/string/riscv64/memset_vector.S | 29 ++++++++++++++++
src/string/riscv64/string_dispatch.c | 52 ++++++++++++++++++++++++++++
9 files changed, 179 insertions(+)
create mode 100644 src/string/riscv64/memcpy.c
create mode 100644 src/string/riscv64/memcpy_vector.S
create mode 100644 src/string/riscv64/memmove.c
create mode 100644 src/string/riscv64/memmove_vector.S
create mode 100644 src/string/riscv64/memset.c
create mode 100644 src/string/riscv64/memset_vector.S
create mode 100644 src/string/riscv64/string_dispatch.c
--
2.39.5
^ permalink raw reply [flat|nested] 2+ messages in thread
* [musl] [PATCH v2 1/1] riscv64: add optimized memset, memcpy and memmove
2025-11-19 5:40 [musl] [PATCH v2 0/1] riscv64: add optimized string functions Pincheng Wang
@ 2025-11-19 5:40 ` Pincheng Wang
0 siblings, 0 replies; 2+ messages in thread
From: Pincheng Wang @ 2025-11-19 5:40 UTC (permalink / raw)
To: musl; +Cc: pincheng.plct
Add RISC-V vector extension optimized memset, memcpy and memmove
implementations with runtime CPU capability detection via AT_HWCAP.
The implementations provide both vector and scalar variants in a single
binary. At process startup, __init_riscv_string_optimizations() queries
AT_HWCAP to detect RVV support and selects the appropriate
implementations via function pointer dispatch. This allows the same libc
to run correctly on both vector-capable and non-vector RISC-V CPUs.
The vector implementation uses m8 register grouping and processes data
in vector-length chunks, providing significant performance improvements
on RVV-capable hardware while maintaining compatibility with non-vector
RISC-V systems.
Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
src/env/__libc_start_main.c | 3 ++
src/internal/libc.h | 3 ++
src/string/riscv64/memcpy.c | 4 +++
src/string/riscv64/memcpy_vector.S | 28 +++++++++++++++
src/string/riscv64/memmove.c | 4 +++
src/string/riscv64/memmove_vector.S | 52 ++++++++++++++++++++++++++++
src/string/riscv64/memset.c | 4 +++
src/string/riscv64/memset_vector.S | 29 ++++++++++++++++
src/string/riscv64/string_dispatch.c | 52 ++++++++++++++++++++++++++++
9 files changed, 179 insertions(+)
create mode 100644 src/string/riscv64/memcpy.c
create mode 100644 src/string/riscv64/memcpy_vector.S
create mode 100644 src/string/riscv64/memmove.c
create mode 100644 src/string/riscv64/memmove_vector.S
create mode 100644 src/string/riscv64/memset.c
create mode 100644 src/string/riscv64/memset_vector.S
create mode 100644 src/string/riscv64/string_dispatch.c
diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index c5b277bd..db6b3b7c 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
__init_tls(aux);
__init_ssp((void *)aux[AT_RANDOM]);
+#if defined (__riscv) && __riscv_xlen == 64
+ __init_riscv_string_optimizations();
+#endif
if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
&& !aux[AT_SECURE]) return;
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 619bba86..45c99f12 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -40,6 +40,9 @@ extern hidden struct __libc __libc;
hidden void __init_libc(char **, char *);
hidden void __init_tls(size_t *);
hidden void __init_ssp(void *);
+#if defined (__riscv) && __riscv_xlen == 64
+hidden void __init_riscv_string_optimizations(void);
+#endif
hidden void __libc_start_init(void);
hidden void __funcs_on_exit(void);
hidden void __funcs_on_quick_exit(void);
diff --git a/src/string/riscv64/memcpy.c b/src/string/riscv64/memcpy.c
new file mode 100644
index 00000000..01892e69
--- /dev/null
+++ b/src/string/riscv64/memcpy.c
@@ -0,0 +1,4 @@
+/* Rename the generic memcpy to __memcpy_scalar and include it */
+#define memcpy __memcpy_scalar
+#include "../memcpy.c"
+#undef memcpy
diff --git a/src/string/riscv64/memcpy_vector.S b/src/string/riscv64/memcpy_vector.S
new file mode 100644
index 00000000..1131c52b
--- /dev/null
+++ b/src/string/riscv64/memcpy_vector.S
@@ -0,0 +1,28 @@
+ .text
+ .global __memcpy_vect
+ .type __memcpy_vect,%function
+ .option push
+ .option arch, +v
+/* void *__memcpy_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0.
+ */
+__memcpy_vect:
+ mv t0, a0 /* running dst */
+ mv t1, a1 /* running src */
+ beqz a2, .Ldone_copy /* n == 0 then return */
+
+.Lbulk_copy:
+ vsetvli t2, a2, e8, m8, ta, ma /* t2 = vl (bytes) */
+ vle8.v v0, (t1)
+ vse8.v v0, (t0)
+ add t0, t0, t2
+ add t1, t1, t2
+ sub a2, a2, t2
+ bnez a2, .Lbulk_copy
+ /* fallthrough */
+
+.Ldone_copy:
+ ret
+.size __memcpy_vect, .-__memcpy_vect
+.option pop
diff --git a/src/string/riscv64/memmove.c b/src/string/riscv64/memmove.c
new file mode 100644
index 00000000..915d6ba9
--- /dev/null
+++ b/src/string/riscv64/memmove.c
@@ -0,0 +1,4 @@
+/* Rename the generic memmove to __memmove_scalar and include it */
+#define memmove __memmove_scalar
+#include "../memmove.c"
+#undef memmove
diff --git a/src/string/riscv64/memmove_vector.S b/src/string/riscv64/memmove_vector.S
new file mode 100644
index 00000000..77877e69
--- /dev/null
+++ b/src/string/riscv64/memmove_vector.S
@@ -0,0 +1,52 @@
+ .text
+ .global __memmove_vect
+ .type __memmove_vect,%function
+ .option push
+ .option arch, +v
+/* void *__memmove_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0.
+ */
+__memmove_vect:
+ beqz a2, .Ldone_move /* n == 0 */
+ beq a0, a1, .Ldone_move /* dst == src */
+
+ /* overlap check */
+ bgeu a1, a0, .Lforward_move /* src >= dst then forward move*/
+
+ sub t2, a0, a1 /* t2 = dst - src */
+ bgeu t2, a2, .Lforward_move /* no overlap then forward move */
+
+ /* backward move */
+ add t0, a0, a2 /* running dst_end */
+ add t1, a1, a2 /* running src_end */
+
+.Lbackward_loop:
+ vsetvli t3, a2, e8, m8, ta, ma /* t3 = vl (bytes) */
+ sub t0, t0, t3
+ sub t1, t1, t3
+ vle8.v v0, (t1)
+ vse8.v v0, (t0)
+ sub a2, a2, t3
+ bnez a2, .Lbackward_loop
+ j .Ldone_move
+
+ /* forward move, same as __memcpy_vect */
+.Lforward_move:
+ mv t0, a0 /* running dst */
+ mv t1, a1 /* running src */
+
+.Lforward_loop:
+ vsetvli t3, a2, e8, m8, ta, ma
+ vle8.v v0, (t1)
+ vse8.v v0, (t0)
+ add t0, t0, t3
+ add t1, t1, t3
+ sub a2, a2, t3
+ bnez a2, .Lforward_loop
+ /* fallthrough */
+
+.Ldone_move:
+ ret
+.size __memmove_vect, .-__memmove_vect
+.option pop
diff --git a/src/string/riscv64/memset.c b/src/string/riscv64/memset.c
new file mode 100644
index 00000000..11fa3032
--- /dev/null
+++ b/src/string/riscv64/memset.c
@@ -0,0 +1,4 @@
+/* Rename the generic memset to __memset_scalar and include it */
+#define memset __memset_scalar
+#include "../memset.c"
+#undef memset
diff --git a/src/string/riscv64/memset_vector.S b/src/string/riscv64/memset_vector.S
new file mode 100644
index 00000000..8f939ecd
--- /dev/null
+++ b/src/string/riscv64/memset_vector.S
@@ -0,0 +1,29 @@
+ .text
+ .global __memset_vect
+ .type __memset_vect,%function
+ .option push
+ .option arch, +v
+/* void *__memset_vect(void *s, int c, size_t n)
+ * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
+ * Returns a0.
+ */
+__memset_vect:
+ mv t0, a0 /* running dst; keep a0 as return */
+ beqz a2, .Ldone_vect /* n == 0 then return */
+
+ /* Broadcast fill byte once. */
+ vsetvli t1, zero, e8, m8, ta, ma
+ vmv.v.x v0, a1
+
+.Lbulk_vect:
+ vsetvli t1, a2, e8, m8, ta, ma /* t1 = vl (bytes) */
+ vse8.v v0, (t0)
+ add t0, t0, t1
+ sub a2, a2, t1
+ bnez a2, .Lbulk_vect
+ /* fallthrough */
+
+.Ldone_vect:
+ ret
+.size __memset_vect, .-__memset_vect
+.option pop
diff --git a/src/string/riscv64/string_dispatch.c b/src/string/riscv64/string_dispatch.c
new file mode 100644
index 00000000..cd0d1fbc
--- /dev/null
+++ b/src/string/riscv64/string_dispatch.c
@@ -0,0 +1,52 @@
+#include <stddef.h>
+#include <sys/auxv.h>
+#include "libc.h"
+
+void *__memset_scalar(void *s, int c, size_t n);
+void *__memset_vect(void *s, int c, size_t n);
+void *__memcpy_scalar(void *restrict dest, const void *restrict src, size_t n);
+void *__memcpy_vect(void *restrict dest, const void *restrict src, size_t n);
+void *__memmove_scalar(void *restrict dest, const void *restrict src, size_t n);
+void *__memmove_vect(void *restrict dest, const void *restrict src, size_t n);
+
+/* string function pointer, runtime-dispatched based on RVV support */
+#ifndef __riscv_vector
+static void *(*__memset_ptr)(void *, int, size_t) = __memset_scalar;
+static void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_scalar;
+static void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_scalar;
+#else
+static void *(*__memset_ptr)(void *, int, size_t) = __memset_vect;
+static void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_vect;
+static void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_vect;
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+ return __memset_ptr(s, c, n);
+}
+
+void *memcpy(void *restrict dest, const void *restrict src, size_t n)
+{
+ return __memcpy_ptr(dest, src, n);
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+ return __memmove_ptr(dest, src, n);
+}
+
+static inline int __has_rvv_via_hwcap(void)
+{
+ const unsigned long V_bit = (1ul << ('V' - 'A'));
+ unsigned long hwcap = __getauxval(AT_HWCAP);
+ return (hwcap & V_bit) != 0;
+}
+
+hidden void __init_riscv_string_optimizations(void)
+{
+ if (__has_rvv_via_hwcap()) {
+ __memset_ptr = __memset_vect;
+ __memcpy_ptr = __memcpy_vect;
+ __memmove_ptr = __memmove_vect;
+ }
+}
--
2.39.5
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-11-19 5:41 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-19 5:40 [musl] [PATCH v2 0/1] riscv64: add optimized string functions Pincheng Wang
2025-11-19 5:40 ` [musl] [PATCH v2 1/1] riscv64: add optimized memset, memcpy and memmove Pincheng Wang
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).