* [musl] [PATCH v3 0/1] riscv64: Add RVV optimized memset implementation
@ 2025-10-30 12:09 Pincheng Wang
2025-10-30 12:09 ` [musl] [PATCH v3 1/1] riscv64: add runtime-detected vector optimized memset Pincheng Wang
0 siblings, 1 reply; 2+ messages in thread
From: Pincheng Wang @ 2025-10-30 12:09 UTC (permalink / raw)
To: musl; +Cc: pincheng.plct
Hi all,
This is v3 of the RISC-V Vector (RVV) optimized memset patch.
Changes from v2:
- Drop global vector disable in arch.mak; explicitly enable 'V'
extension via `.option arch` in memset_vector.S.
- Use __getauxval instead of getauxval and rename memset_{scalar,vect}
to __memset_{scalar,vect} to avoid symbol conflicts.
- Remove the small-size fast path; now use vector code uniformly for all
sizes to avoid branch mispredictions and simplify control flow.
Changes from v1:
- Replaced compile-time detection (__riscv_vector macro) with runtime
detection using AT_HWCAP, addressing the main concern from v1
feedback.
- Introduced a dispatch mechanism (memset_dispatch.c) that selects
appropriate implementation at process startup.
Implementation details:
- memset.c is renamed internally to __memset_scalar via macro,
preserving the generic C implementation scalar fallback.
- memset_vector.S provides the optimized __memset_vect symbol.
- memset_dispatch.c exports the public memset() symbol, which dispatches
via function pointer.
- __init_riscv_string_optimizations is called in __libc_start_main to
initialize the function pointer based on AT_HWCAP.
- The vector implementation uses vsetvli for bulk fills.
Performance:
- On Spacemit X60: up to ~3.36x faster (256B), with consistent gains
across medium and large sizes.
For very small sizes (<8 bytes), there can be minor regressions compared
to the generic C version. This is a trade-off for the significant gains on
larger sizes.
Testing:
- QEMU with QEMU_CPU="rv64,v=true" and "rv64,v=false".
- Spacemit X60 with V extension support.
- CFLAGS="-march=rv64gc" and "-march=rv64gcv".
Functional behavior matches generic memset.
Thanks,
Pincheng Wang
Pincheng Wang (1):
riscv64: add runtime-detected vector optimized memset
src/env/__libc_start_main.c | 3 +++
src/internal/libc.h | 3 +++
src/string/riscv64/memset.c | 4 ++++
src/string/riscv64/memset_dispatch.c | 36 ++++++++++++++++++++++++++++
src/string/riscv64/memset_vector.S | 28 ++++++++++++++++++++++
5 files changed, 74 insertions(+)
create mode 100644 src/string/riscv64/memset.c
create mode 100644 src/string/riscv64/memset_dispatch.c
create mode 100644 src/string/riscv64/memset_vector.S
--
2.39.5
^ permalink raw reply [flat|nested] 2+ messages in thread
* [musl] [PATCH v3 1/1] riscv64: add runtime-detected vector optimized memset
2025-10-30 12:09 [musl] [PATCH v3 0/1] riscv64: Add RVV optimized memset implementation Pincheng Wang
@ 2025-10-30 12:09 ` Pincheng Wang
0 siblings, 0 replies; 2+ messages in thread
From: Pincheng Wang @ 2025-10-30 12:09 UTC (permalink / raw)
To: musl; +Cc: pincheng.plct
Add a RISC-V vector extension optimized memset implementation with
runtime CPU capability detection via AT_HWCAP.
The implementation provides both vector and scalar variants in a single
binary. At process startup, __init_riscv_string_optimizations() queries
AT_HWCAP to detect RVV support and selects the appropriate
implementation via function pointer dispatch. This allows the same libc
to run correctly on both vector-capable and non-vector RISC-V CPUs.
The vector implementation uses m8 register grouping and processes data
in vector-length chunks, providing significant performance improvements
on RVV-capable hardware while maintaining compatibility with non-vector
RISC-V systems.
Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
src/env/__libc_start_main.c | 3 +++
src/internal/libc.h | 3 +++
src/string/riscv64/memset.c | 4 ++++
src/string/riscv64/memset_dispatch.c | 36 ++++++++++++++++++++++++++++
src/string/riscv64/memset_vector.S | 28 ++++++++++++++++++++++
5 files changed, 74 insertions(+)
create mode 100644 src/string/riscv64/memset.c
create mode 100644 src/string/riscv64/memset_dispatch.c
create mode 100644 src/string/riscv64/memset_vector.S
diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index c5b277bd..c23e63f4 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
__init_tls(aux);
__init_ssp((void *)aux[AT_RANDOM]);
+#ifdef __riscv
+ __init_riscv_string_optimizations();
+#endif
if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
&& !aux[AT_SECURE]) return;
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 619bba86..28860b06 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -40,6 +40,9 @@ extern hidden struct __libc __libc;
hidden void __init_libc(char **, char *);
hidden void __init_tls(size_t *);
hidden void __init_ssp(void *);
+#ifdef __riscv
+hidden void __init_riscv_string_optimizations(void);
+#endif
hidden void __libc_start_init(void);
hidden void __funcs_on_exit(void);
hidden void __funcs_on_quick_exit(void);
diff --git a/src/string/riscv64/memset.c b/src/string/riscv64/memset.c
new file mode 100644
index 00000000..11fa3032
--- /dev/null
+++ b/src/string/riscv64/memset.c
@@ -0,0 +1,4 @@
+/* Rename the generic memset to __memset_scalar and include it */
+#define memset __memset_scalar
+#include "../memset.c"
+#undef memset
diff --git a/src/string/riscv64/memset_dispatch.c b/src/string/riscv64/memset_dispatch.c
new file mode 100644
index 00000000..8e36d0f9
--- /dev/null
+++ b/src/string/riscv64/memset_dispatch.c
@@ -0,0 +1,36 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/auxv.h>
+#include "libc.h"
+
+void *__memset_scalar(void *s, int c, size_t n);
+void *__memset_vect(void *s, int c, size_t n);
+
+/* memset function pointer, runtime-dispatched based on RVV support */
+__attribute__((visibility("hidden")))
+#ifndef __riscv_vector
+void *(*__memset_ptr)(void *, int, size_t) = __memset_scalar;
+#else
+void *(*__memset_ptr)(void *, int, size_t) = __memset_vect;
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+ return __memset_ptr(s, c, n);
+}
+
+static __inline int __has_rvv_via_hwcap(void)
+{
+ const unsigned long V_bit = (1ul << ('V' - 'A'));
+ unsigned long hwcap = __getauxval(AT_HWCAP);
+ return (hwcap & V_bit) != 0;
+}
+
+__attribute__((visibility("hidden")))
+void __init_riscv_string_optimizations(void)
+{
+ if (__has_rvv_via_hwcap())
+ __memset_ptr = __memset_vect;
+ else
+ __memset_ptr = __memset_scalar;
+}
diff --git a/src/string/riscv64/memset_vector.S b/src/string/riscv64/memset_vector.S
new file mode 100644
index 00000000..513645d3
--- /dev/null
+++ b/src/string/riscv64/memset_vector.S
@@ -0,0 +1,28 @@
+ .text
+ .global __memset_vect
+ .option push
+ .option arch, +v
+/* void *__memset_vect(void *s, int c, size_t n)
+ * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
+ * Returns a0.
+ */
+__memset_vect:
+ mv t0, a0 /* running dst; keep a0 as return */
+ beqz a2, .Ldone_vect /* n == 0 then return */
+
+ /* Broadcast fill byte once. */
+ vsetvli t1, zero, e8, m8, ta, ma
+ vmv.v.x v0, a1
+
+.Lbulk_vect:
+ vsetvli t1, a2, e8, m8, ta, ma /* t1 = vl (bytes) */
+ vse8.v v0, (t0)
+ add t0, t0, t1
+ sub a2, a2, t1
+ bnez a2, .Lbulk_vect
+ /* fallthrough */
+
+.Ldone_vect:
+ ret
+.size __memset_vect, .-__memset_vect
+.option pop
--
2.39.5
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-10-30 12:10 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-30 12:09 [musl] [PATCH v3 0/1] riscv64: Add RVV optimized memset implementation Pincheng Wang
2025-10-30 12:09 ` [musl] [PATCH v3 1/1] riscv64: add runtime-detected vector optimized memset Pincheng Wang
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).