From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=-3.3 required=5.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,RCVD_IN_MSPIKE_H5, RCVD_IN_MSPIKE_WL,RCVD_IN_ZEN_BLOCKED_OPENDNS autolearn=ham autolearn_force=no version=3.4.4 Received: from second.openwall.net (second.openwall.net [193.110.157.125]) by inbox.vuxu.org (Postfix) with SMTP id 550B134C4F for ; Thu, 13 Nov 2025 17:15:52 +0100 (CET) Received: (qmail 15874 invoked by uid 550); 13 Nov 2025 16:15:35 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Reply-To: musl@lists.openwall.com x-ms-reactions: disallow Received: (qmail 15599 invoked from network); 13 Nov 2025 16:15:34 -0000 From: Pincheng Wang To: musl@lists.openwall.com Cc: pincheng.plct@isrc.iscas.ac.cn Date: Fri, 14 Nov 2025 00:15:18 +0800 Message-Id: <20251113161518.57357-2-pincheng.plct@isrc.iscas.ac.cn> X-Mailer: git-send-email 2.39.5 In-Reply-To: <20251113161518.57357-1-pincheng.plct@isrc.iscas.ac.cn> References: <20251113161518.57357-1-pincheng.plct@isrc.iscas.ac.cn> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-CM-TRANSID:rQCowAAXqdsXBBZpSE6iAA--.1891S3 X-Coremail-Antispam: 1UD129KBjvJXoW3uF15uFy3KF1UJFWkGFWUCFg_yoWDWw47pa 1UAw13Kr4fAw1xWF4fW3Wjvan8G395uF15W3ZruF15Z3y8GFyYyF9xAa4UZFW7JF1jkw1a 9r4UGFy3ua1rAaUanT9S1TB71UUUUU7qnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUU9214x267AKxVWUJVW8JwAFc2x0x2IEx4CE42xK8VAvwI8IcIk0 rVWrJVCq3wAFIxvE14AKwVWUJVWUGwA2048vs2IY020E87I2jVAFwI0_Jr4l82xGYIkIc2 x26xkF7I0E14v26r18M28lY4IEw2IIxxk0rwA2F7IY1VAKz4vEj48ve4kI8wA2z4x0Y4vE 2Ix0cI8IcVAFwI0_Jr0_JF4l84ACjcxK6xIIjxv20xvEc7CjxVAFwI0_Jr0_Gr1l84ACjc xK6I8E87Iv67AKxVWUJVW8JwA2z4x0Y4vEx4A2jsIEc7CjxVAFwI0_Jr0_Gr1le2I262IY c4CY6c8Ij28IcVAaY2xG8wAqx4xG64xvF2IEw4CE5I8CrVC2j2WlYx0E2Ix0cI8IcVAFwI 
0_Jr0_Jr4lYx0Ex4A2jsIE14v26r1j6r4UMcvjeVCFs4IE7xkEbVWUJVW8JwACjcxG0xvY 0x0EwIxGrwACjI8F5VA0II8E6IAqYI8I648v4I1l42xK82IYc2Ij64vIr41l4I8I3I0E4I kC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s026x8GjcxK67AKxVWUGVWU WwC2zVAF1VAY17CE14v26r1Y6r17MIIYrxkI7VAKI48JMIIF0xvE2Ix0cI8IcVAFwI0_Jr 0_JF4lIxAIcVC0I7IYx2IY6xkF7I0E14v26r1j6r4UMIIF0xvE42xK8VAvwI8IcIk0rVWU JVWUCwCI42IY6I8E87Iv67AKxVWUJVW8JwCI42IY6I8E87Iv6xkF7I0E14v26r1j6r4UYx BIdaVFxhVjvjDU0xZFpf9x0JUqfO7UUUUU= X-Originating-IP: [120.227.56.204] X-CM-SenderInfo: pslquxhhqjh1xofwqxxvufhxpvfd2hldfou0/ Subject: [musl] [PATCH 1/1] riscv64: add optimized memset, memcpy and memmove Add RISC-V Vector extension (RVV) optimized memset, memcpy and memmove implementations with runtime CPU capability detection via AT_HWCAP. The implementations provide both vector and scalar variants in a single binary. At process startup, __init_riscv_string_optimizations() queries AT_HWCAP to detect RVV support and selects the appropriate implementations via function pointer dispatch. This allows the same libc to run correctly on both vector-capable and non-vector RISC-V CPUs. The vector implementations use m8 register grouping and process data in vector-length chunks, providing significant performance improvements on RVV-capable hardware while maintaining compatibility with non-vector RISC-V systems. 
Signed-off-by: Pincheng Wang
---
 src/env/__libc_start_main.c          |  3 ++
 src/internal/libc.h                  |  3 ++
 src/string/riscv64/memcpy.c          |  4 ++
 src/string/riscv64/memcpy_vector.S   | 27 ++++++++++++++
 src/string/riscv64/memmove.c         |  4 ++
 src/string/riscv64/memmove_vector.S  | 51 ++++++++++++++++++++++++++
 src/string/riscv64/memset.c          |  4 ++
 src/string/riscv64/memset_vector.S   | 28 ++++++++++++++
 src/string/riscv64/string_dispatch.c | 55 ++++++++++++++++++++++++++++
 9 files changed, 179 insertions(+)
 create mode 100644 src/string/riscv64/memcpy.c
 create mode 100644 src/string/riscv64/memcpy_vector.S
 create mode 100644 src/string/riscv64/memmove.c
 create mode 100644 src/string/riscv64/memmove_vector.S
 create mode 100644 src/string/riscv64/memset.c
 create mode 100644 src/string/riscv64/memset_vector.S
 create mode 100644 src/string/riscv64/string_dispatch.c

diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index c5b277bd..c23e63f4 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
 
 	__init_tls(aux);
 	__init_ssp((void *)aux[AT_RANDOM]);
+#if defined(__riscv) && __riscv_xlen == 64 /* dispatcher is built from src/string/riscv64 only */
+	__init_riscv_string_optimizations();
+#endif
 
 	if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
 		&& !aux[AT_SECURE]) return;
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 619bba86..28860b06 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -40,6 +40,9 @@ extern hidden struct __libc __libc;
 hidden void __init_libc(char **, char *);
 hidden void __init_tls(size_t *);
 hidden void __init_ssp(void *);
+#if defined(__riscv) && __riscv_xlen == 64 /* must match the guard at the call site in __init_libc */
+hidden void __init_riscv_string_optimizations(void);
+#endif
 hidden void __libc_start_init(void);
 hidden void __funcs_on_exit(void);
 hidden void __funcs_on_quick_exit(void);
diff --git a/src/string/riscv64/memcpy.c b/src/string/riscv64/memcpy.c
new file mode 100644
index 00000000..01892e69
--- /dev/null
+++ b/src/string/riscv64/memcpy.c
@@ -0,0 +1,4 @@
+/* Rename the generic memcpy to __memcpy_scalar and include it */
+#define memcpy __memcpy_scalar
+#include "../memcpy.c"
+#undef memcpy
diff --git a/src/string/riscv64/memcpy_vector.S b/src/string/riscv64/memcpy_vector.S
new file mode 100644
index 00000000..6a045832
--- /dev/null
+++ b/src/string/riscv64/memcpy_vector.S
@@ -0,0 +1,27 @@
+	.text
+	.global __memcpy_vect
+	.option push
+	.option arch, +v
+/* void *__memcpy_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0.
+ */
+__memcpy_vect:
+	mv t0, a0 /* running dst */
+	mv t1, a1 /* running src */
+	beqz a2, .Ldone_copy /* n == 0 then return */
+
+.Lbulk_copy:
+	vsetvli t2, a2, e8, m8, ta, ma /* t2 = vl (bytes) */
+	vle8.v v0, (t1)
+	vse8.v v0, (t0)
+	add t0, t0, t2
+	add t1, t1, t2
+	sub a2, a2, t2
+	bnez a2, .Lbulk_copy
+	/* fallthrough */
+
+.Ldone_copy:
+	ret
+.size __memcpy_vect, .-__memcpy_vect
+.option pop
diff --git a/src/string/riscv64/memmove.c b/src/string/riscv64/memmove.c
new file mode 100644
index 00000000..915d6ba9
--- /dev/null
+++ b/src/string/riscv64/memmove.c
@@ -0,0 +1,4 @@
+/* Rename the generic memmove to __memmove_scalar and include it */
+#define memmove __memmove_scalar
+#include "../memmove.c"
+#undef memmove
diff --git a/src/string/riscv64/memmove_vector.S b/src/string/riscv64/memmove_vector.S
new file mode 100644
index 00000000..aec87a52
--- /dev/null
+++ b/src/string/riscv64/memmove_vector.S
@@ -0,0 +1,51 @@
+	.text
+	.global __memmove_vect
+	.option push
+	.option arch, +v
+/* void *__memmove_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0.
+ */
+__memmove_vect:
+	beqz a2, .Ldone_move /* n == 0 */
+	beq a0, a1, .Ldone_move /* dst == src */
+
+	/* overlap check */
+	bgeu a1, a0, .Lforward_move /* src >= dst then forward move */
+
+	sub t2, a0, a1 /* t2 = dst - src */
+	bgeu t2, a2, .Lforward_move /* no overlap then forward move */
+
+	/* backward move */
+	add t0, a0, a2 /* running dst_end */
+	add t1, a1, a2 /* running src_end */
+
+.Lbackward_loop:
+	vsetvli t3, a2, e8, m8, ta, ma /* t3 = vl (bytes) */
+	sub t0, t0, t3
+	sub t1, t1, t3
+	vle8.v v0, (t1)
+	vse8.v v0, (t0)
+	sub a2, a2, t3
+	bnez a2, .Lbackward_loop
+	j .Ldone_move
+
+	/* forward move, same as __memcpy_vect */
+.Lforward_move:
+	mv t0, a0 /* running dst */
+	mv t1, a1 /* running src */
+
+.Lforward_loop:
+	vsetvli t3, a2, e8, m8, ta, ma
+	vle8.v v0, (t1)
+	vse8.v v0, (t0)
+	add t0, t0, t3
+	add t1, t1, t3
+	sub a2, a2, t3
+	bnez a2, .Lforward_loop
+	/* fallthrough */
+
+.Ldone_move:
+	ret
+.size __memmove_vect, .-__memmove_vect
+.option pop
diff --git a/src/string/riscv64/memset.c b/src/string/riscv64/memset.c
new file mode 100644
index 00000000..11fa3032
--- /dev/null
+++ b/src/string/riscv64/memset.c
@@ -0,0 +1,4 @@
+/* Rename the generic memset to __memset_scalar and include it */
+#define memset __memset_scalar
+#include "../memset.c"
+#undef memset
diff --git a/src/string/riscv64/memset_vector.S b/src/string/riscv64/memset_vector.S
new file mode 100644
index 00000000..513645d3
--- /dev/null
+++ b/src/string/riscv64/memset_vector.S
@@ -0,0 +1,28 @@
+	.text
+	.global __memset_vect
+	.option push
+	.option arch, +v
+/* void *__memset_vect(void *s, int c, size_t n)
+ * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
+ * Returns a0.
+ */
+__memset_vect:
+	mv t0, a0 /* running dst; keep a0 as return */
+	beqz a2, .Ldone_vect /* n == 0 then return */
+
+	/* Broadcast fill byte once. */
+	vsetvli t1, zero, e8, m8, ta, ma
+	vmv.v.x v0, a1
+
+.Lbulk_vect:
+	vsetvli t1, a2, e8, m8, ta, ma /* t1 = vl (bytes) */
+	vse8.v v0, (t0)
+	add t0, t0, t1
+	sub a2, a2, t1
+	bnez a2, .Lbulk_vect
+	/* fallthrough */
+
+.Ldone_vect:
+	ret
+.size __memset_vect, .-__memset_vect
+.option pop
diff --git a/src/string/riscv64/string_dispatch.c b/src/string/riscv64/string_dispatch.c
new file mode 100644
index 00000000..2844d906
--- /dev/null
+++ b/src/string/riscv64/string_dispatch.c
@@ -0,0 +1,55 @@
+#include <stddef.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include "libc.h"
+
+hidden void *__memset_scalar(void *s, int c, size_t n);
+hidden void *__memset_vect(void *s, int c, size_t n);
+hidden void *__memcpy_scalar(void *restrict dest, const void *restrict src, size_t n);
+hidden void *__memcpy_vect(void *restrict dest, const void *restrict src, size_t n);
+hidden void *__memmove_scalar(void *dest, const void *src, size_t n);
+hidden void *__memmove_vect(void *dest, const void *src, size_t n);
+
+/* string function pointer, runtime-dispatched based on RVV support */
+#ifndef __riscv_vector
+hidden void *(*__memset_ptr)(void *, int, size_t) = __memset_scalar;
+hidden void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_scalar;
+hidden void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_scalar;
+#else
+hidden void *(*__memset_ptr)(void *, int, size_t) = __memset_vect;
+hidden void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_vect;
+hidden void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_vect;
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+	return __memset_ptr(s, c, n);
+}
+
+void *memcpy(void *restrict dest, const void *restrict src, size_t n)
+{
+	return __memcpy_ptr(dest, src, n);
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	return __memmove_ptr(dest, src, n);
+}
+
+/* Nonzero when AT_HWCAP has the bit for the single-letter 'V' extension set. */
+static __inline int __has_rvv_via_hwcap(void)
+{
+	const unsigned long V_bit = (1ul << ('V' - 'A'));
+	unsigned long hwcap = __getauxval(AT_HWCAP);
+	return (hwcap & V_bit) != 0;
+}
+
+/* Called from __init_libc(). Assigns in both directions so that a build whose
+ * static default is the vector variant still falls back to scalar without RVV. */
+hidden void __init_riscv_string_optimizations(void)
+{
+	int rvv = __has_rvv_via_hwcap();
+	__memset_ptr = rvv ? __memset_vect : __memset_scalar;
+	__memcpy_ptr = rvv ? __memcpy_vect : __memcpy_scalar;
+	__memmove_ptr = rvv ? __memmove_vect : __memmove_scalar;
+}
-- 
2.39.5