From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=-3.3 required=5.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,RCVD_IN_MSPIKE_H5, RCVD_IN_MSPIKE_WL autolearn=ham autolearn_force=no version=3.4.4 Received: from second.openwall.net (second.openwall.net [193.110.157.125]) by inbox.vuxu.org (Postfix) with SMTP id DE0F52153C for ; Wed, 19 Nov 2025 06:41:30 +0100 (CET) Received: (qmail 28380 invoked by uid 550); 19 Nov 2025 05:41:15 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Reply-To: musl@lists.openwall.com x-ms-reactions: disallow Received: (qmail 28219 invoked from network); 19 Nov 2025 05:41:14 -0000 From: Pincheng Wang To: musl@lists.openwall.com Cc: pincheng.plct@isrc.iscas.ac.cn Date: Wed, 19 Nov 2025 13:40:59 +0800 Message-Id: <20251119054059.514848-2-pincheng.plct@isrc.iscas.ac.cn> X-Mailer: git-send-email 2.39.5 In-Reply-To: <20251119054059.514848-1-pincheng.plct@isrc.iscas.ac.cn> References: <20251119054059.514848-1-pincheng.plct@isrc.iscas.ac.cn> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-CM-TRANSID:zQCowAA3_29sWB1p+Ug9AQ--.12794S3 X-Coremail-Antispam: 1UD129KBjvJXoW3XFWDXr1UZFW8WrWUtF13urg_yoWfWw1rpF 4UAw13Kr4fZrn7WF4fWF1jvan8G3ykuF15Wwnru3Z8ZrW8GFyUtasxA3WjvrW3JF1jkw1a 9r4UGFyY9a1rCaUanT9S1TB71UUUUU7qnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUU9I14x267AKxVWUJVW8JwAFc2x0x2IEx4CE42xK8VAvwI8IcIk0 rVWrJVCq3wAFIxvE14AKwVWUJVWUGwA2048vs2IY020E87I2jVAFwI0_Jr4l82xGYIkIc2 x26xkF7I0E14v26r18M28lY4IEw2IIxxk0rwA2F7IY1VAKz4vEj48ve4kI8wA2z4x0Y4vE 2Ix0cI8IcVAFwI0_Jr0_JF4l84ACjcxK6xIIjxv20xvEc7CjxVAFwI0_Jr0_Gr1l84ACjc xK6I8E87Iv67AKxVWUJVW8JwA2z4x0Y4vEx4A2jsIEc7CjxVAFwI0_Gr0_Gr1UM2AIxVAI cxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7xfMcIj6xIIjxv20xvE14 
v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Yz7v_Jr0_Gr1lF7xvr2IY c2Ij64vIr41lF7I21c0EjII2zVCS5cI20VAGYxC7MxAIw28IcxkI7VAKI48JMxC20s026x CaFVCjc4AY6r1j6r4UMI8I3I0E5I8CrVAFwI0_Jr0_Jr4lx2IqxVCjr7xvwVAFwI0_JrI_ JrWlx4CE17CEb7AF67AKxVWUXVWUAwCIc40Y0x0EwIxGrwCI42IY6xIIjxv20xvE14v26r 1j6r1xMIIF0xvE2Ix0cI8IcVCY1x0267AKxVWUJVW8JwCI42IY6xAIw20EY4v20xvaj40_ Jr0_JF4lIxAIcVC2z280aVAFwI0_Jr0_Gr1lIxAIcVC2z280aVCY1x0267AKxVWUJVW8Jb IYCTnIWIevJa73UjIFyTuYvjfU5VbkUUUUU X-Originating-IP: [120.227.56.239] X-CM-SenderInfo: pslquxhhqjh1xofwqxxvufhxpvfd2hldfou0/ Subject: [musl] [PATCH v2 1/1] riscv64: add optimized memset, memcpy and memmove Add RISC-V vector extension optimized memset, memcpy and memmove implementations with runtime CPU capability detection via AT_HWCAP. The implementations provide both vector and scalar variants in a single binary. At process startup, __init_riscv_string_optimizations() queries AT_HWCAP to detect RVV support and selects the appropriate implementations via function pointer dispatch. This allows the same libc to run correctly on both vector-capable and non-vector RISC-V CPUs. The vector implementation uses m8 register grouping and processes data in vector-length chunks, providing significant performance improvements on RVV-capable hardware while maintaining compatibility with non-vector RISC-V systems. 
Signed-off-by: Pincheng Wang
---
 src/env/__libc_start_main.c          |  3 ++
 src/internal/libc.h                  |  3 ++
 src/string/riscv64/memcpy.c          |  4 +++
 src/string/riscv64/memcpy_vector.S   | 28 +++++++++++++++
 src/string/riscv64/memmove.c         |  4 +++
 src/string/riscv64/memmove_vector.S  | 52 ++++++++++++++++++++++++++++
 src/string/riscv64/memset.c          |  4 +++
 src/string/riscv64/memset_vector.S   | 29 ++++++++++++++++
 src/string/riscv64/string_dispatch.c | 52 ++++++++++++++++++++++++++++
 9 files changed, 179 insertions(+)
 create mode 100644 src/string/riscv64/memcpy.c
 create mode 100644 src/string/riscv64/memcpy_vector.S
 create mode 100644 src/string/riscv64/memmove.c
 create mode 100644 src/string/riscv64/memmove_vector.S
 create mode 100644 src/string/riscv64/memset.c
 create mode 100644 src/string/riscv64/memset_vector.S
 create mode 100644 src/string/riscv64/string_dispatch.c

diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index c5b277bd..db6b3b7c 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
 
 	__init_tls(aux);
 	__init_ssp((void *)aux[AT_RANDOM]);
+#if defined (__riscv) && __riscv_xlen == 64
+	__init_riscv_string_optimizations();
+#endif
 
 	if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
 		&& !aux[AT_SECURE]) return;
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 619bba86..45c99f12 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -40,6 +40,9 @@ extern hidden struct __libc __libc;
 hidden void __init_libc(char **, char *);
 hidden void __init_tls(size_t *);
 hidden void __init_ssp(void *);
+#if defined (__riscv) && __riscv_xlen == 64
+hidden void __init_riscv_string_optimizations(void);
+#endif
 hidden void __libc_start_init(void);
 hidden void __funcs_on_exit(void);
 hidden void __funcs_on_quick_exit(void);
diff --git a/src/string/riscv64/memcpy.c b/src/string/riscv64/memcpy.c
new file mode 100644
index 00000000..01892e69
--- /dev/null
+++ b/src/string/riscv64/memcpy.c
@@ -0,0 +1,4 @@
+/* Rename the generic memcpy to __memcpy_scalar and include it */
+#define memcpy __memcpy_scalar
+#include "../memcpy.c"
+#undef memcpy
diff --git a/src/string/riscv64/memcpy_vector.S b/src/string/riscv64/memcpy_vector.S
new file mode 100644
index 00000000..1131c52b
--- /dev/null
+++ b/src/string/riscv64/memcpy_vector.S
@@ -0,0 +1,28 @@
+	.text
+	.global __memcpy_vect
+	.type __memcpy_vect,%function
+	.option push
+	.option arch, +v
+/* void *__memcpy_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0 (never modified; t0/t1 are the running pointers).
+ */
+__memcpy_vect:
+	mv	t0, a0			/* running dst */
+	mv	t1, a1			/* running src */
+	beqz	a2, .Ldone_copy		/* n == 0 then return */
+
+.Lbulk_copy:
+	vsetvli	t2, a2, e8, m8, ta, ma	/* t2 = vl (bytes) for this chunk */
+	vle8.v	v0, (t1)
+	vse8.v	v0, (t0)
+	add	t0, t0, t2
+	add	t1, t1, t2
+	sub	a2, a2, t2		/* remaining bytes */
+	bnez	a2, .Lbulk_copy
+	/* fallthrough */
+
+.Ldone_copy:
+	ret
+.size __memcpy_vect, .-__memcpy_vect
+.option pop
diff --git a/src/string/riscv64/memmove.c b/src/string/riscv64/memmove.c
new file mode 100644
index 00000000..915d6ba9
--- /dev/null
+++ b/src/string/riscv64/memmove.c
@@ -0,0 +1,4 @@
+/* Rename the generic memmove to __memmove_scalar and include it */
+#define memmove __memmove_scalar
+#include "../memmove.c"
+#undef memmove
diff --git a/src/string/riscv64/memmove_vector.S b/src/string/riscv64/memmove_vector.S
new file mode 100644
index 00000000..77877e69
--- /dev/null
+++ b/src/string/riscv64/memmove_vector.S
@@ -0,0 +1,52 @@
+	.text
+	.global __memmove_vect
+	.type __memmove_vect,%function
+	.option push
+	.option arch, +v
+/* void *__memmove_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0 (never modified; t0/t1 are the running pointers).
+ */
+__memmove_vect:
+	beqz	a2, .Ldone_move		/* n == 0 */
+	beq	a0, a1, .Ldone_move	/* dst == src */
+
+	/* overlap check */
+	bgeu	a1, a0, .Lforward_move	/* src >= dst then forward move */
+
+	sub	t2, a0, a1		/* t2 = dst - src */
+	bgeu	t2, a2, .Lforward_move	/* no overlap then forward move */
+
+	/* backward move: dst lies inside [src, src+n), so copy from the end down */
+	add	t0, a0, a2		/* running dst_end */
+	add	t1, a1, a2		/* running src_end */
+
+.Lbackward_loop:
+	vsetvli	t3, a2, e8, m8, ta, ma	/* t3 = vl (bytes) */
+	sub	t0, t0, t3
+	sub	t1, t1, t3
+	vle8.v	v0, (t1)		/* whole chunk is loaded before it is stored */
+	vse8.v	v0, (t0)
+	sub	a2, a2, t3
+	bnez	a2, .Lbackward_loop
+	j	.Ldone_move
+
+	/* forward move, same as __memcpy_vect */
+.Lforward_move:
+	mv	t0, a0			/* running dst */
+	mv	t1, a1			/* running src */
+
+.Lforward_loop:
+	vsetvli	t3, a2, e8, m8, ta, ma
+	vle8.v	v0, (t1)
+	vse8.v	v0, (t0)
+	add	t0, t0, t3
+	add	t1, t1, t3
+	sub	a2, a2, t3
+	bnez	a2, .Lforward_loop
+	/* fallthrough */
+
+.Ldone_move:
+	ret
+.size __memmove_vect, .-__memmove_vect
+.option pop
diff --git a/src/string/riscv64/memset.c b/src/string/riscv64/memset.c
new file mode 100644
index 00000000..11fa3032
--- /dev/null
+++ b/src/string/riscv64/memset.c
@@ -0,0 +1,4 @@
+/* Rename the generic memset to __memset_scalar and include it */
+#define memset __memset_scalar
+#include "../memset.c"
+#undef memset
diff --git a/src/string/riscv64/memset_vector.S b/src/string/riscv64/memset_vector.S
new file mode 100644
index 00000000..8f939ecd
--- /dev/null
+++ b/src/string/riscv64/memset_vector.S
@@ -0,0 +1,29 @@
+	.text
+	.global __memset_vect
+	.type __memset_vect,%function
+	.option push
+	.option arch, +v
+/* void *__memset_vect(void *s, int c, size_t n)
+ * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
+ * Returns a0.
+ */
+__memset_vect:
+	mv	t0, a0			/* running dst; keep a0 as return */
+	beqz	a2, .Ldone_vect		/* n == 0 then return */
+
+	/* Broadcast fill byte once, across the whole m8 register group. */
+	vsetvli	t1, zero, e8, m8, ta, ma
+	vmv.v.x	v0, a1
+
+.Lbulk_vect:
+	vsetvli	t1, a2, e8, m8, ta, ma	/* t1 = vl (bytes) */
+	vse8.v	v0, (t0)
+	add	t0, t0, t1
+	sub	a2, a2, t1
+	bnez	a2, .Lbulk_vect
+	/* fallthrough */
+
+.Ldone_vect:
+	ret
+.size __memset_vect, .-__memset_vect
+.option pop
diff --git a/src/string/riscv64/string_dispatch.c b/src/string/riscv64/string_dispatch.c
new file mode 100644
index 00000000..cd0d1fbc
--- /dev/null
+++ b/src/string/riscv64/string_dispatch.c
@@ -0,0 +1,52 @@
+#include <string.h>
+#include <sys/auxv.h>
+#include "libc.h"
+
+hidden void *__memset_scalar(void *s, int c, size_t n);
+hidden void *__memset_vect(void *s, int c, size_t n);
+hidden void *__memcpy_scalar(void *restrict dest, const void *restrict src, size_t n);
+hidden void *__memcpy_vect(void *restrict dest, const void *restrict src, size_t n);
+hidden void *__memmove_scalar(void *dest, const void *src, size_t n); /* no restrict: regions may overlap */
+hidden void *__memmove_vect(void *dest, const void *src, size_t n);
+
+/* Active implementations: start at the compile-time baseline; the startup hook below may switch them to RVV. */
+#ifndef __riscv_vector
+static void *(*__memset_ptr)(void *, int, size_t) = __memset_scalar;
+static void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_scalar;
+static void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_scalar;
+#else
+static void *(*__memset_ptr)(void *, int, size_t) = __memset_vect;
+static void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_vect;
+static void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_vect;
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+	return __memset_ptr(s, c, n);
+}
+
+void *memcpy(void *restrict dest, const void *restrict src, size_t n)
+{
+	return __memcpy_ptr(dest, src, n);
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	return __memmove_ptr(dest, src, n);
+}
+
+static inline int __has_rvv_via_hwcap(void) /* nonzero when AT_HWCAP has the 'V' bit set */
+{
+	const unsigned long V_bit = (1ul << ('V' - 'A')); /* bit index = letter offset from 'A' */
+	unsigned long hwcap = __getauxval(AT_HWCAP);
+	return (hwcap & V_bit) != 0;
+}
+
+hidden void __init_riscv_string_optimizations(void) /* called from __init_libc on riscv64 */
+{
+	if (__has_rvv_via_hwcap()) {
+		__memset_ptr = __memset_vect;
+		__memcpy_ptr = __memcpy_vect;
+		__memmove_ptr = __memmove_vect;
+	}
+}
-- 
2.39.5