From mboxrd@z Thu Jan 1 00:00:00 1970 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=-2.0 required=5.0 tests=MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED,RCVD_IN_MSPIKE_H3,RCVD_IN_MSPIKE_WL,RDNS_NONE, SPF_PASS autolearn=ham autolearn_force=no version=3.4.2 Received: (qmail 4316 invoked from network); 25 Mar 2020 21:46:01 -0000 Received-SPF: pass (mother.openwall.net: domain of lists.openwall.com designates 195.42.179.200 as permitted sender) receiver=inbox.vuxu.org; client-ip=195.42.179.200 envelope-from= Received: from unknown (HELO mother.openwall.net) (195.42.179.200) by inbox.vuxu.org with ESMTP; 25 Mar 2020 21:46:01 -0000 Received: (qmail 17644 invoked by uid 550); 25 Mar 2020 21:45:57 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Reply-To: musl@lists.openwall.com Received: (qmail 17611 invoked from network); 25 Mar 2020 21:45:57 -0000 Date: Wed, 25 Mar 2020 22:45:45 +0100 From: Szabolcs Nagy To: musl@lists.openwall.com Message-ID: <20200325214544.GT14278@port70.net> Mail-Followup-To: musl@lists.openwall.com MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="JYK4vJDZwFMowpUq" Content-Disposition: inline User-Agent: Mutt/1.10.1 (2018-07-13) Subject: [musl] [PATCH] aarch64: add optimized memcpy, memmove and memset --JYK4vJDZwFMowpUq Content-Type: text/plain; charset=us-ascii Content-Disposition: inline minimal edits to upstream version for easier updates and because this code was benchmarked across many cores. gcc generates slow code for the current c implementations. the integer memcpy was chosen instead of the simd one, this performs better on little cores, i think this is the more conservative choice for now. note: there are upcoming security architectures which may mean updates to these functions (BTI - landing pads, PAUTH - return address signing, MTE - 16byte tag granule may affect optimized strcmp etc, not relevant yet), but runtime support for these will need other libc changes. --JYK4vJDZwFMowpUq Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="0001-aarch64-add-optimized-memcpy-memmove-and-memset.patch" >From 633a481dbed9923cab1b3d5985eb4ebd8780f12d Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Wed, 25 Mar 2020 21:30:00 +0000 Subject: [PATCH] aarch64: add optimized memcpy, memmove and memset The implementations are from https://github.com/ARM-software/optimized-routines at commit a0ad28dbcb9d52be85dfe622f4c43d9262641300 only the include path and function names are modified in the files string/aarch64/memcpy.S string/aarch64/memset.S and memmove is an alias of memcpy. --- src/string/aarch64/asmdefs.h | 31 +++++ src/string/aarch64/memcpy.S | 239 +++++++++++++++++++++++++++++++++++ src/string/aarch64/memmove.s | 1 + src/string/aarch64/memset.S | 114 +++++++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 src/string/aarch64/asmdefs.h create mode 100644 src/string/aarch64/memcpy.S create mode 100644 src/string/aarch64/memmove.s create mode 100644 src/string/aarch64/memset.S diff --git a/src/string/aarch64/asmdefs.h b/src/string/aarch64/asmdefs.h new file mode 100644 index 00000000..7d143a96 --- /dev/null +++ b/src/string/aarch64/asmdefs.h @@ -0,0 +1,31 @@ +/* + * Macros for asm code. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S new file mode 100644 index 00000000..5e76e13a --- /dev/null +++ b/src/string/aarch64/memcpy.S @@ -0,0 +1,239 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#include "asmdefs.h" + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY (memcpy) +ENTRY_ALIAS (memmove) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + +END (memcpy) diff --git a/src/string/aarch64/memmove.s b/src/string/aarch64/memmove.s new file mode 100644 index 00000000..0dc2941f --- /dev/null +++ b/src/string/aarch64/memmove.s @@ -0,0 +1 @@ +// see memcpy.S diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S new file mode 100644 index 00000000..7aca4a1e --- /dev/null +++ b/src/string/aarch64/memset.S @@ -0,0 +1,114 @@ +/* + * memset - fill memory with a constant byte + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +#include "asmdefs.h" + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 + +ENTRY (memset) + + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] + ret + + .p2align 4 +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +END (memset) -- 2.24.1 --JYK4vJDZwFMowpUq--