From: Rich Felker <dalias@libc.org>
To: musl@lists.openwall.com
Subject: Re: [musl] Release prep for 1.2.1, and afterwards
Date: Thu, 25 Jun 2020 21:20:06 -0400 [thread overview]
Message-ID: <20200626012003.GX6430@brightrain.aerifal.cx> (raw)
In-Reply-To: <20200625211536.GS6430@brightrain.aerifal.cx>
[-- Attachment #1: Type: text/plain, Size: 1772 bytes --]
On Thu, Jun 25, 2020 at 05:15:42PM -0400, Rich Felker wrote:
> On Thu, Jun 25, 2020 at 04:50:24PM -0400, Rich Felker wrote:
> > > > > but it would be nice if we could get the aarch64
> > > > > memcpy patch in (the c implementation is really
> > > > > slow and i've seen ppl compare aarch64 vs x86
> > > > > server performance with some benchmark on alpine..)
> > > >
> > > > OK, I'll look again.
> > >
> > > thanks.
> > >
> > > (there are more aarch64 string functions in the
> > > optimized-routines github repo but i think they
> > > are not as important as memcpy/memmove/memset)
> >
> > I found the code. Can you comment on performance and whether memset is
> > needed? (The C memset should be rather good already, more so than
> > memcpy.)
>
> Are the assumptions (v8-a, unaligned access) documented in memcpy.S
> valid for all presently supportable aarch64?
>
> A couple comments for merging if we do, that aren't hard requirements
> but preferences:
>
> - I'd like to expand out the macros from ../asmdefs.h since that won't
> be available and they just hide things (I guess they're attractive
> for Apple/macho users or something but not relevant to musl) and
> since the symbol name lines need to be changed anyway to public
> name. "Local var name" macros are ok to leave; changing them would
> be too error-prone and they make the code more readable anyway.
>
> - I'd prefer not to have memmove logic in memcpy since it makes it
> larger and implies that misuse of memcpy when you mean memmove is
> supported usage. I'd be happy with an approach like x86 though,
> defining an __memcpy_fwd alias and having memmove tail call to that
> unless len>128 and reverse is needed, or just leaving memmove.c.
Something like the attached.
Rich
[-- Attachment #2: memcpy.S --]
[-- Type: text/plain, Size: 4082 bytes --]
/*
* memcpy - copy memory area
*
* Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*
*/

/* Symbolic register names.  dstin (x0) is never written anywhere below,
   so the pointer memcpy must return is preserved for free.  All scratch
   temporaries are caller-saved registers (x6-x17) under AAPCS64. */
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
/* G_x/H_x alias the argument registers.  They are used only in the
   65..128-byte path, after count/dst are dead and src/srcend have been
   read for the last time. */
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
/* tmp1 aliases E_l (x14); the two uses are on disjoint paths
   (tmp1: small copies and dst alignment; E_l: 65..128 and loop tail). */
#define tmp1 x14
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The destination pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
// NOTE(review): the overlap/memmove sentences above look stale for this
// variant -- no src/dst distance check appears anywhere below, which
// matches the accompanying email's stated preference for keeping
// memmove logic out of memcpy.  Consider trimming that paragraph.
.global memcpy
.type memcpy,%function
memcpy:
add srcend, src, count // srcend = one past the last source byte
add dstend, dstin, count // dstend = one past the last destination byte
cmp count, 128
b.hi .Lcopy_long // > 128 bytes
cmp count, 32
b.hi .Lcopy32_128 // 33..128 bytes
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo .Lcopy16 // < 16: dispatch on the low bits of count
// 16..32 bytes: 16 from the start plus 16 from the end; the two
// stores may overlap in dst, which yields the correct result.
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. */
.Lcopy16:
tbz count, 3, .Lcopy8 // bit 3 clear => count < 8
ldr A_l, [src] // first 8 bytes
ldr A_h, [srcend, -8] // last 8 bytes (may overlap the first 8)
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
.Lcopy8:
tbz count, 2, .Lcopy4 // bit 2 clear => count < 4
ldr A_lw, [src] // first 4 bytes
ldr B_lw, [srcend, -4] // last 4 bytes (may overlap)
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
.Lcopy4:
cbz count, .Lcopy0 // count == 0: nothing to do
lsr tmp1, count, 1 // tmp1 = count/2 (0 for len 1, else 1)
ldrb A_lw, [src] // first byte
ldrb C_lw, [srcend, -1] // last byte
ldrb B_lw, [src, tmp1] // middle byte (redundant reload for len 1)
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1] // together these 3 stores cover len 1, 2 and 3
.Lcopy0:
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
.Lcopy32_128:
// 32 bytes from the start and 32 from the end are loaded up front;
// in the <= 64 case the overlapping head/tail stores below cover
// the whole range without any length-dependent branches.
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
ldp D_l, D_h, [srcend, -16]
cmp count, 64
b.hi .Lcopy128 // 65..128 needs two more register pairs
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
.Lcopy128:
ldp E_l, E_h, [src, 32] // bytes 32..63 from the start
ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls .Lcopy96 // <= 96: 64 head + 32 tail suffice
// 97..128 bytes: also copy bytes -64..-33 from the end.  G_x/H_x
// reuse count/dst/src/srcend, which are dead (or, for srcend, read
// for the last time as the base of these very loads) at this point.
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
.Lcopy96:
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
stp C_l, C_h, [dstend, -32] // tail stores may overlap the head stores
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy more than 128 bytes. */
.Lcopy_long:
/* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src] // first 16 bytes, stored unaligned below
and tmp1, dstin, 15 // tmp1 = dst misalignment (0..15)
bic dst, dstin, 15 // dst = dstin rounded down to 16 bytes
sub src, src, tmp1 // bias src by the same amount so [src,N] tracks [dst,N]
add count, count, tmp1 /* Count is now 16 too large. */
// Prime the software pipeline: A..D hold the next 64 bytes while the
// loop's stores drain the previous iteration's loads.
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]! // pre-index: src advances by 64
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls .Lcopy64_from_end // <= 128 left: go straight to the tail
// Main loop: each pass stores the 64 bytes loaded on the previous
// pass and loads the next 64; dst/src advance via the pre-indexed
// D-pair accesses at offset 64.
.Lloop64:
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]! // pre-index: dst advances by 64
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi .Lloop64
/* Write the last iteration and copy 64 bytes from the end. */
.Lcopy64_from_end:
// Tail is loaded relative to the fixed end pointers, so any final
// partial block is handled by one full, possibly overlapping,
// 64-byte copy instead of a length-dependent branch ladder.
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
.size memcpy,.-memcpy
next prev parent reply other threads:[~2020-06-26 1:20 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-06-24 20:42 Rich Felker
2020-06-24 22:39 ` Jeffrey Walton
2020-06-25 8:15 ` Szabolcs Nagy
2020-06-25 15:39 ` Rich Felker
2020-06-25 17:31 ` Szabolcs Nagy
2020-06-25 20:50 ` Rich Felker
2020-06-25 21:15 ` Rich Felker
2020-06-26 1:20 ` Rich Felker [this message]
2020-06-26 8:40 ` Szabolcs Nagy
2020-07-06 22:12 ` Rich Felker
2020-07-07 15:00 ` Szabolcs Nagy
2020-07-07 17:22 ` Rich Felker
2020-07-07 18:20 ` Szabolcs Nagy
2020-06-25 21:43 ` Andre McCurdy
2020-06-25 21:51 ` Rich Felker
2020-06-25 22:03 ` Andre McCurdy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200626012003.GX6430@brightrain.aerifal.cx \
--to=dalias@libc.org \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).