From: Rich Felker <dalias@libc.org>
To: musl@lists.openwall.com
Subject: Re: [musl] Release prep for 1.2.1, and afterwards
Date: Thu, 25 Jun 2020 21:20:06 -0400 [thread overview]
Message-ID: <20200626012003.GX6430@brightrain.aerifal.cx> (raw)
In-Reply-To: <20200625211536.GS6430@brightrain.aerifal.cx>
[-- Attachment #1: Type: text/plain, Size: 1772 bytes --]
On Thu, Jun 25, 2020 at 05:15:42PM -0400, Rich Felker wrote:
> On Thu, Jun 25, 2020 at 04:50:24PM -0400, Rich Felker wrote:
> > > > > but it would be nice if we could get the aarch64
> > > > > memcpy patch in (the c implementation is really
> > > > > slow and i've seen ppl compare aarch64 vs x86
> > > > > server performance with some benchmark on alpine..)
> > > >
> > > > OK, I'll look again.
> > >
> > > thanks.
> > >
> > > (there are more aarch64 string functions in the
> > > optimized-routines github repo but i think they
> > > are not as important as memcpy/memmove/memset)
> >
> > I found the code. Can you comment on performance and whether memset is
> > needed? (The C memset should be rather good already, more so than
> > memcpy.)
>
> Are the assumptions (v8-a, unaligned access) documented in memcpy.S
> valid for all presently supportable aarch64?
>
> A couple comments for merging if we do, that aren't hard requirements
> but preferences:
>
> - I'd like to expand out the macros from ../asmdefs.h since that won't
> be available and they just hide things (I guess they're attractive
> for Apple/macho users or something but not relevant to musl) and
> since the symbol name lines need to be changed anyway to public
> name. "Local var name" macros are ok to leave; changing them would
> be too error-prone and they make the code more readable anyway.
>
> - I'd prefer not to have memmove logic in memcpy since it makes it
> larger and implies that misuse of memcpy when you mean memmove is
> supported usage. I'd be happy with an approach like x86 though,
> defining an __memcpy_fwd alias and having memmove tail call to that
> unless len>128 and reverse is needed, or just leaving memmove.c.
Something like the attached.
Rich
[-- Attachment #2: memcpy.S --]
[-- Type: text/plain, Size: 4082 bytes --]
/*
* memcpy - copy memory area
*
* Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*
*/

/* Symbolic register names.  dstin (x0) is never written anywhere below,
   so the pointer memcpy must return is preserved for free.  All scratch
   temporaries are caller-saved registers (x6-x17) under AAPCS64. */
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
/* G_x/H_x alias the argument registers.  They are used only in the
   65..128-byte path, after count/dst are dead and src/srcend have been
   read for the last time. */
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
/* tmp1 aliases E_l (x14); the two uses are on disjoint paths
   (tmp1: small copies and dst alignment; E_l: 65..128 and loop tail). */
#define tmp1 x14
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The destination pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
// NOTE(review): the overlap/memmove sentences above look stale for this
// variant -- no src/dst distance check appears anywhere below, which
// matches the accompanying email's stated preference for keeping
// memmove logic out of memcpy.  Consider trimming that paragraph.
.global memcpy
.type memcpy,%function
memcpy:
add srcend, src, count // srcend = one past the last source byte
add dstend, dstin, count // dstend = one past the last destination byte
cmp count, 128
b.hi .Lcopy_long // > 128 bytes
cmp count, 32
b.hi .Lcopy32_128 // 33..128 bytes
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo .Lcopy16 // < 16: dispatch on the low bits of count
// 16..32 bytes: 16 from the start plus 16 from the end; the two
// stores may overlap in dst, which yields the correct result.
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. */
.Lcopy16:
tbz count, 3, .Lcopy8 // bit 3 clear => count < 8
ldr A_l, [src] // first 8 bytes
ldr A_h, [srcend, -8] // last 8 bytes (may overlap the first 8)
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
.Lcopy8:
tbz count, 2, .Lcopy4 // bit 2 clear => count < 4
ldr A_lw, [src] // first 4 bytes
ldr B_lw, [srcend, -4] // last 4 bytes (may overlap)
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
.Lcopy4:
cbz count, .Lcopy0 // count == 0: nothing to do
lsr tmp1, count, 1 // tmp1 = count/2 (0 for len 1, else 1)
ldrb A_lw, [src] // first byte
ldrb C_lw, [srcend, -1] // last byte
ldrb B_lw, [src, tmp1] // middle byte (redundant reload for len 1)
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1] // together these 3 stores cover len 1, 2 and 3
.Lcopy0:
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
.Lcopy32_128:
// 32 bytes from the start and 32 from the end are loaded up front;
// in the <= 64 case the overlapping head/tail stores below cover
// the whole range without any length-dependent branches.
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
ldp D_l, D_h, [srcend, -16]
cmp count, 64
b.hi .Lcopy128 // 65..128 needs two more register pairs
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
.Lcopy128:
ldp E_l, E_h, [src, 32] // bytes 32..63 from the start
ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls .Lcopy96 // <= 96: 64 head + 32 tail suffice
// 97..128 bytes: also copy bytes -64..-33 from the end.  G_x/H_x
// reuse count/dst/src/srcend, which are dead (or, for srcend, read
// for the last time as the base of these very loads) at this point.
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
.Lcopy96:
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
stp C_l, C_h, [dstend, -32] // tail stores may overlap the head stores
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy more than 128 bytes. */
.Lcopy_long:
/* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src] // first 16 bytes, stored unaligned below
and tmp1, dstin, 15 // tmp1 = dst misalignment (0..15)
bic dst, dstin, 15 // dst = dstin rounded down to 16 bytes
sub src, src, tmp1 // bias src by the same amount so [src,N] tracks [dst,N]
add count, count, tmp1 /* Count is now 16 too large. */
// Prime the software pipeline: A..D hold the next 64 bytes while the
// loop's stores drain the previous iteration's loads.
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]! // pre-index: src advances by 64
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls .Lcopy64_from_end // <= 128 left: go straight to the tail
// Main loop: each pass stores the 64 bytes loaded on the previous
// pass and loads the next 64; dst/src advance via the pre-indexed
// D-pair accesses at offset 64.
.Lloop64:
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]! // pre-index: dst advances by 64
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi .Lloop64
/* Write the last iteration and copy 64 bytes from the end. */
.Lcopy64_from_end:
// Tail is loaded relative to the fixed end pointers, so any final
// partial block is handled by one full, possibly overlapping,
// 64-byte copy instead of a length-dependent branch ladder.
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
.size memcpy,.-memcpy
next prev parent reply other threads:[~2020-06-26 1:20 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-06-24 20:42 Rich Felker
2020-06-24 22:39 ` Jeffrey Walton
2020-06-25 8:15 ` Szabolcs Nagy
2020-06-25 15:39 ` Rich Felker
2020-06-25 17:31 ` Szabolcs Nagy
2020-06-25 20:50 ` Rich Felker
2020-06-25 21:15 ` Rich Felker
2020-06-26 1:20 ` Rich Felker [this message]
2020-06-26 8:40 ` Szabolcs Nagy
2020-07-06 22:12 ` Rich Felker
2020-07-07 15:00 ` Szabolcs Nagy
2020-07-07 17:22 ` Rich Felker
2020-07-07 18:20 ` Szabolcs Nagy
2020-06-25 21:43 ` Andre McCurdy
2020-06-25 21:51 ` Rich Felker
2020-06-25 22:03 ` Andre McCurdy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200626012003.GX6430@brightrain.aerifal.cx \
--to=dalias@libc.org \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).