mailing list of musl libc
 help / color / mirror / code / Atom feed
From: Rich Felker <dalias@libc.org>
To: Rob Landley <rob@landley.net>
Cc: "j-core@j-core.org" <j-core@j-core.org>, musl@lists.openwall.com
Subject: Re: Re: [J-core] Aligned copies and cacheline conflicts?
Date: Fri, 16 Sep 2016 18:16:03 -0400	[thread overview]
Message-ID: <20160916221603.GS15995@brightrain.aerifal.cx> (raw)
In-Reply-To: <20160915023644.GD15995@brightrain.aerifal.cx>

[-- Attachment #1: Type: text/plain, Size: 1556 bytes --]

On Wed, Sep 14, 2016 at 10:36:45PM -0400, Rich Felker wrote:
> On Wed, Sep 14, 2016 at 07:58:52PM -0500, Rob Landley wrote:
> > On 09/14/2016 07:34 PM, Rich Felker wrote:
> > > I could put a fork of memcpy.c in sh/memcpy.c and work on it there and
> > > only merge it back to the shared one if others test it on other archs
> > > and find it beneficial (or at least not harmful).
> > 
> > Both musl and the kernel need it. And yes at the moment it seems
> > architecture-specific, but it's a _big_ performance difference...
> 
> I actually think it's justifiable to have in the generic C memcpy,
> from a standpoint that the generic C shouldn't assume an N-way (N>1,
> i.e. not direct mapped) associative cache. Just need to make sure
> changing it doesn't make gcc do something utterly idiotic for other
> archs, I guess. I'll take a look at this.

Attached is a draft memcpy I'm considering for musl. Compared to the
current one, it:

1. Works on 32 bytes per iteration, and adds barriers between the load
   phase and store phase to preclude cache line aliasing between src
   and dest with a direct-mapped cache.

2. Equally unrolls the misaligned src/dest cases.

3. Adjusts the offsets used in the misaligned src/dest loops to all be
   multiples of 4, with the adjustments to make that work outside the
   loops. This helps compilers generate indexed addressing modes (e.g.
   @(4,Rm)) rather than having to resort to arithmetic.

4. Factors the misaligned cases into a common inline function to
   reduce code duplication.

Comments welcome.

Rich

[-- Attachment #2: memcpy-draft.c --]
[-- Type: text/plain, Size: 2704 bytes --]

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <endian.h>

#ifdef __GNUC__

/*
 * LS shifts a word's bytes toward lower memory addresses and RS toward
 * higher ones; which C shift operator does that depends on byte order.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

/* Word/halfword types that may alias objects of any other type. */
typedef uint32_t __attribute__((__may_alias__)) u32;
typedef uint16_t __attribute__((__may_alias__)) u16;

/*
 * Copy one 32-byte block to a word-aligned destination from a source
 * stream that sits at a different offset within its words.
 *
 *   d   destination, accessed as eight 32-bit words
 *   s   source, accessed as 32-bit words at s+4 .. s+32; the word at s
 *       itself is not re-read but supplied by the caller in w
 *   w   the source word at s (carried over from the previous block)
 *   ls  bit count by which each source word is shifted with LS; the
 *       following word is shifted by 32-ls with RS and OR-ed in.
 *       memcpy() in this file passes 8, 16 or 24.
 *
 * Returns the word loaded from s+32, i.e. the value to pass as w for
 * the next block.
 */
static inline uint32_t shifted_block_copy(unsigned char *d, const unsigned char *s, uint32_t w, int ls)
{
	const u32 *in = (const u32 *)s;
	u32 *out = (u32 *)d;
	const int rs = 32 - ls;

	/* Load phase: all eight source words are fetched up front. */
	const uint32_t x1 = in[1];
	const uint32_t x2 = in[2];
	const uint32_t x3 = in[3];
	const uint32_t x4 = in[4];
	const uint32_t x5 = in[5];
	const uint32_t x6 = in[6];
	const uint32_t x7 = in[7];
	const uint32_t x8 = in[8];

	/* Compiler barrier separating the load phase from the store phase. */
	__asm__ __volatile__ ( "" : : "r"(s), "r"(d) : "memory" );

	/* Store phase: each output word splices two neighbouring inputs. */
	out[0] = (w  LS ls) | (x1 RS rs);
	out[1] = (x1 LS ls) | (x2 RS rs);
	out[2] = (x2 LS ls) | (x3 RS rs);
	out[3] = (x3 LS ls) | (x4 RS rs);
	out[4] = (x4 LS ls) | (x5 RS rs);
	out[5] = (x5 LS ls) | (x6 RS rs);
	out[6] = (x6 LS ls) | (x7 RS rs);
	out[7] = (x7 LS ls) | (x8 RS rs);

	return x8;
}

#endif

/*
 * Copy n bytes from src to dest and return dest. The two regions must
 * not overlap (both pointers are restrict-qualified).
 *
 * When built as GNU C, the source is first brought to a 4-byte boundary
 * with a byte loop. If the destination is then also 4-byte aligned, data
 * moves in 32-byte blocks of plain word copies. Otherwise the destination
 * is aligned with a short head copy and 32-byte blocks are assembled by
 * shifted_block_copy() from neighbouring source words. In every case the
 * leftover bytes are copied one at a time.
 */
void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

#ifdef __GNUC__

	/* Byte-copy until the source pointer is 4-byte aligned (or n runs out). */
	for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;

	if ((uintptr_t)d % 4 == 0) {
		/* Source and destination are mutually aligned: split n into
		 * 32-byte blocks, remaining whole words, and trailing bytes. */
		size_t c32 = n>>5, c4 = (n&31)>>2, c1=n&3;
		for (; c32; c32--, s+=32, d+=32) {
			uint32_t t0 = *(u32 *)(s+0);
			uint32_t t1 = *(u32 *)(s+4);
			uint32_t t2 = *(u32 *)(s+8);
			uint32_t t3 = *(u32 *)(s+12);
			uint32_t t4 = *(u32 *)(s+16);
			uint32_t t5 = *(u32 *)(s+20);
			uint32_t t6 = *(u32 *)(s+24);
			uint32_t t7 = *(u32 *)(s+28);
			/* Compiler barrier: keep all eight loads ahead of all
			 * eight stores, so that source and destination lines
			 * mapping to the same slot of a direct-mapped cache do
			 * not evict each other word by word. */
			__asm__ __volatile__ ( "" : : "r"(s), "r"(d) : "memory" );
			*(u32 *)(d+0) = t0;
			*(u32 *)(d+4) = t1;
			*(u32 *)(d+8) = t2;
			*(u32 *)(d+12) = t3;
			*(u32 *)(d+16) = t4;
			*(u32 *)(d+20) = t5;
			*(u32 *)(d+24) = t6;
			*(u32 *)(d+28) = t7;
		}
		for (; c4; c4--, s+=4, d+=4) {
			*(u32 *)d = *(u32 *)s;
		}
		for (; c1; c1--, s++, d++) {
			*d = *s;
		}
		return dest;
	}

	if (!n) return dest;

	/* Each shifted block reads 36 source bytes (32 plus one look-ahead
	 * word), so only take as many blocks as satisfy 32*c32 + 4 <= n. */
	size_t c32 = n>=36 ? (n-4)>>5 : 0;

	if (c32) {
		/* Load the first source word only when the block loop will
		 * run: here n >= 36, so the 4-byte read is within the source.
		 * Loading it unconditionally would read past the end of the
		 * source for n < 4. */
		uint32_t w = *(u32 *)s;

		n -= (c32<<5);

		/* d % 4 is 1, 2 or 3 here; the aligned case returned above.
		 * In each case the head bytes that align d are copied
		 * directly, the block loop keeps s word-aligned, and s is
		 * advanced by the head length only afterwards so the tail
		 * loop resumes at the right byte. */
		switch ((uintptr_t)d % 4) {
		case 1:
			d[0] = s[0];
			d[1] = s[1];
			d[2] = s[2];
			d += 3;
			n -= 3;
			for (; c32; c32--, s+=32, d+=32)
				w = shifted_block_copy(d, s, w, 24);
			s += 3;
			break;
		case 2:
			*(u16 *)d = *(u16 *)s;
			d += 2;
			n -= 2;
			for (; c32; c32--, s+=32, d+=32)
				w = shifted_block_copy(d, s, w, 16);
			s += 2;
			break;
		case 3:
			d[0] = s[0];
			d += 1;
			n -= 1;
			for (; c32; c32--, s+=32, d+=32)
				w = shifted_block_copy(d, s, w, 8);
			s += 1;
			break;
		}
	}
#endif

	/* Tail (and the entire copy when not built as GNU C): byte by byte. */
	for (; n; n--) *d++ = *s++;
	return dest;
}


  reply	other threads:[~2016-09-16 22:16 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <0c256cb1-d0fa-9a5a-3976-b7ef545c1827@landley.net>
2016-09-15  0:34 ` Rich Felker
2016-09-15  0:58   ` Rob Landley
2016-09-15  2:36     ` Rich Felker
2016-09-16 22:16       ` Rich Felker [this message]
2016-09-17  1:40         ` Rob Landley
2016-09-17  2:17           ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160916221603.GS15995@brightrain.aerifal.cx \
    --to=dalias@libc.org \
    --cc=j-core@j-core.org \
    --cc=musl@lists.openwall.com \
    --cc=rob@landley.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).