Re: Optimized C memcpy [updated]

mailing list of musl libc
 help / color / mirror / code / Atom feed

From: Rich Felker <dalias@aerifal.cx>
To: musl@lists.openwall.com
Subject: Re: Optimized C memcpy [updated]
Date: Sun, 11 Aug 2013 01:11:35 -0400	[thread overview]
Message-ID: <20130811051135.GW221@brightrain.aerifal.cx> (raw)
In-Reply-To: <20130807182123.GA17670@brightrain.aerifal.cx>

[-- Attachment #1: Type: text/plain, Size: 466 bytes --]

On Wed, Aug 07, 2013 at 02:21:24PM -0400, Rich Felker wrote:
> Unfortunately it only works on little-endian (I haven't thought much
> yet about how it could be adapted to big-endian), but testing it on

Making it work for big endian was simply a matter of reversing the
direction of the shifts. I've updated it with this change, and some
other minor improvements. See attached file. If this works on all
existing archs, I think it might be worth committing.

Rich

[-- Attachment #2: memcpy_risc4.c --]
[-- Type: text/plain, Size: 3016 bytes --]

#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <endian.h>

/*
 * Shift operators used to splice two adjacent aligned 32-bit source words
 * into one destination word when source and destination are misaligned
 * relative to each other.  LS moves the not-yet-stored bytes of the older
 * word to the low-address end of the result; RS moves the leading bytes of
 * the newer word to the high-address end.  The shift directions swap with
 * the byte order.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

/*
 * Copy n bytes from src to dest (the regions must not overlap) and return
 * dest.
 *
 * Strategy:
 *   1. Copy single bytes until the source pointer is 8-byte aligned.
 *   2. If at least 4 bytes remain, dispatch on the destination's alignment
 *      modulo 4:
 *        0    - both pointers are word aligned: copy 64-byte groups as
 *               64-bit words when dest is also 8-byte aligned, otherwise
 *               32-byte groups as 32-bit words, then single 32-bit words.
 *        1..3 - copy 3, 2 or 1 leading bytes so that dest becomes 4-byte
 *               aligned, then emit aligned 32-bit stores, each assembled
 *               from two consecutive aligned source words with LS/RS.
 *   3. Copy whatever is left one byte at a time.
 *
 * All wide accesses are written as explicit word loads/stores rather than
 * struct assignments: a compiler is free to lower an aggregate copy to a
 * call to memcpy, which here would recurse into this very function.
 */
void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	/*
	 * The copied memory can have any effective type, so word accesses go
	 * through may_alias types where the compiler provides them; other
	 * compilers get the plain fixed-width types.
	 */
#ifdef __GNUC__
	typedef uint32_t __attribute__((__may_alias__)) u32;
	typedef uint64_t __attribute__((__may_alias__)) u64;
#else
	typedef uint32_t u32;
	typedef uint64_t u64;
#endif
	unsigned char *d = dest;
	const unsigned char *s = src;
	uint32_t w, x;

	/* Byte-copy until the source is 8-byte aligned (or n runs out). */
	for (; (uintptr_t)s % 8 && n; n--) *d++ = *s++;
	if (!n) return dest;

	if (n>=4) switch ((uintptr_t)d % 4) {
	case 0:
		if (!((uintptr_t)d%8)) {
			/* Source and destination both 8-byte aligned. */
			for (; n>=64; s+=64, d+=64, n-=64) {
				*(u64 *)(d+0) = *(const u64 *)(s+0);
				*(u64 *)(d+8) = *(const u64 *)(s+8);
				*(u64 *)(d+16) = *(const u64 *)(s+16);
				*(u64 *)(d+24) = *(const u64 *)(s+24);
				*(u64 *)(d+32) = *(const u64 *)(s+32);
				*(u64 *)(d+40) = *(const u64 *)(s+40);
				*(u64 *)(d+48) = *(const u64 *)(s+48);
				*(u64 *)(d+56) = *(const u64 *)(s+56);
			}
		} else {
			/* Destination is only 4-byte aligned. */
			for (; n>=32; s+=32, d+=32, n-=32) {
				*(u32 *)(d+0) = *(const u32 *)(s+0);
				*(u32 *)(d+4) = *(const u32 *)(s+4);
				*(u32 *)(d+8) = *(const u32 *)(s+8);
				*(u32 *)(d+12) = *(const u32 *)(s+12);
				*(u32 *)(d+16) = *(const u32 *)(s+16);
				*(u32 *)(d+20) = *(const u32 *)(s+20);
				*(u32 *)(d+24) = *(const u32 *)(s+24);
				*(u32 *)(d+28) = *(const u32 *)(s+28);
			}
		}
		for (; n>=4; s+=4, d+=4, n-=4)
			*(u32 *)d = *(const u32 *)s;
		break;
	case 1:
		/*
		 * Load the first aligned source word, then copy its first 3
		 * bytes individually so that d becomes 4-byte aligned.  The
		 * fourth byte stays pending in w.
		 */
		w = *(const u32 *)s;
		*d++ = *s++;
		*d++ = *s++;
		*d++ = *s++;
		n -= 3;
		/*
		 * s is now 3 past an aligned address, so s+1, s+5, ... are
		 * aligned.  The last load covers s[29..32], hence n>=33.
		 */
		for (; n>=33; s+=32, d+=32, n-=32) {
			x = *(const u32 *)(s+1);
			*(u32 *)(d+0) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+5);
			*(u32 *)(d+4) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+9);
			*(u32 *)(d+8) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+13);
			*(u32 *)(d+12) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+17);
			*(u32 *)(d+16) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+21);
			*(u32 *)(d+20) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+25);
			*(u32 *)(d+24) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+29);
			*(u32 *)(d+28) = (x LS 24) | (w RS 8);
		}
		break;
	case 2:
		/* Two leading bytes align d; two bytes stay pending in w. */
		w = *(const u32 *)s;
		*d++ = *s++;
		*d++ = *s++;
		n -= 2;
		/* The last load covers s[30..33], hence n>=34. */
		for (; n>=34; s+=32, d+=32, n-=32) {
			x = *(const u32 *)(s+2);
			*(u32 *)(d+0) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+6);
			*(u32 *)(d+4) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+10);
			*(u32 *)(d+8) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+14);
			*(u32 *)(d+12) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+18);
			*(u32 *)(d+16) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+22);
			*(u32 *)(d+20) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+26);
			*(u32 *)(d+24) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+30);
			*(u32 *)(d+28) = (x LS 16) | (w RS 16);
		}
		break;
	case 3:
		/* One leading byte aligns d; three bytes stay pending in w. */
		w = *(const u32 *)s;
		*d++ = *s++;
		n -= 1;
		/* The last load covers s[31..34], hence n>=35. */
		for (; n>=35; s+=32, d+=32, n-=32) {
			x = *(const u32 *)(s+3);
			*(u32 *)(d+0) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+7);
			*(u32 *)(d+4) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+11);
			*(u32 *)(d+8) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+15);
			*(u32 *)(d+12) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+19);
			*(u32 *)(d+16) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+23);
			*(u32 *)(d+20) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+27);
			*(u32 *)(d+24) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+31);
			*(u32 *)(d+28) = (x LS 8) | (w RS 24);
		}
		break;
	}

	/*
	 * Tail: anything the word loops did not consume.  In the misaligned
	 * cases the bytes still pending in w are simply re-read from s here.
	 */
	for (; n; n--) *d++ = *s++;
	return dest;
}

next prev parent reply	other threads:[~2013-08-11  5:11 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-07 18:21 Optimized C memcpy Rich Felker
2013-08-08 12:59 ` Andrew Bradford
2013-08-08 13:03   ` Andrew Bradford
2013-08-08 13:17     ` Luca Barbato
2013-08-08 15:15     ` Rich Felker
2013-08-08 20:17       ` Andre Renaud
2013-08-08 20:26         ` Rich Felker
2013-08-09  5:02 ` Rob Landley
2013-08-11  5:11 ` Rich Felker [this message]
2013-08-11  6:20   ` Optimized C memcpy [updated] Rich Felker
2013-08-11  8:13     ` Rich Felker
2013-08-11 11:14       ` Luca Barbato
2013-08-11 11:27         ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130811051135.GW221@brightrain.aerifal.cx \
    --to=dalias@aerifal.cx \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).