From: Rich Felker <dalias@aerifal.cx>
To: musl@lists.openwall.com
Subject: Re: Optimized C memcpy [updated]
Date: Sun, 11 Aug 2013 01:11:35 -0400 [thread overview]
Message-ID: <20130811051135.GW221@brightrain.aerifal.cx> (raw)
In-Reply-To: <20130807182123.GA17670@brightrain.aerifal.cx>
[-- Attachment #1: Type: text/plain, Size: 466 bytes --]
On Wed, Aug 07, 2013 at 02:21:24PM -0400, Rich Felker wrote:
> Unfortunately it only works on little-endian (I haven't thought much
> yet about how it could be adapted to big-endian), but testing it on
Making it work for big endian was simply a matter of reversing the
direction of the shifts. I've updated it with this change, and some
other minor improvements. See attached file. If this works on all
existing archs, I think it might be worth committing.
Rich
[-- Attachment #2: memcpy_risc4.c --]
[-- Type: text/plain, Size: 3016 bytes --]
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <endian.h>
/*
 * Endianness-dependent shift operators used to splice two aligned source
 * words into one aligned destination word.
 *
 * LS moves the bytes of a word toward lower addresses, RS moves them
 * toward higher addresses.  On little-endian targets the lowest-addressed
 * byte is the least significant one, so "toward lower addresses" is a
 * right shift; on big-endian targets the directions are reversed.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

/*
 * memcpy - copy n bytes from src to dest; the regions must not overlap.
 *
 * Returns dest.
 *
 * Strategy (word path, GNU-compatible compilers only):
 *   1. Copy single bytes until src is 8-byte aligned or n is exhausted.
 *   2. If at least 4 bytes remain, dispatch on dest % 4:
 *        0      - source and destination are both word aligned: copy
 *                 64-byte blocks of 64-bit words when dest is also 8-byte
 *                 aligned, otherwise 32-byte blocks of 32-bit words, then
 *                 single 32-bit words.
 *        1,2,3  - copy 3/2/1 leading bytes so that dest becomes 4-byte
 *                 aligned, then assemble every destination word from two
 *                 consecutive aligned source words using LS/RS.
 *   3. Copy whatever is left one byte at a time.
 *
 * Wide accesses go through may_alias typedefs: the buffers can have any
 * effective type, and reading or writing them through a plain uint32_t or
 * uint64_t lvalue would be undefined behaviour.
 *
 * Blocks are copied word by word rather than by struct assignment because
 * a compiler is allowed to lower an aggregate assignment to a call to
 * memcpy, which here would be a call to this very function.
 *
 * Compilers that do not define __GNUC__ get the plain byte loop only.
 */
void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

#ifdef __GNUC__
	typedef uint32_t __attribute__((__may_alias__)) u32;
	typedef uint64_t __attribute__((__may_alias__)) u64;
	uint32_t w, x;

	/* Bring the source pointer to 8-byte alignment. */
	for (; (uintptr_t)s % 8 && n; n--) *d++ = *s++;
	if (!n) return dest;

	if (n >= 4) switch ((uintptr_t)d % 4) {
	case 0:
		if ((uintptr_t)d % 8 == 0) {
			/* src and dest both 8-byte aligned: 64-bit words. */
			for (; n >= 64; s += 64, d += 64, n -= 64) {
				*(u64 *)(d+0)  = *(const u64 *)(s+0);
				*(u64 *)(d+8)  = *(const u64 *)(s+8);
				*(u64 *)(d+16) = *(const u64 *)(s+16);
				*(u64 *)(d+24) = *(const u64 *)(s+24);
				*(u64 *)(d+32) = *(const u64 *)(s+32);
				*(u64 *)(d+40) = *(const u64 *)(s+40);
				*(u64 *)(d+48) = *(const u64 *)(s+48);
				*(u64 *)(d+56) = *(const u64 *)(s+56);
			}
		} else {
			/* dest is only 4-byte aligned: 32-bit words. */
			for (; n >= 32; s += 32, d += 32, n -= 32) {
				*(u32 *)(d+0)  = *(const u32 *)(s+0);
				*(u32 *)(d+4)  = *(const u32 *)(s+4);
				*(u32 *)(d+8)  = *(const u32 *)(s+8);
				*(u32 *)(d+12) = *(const u32 *)(s+12);
				*(u32 *)(d+16) = *(const u32 *)(s+16);
				*(u32 *)(d+20) = *(const u32 *)(s+20);
				*(u32 *)(d+24) = *(const u32 *)(s+24);
				*(u32 *)(d+28) = *(const u32 *)(s+28);
			}
		}
		for (; n >= 4; s += 4, d += 4, n -= 4)
			*(u32 *)d = *(const u32 *)s;
		break;
	case 1:
		/* Keep the first source word; its last byte is still needed
		 * after the three head bytes below have been copied. */
		w = *(const u32 *)s;
		*d++ = *s++;
		*d++ = *s++;
		*d++ = *s++;
		n -= 3;
		/* The last load of an iteration reads s+29..s+32, so at
		 * least 33 bytes must remain. */
		for (; n >= 33; s += 32, d += 32, n -= 32) {
			x = *(const u32 *)(s+1);
			*(u32 *)(d+0) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+5);
			*(u32 *)(d+4) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+9);
			*(u32 *)(d+8) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+13);
			*(u32 *)(d+12) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+17);
			*(u32 *)(d+16) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+21);
			*(u32 *)(d+20) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+25);
			*(u32 *)(d+24) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+29);
			*(u32 *)(d+28) = (x LS 24) | (w RS 8);
		}
		break;
	case 2:
		w = *(const u32 *)s;
		*d++ = *s++;
		*d++ = *s++;
		n -= 2;
		/* The last load of an iteration reads s+30..s+33. */
		for (; n >= 34; s += 32, d += 32, n -= 32) {
			x = *(const u32 *)(s+2);
			*(u32 *)(d+0) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+6);
			*(u32 *)(d+4) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+10);
			*(u32 *)(d+8) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+14);
			*(u32 *)(d+12) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+18);
			*(u32 *)(d+16) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+22);
			*(u32 *)(d+20) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+26);
			*(u32 *)(d+24) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+30);
			*(u32 *)(d+28) = (x LS 16) | (w RS 16);
		}
		break;
	case 3:
		w = *(const u32 *)s;
		*d++ = *s++;
		n -= 1;
		/* The last load of an iteration reads s+31..s+34. */
		for (; n >= 35; s += 32, d += 32, n -= 32) {
			x = *(const u32 *)(s+3);
			*(u32 *)(d+0) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+7);
			*(u32 *)(d+4) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+11);
			*(u32 *)(d+8) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+15);
			*(u32 *)(d+12) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+19);
			*(u32 *)(d+16) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+23);
			*(u32 *)(d+20) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+27);
			*(u32 *)(d+24) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+31);
			*(u32 *)(d+28) = (x LS 8) | (w RS 24);
		}
		break;
	}
#endif

	/* Tail bytes, or the whole copy when the word path is unavailable. */
	for (; n; n--) *d++ = *s++;
	return dest;
}
next prev parent reply other threads:[~2013-08-11 5:11 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-08-07 18:21 Optimized C memcpy Rich Felker
2013-08-08 12:59 ` Andrew Bradford
2013-08-08 13:03 ` Andrew Bradford
2013-08-08 13:17 ` Luca Barbato
2013-08-08 15:15 ` Rich Felker
2013-08-08 20:17 ` Andre Renaud
2013-08-08 20:26 ` Rich Felker
2013-08-09 5:02 ` Rob Landley
2013-08-11 5:11 ` Rich Felker [this message]
2013-08-11 6:20 ` Optimized C memcpy [updated] Rich Felker
2013-08-11 8:13 ` Rich Felker
2013-08-11 11:14 ` Luca Barbato
2013-08-11 11:27 ` Rich Felker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130811051135.GW221@brightrain.aerifal.cx \
--to=dalias@aerifal.cx \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).