mailing list of musl libc
 help / color / mirror / code / Atom feed
* Optimized C memcpy
@ 2013-08-07 18:21 Rich Felker
  2013-08-08 12:59 ` Andrew Bradford
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Rich Felker @ 2013-08-07 18:21 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1275 bytes --]

Attached is the latest version of my "pure C" (modulo aliasing issues)
memcpy implementation. Compiled with -O3 on arm, it matches the
performance of the assembly language memcpy from Bionic for aligned
copies, and is only 25% slower than the asm for misaligned copies. And
it's only mildly larger. It uses the same principle as the Bionic
code: large block copies as aligned 32-bit units for aligned copies,
and aligned-load, bitshift-then-or, aligned-store for misaligned
copies. This should, in principle, work well on typical risc archs
that have plenty of registers but no misaligned load or store support.

Unfortunately it only works on little-endian (I haven't though much
yet about how it could be adapted to big-endian), but testing it on
qemu-ppc with the endian check disabled (thus wrong behavior)
suggested that this approach would work well on there too if we could
adapt it. Of course tests under qemu are not worth much; the ARM tests
were on real hardware and I'd like to see real-hardware results for
others archs (mipsel?) too.

This is not a replacement for the ARM asm (which is still better), but
it's a step towards avoiding the need to have written-by-hand assembly
for every single new arch we add as a prerequisite for tolerable
performance.

Rich

[-- Attachment #2: memcpy_risc3.c --]
[-- Type: text/plain, Size: 2710 bytes --]

#include <string.h>
#include <stdlib.h>
#include <stdint.h>

struct block {
	uint32_t data[8];
};

void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	uint32_t w, x;

	for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;
	if (!n) return dest;

	if (n>=4) switch ((uintptr_t)d % 4) {
	case 0:
		for (; n>=32; s+=32, d+=32, n-=32)
			*(struct block *)d = *(struct block *)s;
		break;
	case 1:
		if (!(union { int i; char c; }){1}.c) break;
		w = *(uint32_t *)s;
		*d++ = *s++;
		*d++ = *s++;
		*d++ = *s++;
		n -= 3;
		for (; n>=33; s+=32, d+=32, n-=32) {
			x = *(uint32_t *)(s+1);
			*(uint32_t *)(d+0) = (w>>24) | (x<<8);
			w = *(uint32_t *)(s+5);
			*(uint32_t *)(d+4) = (x>>24) | (w<<8);
			x = *(uint32_t *)(s+9);
			*(uint32_t *)(d+8) = (w>>24) | (x<<8);
			w = *(uint32_t *)(s+13);
			*(uint32_t *)(d+12) = (x>>24) | (w<<8);
			x = *(uint32_t *)(s+17);
			*(uint32_t *)(d+16) = (w>>24) | (x<<8);
			w = *(uint32_t *)(s+21);
			*(uint32_t *)(d+20) = (x>>24) | (w<<8);
			x = *(uint32_t *)(s+25);
			*(uint32_t *)(d+24) = (w>>24) | (x<<8);
			w = *(uint32_t *)(s+29);
			*(uint32_t *)(d+28) = (x>>24) | (w<<8);
		}
		break;
	case 2:
		if (!(union { int i; char c; }){1}.c) break;
		w = *(uint32_t *)s;
		*d++ = *s++;
		*d++ = *s++;
		n -= 2;
		for (; n>=34; s+=32, d+=32, n-=32) {
			x = *(uint32_t *)(s+2);
			*(uint32_t *)(d+0) = (w>>16) | (x<<16);
			w = *(uint32_t *)(s+6);
			*(uint32_t *)(d+4) = (x>>16) | (w<<16);
			x = *(uint32_t *)(s+10);
			*(uint32_t *)(d+8) = (w>>16) | (x<<16);
			w = *(uint32_t *)(s+14);
			*(uint32_t *)(d+12) = (x>>16) | (w<<16);
			x = *(uint32_t *)(s+18);
			*(uint32_t *)(d+16) = (w>>16) | (x<<16);
			w = *(uint32_t *)(s+22);
			*(uint32_t *)(d+20) = (x>>16) | (w<<16);
			x = *(uint32_t *)(s+26);
			*(uint32_t *)(d+24) = (w>>16) | (x<<16);
			w = *(uint32_t *)(s+30);
			*(uint32_t *)(d+28) = (x>>16) | (w<<16);
		}
		break;
	case 3:
		if (!(union { int i; char c; }){1}.c) break;
		w = *(uint32_t *)s;
		*d++ = *s++;
		n -= 1;
		for (; n>=35; s+=32, d+=32, n-=32) {
			x = *(uint32_t *)(s+3);
			*(uint32_t *)(d+0) = (w>>8) | (x<<24);
			w = *(uint32_t *)(s+7);
			*(uint32_t *)(d+4) = (x>>8) | (w<<24);
			x = *(uint32_t *)(s+11);
			*(uint32_t *)(d+8) = (w>>8) | (x<<24);
			w = *(uint32_t *)(s+15);
			*(uint32_t *)(d+12) = (x>>8) | (w<<24);
			x = *(uint32_t *)(s+19);
			*(uint32_t *)(d+16) = (w>>8) | (x<<24);
			w = *(uint32_t *)(s+23);
			*(uint32_t *)(d+20) = (x>>8) | (w<<24);
			x = *(uint32_t *)(s+27);
			*(uint32_t *)(d+24) = (w>>8) | (x<<24);
			w = *(uint32_t *)(s+31);
			*(uint32_t *)(d+28) = (x>>8) | (w<<24);
		}
		break;
	}

	for (; n; n--) *d++ = *s++;
	return dest;
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2013-08-11 11:27 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-07 18:21 Optimized C memcpy Rich Felker
2013-08-08 12:59 ` Andrew Bradford
2013-08-08 13:03   ` Andrew Bradford
2013-08-08 13:17     ` Luca Barbato
2013-08-08 15:15     ` Rich Felker
2013-08-08 20:17       ` Andre Renaud
2013-08-08 20:26         ` Rich Felker
2013-08-09  5:02 ` Rob Landley
2013-08-11  5:11 ` Optimized C memcpy [updated] Rich Felker
2013-08-11  6:20   ` Rich Felker
2013-08-11  8:13     ` Rich Felker
2013-08-11 11:14       ` Luca Barbato
2013-08-11 11:27         ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).