From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/3848 Path: news.gmane.org!not-for-mail From: Rich Felker Newsgroups: gmane.linux.lib.musl.general Subject: Optimized C memcpy Date: Wed, 7 Aug 2013 14:21:24 -0400 Message-ID: <20130807182123.GA17670@brightrain.aerifal.cx> Reply-To: musl@lists.openwall.com NNTP-Posting-Host: plane.gmane.org Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="sm4nu43k4a2Rpi4c" X-Trace: ger.gmane.org 1375899696 15615 80.91.229.3 (7 Aug 2013 18:21:36 GMT) X-Complaints-To: usenet@ger.gmane.org NNTP-Posting-Date: Wed, 7 Aug 2013 18:21:36 +0000 (UTC) To: musl@lists.openwall.com Original-X-From: musl-return-3852-gllmg-musl=m.gmane.org@lists.openwall.com Wed Aug 07 20:21:38 2013 Return-path: Envelope-to: gllmg-musl@plane.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by plane.gmane.org with smtp (Exim 4.69) (envelope-from ) id 1V78Mb-0007Ov-2O for gllmg-musl@plane.gmane.org; Wed, 07 Aug 2013 20:21:37 +0200 Original-Received: (qmail 17414 invoked by uid 550); 7 Aug 2013 18:21:36 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: Original-Received: (qmail 16382 invoked from network); 7 Aug 2013 18:21:36 -0000 Content-Disposition: inline User-Agent: Mutt/1.5.21 (2010-09-15) Xref: news.gmane.org gmane.linux.lib.musl.general:3848 Archived-At: --sm4nu43k4a2Rpi4c Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Attached is the latest version of my "pure C" (modulo aliasing issues) memcpy implementation. Compiled with -O3 on arm, it matches the performance of the assembly language memcpy from Bionic for aligned copies, and is only 25% slower than the asm for misaligned copies. And it's only mildly larger. 
/*
 * It uses the same principle as the Bionic code: large block copies as
 * aligned 32-bit units for aligned copies, and aligned-load,
 * bitshift-then-or, aligned-store for misaligned copies. This should, in
 * principle, work well on typical RISC archs that have plenty of registers
 * but no misaligned load or store support.
 *
 * Unfortunately it only works on little-endian (I haven't thought much yet
 * about how it could be adapted to big-endian), but testing it on qemu-ppc
 * with the endian check disabled (thus wrong behavior) suggested that this
 * approach would work well there too if we could adapt it. Of course tests
 * under qemu are not worth much; the ARM tests were on real hardware and
 * I'd like to see real-hardware results for other archs (mipsel?) too.
 *
 * This is not a replacement for the ARM asm (which is still better), but
 * it's a step towards avoiding the need to have written-by-hand assembly
 * for every single new arch we add as a prerequisite for tolerable
 * performance.
 *
 * Rich
 *
 * --sm4nu43k4a2Rpi4c
 * Content-Type: text/plain; charset=us-ascii
 * Content-Disposition: attachment; filename="memcpy_risc3.c"
 */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#ifdef __GNUC__
/*
 * 32-bit word type that is allowed to alias objects of any type.
 * Accessing the caller's buffers through a plain uint32_t pointer would
 * violate the effective-type (strict aliasing) rules.
 */
typedef uint32_t __attribute__((__may_alias__)) word32;
#endif

/*
 * Copy n bytes from src to dest; the regions must not overlap.
 * Returns dest.
 *
 * Strategy:
 *  - copy single bytes until the source pointer is 4-byte aligned;
 *  - if the destination is then 4-byte aligned as well, copy 32 bytes per
 *    iteration as eight aligned 32-bit words;
 *  - otherwise (little-endian only) copy 1-3 bytes so that the destination
 *    becomes aligned, then build each aligned destination word from two
 *    neighbouring aligned source words with shift-and-or;
 *  - copy whatever remains one byte at a time.
 *
 * The word-sized paths are compiled only when the may_alias attribute is
 * available (__GNUC__); other compilers get the plain byte loop.  On
 * big-endian targets the misaligned-destination paths are skipped and the
 * byte loop is used instead.
 */
void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/* Head: advance until the source is aligned to 4 bytes. */
	for (; (uintptr_t)s % 4 && n; n--) {
		*d++ = *s++;
	}
	if (!n) {
		return dest;
	}

#ifdef __GNUC__
	if (n >= 4) {
		/* Non-zero when the lowest-addressed byte of an int is its LSB. */
		const int little_endian = (union { int i; char c; }){1}.c;
		word32 w, x;

		switch ((uintptr_t)d % 4) {
		case 0:
			/*
			 * Both pointers aligned.  Explicit word copies are used
			 * rather than a struct assignment, because a compiler may
			 * lower a struct assignment to a call to memcpy itself.
			 */
			for (; n >= 32; s += 32, d += 32, n -= 32) {
				*(word32 *)(d + 0) = *(const word32 *)(s + 0);
				*(word32 *)(d + 4) = *(const word32 *)(s + 4);
				*(word32 *)(d + 8) = *(const word32 *)(s + 8);
				*(word32 *)(d + 12) = *(const word32 *)(s + 12);
				*(word32 *)(d + 16) = *(const word32 *)(s + 16);
				*(word32 *)(d + 20) = *(const word32 *)(s + 20);
				*(word32 *)(d + 24) = *(const word32 *)(s + 24);
				*(word32 *)(d + 28) = *(const word32 *)(s + 28);
			}
			break;

		case 1:
			if (!little_endian) {
				break;
			}
			/*
			 * Load the first aligned source word, then copy 3 bytes so
			 * that d becomes aligned; the top byte of w is still pending.
			 */
			w = *(const word32 *)s;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			n -= 3;
			/* Each pass reads up to s+33, hence the n >= 33 bound. */
			for (; n >= 33; s += 32, d += 32, n -= 32) {
				x = *(const word32 *)(s + 1);
				*(word32 *)(d + 0) = (w >> 24) | (x << 8);
				w = *(const word32 *)(s + 5);
				*(word32 *)(d + 4) = (x >> 24) | (w << 8);
				x = *(const word32 *)(s + 9);
				*(word32 *)(d + 8) = (w >> 24) | (x << 8);
				w = *(const word32 *)(s + 13);
				*(word32 *)(d + 12) = (x >> 24) | (w << 8);
				x = *(const word32 *)(s + 17);
				*(word32 *)(d + 16) = (w >> 24) | (x << 8);
				w = *(const word32 *)(s + 21);
				*(word32 *)(d + 20) = (x >> 24) | (w << 8);
				x = *(const word32 *)(s + 25);
				*(word32 *)(d + 24) = (w >> 24) | (x << 8);
				w = *(const word32 *)(s + 29);
				*(word32 *)(d + 28) = (x >> 24) | (w << 8);
			}
			break;

		case 2:
			if (!little_endian) {
				break;
			}
			/* Copy 2 bytes to align d; the upper half of w is pending. */
			w = *(const word32 *)s;
			*d++ = *s++;
			*d++ = *s++;
			n -= 2;
			/* Each pass reads up to s+34, hence the n >= 34 bound. */
			for (; n >= 34; s += 32, d += 32, n -= 32) {
				x = *(const word32 *)(s + 2);
				*(word32 *)(d + 0) = (w >> 16) | (x << 16);
				w = *(const word32 *)(s + 6);
				*(word32 *)(d + 4) = (x >> 16) | (w << 16);
				x = *(const word32 *)(s + 10);
				*(word32 *)(d + 8) = (w >> 16) | (x << 16);
				w = *(const word32 *)(s + 14);
				*(word32 *)(d + 12) = (x >> 16) | (w << 16);
				x = *(const word32 *)(s + 18);
				*(word32 *)(d + 16) = (w >> 16) | (x << 16);
				w = *(const word32 *)(s + 22);
				*(word32 *)(d + 20) = (x >> 16) | (w << 16);
				x = *(const word32 *)(s + 26);
				*(word32 *)(d + 24) = (w >> 16) | (x << 16);
				w = *(const word32 *)(s + 30);
				*(word32 *)(d + 28) = (x >> 16) | (w << 16);
			}
			break;

		case 3:
			if (!little_endian) {
				break;
			}
			/* Copy 1 byte to align d; the upper 3 bytes of w are pending. */
			w = *(const word32 *)s;
			*d++ = *s++;
			n -= 1;
			/* Each pass reads up to s+35, hence the n >= 35 bound. */
			for (; n >= 35; s += 32, d += 32, n -= 32) {
				x = *(const word32 *)(s + 3);
				*(word32 *)(d + 0) = (w >> 8) | (x << 24);
				w = *(const word32 *)(s + 7);
				*(word32 *)(d + 4) = (x >> 8) | (w << 24);
				x = *(const word32 *)(s + 11);
				*(word32 *)(d + 8) = (w >> 8) | (x << 24);
				w = *(const word32 *)(s + 15);
				*(word32 *)(d + 12) = (x >> 8) | (w << 24);
				x = *(const word32 *)(s + 19);
				*(word32 *)(d + 16) = (w >> 8) | (x << 24);
				w = *(const word32 *)(s + 23);
				*(word32 *)(d + 20) = (x >> 8) | (w << 24);
				x = *(const word32 *)(s + 27);
				*(word32 *)(d + 24) = (w >> 8) | (x << 24);
				w = *(const word32 *)(s + 31);
				*(word32 *)(d + 28) = (x >> 8) | (w << 24);
			}
			break;

		default:
			break;
		}
	}
#endif

	/* Tail: whatever the block loops left over, one byte at a time. */
	for (; n; n--) {
		*d++ = *s++;
	}
	return dest;
}

/* --sm4nu43k4a2Rpi4c-- */