From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/3866 Path: news.gmane.org!not-for-mail From: Rich Felker Newsgroups: gmane.linux.lib.musl.general Subject: Re: Optimized C memcpy [updated] Date: Sun, 11 Aug 2013 01:11:35 -0400 Message-ID: <20130811051135.GW221@brightrain.aerifal.cx> References: <20130807182123.GA17670@brightrain.aerifal.cx> Reply-To: musl@lists.openwall.com NNTP-Posting-Host: plane.gmane.org Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="+B+y8wtTXqdUj1xM" X-Trace: ger.gmane.org 1376197906 23839 80.91.229.3 (11 Aug 2013 05:11:46 GMT) X-Complaints-To: usenet@ger.gmane.org NNTP-Posting-Date: Sun, 11 Aug 2013 05:11:46 +0000 (UTC) To: musl@lists.openwall.com Original-X-From: musl-return-3870-gllmg-musl=m.gmane.org@lists.openwall.com Sun Aug 11 07:11:49 2013 Return-path: Envelope-to: gllmg-musl@plane.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by plane.gmane.org with smtp (Exim 4.69) (envelope-from ) id 1V8NwT-0004Zj-5j for gllmg-musl@plane.gmane.org; Sun, 11 Aug 2013 07:11:49 +0200 Original-Received: (qmail 18387 invoked by uid 550); 11 Aug 2013 05:11:47 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: Original-Received: (qmail 18362 invoked from network); 11 Aug 2013 05:11:47 -0000 Content-Disposition: inline In-Reply-To: <20130807182123.GA17670@brightrain.aerifal.cx> User-Agent: Mutt/1.5.21 (2010-09-15) Xref: news.gmane.org gmane.linux.lib.musl.general:3866 Archived-At: --+B+y8wtTXqdUj1xM Content-Type: text/plain; charset=us-ascii Content-Disposition: inline On Wed, Aug 07, 2013 at 02:21:24PM -0400, Rich Felker wrote: > Unfortunately it only works on little-endian (I haven't thought much > yet about how it could be adapted to big-endian), but testing it on Making it work for big endian was simply a matter of reversing the direction of the shifts. 
/*
 * Remainder of the accompanying message:
 *
 *   I've updated it with this change, and some other minor improvements.
 *   See attached file. If this works on all existing archs, I think it
 *   might be worth committing.
 *
 *   Rich
 *
 * Attachment: memcpy_risc4.c (text/plain; charset=us-ascii)
 */

/*
 * NOTE(review): the header names were lost when this message was archived.
 * <string.h>, <stdint.h> and <endian.h> are required by the code below;
 * <stdlib.h> is a best-guess reconstruction of the fourth directive.
 */
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <endian.h>

/*
 * Shift operators expressed in terms of memory order rather than value
 * order: LS moves bytes toward lower addresses within a word, RS moves them
 * toward higher addresses. Which C operator that corresponds to depends on
 * the byte order of the target.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

/*
 * The copy accesses caller memory of unknown effective type through word
 * and block types. Mark those types as may-alias where the compiler supports
 * it so the accesses do not violate the aliasing rules.
 */
#ifdef __GNUC__
#define MAY_ALIAS __attribute__((__may_alias__))
#else
#define MAY_ALIAS
#endif

typedef uint32_t MAY_ALIAS u32;

/* 32-byte block moved as eight 32-bit words (4-byte alignment). */
struct MAY_ALIAS block32 { uint32_t data[8]; };
/* 64-byte block moved as eight 64-bit words (8-byte alignment). */
struct MAY_ALIAS block64 { uint64_t data[8]; };

/*
 * Copy n bytes from src to dest; the regions must not overlap (both
 * parameters are restrict-qualified).
 *
 * Strategy:
 *   1. Copy single bytes until the source pointer is 8-byte aligned.
 *   2. If at least 4 bytes remain, dispatch on the destination's
 *      misalignment modulo 4:
 *        0     - both pointers word aligned: block copies, then words.
 *        1,2,3 - copy 3/2/1 bytes so the destination becomes word
 *                aligned, then build each destination word from two
 *                adjacent aligned source words with shifts.
 *   3. Copy whatever is left one byte at a time.
 *
 * Returns dest.
 */
void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	uint32_t w, x;

	/* Step 1: bring the source up to an 8-byte boundary. */
	for (; (uintptr_t)s % 8 && n; n--)
		*d++ = *s++;
	if (!n)
		return dest;

	if (n >= 4) {
		switch ((uintptr_t)d % 4) {
		case 0:
			/*
			 * NOTE(review): a compiler may lower these struct
			 * assignments to a call to memcpy itself; confirm the
			 * generated code or build flags rule that out.
			 */
			if ((uintptr_t)d % 8 == 0) {
				for (; n >= 64; s += 64, d += 64, n -= 64)
					*(struct block64 *)d = *(const struct block64 *)s;
			} else {
				/* d is only 4-byte aligned: use 32-bit members. */
				for (; n >= 32; s += 32, d += 32, n -= 32)
					*(struct block32 *)d = *(const struct block32 *)s;
			}
			for (; n >= 4; s += 4, d += 4, n -= 4)
				*(u32 *)d = *(const u32 *)s;
			break;

		case 1:
			/*
			 * w keeps the aligned source word; its first three bytes
			 * are stored individually, which word-aligns d. The last
			 * byte of w is still pending and is merged below.
			 */
			w = *(const u32 *)s;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			n -= 3;
			/* Each pass reads source bytes s+1 .. s+32, hence n >= 33. */
			for (; n >= 33; s += 32, d += 32, n -= 32) {
				x = *(const u32 *)(s + 1);
				*(u32 *)(d + 0) = (w LS 24) | (x RS 8);
				w = *(const u32 *)(s + 5);
				*(u32 *)(d + 4) = (x LS 24) | (w RS 8);
				x = *(const u32 *)(s + 9);
				*(u32 *)(d + 8) = (w LS 24) | (x RS 8);
				w = *(const u32 *)(s + 13);
				*(u32 *)(d + 12) = (x LS 24) | (w RS 8);
				x = *(const u32 *)(s + 17);
				*(u32 *)(d + 16) = (w LS 24) | (x RS 8);
				w = *(const u32 *)(s + 21);
				*(u32 *)(d + 20) = (x LS 24) | (w RS 8);
				x = *(const u32 *)(s + 25);
				*(u32 *)(d + 24) = (w LS 24) | (x RS 8);
				w = *(const u32 *)(s + 29);
				*(u32 *)(d + 28) = (x LS 24) | (w RS 8);
			}
			break;

		case 2:
			/* Two bytes align d; two bytes of w remain pending. */
			w = *(const u32 *)s;
			*d++ = *s++;
			*d++ = *s++;
			n -= 2;
			/* Each pass reads source bytes s+2 .. s+33, hence n >= 34. */
			for (; n >= 34; s += 32, d += 32, n -= 32) {
				x = *(const u32 *)(s + 2);
				*(u32 *)(d + 0) = (w LS 16) | (x RS 16);
				w = *(const u32 *)(s + 6);
				*(u32 *)(d + 4) = (x LS 16) | (w RS 16);
				x = *(const u32 *)(s + 10);
				*(u32 *)(d + 8) = (w LS 16) | (x RS 16);
				w = *(const u32 *)(s + 14);
				*(u32 *)(d + 12) = (x LS 16) | (w RS 16);
				x = *(const u32 *)(s + 18);
				*(u32 *)(d + 16) = (w LS 16) | (x RS 16);
				w = *(const u32 *)(s + 22);
				*(u32 *)(d + 20) = (x LS 16) | (w RS 16);
				x = *(const u32 *)(s + 26);
				*(u32 *)(d + 24) = (w LS 16) | (x RS 16);
				w = *(const u32 *)(s + 30);
				*(u32 *)(d + 28) = (x LS 16) | (w RS 16);
			}
			break;

		case 3:
			/* One byte aligns d; three bytes of w remain pending. */
			w = *(const u32 *)s;
			*d++ = *s++;
			n -= 1;
			/* Each pass reads source bytes s+3 .. s+34, hence n >= 35. */
			for (; n >= 35; s += 32, d += 32, n -= 32) {
				x = *(const u32 *)(s + 3);
				*(u32 *)(d + 0) = (w LS 8) | (x RS 24);
				w = *(const u32 *)(s + 7);
				*(u32 *)(d + 4) = (x LS 8) | (w RS 24);
				x = *(const u32 *)(s + 11);
				*(u32 *)(d + 8) = (w LS 8) | (x RS 24);
				w = *(const u32 *)(s + 15);
				*(u32 *)(d + 12) = (x LS 8) | (w RS 24);
				x = *(const u32 *)(s + 19);
				*(u32 *)(d + 16) = (w LS 8) | (x RS 24);
				w = *(const u32 *)(s + 23);
				*(u32 *)(d + 20) = (x LS 8) | (w RS 24);
				x = *(const u32 *)(s + 27);
				*(u32 *)(d + 24) = (w LS 8) | (x RS 24);
				w = *(const u32 *)(s + 31);
				*(u32 *)(d + 28) = (x LS 8) | (w RS 24);
			}
			break;
		}
	}

	/* Step 3: tail bytes (also the whole copy when fewer than 4 remained). */
	for (; n; n--)
		*d++ = *s++;
	return dest;
}