From: Rich Felker
To: Rob Landley
Cc: j-core@j-core.org, musl@lists.openwall.com
Reply-To: musl@lists.openwall.com
Subject: Re: Re: [J-core] Aligned copies and cacheline conflicts?
Date: Fri, 16 Sep 2016 18:16:03 -0400
Message-ID: <20160916221603.GS15995@brightrain.aerifal.cx>
In-Reply-To: <20160915023644.GD15995@brightrain.aerifal.cx>

On Wed, Sep 14, 2016 at 10:36:45PM -0400, Rich Felker wrote:
> On Wed, Sep 14, 2016 at 07:58:52PM -0500, Rob Landley wrote:
> > On 09/14/2016 07:34 PM, Rich Felker wrote:
> > > I could put a fork of memcpy.c in sh/memcpy.c and work on it there and
> > > only merge it back to the shared one if others test it on other archs
> > > and find it beneficial (or at least not harmful).
> >
> > Both musl and the kernel need it. And yes at the moment it seems
> > architecture-specific, but it's a _big_ performance difference...
>
> I actually think it's justifiable to have in the generic C memcpy,
> from a standpoint that the generic C shouldn't assume an N-way (N>1,
> i.e. not direct mapped) associative cache. Just need to make sure
> changing it doesn't make gcc do something utterly idiotic for other
> archs, I guess. I'll take a look at this.

Attached is a draft memcpy I'm considering for musl. Compared to the
current one, it:

1. Works on 32 bytes per iteration, and adds barriers between the load
   phase and the store phase to preclude cache line aliasing between
   src and dest with a direct-mapped cache.

2. Equally unrolls the misaligned src/dest cases (a toy illustration of
   the word splice those cases perform follows below).

3. Adjusts the offsets used in the misaligned src/dest loops to all be
   multiples of 4, with the adjustments needed to make that work hoisted
   outside the loops. This helps compilers generate indexed addressing
   modes (e.g. @(4,Rm)) rather than having to resort to arithmetic.

4. Factors the misaligned cases into a common inline function to reduce
   code duplication.

Comments welcome.
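To make the misaligned path concrete, here is a toy little-endian-only
illustration (not part of the attached file; all names are made up for
the demo) of what a single store in the ls=16 case boils down to:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* One step of the shifted copy with dest originally 2 bytes past word
 * alignment (ls = 16).  w is the carried, already-loaded source word;
 * t1 is the next aligned source word.  On little-endian, LS is >> and
 * RS is <<, so the store splices the high-address half of w with the
 * low-address half of t1. */
int main(void)
{
	unsigned char src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
	unsigned char out[4];
	uint32_t w, t1, o;

	memcpy(&w,  src,     4);      /* aligned load of bytes 0..3 */
	memcpy(&t1, src + 4, 4);      /* aligned load of bytes 4..7 */

	o = (w >> 16) | (t1 << 16);   /* (w LS ls) | (t1 RS rs)     */
	memcpy(out, &o, 4);

	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
	/* prints: 2 3 4 5 */
	return 0;
}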
Rich

Content-Disposition: attachment; filename="memcpy-draft.c"

#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include <endian.h>

#ifdef __GNUC__

/* LS/RS are chosen per endianness so that (w LS ls) | (t RS rs) splices
 * the high-address bytes of one aligned source word with the low-address
 * bytes of the next. */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

typedef uint32_t __attribute__((__may_alias__)) u32;
typedef uint16_t __attribute__((__may_alias__)) u16;

/* Copy one 32-byte block from 4-byte-aligned s to 4-byte-aligned d.
 * ls is 8 times the number of bytes of the first source word already
 * consumed by the head copy; w is the already-loaded word at s, and
 * the word at s+32 is returned as the carry for the next block. */
static inline uint32_t shifted_block_copy(unsigned char *d, const unsigned char *s, uint32_t w, int ls)
{
	int rs = 32-ls;
	uint32_t t1 = *(u32 *)(s+4);
	uint32_t t2 = *(u32 *)(s+8);
	uint32_t t3 = *(u32 *)(s+12);
	uint32_t t4 = *(u32 *)(s+16);
	uint32_t t5 = *(u32 *)(s+20);
	uint32_t t6 = *(u32 *)(s+24);
	uint32_t t7 = *(u32 *)(s+28);
	uint32_t t8 = *(u32 *)(s+32);
	/* Compiler barrier: keep all loads ahead of all stores so src and
	 * dest lines that alias in a direct-mapped cache don't thrash. */
	__asm__ __volatile__ ( "" : : "r"(s), "r"(d) : "memory" );
	*(u32 *)(d) = (w LS ls) | (t1 RS rs);
	*(u32 *)(d+4) = (t1 LS ls) | (t2 RS rs);
	*(u32 *)(d+8) = (t2 LS ls) | (t3 RS rs);
	*(u32 *)(d+12) = (t3 LS ls) | (t4 RS rs);
	*(u32 *)(d+16) = (t4 LS ls) | (t5 RS rs);
	*(u32 *)(d+20) = (t5 LS ls) | (t6 RS rs);
	*(u32 *)(d+24) = (t6 LS ls) | (t7 RS rs);
	*(u32 *)(d+28) = (t7 LS ls) | (t8 RS rs);
	return t8;
}

#endif

void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

#ifdef __GNUC__
	/* Align src to 4 bytes, copying byte by byte. */
	for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;

	if ((uintptr_t)d % 4 == 0) {
		/* Both pointers aligned: 32-byte blocks, then words, then bytes. */
		size_t c32 = n>>5, c4 = (n&31)>>2, c1 = n&3;
		for (; c32; c32--, s+=32, d+=32) {
			uint32_t t0 = *(u32 *)(s+0);
			uint32_t t1 = *(u32 *)(s+4);
			uint32_t t2 = *(u32 *)(s+8);
			uint32_t t3 = *(u32 *)(s+12);
			uint32_t t4 = *(u32 *)(s+16);
			uint32_t t5 = *(u32 *)(s+20);
			uint32_t t6 = *(u32 *)(s+24);
			uint32_t t7 = *(u32 *)(s+28);
			/* Compiler barrier: all loads before all stores. */
			__asm__ __volatile__ ( "" : : "r"(s), "r"(d) : "memory" );
			*(u32 *)(d+0) = t0;
			*(u32 *)(d+4) = t1;
			*(u32 *)(d+8) = t2;
			*(u32 *)(d+12) = t3;
			*(u32 *)(d+16) = t4;
			*(u32 *)(d+20) = t5;
			*(u32 *)(d+24) = t6;
			*(u32 *)(d+28) = t7;
		}
		for (; c4; c4--, s+=4, d+=4) {
			*(u32 *)d = *(u32 *)s;
		}
		for (; c1; c1--, s++, d++) {
			*d = *s;
		}
		return dest;
	}

	if (!n) return dest;

	/* Each shifted block reads one word past its 32 bytes, so leave
	 * at least 4 bytes of slack when computing the block count. */
	size_t c32 = n>=36 ? (n-4)>>5 : 0;
	uint32_t w = *(u32 *)s;
	n -= (c32<<5);
	/* Copy enough head bytes to align dest, then do shifted blocks. */
	if (c32) switch ((uintptr_t)d % 4) {
	case 1:
		d[0] = s[0];
		d[1] = s[1];
		d[2] = s[2];
		d += 3; n -= 3;
		for (; c32; c32--, s+=32, d+=32)
			w = shifted_block_copy(d, s, w, 24);
		s += 3;
		break;
	case 2:
		*(u16 *)d = *(u16 *)s;
		d += 2; n -= 2;
		for (; c32; c32--, s+=32, d+=32)
			w = shifted_block_copy(d, s, w, 16);
		s += 2;
		break;
	case 3:
		d[0] = s[0];
		d += 1; n -= 1;
		for (; c32; c32--, s+=32, d+=32)
			w = shifted_block_copy(d, s, w, 8);
		s += 1;
		break;
	}
#endif
	for (; n; n--) *d++ = *s++;
	return dest;
}
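For anyone who wants to sanity-check the draft before commenting, a
brute-force harness along these lines works (hypothetical test file,
not part of the patch; it assumes the draft is built separately with
the function renamed to memcpy_draft so it doesn't collide with libc):

#include <stdio.h>
#include <string.h>

/* The draft memcpy, built from memcpy-draft.c with the function renamed. */
void *memcpy_draft(void *restrict dest, const void *restrict src, size_t n);

int main(void)
{
	static unsigned char src[256], dst[256], ref[256];
	int fails = 0;

	/* Exercise all small src/dest misalignments and lengths 0..200,
	 * comparing whole buffers against the libc memcpy so stray writes
	 * outside the destination range are caught too. */
	for (size_t so = 0; so < 8; so++)
	for (size_t dof = 0; dof < 8; dof++)
	for (size_t n = 0; n <= 200; n++) {
		for (size_t i = 0; i < sizeof src; i++)
			src[i] = (unsigned char)(i ^ 0x5a);
		memset(dst, 0xee, sizeof dst);
		memset(ref, 0xee, sizeof ref);

		memcpy_draft(dst + dof, src + so, n);
		memcpy(ref + dof, src + so, n);

		if (memcmp(dst, ref, sizeof dst)) {
			printf("FAIL: srcoff=%zu dstoff=%zu n=%zu\n", so, dof, n);
			fails++;
		}
	}

	if (fails) printf("%d failing cases\n", fails);
	else printf("all cases ok\n");
	return fails != 0;
}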