#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Compare n bytes at l and r one byte at a time.
 *
 * Returns 0 when all n bytes are equal; otherwise returns the difference
 * (*l - *r) of the first differing pair, with both bytes read as
 * unsigned char.
 */
static int my_memcmp_bytes(const unsigned char *l, const unsigned char *r,
                           size_t n)
{
    for (; n != 0; n--, l++, r++) {
        if (*l != *r) {
            return *l - *r;
        }
    }
    return 0;
}

/*
 * Compare the first N bytes of the objects pointed to by Vl and Vr.
 *
 * Returns 0 when the ranges are equal; otherwise the difference of the
 * first differing bytes (interpreted as unsigned char), so the result is
 * negative when Vl sorts before Vr and positive when it sorts after.
 *
 * Strategy:
 *   - If the two pointers have different offsets within a word, the whole
 *     range is compared bytewise.
 *   - Otherwise a short bytewise head brings both pointers to a word
 *     boundary, whole words are compared, and the tail (or the word that
 *     contained a mismatch) is resolved bytewise.
 *
 * Words are loaded through memcpy, so no aliasing or alignment rule is
 * violated regardless of the pointers passed in.
 */
__attribute__((optimize("s"))) /* original author's note: a bit faster than -O2/-O3 */
int my_memcmp(const void *Vl, const void *Vr, size_t N)
{
    typedef size_t uword_tp;

    const unsigned char *l = Vl;
    const unsigned char *r = Vr;
    size_t aloff = (uintptr_t)l % _Alignof(uword_tp);

    /* Differently aligned: compare everything bytewise. */
    if (aloff != (uintptr_t)r % _Alignof(uword_tp)) {
        return my_memcmp_bytes(l, r, N);
    }

    /* Equally misaligned: compare bytewise until both are aligned. */
    if (aloff != 0) {
        size_t head = sizeof(uword_tp) - aloff;
        if (head > N) {
            head = N;
        }
        int diff = my_memcmp_bytes(l, r, head);
        if (diff != 0) {
            return diff;
        }
        l += head;
        r += head;
        N -= head;
    }

    /* Skip over every leading word that is identical in both ranges. */
    while (N >= sizeof(uword_tp)) {
        uword_tp lw;
        uword_tp rw;
        memcpy(&lw, l, sizeof lw);
        memcpy(&rw, r, sizeof rw);
        if (lw != rw) {
            break;
        }
        l += sizeof(uword_tp);
        r += sizeof(uword_tp);
        N -= sizeof(uword_tp);
    }

    /*
     * N is now the exact number of bytes not yet proven equal. If the word
     * loop stopped on a mismatch, the differing byte lies within the next
     * sizeof(uword_tp) bytes and the helper returns there; otherwise this
     * compares the sub-word tail (possibly empty).
     */
    return my_memcmp_bytes(l, r, N);
}