# void *memset(void *s, int c, size_t n)
# SysV AMD64 ABI: %rdi = s, %esi = c, %rdx = n; returns s in %rax.
.global memset
	.type	memset,@function
memset:
	movzbq	%sil,%rax		# rax = fill byte, zero-extended
	cmp	$16,%rdx
	jb	.Less_than_16
	test	%esi,%esi
	jnz	.L_widen_rax		# unlikely
.L_widened:
	mov	%rdi,%r8		# save s: rep stosq clobbers rdi
	test	$7,%dil
	jnz	.L_align		# unlikely
.L_aligned:
	lea	-1(%rdx),%rcx
	shr	$3,%rcx			# rcx = (n-1)/8 full qwords to store
	mov	%rax,-8(%rdi,%rdx)	# store the last 8 bytes (may overlap rep stosq)
	rep	stosq
	mov	%r8,%rax
	ret

.L_widen_rax:
	# 64-bit imul has 3-7 cycles latency
	mov	$0x101010101010101,%rsi
	imul	%rsi,%rax		# replicate the fill byte into all 8 bytes of rax
	jmp	.L_widened

# 8-byte alignment gives ~25% speedup on "rep stosq" memsets
# to L1 cache, compared to intentionally misaligned ones.
# It is a smaller win of ~15% on larger memsets to L2 too.
# Measured on Intel Sandy Bridge CPU (i7-2620M, 2.70GHz)
.L_align:
	mov	%rax,(%rdi)		# fill the first, possibly misaligned, qword
1:	inc	%rdi
	dec	%rdx
	test	$7,%dil
	jnz	1b			# advance until rdi is 8-byte aligned
	jmp	.L_aligned

.Less_than_16:
	# n < 16: fill from both ends with overlapping byte and dword stores
	test	%edx,%edx
	jz	.L_ret
	mov	%al,(%rdi)
	mov	%al,-1(%rdi,%rdx)
	cmp	$2,%edx
	jbe	.L_ret
	mov	%al,1(%rdi)
	mov	%al,-2(%rdi,%rdx)
	# 32-bit imul has 3-4 cycles latency
	imul	$0x1010101,%eax		# replicate the fill byte into all 4 bytes of eax
	cmp	$4,%edx
	jbe	.L_ret
	mov	%eax,(%rdi)
	mov	%eax,-4(%rdi,%rdx)
	cmp	$8,%edx
	jbe	.L_ret
	mov	%eax,4(%rdi)
	mov	%eax,-8(%rdi,%rdx)
.L_ret:
	mov	%rdi,%rax
	ret
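
# Below is a minimal C harness sketch (not part of the original source) for
# sanity-checking this routine against libc's memset. It assumes the assembly
# is assembled into the same program under a hypothetical alias "my_memset"
# (i.e. the exported symbol is renamed so it does not clash with libc's own
# memset), built e.g. as: gcc -O2 test_memset.c memset.S -o test_memset

/* test_memset.c -- compare the assembly routine against libc memset. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical alias for the assembly routine under test. */
void *my_memset(void *s, int c, size_t n);

int main(void)
{
	enum { BUF = 256 };
	unsigned char a[BUF], b[BUF];

	/* Exercise every length 0..BUF and a few fill values, including 0
	 * (the path that skips widening the fill byte). */
	for (size_t n = 0; n <= BUF; n++) {
		for (int c = 0; c <= 0xff; c += 0x55) {
			memset(a, 0xAA, sizeof(a));	/* pre-fill to catch overruns */
			memset(b, 0xAA, sizeof(b));
			void *ra = memset(a, c, n);
			void *rb = my_memset(b, c, n);
			if (ra != a || rb != b || memcmp(a, b, sizeof(a)) != 0) {
				fprintf(stderr, "mismatch: n=%zu c=%d\n", n, c);
				return EXIT_FAILURE;
			}
		}
	}
	printf("all cases match libc memset\n");
	return EXIT_SUCCESS;
}

# Comparing the whole pre-filled buffers (not just the first n bytes) also
# checks that neither routine writes past the requested length, and the
# return-value checks cover the "mov %rdi,%rax" / "mov %r8,%rax" paths.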