# -----------------------------------------------------------------------
# void *memset(void *dest, int c, size_t n)
#
# x86-64, GNU as, AT&T operand order.
# In:    rdi = dest, esi = c (only the low byte is stored), rdx = n
# Out:   rax = dest
# Clobb: rcx, rsi, rdi, r8, flags
# Note:  the bulk path uses rep stosq, which only stores upward while
#        the direction flag is clear.
# -----------------------------------------------------------------------
        .globl  memset
        .type   memset, @function
memset:
        movzbq  %sil, %rax              # rax = (unsigned char)c
        cmpq    $16, %rdx
        jb      .Lsmall                 # n < 16: overlapping scalar stores

        # The whole 32-bit c is tested, so a value such as 0x100 also takes
        # the broadcast path; harmless, since its fill byte multiplies to 0.
        testl   %esi, %esi
        jne     .Lbroadcast             # unlikely
.Lbulk:
        # Invariant: rax carries the fill byte in all eight byte lanes.
        movq    %rdi, %r8               # stosq advances rdi; keep dest for return
        movq    %rax, -8(%rdi,%rdx)     # tail: final 8 bytes, may overlap the body
        leaq    -1(%rdx), %rcx
        shrq    $3, %rcx                # rcx = (n - 1) / 8 qwords for the body
        rep stosq
        movq    %r8, %rax               # return dest
        ret

.Lbroadcast:
        # 64-bit imul has 3-7 cycles latency, so it stays off the c == 0 path
        movabsq $0x0101010101010101, %rsi
        imulq   %rsi, %rax              # replicate the byte into every lane
        jmp     .Lbulk

.Lsmall:
        testl   %edx, %edx              # n < 16 here, so edx holds all of n
        je      .Ldone

        # n >= 1: first and last byte
        movb    %al, (%rdi)
        movb    %al, -1(%rdi,%rdx)
        cmpl    $2, %edx
        jbe     .Ldone

        # n >= 3: second and second-to-last byte
        movb    %al, 1(%rdi)
        movb    %al, -2(%rdi,%rdx)
        # 32-bit imul has 3-4 cycles latency
        imull   $0x01010101, %eax, %eax # eax = fill byte in four lanes
        cmpl    $4, %edx
        jbe     .Ldone

        # n >= 5: first and last dword (overlapping when n < 8)
        movl    %eax, (%rdi)
        movl    %eax, -4(%rdi,%rdx)
        cmpl    $8, %edx
        jbe     .Ldone

        # n >= 9: the two inner dwords close the remaining gap
        movl    %eax, 4(%rdi)
        movl    %eax, -8(%rdi,%rdx)

.Ldone:
        movq    %rdi, %rax              # rdi was never modified on this path
        ret