Halve the number of instructions (from 12 to 6) needed to fetch the 3-bit
partial quotient from the FPU condition flags C0:C3:C1, and negate it
without a conditional branch.

--- -/math/i386/remquo.s
+++ +/math/i386/remquo.s
@@ -2,49 +2,44 @@
 .type remquof,@function
 remquof:
 	mov 12(%esp),%ecx
+	mov 8(%esp),%eax
+	xor 4(%esp),%eax
 	flds 8(%esp)
 	flds 4(%esp)
-	mov 11(%esp),%dh
-	xor 7(%esp),%dh
-	jmp 1f
+	jmp 0f
 
 .global remquol
 .type remquol,@function
 remquol:
 	mov 28(%esp),%ecx
+	mov 24(%esp),%eax
+	xor 12(%esp),%eax
+	cwtl
 	fldt 16(%esp)
 	fldt 4(%esp)
-	mov 25(%esp),%dh
-	xor 13(%esp),%dh
-	jmp 1f
+	jmp 0f
 
 .global remquo
 .type remquo,@function
 remquo:
 	mov 20(%esp),%ecx
+	mov 16(%esp),%eax
+	xor 8(%esp),%eax
 	fldl 12(%esp)
 	fldl 4(%esp)
-	mov 19(%esp),%dh
-	xor 11(%esp),%dh
+0:	cltd
 1:	fprem1
 	fnstsw %ax
 	sahf
 	jp 1b
 	fstp %st(1)
-	mov %ah,%dl
-	shr %dl
-	and $1,%dl
-	mov %ah,%al
-	shr $5,%al
-	and $2,%al
-	or %al,%dl
-	mov %ah,%al
-	shl $2,%al
-	and $4,%al
-	or %al,%dl
-	test %dh,%dh
-	jns 1f
-	neg %dl
-1:	movsbl %dl,%edx
-	mov %edx,(%ecx)
+	adc %al,%al
+	shl $2,%ah
+	adc %al,%al
+	shl $5,%ah
+	adc %al,%al
+	and $7,%eax
+	xor %edx,%eax
+	sub %edx,%eax
+	mov %eax,(%ecx)
 	ret