From mboxrd@z Thu Jan  1 00:00:00 1970
Received: (from majordomo@localhost) by pauillac.inria.fr (8.7.6/8.7.3) id NAA12699; Sat, 20 Dec 2003 13:44:11 +0100 (MET)
X-Authentication-Warning: pauillac.inria.fr: majordomo set sender to owner-caml-list@pauillac.inria.fr using -f
Received: from concorde.inria.fr (concorde.inria.fr [192.93.2.39]) by pauillac.inria.fr (8.7.6/8.7.3) with ESMTP id NAA12843 for <caml-list@pauillac.inria.fr>; Sat, 20 Dec 2003 13:44:10 +0100 (MET)
Received: from fep04-svc.swip.net (fep04.swip.net [130.244.199.132])
	by concorde.inria.fr (8.11.1/8.11.1) with ESMTP id hBKCi9H16729
	for <caml-list@inria.fr>; Sat, 20 Dec 2003 13:44:09 +0100 (MET)
Received: from debian ([81.211.203.51]) by fep04-svc.swip.net with SMTP
          id <20031220124407.AQY2375.fep04-svc.swip.net@debian>
          for <caml-list@inria.fr>; Sat, 20 Dec 2003 13:44:07 +0100
Date: Sat, 20 Dec 2003 10:50:40 +0100
From: Francesco Abbate <segfault@email.it>
To: caml-list@inria.fr
Subject: [Caml-list] ASM code generated by OCAML
Message-Id: <20031220105040.7a079f67.segfault@email.it>
X-Mailer: Sylpheed version 0.9.7 (GTK+ 1.2.10; i586-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
X-Loop: caml-list@inria.fr
X-Spam: no; 0.00; francesco:99 segfault:01 paranoid:01 gcc:01 subl:01 movl:01 movl:01 $1,:01 lea:99 $1,:01 lea:99 cmpl:01 addl:01 addl:01 gcc:01 
Sender: owner-caml-list@pauillac.inria.fr
Precedence: bulk

Dear Ocamlers,

ok, I confess that I'm a little bit paranoid and I often
look to the assembler code generated by Ocaml to get an
idea of real efficience of the compiler.

While, generally speaking, the ASM code generated by ocaml
is pretty good, I wonder why the following function
is not decently assembled by ocaml :
-----------------------------------------
let rec conv n =
  let r = n mod 10
  and n' = n / 10 in
  if n' = 0 then r
  else r + 8 * (conv n')
-----------------------------------------
nor the C version is decently assembled by GCC
-----------------------------------------
int
conv (int n)
{
  int m = n / 10, r = n % 10;
  if (m > 0)
    return r + 8 * conv (m);
  return m;
}
-----------------------------------------
If fact the code generated by ocaml is
-----------------------------------------
Pr1__conv_56:
	subl	$4, %esp
.L101:
	movl	%eax, %ebx
	movl	$10, %ecx
	movl	%ebx, %eax
	sarl	$1, %eax
	cltd
	idivl	%ecx
	lea	1(, %edx, 2), %esi
	movl	$10, %ecx
	movl	%ebx, %eax
	sarl	$1, %eax
	cltd
	idivl	%ecx
	lea	1(, %eax, 2), %eax
	cmpl	$1, %eax
	jne	.L100
	movl	%esi, %eax
	addl	$4, %esp
	ret
	.align	16
.L100:
	movl	%esi, 0(%esp)
	call	Pr1__conv_56
.L102:
	sarl	$1, %eax
	movl	%eax, %ebx
	sall	$4, %ebx
	movl	0(%esp), %eax
	addl	%ebx, %eax
	addl	$4, %esp
	ret
-----------------------------------------
and the GCC generated code is (-O3 -fomit-frame-pointer -finline)
-----------------------------------------
conv:
	subl $24,%esp
	pushl %ebx
	movl 32(%esp),%ecx
	movl $1717986919,%edx
	movl %edx,%eax
	imull %ecx
	sarl $2,%edx
	movl %ecx,%eax
	sarl $31,%eax
	subl %eax,%edx
	leal (%edx,%edx,4),%eax
	addl %eax,%eax
	movl %ecx,%ebx
	subl %eax,%ebx
	testl %edx,%edx
	jle .L3
	addl $-12,%esp
	pushl %edx
	call conv
	leal (%ebx,%eax,8),%eax
	addl $16,%esp
	jmp .L2
	.p2align 4,,7
.L3:
	movl %edx,%eax
.L2:
	popl %ebx
	addl $24,%esp
	ret
-----------------------------------------
The code is inefficent because
- only one "idiv" instruction is needed to get bot the quotient
  both the remainder while the assembled code does use to "idiv"
  instruction;
- the function is called recursively while this is not really
  necessary (I know that in other cases both ocaml and GCC does
  transform a recursive function in one simple loop)

I've written the assembled code which implements the required
function by hand (which takes two argument the second of which
should be 10):
-----------------------------------------
	.file	"asmtest.s"
.text
	.align 4
.globl conv
	.type 	conv,@function
conv:
	pushl %ebp
	movl %esp,%ebp
	pushl %ebx
	pushl %ecx
	pushl %edx
	movl 8(%ebp),%eax
	movl 12(%ebp),%ebx
# in the following loop I use idiv to get the remainders
.L1:
	cltd
	idivl %ebx
	pushl %edx # we push the remainder into the stack
	testl %eax,%eax
	jne .L1 # end of the loop
	movl %ebp,%eax
	subl $16,%eax
	subl %esp,%eax
	sarl $2,%eax
	movl %eax,%ecx # now %ecx does contain the number of
#                      # remainders pushed into the stack
	popl %eax
# now there is a cycle that form the octal number
# using the formula : (...((r0 * 8 + r1) * 8 + r2) *8 + ...)*8 + rn
# where r0 is the most significant digit (remainder)
.L2:
	sall $3,%eax # multiply by 8
	popl %ebx
	addl %ebx,%eax
	decl %ecx
	jne .L2 # end of the loop
	popl %edx
	popl %ecx
	popl %ebx
	leave
	ret # in %eax we have the correct result
-----------------------------------------

So my answer is why nor Ocaml nor GCC does generate efficient
assembler code ?

I will attempt to give a tentative answer
- for some reason the compiler does not understands the (n mod 10)
  and (n /10) both can be avaluated with a simgle "idiv"
  instruction
- for some reason the compilers does not conceive to have a loop
  which push something on the stack at each cycle.

While the question of the two "idiv" instead of one is not of
practical importance the different implementation of the loop
is quite significative.
Anyway both problems are of academic importance (insn't it ?).

comments are welcome,
bye,
-- 
Francesco Abbate <segfault@email.it>

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners