Re: Value types (Was: [Caml-list] ocamlopt LLVM support)

caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed

From: "Török Edwin" <edwintorok@gmail.com>
To: "Jon Harrop" <jon@ffconsultancy.com>
Cc: <caml-list@inria.fr>
Subject: Re: Value types (Was: [Caml-list] ocamlopt LLVM support)
Date: Sun, 12 Dec 2010 19:26:32 +0200	[thread overview]
Message-ID: <20101212192632.6536a647@deb0> (raw)
In-Reply-To: <038301cb9a20$13af7d10$3b0e7730$@com>

[-- Attachment #1: Type: text/plain, Size: 2265 bytes --]

On Sun, 12 Dec 2010 17:14:45 -0000
"Jon Harrop" <jon@ffconsultancy.com> wrote:

> Török Edwin wrote:
> > Problem #1: Int64.rem n 2 -> another idiv instruction
> > 
> > A C compiler would optimize this to an 'and' instruction.
> > Change that to 'Int64.logand n 1L = 0L'/
> 
> Yes. LLVM did that for me.
> 
> > Problem #2: Int64.div n 2 -> idiv instruction.
> > 
> > A C compiler would optimize this to a right shift. Changing that to
> > 'Int64.shift_right n 1' speeds
> > up the code.
> 
> Yes. LLVM also did that for me. In fact, I have been bitten by
> ocamlopt not optimizing div and mod by a constant in real OCaml code
> before. This problem also turns up in the context of hash table
> implementations where you want to % by the length of the spine.

Do you really need to use Int64 for that though? Won't the 63-bit
version do?

> 
> > With these changes I get almost the same speed as the C code:
> > $ ocamlopt x.ml -o x && time ./x
> > 837799
> > real    0m0.664s
> > user    0m0.667s
> > sys     0m0.000s
> > 
> > $ gcc -O3 x.c && time ./a.out
> > 837799
> > real    0m0.635s
> > user    0m0.633s
> > sys     0m0.000s
> > 
> > Here's the OCaml code:
> > let rec collatzLen(c, n) : int =
> >     if n = 1L then c else
> >       collatzLen (c+1, if Int64.logand n 1L = 0L then
> > Int64.shift_right n 1 else Int64.add (Int64.mul 3L n) 1L);;
> > 
> >   let rec loop(i, (nlen, n)) =
> >     if i = 1L then n else
> >       let ilen = collatzLen(1, i) in
> >       let nlen, n = if ilen > nlen then ilen, i else nlen, n in
> >       loop (Int64.sub i 1L, (nlen, n));;
> > 
> >   let _ =
> >       let s = loop (1000000L, (1,1000000L)) in
> >       print_int (Int64.to_int s);;
> 
> I am unable to reproduce your results. Here, the time falls from 24s
> to 19.5s (using ocamlopt 3.12.0 on Intel x86) which is still 26×
> slower than HLVM.

Do you still have 'idiv' in the compiled code? See my attached
assembly, and compare it with yours please.
I was doing the test on 64-bit, with ocamlopt 3.11.2 and 3.12.0.

FWIW the original code took 2.8 seconds here, so only 4x slower (this
is an AMD Phenom II x6 1090T CPU). It probably depends on how fast/slow
the 'idiv' is on your CPU.

--Edwin

[-- Attachment #2: x.s --]
[-- Type: application/octet-stream, Size: 4971 bytes --]

	# Static data for module X (the OCaml program quoted in the message above).
	#
	# The two masks below are 16-byte-aligned constants: one has only the top
	# bit of its first quadword set, the other has every bit except that one.
	# Nothing in this listing references them; presumably they are emitted
	# unconditionally for float negation / absolute value -- TODO confirm.
	.section        .rodata.cst8,"a",@progbits
	.align	16
caml_negf_mask:	.quad   0x8000000000000000, 0
	.align	16
caml_absf_mask:	.quad   0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
	.data
	.globl	camlX__data_begin
camlX__data_begin:
	.text
	.globl	camlX__code_begin
camlX__code_begin:
	.data
	# The .quad placed immediately before each label below looks like an OCaml
	# block header word (size and tag) -- NOTE(review): confirm against the
	# runtime's header layout. The same constants 2048 and 2303 are written at
	# offset -8 of every heap block allocated in the code further down.
	.quad	2048
	.globl	camlX
	# Module block: two 8-byte slots. camlX__entry stores camlX__3 into the
	# first slot and camlX__2 into the second.
camlX:
	.space	16
	.data
	.quad	3319
	# Closure for "loop". camlX__entry calls through the first word
	# (caml_tuplify2) with this block in %rbx; the third word is the real
	# function body.
camlX__2:
	.quad	caml_tuplify2
	.quad	-3
	.quad	camlX__loop_1033
	.data
	.quad	3319
	# Closure for "collatzLen", same three-word shape as camlX__2.
camlX__3:
	.quad	caml_tuplify2
	.quad	-3
	.quad	camlX__collatzLen_1030
	.data
	.quad	2048
	# Statically allocated argument of the top-level call
	# loop (1000000L, (1, 1000000L)): field 0 points at a boxed 1000000,
	# field 1 points at the inner pair.
camlX__1:
	.quad	.L100004
	.quad	.L100005
	.quad	2048
	# Inner pair (1, 1000000L). The literal 3 is how the integer 1 appears
	# here; the code likewise uses $3 for the constant 1 passed to collatzLen.
.L100005:
	.quad	3
	.quad	.L100006
	.quad	2303
	# Boxed Int64: operations-table pointer followed by the 64-bit payload.
.L100006:
	.quad	caml_int64_ops
	.quad	1000000
	.quad	2303
	# Second boxed Int64 with the same payload (the first tuple component).
.L100004:
	.quad	caml_int64_ops
	.quad	1000000
	# camlX__collatzLen_1030 -- compiled form of
	#     let rec collatzLen (c, n) : int = ...
	# from the OCaml source quoted in the message.
	#
	# In:  %rax = c, in the shifted integer encoding used throughout this
	#             listing (the caller passes $3 for the source-level 1, and
	#             c+1 is computed below as "addq $2")
	#      %rbx = pointer to a boxed Int64 holding n (payload at offset 8)
	# Out: %rax = c at the moment n reaches 1
	#
	# The self-recursive call is compiled as a jump back to .L103, so the
	# function runs as a loop. Every iteration allocates a fresh 24-byte boxed
	# Int64 for the next n by decrementing %r15 and comparing it against
	# caml_young_limit; when the limit is crossed it calls caml_call_gc and
	# retries the allocation.
	.text
	.align	16
	.globl	camlX__collatzLen_1030
camlX__collatzLen_1030:
	subq	$8, %rsp
.L103:
	# Loop head: keep c in %rdi and test n = 1L.
	movq	%rax, %rdi
	movq	$1, %rsi
	movq	8(%rbx), %rax
	cmpq	%rsi, %rax
	jne	.L102
	# n = 1L: return c.
	movq	%rdi, %rax
	addq	$8, %rsp
	ret
	.align	4
.L102:
	# Parity test, Int64.logand n 1L = 0L. This is the 'and' that replaced
	# the idiv discussed in the message.
	xorq	%rdx, %rdx
	movq	$1, %rsi
	movq	8(%rbx), %rax
	andq	%rsi, %rax
	cmpq	%rdx, %rax
	jne	.L101
	# Even n: allocate a boxed Int64 and store n shifted right by one
	# (Int64.shift_right n 1).
.L104:	subq	$24, %r15
	movq	caml_young_limit@GOTPCREL(%rip), %rax
	cmpq	(%rax), %r15
	jb	.L105
	leaq	8(%r15), %rcx
	movq	$2303, -8(%rcx)
	movq	caml_int64_ops@GOTPCREL(%rip), %rax
	movq	%rax, (%rcx)
	movq	8(%rbx), %rax
	sarq	$1, %rax
	movq	%rax, 8(%rcx)
	jmp	.L100
	.align	4
.L101:
	# Odd n: allocate a boxed Int64 and store 3*n + 1
	# (Int64.add (Int64.mul 3L n) 1L).
.L107:	subq	$24, %r15
	movq	caml_young_limit@GOTPCREL(%rip), %rax
	cmpq	(%rax), %r15
	jb	.L108
	leaq	8(%r15), %rcx
	movq	$2303, -8(%rcx)
	movq	caml_int64_ops@GOTPCREL(%rip), %rax
	movq	%rax, (%rcx)
	movq	$1, %rdx
	movq	8(%rbx), %rsi
	movq	$3, %rax
	imulq	%rsi, %rax
	addq	%rdx, %rax
	movq	%rax, 8(%rcx)
.L100:
	# Tail call collatzLen (c+1, n'): bump c, make the new box the current n,
	# and loop.
	movq	%rdi, %rax
	addq	$2, %rax
	movq	%rcx, %rbx
	jmp	.L103
	# Allocation slow paths: run the GC, then retry the matching allocation.
.L108:	call	caml_call_gc@PLT
.L109:	jmp	.L107
.L105:	call	caml_call_gc@PLT
.L106:	jmp	.L104
	.type	camlX__collatzLen_1030,@function
	.size	camlX__collatzLen_1030,.-camlX__collatzLen_1030
	# camlX__loop_1033 -- compiled form of
	#     let rec loop (i, (nlen, n)) = ...
	# from the OCaml source quoted in the message.
	#
	# In:  %rax = pointer to a boxed Int64 holding i (payload at offset 8)
	#      %rbx = pointer to the pair (nlen, n): nlen at offset 0,
	#             n (a boxed Int64 pointer) at offset 8
	# Out: %rax = n (still boxed) once i reaches 1
	#
	# Stack slots used across the call to collatzLen:
	#      0(%rsp) = nlen, 8(%rsp) = n, 16(%rsp) = i
	#
	# The recursive call is a jump back to .L113. Each iteration allocates a
	# 24-byte pair for the chosen (nlen, n), then a 48-byte chunk holding a
	# copy of that pair plus the boxed value i - 1.
	.text
	.align	16
	.globl	camlX__loop_1033
camlX__loop_1033:
	subq	$24, %rsp
.L113:
	# Loop head: %rdx = i, %rax = n, then test i = 1L.
	movq	%rax, %rdx
	movq	8(%rbx), %rax
	movq	$1, %rsi
	movq	8(%rdx), %rdi
	cmpq	%rsi, %rdi
	jne	.L112
	# i = 1L: return n, already in %rax.
	addq	$24, %rsp
	ret
	.align	4
.L112:
	# Spill n, i and nlen, then call collatzLen (1, i); $3 is the encoded 1.
	movq	%rax, 8(%rsp)
	movq	%rdx, 16(%rsp)
	movq	(%rbx), %rax
	movq	%rax, 0(%rsp)
	movq	$3, %rax
	movq	%rdx, %rbx
	call	camlX__collatzLen_1030@PLT
.L114:
	# %rsi = ilen, %rbx = nlen; take .L111 when ilen <= nlen.
	movq	%rax, %rsi
	movq	0(%rsp), %rbx
	cmpq	%rbx, %rsi
	jle	.L111
	# ilen > nlen: build the pair (ilen, i).
.L115:	subq	$24, %r15
	movq	caml_young_limit@GOTPCREL(%rip), %rax
	cmpq	(%rax), %r15
	jb	.L116
	leaq	8(%r15), %rdi
	movq	$2048, -8(%rdi)
	movq	%rsi, (%rdi)
	movq	16(%rsp), %rax
	movq	%rax, 8(%rdi)
	jmp	.L110
	.align	4
.L111:
	# Otherwise: build the pair (nlen, n).
.L118:	subq	$24, %r15
	movq	caml_young_limit@GOTPCREL(%rip), %rax
	cmpq	(%rax), %r15
	jb	.L119
	leaq	8(%r15), %rdi
	movq	$2048, -8(%rdi)
	movq	%rbx, (%rdi)
	movq	8(%rsp), %rax
	movq	%rax, 8(%rdi)
.L110:
	# Build the arguments of the next iteration in one 48-byte allocation:
	# %rbx = fresh pair copied field-by-field from the pair in %rdi,
	# %rax = fresh boxed Int64 holding i - 1 (Int64.sub i 1L).
.L121:	subq	$48, %r15
	movq	caml_young_limit@GOTPCREL(%rip), %rax
	cmpq	(%rax), %r15
	jb	.L122
	leaq	8(%r15), %rbx
	movq	$2048, -8(%rbx)
	movq	(%rdi), %rax
	movq	%rax, (%rbx)
	movq	8(%rdi), %rax
	movq	%rax, 8(%rbx)
	leaq	24(%rbx), %rax
	movq	$2303, -8(%rax)
	movq	caml_int64_ops@GOTPCREL(%rip), %rdi
	movq	%rdi, (%rax)
	movq	$1, %rsi
	movq	16(%rsp), %rdi
	movq	8(%rdi), %rdi
	subq	%rsi, %rdi
	movq	%rdi, 8(%rax)
	jmp	.L113
	# Allocation slow paths: run the GC, then retry the matching allocation.
.L122:	call	caml_call_gc@PLT
.L123:	jmp	.L121
.L119:	call	caml_call_gc@PLT
.L120:	jmp	.L118
.L116:	call	caml_call_gc@PLT
.L117:	jmp	.L115
	.type	camlX__loop_1033,@function
	.size	camlX__loop_1033,.-camlX__loop_1033
	# camlX__entry -- module initialiser; corresponds to the top-level
	#     let _ = let s = loop (1000000L, (1, 1000000L)) in
	#             print_int (Int64.to_int s)
	# in the OCaml source quoted in the message.
	.text
	.align	16
	.globl	camlX__entry
camlX__entry:
	subq	$8, %rsp
.L124:
	# Publish the two closures in the module block:
	# slot 0 <- collatzLen closure, slot 1 <- loop closure.
	movq	camlX__3@GOTPCREL(%rip), %rbx
	movq	camlX@GOTPCREL(%rip), %rax
	movq	%rbx, (%rax)
	movq	camlX__2@GOTPCREL(%rip), %rbx
	movq	camlX@GOTPCREL(%rip), %rax
	movq	%rbx, 8(%rax)
	# Call loop through its closure: %rbx = closure read back from slot 1,
	# %rax = the static argument tuple camlX__1, target = first word of the
	# closure (caml_tuplify2 in the data section).
	movq	camlX@GOTPCREL(%rip), %rax
	movq	8(%rax), %rbx
	movq	camlX__1@GOTPCREL(%rip), %rax
	movq	(%rbx), %rdi
	call	*%rdi
.L125:
	# Int64.to_int: load the payload of the returned box and re-encode it as
	# 2*x + 1, the same integer encoding as the $3-for-1 constants elsewhere.
	movq	8(%rax), %rax
	salq	$1, %rax
	orq	$1, %rax
	# print_int appears here as string_of_int followed by output_string.
	call	camlPervasives__string_of_int_1130@PLT
.L126:
	movq	%rax, %rbx
	movq	camlPervasives@GOTPCREL(%rip), %rax
	# Field at byte offset 184 of the Pervasives module block is passed as the
	# channel -- presumably stdout; verify against pervasives.ml.
	movq	184(%rax), %rax
	call	camlPervasives__output_string_1191@PLT
.L127:
	# Return the constant 1 -- presumably the encoded unit value; TODO confirm.
	movq	$1, %rax
	addq	$8, %rsp
	ret
	.type	camlX__entry,@function
	.size	camlX__entry,.-camlX__entry
	# End-of-code / end-of-data markers followed by the module's frame table.
	.data
	.text
	.globl	camlX__code_end
camlX__code_end:
	.data
	.globl	camlX__data_end
camlX__data_end:
	.long	0
	.globl	camlX__frametable
	# The table starts with a count (9) and is followed by nine records, one
	# per label that immediately follows a call instruction in the code above
	# (.L127, .L126, .L125, .L123, .L120, .L117, .L114, .L109, .L106).
	#
	# NOTE(review): the field meanings below are inferred from the OCaml
	# runtime's frame-descriptor convention, not from this file -- confirm
	# before relying on them. Each record appears to be: return address,
	# frame size, number of live values, then one .word per live value.
	# The first two records additionally carry two .long words that
	# reference the "pervasives.ml" string at .L200000, presumably
	# source-location information.
camlX__frametable:
	.quad	9
	# Return point after the output_string call in camlX__entry.
	.quad	.L127
	.word	17
	.word	0
	.align	8
	.long	(.L200000 - .) + 0xe0000000
	.long	0x168120
	# Return point after the string_of_int call in camlX__entry.
	.quad	.L126
	.word	17
	.word	0
	.align	8
	.long	(.L200000 - .) + 0xe0000000
	.long	0x168270
	# Return point after the indirect call to loop in camlX__entry.
	.quad	.L125
	.word	16
	.word	0
	.align	8
	# GC return point for the 48-byte allocation in camlX__loop_1033.
	.quad	.L123
	.word	32
	.word	2
	.word	16
	.word	5
	.align	8
	# GC return point for the (nlen, n) pair allocation in camlX__loop_1033.
	.quad	.L120
	.word	32
	.word	3
	.word	3
	.word	8
	.word	16
	.align	8
	# GC return point for the (ilen, i) pair allocation in camlX__loop_1033.
	.quad	.L117
	.word	32
	.word	2
	.word	16
	.word	7
	.align	8
	# Return point after the call to collatzLen in camlX__loop_1033; the
	# listed offsets 0, 8 and 16 match the three stack slots spilled before
	# that call.
	.quad	.L114
	.word	32
	.word	3
	.word	0
	.word	8
	.word	16
	.align	8
	# GC return point for the odd-branch allocation in camlX__collatzLen_1030.
	.quad	.L109
	.word	16
	.word	2
	.word	3
	.word	5
	.align	8
	# GC return point for the even-branch allocation in camlX__collatzLen_1030.
	.quad	.L106
	.word	16
	.word	2
	.word	3
	.word	5
	.align	8
	# File name referenced by the two .long expressions above.
.L200000:
	.asciz	"pervasives.ml"
	.align	8
	# Empty .note.GNU-stack section -- the usual marker that the object does
	# not need an executable stack.
	.section .note.GNU-stack,"",%progbits

next prev parent reply	other threads:[~2010-12-12 17:26 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-12-12 14:54 Jon Harrop
2010-12-12 15:55 ` Török Edwin
2010-12-12 17:14   ` Jon Harrop
2010-12-12 17:26     ` Török Edwin [this message]
2010-12-12 18:01       ` Jon Harrop
2010-12-12 18:22         ` Török Edwin
2010-12-12 19:09   ` Benedikt Meurer
2010-12-12 19:20     ` John Carr
2010-12-14  9:43       ` Value types Goswin von Brederlow
2010-12-12 19:55     ` Value types (Was: [Caml-list] ocamlopt LLVM support) Török Edwin
2010-12-12 22:05       ` Jon Harrop
2010-12-12 22:27         ` Török Edwin
2010-12-12 23:41           ` Jon Harrop
2010-12-13  2:13             ` Eray Ozkural
2010-12-12 21:50     ` Jon Harrop
2010-12-13  8:43     ` Alain Frisch
2010-12-15 10:29       ` Benedikt Meurer
2010-12-15 13:15         ` Jon Harrop
2010-12-14  9:54   ` Value types Goswin von Brederlow
2010-12-12 19:53 ` Value types (Was: [Caml-list] ocamlopt LLVM support) Brian Hurt
2010-12-12 20:39   ` Jon Harrop

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20101212192632.6536a647@deb0 \
    --to=edwintorok@gmail.com \
    --cc=caml-list@inria.fr \
    --cc=jon@ffconsultancy.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).