caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
* Performance questions, -inline, ...
@ 2008-01-03 16:28 Kuba Ober
  2008-01-03 17:11 ` [Caml-list] " Edgar Friendly
  2008-01-05 19:36 ` Jon Harrop
  0 siblings, 2 replies; 19+ messages in thread
From: Kuba Ober @ 2008-01-03 16:28 UTC (permalink / raw)
  To: caml-list

I haven't looked at assembly output yet, but I've run into some unexpected
behavior in my benchmarks.

This was compiled by ocamlopt -inline 100 -unsafe, the results and code are 
below (MIPS is obtained by dividing 50 million iterations by (Unix.times 
()) . Unix.tms_utime it took to run). I haven't included the timing etc. code 
(it's part of a larger benchmark).

What I wonder is why vector-to-vector add is so much faster than (constant) 
scalar to vector add. Vectors are preinitialized each time with a 1.0000, 
1.0001, ... sequence.

Also, the very bad performance from generic vector-to-vector *with* inlining 
is another puzzler, whereas generic add of scalar-to-scalar performs 
similarly to straight-coded one.

Cheers, Kuba

* add1: add scalar to scalar   120 MIPS
* add3: add scalar to vector   250 MIPS
* add5: add vector to vector   320 MIPS
* add2: generic add scalar to scalar   100 MIPS
* add4: generic add vector to vector   38 MIPS

let start = 1.3

(* generic scalar operation *)
let op1 op const nloop =
	let accum = ref start in
	for i = 1 to nloop do
		accum := op !accum const
	done

(* generic vector operation *)
let op2 op const a b (nloop : int) =
	let len = Array.length a in
	for j = 0 to len-1 do
		for i = 0 to len-1 do
			b.(i) <- op a.(i) b.(i)
		done;
	done

(** addition **)
let add1 nloop =
	let accum = ref start in
	for i = 1 to nloop do
		accum := !accum +. addconst
	done
let add2 = op1 ( +. ) addconst
let add3 a b nloop =
	let len = Array.length a in
	for j = 0 to len-1 do
		for i = 0 to len-1 do
			b.(i) <- a.(i) +. addconst
		done;
	done
let add4 = op2 ( +. ) addconst
let add5 a b nloop =
	let len = Array.length a in
	for j = 0 to len-1 do
		for i = 0 to len-1 do
			b.(i) <- a.(i) +. b.(i)
		done;
	done


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-03 16:28 Performance questions, -inline, Kuba Ober
@ 2008-01-03 17:11 ` Edgar Friendly
  2008-01-05 18:09   ` Kuba Ober
  2008-01-05 19:36 ` Jon Harrop
  1 sibling, 1 reply; 19+ messages in thread
From: Edgar Friendly @ 2008-01-03 17:11 UTC (permalink / raw)
  To: Kuba Ober; +Cc: caml-list


On Thu, 2008-01-03 at 11:28 -0500, Kuba Ober wrote:
> I haven't looked at assembly output yet, but I've run into some unexpected
> behavior in my benchmarks.
> 
> This was compiled by ocamlopt -inline 100 -unsafe, the results and code are 
> below (MIPS is obtained by dividing 50 million iterations by (Unix.times 
> ()) . Unix.tms_utime it took to run). I haven't included the timing etc. code 
> (it's part of a larger benchmark).
> 
> What I wonder is why vector-to-vector add is so much faster than (constant) 
> scalar to vector add. Vectors are preinitialized each time with a 1.0000, 
> 1.0001, ... sequence.
> 
> Also, the very bad performance from generic vector-to-vector *with* inlining 
> is another puzzler, whereas generic add of scalar-to-scalar performs 
> similarly to straight-coded one.
> 
> Cheers, Kuba
> 
> * add1: add scalar to scalar   120 MIPS
> * add3: add scalar to vector   250 MIPS
> * add5: add vector to vector   320 MIPS
> * add2: generic add scalar to scalar   100 MIPS
> * add4: generic add vector to vector   38 MIPS
> 
> let start = 1.3
> 
> (* generic scalar operation *)
> let op1 op const nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := op !accum const
> 	done
> 
> (* generic vector operation *)
> let op2 op const a b (nloop : int) =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- op a.(i) b.(i)
> 		done;
> 	done
> 
> (** addition **)
> let add1 nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := !accum +. addconst
> 	done
> let add2 = op1 ( +. ) addconst
> let add3 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. addconst
> 		done;
> 	done
> let add4 = op2 ( +. ) addconst
> let add5 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. b.(i)
> 		done;
> 	done
> 
how about:

(* generic vector operation *)
let op2 op a b nloop =
	let len = Array.length a in
	for j = 0 to nloop do
		for i = 0 to len-1 do
			b.(i) <- op a.(i) b.(i)
		done;
	done

let add4 = op2 (+.)


Why does your code have the j loops?  You add a constant (or vector
element) a number of times equal to the length of your vector?  Do you
mean for j = 1 to nloop do ...  And I'd move that out of the test
function if I could, into the testing harness.

Arrays of floats have some optimizations built in to the compiler (no
boxing, even though they're not 31-bit values), so you should get as
good performance as you'll get.  

E.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-03 17:11 ` [Caml-list] " Edgar Friendly
@ 2008-01-05 18:09   ` Kuba Ober
  2008-01-05 18:44     ` Kuba Ober
  0 siblings, 1 reply; 19+ messages in thread
From: Kuba Ober @ 2008-01-05 18:09 UTC (permalink / raw)
  To: caml-list

> how about:
>
> (* generic vector operation *)
> let op2 op a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to nloop do
> 		for i = 0 to len-1 do
> 			b.(i) <- op a.(i) b.(i)
> 		done;
> 	done
>
> let add4 = op2 (+.)

This doesn't change a thing, as I've said the code is part
of a larger benchmark, that's why there are some unused parameters
etc. They have no effect on performance.

> Why does your code have the j loops?

Simply to execute the operation Array.length ^ 2 times :)

> You add a constant (or vector 
> element) a number of times equal to the length of your vector?

No, equal to the square of it. I didn't want to have too big vectors. The
stuff executes floor(sqrt(50e6)) times.

> Arrays of floats have some optimizations built in to the compiler (no
> boxing, even though they're not 31-bit values), so you should get as
> good performance as you'll get.

Are you saying that it may be faster to use a one-element array than a ref? 
That'd be curious at best, and a WTF otherwise ;)

Cheers, Kuba


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-05 18:09   ` Kuba Ober
@ 2008-01-05 18:44     ` Kuba Ober
  0 siblings, 0 replies; 19+ messages in thread
From: Kuba Ober @ 2008-01-05 18:44 UTC (permalink / raw)
  To: caml-list

> > You add a constant (or vector
> > element) a number of times equal to the length of your vector?
>
> No, equal to the square of it. I didn't want to have too big vectors. The
> stuff executes floor(sqrt(50e6)) times.

I meant floor(sqrt(50e6))^2 times.

Cheers, Kuba


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-03 16:28 Performance questions, -inline, Kuba Ober
  2008-01-03 17:11 ` [Caml-list] " Edgar Friendly
@ 2008-01-05 19:36 ` Jon Harrop
  2008-01-05 20:31   ` Bünzli Daniel
  2008-01-07 13:48   ` Kuba Ober
  1 sibling, 2 replies; 19+ messages in thread
From: Jon Harrop @ 2008-01-05 19:36 UTC (permalink / raw)
  To: caml-list


Optimizing numerical code is discussed in detail in my book OCaml for 
Scientists. You may also be interested in a very similar thread where I 
optimized someone's almost identical code on the beginners list recently. 
There is also this relevant blog post of mine:

  http://ocamlnews.blogspot.com/2007/09/spectral-norm.html

Essentially, your benchmark has rediscovered the fact that abstractions (HOFs, 
polymorphism etc.) are prohibitively slow for high-performance numerics and 
must be avoided. There are also some minor boxing issues at play.

On Thursday 03 January 2008 16:28:30 Kuba Ober wrote:
> I haven't looked at assembly output yet, but I've run into some unexpected
> behavior in my benchmarks.

Your benchmarks aren't sufficiently well defined to convey information about 
anything specific, so you're highly likely to misinterpret what you see.

> This was compiled by ocamlopt -inline 100 -unsafe,

You should use Array.unsafe_get and _set rather than the command-line 
option -unsafe because the latter breaks correct code.

> What I wonder is why vector-to-vector add is so much faster than (constant)
> scalar to vector add. Vectors are preinitialized each time with a 1.0000,
> 1.0001, ... sequence.

This is also highly likely to be platform and architecture dependent.

> Also, the very bad performance from generic vector-to-vector *with*
> inlining is another puzzler, whereas generic add of scalar-to-scalar
> performs similarly to straight-coded one.
>
> Cheers, Kuba
>
> * add1: add scalar to scalar   120 MIPS
> * add3: add scalar to vector   250 MIPS
> * add5: add vector to vector   320 MIPS
> * add2: generic add scalar to scalar   100 MIPS
> * add4: generic add vector to vector   38 MIPS
>
> let start = 1.3
>
> (* generic scalar operation *)
> let op1 op const nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := op !accum const
> 	done

You probably meant to return "!accum".

> (* generic vector operation *)
> let op2 op const a b (nloop : int) =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- op a.(i) b.(i)
> 		done;
> 	done
>
> (** addition **)
> let add1 nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := !accum +. addconst
> 	done

Again, should probably return "!accum". However, you can encourage OCaml to 
unbox the float by returning "!accum + 0.0" instead.

> let add2 = op1 ( +. ) addconst

This should be slower than "add1".

> let add3 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. addconst
> 		done;
> 	done

The loop over "j" can be removed.

> let add4 = op2 ( +. ) addconst

This will be slow because "op2" is polymorphic and "+." will not be inlined.

> let add5 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. b.(i)
> 		done;
> 	done

This increments "b" by "a" many times. Replace the repeated adds with a single 
multiply for each element.

-- 
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/products/?e


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-05 19:36 ` Jon Harrop
@ 2008-01-05 20:31   ` Bünzli Daniel
  2008-01-07 13:48   ` Kuba Ober
  1 sibling, 0 replies; 19+ messages in thread
From: Bünzli Daniel @ 2008-01-05 20:31 UTC (permalink / raw)
  To: Kuba Ober; +Cc: caml-list caml-list

Have also a look at this page

http://caml.inria.fr/pub/old_caml_site/ocaml/numerical.html

Best,

Daniel


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-05 19:36 ` Jon Harrop
  2008-01-05 20:31   ` Bünzli Daniel
@ 2008-01-07 13:48   ` Kuba Ober
  2008-01-07 14:41     ` Jon Harrop
  1 sibling, 1 reply; 19+ messages in thread
From: Kuba Ober @ 2008-01-07 13:48 UTC (permalink / raw)
  To: caml-list

On Saturday 05 January 2008, Jon Harrop wrote:
> Optimizing numerical code is discussed in detail in my book OCaml for
> Scientists. You may also be interested in a very similar thread where I
> optimized someone's almost identical code on the beginners list recently.
> There is also this relevant blog post of mine:
>
>   http://ocamlnews.blogspot.com/2007/09/spectral-norm.html
>
> Essentially, your benchmark has rediscovered the fact that abstractions
> (HOFs, polymorphism etc.) are prohibitively slow for high-performance
> numerics and must be avoided.

In the case of the particular OCaml implementation - yes. In general - no. I 
hope we agree here.

> On Thursday 03 January 2008 16:28:30 Kuba Ober wrote:
> > I haven't looked at assembly output yet, but I've run into some
> > unexpected behavior in my benchmarks.
>
> Your benchmarks aren't sufficiently well defined to convey information
> about anything specific, so you're highly likely to misinterpret what you
> see.

They are straight rewrites from C code and are used to compare how gcc and 
OCaml stack up.

> > This was compiled by ocamlopt -inline 100 -unsafe,
>
> You should use Array.unsafe_get and _set rather than the command-line
> option -unsafe because the latter breaks correct code.

This option is not affecting the execution speed in my case and thus can be 
dropped.

> > What I wonder is why vector-to-vector add is so much faster than
> > (constant) scalar to vector add. Vectors are preinitialized each time
> > with a 1.0000, 1.0001, ... sequence.
>
> This is also highly likely to be platform and architecture dependent.

The benchmark was run on the same machine and on the same day as the C code it 
was rewritten from :)

> > (* generic scalar operation *)
> > let op1 op const nloop =
> > 	let accum = ref start in
> > 	for i = 1 to nloop do
> > 		accum := op !accum const
> > 	done
>
> You probably meant to return "!accum".

The return value is ignored anyway. It's a benchmark, noone cares what
the result is. Or is it for performance reasons??

> > (* generic vector operation *)
> > let op2 op const a b (nloop : int) =
> > 	let len = Array.length a in
> > 	for j = 0 to len-1 do
> > 		for i = 0 to len-1 do
> > 			b.(i) <- op a.(i) b.(i)
> > 		done;
> > 	done
> >
> > (** addition **)
> > let add1 nloop =
> > 	let accum = ref start in
> > 	for i = 1 to nloop do
> > 		accum := !accum +. addconst
> > 	done
>
> Again, should probably return "!accum". However, you can encourage OCaml to
> unbox the float by returning "!accum + 0.0" instead.

OK, I don't quite get it. Are you talking about what the function should 
return? If so, are you implying that the function body will be compiled 
differently (better?) if a different type is returned?

> > let add2 = op1 ( +. ) addconst
>
> This should be slower than "add1".

But shouldn't. The way I write the code in this case should not affect the 
assembly the least bit. The loop with explicit operand, functional recursion 
and generic function-as-an-argument approach should all generate same 
assembly. Obviously enough they don't :)

> > let add3 a b nloop =
> > 	let len = Array.length a in
> > 	for j = 0 to len-1 do
> > 		for i = 0 to len-1 do
> > 			b.(i) <- a.(i) +. addconst
> > 		done;
> > 	done
>
> The loop over "j" can be removed.

Well, the goal was to iterate Array.length^2 times :)

> > let add4 = op2 ( +. ) addconst
>
> This will be slow because "op2" is polymorphic and "+." will not be
> inlined.

Yeah, I see that, but that shouldn't be the case if OCaml were to be serious 
in that department :)

> > let add5 a b nloop =
> > 	let len = Array.length a in
> > 	for j = 0 to len-1 do
> > 		for i = 0 to len-1 do
> > 			b.(i) <- a.(i) +. b.(i)
> > 		done;
> > 	done
>
> This increments "b" by "a" many times. Replace the repeated adds with a
> single multiply for each element.

It's benchmark code. It's supposed to check performance of 
increment-vector-by-a-vector operation. I was hoping it would be obvious 
enough :(

Cheers, Kuba


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 13:48   ` Kuba Ober
@ 2008-01-07 14:41     ` Jon Harrop
  2008-01-07 15:22       ` Kuba Ober
                         ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Jon Harrop @ 2008-01-07 14:41 UTC (permalink / raw)
  To: caml-list

On Monday 07 January 2008 13:48:40 Kuba Ober wrote:
> On Saturday 05 January 2008, Jon Harrop wrote:
> > Optimizing numerical code is discussed in detail in my book OCaml for
> > Scientists. You may also be interested in a very similar thread where I
> > optimized someone's almost identical code on the beginners list recently.
> > There is also this relevant blog post of mine:
> >
> >   http://ocamlnews.blogspot.com/2007/09/spectral-norm.html
> >
> > Essentially, your benchmark has rediscovered the fact that abstractions
> > (HOFs, polymorphism etc.) are prohibitively slow for high-performance
> > numerics and must be avoided.
>
> In the case of the particular OCaml implementation - yes. In general - no.
> I hope we agree here.

You mean it might be possible to recover the performance of C from numerical 
code with high-level abstractions? Yes. Indeed, I would like to see this 
done. However, I've never heard of an implementation of any language that can 
do this.

> > On Thursday 03 January 2008 16:28:30 Kuba Ober wrote:
> > > I haven't looked at assembly output yet, but I've run into some
> > > unexpected behavior in my benchmarks.
> >
> > Your benchmarks aren't sufficiently well defined to convey information
> > about anything specific, so you're highly likely to misinterpret what you
> > see.
>
> They are straight rewrites from C code and are used to compare how gcc and
> OCaml stack up.

I assume you didn't mimic run-time polymorphism, currying, closures and 
higher-order functions in the C though?

> > > This was compiled by ocamlopt -inline 100 -unsafe,
> >
> > You should use Array.unsafe_get and _set rather than the command-line
> > option -unsafe because the latter breaks correct code.
>
> This option is not affecting the execution speed in my case and thus can be
> dropped.

I find that surprising.

> > > What I wonder is why vector-to-vector add is so much faster than
> > > (constant) scalar to vector add. Vectors are preinitialized each time
> > > with a 1.0000, 1.0001, ... sequence.
> >
> > This is also highly likely to be platform and architecture dependent.
>
> The benchmark was run on the same machine and on the same day as the C code
> it was rewritten from :)

OCaml is only likely to compete with C in terms of numerical performance on 
modern CPUs, e.g. AMD64. If you're using an old CPU or running in 32-bit then 
OCaml can be up to 4x slower.

> > > (* generic scalar operation *)
> > > let op1 op const nloop =
> > > 	let accum = ref start in
> > > 	for i = 1 to nloop do
> > > 		accum := op !accum const
> > > 	done
> >
> > You probably meant to return "!accum".
>
> The return value is ignored anyway. It's a benchmark, noone cares what
> the result is.

If you do not care what the result of calls to this function are then you must 
also not care how quickly it runs. I cannot overemphasize how important this 
distinction is.

> Or is it for performance reasons?? 

In this case, I believe that will not affect performance because "accum" will 
always be boxed because it is polymorphic.

> > > (* generic vector operation *)
> > > let op2 op const a b (nloop : int) =
> > > 	let len = Array.length a in
> > > 	for j = 0 to len-1 do
> > > 		for i = 0 to len-1 do
> > > 			b.(i) <- op a.(i) b.(i)
> > > 		done;
> > > 	done
> > >
> > > (** addition **)
> > > let add1 nloop =
> > > 	let accum = ref start in
> > > 	for i = 1 to nloop do
> > > 		accum := !accum +. addconst
> > > 	done
> >
> > Again, should probably return "!accum". However, you can encourage OCaml
> > to unbox the float by returning "!accum + 0.0" instead.
>
> OK, I don't quite get it. Are you talking about what the function should
> return?

Yes. Your "add1" function is entirely dead code and, therefore, is trivially 
reducible. This is not true of real programs and, therefore, your results 
will not be representative of real programs. Moreover, there is no sense in 
trying to optimize a program that does nothing anyway.

> If so, are you implying that the function body will be compiled 
> differently (better?) if a different type is returned?

The change of type isn't the reason for the performance improvement. If you 
return "!accum" here, so "add1" returns "float" then the performance will 
probably not change. However, if you then change the return expression 
to "!accum + 0.0" then you're likely to see a significant improvement in 
performance because the OCaml compiler will unbox the floating point 
value "!accum" throughout this function, avoiding costly allocations.

You can't appreciate any of that without fixing your function first though.

> > > let add2 = op1 ( +. ) addconst
> >
> > This should be slower than "add1".
>
> But shouldn't. The way I write the code in this case should not affect the
> assembly the least bit. The loop with explicit operand, functional
> recursion and generic function-as-an-argument approach should all generate
> same assembly. Obviously enough they don't :)

The simplest route to recovering C performance here is:

. Inline "( +. )".
. Inline "op1".
. Type-specialize "op1".
. Hoist bounds checks.

> > > let add3 a b nloop =
> > > 	let len = Array.length a in
> > > 	for j = 0 to len-1 do
> > > 		for i = 0 to len-1 do
> > > 			b.(i) <- a.(i) +. addconst
> > > 		done;
> > > 	done
> >
> > The loop over "j" can be removed.
>
> Well, the goal was to iterate Array.length^2 times :)

Again, that makes your benchmark trivially reducible and, consequently, your 
results cannot be related to real code.

> > > let add4 = op2 ( +. ) addconst
> >
> > This will be slow because "op2" is polymorphic and "+." will not be
> > inlined.
>
> Yeah, I see that, but that shouldn't be the case if OCaml were to be
> serious in that department :)

AFAIK, there are no languages with implementations that are "serious in that 
department" by this definition, although I agree that it would be very useful 
in practice.

There are some trade-offs here though. Stalin-compiled Scheme, MLton-compiled 
SML and GHC-compiled Haskell do a lot more than OCaml in theory but, in 
practice, they are much less useful for high-performance numerics because 
their performance is so unpredictable.

> > > let add5 a b nloop =
> > > 	let len = Array.length a in
> > > 	for j = 0 to len-1 do
> > > 		for i = 0 to len-1 do
> > > 			b.(i) <- a.(i) +. b.(i)
> > > 		done;
> > > 	done
> >
> > This increments "b" by "a" many times. Replace the repeated adds with a
> > single multiply for each element.
>
> It's benchmark code. It's supposed to check performance of
> increment-vector-by-a-vector operation. I was hoping it would be obvious
> enough :(

Good benchmark code should not be trivially reducible in ways that real 
programs aren't, i.e. unless you're specifically testing the compiler's 
dead-code elimination. As you've written these functions, there is a lot of 
dead code. Some compilers (particularly C/C++) might eliminate the dead code 
and look better on this benchmark as a consequence but that is of little 
practical relevance.

I've already spent a long time meticulously contructing benchmarks in various 
different languages along these lines and the most important optimizations 
missing from OCaml are:

. Hoisting bounds checks.
. Common subexpression elimination.
. Type specialization.
. More aggressive inlining.
. Static optimization of div and mod.

The nice thing about OCaml is that it has a superb 64-bit code generator so, 
once you've done those optimizations on the hot paths manually, OCaml lets 
you run code as quickly as C but with all of the high-level benefits 
everywhere else.

A perfect language implementation would certainly do these optimizations for 
you and I wish OCaml did but I still haven't found anything better.

-- 
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/products/?e


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 14:41     ` Jon Harrop
@ 2008-01-07 15:22       ` Kuba Ober
  2008-01-07 19:58         ` Jon Harrop
  2008-01-07 15:31       ` Christophe Raffalli
  2008-01-07 17:00       ` Jacques Carette
  2 siblings, 1 reply; 19+ messages in thread
From: Kuba Ober @ 2008-01-07 15:22 UTC (permalink / raw)
  To: caml-list

On Monday 07 January 2008, Jon Harrop wrote:
> On Monday 07 January 2008 13:48:40 Kuba Ober wrote:
> > On Saturday 05 January 2008, Jon Harrop wrote:
> > > Optimizing numerical code is discussed in detail in my book OCaml for
> > > Scientists. You may also be interested in a very similar thread where I
> > > optimized someone's almost identical code on the beginners list
> > > recently. There is also this relevant blog post of mine:
> > >
> > >   http://ocamlnews.blogspot.com/2007/09/spectral-norm.html
> > >
> > > Essentially, your benchmark has rediscovered the fact that abstractions
> > > (HOFs, polymorphism etc.) are prohibitively slow for high-performance
> > > numerics and must be avoided.
> >
> > In the case of the particular OCaml implementation - yes. In general -
> > no. I hope we agree here.
>
> You mean it might be possible to recover the performance of C from
> numerical code with high-level abstractions? Yes. Indeed, I would like to
> see this done. However, I've never heard of an implementation of any
> language that can do this.

g++ does that reasonably well. Heck, I have a severely hacked in-house LISP 
system which had grown out of an assembler-with-LISP-macros, and it does it 
all the time. It's a LISP with static types at runtime, but general LISPiness 
at compile time (macros just run in gcl). In fact I'm looking to port it to 
OCaml just because type inference and commonplace LISP syntax for type 
declarations don't mix too well - the code starts looking really ugly. Maybe 
I stuck too much with LISP's usual declaim-this-and-that approach, but that's 
what you get when you reuse most of underlying LISP's implementation.

> > They are straight rewrites from C code and are used to compare how gcc
> > and OCaml stack up.
>
> I assume you didn't mimic run-time polymorphism, currying, closures and
> higher-order functions in the C though?

No. But while those are very useful features to have, the code is messy if 
you've got two languages to work with (hi-perf stuff in C, everything else in 
OCaml).

> > > > (* generic scalar operation *)
> > > > let op1 op const nloop =
> > > > 	let accum = ref start in
> > > > 	for i = 1 to nloop do
> > > > 		accum := op !accum const
> > > > 	done
> > >
> > > You probably meant to return "!accum".
> >
> > The return value is ignored anyway. It's a benchmark, noone cares what
> > the result is.
>
> If you do not care what the result of calls to this function are then you
> must also not care how quickly it runs. I cannot overemphasize how
> important this distinction is.

I assumed (correctly) that OCaml's optimizer isn't all that clever. This is 
something you definitely have to watch for when using gcc/g++.

> > > > (** addition **)
> > > > let add1 nloop =
> > > > 	let accum = ref start in
> > > > 	for i = 1 to nloop do
> > > > 		accum := !accum +. addconst
> > > > 	done
> > >
> > > Again, should probably return "!accum". However, you can encourage
> > > OCaml to unbox the float by returning "!accum + 0.0" instead.
> >
> > OK, I don't quite get it. Are you talking about what the function should
> > return?
>
> Yes. Your "add1" function is entirely dead code and, therefore, is
> trivially reducible. This is not true of real programs and, therefore, your
> results will not be representative of real programs. Moreover, there is no
> sense in trying to optimize a program that does nothing anyway.

If you code such a thing in gcc, it correctly becomes a no-op. OCaml isn't all 
that clever ;)

You're 100% academically-correct of course.

> > If so, are you implying that the function body will be compiled
> > differently (better?) if a different type is returned?
>
> The change of type isn't the reason for the performance improvement. If you
> return "!accum" here, so "add1" returns "float" then the performance will
> probably not change. However, if you then change the return expression
> to "!accum + 0.0" then you're likely to see a significant improvement in
> performance because the OCaml compiler will unbox the floating point
> value "!accum" throughout this function, avoiding costly allocations.

Ah-ha, I have to try that!

> > > > let add2 = op1 ( +. ) addconst
> > >
> > > This should be slower than "add1".
> >
> > But shouldn't. The way I write the code in this case should not affect
> > the assembly the least bit. The loop with explicit operand, functional
> > recursion and generic function-as-an-argument approach should all
> > generate same assembly. Obviously enough they don't :)
>
> The simplest route to recovering C performance here is:
>
> . Inline "( +. )".
> . Inline "op1".
> . Type-specialize "op1".
> . Hoist bounds checks.

You mean "the programmer has to do this"? OCaml compiler is *really* bad, 
then.

> > > > let add4 = op2 ( +. ) addconst
> > >
> > > This will be slow because "op2" is polymorphic and "+." will not be
> > > inlined.
> >
> > Yeah, I see that, but that shouldn't be the case if OCaml were to be
> > serious in that department :)
>
> AFAIK, there are no languages with implementations that are "serious in
> that department" by this definition, although I agree that it would be very
> useful in practice.

I have a very broken inlining statically-typed LISP implementation which does 
that all the time. For code which runs on 8-bit hardware and is better than 
stock C compiler that came with it (Zilog's Encore! platform). It's really 
not all that hard to do. It's hardly a full-fledged implementation, and it's 
buggy, but it's only goal is to compile some very specific codebase. A hack, 
but it works, and I write everything functional-style, with functions passed 
left and right, etc. In fact, the compiler guarantees that a stateless 
function, if passed as an argument, is equivalent to coding it in-line. It's 
all DSP code.

> There are some trade-offs here though. Stalin-compiled Scheme,
> MLton-compiled SML and GHC-compiled Haskell do a lot more than OCaml in
> theory but, in practice, they are much less useful for high-performance
> numerics because their performance is so unpredictable.

Good to know.

> I've already spent a long time meticulously contructing benchmarks in
> various different languages along these lines and the most important
> optimizations missing from OCaml are:
>
> . Hoisting bounds checks.
> . Common subexpression elimination.
> . Type specialization.
> . More aggressive inlining.
> . Static optimization of div and mod.
>
> The nice thing about OCaml is that it has a superb 64-bit code generator
> so, once you've done those optimizations on the hot paths manually, OCaml
> lets you run code as quickly as C but with all of the high-level benefits
> everywhere else.
>
> A perfect language implementation would certainly do these optimizations
> for you and I wish OCaml did but I still haven't found anything better.

After I gain some more experience I will hopefully add an OCaml-esque front 
end to my hackish compiler. It's completely unportable, only works on 
12-bit-PIC-like architecture (SX from Parallax) and on Z8 Encore!, but hey, 
maybe it can be a good starting point to something bigger and better. The 
backend still has to be ported from Lisp, and I'm too time- and 
knowledge-constrained to do it just yet.

The code is 70% Lisp macros, reuses every bit of underlying LISP platform 
it can, and as of yet I don't know how clean the ported OCaml would look. 
Needless to say, I'd never get this done without having the macros available, 
as in "this would take me forever in C/C++". The way it currenly runs is as 
follows: the source code is run via a LISP program which creates another LISP 
program which then generates the linked binary to be downloaded to the 
target.

I was originally hoping to reuse most of OCaml's compiler/optimizer and just 
write a platform-specific backend. It now seems that it's easier for me to 
add a hacked OCaml front-end to my existing system. Probably it will just 
translate OCaml to LISP and then process it as it originally was. I really 
like OCaml syntax better, *although* some of my sources also use LISP macros. 
They could probably stay as-is for now if the OCaml parts will just get 
translated to LISP.

Cheers, Kuba


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 14:41     ` Jon Harrop
  2008-01-07 15:22       ` Kuba Ober
@ 2008-01-07 15:31       ` Christophe Raffalli
  2008-01-07 17:00       ` Jacques Carette
  2 siblings, 0 replies; 19+ messages in thread
From: Christophe Raffalli @ 2008-01-07 15:31 UTC (permalink / raw)
  To: Jon Harrop; +Cc: caml-list

[-- Attachment #1: Type: text/plain, Size: 990 bytes --]


> 
> You mean it might be possible to recover the performance of C from numerical 
> code with high-level abstractions? Yes. Indeed, I would like to see this 
> done. However, I've never heard of an implementation of any language that can 
> do this.
> 

That's what MLTon tries to do ... I do not know how much they succeed, but monomorphisation and
defunctorization shoud do the job better than OCaml (look at the specific numerical benchmark for
MLTon ?)


-- 
Christophe Raffalli
Universite de Savoie
Batiment Le Chablais, bureau 21
73376 Le Bourget-du-Lac Cedex

tel: (33) 4 79 75 81 03
fax: (33) 4 79 75 87 42
mail: Christophe.Raffalli@univ-savoie.fr
www: http://www.lama.univ-savoie.fr/~RAFFALLI
---------------------------------------------
IMPORTANT: this mail is signed using PGP/MIME
At least Enigmail/Mozilla, mutt or evolution
can check this signature. The public key is
stored on www.keyserver.net
---------------------------------------------


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 252 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 14:41     ` Jon Harrop
  2008-01-07 15:22       ` Kuba Ober
  2008-01-07 15:31       ` Christophe Raffalli
@ 2008-01-07 17:00       ` Jacques Carette
  2008-01-07 17:07         ` Till Varoquaux
  2008-01-07 17:31         ` Kuba Ober
  2 siblings, 2 replies; 19+ messages in thread
From: Jacques Carette @ 2008-01-07 17:00 UTC (permalink / raw)
  To: Jon Harrop; +Cc: caml-list

Jon Harrop wrote:
> You mean it might be possible to recover the performance of C from numerical 
> code with high-level abstractions? Yes. Indeed, I would like to see this 
> done. However, I've never heard of an implementation of any language that can 
> do this.
>   
<shameless plug>
With MetaOCaml you can -- see either the long version
http://www.cas.mcmaster.ca/~carette/scp_metamonads.pdf
or the more condensed version
http://www.cas.mcmaster.ca/~carette/metamonads/index.html
</shameless plug>

With a little bit of work, you can achieve all of
> The simplest route to recovering C performance here is:
>
> . Inline "( +. )".
> . Inline "op1".
> . Type-specialize "op1".
> . Hoist bounds checks.
>   
automatically.  There are three drawbacks:
1) the code you write no longer looks like O'Caml but Lisp instead [can 
be fixed with enough campl4 hacking]
2) the error messages can be very difficult to figure out [could be 
improved a lot if monads were integrated in O'Caml]
3) metaocaml is not as well supported as ocaml

Jacques


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 17:00       ` Jacques Carette
@ 2008-01-07 17:07         ` Till Varoquaux
  2008-01-07 17:20           ` Jacques Carette
  2008-01-07 17:31         ` Kuba Ober
  1 sibling, 1 reply; 19+ messages in thread
From: Till Varoquaux @ 2008-01-07 17:07 UTC (permalink / raw)
  To: Jacques Carette; +Cc: Jon Harrop, caml-list

First link is dead link... which a shame because any article with
metaocaml monads and oleg is bound to be very interesting..
Till

On Jan 7, 2008 5:00 PM, Jacques Carette <carette@mcmaster.ca> wrote:
> Jon Harrop wrote:
> > You mean it might be possible to recover the performance of C from numerical
> > code with high-level abstractions? Yes. Indeed, I would like to see this
> > done. However, I've never heard of an implementation of any language that can
> > do this.
> >
> <shameless plug>
> With MetaOCaml you can -- see either the long version
> http://www.cas.mcmaster.ca/~carette/scp_metamonads.pdf
> or the more condensed version
> http://www.cas.mcmaster.ca/~carette/metamonads/index.html
> </shameless plug>
>
> With a little bit of work, you can achieve all of
> > The simplest route to recovering C performance here is:
> >
> > . Inline "( +. )".
> > . Inline "op1".
> > . Type-specialize "op1".
> > . Hoist bounds checks.
> >
> automatically.  There are three drawbacks:
> 1) the code you write no longer looks like O'Caml but Lisp instead [can
> be fixed with enough campl4 hacking]
> 2) the error messages can be very difficult to figure out [could be
> improved a lot if monads were integrated in O'Caml]
> 3) metaocaml is not as well supported as ocaml
>
> Jacques
>
>
> _______________________________________________
> Caml-list mailing list. Subscription management:
> http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
> Archives: http://caml.inria.fr
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs
>



-- 
http://till-varoquaux.blogspot.com/


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 17:07         ` Till Varoquaux
@ 2008-01-07 17:20           ` Jacques Carette
  0 siblings, 0 replies; 19+ messages in thread
From: Jacques Carette @ 2008-01-07 17:20 UTC (permalink / raw)
  To: Till Varoquaux; +Cc: caml-list

Oops, sorry, the link should have been

http://www.cas.mcmaster.ca/~carette/publications/scp_metamonads.pdf

Jacques

Till Varoquaux wrote:
> First link is dead link... which a shame because any article with
> metaocaml monads and oleg is bound to be very interesting..
> Till
>   


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 17:00       ` Jacques Carette
  2008-01-07 17:07         ` Till Varoquaux
@ 2008-01-07 17:31         ` Kuba Ober
  1 sibling, 0 replies; 19+ messages in thread
From: Kuba Ober @ 2008-01-07 17:31 UTC (permalink / raw)
  To: caml-list

On Monday 07 January 2008, Jacques Carette wrote:
> Jon Harrop wrote:
> > You mean it might be possible to recover the performance of C from
> > numerical code with high-level abstractions? Yes. Indeed, I would like to
> > see this done. However, I've never heard of an implementation of any
> > language that can do this.
>
> <shameless plug>
> With MetaOCaml you can -- see either the long version
> http://www.cas.mcmaster.ca/~carette/scp_metamonads.pdf
> or the more condensed version
> http://www.cas.mcmaster.ca/~carette/metamonads/index.html
> </shameless plug>

Yup, that's what my homegrown "compiler" does, although it is all LISP, and 
not really nicely structured - it has grown out of an assembler with LISP 
macros. Eventually the macros became powerful enough to do register 
allocation and optimizations, it was a short road to getting the "assembler" 
to dig generic expression input in LISP syntax too. Function inlining 
and "hoisting" are mostly trivial syntax tree operations -- once you have the 
predicates written to tell you whether it's safe to optimize in a given way.

From day 1 it was a LISP-syntax assembler, of course.  Which may look funny, 
but I never had to write a lexer for it, and "parsing" LISP isn't much :) In 
about 12klocs, for my application it generates assembly mostly on par with 
what I could ever write - 95% is DSP operations in fixed point (matrix by 
vector multiplies, vector adds, MACs (FIR/IIR)). It also handles saturations 
where necessary - I've added a "saturated integer" type which gracefully 
saturates on overflow. Kinda necessary in control loops.

To perform well, some of my applications can't even be coded in C unless you'd 
use some (AFAIK nonexistent) C-extensions, or resort to relatively unclean 
techniques like inlineable assembly functions. That's what gcc digs very 
well, but it becomes mostly unmaintainable crap for someone who only knows a 
higher-level language and assembly, but has no gcc experience. Most languages 
are also perfect at hiding the processor's state/flags register from you, and 
it's nigh impossible to expect the optimizer to deduce that all you want to 
do is long addition or multiplication. I can see (if it gets there) my OCaml 
front-end by default having the result type of + being an (int, bool) tuple 
that coerces to int by default, with the bool being the carry status. Thank 
$deity almost all CPUs know what carry means, so this could be pretty 
portable if I were after that -- which I'm not.

I gave up on working with gcc codebase mostly because C is a very unexpressive 
language for anything scientific-related (I wholeheartedly agree with Jon 
Harrop here). And I don't like the C++ "bolted-on" functional template 
metaprogramming either - it reads like a Turing-machine program to me. It is 
pretty nice in theory, but in practice I found it distracting. And the error 
messages are a yet another story.

But enough rambling, I have some thinking to do :)

Cheers, Kuba


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 15:22       ` Kuba Ober
@ 2008-01-07 19:58         ` Jon Harrop
  2008-01-08 14:20           ` Kuba Ober
  0 siblings, 1 reply; 19+ messages in thread
From: Jon Harrop @ 2008-01-07 19:58 UTC (permalink / raw)
  To: caml-list

On Monday 07 January 2008 15:22:40 Kuba Ober wrote:
> On Monday 07 January 2008, Jon Harrop wrote:
> > You mean it might be possible to recover the performance of C from
> > numerical code with high-level abstractions? Yes. Indeed, I would like to
> > see this done. However, I've never heard of an implementation of any
> > language that can do this.
>
> g++ does that reasonably well. Heck, I have a severely hacked in-house LISP
> system which had grown out of an assembler-with-LISP-macros, and it does it
> all the time. It's a LISP with static types at runtime, but general
> LISPiness at compile time (macros just run in gcl). In fact I'm looking to
> port it to OCaml just because type inference and commonplace LISP syntax
> for type declarations don't mix too well - the code starts looking really
> ugly. Maybe I stuck too much with LISP's usual declaim-this-and-that
> approach, but that's what you get when you reuse most of underlying LISP's
> implementation.

Yes. If you're willing to do that kind of hacking then you could get a long 
way with camlp4 without too much trouble.

> > I assume you didn't mimic run-time polymorphism, currying, closures and
> > higher-order functions in the C though?
>
> No. But while those are very useful features to have, the code is messy if
> you've got two languages to work with (hi-perf stuff in C, everything else
> in OCaml).

On 64-bit there is no performance benefit in dropping to C.

> > Yes. Your "add1" function is entirely dead code and, therefore, is
> > trivially reducible. This is not true of real programs and, therefore,
> > your results will not be representative of real programs. Moreover, there
> > is no sense in trying to optimize a program that does nothing anyway.
>
> If you code such a thing in gcc, it correctly becomes a no-op. OCaml isn't
> all that clever ;)
>
> You're 100% academically-correct of course.

As Xavier says, "the OCaml compiler is designed to compile good code".

> > The simplest route to recovering C performance here is:
> >
> > . Inline "( +. )".
> > . Inline "op1".
> > . Type-specialize "op1".
> > . Hoist bounds checks.
>
> You mean "the programmer has to do this"? OCaml compiler is *really* bad,
> then.

You're 100% academically-correct of course.

In practice, the combination of OCaml's awesome native-code generation on 
AMD64 and the ease with which you can work around those missing optimizations 
mean that OCaml is *really* good compared to all other language 
implementations with comparable qualities. This is precisely why it is so 
popular.

> > There are some trade-offs here though. Stalin-compiled Scheme,
> > MLton-compiled SML and GHC-compiled Haskell do a lot more than OCaml in
> > theory but, in practice, they are much less useful for high-performance
> > numerics because their performance is so unpredictable.
>
> Good to know.

OCaml's code generator is also much better than MLton's and GHC's (even 6.8).

> > I've already spent a long time meticulously contructing benchmarks in
> > various different languages along these lines and the most important
> > optimizations missing from OCaml are:
> >
> > . Hoisting bounds checks.
> > . Common subexpression elimination.
> > . Type specialization.
> > . More aggressive inlining.
> > . Static optimization of div and mod.
> >
> > The nice thing about OCaml is that it has a superb 64-bit code generator
> > so, once you've done those optimizations on the hot paths manually, OCaml
> > lets you run code as quickly as C but with all of the high-level benefits
> > everywhere else.
> >
> > A perfect language implementation would certainly do these optimizations
> > for you and I wish OCaml did but I still haven't found anything better.
>
> After I gain some more experience I will hopefully add an OCaml-esque front
> end to my hackish compiler. It's completely unportable, only works on
> 12-bit-PIC-like architecture (SX from Parallax) and on Z8 Encore!, but hey,
> maybe it can be a good starting point to something bigger and better. The
> backend still has to be ported from Lisp, and I'm too time- and
> knowledge-constrained to do it just yet.
> ...

I see. If you're not using OCaml as a general purpose language then you don't 
need its other benefits (e.g. GUI libraries).

I'm in a similar situation. I'm toying with the idea of building a better FPL 
implementation for high-performance numerics that is commerce friendly using 
LLVM. If you take the path of least resistance then I believe that is 
tractable but you lose nice things like LablGTK2...

-- 
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/products/?e


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-07 19:58         ` Jon Harrop
@ 2008-01-08 14:20           ` Kuba Ober
  2008-01-12 14:22             ` Jon Harrop
  0 siblings, 1 reply; 19+ messages in thread
From: Kuba Ober @ 2008-01-08 14:20 UTC (permalink / raw)
  To: caml-list

On Monday 07 January 2008, Jon Harrop wrote:
> On Monday 07 January 2008 15:22:40 Kuba Ober wrote:
> > On Monday 07 January 2008, Jon Harrop wrote:
> > > You mean it might be possible to recover the performance of C from
> > > numerical code with high-level abstractions? Yes. Indeed, I would like
> > > to see this done. However, I've never heard of an implementation of any
> > > language that can do this.
> >
> > g++ does that reasonably well. Heck, I have a severely hacked in-house
> > LISP system which had grown out of an assembler-with-LISP-macros, and it
> > does it all the time. It's a LISP with static types at runtime, but
> > general LISPiness at compile time (macros just run in gcl). In fact I'm
> > looking to port it to OCaml just because type inference and commonplace
> > LISP syntax for type declarations don't mix too well - the code starts
> > looking really ugly. Maybe I stuck too much with LISP's usual
> > declaim-this-and-that approach, but that's what you get when you reuse
> > most of underlying LISP's implementation.
>
> Yes. If you're willing to do that kind of hacking then you could get a long
> way with camlp4 without too much trouble.
>
> > > I assume you didn't mimic run-time polymorphism, currying, closures and
> > > higher-order functions in the C though?
> >
> > No. But while those are very useful features to have, the code is messy
> > if you've got two languages to work with (hi-perf stuff in C, everything
> > else in OCaml).
>
> On 64-bit there is no performance benefit in dropping to C.
>
> > > Yes. Your "add1" function is entirely dead code and, therefore, is
> > > trivially reducible. This is not true of real programs and, therefore,
> > > your results will not be representative of real programs. Moreover,
> > > there is no sense in trying to optimize a program that does nothing
> > > anyway.
> >
> > If you code such a thing in gcc, it correctly becomes a no-op. OCaml
> > isn't all that clever ;)
> >
> > You're 100% academically-correct of course.
>
> As Xavier says, "the OCaml compiler is designed to compile good code".
>
> > > The simplest route to recovering C performance here is:
> > >
> > > . Inline "( +. )".
> > > . Inline "op1".
> > > . Type-specialize "op1".
> > > . Hoist bounds checks.
> >
> > You mean "the programmer has to do this"? OCaml compiler is *really* bad,
> > then.
>
> You're 100% academically-correct of course.
>
> In practice, the combination of OCaml's awesome native-code generation on
> AMD64 and the ease with which you can work around those missing
> optimizations mean that OCaml is *really* good compared to all other
> language
> implementations with comparable qualities. This is precisely why it is so
> popular.
>
> > > There are some trade-offs here though. Stalin-compiled Scheme,
> > > MLton-compiled SML and GHC-compiled Haskell do a lot more than OCaml in
> > > theory but, in practice, they are much less useful for high-performance
> > > numerics because their performance is so unpredictable.
> >
> > Good to know.
>
> OCaml's code generator is also much better than MLton's and GHC's (even
> 6.8).
>
> > > I've already spent a long time meticulously contructing benchmarks in
> > > various different languages along these lines and the most important
> > > optimizations missing from OCaml are:
> > >
> > > . Hoisting bounds checks.
> > > . Common subexpression elimination.
> > > . Type specialization.
> > > . More aggressive inlining.
> > > . Static optimization of div and mod.
> > >
> > > The nice thing about OCaml is that it has a superb 64-bit code
> > > generator so, once you've done those optimizations on the hot paths
> > > manually, OCaml lets you run code as quickly as C but with all of the
> > > high-level benefits everywhere else.
> > >
> > > A perfect language implementation would certainly do these
> > > optimizations for you and I wish OCaml did but I still haven't found
> > > anything better.
> >
> > After I gain some more experience I will hopefully add an OCaml-esque
> > front end to my hackish compiler. It's completely unportable, only works
> > on 12-bit-PIC-like architecture (SX from Parallax) and on Z8 Encore!, but
> > hey, maybe it can be a good starting point to something bigger and
> > better. The backend still has to be ported from Lisp, and I'm too time-
> > and
> > knowledge-constrained to do it just yet.
> > ...
>
> I see. If you're not using OCaml as a general purpose language then you
> don't need its other benefits (e.g. GUI libraries).
>
> I'm in a similar situation. I'm toying with the idea of building a better
> FPL implementation for high-performance numerics that is commerce friendly
> using LLVM. If you take the path of least resistance then I believe that is
> tractable but you lose nice things like LablGTK2...

If you can do some code generation, it shouldn't be a big deal to implement 
even some very complex ABIs, say interfacing to C++ libraries. As long as you 
can cleanly run your language at compile time (like lisp does), and as long 
as the compiler provides a documented way to pop assembly into the code, you 
can have a nice language that can, in "natively compiled" output, interface 
say with Qt. IMHO, the latter is now a few years ahead of GTK, and is only 
gaining the advantage as time passes. Languages such as OCaml sorely lack in 
nontrivial ABI department - there's no reason to have to write (or even 
generate) C-style binding code for say Qt just to use it in OCaml. Both 
bytecode compiler and native code compiler should have a pluggable 
ABI-generator scheme where they can interface with C++ (at least). Another 
way to go about it would be to machine-translate C++ libraries to OCaml. This 
should be doable for Qt, but I don't think I'm up for that one just yet, even 
though I think about it.

If you want to generate code for a VM, the backend should allow selectable 
VMs - at least CLR and JVM, if the thing is to be of any relevance.

The language should, IMHO, also be flexible enough that features could be 
reasonably added to have it support resource-constrained architectures.

I'd expect the language to have a core feature set which doesn't do automated 
memory management beyond what can be inferred for fixed-lifetime data. This 
would be akin to working with good old C++, and should be easily portable to 
tiny targets. Then you have "level 2" feature set which is supposed to use 
garbage collecting virtual machines. I'd even add something like "level 1.5" 
which uses say C++ ABI but also links-in a garbage collector. So that you get 
automated memory management but no VM per se. Heck, for all I care you could 
have that thing emit code in C++ ABI by default - it's a reasonable-enough 
choice.

I find it annoying that writing good "PC" code requires a wholly different 
toolset from writing embedded code, the latter usually limited to really 
broken vendor C (or sometimes C++).

Cheers, Kuba


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-08 14:20           ` Kuba Ober
@ 2008-01-12 14:22             ` Jon Harrop
  2008-01-12 16:18               ` Dario Teixeira
  0 siblings, 1 reply; 19+ messages in thread
From: Jon Harrop @ 2008-01-12 14:22 UTC (permalink / raw)
  To: caml-list

On Tuesday 08 January 2008 14:20:03 Kuba Ober wrote:
> If you can do some code generation, it shouldn't be a big deal to implement
> even some very complex ABIs, say interfacing to C++ libraries. As long as 
> you can cleanly run your language at compile time (like lisp does), and as
> long as the compiler provides a documented way to pop assembly into the
> code, you can have a nice language that can, in "natively compiled" output,
> interface say with Qt.

Yes. This is one of the features that I would dearly love but it is also 
another one of the features that doesn't count as research so you're never 
likely to find it in a research implementation of a language like OCaml. F# 
has a fantastic FFI in comparison, for example.

Perhaps the best solution for OCaml is to interface with a popular dynamic 
language and borrow its bindings. I believe Richard Jones' Perl interface 
does this, although I've never used it myself.

The obvious flaw is that you end up with a very fast compiled language with a 
very slow interface boundary. That's fine for many cases, of course, but I'm 
particularly interested in high-performance numerics and visualization which 
really benefit from a high-performance FFI.

> IMHO, the latter is now a few years ahead of GTK, and is only gaining the
> advantage as time passes. 

May I ask what features Qt has that GTK does not?

My only gripe with GTK is that it is very slow and, consequently, I always 
seem to end up migrating my GUI code to OpenGL. I still think an OpenGL-based 
GUI written in OCaml would be great...

> Languages such as OCaml 
> sorely lack in nontrivial ABI department - there's no reason to have to
> write (or even generate) C-style binding code for say Qt just to use it in
> OCaml. Both bytecode compiler and native code compiler should have a
> pluggable ABI-generator scheme where they can interface with C++ (at
> least). Another way to go about it would be to machine-translate C++
> libraries to OCaml.

I'd be happy to interface with C and have no real preference about C++ myself.

-- 
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/products/?e


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-12 14:22             ` Jon Harrop
@ 2008-01-12 16:18               ` Dario Teixeira
  2008-01-12 23:50                 ` Jon Harrop
  0 siblings, 1 reply; 19+ messages in thread
From: Dario Teixeira @ 2008-01-12 16:18 UTC (permalink / raw)
  To: Jon Harrop, caml-list

Hi,

> > IMHO, the latter is now a few years ahead of GTK, and is only gaining the
> > advantage as time passes. 
> 
> May I ask what features Qt has that GTK does not?

Though some would argue this is a matter of taste, Qt feels like a
much more elegant API.  And yes, feature-wise is also a far more
comprehensive library.  It includes modules not only for the expected
GUI widgets, but also for database connectivity, XML processing,
network programming, easy integration with openGL, generation and
visualisation of SVG and PDF, etc, etc.  Moreover, the various modules
are well integrated and go well together.  To achieve the same degree
of functionality in Gtk-land, you need to mix in several independent
libraries (Gtk+Cairo+...), which not always feel like part of a coherent
whole.

You could of course argue that in the Ocaml world we have better solutions
for some of the modules present in Qt.  Ocamlnet is top-notch, and the
facilities for XML processing (such as Cduce and allies) are so good you
probably will find the similarly-purposed Qt modules unnecessary.
Nevertheless, just the graphics facilities present in Qt would more
than justify Ocaml bindings.

Incidentally, the Haskell folks are working on bindings:
http://qthaskell.sourceforge.net/
Does Haskell's FFI make this an easier task than Ocaml's?

Cheers,
Dario Teixeira



      ___________________________________________________________
Support the World Aids Awareness campaign this month with Yahoo! For Good http://uk.promotions.yahoo.com/forgood/


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Caml-list] Performance questions, -inline, ...
  2008-01-12 16:18               ` Dario Teixeira
@ 2008-01-12 23:50                 ` Jon Harrop
  0 siblings, 0 replies; 19+ messages in thread
From: Jon Harrop @ 2008-01-12 23:50 UTC (permalink / raw)
  To: caml-list

On Saturday 12 January 2008 16:18:46 Dario Teixeira wrote:
> > > IMHO, the latter is now a few years ahead of GTK, and is only gaining
> > > the advantage as time passes.
> >
> > May I ask what features Qt has that GTK does not?
>
> Though some would argue this is a matter of taste, Qt feels like a
> much more elegant API.  And yes, feature-wise is also a far more
> comprehensive library.  It includes modules not only for the expected
> GUI widgets, but also for database connectivity, XML processing,
> network programming, easy integration with openGL, generation and
> visualisation of SVG and PDF, etc, etc.  Moreover, the various modules
> are well integrated and go well together.  To achieve the same degree
> of functionality in Gtk-land, you need to mix in several independent
> libraries (Gtk+Cairo+...), which not always feel like part of a coherent
> whole.

I see. That's very interesting.

> You could of course argue that in the Ocaml world we have better solutions
> for some of the modules present in Qt.  Ocamlnet is top-notch, and the
> facilities for XML processing (such as Cduce and allies) are so good you
> probably will find the similarly-purposed Qt modules unnecessary.
> Nevertheless, just the graphics facilities present in Qt would more
> than justify Ocaml bindings.

I'm surprised to hear that. The last time I looked (a long time ago) I thought 
OCaml's support for such things (and unicode) was not that good but we do 
seem to be hearing about web-related successes with OCaml all the time and 
they must require working implementations of these kinds of libraries. Gerd 
always seems to be involved. ;-)

> Incidentally, the Haskell folks are working on bindings:
> http://qthaskell.sourceforge.net/

Yes. I noticed those Qt bindings for GHC being advertised recently but it 
turns out they are pre-alpha release have no applications using them and no 
Debian or Ubuntu packages providing them.

> Does Haskell's FFI make this an easier task than Ocaml's?

Haskell certainly has far more bindings to libraries than OCaml but it also 
has far fewer working applications using those bindings, which makes me 
suspicious. ;-)

-- 
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/products/?e


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2008-01-12 23:57 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-03 16:28 Performance questions, -inline, Kuba Ober
2008-01-03 17:11 ` [Caml-list] " Edgar Friendly
2008-01-05 18:09   ` Kuba Ober
2008-01-05 18:44     ` Kuba Ober
2008-01-05 19:36 ` Jon Harrop
2008-01-05 20:31   ` Bünzli Daniel
2008-01-07 13:48   ` Kuba Ober
2008-01-07 14:41     ` Jon Harrop
2008-01-07 15:22       ` Kuba Ober
2008-01-07 19:58         ` Jon Harrop
2008-01-08 14:20           ` Kuba Ober
2008-01-12 14:22             ` Jon Harrop
2008-01-12 16:18               ` Dario Teixeira
2008-01-12 23:50                 ` Jon Harrop
2008-01-07 15:31       ` Christophe Raffalli
2008-01-07 17:00       ` Jacques Carette
2008-01-07 17:07         ` Till Varoquaux
2008-01-07 17:20           ` Jacques Carette
2008-01-07 17:31         ` Kuba Ober

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).