caml-list - the Caml user's mailing list
* [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
@ 2002-10-18 20:09 Paul Stodghill
  2002-10-20  9:42 ` Xavier Leroy
  0 siblings, 1 reply; 8+ messages in thread
From: Paul Stodghill @ 2002-10-18 20:09 UTC (permalink / raw)
  To: caml-list

[-- Attachment #1: Type: text/plain, Size: 1959 bytes --]

Doug Bagley's Great Language Shootout reports that O'Caml and GCC-compiled
C get about the same performance for Matrix-Matrix Multiplication (MMM).
I'm seeing somewhat different results for my code. In particular, on my
machine (1.2 GHz Pentium III-M, 256 MB RAM), the O'Caml version of the code
is 6 times slower than the C version.

The codes are attached. A few of the differences between my code and
Doug's are that I have tiled the loops and that I use compile-time
constants for the matrix dimensions and the block size. I have not (yet)
hoisted the loop-invariant expressions.

Looking at the assembly produced by O'Caml and GCC, it appears that GCC
is performing loop unrolling (as requested with -funroll-loops) and
strength reduction in the inner loops. I can easily see how these two
optimizations could account for such a tremendous performance difference.

My question is this: I can obviously perform loop unrolling myself
by hand, but does ocamlopt perform strength reduction? Is there any way
I can get the O'Caml code close to the performance of the C code?

I can provide additional information about my setup if that would help.

Thanks.


quimby-xp$ uname -a
CYGWIN_NT-5.1 QUIMBY-XP 1.3.13(0.62/3/2) 2002-10-13 23:15 i686 unknown
quimby-xp$ ocamlopt.opt -v
The Objective Caml native-code compiler, version 3.06
Standard library directory: /usr/local/ocaml/lib/ocaml
quimby-xp$ gcc --version
gcc (GCC) 3.2 20020818 (prerelease)
Copyright (C) 2002 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

quimby-xp$ ocamlopt.opt -S -o mmm_ml.exe unix.cmxa mmm_ml.ml
quimby-xp$ gcc -O4 -funroll-loops -o mmm_c.exe mmm_c.c
quimby-xp$ ./mmm_ml
(* 86.83 mflops *)
quimby-xp$ ./mmm_ml
(* 88.16 mflops *)
quimby-xp$ ./mmm_ml
(* 89.07 mflops *)
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ 


[-- Attachment #2: mmm_ml.ml --]
[-- Type: application/octet-stream, Size: 1683 bytes --]

(* ocamlopt.opt -S -o mmm_ml.exe unix.cmxa mmm_ml.ml *)

open Printf

let t = 5;;
let n = 120;;


let rec f create mmm n =
  let (a,b,c) = create () in
    (* do the work once to load the cache *)
    mmm n a b c;
    (* do the work t times and time it *)
    let begin_time = Unix.gettimeofday () in
    let _ =
      for t = 1 to t do
	mmm n a b c
      done
    in let end_time = Unix.gettimeofday () in
    let n_f = float_of_int n in
    let total_flops = 2.*.(float_of_int t)*.n_f*.n_f*.n_f in
    let mflops = (total_flops /. (end_time -. begin_time)) /. 1e6 in
      printf "(* %.2f mflops *)\n" mflops;
      ()
;;

(* ***************************************************************** *)

let create_arrays () =
  let a = Array.create (n*n) 0. and
    b = Array.create (n*n) 0. and
    c = Array.create (n*n) 0. in
    (* Init a and b*)
    for i = 0 to n-1 do
      for j = 0 to n-1 do
	a.(i*n+j) <- (float_of_int (i+j));
	b.(i*n+j) <- (float_of_int (i+j));
      done
    done;
    (a,b,c)

(* ***************************************************************** *)

let mmm n (a: float array) (b: float array) (c: float array) =
  assert (120 mod 30 = 0);
  let num_blocks = 120/30 in
    for bi = 0 to num_blocks-1 do
      for bj = 0 to num_blocks-1 do
	for bk = 0 to num_blocks-1 do
	  for i = bi*30 to (bi+1)*30-1 do
	    for j = bj*30 to (bj+1)*30-1 do
	      for k = bk*30 to (bk+1)*30-1 do
		Array.unsafe_set c (i*120+j)
		  ((Array.unsafe_get c (i*120+j)) 
		   +. (Array.unsafe_get a (i*120+k)) 
		   *. (Array.unsafe_get b (k*120+j)));
	      done
	    done
	  done
	done
      done
    done
;;

(* 83.08 mflops (0.03% of peak) *)
let _ = f create_arrays mmm n;;


[-- Attachment #3: mmm_c.c --]
[-- Type: application/octet-stream, Size: 1304 bytes --]

#include <assert.h>
#include <stdio.h>

#include <sys/time.h>

#define T 5
#define N 120
#define BB 30

static double A[N*N];
static double B[N*N];
static double C[N*N];

void mmm();

int main(int argc, char** argv)
{
    int i, j, k;
    int t;

    struct timeval start_time;
    struct timeval end_time;

    double seconds;
    double total_flops;
    double mflops;

    /* init the matrices */
    for (i=0; i<N; i++)
	for (j=0; j<N; j++)
	    A[i*N+j] = B[i*N+j] = i+j;
    /* warm the cache */
    mmm();

    /* do the work T times and time it */
    gettimeofday(&start_time,NULL);
    for (t=1; t<=T; t++)
	mmm();
    gettimeofday(&end_time,NULL);

    seconds = (double)(end_time.tv_usec - start_time.tv_usec)/1e6;
    seconds += (double)(end_time.tv_sec - start_time.tv_sec);

    total_flops = 2.*(double)T*(double)N*(double)N*(double)N;
    mflops = (total_flops / seconds) / 1e6;

    printf("/* %.2f mflops */\n", mflops);

    return 0;
}

#define NUM_BLOCKS (N/BB)

void mmm()
{
    int i, j, k;
    int bi, bj, bk;

    assert (N % BB == 0);

    for (bi=0; bi<NUM_BLOCKS; bi++)
	for (bj=0; bj<NUM_BLOCKS; bj++)
	    for (bk=0; bk<NUM_BLOCKS; bk++)
		for (i=bi*BB; i<(bi+1)*BB; i++)
		    for (j=bj*BB; j<(bj+1)*BB; j++)
			for (k=bk*BB; k<(bk+1)*BB; k++)
			    C[i*N+j] += A[i*N+k] * B[k*N+j];
}


* Re: [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
  2002-10-18 20:09 [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Paul Stodghill
@ 2002-10-20  9:42 ` Xavier Leroy
  2002-10-20 18:06   ` Paul Stodghill
  0 siblings, 1 reply; 8+ messages in thread
From: Xavier Leroy @ 2002-10-20  9:42 UTC (permalink / raw)
  To: Paul Stodghill; +Cc: caml-list

> Looking at the assembly produced by O'Caml and GCC, it appears that GCC
> is performing loop unrolling (as requested with -funroll-loops) and
> strength reduction in the inner loops. I can easily see how these two
> optimizations could account for such a tremendous performance difference.
> 
> My question is this: I can obviously perform loop unrolling myself
> by hand, but does ocamlopt perform strength reduction?

No, it doesn't.  It doesn't hoist loop-invariant computations either.
(See http://caml.inria.fr/ocaml/numerical.html)

Multiplications are pretty expensive on modern processors, so your
code would run much faster if you hoist the computation i*120 out of
the j and k loops, and strength-reduce k*120 in the innermost
loop.
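
A rough sketch of what that could look like, ignoring the tiling for
brevity (mmm_hoisted, row_i and kb are just illustrative names, not from
your code):

let mmm_hoisted (a : float array) (b : float array) (c : float array) =
  for i = 0 to 119 do
    let row_i = i * 120 in                    (* i*120 hoisted out of j and k *)
    for j = 0 to 119 do
      let sum = ref (Array.unsafe_get c (row_i + j)) in
      let kb = ref 0 in                       (* strength-reduced k*120 *)
      for k = 0 to 119 do
        sum := !sum
               +. Array.unsafe_get a (row_i + k)
               *. Array.unsafe_get b (!kb + j);
        kb := !kb + 120                       (* add 120 instead of multiplying *)
      done;
      Array.unsafe_set c (row_i + j) !sum
    done
  done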

Another option worth considering is to swap the j and k loops, thus
making k*120 and a.(i*120+k) two invariants of the inner loop.  In
this case, you might not even have to strength-reduce k*120.
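
A sketch of that interchanged (ikj) version, again without the tiling
(mmm_ikj, row_i, row_k and a_ik are illustrative names only):

let mmm_ikj (a : float array) (b : float array) (c : float array) =
  for i = 0 to 119 do
    let row_i = i * 120 in
    for k = 0 to 119 do
      let row_k = k * 120 in                       (* invariant in the j loop *)
      let a_ik = Array.unsafe_get a (row_i + k) in (* also invariant *)
      for j = 0 to 119 do
        Array.unsafe_set c (row_i + j)
          (Array.unsafe_get c (row_i + j)
           +. a_ik *. Array.unsafe_get b (row_k + j))
      done
    done
  done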

Other loop optimizations such as loop unrolling would (I guess) make
much less of a difference -- modern processors are very good at branch
prediction, making loop overhead low.

Some more thoughts:

Bagley's code works on a "proper" matrix (an array of
arrays), while yours flattens the matrix into a single array.  There
are pros and cons to either approach, but notice that your approach
(without strength reduction) entails fewer loads but more
multiplications, which can be more expensive...
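
To spell out the trade-off, compare the element accesses (get_aoa and
get_flat are just illustrative helpers):

(* array of arrays: m.(i) loads the row, then .(j) loads the element *)
let get_aoa (m : float array array) i j = m.(i).(j)

(* flat array: a single load, but the index costs an i*n multiplication
   unless it is hoisted or strength-reduced as above *)
let get_flat (m : float array) n i j = m.(i * n + j)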

Tiling loops is good for really large matrices, but yours (in this
example) occupies "only" 115 KB, thus fits comfortably in L2 cache.
Perhaps tiling isn't beneficial in this case.

- Xavier Leroy



* Re: [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
  2002-10-20  9:42 ` Xavier Leroy
@ 2002-10-20 18:06   ` Paul Stodghill
  2002-10-20 21:58     ` Issac Trotts
  2002-10-21 12:53     ` [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Xavier Leroy
  0 siblings, 2 replies; 8+ messages in thread
From: Paul Stodghill @ 2002-10-20 18:06 UTC (permalink / raw)
  To: Xavier Leroy; +Cc: caml-list

>>>>> "Xavier" == Xavier Leroy <xavier.leroy@inria.fr> writes:
    Xavier> No, it doesn't. It doesn't hoist loop-invariant computations
    Xavier> either. (See http://caml.inria.fr/ocaml/numerical.html)

I'll take a look. Thanks.

    Xavier> Multiplications are pretty expensive on modern processors,
    Xavier> so your code would run much faster if you hoist the
    Xavier> computation i*120 out of the j and k loops, and
    Xavier> strength-reduce k*120 in the innermost loop.

I don't see how to do strength-reduction without introducing references,
which appears to reduce performance more than is gained.

    Xavier> Another option worth considering is to swap the j and k
    Xavier> loops, thus making k*120 and a.(i*120+k) two invariants of
    Xavier> the inner loop. In this case, you might not even have to
    Xavier> strength-reduce k*120.

I tried this, and indeed the performance of the ML code goes up to 95
Mflops. However, the performance of the C code goes down, as the ikj
loop ordering requires an additional store of C[i,j] in every iteration.

    Xavier> Other loop optimizations such as loop unrolling would (I
    Xavier> guess) make much less of a difference -- modern processors
    Xavier> are very good at branch prediction, making loop overhead
    Xavier> low.

Loop unrolling can also increase ILP. It can make a big difference in
performance.

Loop unrolling can be thought of as a way of getting some of the effect
of software pipelining without the complicated scheduling.
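
For illustration, unrolling the j loop of the ikj version by two gives the
scheduler two independent multiply-adds per iteration (mmm_unroll2 is just
an illustrative name; 120 is divisible by 2, so no clean-up loop is needed):

let mmm_unroll2 (a : float array) (b : float array) (c : float array) =
  for i = 0 to 119 do
    let ri = i * 120 in
    for k = 0 to 119 do
      let rk = k * 120 in
      let aik = Array.unsafe_get a (ri + k) in
      for jj = 0 to 59 do
        let j = 2 * jj in                    (* a shift, not a full multiply *)
        Array.unsafe_set c (ri + j)
          (Array.unsafe_get c (ri + j)
           +. aik *. Array.unsafe_get b (rk + j));
        Array.unsafe_set c (ri + j + 1)
          (Array.unsafe_get c (ri + j + 1)
           +. aik *. Array.unsafe_get b (rk + j + 1))
      done
    done
  done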


    Xavier> Some more thoughts:

    Xavier> Bagley's code works on a "proper" matrix (an array of
    Xavier> arrays), while yours flattens the matrix into a single
    Xavier> array. There are pros and cons to either approach, but
    Xavier> notice that your approach (without strength reduction)
    Xavier> entails fewer loads but more multiplications, which can be
    Xavier> more expensive...

Right. So it is a win when it is done in combination with strength reduction.


    Xavier> Tiling loops is good for really large matrices, but yours
    Xavier> (in this example) occupies "only" 115 KB, thus fits
    Xavier> comfortably in L2 cache. Perhaps tiling isn't beneficial in
    Xavier> this case.

True. This was a miscalculation on my part. However, the blocksize that
I chose makes the blocks fit in the L1 cache, so I was still getting a
measurable benefit.

    Xavier> - Xavier Leroy

Here is what I was trying to accomplish - I am involved with a project
that is trying to automate/generalize some of the tricks that ATLAS
(http://math-atlas.sourceforge.net/) uses for getting maximal
performance from matrix-matrix multiply (MMM). I knew about Bagley's
conclusion that O'Caml was competitive with C for MMM, so I was curious
how close O'Caml came to GCC on Blocked MMM. I would like to use O'Caml
as a target language over C or Fortran.

My conclusion, at this point, is that the current native O'Caml compiler
is not going to be competitive with a native C or Fortran compiler,
because it does not optimize the innermost loops as aggressively.

Thanks for your help.




* Re: [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
  2002-10-20 18:06   ` Paul Stodghill
@ 2002-10-20 21:58     ` Issac Trotts
  2002-10-23 10:16       ` [Caml-list] Re: float boxing (was: matrix-matrix multiply) Christophe TROESTLER
  2002-10-21 12:53     ` [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Xavier Leroy
  1 sibling, 1 reply; 8+ messages in thread
From: Issac Trotts @ 2002-10-20 21:58 UTC (permalink / raw)
  To: OCaml Mailing List

> I don't see how to do strength-reduction without introducing references,
> which appears to reduce performance more than is gained.

You might try converting your references to mutable fields.
Here's an example of the performance gain:

% cat ref.ml

let x = ref 1.0 in
let n = int_of_string Sys.argv.(1) in
for i = 1 to n do x := !x +. 1.0 done

% cat ref2.ml

type t = { mutable f:float };;
let x = { f = 1.0 } in
let n = int_of_string Sys.argv.(1) in
for i = 1 to n do x.f <- x.f +. 1.0 done

% ocamlopt -o ref{,.ml}

% ocamlopt -o ref2{,.ml}

% time ./ref 100000000
./ref 100000000  2.51s user 0.00s system 99% cpu 2.515 total

% time ./ref2 100000000
./ref2 100000000  1.54s user 0.01s system 100% cpu 1.542 total

The standard references take 1.6 times as much cpu time in this
case.

Issac
 



* Re: [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
  2002-10-20 18:06   ` Paul Stodghill
  2002-10-20 21:58     ` Issac Trotts
@ 2002-10-21 12:53     ` Xavier Leroy
  1 sibling, 0 replies; 8+ messages in thread
From: Xavier Leroy @ 2002-10-21 12:53 UTC (permalink / raw)
  To: Paul Stodghill; +Cc: caml-list

> I don't see how to do strength-reduction without introducing references,
> which appears to reduce performance more than is gained.

The compiler can optimize "local" references into variables, avoiding
the extra indirection normally associated with references.  But even
with this optimization, there might be issues related to the small
number of registers available on the x86.
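
To illustrate the "local" case (row_sum, kb and s below are just made-up
names): references that never escape the function, as here, can be turned
into plain mutable variables, whereas a ref that is stored in a data
structure or passed to another function cannot.

(* kb and s never escape row_sum, so the compiler may keep them as
   ordinary mutable locals instead of heap-allocated reference cells *)
let row_sum (b : float array) k =
  let kb = ref (k * 120) in
  let s = ref 0.0 in
  for _j = 0 to 119 do
    s := !s +. Array.unsafe_get b !kb;
    incr kb
  done;
  !s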

> Here is what I was trying to accomplish - I am involved with a project
> that is trying to automate/generalize some of the tricks that ATLAS
> (http://math-atlas.sourceforge.net/) uses for getting maximal
> performance from matrix-matrix multiply (MMM). I knew about Bagley's
> conclusion that O'Caml was competitive with C for MMM, so I was curious
> how close O'Caml came to GCC on Blocked MMM. I would like to use O'Caml
> as a target language over C or Fortran.

OCaml is a fun language in which to write the problem analysis and code
generation for specialized numerical components; see e.g. http://www.fftw.org/
However, I agree with your conclusion that for ultimate performance
you probably want to generate Fortran (to feed a good parallelizing compiler)
or C (with asm inserts for SIMD instructions).

- Xavier Leroy



* [Caml-list] Re: float boxing (was: matrix-matrix multiply)
  2002-10-20 21:58     ` Issac Trotts
@ 2002-10-23 10:16       ` Christophe TROESTLER
  2002-10-23 12:08         ` malc
  0 siblings, 1 reply; 8+ messages in thread
From: Christophe TROESTLER @ 2002-10-23 10:16 UTC (permalink / raw)
  To: caml-list

On Sun, 20 Oct 2002, Issac Trotts <ijtrotts@ucdavis.edu> wrote:
> 
> You might try converting your references to mutable fields.
>
> let x = ref 1.0 in
> let n = int_of_string Sys.argv.(1) in
> for i = 1 to n do x := !x +. 1.0 done
> 
> ./ref 100000000  2.51s user 0.00s system 99% cpu 2.515 total
> 
> type t = { mutable f:float };;
> let x = { f = 1.0 } in
> let n = int_of_string Sys.argv.(1) in
> for i = 1 to n do x.f <- x.f +. 1.0 done
>
> ./ref2 100000000  1.54s user 0.01s system 100% cpu 1.542 total

A few questions in view of this.  First, on my machine (AMD Athlon
1GHz running GNU/Linux), the timings give a preference to ref.ml

time ./ref 100000000
real    0m1.279s user    0m1.280s sys     0m0.000s
time ./ref2 100000000
real    0m1.411s user    0m1.380s sys     0m0.000s

What could be a reason for that?

Second, shouldn't references be optimized when their type is statically
known to be float ref (I thought so; please confirm or correct)?

It seems to me there are three main issues concerning floats:
* storing       (avoid unnecessary indirections but take care of GC)
* comparisons   (= is not reflexive in IEEE 754 arithmetic)
* conversions

About "conversions", float : int -> float seems to be slow (compared
to a cast in C at least).  Is there any way to optimize it when it
intervene in an algebraic expression?  (Frequently on has to write
things like: for i=0 to n do let x = float i / float n in ... done)
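
At least the invariant part of such an expression can be hoisted out of
the loop, in the spirit of the earlier discussion (iterate_unit_interval
is just an illustrative name):

let iterate_unit_interval f n =
  let nf = float n in                 (* the invariant conversion, hoisted *)
  for i = 0 to n do
    f (float i /. nf)                 (* only the float i conversion remains *)
  done

But the per-iteration float i conversion itself is what I am asking about.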

I understand that float values need to be boxed in order to interact with
polymorphic functions.  Let me picture it as

    +--------+  f : float -> float   +--------+
    |        | --------------------> |        |
    +--------+                       +--------+
        |                                |
        V                                V
    +--------+                       +--------+
    | double |                       | double |
    +--------+                       +--------+

However, couldn't we imagine that functions with float arguments or
return values have "two interfaces": the standard one (which passes the
boxed pointer) and another one (which passes the value directly):

    +--------+  f : float -> float   +--------+
    |        | --------------------> |        |
    +--------+                       +--------+
        |                                |
        V                                V
    +--------+  f': float -> float   +--------+
    | double | --------------------> | double |
    +--------+                       +--------+

The _idea_, in C parlance, is to declare f(x) as &(f'(*x)).  Now, the
boxing should allow the GC to take care of these values.  But, if a
function returning a float feeds another function expecting a float,
the compiler could connect the "bottom lines" instead of passing
through the pointers:

     f : 'a -> float   +--------+      +--------+  g : float -> 'b
    -----------------> |        |      |        | -----------------> 
                       +--------+      +--------+                   
                           |               |                        
                           V               V                        
     f': 'a -> float   +--------+      +--------+  g': float -> 'b   
    -----------------> | double |----->| double | -----------------> 
                       +--------+      +--------+ 

This kind of idea could also apply to recursive functions passing
float values along the recursion...

My question is: is this type of idea workable?  Is it difficult to
implement?  (In a way it just generalize the special treatment of
arithmetic expressions.)  Maybe this can be generalized further to put
float references & equalities under the same umbrella?

Bear in mind that I am not a compiler expert (and even the people giving
compiler courses here are not very helpful), so my questions are also
a way for me to learn a little something...

Cheers,
ChriS



* Re: [Caml-list] Re: float boxing (was: matrix-matrix multiply)
  2002-10-23 10:16       ` [Caml-list] Re: float boxing (was: matrix-matrix multiply) Christophe TROESTLER
@ 2002-10-23 12:08         ` malc
  2002-10-24  0:00           ` malc
  0 siblings, 1 reply; 8+ messages in thread
From: malc @ 2002-10-23 12:08 UTC (permalink / raw)
  To: Christophe TROESTLER; +Cc: caml-list

On Wed, 23 Oct 2002, Christophe TROESTLER wrote:

> On Sun, 20 Oct 2002, Issac Trotts <ijtrotts@ucdavis.edu> wrote:
> > 
> > You might try converting your references to mutable fields.
> >
> > let x = ref 1.0 in
> > let n = int_of_string Sys.argv.(1) in
> > for i = 1 to n do x := !x +. 1.0 done
> > 
> > ./ref 100000000  2.51s user 0.00s system 99% cpu 2.515 total
> > 
> > type t = { mutable f:float };;
> > let x = { f = 1.0 } in
> > let n = int_of_string Sys.argv.(1) in
> > for i = 1 to n do x.f <- x.f +. 1.0 done
> >
> > ./ref2 100000000  1.54s user 0.01s system 100% cpu 1.542 total
> 
> A few questions in view of this.  First, on my machine (AMD Athlon
> 1GHz running GNU/Linux), the timings give a preference to ref.ml
> 
> time ./ref 100000000
> real    0m1.279s user    0m1.280s sys     0m0.000s
> time ./ref2 100000000
> real    0m1.411s user    0m1.380s sys     0m0.000s
> 
> What could be a reason for that?

I think the reason is simple: both are more or less no-ops, since x and
x.f are not used anywhere, hence there is no need to allocate the float.
This short example highlights the difference:

let useref n = 
  let x = ref 1.0 in
  for i = 1 to n do x := !x +. 1.0 done;
  !x

type t = { mutable f:float };;
let userec n = 
  let x = { f = 1.0 } in
  for i = 1 to n do x.f <- x.f +. 1.0 done;
  x.f

let _ =
  let n = int_of_string Sys.argv.(2) in
  Printf.printf "%f\n"
  (if Sys.argv.(1) = "ref" then
    useref n
  else
    userec n)

ref# time ./refrec rec 100000000
100000001.000000

real    0m2.283s
user    0m2.280s
sys     0m0.000s
ref# time ./refrec ref 100000000
100000001.000000

real    0m1.916s
user    0m1.910s
sys     0m0.010s

More or less same machine here.

-- 
mailto:malc@pulsesoft.com




* Re: [Caml-list] Re: float boxing (was: matrix-matrix multiply)
  2002-10-23 12:08         ` malc
@ 2002-10-24  0:00           ` malc
  0 siblings, 0 replies; 8+ messages in thread
From: malc @ 2002-10-24  0:00 UTC (permalink / raw)
  To: Christophe TROESTLER; +Cc: caml-list

On Wed, 23 Oct 2002, malc wrote:

> > A few questions in view of this.  First, on my machine (AMD Athlon
> > 1GHz running GNU/Linux), the timings give a preference to ref.ml
> > 
> > time ./ref 100000000
> > real    0m1.279s user    0m1.280s sys     0m0.000s
> > time ./ref2 100000000
> > real    0m1.411s user    0m1.380s sys     0m0.000s
> > 
> > What could be a reason for that?
> 
> I think the reason is simple: both are more or less no-ops, since x and
> x.f are not used anywhere, hence there is no need to allocate the float.
> This short example highlights the difference:
> 
> let useref n = 
>   let x = ref 1.0 in
>   for i = 1 to n do x := !x +. 1.0 done;
>   !x
> 
> type t = { mutable f:float };;
> let userec n = 
>   let x = { f = 1.0 } in
>   for i = 1 to n do x.f <- x.f +. 1.0 done;
>   x.f
> 
> let _ =
>   let n = int_of_string Sys.argv.(2) in
>   Printf.printf "%f\n"
>   (if Sys.argv.(1) = "ref" then
>     useref n
>   else
>     userec n)
> 
> ref# time ./refrec rec 100000000
> 100000001.000000
> 
> real    0m2.283s
> user    0m2.280s
> sys     0m0.000s
> ref# time ./refrec ref 100000000
> 100000001.000000
> 
> real    0m1.916s
> user    0m1.910s
> sys     0m0.010s
> 
> More or less same machine here.

I believe I should add something here.  Let's look at the inner loops.

Mutable version:
.L107:
	fld1
	faddl	(%ecx)
	fstpl	(%ecx)
	addl	$2, %eax
	cmpl	%ebx, %eax
	jle	.L107

Suboptimal but ok...

Reference version:
.L101:
.L103:	movl	young_ptr, %eax
	subl	$12, %eax
	movl	%eax, young_ptr
	cmpl	young_limit, %eax
	jb	.L104
	leal	4(%eax), %ecx
	movl	$2301, -4(%ecx)
	fld1
	faddl	(%esi)
	fstpl	(%ecx)
	movl	%ecx, %esi
	addl	$2, %ebx
	cmpl	%edx, %ebx
	jle	.L101

Lots of instructions plus boxing... and yet it's faster than the mutable
one.  Wonders of modern CPUs.

My first take at the simplest asm code doing the same:
    mov eax, n
    fld1
    fld1
  LL:
    fadd st, st(1)
    dec eax
    jnz LL

    fstp result
    fstp st

ref# time ./c 100000000
100000001.000000

real    0m0.394s
user    0m0.390s
sys     0m0.000s

(It turned out that both gcc and icc produce similar code, give or take.)

P.S. It would be interesting to see timings produced by P3/P4.

-- 
mailto:malc@pulsesoft.com




Thread overview: 8+ messages
2002-10-18 20:09 [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Paul Stodghill
2002-10-20  9:42 ` Xavier Leroy
2002-10-20 18:06   ` Paul Stodghill
2002-10-20 21:58     ` Issac Trotts
2002-10-23 10:16       ` [Caml-list] Re: float boxing (was: matrix-matrix multiply) Christophe TROESTLER
2002-10-23 12:08         ` malc
2002-10-24  0:00           ` malc
2002-10-21 12:53     ` [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Xavier Leroy
