[Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C

caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed

From: Paul Stodghill <stodghil@CS.Cornell.EDU>
To: caml-list@inria.fr
Subject: [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
Date: Fri, 18 Oct 2002 16:09:59 -0400	[thread overview]
Message-ID: <r99u1jjh5k8.fsf@quimby-xp.cs.cornell.edu> (raw)

[-- Attachment #1: Type: text/plain, Size: 1959 bytes --]

Doug Bagley's Great Language Shootout says that O'Caml and GCC get about
the same performance for Matrix-Matrix Multiplication (MMM). I'm seeing
somewhat different results for my code. In particular, on my machine
(1.2GHz PIIIM, 256M), the O'Caml version of the code is 6 times slower
than the C version.

The codes are attached. A few of the differences between my code and
Doug's code is that I have tiled the loops and I use constants for the
matrices and the blocksize. I have not (yet) hoisted the invariant
expressions.

Looking at the assembly produced by O'Caml and GCC, it appears that GCC
is performance loop unrolling (as requested with -funroll-loops) and
strength reduction in the inner loops. I can easily see why these two
optimizations can result in such a tremendous performance difference.

My question is this: I can obviously performance loop unrolling myself
by hand - does ocamlopt perform strength reduction? Is there anyway that
I can get the O'Caml code to close to the performance of the C code?

I can provide additonal information about my set up if that would help.

Thanks.


quimby-xp$ uname -a
CYGWIN_NT-5.1 QUIMBY-XP 1.3.13(0.62/3/2) 2002-10-13 23:15 i686 unknown
quimby-xp$ ocamlopt.opt -v
The Objective Caml native-code compiler, version 3.06
Standard library directory: /usr/local/ocaml/lib/ocaml
quimby-xp$ gcc --version
gcc (GCC) 3.2 20020818 (prerelease)
Copyright (C) 2002 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

quimby-xp$ ocamlopt.opt -S -o mmm_ml.exe unix.cmxa mmm_ml.ml
quimby-xp$ gcc -O4 -funroll-loops -o mmm_c.exe mmm_c.c
quimby-xp$ ./mmm_ml
(* 86.83 mflops *)
quimby-xp$ ./mmm_ml
(* 88.16 mflops *)
quimby-xp$ ./mmm_ml
(* 89.07 mflops *)
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ 


[-- Attachment #2: mmm_ml.ml --]
[-- Type: application/octet-stream, Size: 1683 bytes --]

(* ocamlopt.opt -S -o mmm_ml.exe unix.cmxa mmm_ml.ml *)

open Printf

let t = 5;;
let n = 120;;


let rec f create mmm n =
  let (a,b,c) = create () in
    (* do the work once to load the cache *)
    mmm n a b c;
    (* do the work t times and time it *)
    let begin_time = Unix.gettimeofday () in
    let _ =
      for t = 1 to t do
	mmm n a b c
      done
    in let end_time = Unix.gettimeofday () in
    let n_f = float_of_int n in
    let total_flops = 2.*.(float_of_int t)*.n_f*.n_f*.n_f in
    let mflops = (total_flops /. (end_time -. begin_time)) /. 1e6 in
      printf "(* %.2f mflops *)\n" mflops;
      ()
;;

(* ***************************************************************** *)

let create_arrays () =
  let a = Array.create (n*n) 0. and
    b = Array.create (n*n) 0. and
    c = Array.create (n*n) 0. in
    (* Init a and b*)
    for i = 0 to n-1 do
      for j = 0 to n-1 do
	a.(i*n+j) <- (float_of_int (i+j));
	b.(i*n+j) <- (float_of_int (i+j));
      done
    done;
    (a,b,c)

(* ***************************************************************** *)

let mmm n (a: float array) (b: float array) (c: float array) =
  assert (120 mod 30 = 0);
  let num_blocks = 120/30 in
    for bi = 0 to num_blocks-1 do
      for bj = 0 to num_blocks-1 do
	for bk = 0 to num_blocks-1 do
	  for i = bi*30 to (bi+1)*30-1 do
	    for j = bj*30 to (bj+1)*30-1 do
	      for k = bk*30 to (bk+1)*30-1 do
		Array.unsafe_set c (i*120+j)
		  ((Array.unsafe_get c (i*120+j)) 
		   +. (Array.unsafe_get a (i*120+k)) 
		   *. (Array.unsafe_get b (k*120+j)));
	      done
	    done
	  done
	done
      done
    done
;;

(* 83.08 mflops (0.03% of peak) *)
let _ = f create_arrays mmm n;;


[-- Attachment #3: mmm_c.c --]
[-- Type: application/octet-stream, Size: 1304 bytes --]

#include <assert.h>
#include <stdio.h>

#include <sys/time.h>

#define T 5
#define N 120
#define BB 30

static double A[N*N];
static double B[N*N];
static double C[N*N];

void mmm();

int main(int argc, char** argv)
{
    int i, j, k;
    int t;

    struct timeval start_time;
    struct timeval end_time;

    double seconds;
    double total_flops;
    double mflops;

    /* init the matrices */
    for (i=0; i<N; i++)
	for (j=0; j<N; j++)
	    A[i*N+j] = B[i*N+j] = i+j;
    /* warm the cache */
    mmm();

    /* do the work T times and time it */
    gettimeofday(&start_time,NULL);
    for (t=1; t<=T; t++)
	mmm();
    gettimeofday(&end_time,NULL);

    seconds = (double)(end_time.tv_usec - start_time.tv_usec)/1e6;
    seconds += (double)(end_time.tv_sec - start_time.tv_sec);

    total_flops = 2.*(double)T*(double)N*(double)N*(double)N;
    mflops = (total_flops / seconds) / 1e6;

    printf("/* %.2f mflops */\n", mflops);

}

#define NUM_BLOCKS (N/BB)

void mmm()
{
    int i, j, k;
    int bi, bj, bk;

    assert (N % BB == 0);

    for (bi=0; bi<NUM_BLOCKS; bi++)
	for (bj=0; bj<NUM_BLOCKS; bj++)
	    for (bk=0; bk<NUM_BLOCKS; bk++)
		for (i=bi*BB; i<(bi+1)*BB; i++)
		    for (j=bj*BB; j<(bj+1)*BB; j++)
			for (k=bk*BB; k<(bk+1)*BB; k++)
			    C[i*N+j] += A[i*N+k] * B[k*N+j];
}

next             reply	other threads:[~2002-10-18 20:10 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-10-18 20:09 Paul Stodghill [this message]
2002-10-20  9:42 ` Xavier Leroy
2002-10-20 18:06   ` Paul Stodghill
2002-10-20 21:58     ` Issac Trotts
2002-10-23 10:16       ` [Caml-list] Re: float boxing (was: matrix-matrix multiply) Christophe TROESTLER
2002-10-23 12:08         ` malc
2002-10-24  0:00           ` malc
2002-10-21 12:53     ` [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Xavier Leroy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=r99u1jjh5k8.fsf@quimby-xp.cs.cornell.edu \
    --to=stodghil@cs.cornell.edu \
    --cc=caml-list@inria.fr \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).