From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gava@univ-paris12.fr>
X-Spam-Checker-Version: SpamAssassin 3.1.3 (2006-06-01) on yquem.inria.fr
X-Spam-Level: *
X-Spam-Status: No, score=1.0 required=5.0 tests=AWL,SPF_FAIL 
	autolearn=disabled version=3.1.3
X-Original-To: caml-list@yquem.inria.fr
Delivered-To: caml-list@yquem.inria.fr
Received: from concorde.inria.fr (concorde.inria.fr [192.93.2.39])
	by yquem.inria.fr (Postfix) with ESMTP id 10059BC69
	for <caml-list@yquem.inria.fr>; Thu,  8 Feb 2007 10:28:00 +0100 (CET)
Received: from smtp23.orange.fr (smtp23.orange.fr [193.252.22.126])
	by concorde.inria.fr (8.13.6/8.13.6) with ESMTP id l189Rx7B020754
	for <caml-list@yquem.inria.fr>; Thu, 8 Feb 2007 10:27:59 +0100
Received: from me-wanadoo.net (localhost [127.0.0.1])
	by mwinf2322.orange.fr (SMTP Server) with ESMTP id 7FD5170000A0
	for <caml-list@yquem.inria.fr>; Thu,  8 Feb 2007 10:27:57 +0100 (CET)
Received: from [192.168.0.9] (ARouen-205-1-4-25.w80-11.abo.wanadoo.fr [80.11.106.25])
	by mwinf2322.orange.fr (SMTP Server) with ESMTP id D93357000096
	for <caml-list@yquem.inria.fr>; Thu,  8 Feb 2007 10:27:56 +0100 (CET)
X-ME-UUID: 20070208092756889.D93357000096@mwinf2322.orange.fr
Message-ID: <45CAED1C.3040503@univ-paris12.fr>
Date: Thu, 08 Feb 2007 10:27:56 +0100
From: =?ISO-8859-1?Q?Fr=E9d=E9ric_Gava?= <gava@univ-paris12.fr>
Reply-To: gava@univ-paris12.fr
Organization: =?ISO-8859-1?Q?Universit=E9_de_Paris_12=2C_LACL?=
User-Agent: Thunderbird 1.5.0.9 (X11/20070104)
MIME-Version: 1.0
To: caml-list@yquem.inria.fr
Subject: Re: [Caml-list] Multiplication of matrix in C and OCaml
References: <45CA63F5.6020301@univ-paris12.fr> <20070208.111401.55511744.garrigue@math.nagoya-u.ac.jp>
In-Reply-To: <20070208.111401.55511744.garrigue@math.nagoya-u.ac.jp>
Content-Type: multipart/mixed;
 boundary="------------090104040802030806090904"
X-Miltered: at concorde with ID 45CAED1F.003 by Joe's j-chkmail (http://j-chkmail . ensmp . fr)!
X-Spam: no; 0.00; gava:01 gava:01 ocaml:01 multiplying:01 matrices:01 cpuinfo:01 genuineintel:01 model:01 model:01 2048:01 bug:01 bug:01 flags:01 msr:01 matrices:01 
X-Attachments: type="text/x-csrc" name="mult.c" name="mult.c" name="mult.ml" name="mult.ml" 

This is a multi-part message in MIME format.
--------------090104040802030806090904
Content-Type: text/plain; charset=iso-8859-1; format=flowed
Content-Transfer-Encoding: 8bit

> This is not what I see:
> Multiplying 1000 times two 50x50 matrices on a Pentium M 1.8G, using
> your code.

Thanks.

Curious. In my cat "/proc/cpuinfo" (using Ubuntu 6.06)
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 13
model name      : Intel(R) Pentium(R) M processor 1.60GHz
stepping        : 6
cpu MHz         : 598.127
cache size      : 2048 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 2
wp              : yes
flags           : fpu vme de pse tsc msr mce cx8 sep mtrr pge mca cmov 
pat clflush dts acpi mmx fxsr sse sse2 ss tm pbe est tm2
bogomips        : 1197.16

When I multiply two 400x400 matrices, I observe that C is faster than 
OCaml (2 seconds for the C code and 16 seconds for the OCaml one). Perhaps 
this is because I use MPI. Did you use -O2?

 > gcc -O3 monomorphic: 4.7s
 > Now, I also found a strange result:
 > gcc -O3 monomorphic: 1.4s

Why two different times for the same program ? What is monomorphic ?

Regards,
Frédéric Gava

PS: I attach the C code and the OCaml code, which uses some specific 
functions of our parallel version of OCaml (here implemented using MPI). 
         I will send the .exe file to Jacques.

--------------090104040802030806090904
Content-Type: text/x-csrc;
 name="mult.c"
Content-Transfer-Encoding: 8bit
Content-Disposition: inline;
 filename="mult.c"

/* param 1 = taille des matrices au depart
   param 2 = incrementation
   param 3 = taille des matrices à l'arrive
   param 4 = nombre d'itérations 

mpicc -O3 -o cmult cmult.c
(using MPICH)

mpirun -arch LINUX -np 1 ./cmult 400 2 402 1

*/

#include <mpi.h>
#include <stdlib.h> 
#include <stdio.h> 
#include <math.h>
#include <string.h>

/* Complex number stored as two doubles. */
typedef struct Complexe
{
 double re;   /* real part */
 double img;  /* imaginary part */
} complexe;

/* Additive identity handed to the generic multiplication.  As a file-scope
   object it starts zero-initialised; main() also sets both fields to 0. */
complexe zero;

/* __inline__ complexe complexe_add(complexe a, complexe b)
{
 return (complexe){a.re+b.re, a.img+b.img};
}


__inline__ complexe complexe_mult(complexe x, complexe y)
{
 return (complexe) {x.re*y.re-x.img*y.img, x.re*y.img+x.img*y.re};
}

*/


/* version generic */

__inline__ void* add(void *x, void *y)
{
complexe a=((complexe*) x)[0];
complexe b=((complexe*) y)[0];
return (void *) &((complexe){a.re+b.re, a.img+b.img});
}

__inline__ void* mult(void *x, void *y)
{
complexe a=((complexe*) x)[0];
complexe b=((complexe*) y)[0];
return (void *) &((complexe){a.re*b.re-a.img*b.img, a.re*b.img+a.img*b.re});
}

/* Returns a complexe whose parts are pseudo-random integers in [0, 999],
 * stored as doubles.  Draws from rand(), so the sequence depends on the
 * seed set with srand().
 *
 * The parts are assigned in two separate statements so that the first
 * rand() draw always goes to re and the second to img.  The earlier form
 * assigned to c's members inside c's own initializer list, where the
 * evaluation order of the expressions is not fixed by the language.
 */
complexe random_complexe(void)
{
 complexe c;

 c.re=(double)(rand()%1000);
 c.img=(double)(rand()%1000);
 return c;
}


void multiply_generic(int n, void *zero, void* (*add)(void *a, void *b), void* (*mult)(void *a, void *b), void *a, void *b, void *c)
{
 register int i,j,k;
 void *tmp;
 void *cc;
 
 void *tt1, *tt2, *tt3;

  for (i=0; i<n; i++) 
   {
    for (j=0;j<n;j++)
     {
      tmp=zero;
      for (k=0;k<n;k++)
       {
        tmp=add(tmp,(mult(b+(k*n+j),a+(i*n+k))));
        /* tt1=malloc(sizeof(complexe));
        memcpy(tt1,mult(b+(k*n+j),a+(i*n+k)),sizeof(complexe));
	tt2=malloc(sizeof(complexe));
	memcpy(tt2,add(tmp,tt1),sizeof(complexe));
	free(tt1);
	memcpy(tmp,tt2,sizeof(complexe));
	free(tt2);	 */
       }
      cc=c+(i*n+j);
      cc=tmp;
      /* memcpy(cc,tmp,sizeof(complexe)); */
     }
  }
}

/*

void multiply_complex(int n, complexe *a, complexe *b, complexe *c)
{
 register int i,j,k,t1,t2;
 complexe tmp;

  for (i=0; i<n; i++) 
   {
    for (j=0;j<n;j++)
     {
      tmp=zero;
      for (k=0;k<n;k++) tmp=complexe_add(tmp,(complexe_mult(b[k*n+j],a[i*n+k])));
      c[i*n+j]=tmp;
     }
  } 
}

*/

/* Benchmark driver.
 *
 * Command line: <start> <step> <end> <iterations>
 *   start      : first matrix dimension
 *   step       : amount added to the dimension after each measurement (> 0)
 *   end        : measurements run while dimension < end
 *   iterations : number of multiplications timed per dimension (> 0)
 *
 * For each dimension, two matrices are filled with random_complexe()
 * values (seeded with srand(1000)), multiply_generic() is timed over
 * `iterations` runs with MPI_Wtime(), and the process of rank 0 prints the
 * mean time per multiplication.
 *
 * Returns 0 on success, EXIT_FAILURE on bad arguments or allocation
 * failure.  The previous version read argv[1..4] unchecked, looped forever
 * on a non-positive step and never checked malloc().
 */
int main(int argc, char* argv[])
{
 complexe *a, *b, *c;
 int i,j,k,t1,pas,maxi,iter,pid;
 size_t count;
 double chrono=0;

 zero.re=(double)0;
 zero.img=(double)0;

 MPI_Init(&argc,&argv);
 MPI_Comm_rank(MPI_COMM_WORLD,&pid);

 if (argc<5)
  {
   if (pid==0)
    fprintf(stderr,"usage: %s start step end iterations\n",argv[0]);
   MPI_Finalize();
   return EXIT_FAILURE;
  }

 i=atoi(argv[1]);
 pas=atoi(argv[2]);
 maxi=atoi(argv[3]);
 iter=atoi(argv[4]);

 /* A non-positive step would never reach `end`; a non-positive iteration
    count makes the mean time meaningless. */
 if (i<0 || pas<=0 || iter<=0)
  {
   if (pid==0)
    fprintf(stderr,"start must be >= 0, step and iterations must be > 0\n");
   MPI_Finalize();
   return EXIT_FAILURE;
  }

 srand(1000);
 while (i<maxi)
  {
   /* allocate the matrices; the element count is computed in size_t so
      that i*i cannot overflow int */
   count=(size_t)i*(size_t)i;
   if (count>((size_t)-1)/sizeof(complexe))
    {
     fprintf(stderr,"matrix size %d is too large\n",i);
     MPI_Abort(MPI_COMM_WORLD,EXIT_FAILURE);
     return EXIT_FAILURE;
    }
   a=(complexe *)malloc(count*sizeof(complexe));
   b=(complexe *)malloc(count*sizeof(complexe));
   c=(complexe *)malloc(count*sizeof(complexe));
   /* malloc(0) may legitimately return NULL, so only test when count>0 */
   if (count>0 && (a==NULL || b==NULL || c==NULL))
    {
     fprintf(stderr,"out of memory for matrix size %d\n",i);
     free(a);
     free(b);
     free(c);
     MPI_Abort(MPI_COMM_WORLD,EXIT_FAILURE);
     return EXIT_FAILURE;
    }

   /* fill both input matrices */
   for (j=0;j<i;j++)
    for (k=0,t1=j*i;k<i;k++)
     {
      a[t1+k]=random_complexe();
      b[t1+k]=random_complexe();
     }

   chrono=MPI_Wtime();

   for (k=1;k<=iter;k++)
    /* multiply_complex(i,a,b,c); */
    multiply_generic(i,&zero,add,mult,a,b,c);

   chrono=MPI_Wtime()-chrono;
   MPI_Barrier(MPI_COMM_WORLD);

   if (pid==0) printf("%f\n",chrono/iter);
   fflush(stdout);

   /* release the matrices of this size */
   free(a);
   free(b);
   free(c);

   i=i+pas;
  }

 MPI_Finalize();
 return 0;
}

--------------090104040802030806090904
Content-Type: text/plain;
 name="mult.ml"
Content-Transfer-Encoding: 8bit
Content-Disposition: inline;
 filename="mult.ml"

(* param 1 = taille des matrices au depart
   param 2 = incrementation
   param 3 = taille des matrices à l'arrive
   param 4 = nombre d'itérations *)

open Bsmllib
open Bsmlcomm

(* [multiplication n zero add mul a b c] fills [c] with the product of the
   two [n] x [n] matrices [a] and [b], all three stored row-major in flat
   arrays.  Cell (row, col) receives the sum, started at [zero] and
   combined with [add], of [mul b.(k*n+col) a.(row*n+k)] for k = 0 .. n-1.
   Returns unit; [c] is updated in place. *)
let multiplication n zero add mul a b c =
  for row = 0 to n - 1 do
    for col = 0 to n - 1 do
      (* fresh running sum for this cell *)
      let acc = ref zero in
      for k = 0 to n - 1 do
        acc := add !acc (mul b.(k*n+col) a.(row*n+k))
      done;
      c.(row*n+col) <- !acc
    done
  done

(* [random_complex n] builds a [Complex.t] whose real and imaginary parts
   are each drawn with [Random.float n].
   NOTE(review): OCaml leaves the evaluation order of record fields
   unspecified, so which of the two draws lands in which field is not
   fixed by the language. *)
let random_complex n = {Complex.re=Random.float n; im=Random.float n}

(* [matrixRandom n] returns a flat array of [n*n] complex numbers, each
   produced by [random_complex 1000.]. *)
let matrixRandom n =
  let cells = n * n in
  Array.init cells (fun _cell -> random_complex 1000.)

(* Benchmark driver.  Reads four integers from [argv ()]: start size, step,
   end size and iteration count.  While the current size is below the end
   size it builds two random matrices and a zero-filled result matrix,
   passes the multiplication thunk and the iteration count to [bench_fun],
   then adds the step to the size.
   NOTE(review): [argv] and [bench_fun] come from the opened Bsml modules
   and are not shown here; presumably [bench_fun f k] times [k] runs of
   [f] -- confirm. *)
let _ =
  Bsmllib.initialize ();
  Random.self_init ();
  let size = ref (int_of_string (argv()).(1))
  and step = (int_of_string (argv()).(2))
  and limit = (int_of_string (argv()).(3))
  and rounds = (int_of_string (argv()).(4)) in
  while !size < limit do
    let a = matrixRandom (!size)
    and b = matrixRandom (!size)
    and c = Array.make ((!size)*(!size)) Complex.zero in
    let run_once () =
      ignore (multiplication (!size) Complex.zero Complex.add Complex.mul a b c)
    in
    ignore (bench_fun run_once rounds);
    size := !size + step
  done

--------------090104040802030806090904--