caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
* Floating point array computations
@ 2001-01-25 10:34 wester
  2001-02-01 13:57 ` Xavier Leroy
  0 siblings, 1 reply; 2+ messages in thread
From: wester @ 2001-01-25 10:34 UTC (permalink / raw)
  To: caml-list

Hi,

I'm a OCmal beginner and would like to use it for some numerical
computations. Speed is not the only criterion but for number crunching it
is an important point. So I would like to make my OCaml code as fast 
as possible. I have tried some small examples (source attached below) 
and have some questions. 

In sample code form the ICFP 200 contest I found:

type v = float array

is array a build in type?

The functions main1 .. main4 below do the same but take different time for it
(compiled with ocamlopt). I expected main2 to be the fastest but main3 is 
fastest. main3 300 300 1000 takes on my PC (450 Mhz PI, 520 MB RAM) 
26 sec, this is about twice the time of C++ and about the same as Java.

Can somone tell me why main3 is faster than main2 (36 sec) although
main3 makes function calls to step?

I also tried Array.unsafe_get/set in main4. I expected this to be even faster
because I have seen it in the ICFP contest sample code. But actually 
it is much slower (about 100 sec). Did I make anything wrong?

Would Bigarray perform better? 

I would be very appriciative for your help to clarify this points.

Thank you in advance.

Rolf


-----------------------------------------------------------------------------------------------------------

let main1 nx ny nt =
    let matrix1 = Array.make_matrix nx ny 0.0 in
    let matrix2 = Array.make_matrix nx ny 0.0 in
    let sum   = ref 0  in
    let start = ref 0. in
    let upper_idx = Array.length matrix1 - 1 in
    for r = 1 to nt do
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix2.(i)   and
		      mat1m = matrix1.(i-1) and
			  mat10 = matrix1.(i)   and
			  mat1p = matrix1.(i+1) in
            for j = 1 to upper_idx - 1 do
                mat2.(j) <- mat10.(j) +. 1.0  +.
				                0.1*.(mat1p.(j)   +.
                                      mat1m.(j)   +.
                                      mat10.(j+1) +.
                                      mat10.(j-1) -.
                               4.0 *. mat10.(j));
            done;
        done;
        for i = 1 to upper_idx - 1  do
            for j = 1 to upper_idx - 1 do
                matrix1.(i).(j) <- matrix2.(i).(j) +. 1.0 +.
				0.1*.(matrix2.(i+1).(j) +.
                                      matrix2.(i-1).(j) +.
                                      matrix2.(i).(j+1) +.
                                      matrix2.(i).(j-1) -.
                                   4.0 *. matrix2.(i).(j));
            done;
        done;
	Printf.printf "%f\n" (matrix1.(150).(150));
	flush stdout;
    done;
;;


let main2 nx ny nt =
    let matrix1 = Array.make_matrix nx ny 0.0 in
    let matrix2 = Array.make_matrix nx ny 0.0 in
    let sum   = ref 0  in
    let start = ref 0. in
    let upper_idx = Array.length matrix1 - 1 in
    for r = 1 to nt do
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix2.(i)   and
		      mat1m = matrix1.(i-1) and
			  mat10 = matrix1.(i)   and
			  mat1p = matrix1.(i+1) in
              for j = 1 to upper_idx - 1 do
                mat2.(j) <- mat10.(j) +. 1.0  +.
				                0.1*.(mat1p.(j)   +.
                                      mat1m.(j)   +.
                                      mat10.(j+1) +.
                                      mat10.(j-1) -.
                               4.0 *. mat10.(j));
            done;
        done;
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix1.(i)   and
		      mat1m = matrix2.(i-1) and
			  mat10 = matrix2.(i)   and
			  mat1p = matrix2.(i+1) in
              for j = 1 to upper_idx - 1 do
                mat2.(j) <- mat10.(j) +. 1.0  +.
				                0.1*.(mat1p.(j)   +.
                                      mat1m.(j)   +.
                                      mat10.(j+1) +.
                                      mat10.(j-1) -.
                               4.0 *. mat10.(j));
            done;
        done;
	Printf.printf "%f\n" (matrix1.(nx / 2).(ny / 2));
	flush stdout;
    done;
;;

let main3 nx ny nt =
  let step nmax m1p m1m m10 m2 =
     for j = 1 to nmax - 1 do
          m2.(j) <- m10.(j) +. 1.0  +.
				   0.1*.(m1p.(j)   +.
                         m1m.(j)   +.
                         m10.(j+1) +.
                         m10.(j-1) -.
                  4.0 *. m10.(j));
      done;
	in
    let matrix1 = Array.make_matrix nx ny 0.0 in
    let matrix2 = Array.make_matrix nx ny 0.0 in
    let sum   = ref 0  in
    let start = ref 0. in
    let upper_idx = Array.length matrix1 - 1 in
    for r = 1 to nt do
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix2.(i)   and
		      mat1m = matrix1.(i-1) and
			  mat10 = matrix1.(i)   and
			  mat1p = matrix1.(i+1) in
		  step upper_idx mat1p mat1m mat10 mat2;
        done;
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix1.(i)   and
		      mat1m = matrix2.(i-1) and
			  mat10 = matrix2.(i)   and
			  mat1p = matrix2.(i+1) in
		  step upper_idx mat1p mat1m mat10 mat2;
        done;
	Printf.printf "%f\n" (matrix1.(nx / 2).(ny / 2));
	flush stdout;
    done;
;;


let uset = Array.unsafe_set;;
let uget = Array.unsafe_get;;

let main4 nx ny nt =
  let step nmax m1p m1m m10 m2 =
     for j = 1 to nmax - 1 do
          uset m2 j  ((uget m10 j) +. 1.0  +.
				             0.1*.((uget m1p j    ) +.
                                   (uget m1m j    ) +.
                                   (uget m10 (j+1)) +.
                                   (uget m10 (j-1)) -.
                            4.0 *. (uget m10 j    )    ));
      done;
	in
    let matrix1 = Array.make_matrix nx ny 0.0 in
    let matrix2 = Array.make_matrix nx ny 0.0 in
    let sum   = ref 0  in
    let start = ref 0. in
    let upper_idx = Array.length matrix1 - 1 in
    for r = 1 to nt do
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix2.(i)   and
		      mat1m = matrix1.(i-1) and
			  mat10 = matrix1.(i)   and
			  mat1p = matrix1.(i+1) in
		  step upper_idx mat1p mat1m mat10 mat2;
        done;
        for i = 1 to upper_idx - 1  do
		  let mat2  = matrix1.(i)   and
		      mat1m = matrix2.(i-1) and
			  mat10 = matrix2.(i)   and
			  mat1p = matrix2.(i+1) in
		  step upper_idx mat1p mat1m mat10 mat2;
        done;
	Printf.printf "%f\n" (matrix1.(nx / 2).(ny / 2));
	flush stdout;
    done;
;;

let _ = main3 300 300 1000;;


-------------------------------------
Rolf Wester
wester@ilt.fhg.de



^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: Floating point array computations
  2001-01-25 10:34 Floating point array computations wester
@ 2001-02-01 13:57 ` Xavier Leroy
  0 siblings, 0 replies; 2+ messages in thread
From: Xavier Leroy @ 2001-02-01 13:57 UTC (permalink / raw)
  To: wester; +Cc: caml-list

> In sample code form the ICFP 200 contest I found:
> type v = float array
> is array a build in type?

Yes, it is.

> The functions main1 .. main4 below do the same but take different time for it
> (compiled with ocamlopt). I expected main2 to be the fastest but main3 is 
> fastest. main3 300 300 1000 takes on my PC (450 Mhz PI, 520 MB RAM) 
> 26 sec, this is about twice the time of C++ and about the same as Java.
> Can somone tell me why main3 is faster than main2 (36 sec) although
> main3 makes function calls to step?

Here are two possible explanations.  First, with modern processors,
minor variations in code placement (i.e. insert or delete unused code)
can result in performance improvements or degradations by as much as 20%,
due to cache effects, alignment constraints on instruction fetches,
and what not.  (Predicting accurately the performance of a processor
such as the PIII is extremely hard; a few Intel engineers are rumored
to be able to do it :-)

Second, on a register-poor architecture such as the IA32, spilling and
reloading (moving values between registers and stack locations)
happens often and has a significant performance impact.  Since OCaml
follows a callee-save model, function calls act as natural spill hint,
forcing all live registers to be spilled to the stack before the call,
then reloaded on demand after the call.

In your "main3" test, the function "step" runs in exactly the 7
available registers, thus the call to "step" causes a nearly perfect
placement of spills: spill everything live before the inner loop; run
the inner loop in registers; reload the spilled data.

In "main2", there is no such function call to help the compiler, and
the placement of spills and reloads it chooses is not as good
(although still reasonable).

> I also tried Array.unsafe_get/set in main4. I expected this to be even faster
> because I have seen it in the ICFP contest sample code. But actually 
> it is much slower (about 100 sec). Did I make anything wrong?

You hit a rather subtle point in the OCaml compiler, so don't feel too
bad about it.  Array.unsafe_get/set are not quite regular Caml functions,
but rather predefined operators (like + or +.) with special
compilation schemes (e.g. open-coding and type specialization) *when
the operator is fully applied*.  This means that

        Array.unsafe_get a i

will generate optimized code.  But the following rebinding

        let get = Array.unsafe_get

where Array.unsafe_get is not fully applied, gets compiled as

        let get x y = Array.unsafe_get x y

so subsequent calls to get may incur the cost of a function call
(although ocamlopt will actually inline the function at point of call).
More subtly, the definition of "get" is polymorphic, hence no type
specialization occurs, and it happens that accesses to float arrays
are compiled much more efficiently if the array is statically known to
have type "float array".  

In summary, for maximal performance, you should not write

        let get = Array.unsafe_get
        ... get ... get ...

but instead write

        ... Array.unsafe_get ... Array.unsafe_get ...

With this modification, your "main4" example runs a bit faster than
"main3" (20-25% faster), because Array.unsafe_get/set do not perform
array bounds checking.  However, this is unsafe: an error in your
code, causing an out-of-bound access, can very easily crash the
program.  So, this is a high price to pay in terms of reliability for
a relatively small gain in performance, and I'd advise against it.

> Would Bigarray perform better? 

I tried to rewrite your code using a 2-d bigarray, and it was clearly
slower.  Accessing a 2-d bigarray involves
        two bound checks
        one integer multiplication
        one integer addition
        one load,
while accessing a float array array (as returned by Array.make_matrix)
involves
        two bound checks
        two integer additions
        two loads
On the Pentium at least, the integer multiplication is much slower
than the load.  Moreover, OCaml doesn't perform hoisting of loop
invariant subexpressions, meaning that the multiplication is performed
at each iteration of the inner loop.  And since the multiplication is
performed "under the hood" by the compiler, it cannot be manually
hoisted out of the inner loop, like you did in main2 and main3 for 
hoisting the outer array access (1 bound check, 1 addition, 1 load).

> I would be very appriciative for your help to clarify this points.

I hope this helps, despite the rather technical explanations.

- Xavier Leroy



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2001-02-02 15:23 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-01-25 10:34 Floating point array computations wester
2001-02-01 13:57 ` Xavier Leroy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).