I have made some further experiments.
I have a functional version of the reading algorithm. I have the original imperative version of the algorithm.
Each version is either linked against the threads library (T) or not (X), and either uses extlib (E) or not (X).

The results are:
           |  XX  |  TX  |  XE  |  TE
Imperative | 3.37 | 7.80 | 3.56 | 8.40
Functional | 4.20 | 8.28 | 4.47 | 9.08

test.csv is a 21 MB file with ~13k rows and about a thousand columns, on a 15rpm disk.

ocaml version : 3.11.0

uname -a gives
Linux localhost 2.6.28.4-server-1mnb #1 SMP Mon Feb 9 09:05:19 EST 2009 i686 Intel(R) Core(TM)2 Duo CPU     E8400  @ 3.00GHz GNU/Linux

While I think I still have to improve the functional version, I struggle to find a rationale for such a large loss of performance when I am not even using threads, just linking against the threads library...

Cheers,
Rémi

On Mon, Feb 16, 2009 at 18:37, Rémi Dewitte <remi@gide.net> wrote:
Yaron,

I use a slightly modified version of the CSV library's load_rows. Here is the main code, which is written in a highly imperative style. Should I perhaps transform it into a purely functional style?

The main program is :

open Printf;;
open Sys;;
(* [timed_exec start_message f] prints [start_message], runs [f ()], then
   prints "done in <t>", where <t> is the difference between two [Sys.time]
   readings taken just before and just after the call.  The value returned
   by [f] is passed through unchanged. *)
let timed_exec start_message f =
  let () = print_string start_message in
  let started_at = Sys.time () in
  let result = f () in
  let elapsed = Sys.time () -. started_at in
  print_endline ("done in " ^ string_of_float elapsed);
  result;;

(* [run_threaded f] starts a new thread that calls [f ()] and then
   [Thread.exit ()]; the handle of the new thread is returned.
   Note from the author: merely enabling this definition makes the
   program really slow! *)
let run_threaded f =
  let thread_body () =
    f ();
    Thread.exit ()
  in
  Thread.create thread_body ()

(* Benchmark driver: parse "file1.csv" and then "file2.csv" with [load_rows],
   discarding every row, and report the elapsed time through [timed_exec].
   Each input channel is closed once its file has been read, including when
   parsing raises, so no file descriptor is left open. *)
let () = timed_exec "Reading data " (fun () ->
  let read_file path =
    let chan = open_in path in
    (try load_rows (fun _ -> ()) chan
     with e ->
       (* Release the descriptor before propagating the original error. *)
       close_in_noerr chan;
       raise e);
    close_in chan
  in
  read_file "file1.csv";
  read_file "file2.csv"
)

The load_rows :
(* [load_rows ?separator ?nread f chan] parses CSV text from [chan] one
   character at a time and calls [f] once for every completed row.

   - [separator] is the field delimiter (default [',']).
   - [nread] is the maximum number of rows to deliver; a negative value
     (the default, [-1]) means "read until end of file".  The limit is
     checked before each character is read, so [~nread:0] consumes nothing.
   - [f] receives the row as a list of fields in file order.  A field is
     [None] when nothing stands between two delimiters (or between a
     delimiter and the end of the line) and [Some s] otherwise; a quoted
     empty field [""] yields [Some ""].

   '\r' characters are always skipped.  Inside a quoted field, a doubled
   quote stands for one quote character and quote-0 stands for ASCII NUL.
   At end of file a pending partial row is flushed to [f].

   Raises [Bad_CSV_file] if the input ends inside a quoted field.  The state
   constructors and [Bad_CSV_file] are expected to be declared elsewhere
   (presumably by the surrounding CSV module). *)
let load_rows ?(separator = ',') ?(nread = -1) f chan =
  let nr = ref 0 in                  (* Rows handed to [f] so far. *)
  let row = ref [] in                (* Current row, newest field first. *)
  let field = Buffer.create 64 in    (* Characters of the current field. *)
  let state = ref StartField in      (* Current state. *)
  (* Close the current non-empty (or quoted) field and push it on the row. *)
  let end_of_field () =
    row := Some (Buffer.contents field) :: !row;
    Buffer.clear field;
    state := StartField
  in
  (* Record a field for which no character at all was seen. *)
  let empty_field () =
    row := None :: !row;
    Buffer.clear field;
    state := StartField
  in
  (* Deliver the finished row to [f], restoring file order first. *)
  let end_of_row () =
    f (List.rev !row);
    row := [];
    state := StartField;
    incr nr
  in
  (* Advance the state machine by one character; never called with '\r'. *)
  let handle_char c =
    match !state with
    | StartField ->                      (* Expecting quote or other char. *)
        if c = '"' then begin
          state := InQuotedField;
          Buffer.clear field
        end else if c = separator then   (* Empty field. *)
          empty_field ()
        else if c = '\n' then begin      (* Empty field, end of row. *)
          empty_field ();
          end_of_row ()
        end else begin
          state := InUnquotedField;
          Buffer.clear field;
          Buffer.add_char field c
        end
    | InUnquotedField ->                 (* Reading chars to end of field. *)
        if c = separator then            (* End of field. *)
          end_of_field ()
        else if c = '\n' then begin      (* End of field and end of row. *)
          end_of_field ();
          end_of_row ()
        end else
          Buffer.add_char field c
    | InQuotedField ->                   (* Reading chars to end of field. *)
        if c = '"' then
          state := InQuotedFieldAfterQuote
        else
          Buffer.add_char field c
    | InQuotedFieldAfterQuote ->
        if c = '"' then begin            (* Doubled quote. *)
          Buffer.add_char field c;
          state := InQuotedField
        end else if c = '0' then begin   (* Quote-0 is ASCII NUL. *)
          Buffer.add_char field '\000';
          state := InQuotedField
        end else if c = separator then   (* End of field. *)
          end_of_field ()
        else if c = '\n' then begin      (* End of field and end of row. *)
          end_of_field ();
          end_of_row ()
        end else begin                   (* Bad single quote in field. *)
          (* Keep the stray quote, then the character that followed it. *)
          Buffer.add_char field '"';
          Buffer.add_char field c;
          state := InQuotedField
        end
  in
  (* Check the row limit before reading so that no input is consumed once
     [nread] rows have been delivered. *)
  let rec loop () =
    if nread < 0 || !nr < nread then begin
      let c = input_char chan in
      if c <> '\r' then handle_char c;   (* Always ignore \r characters. *)
      loop ()
    end
  in
  try
    loop ()
  with End_of_file ->
    (* Any part left to write out? *)
    (match !state with
     | StartField ->
         if !row <> [] then begin
           empty_field ();
           end_of_row ()
         end
     | InUnquotedField | InQuotedFieldAfterQuote ->
         end_of_field ();
         end_of_row ()
     | InQuotedField ->
         raise (Bad_CSV_file "Missing end quote after quoted field."))


Thanks,
Rémi


On Mon, Feb 16, 2009 at 17:47, Yaron Minsky <yminsky@gmail.com> wrote:
2009/2/16 Rémi Dewitte <remi@gide.net>
Hello,

I would like to read two files in two different threads.

I have made a first version reading the first then the second and it takes 2.8s (native).

I decided to make a threaded version, and before making any use of threads I realized that just linking against the threads library, without even using it, makes my first version of the program run in 12s!

Do you have a short benchmark you can post?  The idea that the thread-overhead would make a difference like that, particularly for IO-bound code (which I'm guessing this is) is pretty surprising.

y
 

I use pcre, extlib, csv libraries as well.

I guess it might come from the GC slowing things down here, doesn't it? Where else could it come from? Is there a workaround, or something I should know?

Can ocaml use multiple cores ?

Do you have a few pointers to libraries for doing parallel I/O?

Thanks,
Rémi

_______________________________________________
Caml-list mailing list. Subscription management:
http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
Archives: http://caml.inria.fr
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
Bug reports: http://caml.inria.fr/bin/caml-bugs