caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: "Rémi Dewitte" <remi@gide.net>
To: yminsky@gmail.com
Cc: caml-list@yquem.inria.fr
Subject: Re: [Caml-list] Threads performance issue.
Date: Mon, 16 Feb 2009 18:37:47 +0100	[thread overview]
Message-ID: <2184b2340902160937i53b8f3fbga01eaf14ed829f8f@mail.gmail.com> (raw)
In-Reply-To: <891bd3390902160847p25ad3bf1pe59da620dfc667f2@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 5317 bytes --]

Yaron,

I use a slightly modified version of the CSV library's load_rows. Here is
the main code, which is written in a highly imperative style. Perhaps I
should rewrite it in a purely functional style?

The main program is :

open Printf;;
open Sys;;
(* [timed_exec start_message f] prints [start_message] (without a newline),
   evaluates [f ()], prints "done in " followed by the elapsed time and a
   newline, and returns the result of [f ()].

   The elapsed time is the difference between two readings of [time], which
   is [Sys.time] here because of the [open Sys] above, i.e. processor time in
   seconds rather than wall-clock time. *)
let timed_exec start_message f =
  print_string start_message;
  let started_at = time () in
  let result = f () in
  (* Take the second reading only after [f] has finished. *)
  let elapsed = time () -. started_at in
  print_endline ("done in " ^ string_of_float elapsed);
  result;;

(* [run_threaded f] starts a new thread that evaluates [f ()] and returns the
   handle of that thread. The value computed by [f] is discarded.

   No explicit [Thread.exit] is needed: a thread terminates as soon as the
   function it was created with returns.

   Note from the original report: merely having this definition enabled
   (which pulls in the threads library) makes the program really slow, even
   though [run_threaded] is never called. *)
let run_threaded f = Thread.create f ()

(* Benchmark driver: parse both CSV files one after the other, discarding
   every row, and report the time taken for the whole run. *)
let () = timed_exec "Reading data " (fun () ->
  (* Parse one file and make sure its channel is closed afterwards, whether
     parsing succeeds or raises, so no file descriptor is leaked. *)
  let read_file path =
    let chan = open_in path in
    try
      load_rows (fun _ -> ()) chan;
      close_in chan
    with e ->
      close_in_noerr chan;
      raise e
  in
  read_file "file1.csv";
  read_file "file2.csv"
)

The load_rows :
(* [load_rows ?separator ?nread f chan] parses CSV text from the input
   channel [chan] and calls [f] once for every row, in the order the rows
   appear in the input.

   - [separator] is the field delimiter (default [',']).
   - [nread] is the maximum number of rows handed to [f]. A negative value
     (the default, [-1]) means there is no limit. With [nread = 0] nothing is
     read from [chan] and [f] is never called.
   - [f] receives a row as a list of fields, leftmost field first. An empty
     unquoted field is passed as [None]; every other field, including an
     empty quoted one, is passed as [Some text].
   - [chan] is read one character at a time and is not closed here.

   Carriage return characters are dropped wherever they occur. Inside a
   quoted field a doubled quote character stands for one literal quote, and
   a quote followed by the digit 0 stands for the NUL character.

   If the input ends without a final newline, the pending row is still
   delivered to [f].

   Raises [Bad_CSV_file] if the input ends inside a quoted field.

   The current field is accumulated in a [Buffer] rather than in a list of
   characters copied into a mutable string, which avoids one list cell per
   character and does not depend on in-place string mutation. *)
let load_rows ?(separator = ',') ?(nread = -1) f chan =
  let nr = ref 0 in                  (* Rows handed to [f] so far. *)
  let row = ref [] in                (* Current row, most recent field first. *)
  let field = Buffer.create 64 in    (* Characters of the current field. *)
  let state = ref StartField in      (* Current parser state. *)
  (* Close the current field as [Some text] and get ready for the next one. *)
  let end_of_field () =
    row := Some (Buffer.contents field) :: !row;
    Buffer.clear field;
    state := StartField
  in
  (* Record an empty unquoted field as [None]. *)
  let empty_field () =
    row := None :: !row;
    Buffer.clear field;
    state := StartField
  in
  (* Hand the finished row to [f] in left-to-right order, then reset. *)
  let end_of_row () =
    f (List.rev !row);
    row := [];
    state := StartField;
    incr nr
  in
  (* Advance the state machine by one character (never a carriage return). *)
  let step c =
    match !state with
    | StartField ->                  (* Expecting quote or other char. *)
      if c = '"' then begin
        state := InQuotedField;
        Buffer.clear field
      end else if c = separator then (* Empty field. *)
        empty_field ()
      else if c = '\n' then begin    (* Empty field, end of row. *)
        empty_field ();
        end_of_row ()
      end else begin
        state := InUnquotedField;
        Buffer.clear field;
        Buffer.add_char field c
      end
    | InUnquotedField ->             (* Reading chars to end of field. *)
      if c = separator then          (* End of field. *)
        end_of_field ()
      else if c = '\n' then begin    (* End of field and end of row. *)
        end_of_field ();
        end_of_row ()
      end else
        Buffer.add_char field c
    | InQuotedField ->               (* Reading chars to end of field. *)
      if c = '"' then
        state := InQuotedFieldAfterQuote
      else
        Buffer.add_char field c
    | InQuotedFieldAfterQuote ->
      if c = '"' then begin          (* Doubled quote. *)
        Buffer.add_char field c;
        state := InQuotedField
      end else if c = '0' then begin (* Quote-0 is ASCII NUL. *)
        Buffer.add_char field '\000';
        state := InQuotedField
      end else if c = separator then (* End of field. *)
        end_of_field ()
      else if c = '\n' then begin    (* End of field and end of row. *)
        end_of_field ();
        end_of_row ()
      end else begin                 (* Bad single quote in field. *)
        (* Keep the stray quote and the character after it, and carry on
           inside the quoted field. *)
        Buffer.add_char field '"';
        Buffer.add_char field c;
        state := InQuotedField
      end
  in
  (* The row limit is tested before each read, so [nread = 0] consumes no
     input and the channel is never read past the last requested row. *)
  let rec loop () =
    if nread < 0 || !nr < nread then begin
      let c = input_char chan in
      if c <> '\r' then step c;      (* Always ignore \r characters. *)
      loop ()
    end
  in
  try
    loop ()
  with End_of_file ->
    (* Any part left to write out? *)
    (match !state with
     | StartField ->
       (* A non-empty row here means the input ended right after a
          separator: the last field is empty. *)
       if !row <> [] then begin
         empty_field ();
         end_of_row ()
       end
     | InUnquotedField | InQuotedFieldAfterQuote ->
       end_of_field ();
       end_of_row ()
     | InQuotedField ->
       raise (Bad_CSV_file "Missing end quote after quoted field."))


Thanks,
Rémi

On Mon, Feb 16, 2009 at 17:47, Yaron Minsky <yminsky@gmail.com> wrote:

> 2009/2/16 Rémi Dewitte <remi@gide.net>
>
>> Hello,
>>
>> I would like to read two files in two different threads.
>>
>> I have made a first version reading the first then the second and it takes
>> 2.8s (native).
>>
>> I decided to make a threaded version and, before making any use of threads, I
>> realized that just linking against the threads library, without even using
>> it, makes my first version of the program run in 12s!
>
>
> Do you have a short benchmark you can post?  The idea that the
> thread-overhead would make a difference like that, particularly for IO-bound
> code (which I'm guessing this is) is pretty surprising.
>
> y
>
>
>>
>> I use pcre, extlib, csv libraries as well.
>>
>> I guess it might come from the GC slowing things down here, doesn't it? Where
>> else could it come from? Is there a workaround or something I should
>> know?
>>
>> Can ocaml use multiple cores ?
>>
>> Do you have a few pointers on libraries for doing parallel I/O?
>>
>> Thanks,
>> Rémi
>>
>> _______________________________________________
>> Caml-list mailing list. Subscription management:
>> http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
>> Archives: http://caml.inria.fr
>> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
>> Bug reports: http://caml.inria.fr/bin/caml-bugs
>>
>>
>

[-- Attachment #2: Type: text/html, Size: 9597 bytes --]

  reply	other threads:[~2009-02-16 17:38 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-02-16 15:15 Rémi Dewitte
2009-02-16 15:28 ` [Caml-list] " Michał Maciejewski
2009-02-16 15:32   ` Rémi Dewitte
2009-02-16 15:42     ` David Allsopp
2009-02-16 16:07       ` Rémi Dewitte
2009-02-16 16:32 ` Sylvain Le Gall
2009-02-17 13:52   ` [Caml-list] " Frédéric Gava
2009-02-16 16:47 ` [Caml-list] " Yaron Minsky
2009-02-16 17:37   ` Rémi Dewitte [this message]
2009-02-17  7:40     ` Rémi Dewitte
2009-02-17  8:59       ` Mark Shinwell
2009-02-17  9:09         ` Rémi Dewitte
2009-02-17  9:53         ` Jon Harrop
2009-02-17 10:07       ` Sylvain Le Gall
2009-02-17 10:26         ` [Caml-list] " Mark Shinwell
2009-02-17 10:50           ` Rémi Dewitte
2009-02-17 10:56             ` Mark Shinwell
2009-02-17 11:33             ` Jon Harrop
2009-02-17 12:20         ` Yaron Minsky
2009-02-17 12:26           ` Rémi Dewitte
2009-02-17 17:14           ` Sylvain Le Gall

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2184b2340902160937i53b8f3fbga01eaf14ed829f8f@mail.gmail.com \
    --to=remi@gide.net \
    --cc=caml-list@yquem.inria.fr \
    --cc=yminsky@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).