caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: "Rémi Dewitte" <remi@gide.net>
To: yminsky@gmail.com
Cc: Sylvain Le Gall <sylvain@le-gall.net>, caml-list@inria.fr
Subject: Re: [Caml-list] Re: Threads performance issue.
Date: Tue, 17 Feb 2009 13:26:32 +0100	[thread overview]
Message-ID: <2184b2340902170426n381d4d7dv2d4d3ea2bad6701f@mail.gmail.com> (raw)
In-Reply-To: <891bd3390902170420i6e7647ccl44c55456952d3f5a@mail.gmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 2424 bytes --]

Avoiding channels, and using either file descriptors or a bigarray instead,
works well in my case.

Good to know that one has to be careful with channels when working with
OCaml ;) !

Rémi

2009/2/17 Yaron Minsky <yminsky@gmail.com>

> Interestingly, this probably has nothing to do with the size of the
> buffer.  input_char actually acquires and releases a lock for every single
> call, whether or not an underlying system call is required to fill the
> buffer.  This has always struck me as an odd aspect of the in/out channel
> implementation, and means that IO is a lot more expensive in a threaded
> context than it should be.
>
> At Jane Street, performance-sensitive code tends to use other libraries
> that we've built directly on top of file descriptors, which batch the IO
> and don't require constant lock acquisition.
>
> y
>
>
> On Tue, Feb 17, 2009 at 5:07 AM, Sylvain Le Gall <sylvain@le-gall.net>wrote:
>
>> On 17-02-2009, Rémi Dewitte <remi@gide.net> wrote:
>> >
>> > test.csv is a 21mo file with ~13k rows and a thousands of columns on a
>> > 15rpm disk.
>> >
>> > ocaml version : 3.11.0
>> >
>>
>> You are using input_char and a standard IO channel. This is a good choice
>> for a non-threaded program. But in your case, I would use Unix.read with a
>> big buffer (32KB to 4MB) and change your program to use it. As
>> benchmarked by Jon Harrop, you are spending most of your time in
>> caml_enter|leave_blocking_section. I think it comes from reading using a
>> std IO channel, which uses a 4k buffer. Using a bigger buffer will allow
>> fewer calls to these two functions (but you won't win time at the end, I
>> think you will just reduce the difference between non-threaded and
>> threaded code).
>>
>> Regards
>> Sylvain Le Gall
>>
>> _______________________________________________
>> Caml-list mailing list. Subscription management:
>> http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
>> Archives: http://caml.inria.fr
>> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
>> Bug reports: http://caml.inria.fr/bin/caml-bugs
>>
>
>
> _______________________________________________
> Caml-list mailing list. Subscription management:
> http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
> Archives: http://caml.inria.fr
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs
>
>

[-- Attachment #1.2: Type: text/html, Size: 3792 bytes --]

[-- Attachment #2: transi2.ml --]
[-- Type: text/x-ocaml, Size: 3737 bytes --]

(* open ExtLib *)

(** Slightly modified copy from module CSV *)

(** Raised on malformed CSV input.  The string is a human-readable
    description of the problem; the only raise site in this file is the
    end-of-input handling of [load_rows], for a quoted field that is never
    closed. *)
exception Bad_CSV_file of string

(** States of the character-level CSV scanner used by [load_rows]. *)
type state_t =
  | StartField               (** At the first character of a field. *)
  | InUnquotedField          (** Inside a field that did not open with ["]. *)
  | InQuotedField            (** Inside a field that opened with ["]. *)
  | InQuotedFieldAfterQuote  (** Just saw a ["] while inside a quoted field. *)

(** [load_rows ?separator ?nread f file] parses [file] as CSV and calls [f]
    once per row, in file order.

    Each row is a [string option list]: [None] stands for an empty unquoted
    field and [Some s] for any other field (an empty quoted field yields
    [Some ""]).  Carriage returns are dropped wherever they occur.  Inside a
    quoted field a doubled quote stands for one quote, quote-[0] stands for
    the NUL character, and a lone quote followed by any other character is
    kept literally together with that character.  A final row that is not
    terminated by a newline is still delivered to [f].

    The file is read through [Unix.read] in 2 MiB chunks rather than through
    an [in_channel].

    @param separator field separator, [','] by default.
    @param nread when positive, stop after that many rows have been passed
      to [f]; the default [-1] (like any value [<= 0]) reads the whole file.
    @raise Bad_CSV_file if the input ends inside a quoted field.
    @raise Unix.Unix_error if [file] cannot be opened or read.

    The descriptor is closed before returning, including when [f], the
    parser or [Unix.read] raises. *)
let load_rows ?(separator = ',') ?(nread = -1) f file =
  let nr = ref 0 in                 (* Rows handed to [f] so far. *)
  let row = ref [] in               (* Current row, most recent field first. *)
  let field = Buffer.create 64 in   (* Characters of the current field. *)
  let state = ref StartField in     (* Current scanner state. *)
  let end_of_field () =
    row := Some (Buffer.contents field) :: !row;
    Buffer.clear field;
    state := StartField
  in
  let empty_field () =
    row := None :: !row;
    Buffer.clear field;
    state := StartField
  in
  let end_of_row () =
    let row_list = List.rev !row in
    f row_list;
    row := [];
    state := StartField;
    nr := !nr + 1
  in
  let process c =
    (* Structural [<>]: physical [!=] is not the intended comparison. *)
    if c <> '\r' then begin          (* Always ignore \r characters. *)
      match !state with
      | StartField ->                (* Expecting quote or other char. *)
          if c = '"' then begin
            state := InQuotedField;
            Buffer.clear field
          end else if c = separator then   (* Empty field. *)
            empty_field ()
          else if c = '\n' then begin      (* Empty field, end of row. *)
            empty_field ();
            end_of_row ()
          end else begin
            state := InUnquotedField;
            Buffer.clear field;
            Buffer.add_char field c
          end
      | InUnquotedField ->           (* Reading chars to end of field. *)
          if c = separator then            (* End of field. *)
            end_of_field ()
          else if c = '\n' then begin      (* End of field and end of row. *)
            end_of_field ();
            end_of_row ()
          end else
            Buffer.add_char field c
      | InQuotedField ->             (* Reading chars to end of field. *)
          if c = '"' then
            state := InQuotedFieldAfterQuote
          else
            Buffer.add_char field c
      | InQuotedFieldAfterQuote ->
          if c = '"' then begin            (* Doubled quote. *)
            Buffer.add_char field c;
            state := InQuotedField
          end else if c = '0' then begin   (* Quote-0 is ASCII NUL. *)
            Buffer.add_char field '\000';
            state := InQuotedField
          end else if c = separator then   (* End of field. *)
            end_of_field ()
          else if c = '\n' then begin      (* End of field and end of row. *)
            end_of_field ();
            end_of_row ()
          end else begin                   (* Bad single quote in field. *)
            Buffer.add_char field '"';
            Buffer.add_char field c;
            state := InQuotedField
          end
    end
  in
  (* True once the caller's row limit, if any, has been reached. *)
  let limit_reached () = nread > 0 && !nr = nread in
  let file_in = Unix.openfile file [Unix.O_RDONLY] 0o640 in
  (* Closing is best effort: a failing [close] must not mask the parse
     result, but only [Unix_error] is swallowed. *)
  let close_file () =
    try Unix.close file_in with Unix.Unix_error _ -> ()
  in
  (* Deliver whatever is pending when the input ends without a newline. *)
  let flush_pending () =
    match !state with
    | StartField ->
        if !row <> [] then begin
          empty_field ();
          end_of_row ()
        end
    | InUnquotedField | InQuotedFieldAfterQuote ->
        end_of_field ();
        end_of_row ()
    | InQuotedField ->
        raise (Bad_CSV_file "Missing end quote after quoted field.")
  in
  let buffer_length = 2 * 1024 * 1024 in
  (* [Unix.read] fills the buffer in place, hence [Bytes]. *)
  let buffer = Bytes.create buffer_length in
  (* Scan exactly the [n] bytes just read: indices [0 .. n - 1]. *)
  let process_buffer n =
    let i = ref 0 in
    while !i < n && not (limit_reached ()) do
      process (Bytes.get buffer !i);
      incr i
    done
  in
  let rec read_all () =
    if not (limit_reached ()) then begin
      let n = Unix.read file_in buffer 0 buffer_length in
      if n > 0 then begin
        process_buffer n;
        read_all ()
      end else
        flush_pending ()             (* [n = 0]: end of file. *)
    end
  in
  (try read_all () with e -> close_file (); raise e);
  close_file ()

(** [run_threaded f] starts a new thread that runs [f ()] and then calls
    [Thread.exit ()]; the handle of the new thread is returned. *)
let run_threaded f =
  let body () =
    f ();
    Thread.exit ()
  in
  Thread.create body ()

(* Parse the three sample files at module initialisation, discarding every
   row.  [load_rows] returns [unit], so each binding is just [()]. *)
let t1 : unit = load_rows ignore "test.csv"
let t2 : unit = load_rows ignore "test2.csv"
let t3 : unit = load_rows ignore "test3.csv"

[-- Attachment #3: transimm.ml --]
[-- Type: text/x-ocaml, Size: 3514 bytes --]

(* open ExtLib *)

open Bigarray

(** Slightly modified copy from module CSV *)

(** Raised on malformed CSV input.  The string is a human-readable
    description of the problem; the only raise site in this file is the
    end-of-input handling of [load_rows], for a quoted field that is never
    closed. *)
exception Bad_CSV_file of string

(** States of the character-level CSV scanner used by [load_rows]. *)
type state_t =
  | StartField               (** At the first character of a field. *)
  | InUnquotedField          (** Inside a field that did not open with ["]. *)
  | InQuotedField            (** Inside a field that opened with ["]. *)
  | InQuotedFieldAfterQuote  (** Just saw a ["] while inside a quoted field. *)

(** [load_rows ?separator ?nread f file] parses [file] as CSV and calls [f]
    once per row, in file order.  The file is accessed through a read-only
    memory mapping instead of an [in_channel].

    Each row is a [string option list]: [None] stands for an empty unquoted
    field and [Some s] for any other field (an empty quoted field yields
    [Some ""]).  Carriage returns are dropped wherever they occur.  Inside a
    quoted field a doubled quote stands for one quote, quote-[0] stands for
    the NUL character, and a lone quote followed by any other character is
    kept literally together with that character.  A final row that is not
    terminated by a newline is still delivered to [f].

    @param separator field separator, [','] by default.
    @param nread when non-negative, stop once that many rows have been
      passed to [f] ([0] reads no rows); the default [-1] reads the whole
      file.
    @raise Bad_CSV_file if the input ends inside a quoted field.
    @raise Unix.Unix_error if [file] cannot be opened.

    The descriptor is closed before returning, including when [f], the
    parser or the mapping raises. *)
let load_rows ?(separator = ',') ?(nread = -1) f file =
  let nr = ref 0 in                 (* Rows handed to [f] so far. *)
  let row = ref [] in               (* Current row, most recent field first. *)
  let field = Buffer.create 64 in   (* Characters of the current field. *)
  let state = ref StartField in     (* Current scanner state. *)
  let end_of_field () =
    row := Some (Buffer.contents field) :: !row;
    Buffer.clear field;
    state := StartField
  in
  let empty_field () =
    row := None :: !row;
    Buffer.clear field;
    state := StartField
  in
  let end_of_row () =
    let row_list = List.rev !row in
    f row_list;
    row := [];
    state := StartField;
    nr := !nr + 1
  in
  let process c =
    (* Structural [<>]: physical [!=] is not the intended comparison. *)
    if c <> '\r' then begin          (* Always ignore \r characters. *)
      match !state with
      | StartField ->                (* Expecting quote or other char. *)
          if c = '"' then begin
            state := InQuotedField;
            Buffer.clear field
          end else if c = separator then   (* Empty field. *)
            empty_field ()
          else if c = '\n' then begin      (* Empty field, end of row. *)
            empty_field ();
            end_of_row ()
          end else begin
            state := InUnquotedField;
            Buffer.clear field;
            Buffer.add_char field c
          end
      | InUnquotedField ->           (* Reading chars to end of field. *)
          if c = separator then            (* End of field. *)
            end_of_field ()
          else if c = '\n' then begin      (* End of field and end of row. *)
            end_of_field ();
            end_of_row ()
          end else
            Buffer.add_char field c
      | InQuotedField ->             (* Reading chars to end of field. *)
          if c = '"' then
            state := InQuotedFieldAfterQuote
          else
            Buffer.add_char field c
      | InQuotedFieldAfterQuote ->
          if c = '"' then begin            (* Doubled quote. *)
            Buffer.add_char field c;
            state := InQuotedField
          end else if c = '0' then begin   (* Quote-0 is ASCII NUL. *)
            Buffer.add_char field '\000';
            state := InQuotedField
          end else if c = separator then   (* End of field. *)
            end_of_field ()
          else if c = '\n' then begin      (* End of field and end of row. *)
            end_of_field ();
            end_of_row ()
          end else begin                   (* Bad single quote in field. *)
            Buffer.add_char field '"';
            Buffer.add_char field c;
            state := InQuotedField
          end
    end
  in
  let file_in = Unix.openfile file [Unix.O_RDONLY] 0o640 in
  (* Closing is best effort: a failing [close] must not mask the parse
     result, but only [Unix_error] is swallowed. *)
  let close_file () =
    try Unix.close file_in with Unix.Unix_error _ -> ()
  in
  (* Deliver whatever is pending when the input ends without a newline. *)
  let flush_pending () =
    match !state with
    | StartField ->
        if !row <> [] then begin
          empty_field ();
          end_of_row ()
        end
    | InUnquotedField | InQuotedFieldAfterQuote ->
        end_of_field ();
        end_of_row ()
    | InQuotedField ->
        raise (Bad_CSV_file "Missing end quote after quoted field.")
  in
  let parse () =
    (* Dimension [-1] lets the mapping take its length from the file. *)
    let mmap =
      Bigarray.array1_of_genarray
        (Unix.map_file file_in Bigarray.char Bigarray.c_layout false [| -1 |])
    in
    let l = Bigarray.Array1.dim mmap in
    let i = ref 0 in
    (* The condition is tested before every character, so an empty mapping
       is never indexed and [nread = 0] parses nothing. *)
    while (nread < 0 || !nr < nread) && !i < l do
      process (Bigarray.Array1.get mmap !i);
      incr i
    done;
    flush_pending ()
  in
  (try parse () with e -> close_file (); raise e);
  close_file ()
;;

(** [run_threaded f] builds a thread body that runs [f ()] and then calls
    [Thread.exit ()], and partially applies [Thread.create] to it.  The
    result is a function: no thread is started until it is applied to [()].

    NOTE(review): the final [()] argument to [Thread.create] looks
    accidentally omitted; confirm with callers before changing the type. *)
let run_threaded f =
  let body () =
    f ();
    Thread.exit ()
  in
  Thread.create body
;;

(* Parse the three sample files at module initialisation, discarding every
   row. *)
let () = load_rows ignore "test.csv"
let () = load_rows ignore "test2.csv"
let () = load_rows ignore "test3.csv"

  reply	other threads:[~2009-02-17 12:26 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-02-16 15:15 Rémi Dewitte
2009-02-16 15:28 ` [Caml-list] " Michał Maciejewski
2009-02-16 15:32   ` Rémi Dewitte
2009-02-16 15:42     ` David Allsopp
2009-02-16 16:07       ` Rémi Dewitte
2009-02-16 16:32 ` Sylvain Le Gall
2009-02-17 13:52   ` [Caml-list] " Frédéric Gava
2009-02-16 16:47 ` [Caml-list] " Yaron Minsky
2009-02-16 17:37   ` Rémi Dewitte
2009-02-17  7:40     ` Rémi Dewitte
2009-02-17  8:59       ` Mark Shinwell
2009-02-17  9:09         ` Rémi Dewitte
2009-02-17  9:53         ` Jon Harrop
2009-02-17 10:07       ` Sylvain Le Gall
2009-02-17 10:26         ` [Caml-list] " Mark Shinwell
2009-02-17 10:50           ` Rémi Dewitte
2009-02-17 10:56             ` Mark Shinwell
2009-02-17 11:33             ` Jon Harrop
2009-02-17 12:20         ` Yaron Minsky
2009-02-17 12:26           ` Rémi Dewitte [this message]
2009-02-17 17:14           ` Sylvain Le Gall

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2184b2340902170426n381d4d7dv2d4d3ea2bad6701f@mail.gmail.com \
    --to=remi@gide.net \
    --cc=caml-list@inria.fr \
    --cc=sylvain@le-gall.net \
    --cc=yminsky@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).