caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: "Jonathan Roewen" <jonathan.roewen@gmail.com>
Cc: caml-list@inria.fr
Subject: Re: [Caml-list] Re: zcat vs CamlZip
Date: Wed, 30 Aug 2006 12:53:37 +1200	[thread overview]
Message-ID: <ad8cfe7e0608291753q28a80434ya622015348262647@mail.gmail.com> (raw)
In-Reply-To: <Pine.LNX.4.64.0608300433220.8632@home.oyster.ru>

Have you tried the Unzip module from Extlib? I haven't tried it myself, but I
plan on using it later on.

Jonathan

On 8/30/06, malc <malc@pulsesoft.com> wrote:
> On Tue, 29 Aug 2006, Gerd Stolpmann wrote:
>
> > Am Dienstag, den 29.08.2006, 15:15 -0400 schrieb Sam Steingold:
> >> at any rate, do you really expect that using Gzip.input and then
> >> searching the result for a newline, slicing and dicing to get the
> >> individual input lines, &c &c would be faster?
> >
> > Ah yes, and there is an easy solution with ocamlnet:
>
> [..snip..]
>
> > This adds a buffering layer.
>
> The Netchannels buffering looks very elegant, but my (admittedly rather
> cursory) testing shows that it's also rather slow.
>
> The following code implements 4 line readers:
> Sam's original [char]
> Netchannels [net]
> open_process_in [zcat]
> and buffered (trying to stay compatible with original interface) [block]
>
> While Netchannels does win over the original implementation, it loses to
> all other methods (on my machine).
>
> (* Scratch buffer reused by every call to [gz_input_line]; it is cleared at
>    the start of each call. *)
> let buf = Buffer.create 1024
> (* [gz_input_line gz_in char_counter line_counter] reads one line from the
>    gzip channel [gz_in], one character at a time, and returns it without
>    the terminating newline.
>    [char_counter] is incremented for every character read (newline
>    included); [line_counter] is incremented once per returned line.
>    A final line that is not newline-terminated is still returned.
>    Raises [End_of_file] when the stream ends before any character of a new
>    line has been read. *)
> let gz_input_line gz_in char_counter line_counter =
>   Buffer.clear buf;
>   (* Accumulate characters into [buf] until a newline has been consumed. *)
>   let rec fill () =
>     let c = Gzip.input_char gz_in in
>     char_counter := Int64.succ !char_counter;
>     match c with
>     | '\n' -> ()
>     | _ -> Buffer.add_char buf c; fill ()
>   in
>   (try fill ()
>    with End_of_file ->
>      (* End of stream is only an error when nothing was accumulated. *)
>      if Buffer.length buf = 0 then raise End_of_file);
>   incr line_counter;
>   Buffer.contents buf
>
> (* Adapter that exposes the Gzip input channel [gzip_ch] through the
>    [Netchannels.rec_in_channel] interface. *)
> class input_gzip_rec gzip_ch : Netchannels.rec_in_channel =
> object
>   (* Read up to [l] bytes into [s] starting at offset [p]; a zero-byte read
>      from [Gzip.input] is turned into [End_of_file]. *)
>   method input s p l =
>     match Gzip.input gzip_ch s p l with
>     | 0 -> raise End_of_file
>     | n -> n
>   (* Close the underlying gzip channel. *)
>   method close_in () = Gzip.close_in gzip_ch
> end
>
> (* [wrap_gz gz_in] builds a block-buffered line reader over the gzip
>    channel [gz_in] and returns it as a function
>    [reader char_counter line_counter].
>
>    Each call to the reader returns the next line without its newline.
>    [char_counter] is advanced by the number of bytes of every block read
>    from [gz_in]; [line_counter] is incremented once per returned line.
>    A final line that is not newline-terminated is returned as well (the
>    same contract as [gz_input_line]); the call after that raises
>    [End_of_file].
>
>    Internally the reader is a small state machine: [r] always holds the
>    continuation to run on the next call, either "read a fresh block"
>    ([cont 0]) or "keep scanning the rest of the current block"
>    ([subcont pos len]). *)
> let wrap_gz gz_in =
>   let s = String.create 4096 in
>   (* [b] accumulates the pieces of a line that spans several blocks. *)
>   let b = Buffer.create 1024 in
>   let r = ref (fun _ _ -> assert false) in
>   (* Position of the first newline in [s] within [start, finish), if any. *)
>   let findlf s start finish =
>     let rec loop pos = if pos >= finish then None
>     else if String.unsafe_get s pos = '\n' then Some pos else loop (succ pos)
>     in loop start
>   in
>   let rec cont pos char_counter line_counter =
>     let n = Gzip.input gz_in s pos (String.length s - pos) in
>     let rec subcont pos len char_counter line_counter =
>       let finish = pos + len in
>       match findlf s pos finish with
>       | None ->
>           (* No newline in the rest of this block: stash it and refill. *)
>           Buffer.add_substring b s pos len;
>           cont 0 char_counter line_counter
>
>       | Some lfpos ->
>           let runlen = lfpos - pos in
>           incr line_counter;
>           Buffer.add_substring b s pos runlen;
>           let line = Buffer.contents b in
>           Buffer.clear b;
>           (* Resume after the newline on the next call. *)
>           r := subcont (succ lfpos) (len - succ runlen);
>           line
>     in
>     if n = 0
>     then (
>       (* End of stream.  If a partial line is pending, hand it out once
>          instead of dropping it. *)
>       if Buffer.length b = 0 then raise End_of_file;
>       incr line_counter;
>       let line = Buffer.contents b in
>       Buffer.clear b;
>       (* Re-arm with a fresh read so that the next call sees an empty
>          buffer and raises End_of_file rather than replaying stale data. *)
>       r := cont 0;
>       line
>      )
>     else (
>       char_counter := Int64.add (Int64.of_int n) !char_counter;
>       subcont pos n char_counter line_counter
>      )
>   in
>   let exec c l = !r c l in
>   r := cont 0;
>   exec
>
> (* Benchmark driver for the character-at-a-time reader: decompresses stdin
>    with [gz_input_line] until [End_of_file], then prints the character and
>    line totals. *)
> let char () =
>   let chars = ref 0L in
>   let lines = ref 0 in
>   let gz = Gzip.open_in_chan stdin in
>   (* Loops until the reader raises; the lines themselves are discarded. *)
>   let rec drain () =
>     ignore (gz_input_line gz chars lines);
>     drain ()
>   in
>   try drain ()
>   with End_of_file ->
>     Format.printf "cc=%Ld lc=%d@." !chars !lines
>
> (* Benchmark driver for the block-buffered reader: decompresses stdin
>    through the reader built by [wrap_gz] until [End_of_file], then prints
>    the character and line totals. *)
> let block () =
>   let next_line = wrap_gz (Gzip.open_in_chan stdin) in
>   let chars = ref 0L and lines = ref 0 in
>   let report () = Format.printf "cc=%Ld lc=%d@." !chars !lines in
>   try
>     while true do
>       (* The reader updates both counters itself; the line is discarded. *)
>       ignore (next_line chars lines)
>     done
>   with End_of_file -> report ()
>
> (* Benchmark driver that delegates decompression to an external [zcat]
>    process (which inherits this program's stdin) and reads its output with
>    [input_line].  Prints the character and line totals at end of file.
>    The child is reaped with [Unix.close_process_in]; if it did not exit
>    with status 0 a diagnostic is written to stderr. *)
> let zcat () =
>   let ic = Unix.open_process_in "zcat" in
>   let cc = ref 0L in
>   let lc = ref 0 in
>   try
>     while true
>     do
>       let line = input_line ic in
>       (* +1 for the newline that input_line strips.  NOTE(review): this
>          overcounts by one when the last line has no trailing newline. *)
>       cc := Int64.add (Int64.of_int (String.length line + 1)) !cc;
>       incr lc
>     done
>   with End_of_file ->
>     (* Close the pipe and wait for the child so that it is not left
>        unreaped and its failure is not silently ignored. *)
>     let status = Unix.close_process_in ic in
>     Format.printf "cc=%Ld lc=%d@." !cc !lc;
>     (match status with
>      | Unix.WEXITED 0 -> ()
>      | Unix.WEXITED _ | Unix.WSIGNALED _ | Unix.WSTOPPED _ ->
>          prerr_endline "zcat: child process did not exit successfully")
>
> (* Benchmark driver for the Netchannels reader: wraps the gzip stream on
>    stdin in [input_gzip_rec], lifts it with [Netchannels.lift_in], and reads
>    lines until [End_of_file], then prints the character and line totals. *)
> let net () =
>   let chars = ref 0L in
>   let lines = ref 0 in
>   let source = new input_gzip_rec (Gzip.open_in_chan stdin) in
>   let ch = Netchannels.lift_in (`Rec source) in
>   let rec drain () =
>     let line = ch#input_line () in
>     (* +1 accounts for the line terminator not included in [line]. *)
>     chars := Int64.add (Int64.of_int (String.length line + 1)) !chars;
>     incr lines;
>     drain ()
>   in
>   try drain ()
>   with End_of_file ->
>     Format.printf "cc=%Ld lc=%d@." !chars !lines
>
> (* Entry point: run the benchmark named by the single command-line
>    argument, or print a usage line to stderr for anything else. *)
> let () =
>   let usage () = prerr_endline (Sys.argv.(0) ^ ": [char|zcat|block|net]") in
>   if Array.length Sys.argv <> 2 then usage ()
>   else
>     match Sys.argv.(1) with
>     | "char" -> char ()
>     | "zcat" -> zcat ()
>     | "block" -> block ()
>     | "net" -> net ()
>     | _ -> usage ()
>
> --
> mailto:malc@pulsesoft.com
>
> _______________________________________________
> Caml-list mailing list. Subscription management:
> http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
> Archives: http://caml.inria.fr
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs
>


  reply	other threads:[~2006-08-30  0:53 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-29 18:40 Sam Steingold
2006-08-29 18:54 ` Bardur Arantsson
2006-08-29 19:01   ` [Caml-list] " Florian Hars
2006-08-29 19:15   ` Sam Steingold
2006-08-29 19:48     ` Bárður Árantsson
2006-08-29 19:54     ` [Caml-list] " Gerd Stolpmann
2006-08-29 20:04     ` Gerd Stolpmann
2006-08-30  0:44       ` malc
2006-08-30  0:53         ` Jonathan Roewen [this message]
2006-08-29 19:37   ` John Carr
2006-08-29 19:11 ` [Caml-list] " Eric Cooper
2006-08-30  6:12 ` Jeff Henrikson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ad8cfe7e0608291753q28a80434ya622015348262647@mail.gmail.com \
    --to=jonathan.roewen@gmail.com \
    --cc=caml-list@inria.fr \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).