caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: Michal Moskal <malekith@pld-linux.org>
To: Ram Bhamidipaty <ramb@sonic.net>
Cc: caml-list@pauillac.inria.fr
Subject: Re: [Caml-list] what is the functional way to solve this problem?
Date: Thu, 2 Oct 2003 16:53:27 +0200	[thread overview]
Message-ID: <20031002145327.GA4690@roke.freak> (raw)
In-Reply-To: <16252.6810.502406.679837@oak.ramandgita.com>

[-- Attachment #1: Type: text/plain, Size: 2415 bytes --]

On Thu, Oct 02, 2003 at 05:31:22AM -0700, Ram Bhamidipaty wrote:
> Someone suggested that my previous question about dynamically resizing
> arrays hinted that my solution may be going in a non-functional
> direction. That might be true.
> 
> So here is the problem I am trying to solve. I would like to solve it
> in a "functional way".

Use Map. You won't notice a difference.

[...]
> I have already written a python application that can read this data file and
> generate the data I am looking for. I was not happy with the python solution
> because it was not very fast. Even with using a heap to store the top 100
> largest files I was not able to create a python solution that could beat
> a "grep | sort" pipeline (on unix of course).

[...]

./stats < /shm/d  0.11s user 0.00s system 78% cpu 0.139 total

grep '^F' /shm/d  0.02s user 0.00s system 22% cpu 0.090 total
sort -n -t' ' -k2,2  0.32s user 0.05s system 71% cpu 0.517 total
tail -100  0.02s user 0.00s system 3% cpu 0.514 total

 86287  263948 1912177 /shm/d

Did I win something? ;-)

[...]

> My goal of implementing this in OCaml is to beat the grep+sort
> combination. If I can create a solution that can output all the
> date I want - in one pass - AND still be faster than the grep+sort
> partial solution - _that_ would be cool!
> 
> Having said all that - I wanted to use an array to hold the data for
> each directory. I hoped that using an array would be faster than a
> hash table since I know that the directory numbers are assigned
> sequentially.

Map! Map! Stop thinking in such an imperative way ;-)

Key points of my implementation:

1. It also uses heap (simple one from pure-fun)
2. It is compiled with ocamlopt (ocamlc version is 6 times slower)
3. It doesn't use Scanf. For such linear task as this Scanf takes 10 or
   more times to parse input then actual computations.
4. There is little trick, that heap holds 101 elements, and always
   discards last one, so we don't need to check if newly inserted
   element is really bigger then min we just deleted.
5. The code isn't very pretty (it would be better to make heap functor
   with item count, and so on). It is left as an exercise to the reader.

You're free to use it. I also attach program I used to create input.

-- 
: Michal Moskal :: http://www.kernel.pl/~malekith : GCS {C,UL}++++$ a? !tv
: When in doubt, use brute force. -- Ken Thompson : {E-,w}-- {b++,e}>+++ h

[-- Attachment #2: stats.ml --]
[-- Type: text/plain, Size: 4105 bytes --]

exception Empty

module Heap (Element : Map.OrderedType) =
struct
  module Elem = Element

  type heap = E | T of int * Elem.t * heap * heap

  let rank = function E -> 0 | T (r,_,_,_) -> r

  let makeT x a b =
    if rank a >= rank b then T (rank b + 1, x, a, b)
    else T (rank a + 1, x, b, a)

  let empty = E
  let is_empty h = h = E

  let rec merge h1 h2 = match h1, h2 with
    | _, E -> h1
    | E, _ -> h2
    | T (_, x, a1, b1), T (_, y, a2, b2) ->
        if Elem.compare x y <= 0 then makeT x a1 (merge b1 h2)
        else makeT y a2 (merge h1 b2)

  let insert x h = merge (T (1, x, E, E)) h
  let find_min = function E -> raise Empty | T (_, x, _, _) -> x
  let delete_min = function E -> raise Empty | T (_, x, a, b) -> merge a b

  let flatten x =
    let rec loop acc = function
      | E -> acc
      | T (_, x, a, b) -> loop (loop (x :: acc) a) b
    in loop [] x
end


module Dir = struct
  type t =
    {
      size : int;
      name : string;
      parent : t option;
      no : int;
    }

  let compare x y = x.size - y.size
end

module Int = struct
  type t = int
  let compare x y = x - y
end

module File = struct
  type t =
    {
      size : int;
      dir : Dir.t;
      name : string;
    }

  let compare x y = x.size - y.size
end

module Dir_set = Map.Make(Int)
module Dir_heap = Heap(Dir)
module File_heap = Heap(File)

type line =
  | D of int * int * string
  | F of int * string
  | Eof

let limit = 100

let read () =
  try
    let l = read_line () in
    let rec scan_int pos =
      match l.[pos] with
      | '0' .. '9' -> scan_int (pos + 1)
      | _ -> pos
    in
    let int_from pos =
      let xend = scan_int pos in
      (int_of_string (String.sub l pos (xend - pos)), xend + 1)
    in
    match l.[0] with
    | 'D' ->
      let (x, p) = int_from 2 in
      let (y, p) = int_from p in
      let s = String.sub l p (String.length l - p) in
      D (x, y, s)
    | 'F' -> 
      let (x, p) = int_from 2 in
      let s = String.sub l p (String.length l - p) in
      F (x, s)
    | _ -> invalid_arg "read"
  with End_of_file -> Eof

let rec loop dh fh ds cur_dir = 
  let push_dir () =
    let (dh_sz, dh) = dh in
    if dh_sz > limit then 
      (dh_sz, Dir_heap.insert cur_dir (Dir_heap.delete_min dh))
    else 
      (dh_sz + 1, Dir_heap.insert cur_dir dh)
  in
  
  function
  | Eof ->
    (push_dir (), fh)
  | D (no, par, nam) ->
    let cur_dir =
      {
        Dir.parent = Some (Dir_set.find par ds);
        Dir.name = nam;
        Dir.size = 0;
        Dir.no = no;
      }
    in loop (push_dir ()) fh (Dir_set.add cur_dir.Dir.no cur_dir ds) cur_dir (read ())
  | F (sz, name) ->
    let f = { File.size = sz; File.name = name; File.dir = cur_dir } in
    let fh =
      let (fh_sz, fh) = fh in
      if fh_sz > limit then 
        (fh_sz, File_heap.insert f (File_heap.delete_min fh))
      else 
        (fh_sz + 1, File_heap.insert f fh)
    in loop dh fh ds {cur_dir with Dir.size = cur_dir.Dir.size + sz} (read ())

let main =
  let start nam =
    let dir = { Dir.name = nam; Dir.size = 0; Dir.no = 0; Dir.parent = None } in
    let ds = Dir_set.add 0 dir Dir_set.empty in
    loop (0, Dir_heap.empty) (0, File_heap.empty) ds dir (read ()) 
  in
  let l = read_line () in
  let (dh, fh) = Scanf.sscanf l "R 0 %[^\n]" start in
  let dir_name d =
    let rec loop acc = function
      | { Dir.name = n; Dir.parent = None } -> n :: acc
      | { Dir.name = n; Dir.parent = Some p } -> loop (n :: acc) p
    in String.concat "/" (loop [] d)
  in
  let print_dir d = 
    Printf.printf "%10d %s\n" d.Dir.size (dir_name d) 
  in
  let print_file f = 
    Printf.printf "%10d %s/%s\n" f.File.size (dir_name f.File.dir) f.File.name
  in
  let dh =
    let (dh_sz, dh) = dh in
    if dh_sz > limit then Dir_heap.delete_min dh
    else dh
  in 
  Printf.printf "DIRS:\n";
  List.iter print_dir (List.sort Dir.compare (Dir_heap.flatten dh));
  let fh =
    let (fh_sz, fh) = fh in
    if fh_sz > limit then File_heap.delete_min fh
    else fh
  in 
  Printf.printf "FILES:\n";
  List.iter print_file (List.sort File.compare (File_heap.flatten fh));

[-- Attachment #3: mkd.ml --]
[-- Type: text/plain, Size: 695 bytes --]

let id = ref 0

let rec go dir_name par_id =
  Unix.chdir dir_name;
  incr id;
  let id = !id in
  Printf.printf "D %d %d %s\n" id par_id dir_name;
  let d = Unix.opendir "." in
  let dirs = ref [] in
  let rec loop () =
    let f = Unix.readdir d in
    let st = Unix.lstat f in
    let _ =
      match f.[0], st.Unix.st_kind with
      | '.', _ -> ()
      | _, Unix.S_DIR -> dirs := f :: !dirs
      | _, Unix.S_REG -> Printf.printf "F %d %s\n" st.Unix.st_size f
      | _ -> ()
    in loop ()
  in
  begin
    List.iter (fun f -> go f id) 
      (try
        loop ()
       with End_of_file -> Unix.closedir d; !dirs)
  end;
  Unix.chdir ".."

let _ =
  Printf.printf "R 0 .\n";
  go "." 0


  reply	other threads:[~2003-10-02 14:55 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-10-02 12:31 Ram Bhamidipaty
2003-10-02 14:53 ` Michal Moskal [this message]
2003-10-08 14:48   ` Pierre Weis
2003-10-08 16:09     ` Michal Moskal
2003-10-08 20:34       ` Pierre Weis
2003-10-08 21:22         ` scanf (Re: [Caml-list] what is the functional way to solve this problem?) Michal Moskal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20031002145327.GA4690@roke.freak \
    --to=malekith@pld-linux.org \
    --cc=caml-list@pauillac.inria.fr \
    --cc=ramb@sonic.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).