caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
* [Caml-list] Performance problem
@ 2003-05-14 21:03 Matt Gushee
  2003-05-14 22:13 ` Ville-Pertti Keinonen
  0 siblings, 1 reply; 6+ messages in thread
From: Matt Gushee @ 2003-05-14 21:03 UTC (permalink / raw)
  To: caml-list

Hello, all--

I am writing a utility that obtains a list of available fonts on a
system and outputs them as a list of OCaml records. I am currently
testing this on Linux, where the font list is obtained with 

  Unix.open_process_in("xlsfonts -d " ^ disp ^ " |sort |uniq")

where disp is the result of Unix.getenv "DISPLAY" or a default value of
":0.0". Each line is then parsed using a regular expression. For further
detail, see the complete code below.

Well, the program seems to work correctly, but it is extremely slow. My
test program calls get_font_info () once, then prints "Done" and exits.
But with the test program compiled to *native code*, GNU time reports:

  real    4m2.329s
  user    3m54.800s
  sys     0m0.090s

Granted, the machine I'm running it on is a bit slow (Pentium 200), but
these figures seem way beyond the pale. There must be a bug either in my
code or one of the supporting libraries. The only obvious possibilities
that occur to me are an output buffering problem or some delay in
interacting with the X display, but I'm not sure how to solve either of
those. And maybe it's something else entirely. Can anyone explain my
terrible results?

---- fontInfo.ml ----------------------------------------------------

exception NotSupported
exception InvalidXLFD

type font_weight =
  [ `Medium
  | `Bold
  | `DemiBold
  | `ExtraBold
  | `Light
  ]
and font_style =
  [ `Roman
  | `Italic
  | `Oblique
  ]
and font_stretch =
  [ `Normal
  | `Condensed
  | `Extended
  ]
and font_class =
  [ `Serif
  | `SansSerif
  | `Monospaced
  | `Cursive
  | `Fantasy
  ]

type font_variant = {
  ptsize: int;
  weight: font_weight;
  style: font_style;
  stretch: font_stretch;
  }
type font = {
  id: string;
  family: string;
  scalable: bool;
  variants: font_variant array;
  encoding: string;
  }


(* foundry, Family, Weight, Slant, Stretch, ?, Ptsize, pxlsize, ht, width, MCP, ?,
   EncA, EncB *)
let xlfd_re = Str.regexp "-.*-\(.*\)-\(.*\)-\(.*\)-\(.*\)-.*-\(.*\)-.*-.*-.*-\(.*\)-.*-\(.*-.*\)"

let parse_xlfd xlfd =
  let m = Str.string_match xlfd_re xlfd 0 in
  if not m then raise InvalidXLFD;
  try
    let size = int_of_string (Str.matched_group 5 xlfd) in
    let scalable = (size = 0)
    and family = Str.matched_group 1 xlfd 
    and encoding = Str.matched_group 7 xlfd
    and var =
      let weight =
        match (Str.matched_group 2 xlfd) with
        | "medium" | "book" -> `Medium
        | "bold" -> `Bold
        | "demibold" -> `DemiBold
        | "light" -> `Light
        | _ -> `Medium
      and style =
        match (Str.matched_group 3 xlfd) with
        | "o" -> `Oblique
        | "i" -> `Italic
        | _ -> `Roman
      and stretch =
        match (Str.matched_group 4 xlfd) with
        | "condensed" -> `Condensed 
        | "extended" -> `Extended
        | _ -> `Normal in
      { ptsize = size;
        weight = weight;
        style = style;
        stretch = stretch; } in
    (family, scalable, encoding, var)
  with _ -> raise InvalidXLFD

let parse_xlfds xlfds =
  let rec raw_parse_xlfds xlfds' =
    match xlfds' with
    | [] -> []
    | h :: t ->
      try
        let fdata = parse_xlfd h in
        fdata :: raw_parse_xlfds t
      with InvalidXLFD ->
        (* Normal results include some lines that are not XLFDs, so
           we just ignore those. *)
        prerr_endline "WARNING: Invalid XLFD";
        flush stderr;
        raw_parse_xlfds t in
  raw_parse_xlfds xlfds

let get_x_fonts disp =
  let rec getlines ch =
    try
      let line = input_line ch in
      line :: getlines ch
    with End_of_file ->
      [] in
  let fonts =
    getlines (Unix.open_process_in ("xlsfonts -d " ^ disp ^ " |sort |uniq")) in
  fonts

let get_font_info () =
  match Sys.os_type with
  | "Unix" ->
    let disp = 
      try
        Unix.getenv "DISPLAY"
      with Not_found ->
        (* try a reasonable default *)
        ":0.0" in
    parse_xlfds (get_x_fonts disp)
  | _ -> raise NotSupported



-- 
Matt Gushee                 When a nation follows the Way,
Englewood, Colorado, USA    Horses bear manure through
mgushee@havenrock.com           its fields;
http://www.havenrock.com/   When a nation ignores the Way,
                            Horses bear soldiers through
                                its streets.
                                
                            --Lao Tzu (Peter Merel, trans.)

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Caml-list] Performance problem
  2003-05-14 21:03 [Caml-list] Performance problem Matt Gushee
@ 2003-05-14 22:13 ` Ville-Pertti Keinonen
       [not found]   ` <20030514223956.GB27927@swordfish>
  2003-05-14 23:13   ` Matt Gushee
  0 siblings, 2 replies; 6+ messages in thread
From: Ville-Pertti Keinonen @ 2003-05-14 22:13 UTC (permalink / raw)
  To: Matt Gushee; +Cc: caml-list

> these figures seem way beyond the pale. There must be a bug either in 
> my
> code or one of the supporting libraries. The only obvious possibilities
> that occur to me are an output buffering problem or some delay in
> interacting with the X display, but I'm not sure how to solve either of
> those. And maybe it's something else entirely. Can anyone explain my
> terrible results?

My guess (without profiling) is that most of the time is being spent 
doing regular expression matching.

Regular expressions can be much slower than you might expect for 
non-trivial cases.  The expression in your program looks particularly 
nasty, since there are a lot of ambiguous cases (whether '-' should 
match '.*' or '-' depends on everything that follows).

You might want to try using Str.split instead.

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Caml-list] Performance problem
       [not found]   ` <20030514223956.GB27927@swordfish>
@ 2003-05-14 22:41     ` Matt Gushee
  2003-05-15  7:57       ` Xavier Leroy
  0 siblings, 1 reply; 6+ messages in thread
From: Matt Gushee @ 2003-05-14 22:41 UTC (permalink / raw)
  To: caml-list

On Thu, May 15, 2003 at 01:13:48AM +0300, Ville-Pertti Keinonen wrote:

> >that occur to me are an output buffering problem or some delay in
> >interacting with the X display, but I'm not sure how to solve either of
> >those. And maybe it's something else entirely. Can anyone explain my
> >terrible results?
> 
> My guess (without profiling) is that most of the time is being spent 
> doing regular expression matching.
> 
> Regular expressions can be much slower than you might expect for 
> non-trivial cases.  The expression in your program looks particularly 
> nasty, since there are a lot of ambiguous cases (whether '-' should 
> match '.*' or '-' depends on everything that follows).
> 
> You might want to try using Str.split instead.

Good point. I don't know why I didn't think of that in the first place.

On Thu, May 15, 2003 at 12:18:51AM +0200, Olivier Andrieu wrote:
> 
> Hi, I don't really know where your performance problem come from, but :
> 
> 1) you regexp string is not properly escaped, all the "\(" and "\)"
>    should be "\\(" and "\\)".

Okay, I've run into that with other languages. But I didn't know you had
to do that in OCaml. Is that documented anywhere?

> (I guess you're not using ocaml 3.06
>    because it prints a lot of warnings concerning these).

Oh, I've seen those warnings more than once, and was thinking about
asking on the list, but it didn't seem urgent.

By the way, why doesn't the compiler just reject regexes with single
backslashes? What is the point of issuing warnings and then accepting
the incorrect syntax?

> 2) there are no problems using bytecode :
>      0.3s using bytecode
>      23s  using native

How strange. I actually didn't try compiling to bytecode. I had tried
using the function in the interactive toplevel, and finding it very
slow, went straight to native code in the hope that it would perform 
reasonably well. Any idea why bytecode would be faster?

> 3) you should use "[^-]*" instead of ".*" : this makes life easier for
>    the regexp matching and thus runs faster.
>      0.16s bytecode
>      0.03s native

Ah, another good point. I think I'll go with Str.split, but I'll keep
this in mind for future reference.

Thank you both for the suggestions.

-- 
Matt Gushee                 When a nation follows the Way,
Englewood, Colorado, USA    Horses bear manure through
mgushee@havenrock.com           its fields;
http://www.havenrock.com/   When a nation ignores the Way,
                            Horses bear soldiers through
                                its streets.
                                
                            --Lao Tzu (Peter Merel, trans.)

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Caml-list] Performance problem
  2003-05-14 22:13 ` Ville-Pertti Keinonen
       [not found]   ` <20030514223956.GB27927@swordfish>
@ 2003-05-14 23:13   ` Matt Gushee
  1 sibling, 0 replies; 6+ messages in thread
From: Matt Gushee @ 2003-05-14 23:13 UTC (permalink / raw)
  To: caml-list

On Thu, May 15, 2003 at 01:13:48AM +0300, Ville-Pertti Keinonen wrote:
> 
> Regular expressions can be much slower than you might expect for 
> non-trivial cases.  The expression in your program looks particularly 
> nasty, since there are a lot of ambiguous cases (whether '-' should 
> match '.*' or '-' depends on everything that follows).
> 
> You might want to try using Str.split instead.

Yes, that works well. Considering the number of times I've seen Python
gurus admonishing newbies to use simple string functions instead of
regexes whenever possible (and have probably admonished a few myself),
you'd think I would have thought of that. But all's well that ends well,
I guess.

-- 
Matt Gushee                 When a nation follows the Way,
Englewood, Colorado, USA    Horses bear manure through
mgushee@havenrock.com           its fields;
http://www.havenrock.com/   When a nation ignores the Way,
                            Horses bear soldiers through
                                its streets.
                                
                            --Lao Tzu (Peter Merel, trans.)

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Caml-list] Performance problem
  2003-05-14 22:41     ` Matt Gushee
@ 2003-05-15  7:57       ` Xavier Leroy
  2003-05-15 10:53         ` Michal Moskal
  0 siblings, 1 reply; 6+ messages in thread
From: Xavier Leroy @ 2003-05-15  7:57 UTC (permalink / raw)
  To: caml-list

> > >that occur to me are an output buffering problem or some delay in
> > >interacting with the X display, but I'm not sure how to solve either of
> > >those. And maybe it's something else entirely. Can anyone explain my
> > >terrible results?
> > 
> > My guess (without profiling) is that most of the time is being spent 
> > doing regular expression matching.

Profiling (ocamlopt -p) confirms this guess: 99.9% of the time is
spent in the regexp matching engine...

> > Regular expressions can be much slower than you might expect for 
> > non-trivial cases.  The expression in your program looks particularly 
> > nasty, since there are a lot of ambiguous cases (whether '-' should 
> > match '.*' or '-' depends on everything that follows).

Indeed.  This kind of regexp is among the worst cases for a greedy
backtracking regexp matcher like that of Str.  As someone else
suggested, making the regexp deterministic (replace .*- with [^-]*-)
helps immensely.

> > 1) you regexp string is not properly escaped, all the "\(" and "\)"
> >    should be "\\(" and "\\)".
> 
> Okay, I've run into that with other languages. But I didn't know you had
> to do that in OCaml. Is that documented anywhere?

In the syntax for strings, yes: a literal \ should be written \\,
hence the \( regexp construct should be written \\( in a string literal.
Perhaps this should be recalled in the docs for Str.regexp.  At any rate,
the compiler will nicely warn you.

> By the way, why doesn't the compiler just reject regexes with single
> backslashes? What is the point of issuing warnings and then accepting
> the incorrect syntax?

Backward compatibility, mostly.  But this is the kind of warning that
might become an error at some point in the future.  

> > 2) there are no problems using bytecode :
> >      0.3s using bytecode
> >      23s  using native

This I doubt very much, and indeed a quick test shows that bytecode
and native take exactly the same time.  This isn't surprising:
since 99% of the time is spent is the regexp engine, and this engine
is in C, it runs at the same speed in bytecode and in native-code.

- Xavier Leroy

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Caml-list] Performance problem
  2003-05-15  7:57       ` Xavier Leroy
@ 2003-05-15 10:53         ` Michal Moskal
  0 siblings, 0 replies; 6+ messages in thread
From: Michal Moskal @ 2003-05-15 10:53 UTC (permalink / raw)
  To: Xavier Leroy; +Cc: caml-list

On Thu, May 15, 2003 at 09:57:51AM +0200, Xavier Leroy wrote:
> > > 2) there are no problems using bytecode :
> > >      0.3s using bytecode
> > >      23s  using native
> 
> This I doubt very much, and indeed a quick test shows that bytecode
> and native take exactly the same time.  This isn't surprising:
> since 99% of the time is spent is the regexp engine, and this engine
> is in C, it runs at the same speed in bytecode and in native-code.

I have no idea *why* but here (Linux, x86), using ocamlc and ocamlopt
3.06, I get exactly the same timings (i.e. 0.44s for bytecode and 21.91s
for native code). No special options (like -p or -g) were used for
compilation.

Does the bytecode and nativecode use *exectly* the same C engine (i.e.
maybe one is compiled with different options (like -fpic -O2 or
someting) ?)

-- 
: Michal Moskal :: http://www.kernel.pl/~malekith : GCS {C,UL}++++$ a? !tv
: PLD Linux ::::::::: Wroclaw University, CS Dept : {E-,w}-- {b++,e}>+++ h

-------------------
To unsubscribe, mail caml-list-request@inria.fr Archives: http://caml.inria.fr
Bug reports: http://caml.inria.fr/bin/caml-bugs FAQ: http://caml.inria.fr/FAQ/
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2003-05-15 10:54 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-14 21:03 [Caml-list] Performance problem Matt Gushee
2003-05-14 22:13 ` Ville-Pertti Keinonen
     [not found]   ` <20030514223956.GB27927@swordfish>
2003-05-14 22:41     ` Matt Gushee
2003-05-15  7:57       ` Xavier Leroy
2003-05-15 10:53         ` Michal Moskal
2003-05-14 23:13   ` Matt Gushee

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).