caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
* camlp4 and lexers
@ 2008-05-15 15:00 Pietro Abate
  2008-05-16 15:24 ` [Caml-list] " Pietro Abate
  0 siblings, 1 reply; 2+ messages in thread
From: Pietro Abate @ 2008-05-15 15:00 UTC (permalink / raw)
  To: caml-list

Hi all,
This question was asked a few weeks ago, and again last week.  However I
still don't really get how to proceed. I hope we can cook down a small
example to understand a bit more the camlp4 internals.

Say I want to write a small parser for regexp (or an aritmetic
calculator), but I don't want to extend the ocaml grammar to do that. I
just want to create a minimal lexer and a minimal grammar to parse
expressions like (aaa*|b?);c

The parser part is easy (below). The part I don't understand is how to
create a lexer. I had a look at the ocsigen xmlcaml lexer and the camlp4
lexer, but I still haven't found a minimal example I can use without
getting confused. 

In particular, the problem below is that I want my lexer to give me back
CHAR tokens (different from the CHAR of char * string of camlp4) and not
strings. I could do the same with the camlp4 lexer, but all my regexp
should be then written as ('a''a''a' *) etc ... that it's not good
looking.

A while ago I did something similar with the old camlp4 [1] using
plexer, but this is not possible anymore...

Nicolas a while ago suggested to copy the Camlp4.PreCast module and the 
lexer module and customize them. I think it should be possible just
to use Struct.Grammar.Static.Make with a new lexer instead... but, as I
said, I'm not able to write a very minimal lexer for this example...
Maybe I'm confused about this.

I think a minimal example will help more then one person here.

thanks :)
p


-------------------------- This is my parser...

module RegExGram =  Struct.Grammar.Static.Make(RegExpLexer)

let regex = RegExGram.Entry.mk "regex"

EXTEND RegExGram
  GLOBAL: regex;

  regex: [[ e1 = SELF ; "|" ; e2 = concat -> Alt(e1,e2)
          | e1 = seq -> e1 ]
  ];

  concat:[[ e1 = SELF ; ";"; e2 = seq -> Seq(e1,e2)
          | e1 = SELF ; e2 = seq -> Seq(e1,e2)
          | e1 = seq -> e1 ]
  ];

  seq:   [[ e1 = simple ; "?" -> Opt e1
          | e1 = simple ; "*" -> Star e1
          | e1 = simple ; "+" -> Plus e1
          | e1 = simple -> e1 ]
  ];

  simple:[[ "." -> Dot
          | "("; e1 = regex; ")" -> e1
          | `CHAR(s) -> Sym s ]
  ];

END

----------------------

[1] http://groups.google.com/group/fa.caml/browse_thread/thread/e26569427cc8879d


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [Caml-list] camlp4 and lexers
  2008-05-15 15:00 camlp4 and lexers Pietro Abate
@ 2008-05-16 15:24 ` Pietro Abate
  0 siblings, 0 replies; 2+ messages in thread
From: Pietro Abate @ 2008-05-16 15:24 UTC (permalink / raw)
  To: caml-list

[-- Attachment #1: Type: text/plain, Size: 2607 bytes --]

Hi again.

I have a minimal (?) lexer (attached) working with the grammar below.
For the purpose of this excercise I used ulex. I started with the cduce
lexer and removed all cduce-specific functions. However I'm not enterely
happy.

First I'd like to have another example using ocamllex and not ulex (one
less dependecy), but I guess this is not too hard to do.

Second, I've copy-pasted some code in the lexer to instanciate the
camlp4 modules, but I'm not sure what is required and what is not. I
mean, I can look at the camlp4 modules sigs, but without documentation
there are a lot of functions that I don't really understand. Can anybody
explain the signature of the Loc, Token and Error modules ? 
How these function used within the camlp4 parsing machinery ?
- Token.match_keyword
- Token.extract_string
- Token.Filter.mk
- Token.Filter.filter
- Token.Filter.define_filter
- Token.Filter.keyword_added
- Token.Filter.keyword_removed

Third, I'm not sure if this is the real minimal example I was looking
for. I've the impression I could reuse the Camlp4.PreCast.Loc module,
but I'm not sure if I can reuse the Camlp4.PreCast.Token since it is
linked with the token type definition. I don't think I can reuse/extend 
the caml_token type... Making the lexer extensible would be a great !

Hope this helps.

comments ?

pietro


This is the _tags file to compile it:
---------- _tags -------
"parser.ml": use_camlp4, pp(camlp4of)
"ulexer.ml": pkg_ulex, use_camlp4, syntax_camlp4o
"ulexer.mli": use_camlp4, pkg_ulex
-----------

+ nicolas' universal myocamlbuil.ml

-------------------- parser.ml -----------------------

type t = 
      Seq of t * t
    | Alt of t * t
    | Opt of t 
    | Star of t 
    | Plus of t 
    | Dot 
    | Sym of char

open Ulexer

module RegExGram = Camlp4.Struct.Grammar.Static.Make(Ulexer)

let regex = RegExGram.Entry.mk "regex"

(* I guess I don't need to use KWD *) 

EXTEND RegExGram
  GLOBAL: regex;

  regex: [[ e1 = SELF ; `KWD "|" ; e2 = concat -> Alt(e1,e2)
          | e1 = concat -> e1 ]
  ];

  concat:[[ e1 = SELF ; `KWD ";"; e2 = seq -> Seq(e1,e2)
          | e1 = SELF ; e2 = seq -> Seq(e1,e2)
          | e1 = seq -> e1 ]
  ];

  seq:   [[ e1 = simple ; `KWD "?" -> Opt e1
          | e1 = simple ; `KWD "*" -> Star e1
          | e1 = simple ; `KWD "+" -> Plus e1
          | e1 = simple -> e1 ]
  ];

  simple:[[ `KWD "." -> Dot
          | `KWD "("; e1 = regex; `KWD ")" -> e1
          | `CHAR(s) -> Sym s ]
  ];


END

let from_string s = RegExGram.parse_string regex (Loc.mk "<string>") s

------------------------------------------------------



[-- Attachment #2: ulexer.ml --]
[-- Type: text/plain, Size: 4658 bytes --]

open Camlp4.PreCast

module Loc = struct
  type t = int * int

  let mk _ = (0,0)
  let ghost = (-1,-1)

  let of_lexing_position _ = assert false
  let to_ocaml_location _ = assert false
  let of_ocaml_location _ = assert false
  let of_lexbuf _ = assert false
  let of_tuple _ = assert false
  let to_tuple _ = assert false

  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  let join (x1, _) = (x1, x1)
  let move _ _ _ = assert false
  let shift _ _ = assert false
  let move_line _ _ = assert false
  let file_name  _ = assert false
  let start_line _ = assert false
  let stop_line  _ = assert false
  let start_bol  _ = assert false
  let stop_bol   _ = assert false
  let start_off  = fst
  let stop_off   = snd
  let start_pos  _ = assert false
  let stop_pos   _ = assert false
  let is_ghost   _ = assert false
  let ghostify   _ = assert false
  let set_file_name _ = assert false
  let strictly_before _ = assert false
  let make_absolute _ = assert false
  let print _ = assert false
  let dump  _ = assert false
  let to_string _ = assert false
  exception Exc_located of t * exn
  let raise loc exn =
    match exn with 
    | Exc_located _ -> raise exn
    | _ -> raise (Exc_located (loc, exn))
  let name = ref "_loc"
end

type token =
  | KWD of string
  | CHAR of char
  | EOI

module Token = struct
  open Format
  module Loc = Loc
  type t = token
  type token = t

  let sf = Printf.sprintf

  let to_string =
    function
    | CHAR s -> sf "CHAR %c" s
    | KWD s  -> sf "KWD \"%s\"" s
    | EOI    -> sf "EOI"

  let print ppf x = pp_print_string ppf (to_string x)

  let match_keyword kwd =
    function
    | KWD kwd' when kwd = kwd' -> true
    | _ -> false

  let extract_string =
    function
    | KWD s -> s
    | CHAR c -> String.make 1 c 
    | tok ->
        invalid_arg ("Cannot extract a string from this token: "^
                     to_string tok)

  module Error = struct
    type t = string
    exception E of string
    let print = pp_print_string
    let to_string x = x
  end

  module Filter = struct
    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter

    type t =
      { is_kwd : string -> bool;
        mutable filter : token_filter }

    let mk is_kwd =
      { is_kwd = is_kwd;
        filter = (fun s -> s) }

    let filter x =
      let f tok loc =
        let tok' = tok in
        (tok', loc)
      in
      let rec filter =
        parser
        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
        | [< >] -> [< >]
      in
      fun strm -> x.filter (filter strm)

    let define_filter x f = x.filter <- f x.filter

    let keyword_added _ _ _ = ()
    let keyword_removed _ _ = ()
  end

end
module Error = Camlp4.Struct.EmptyError

module L = Ulexing

exception Error of int * int * string

let error i j s = raise (Error (i,j,s))

(***********************************************************)
(* Buffer for string literals *)
  
let string_buff = Buffer.create 1024

let store_lexeme lexbuf = 
  Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code  = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
  let s = Buffer.contents string_buff in
  clear_buff ();
  Buffer.clear string_buff;
  s

(***********************************************************)
(* Lexer *)

let illegal lexbuf =
  error
    (L.lexeme_start lexbuf)
    (L.lexeme_end lexbuf) 
    "Illegal character"

let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))

let rec token = lexer 
 | [' ' '\t'] -> token lexbuf
 | ['*' '?' '+' '(' ')' ';' '|' ] ->
         let k = KWD (L.latin1_lexeme lexbuf) in return lexbuf k
 | ['A'-'Z' 'a'-'z'] -> 
         let c = CHAR (L.latin1_lexeme_char lexbuf 0) in return lexbuf c
 | eof -> return lexbuf EOI
 | _ -> illegal lexbuf


(***********************************************************)

let enc = ref L.Latin1
let lexbuf = ref None
let last_tok = ref (KWD "DUMMY")

let raise_clean e =
  clear_buff ();
  raise e

let mk () _loc cs =
  let lb = L.from_var_enc_stream enc cs in
  lexbuf := Some lb;
  let next _ =
    let tok, loc = 
      try token lb
      with
        | Ulexing.Error -> 
            raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
                          "Unexpected character"))
        | Ulexing.InvalidCodepoint i ->
            raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
                          "Code point invalid for the current encoding"))
        | e -> raise_clean e
    in
    last_tok := tok;
    Some (tok, loc)
  in
  Stream.from next

[-- Attachment #3: ulexer.mli --]
[-- Type: text/plain, Size: 312 bytes --]

open Camlp4.Sig

type token =
  | KWD of string
  | CHAR of char
  | EOI

exception Error of int * int * string

module Loc   : Loc with type t = int * int
module Token : Token with module Loc = Loc and type t = token
module Error : Error

val mk : unit -> (Loc.t -> char Stream.t -> (Token.t * Loc.t) Stream.t)

[-- Attachment #4: myocamlbuild.ml --]
[-- Type: text/plain, Size: 2045 bytes --]

open Ocamlbuild_plugin
open Command (* no longer needed for OCaml >= 3.10.2 *)

(* these functions are not really officially exported *)
let run_and_read = Ocamlbuild_pack.My_unix.run_and_read
let blank_sep_strings = Ocamlbuild_pack.Lexers.blank_sep_strings

(* this lists all supported packages *)
let find_packages () =
  blank_sep_strings &
    Lexing.from_string &
      run_and_read "ocamlfind list | cut -d' ' -f1"

(* this is supposed to list available syntaxes, but I don't know how to do it. *)
let find_syntaxes () = ["camlp4o"; "camlp4r"]

(* ocamlfind command *)
let ocamlfind x = S[A"ocamlfind"; x]

let _ = dispatch begin function
   | Before_options ->

       (* override default commands by ocamlfind ones *)
       Options.ocamlc   := ocamlfind & A"ocamlc";
       Options.ocamlopt := ocamlfind & A"ocamlopt";
       Options.ocamldep := ocamlfind & A"ocamldep";
       Options.ocamldoc := ocamlfind & A"ocamldoc"

   | After_rules ->

       (* When one link an OCaml library/binary/package, one should use -linkpkg *)
       flag ["ocaml"; "link"] & A"-linkpkg";

       (* For each ocamlfind package one inject the -package option when
        * compiling, computing dependencies, generating documentation and
        * linking. *)
       List.iter begin fun pkg ->
         flag ["ocaml"; "compile";  "pkg_"^pkg] & S[A"-package"; A pkg];
         flag ["ocaml"; "ocamldep"; "pkg_"^pkg] & S[A"-package"; A pkg];
         flag ["ocaml"; "doc";      "pkg_"^pkg] & S[A"-package"; A pkg];
         flag ["ocaml"; "link";     "pkg_"^pkg] & S[A"-package"; A pkg];
       end (find_packages ());

       (* Like -package but for extensions syntax. Morover -syntax is useless
        * when linking. *)
       List.iter begin fun syntax ->
         flag ["ocaml"; "compile";  "syntax_"^syntax] & S[A"-syntax"; A syntax];
         flag ["ocaml"; "ocamldep"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
         flag ["ocaml"; "doc";      "syntax_"^syntax] & S[A"-syntax"; A syntax];
       end (find_syntaxes ());

   | _ -> ()
end 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2008-05-16 15:25 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-05-15 15:00 camlp4 and lexers Pietro Abate
2008-05-16 15:24 ` [Caml-list] " Pietro Abate

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).