caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: Francois Pottier <Francois.Pottier@inria.fr>
To: caml-list@inria.fr
Subject: [Caml-list] Some sugar for regexp matching using camlp4
Date: Mon, 16 Jul 2001 17:54:06 +0200	[thread overview]
Message-ID: <20010716175406.A25317@pauillac.inria.fr> (raw)

[-- Attachment #1: Type: text/plain, Size: 1613 bytes --]


Hello all,

I have experimented a bit with custom syntax for regular expression
matching. My goal was to implement some high-level constructs on top
of a low-level regexp library such as PCRE. The result of my (modest)
experiment is attached. It is a camlp4 grammar extension, which allows
writing

  extract x, y, ... matching e against r in e'

The semantics is as follows. The expression e is evaluated, yielding
a string which is matched against the regular expression r. r must be
either a constant string, or a compiled regular expression; if the
former, pre-compilation code is inserted transparently. The variables
x, y, ... are then bound to the appropriate groups (i.e. x is
bound to the sub-string which matched the whole pattern, y is bound
to the sub-string which matched the first group, etc.) and can be
referred to within e'. Wildcards _ can be used instead of variables.

This is of course pretty modest, but it seems that, with a small
number of such constructs, O'Caml could be turned into a rather nice
textual manipulation language. (Something often requested on this
list.) Opinions and further suggestions are welcome.

-- 
François Pottier
Francois.Pottier@inria.fr
http://pauillac.inria.fr/~fpottier/

Here's how to use the syntax extension:

1. Compile it:

  ocamlc -pp "camlp4o -I `camlp4o -where`" -I `camlp4o -where` -c pcreg.ml

2. At the beginning of your source files, insert

  #load "pcreg.cmo";;

3. Compile your source files using the following option:

  -pp "camlp4o -I ."

   (in addition to any options necessary to include the PCRE library,
    e.g. -I +contrib).


[-- Attachment #2: pcreg.ml --]
[-- Type: text/plain, Size: 5147 bytes --]

(* $Header: /net/pauillac/caml/repository/bigbro/pcreg.ml,v 1.1 2001/07/16 15:04:04 fpottier Exp $ *)

open Pcaml

#load "pa_extend.cmo";;
#load "q_MLast.cmo";;

(* ----------------------------------------------------------------------------------------------------------------- *)
(* We begin with an internal utility: a global variable generator, which can be called within grammar rules.

   The global variables receive names numbered in a linear fashion. There is a possibility of name clashes
   if another module, which uses the same name generator, is ``opened'' and that module does not have a
   [.mli] file. It is recommended to always use [.mli] files to describe module interfaces, so these
   internal variable names will not be exported. *)

(* Accumulator for the top-level declarations requested while the parser is running. It starts out empty;
   [declare] pushes new entries onto its front. *)

let globals = ref []

(* [declare item] records [item] as a new global declaration by pushing it onto the front of [globals].
   It may be called from within a grammar rule. The item is paired with a dummy location. *)

let declare (item : MLast.str_item) =
  let dummy_loc = (0, 0) in
  globals := (item, dummy_loc) :: !globals

(* [generate ()] returns a fresh identifier of the form [_regexp_N]. The private counter starts at 0 and is
   bumped before each use, so successive calls yield [_regexp_1], [_regexp_2], and so on. *)

let generate =
  let counter = ref 0 in
  fun () ->
    counter := !counter + 1;
    "_regexp_" ^ string_of_int !counter

(* This hook, which is called once per implementation file, adds the global declarations generated by calls
   to [declare] at the beginning of the module. *)

let _ = EXTEND
  implem: FIRST
    [[ (sil, stopped) = NEXT ->
       (* Snapshot the declarations queued by [declare] and empty the queue, so that each of them is emitted
          only once. Since [declare] prepends, [extra] lists the most recently declared item first. *)
       let extra = !globals in
       globals := [];
       (* The queued declarations are placed ahead of the items [sil] returned by [NEXT]; the [stopped] flag
          is passed through unchanged. *)
       (extra @ sil, stopped)
     ]];
END

(* ----------------------------------------------------------------------------------------------------------------- *)
(* This auxiliary function allows generating code for assertions.

   [assert] is dealt with as a kind of special-purpose syntax extension in O'Caml. However, code in quotations must
   be expressed in plain (righteous) syntax, which means that it cannot use [assert] directly. Hence, we must use
   this code (taken from [camlp4]'s [pa_o.ml]) to generate assertions.

   Note that the generated code depends on the value of [camlp4]'s [-noassert] option. This option is distinct
   from [ocaml]'s own [-noassert] option. *)

(* [make_assert loc e] builds the expression standing for [assert e]. [loc] is a pair of integers, which are
   embedded in the [Assert_failure] exception together with the name of the input file.
   NOTE(review): the quotations below presumably pick up the variable [loc] implicitly, so the parameter
   keeps that exact name -- confirm against q_MLast before renaming it. *)
let make_assert loc e =
  if !Pcaml.no_assert then
    (* Assertions are disabled on the camlp4 side: the check reduces to unit. *)
    <:expr< () >>
  else
    let (start_pos, end_pos) = loc in
    let file = <:expr< $str:!Pcaml.input_file$ >> in
    let first = <:expr< $int:string_of_int start_pos$ >> in
    let last = <:expr< $int:string_of_int end_pos$ >> in
    let failure = <:expr< raise (Assert_failure ($file$, $first$, $last$)) >> in
    <:expr< if $e$ then () else $failure$ >>

(* ----------------------------------------------------------------------------------------------------------------- *)
(* We continue with syntactic extensions which allow dealing with regular expressions easily.

   The syntax

     extract s0, s1, ..., sk matching e against r in e'

   evaluates the expression [e], matches its value against the regular expression [r] using [Pcre.exec], and binds the
   substrings thus obtained to the patterns [s0], [s1], ..., [sk]. (Each [si] must be either a variable or the
   wildcard pattern [_].) [Pcre.exec] raises [Not_found] if it doesn't match. The code also contains a dynamic check
   (using [assert]) which ensures that the number of extracted substrings, namely $k+1$, is consistent with the
   supplied regular expression. Lastly, the expression [r] must be either a string constant, or a compiled regular
   expression. If the former, the string is pre-compiled (using a global declaration) into a regular expression. *)

let _ = EXTEND
  GLOBAL: expr;
  expr: LEVEL "expr1"
    [[ (p, e, r, l) = [ "extract"; p = LIST1 simplepat SEP ","; "matching"; e = expr; "against"; r = expr ->
                        (p, e, r, loc) ]; (* anonymous sub-rule allows extracting partial location [l] *)
       "in"; body = expr LEVEL "top" ->

	 (* At this point [p] is the list of patterns ([Some x] or [None]), [e] is the subject expression,
	    [r] is the regular expression, [l] is the location captured by the sub-rule above, and [body]
	    is the expression following the in keyword. *)

	 (* If the regular expression is a string constant, generate pre-compilation code for it. *)

	 (* In that case [r] is replaced by a reference to a freshly named global, whose definition
	    (a call to [Pcre.regexp] on the constant) is queued through [declare]. Any other expression
	    is used as is. *)

	 let r = match r with
	 | <:expr< $str:s$ >> ->
	     let name = generate() in
	     declare <:str_item< value $lid:name$ = Pcre.regexp $str:s$ >>;
	     <:expr< $lid:name$ >>
	 | _ ->
	     r in

	 (* Wrap bindings for the substrings around the declaration's body. *)

	 (* [index] starts at 0 and advances for every pattern, wildcards included, so the i-th pattern
	    always maps to substring number i. A wildcard ([None]) adds no binding. Each new binding is
	    wrapped around the ones built so far. *)

	 let body, _ = List.fold_left (fun (body, index) name ->
	   begin
	     match name with
	     | Some name ->
		 <:expr<
	           let $lid:name$ = Pcre.get_substring _substrings $int:(string_of_int index)$ in
		   $body$
	         >>
	     | None ->
		 body
	   end, index + 1
	 ) (body, 0) p in

	 (* Wrap a dynamic check around the code thus obtained, to ensure that the number of substrings
	    extracted out of the pattern is correct. *)

	 (* The expected count is the number of patterns in [p]. The check is built by [make_assert],
	    using the partial location [l]. *)

	 let condition = <:expr< Pcre.num_of_subs _substrings = $int:(string_of_int (List.length p))$ >> in
	 let assertion = make_assert l condition in

	 let body = <:expr< 
	   do {
	     $assertion$;
	     $body$
	   }
	 >> in

	 (* Wrap the actual pattern matching instruction around the code thus obtained. *)

	 (* [_substrings] is the fixed name under which the generated code holds the result of
	    [Pcre.exec]; the bindings and the check built above refer to it. *)

	 <:expr<
	   let _substrings = Pcre.exec ~rex:$r$ $e$ in
	   $body$
	 >>

    ]]
  ;
  (* A pattern is either a lowercase identifier, yielding [Some x], or the wildcard, yielding [None]. *)
  simplepat:
    [[ x = LIDENT -> Some x
     | "_"        -> None ]]
  ;
END


             reply	other threads:[~2001-07-16 15:54 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2001-07-16 15:54 Francois Pottier [this message]
2001-07-16 17:37 ` Alexander V. Voinov
2001-07-17  2:36   ` Brian Rogoff
2001-07-17 10:36 ` Markus Mottl
2001-07-17 12:15   ` Francois Pottier
2001-07-17 12:39     ` Markus Mottl
2001-07-17 12:44       ` Daniel de Rauglaudre
2001-07-17 12:52         ` Markus Mottl
2001-07-17 11:45 ` Michel Schinz
2001-07-17 12:18   ` Francois Pottier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20010716175406.A25317@pauillac.inria.fr \
    --to=francois.pottier@inria.fr \
    --cc=caml-list@inria.fr \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).