caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: Martin Jambon <martin.jambon@ens-lyon.org>
To: Andrej Bauer <andrej.bauer@andrej.com>
Cc: caml-list@inria.fr
Subject: Re: [Caml-list] ocamllex and python-style indentation
Date: Fri, 12 Jun 2009 14:56:21 +0200	[thread overview]
Message-ID: <4A325075.7040909@ens-lyon.org> (raw)
In-Reply-To: <7d8707de0906120120x10cc8fe0p54adbd189003f3da@mail.gmail.com>

Andrej Bauer wrote:
> Thanks to Andreas, I'll have a look at the "old" code.
> 
> I think I understand the general idea of inserting "virtual" tokens,
> but the details confuse me still. So starting with
> 
>> if True:
>>     x = 3
>>     y = (2 +
>>       4 + 5)
>> else:
>>     x = 5
>>     if False:
>>         x = 8
>>         z = 2
> 
> Martin suggests the following:
> 
>> {
>> if True:
>> ;
>>    {
>>    x = 3
>>    ;
>>    y = (2 +
>>    ;
>>      {
>>      4 + 5)
>>      }
>>    }
>> ;
>> else:
>> ;
>>    {
>>    x = 5
>>    ;
>>    if False:
>>    ;
>>        {
>>        x = 8
>>        ;
>>        z = 2
>>        }
>>    }
>> }
> 
> I have two questions. Notice that the { ... } and ( ... ) need not be
> correctly nested (in the top half), so how are we going to deal with
> this? The second question is, why are there the separators after and
> just before "else:". I would expect separators inside { .... }, but
> not around "else".

Original example:

if True:
    x = 3
    y = (2 +
      4 + 5)
else:
    x = 5
    if False:
        x = 8
        z = 2


For pure indentation concerns, it is equivalent to:

x
  x
  x
    x
x
  x
  x
    x
    x


Which is parsed into:

[
  Line;
  Block
    [
      Line;
      Line;
      Block
       [
         Line
       ]
    ];
  Line;
  Block
    [
      Line;
      Line;
      Block
       [
         Line;
         Line
       ]
    ]
]


I wrote the following code, which does the job.  You might want to use
ocamllex instead in order to better manage newline characters (CRLF...) and
line number directives, and to allow input from something other than a file
or in_channel.


Note that the following must be rejected:

x
    x
  x (indentation here could be only 0, 4 or more)


But this is accepted:

x
    x
x
  x


You could also enforce that the indentation of a block must be the current
indentation + k, for example k=2 for the whole input.



(******************* indent_parser.ml **********************)

(* One non-blank input line, as produced by [parse_lines]: the position of
   its first non-space character, paired with [(indent, text)] where
   [indent] is the number of leading space characters and [text] is the
   line contents starting at that first non-space character. *)
type indent_line = Lexing.position * (int * string)

(* Result of grouping lines by indentation, as built by [block_body].
   [`Line (pos, text)] is a single line.  [`Block (pos, children)] is a run
   of more deeply indented lines; [pos] is the position of the first line
   of that run and [children] are its elements in source order. *)
type indent_tree =
    [ `Line of (Lexing.position * string)
    | `Block of (Lexing.position * indent_tree list) ]


(* [split s] separates the leading run of space characters from the rest
   of [s].  Returns [Some (k, rest)] where [k] is the index of the first
   character that is not a space and [rest] is the suffix of [s] starting
   there; returns [None] when [s] is empty or made only of spaces.
   Only ' ' counts as indentation: a tab is treated as content. *)
let split s =
  let total = String.length s in
  let rec first_non_space k =
    if k >= total then None
    else if s.[k] = ' ' then first_non_space (k + 1)
    else Some (k, String.sub s k (total - k))
  in
  first_non_space 0

(* [parse_lines fname ic] reads [ic] line by line until end of file and
   returns, in input order, one [indent_line] per line that contains
   something other than spaces.  Lines for which [split] returns [None]
   are skipped but still counted, so [pos_lnum] is the 1-based physical
   line number.  [pos_bol] is the channel offset at the start of the line
   (as reported by [pos_in]) and [pos_cnum] is the offset of the first
   non-space character.  [fname] is only recorded in the positions. *)
let parse_lines fname ic : indent_line list =
  let rec read_from lnum acc =
    (* The offset must be sampled before the line is consumed. *)
    let bol = pos_in ic in
    let next = try Some (input_line ic) with End_of_file -> None in
    match next with
        None -> List.rev acc
      | Some text ->
          let lnum = lnum + 1 in
          let acc =
            match split text with
                None -> acc
              | Some ((indent, _) as content) ->
                  let pos = {
                    Lexing.pos_fname = fname;
                    pos_lnum = lnum;
                    pos_bol = bol;
                    pos_cnum = bol + indent;
                  } in
                  (pos, content) :: acc
          in
          read_from lnum acc
  in
  read_from 0 []

(* [parse_lines_from_file fname] opens [fname], runs [parse_lines] on it
   and closes the channel.  If reading or closing raises, the channel is
   closed with [close_in_noerr] and the same exception is re-raised. *)
let parse_lines_from_file fname =
  let ic = open_in fname in
  let outcome =
    try
      let parsed = parse_lines fname ic in
      close_in ic;
      `Parsed parsed
    with exn -> `Failed exn
  in
  match outcome with
      `Parsed parsed -> parsed
    | `Failed exn ->
        close_in_noerr ic;
        raise exn

(* [error pos msg] raises [Failure] with [msg] prefixed by a compiler-style
   location header.  The reported character range runs from column 0 to the
   column of [pos] (i.e. [pos_cnum - pos_bol]).  Never returns. *)
let error pos msg =
  Printf.ksprintf failwith "File %S, line %i, characters %i-%i:\n%s"
    pos.Lexing.pos_fname
    pos.Lexing.pos_lnum
    0
    (pos.Lexing.pos_cnum - pos.Lexing.pos_bol)
    msg

(* [block_body cur_indent sub_indent cur_block l] parses the elements of
   one block whose lines are indented by exactly [cur_indent] spaces.

   - [cur_block] accumulates the elements parsed so far, most recent first.
   - [l] is the remaining input.
   - [sub_indent] is [Some k] only when the element just parsed was a
     nested block indented by [k]; it is [None] otherwise.

   Returns the block's elements in source order together with the input
   that was not consumed, which starts at the first line indented less
   than [cur_indent] (or is empty).

   A deeper line opens a nested block at that line's indentation.  Each
   nested block may choose its own indentation, so

     x
         x
     x
       x

   is accepted.  A dedent must land on the indentation of an enclosing
   block; landing strictly between this level and the level of the nested
   block just closed, as in

     x
         x
       x

   raises [Failure] via [error] with "Inconsistent indentation". *)
let rec block_body cur_indent sub_indent cur_block l :
    indent_tree list * indent_line list =
  match l with
      [] -> (List.rev cur_block, [])
    | (pos, (n, s)) :: tl ->
        if n = cur_indent then
          (* A line at this block's own level separates nested blocks, so
             the next nested block is not tied to the previous one's
             indentation: forget it. *)
          block_body cur_indent None (`Line (pos, s) :: cur_block) tl
        else if n > cur_indent then (
          (* [sub_indent] is still set only if the previous nested block
             was closed by this very line, i.e. by a partial dedent. *)
          (match sub_indent with
               None -> ()
             | Some n' ->
                 if n <> n' then
                   error pos "Inconsistent indentation"
          );
          let sub_block, remaining =
            block_body n None [ `Line (pos, s) ] tl in
          block_body
            cur_indent (Some n) (`Block (pos, sub_block) :: cur_block)
            remaining
        )
        else
          (* Dedent below this block: hand the line back to the caller. *)
          (List.rev cur_block, l)


(* [parse_indentation fname] reads the file [fname] and returns its lines
   grouped into an indentation tree, with indentation 0 as the top level.
   Exceptions from reading the file or from [block_body] propagate. *)
let parse_indentation fname =
  let tree, leftover =
    block_body 0 None [] (parse_lines_from_file fname) in
  (* Nothing can be left over at the top level: every line has
     indentation >= 0, so [block_body 0] never hands a line back. *)
  assert (leftover = []);
  tree


(* [test ()] writes a small Python-like sample to a temporary file, parses
   it with [parse_indentation] and returns the resulting tree.

   The temporary file is removed on every path.  If parsing raises
   [Failure msg], [msg] is printed on stderr before the exception is
   re-raised; any other exception is re-raised as is. *)
let test () =
  let fname = Filename.temp_file "test" ".ind" in
  (* Best-effort removal for the error paths: a failure to delete the file
     must not replace the exception that is being propagated. *)
  let discard_tmp () = try Sys.remove fname with Sys_error _ -> () in
  let oc =
    try open_out fname
    with e ->
      discard_tmp ();
      raise e
  in
  (try
     output_string oc "
if True:
    x = 3
    y = (2 +
      4 + 5)
else:
    x = 5
    if False:
        x = 8
        z = 2
";
     close_out oc
   with e ->
     close_out_noerr oc;
     discard_tmp ();
     raise e);

  let result =
    try parse_indentation fname
    with e ->
      (match e with
           Failure msg -> Printf.eprintf "%s\n%!" msg
         | _ -> ());
      discard_tmp ();
      raise e
  in
  Sys.remove fname;
  result


(*****************************************************************)






> Presumably the intermediate stage that I would preprocess the token
> stream would have to know about indentation levels. I have not tried
> this, but ocaml lexer will correctly match things like
> 
> | '\n' [' ' '\t']* -> { INDENTATION (compute_indentation (lexeme buf)) }
> 
> Yes?

Kind of.  Don't discard the rest of the line...
If you have a choice, reject tabs.
Beware of CRLF newlines (\r\n) and missing \n before the end of file.
Also ocamllex does not keep track of newlines automatically.  See the
documentation for Lexing.lexbuf.



Martin

-- 
http://mjambon.com/


  reply	other threads:[~2009-06-12 13:01 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-06-11 12:57 Andrej Bauer
2009-06-11 13:12 ` [Caml-list] " yoann padioleau
2009-06-11 13:21 ` Andreas Rossberg
2009-06-11 13:44 ` Martin Jambon
2009-06-12  8:20   ` Andrej Bauer
2009-06-12 12:56     ` Martin Jambon [this message]
2009-06-12 13:34     ` Martin Jambon
2009-06-12 15:43     ` Andreas Rossberg
2009-06-30 18:58       ` Yitzhak Mandelbaum
2009-06-30 20:19         ` Mike Lin
2009-06-30 22:06         ` Andreas Rossberg
2009-07-01  2:13           ` Mike Lin
2009-07-01  7:31             ` Andreas Rossberg
2009-07-01 14:02               ` Mike Lin
2009-07-01 14:17                 ` Andreas Rossberg
2009-07-01 14:21                   ` Andreas Rossberg
2009-07-01 14:37                     ` Mike Lin
2009-07-01 15:03                   ` Sylvain Le Gall
2009-07-01 15:16                     ` [Caml-list] " Andreas Rossberg
2009-07-01 16:26                       ` Sylvain Le Gall
2009-07-01 15:19                     ` [Caml-list] " Martin Jambon
2009-07-01 15:43                       ` Andreas Rossberg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4A325075.7040909@ens-lyon.org \
    --to=martin.jambon@ens-lyon.org \
    --cc=andrej.bauer@andrej.com \
    --cc=caml-list@inria.fr \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).