Unifying the escape-sequence parser.

tech@mandoc.bsd.lv
 help / color / mirror / Atom feed

From: Kristaps Dzonsons <kristaps@bsd.lv>
To: tech@mdocml.bsd.lv
Subject: Unifying the escape-sequence parser.
Date: Thu, 07 Apr 2011 16:00:54 +0200	[thread overview]
Message-ID: <4D9DC396.9010504@bsd.lv> (raw)

[-- Attachment #1: Type: text/plain, Size: 719 bytes --]

Hi,

Enclosed are my efforts to unify the escaped-sequence functions in out.c 
(a2roffdeco()) and mandoc.c (mandoc_special()).

This handles, as far as I can see, all syntaxes of the groff(7) escapes.

When called during libmandoc validation, the caller will check for 
GLYPH_ERROR and then search-and-replace ASCII_HYPH with `-' in the 
substring.  When invoked from term.c or html.c, the caller will switch 
on the returned type and substring value.

This will clear up a nice big chunk of code, but it's a pretty delicate 
area, so please look it over!

If you compile this file, you can test escapes by running, e.g.,

   % ./a.out s+\'\(\\f\[asdf\]\)\'123

to see the values of "start" and "end".

Thanks,

Kristaps

[-- Attachment #2: glyph.c --]
[-- Type: text/x-csrc, Size: 6971 bytes --]

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 * Byte value that mandoc_glyph() accepts as a sign character, alongside
 * '+' and '-', when it parses a \s size escape.
 * NOTE(review): presumably the in-band stand-in for `-' used by the
 * rest of libmandoc -- confirm against the library headers.
 */
#define	ASCII_HYPH	30

/*
 * Classification of an escape sequence as reported by mandoc_glyph().
 * GLYPH_ERROR must stay zero and GLYPH__MAX must stay last: the latter
 * is the number of real kinds and sizes the glyphs[] name table.
 */
enum mandoc_gly {
	/* The sequence is malformed and could not be parsed. */
	GLYPH_ERROR = 0,
	/* A glyph to be ignored. */
	GLYPH_IGNORE,
	/* A regular special character. */
	GLYPH_SPECIAL,
	/* A predefined special character. */
	GLYPH_PREDEF,
	/* A font mode. */
	GLYPH_FONT,
	/* A font family. */
	GLYPH_FONTFAM,
	/* Count of the kinds above; not itself a kind. */
	GLYPH__MAX
};

/*
 * Parse the escape sequence that begins at *end (the text right after
 * a backslash); "start" and "sz" are optional outputs.
 */
enum mandoc_gly	mandoc_glyph(const char **end,
			const char **start, int *sz);

/*
 * Skip a leading nested sub-expression inside a delimited numeric
 * escape; yields the number of bytes skipped, or -1 on error.
 */
static int	mandoc_glyphexp(const char *start);

static const char * const glyphs[GLYPH__MAX] = {
	"error", 
	"ignore", 
	"special", 
	"predefined", 
	"font", 
	"font family"
};

/*
 * Step over a nested expression at the head of a numeric argument.
 * The context matters: mandoc_glyph() only calls this from within a
 * character-terminated escape (e.g., \s[xxxyyy]), so only a *leading*
 * nested construct needs handling and its contents are of no interest.
 *
 * Three shapes are recognised at "start":
 *   - a backslash: one nested escape, parsed by mandoc_glyph();
 *   - an opening parenthesis: everything up to and including the first
 *     closing parenthesis, with nested escapes parsed along the way so
 *     that a ')' inside one of them is not mistaken for the end;
 *   - anything else: nothing is skipped.
 *
 * Returns the number of characters skipped (possibly zero), or -1 if a
 * nested escape is malformed or the closing parenthesis is missing, in
 * which case the caller should bail.
 */
static int
mandoc_glyphexp(const char *start)
{
	const char	*pos, *next;

	/* The expression consists of a single nested escape. */

	if ('\\' == *start) {
		next = start + 1;
		if (GLYPH_ERROR == mandoc_glyph(&next, NULL, NULL))
			return(-1);
		return((int)(next - start));
	}

	/* Not a parenthesised group either: nothing to skip. */

	if ('(' != *start)
		return(0);

	/*
	 * Parenthesised group.  Hop from one interesting character to
	 * the next: a nil means the group never closes, a backslash
	 * introduces a nested escape to be parsed, and a ')' ends it.
	 */

	pos = start + 1;
	for (;;) {
		pos += strcspn(pos, ")\\");

		if ('\0' == *pos)
			return(-1);
		if (')' == *pos)
			break;

		next = pos + 1;
		if (GLYPH_ERROR == mandoc_glyph(&next, NULL, NULL))
			return(-1);
		pos = next;
	}

	/* Count the closing ')' as skipped, too. */
	return((int)(pos - start) + 1);
}

/*
 * Handle an escaped sequence.  This should be called with the string
 * immediately following a `\'.
 *
 * end:   On entry, points at the first character after the backslash.
 *        On success it is advanced to the first character after the
 *        parsed sequence.  If GLYPH_ERROR is returned it is left
 *        untouched.
 * start: May be NULL.  Otherwise set to the first relevant character of
 *        the substring (font, glyph name, whatever).  It is written
 *        early, so it may have been modified even when GLYPH_ERROR is
 *        returned.
 * sz:    May be NULL.  Otherwise set, on success only, to the length of
 *        the substring.  For numeric escapes that begin with a nested
 *        expression, the length is counted from the end of that nested
 *        expression while "start" still points in front of it, so the
 *        pair is not meaningful for those (they are all GLYPH_IGNORE).
 *
 * Returns the kind of sequence parsed, or GLYPH_ERROR if the string is
 * bogus and should be thrown away.
 */
enum mandoc_gly
mandoc_glyph(const char **end, const char **start, int *sz)
{
	char		 c, term, numeric;
	int		 i, lim, ssz;
	const char	*cp, *tp;
	enum mandoc_gly	 gly;

	cp = *end;
	if (start)
		*start = cp;
	i = 0;
	lim = 0;	/* Set by every path that leaves "term" unset. */
	gly = GLYPH_ERROR;
	term = '\0';
	numeric = 0;

	switch ((c = cp[i++])) {
	/*
	 * First the glyphs.  There are several different forms of
	 * these, but each eventually returns a substring of the glyph
	 * name.
	 */
	case ('('):
		gly = GLYPH_SPECIAL;
		lim = 2;
		break;
	case ('['):
		gly = GLYPH_SPECIAL;
		term = ']';
		break;
	case ('C'):
		/*
		 * Consume the opening quote, as the other quoted forms
		 * do; otherwise the terminator search below would match
		 * it and yield an empty glyph name.
		 */
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_SPECIAL;
		term = '\'';
		break;

	/*
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
	 * 'X' is the trigger.  These have opaque sub-strings.
	 * The cases cascade: the first one entered fixes the kind and
	 * the rest are skipped because "gly" is no longer GLYPH_ERROR.
	 */
	case ('g'):
		/* FALLTHROUGH */
	case ('k'):
		/* FALLTHROUGH */
	case ('M'):
		/* FALLTHROUGH */
	case ('m'):
		/* FALLTHROUGH */
	case ('n'):
		/* FALLTHROUGH */
	case ('V'):
		/* FALLTHROUGH */
	case ('Y'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_IGNORE;
		/* FALLTHROUGH */
	case ('*'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_PREDEF;
		/* FALLTHROUGH */
	case ('F'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONTFAM;
		/* FALLTHROUGH */
	case ('f'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONT;

		if (start)
			*start = &cp[i];

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = ']';
			break;
		default:
			/* Single-character name: un-read it. */
			lim = 1;
			i--;
			break;
		}
		break;

	/*
	 * These escapes are of the form \X'Y', where 'X' is the trigger
	 * and 'Y' is any string.  These have opaque sub-strings.
	 */
	case ('A'):
		/* FALLTHROUGH */
	case ('b'):
		/* FALLTHROUGH */
	case ('D'):
		/* FALLTHROUGH */
	case ('o'):
		/* FALLTHROUGH */
	case ('R'):
		/* FALLTHROUGH */
	case ('X'):
		/* FALLTHROUGH */
	case ('Z'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = '\'';
		break;

	/*
	 * These escapes are of the form \X'N', where 'X' is the trigger
	 * and 'N' resolves to a numerical expression.
	 */
	case ('B'):
		/* FALLTHROUGH */
	case ('h'):
		/* FALLTHROUGH */
	case ('H'):
		/* FALLTHROUGH */
	case ('L'):
		/* FALLTHROUGH */
	case ('l'):
		/* FALLTHROUGH */
	case ('N'):
		/* FALLTHROUGH */
	case ('S'):
		/* FALLTHROUGH */
	case ('v'):
		/* FALLTHROUGH */
	case ('w'):
		/* FALLTHROUGH */
	case ('x'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = numeric = '\'';
		break;

	/*
	 * Sizes get a special category of their own.
	 */
	case ('s'):
		gly = GLYPH_IGNORE;

		if (start)
			*start = &cp[i];

		/* A sign may come before the delimiter... */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = numeric = ']';
			break;
		case ('\''):
			term = numeric = '\'';
			break;
		default:
			/* Single-digit size: un-read it. */
			lim = 1;
			i--;
			break;
		}

		/* ...or after it. */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		break;

	/*
	 * Anything else is assumed to be a glyph.
	 */
	default:
		gly = GLYPH_SPECIAL;
		lim = 1;
		i--;
		break;
	}

	assert(GLYPH_ERROR != gly);

	if (start)
		*start = &cp[i];

	/*
	 * If a terminating block has been specified, we need to
	 * handle the case of recursion, which could have their
	 * own terminating blocks that mess up our parse.  This, by the
	 * way, means that the "start" and "size" values will be
	 * effectively meaningless.
	 */

	ssz = 0;
	if (numeric && -1 == (ssz = mandoc_glyphexp(&cp[i])))
		return(GLYPH_ERROR);

	i += ssz;

	/*
	 * We have a character terminator.  Try to read up to that
	 * character.  If we can't (i.e., we hit the nil), then return
	 * an error; if we can, calculate our length, read past the
	 * terminating character, and exit.  The search result is kept
	 * in a local so that a failed search does not clobber the
	 * caller's pointer.
	 */

	if ('\0' != term) {
		tp = strchr(&cp[i], term);
		if (NULL == tp)
			return(GLYPH_ERROR);
		if (sz)
			*sz = (int)(tp - &cp[i]);
		*end = tp + 1;
		return(gly);
	}

	assert(lim > 0);

	/*
	 * We have a numeric limit.  If the string is shorter than that,
	 * stop and return an error.  Else adjust our endpoint, length,
	 * and return the current glyph.
	 */

	if ((size_t)lim > strlen(&cp[i]))
		return(GLYPH_ERROR);

	if (sz)
		*sz = lim;
	*end = &cp[i] + lim;
	return(gly);
}

int
main(int argc, char *argv[])
{
	const char	*v, *start;
	enum mandoc_gly	 gly;
	int		 sz;

	if (2 != argc)
		return(EXIT_FAILURE);

	v = argv[1];
	printf("input: %s\n", v);

	gly = mandoc_glyph(&v, &start, &sz);
	printf("glyph=%s, end=%s, start=%s, sz=%d\n", glyphs[gly], v, start, sz);

	return(EXIT_SUCCESS);
}

next             reply	other threads:[~2011-04-07 14:01 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-04-07 14:00 Kristaps Dzonsons [this message]
2011-04-08 11:19 ` Kristaps Dzonsons
2011-04-08 12:16   ` Kristaps Dzonsons
2011-04-08 12:50     ` Kristaps Dzonsons
2011-04-08 13:15       ` Kristaps Dzonsons
2011-04-08 13:56         ` Finished: unifying " Kristaps Dzonsons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D9DC396.9010504@bsd.lv \
    --to=kristaps@bsd.lv \
    --cc=tech@mdocml.bsd.lv \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).