From: Kristaps Dzonsons <kristaps@bsd.lv>
To: tech@mdocml.bsd.lv
Subject: Unifying the escape-sequence parser.
Date: Thu, 07 Apr 2011 16:00:54 +0200 [thread overview]
Message-ID: <4D9DC396.9010504@bsd.lv> (raw)
[-- Attachment #1: Type: text/plain, Size: 719 bytes --]
Hi,
Enclosed are my efforts to unify the escaped-sequence functions in out.c
(a2roffdeco()) and mandoc.c (mandoc_special()).
This handles, as far as I can see, all syntaxes of the groff(7) escapes.
When called during libmandoc validation, it will check for GLYPH_ERROR
and be followed by a search-and-replace of ASCII_HYPH for `-' in the
substring. When invoked from term.c or html.c, it will switch on the
returned type and substring value.
This will clear up a nice big chunk of code, but it's a pretty delicate
area, so please look it over!
If you compile this file, you can test escapes by running, e.g.,
% ./a.out s+\'\(\\f\[asdf\]\)\'123
to see the values of "start" and "end".
Thanks,
Kristaps
[-- Attachment #2: glyph.c --]
[-- Type: text/x-csrc, Size: 6971 bytes --]
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* Non-printing stand-in character that is accepted wherever `-' is a sign. */
#define	ASCII_HYPH	30

/*
 * Classification of a parsed escape sequence.  The numeric values
 * index the "glyphs" name table below, so the two must stay in step.
 */
enum	mandoc_gly {
	GLYPH_ERROR   = 0,	/* malformed escape */
	GLYPH_IGNORE  = 1,	/* glyph to be ignored */
	GLYPH_SPECIAL = 2,	/* a regular special character */
	GLYPH_PREDEF  = 3,	/* a predefined special character */
	GLYPH_FONT    = 4,	/* a font mode */
	GLYPH_FONTFAM = 5,	/* a font family */
	GLYPH__MAX    = 6	/* number of classifications */
};

enum mandoc_gly	  mandoc_glyph(const char **end,
			const char **start, int *sz);
static	int	  mandoc_glyphexp(const char *start);

/* Printable names of the classifications, indexed by enum mandoc_gly. */
static	const char * const glyphs[GLYPH__MAX] = {
	"error",		/* GLYPH_ERROR */
	"ignore",		/* GLYPH_IGNORE */
	"special",		/* GLYPH_SPECIAL */
	"predefined",		/* GLYPH_PREDEF */
	"font",			/* GLYPH_FONT */
	"font family"		/* GLYPH_FONTFAM */
};
/*
 * Skip over the leading, possibly nested, part of a numerical
 * expression.  The context of this function is important: it is only
 * invoked for escapes that are closed by a terminating character
 * (e.g., \s[xxxyyy]), so only an initial nested escape or an initial
 * parenthesised group has to be stepped over; the contents themselves
 * are of no interest.
 *
 * Returns the number of characters to skip (zero when the expression
 * begins with neither `\' nor `('), or -1 if the input is malformed,
 * in which case the caller should bail.
 */
static int
mandoc_glyphexp(const char *start)
{
	const char	*p;

	/* The expression opens with a nested escape: skip just that. */
	if (*start == '\\') {
		p = start + 1;
		if (mandoc_glyph(&p, NULL, NULL) == GLYPH_ERROR)
			return -1;
		return (int)(p - start);
	}

	/* Nothing to step over unless a parenthesised group follows. */
	if (*start != '(')
		return 0;

	/*
	 * Walk to the closing parenthesis.  Any escape met on the way
	 * is parsed in full so that its own delimiters cannot be
	 * mistaken for ours.
	 */
	p = start + 1;
	for (;;) {
		p += strcspn(p, ")\\");
		if (*p == ')')
			break;
		if (*p == '\0')
			return -1;

		/* Sitting on a backslash: hand the rest to the parser. */
		p++;
		if (mandoc_glyph(&p, NULL, NULL) == GLYPH_ERROR)
			return -1;
	}

	/* Count the terminating ')' as well. */
	return (int)(p - start) + 1;
}
/*
 * Handle an escaped sequence.  This should be called with the string
 * immediately following a `\'.  Pass a pointer to that string as
 * "end"; on success it is advanced to the first character after the
 * parsed escape sequence.  On GLYPH_ERROR the string is bogus and
 * should be thrown away; "end" is left untouched in that case.
 *
 * On success "start" is set to the first relevant character of the
 * escape's payload (font name, glyph name, whatever) and "sz" to the
 * payload length.  For escapes taking a numerical expression the
 * length is measured from the end of any leading nested expression,
 * so "start" and "sz" are not meaningful together there.  On failure
 * "start" may have been modified and "sz" is not written.
 *
 * Both "start" and "sz" may be NULL.
 */
enum mandoc_gly
mandoc_glyph(const char **end, const char **start, int *sz)
{
	char		 c, term, numeric;
	int		 i, lim, ssz;
	const char	*cp, *ep;
	enum mandoc_gly	 gly;

	cp = *end;
	if (start)
		*start = cp;

	i = 0;
	lim = 0;	/* Fixed payload length; used when term is unset. */
	gly = GLYPH_ERROR;
	term = '\0';	/* Payload terminator; used when non-nil. */
	numeric = 0;	/* Non-zero if payload is a numerical expression. */

	switch ((c = cp[i++])) {
	/*
	 * First the glyphs.  There are several different forms of
	 * these, but each eventually returns a substring of the glyph
	 * name.
	 */
	case ('('):
		gly = GLYPH_SPECIAL;
		lim = 2;
		break;
	case ('['):
		gly = GLYPH_SPECIAL;
		term = ']';
		break;
	case ('C'):
		/*
		 * Step over the opening quote, as the other quoted
		 * forms do; otherwise the search for the terminator
		 * below would stop on the opening quote itself.
		 */
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_SPECIAL;
		term = '\'';
		break;
	/*
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
	 * 'X' is the trigger.  These have opaque sub-strings.
	 * The classification is fixed by the first label matched; the
	 * later assignments are skipped once "gly" has been set.
	 */
	case ('g'):
		/* FALLTHROUGH */
	case ('k'):
		/* FALLTHROUGH */
	case ('M'):
		/* FALLTHROUGH */
	case ('m'):
		/* FALLTHROUGH */
	case ('n'):
		/* FALLTHROUGH */
	case ('V'):
		/* FALLTHROUGH */
	case ('Y'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_IGNORE;
		/* FALLTHROUGH */
	case ('*'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_PREDEF;
		/* FALLTHROUGH */
	case ('F'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONTFAM;
		/* FALLTHROUGH */
	case ('f'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONT;
		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = ']';
			break;
		default:
			/* Single-character name: un-read it. */
			lim = 1;
			i--;
			break;
		}
		break;
	/*
	 * These escapes are of the form \X'Y', where 'X' is the trigger
	 * and 'Y' is any string.  These have opaque sub-strings.
	 */
	case ('A'):
		/* FALLTHROUGH */
	case ('b'):
		/* FALLTHROUGH */
	case ('D'):
		/* FALLTHROUGH */
	case ('o'):
		/* FALLTHROUGH */
	case ('R'):
		/* FALLTHROUGH */
	case ('X'):
		/* FALLTHROUGH */
	case ('Z'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = '\'';
		break;
	/*
	 * These escapes are of the form \X'N', where 'X' is the trigger
	 * and 'N' resolves to a numerical expression.
	 */
	case ('B'):
		/* FALLTHROUGH */
	case ('h'):
		/* FALLTHROUGH */
	case ('H'):
		/* FALLTHROUGH */
	case ('L'):
		/* FALLTHROUGH */
	case ('l'):
		/* FALLTHROUGH */
	case ('N'):
		/* FALLTHROUGH */
	case ('S'):
		/* FALLTHROUGH */
	case ('v'):
		/* FALLTHROUGH */
	case ('w'):
		/* FALLTHROUGH */
	case ('x'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = numeric = '\'';
		break;
	/*
	 * Sizes get a special category of their own.
	 */
	case ('s'):
		gly = GLYPH_IGNORE;

		/* A sign may precede the opening delimiter... */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = numeric = ']';
			break;
		case ('\''):
			term = numeric = '\'';
			break;
		default:
			/* Single-character size: un-read it. */
			lim = 1;
			i--;
			break;
		}

		/* ...or follow it. */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;
		break;
	/*
	 * Anything else is assumed to be a glyph.
	 */
	default:
		gly = GLYPH_SPECIAL;
		lim = 1;
		i--;
		break;
	}

	assert(GLYPH_ERROR != gly);

	if (start)
		*start = &cp[i];

	/*
	 * If the payload is a numerical expression, it may open with a
	 * nested escape or parenthesised group whose own delimiters
	 * would confuse the search for our terminator: step over it.
	 * This, by the way, means that the "start" and "sz" values
	 * will be effectively meaningless.
	 */
	ssz = 0;
	if (numeric && -1 == (ssz = mandoc_glyphexp(&cp[i])))
		return(GLYPH_ERROR);
	i += ssz;

	/*
	 * We have a character terminator.  Try to read up to that
	 * character.  If we can't (i.e., we hit the nil), then return
	 * an error without disturbing "end"; if we can, calculate our
	 * length, read past the terminating character, and exit.
	 */
	if ('\0' != term) {
		ep = strchr(&cp[i], term);
		if (NULL == ep)
			return(GLYPH_ERROR);
		if (sz)
			*sz = (int)(ep - &cp[i]);
		*end = ep + 1;
		return(gly);
	}

	assert(lim > 0);

	/*
	 * We have a numeric limit.  If the string is shorter than that,
	 * stop and return an error.  Else adjust our endpoint, length,
	 * and return the current glyph.
	 */
	if ((size_t)lim > strlen(&cp[i]))
		return(GLYPH_ERROR);

	if (sz)
		*sz = lim;
	*end = &cp[i] + lim;
	return(gly);
}
/*
 * Test driver: parse the single command-line argument as the text
 * following a `\' and print the resulting classification together
 * with the "end", "start" and "sz" values.  Exits with EXIT_FAILURE
 * unless exactly one argument is given.
 */
int
main(int argc, char *argv[])
{
	const char	*v, *start;
	enum mandoc_gly	 gly;
	int		 sz;

	if (2 != argc)
		return(EXIT_FAILURE);

	v = argv[1];
	printf("input: %s\n", v);

	/*
	 * Give the out-parameters defined values up front: a failed
	 * parse does not write "sz", and printing an indeterminate int
	 * is undefined.
	 */
	start = v;
	sz = 0;

	gly = mandoc_glyph(&v, &start, &sz);

	/* Never hand a null pointer to %s, in case "end" came back unset. */
	if (NULL == v)
		v = "";

	printf("glyph=%s, end=%s, start=%s, sz=%d\n", glyphs[gly], v, start, sz);
	return(EXIT_SUCCESS);
}
next reply other threads:[~2011-04-07 14:01 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-04-07 14:00 Kristaps Dzonsons [this message]
2011-04-08 11:19 ` Kristaps Dzonsons
2011-04-08 12:16 ` Kristaps Dzonsons
2011-04-08 12:50 ` Kristaps Dzonsons
2011-04-08 13:15 ` Kristaps Dzonsons
2011-04-08 13:56 ` Finished: unifying " Kristaps Dzonsons
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4D9DC396.9010504@bsd.lv \
--to=kristaps@bsd.lv \
--cc=tech@mdocml.bsd.lv \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).