/*
 * NOTE(review): the header names of the five original #include
 * directives were missing (only bare "#include" tokens remained).
 * The headers below are the ones this file needs in order to compile;
 * confirm the exact list against the upstream source.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Accepted as a sign character, alongside '+' and '-', in \s escapes. */
#define	ASCII_HYPH	30

/* Classification of an escape sequence, as returned by mandoc_glyph(). */
enum	mandoc_gly {
	GLYPH_ERROR = 0,	/* malformed escape: discard the string */
	GLYPH_IGNORE,		/* glyph to be ignored */
	GLYPH_SPECIAL,		/* a regular special character */
	GLYPH_PREDEF,		/* a predefined special character */
	GLYPH_FONT,		/* a font mode */
	GLYPH_FONTFAM,		/* a font family */
	GLYPH__MAX
};

enum mandoc_gly	  mandoc_glyph(const char **, const char **, int *);
static	int	  mandoc_glyphexp(const char *);

/* Printable names, indexed by enum mandoc_gly. */
static	const char * const glyphs[GLYPH__MAX] = {
	"error",
	"ignore",
	"special",
	"predefined",
	"font",
	"font family"
};

/*
 * Pass over recursive numerical expressions.  The context of this
 * function is important: it's only called within character-terminating
 * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
 * recursion: we don't care about what's in these blocks.
 *
 * "start" points at the first character of the expression.  Returns
 * the number of characters skipped (0 if the text begins with neither
 * an escape nor a parenthesis) or -1 if an error occurs, in which case
 * the caller should bail.
 */
static int
mandoc_glyphexp(const char *start)
{
	int		 i;
	size_t		 sz;
	const char	*cp;

	i = 0;

	/* The expression consists of a single escaped subexpression. */

	if ('\\' == start[i]) {
		cp = &start[++i];
		/*
		 * Read past the end of the subexpression.
		 * Bail immediately on errors.
		 */
		if (GLYPH_ERROR == mandoc_glyph(&cp, NULL, NULL))
			return(-1);
		/* The backslash plus whatever the escape consumed. */
		return(i + (int)(cp - &start[i]));
	}

	if ('(' != start[i++])
		return(0);

	/*
	 * A parenthesised subexpression.  Read until the closing
	 * parenthesis, making sure to handle any nested escapes
	 * that might ruin our parse.
	 */

	while (')' != start[i]) {
		sz = strcspn(&start[i], ")\\");
		i += (int)sz;

		if ('\0' == start[i])
			return(-1);
		else if ('\\' != start[i])
			continue;

		/* Step over the backslash, then over the escape itself. */
		cp = &start[++i];
		if (GLYPH_ERROR == mandoc_glyph(&cp, NULL, NULL))
			return(-1);
		i += (int)(cp - &start[i]);
	}

	/* Read past the terminating ')'. */
	return(++i);
}

/*
 * Handle an escaped sequence.  This should be called with the string
 * immediately following a `\'.
 *
 * end:   in/out.  On entry, points at the character after the
 *        backslash.  On success it is advanced just past the parsed
 *        escape sequence.  On GLYPH_ERROR it is left unchanged.
 * start: optional (may be NULL).  On success, set to the first
 *        relevant character of the escape's substring (font, glyph
 *        name, whatever).  On error it may have been set to an
 *        unspecified position within the input.
 * sz:    optional (may be NULL).  On success, set to the length of the
 *        substring, excluding any terminating character.  Not written
 *        on error.
 *
 * For numeric escapes a leading subexpression is skipped after "start"
 * has been recorded but before "sz" is measured, so the two values are
 * not meaningful together in that case.
 *
 * Returns the glyph class; if this is GLYPH_ERROR, the string is bogus
 * and should be thrown away.
 */
enum mandoc_gly
mandoc_glyph(const char **end, const char **start, int *sz)
{
	char		 c, term, numeric;
	int		 i, lim, ssz;
	const char	*cp, *ep;
	enum mandoc_gly	 gly;

	cp = *end;
	if (start)
		*start = cp;
	i = 0;
	lim = 0;	/* fixed length of the substring, if not terminated */
	gly = GLYPH_ERROR;
	term = '\0';	/* terminating character, if any */
	numeric = 0;	/* non-zero if a numeric expression may lead */

	switch ((c = cp[i++])) {
	/*
	 * First the glyphs.  There are several different forms of
	 * these, but each eventually returns a substring of the glyph
	 * name.
	 */
	case ('('):
		gly = GLYPH_SPECIAL;
		lim = 2;
		break;
	case ('['):
		gly = GLYPH_SPECIAL;
		term = ']';
		break;
	case ('C'):
		/*
		 * Step over the opening quote; otherwise the search for
		 * the terminator below would match it immediately and
		 * yield an empty glyph name.
		 */
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_SPECIAL;
		term = '\'';
		break;

	/*
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
	 * 'X' is the trigger.  These have opaque sub-strings.
	 * Each group claims "gly" only if no earlier group has.
	 */
	case ('g'):
		/* FALLTHROUGH */
	case ('k'):
		/* FALLTHROUGH */
	case ('M'):
		/* FALLTHROUGH */
	case ('m'):
		/* FALLTHROUGH */
	case ('n'):
		/* FALLTHROUGH */
	case ('V'):
		/* FALLTHROUGH */
	case ('Y'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_IGNORE;
		/* FALLTHROUGH */
	case ('*'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_PREDEF;
		/* FALLTHROUGH */
	case ('F'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONTFAM;
		/* FALLTHROUGH */
	case ('f'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONT;

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = ']';
			break;
		default:
			/* Single-character form: un-read it. */
			lim = 1;
			i--;
			break;
		}
		break;

	/*
	 * These escapes are of the form \X'Y', where 'X' is the trigger
	 * and 'Y' is any string.  These have opaque sub-strings.
	 */
	case ('A'):
		/* FALLTHROUGH */
	case ('b'):
		/* FALLTHROUGH */
	case ('D'):
		/* FALLTHROUGH */
	case ('o'):
		/* FALLTHROUGH */
	case ('R'):
		/* FALLTHROUGH */
	case ('X'):
		/* FALLTHROUGH */
	case ('Z'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = '\'';
		break;

	/*
	 * These escapes are of the form \X'N', where 'X' is the trigger
	 * and 'N' resolves to a numerical expression.
	 */
	case ('B'):
		/* FALLTHROUGH */
	case ('h'):
		/* FALLTHROUGH */
	case ('H'):
		/* FALLTHROUGH */
	case ('L'):
		/* FALLTHROUGH */
	case ('l'):
		/* FALLTHROUGH */
	case ('N'):
		/* FALLTHROUGH */
	case ('S'):
		/* FALLTHROUGH */
	case ('v'):
		/* FALLTHROUGH */
	case ('w'):
		/* FALLTHROUGH */
	case ('x'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = numeric = '\'';
		break;

	/*
	 * Sizes get a special category of their own.
	 */
	case ('s'):
		gly = GLYPH_IGNORE;

		/* A sign may come before the delimiter... */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = numeric = ']';
			break;
		case ('\''):
			term = numeric = '\'';
			break;
		default:
			/* Single-character form: un-read it. */
			lim = 1;
			i--;
			break;
		}

		/* ...or after it. */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		break;

	/*
	 * Anything else is assumed to be a glyph.
	 */
	default:
		gly = GLYPH_SPECIAL;
		lim = 1;
		i--;
		break;
	}

	assert(GLYPH_ERROR != gly);

	if (start)
		*start = &cp[i];

	/*
	 * If a terminating block has been specified, we need to
	 * handle the case of recursion, which could have its
	 * own terminating blocks that mess up our parse.  This, by the
	 * way, means that the "start" and "size" values will be
	 * effectively meaningless.
	 */

	ssz = 0;
	if (numeric && -1 == (ssz = mandoc_glyphexp(&cp[i])))
		return(GLYPH_ERROR);

	i += ssz;

	/*
	 * We have a character terminator.  Try to read up to that
	 * character.  If we can't (i.e., we hit the nil), then return
	 * an error without touching the caller's "end"; if we can,
	 * calculate our length, read past the terminating character,
	 * and exit.
	 */

	if ('\0' != term) {
		ep = strchr(&cp[i], term);
		if (NULL == ep)
			return(GLYPH_ERROR);
		if (sz)
			*sz = (int)(ep - &cp[i]);
		*end = ep + 1;
		return(gly);
	}

	assert(lim > 0);

	/*
	 * We have a numeric limit.  If the string is shorter than that,
	 * stop and return an error.  Else adjust our endpoint, length,
	 * and return the current glyph.
	 */

	if ((size_t)lim > strlen(&cp[i]))
		return(GLYPH_ERROR);

	if (sz)
		*sz = lim;

	*end = &cp[i] + lim;
	return(gly);
}

/*
 * Test driver: parse the single command-line argument as the text
 * following a backslash and print the classification, the remainder of
 * the input after the escape, the start of the substring, and its
 * length.  Exits with EXIT_FAILURE unless exactly one argument is given.
 */
int
main(int argc, char *argv[])
{
	const char	*v, *start;
	enum mandoc_gly	 gly;
	int		 sz;

	if (2 != argc)
		return(EXIT_FAILURE);

	v = argv[1];
	start = v;
	sz = 0;		/* mandoc_glyph() does not write "sz" on error */

	printf("input: %s\n", v);

	gly = mandoc_glyph(&v, &start, &sz);

	printf("glyph=%s, end=%s, start=%s, sz=%d\n",
	    glyphs[gly], v, start, sz);

	return(EXIT_SUCCESS);
}