From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from smtp-1.sys.kth.se (smtp-1.sys.kth.se [130.237.32.175]) by krisdoz.my.domain (8.14.3/8.14.3) with ESMTP id p37E13sI021081 for ; Thu, 7 Apr 2011 10:01:04 -0400 (EDT) Received: from mailscan-1.sys.kth.se (mailscan-1.sys.kth.se [130.237.32.91]) by smtp-1.sys.kth.se (Postfix) with ESMTP id 94D9E1562D4 for ; Thu, 7 Apr 2011 16:00:57 +0200 (CEST) X-Virus-Scanned: by amavisd-new at kth.se Received: from smtp-1.sys.kth.se ([130.237.32.175]) by mailscan-1.sys.kth.se (mailscan-1.sys.kth.se [130.237.32.91]) (amavisd-new, port 10024) with LMTP id TiF3cpo2FEjO for ; Thu, 7 Apr 2011 16:00:55 +0200 (CEST) X-KTH-Auth: kristaps [193.10.49.5] X-KTH-mail-from: kristaps@bsd.lv X-KTH-rcpt-to: tech@mdocml.bsd.lv Received: from [172.16.18.84] (unknown [193.10.49.5]) by smtp-1.sys.kth.se (Postfix) with ESMTP id DDCFC1558C4 for ; Thu, 7 Apr 2011 16:00:55 +0200 (CEST) Message-ID: <4D9DC396.9010504@bsd.lv> Date: Thu, 07 Apr 2011 16:00:54 +0200 From: Kristaps Dzonsons User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.16) Gecko/20110303 Icedove/3.0.11 X-Mailinglist: mdocml-tech Reply-To: tech@mdocml.bsd.lv MIME-Version: 1.0 To: tech@mdocml.bsd.lv Subject: Unifying the escape-sequence parser. Content-Type: multipart/mixed; boundary="------------060800060003060802060404" This is a multi-part message in MIME format. --------------060800060003060802060404 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Hi, Enclosed are my efforts to unify the escaped-sequence functions in out.c (a2roffdeco()) and mandoc.c (mandoc_special()). This handles, as far as I can see, all syntaxes of the groff(7) escapes. When called during libmandoc validation, it will check for GLYPH_ERROR and be followed by a search-and-replace of ASCII_HYPH for `-' in the substring. When invoked from term.c or html.c, it will switch on the returned type and substring value. This will clear up a nice big chunk of code, but it's a pretty delicate area, so please look it over! If you compile this file, you can test escapes by running, e.g., % ./a.out s+\'\(\\f\[asdf\]\)\'123 to see the values of "start" and "end". Thanks, Kristaps --------------060800060003060802060404 Content-Type: text/x-csrc; name="glyph.c" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="glyph.c" #include #include #include #include #include #define ASCII_HYPH 30 enum mandoc_gly { GLYPH_ERROR = 0, GLYPH_IGNORE, /* glyph to be ignored */ GLYPH_SPECIAL, /* a regular special character */ GLYPH_PREDEF, /* a predefined special character */ GLYPH_FONT, /* a font mode */ GLYPH_FONTFAM, /* a font family */ GLYPH__MAX }; enum mandoc_gly mandoc_glyph(const char **, const char **, int *); static int mandoc_glyphexp(const char *); static const char * const glyphs[GLYPH__MAX] = { "error", "ignore", "special", "predefined", "font", "font family" }; /* * Pass over recursive numerical expressions. This context of this * function is important: it's only called within character-terminating * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial * recursion: we don't care about what's in these blocks. * This returns the number of characters skipped or -1 if an error * occurs (the caller should bail). */ static int mandoc_glyphexp(const char *start) { int i; size_t sz; const char *cp; i = 0; /* The expression consists of a subexpression. */ if ('\\' == start[i]) { cp = &start[++i]; /* * Read past the end of the subexpression. * Bail immediately on errors. */ if (GLYPH_ERROR == mandoc_glyph(&cp, NULL, NULL)) return(-1); return(i + cp - &start[i]); } if ('(' != start[i++]) return(0); /* * A parenthesised subexpression. Read until the closing * parenthesis, making sure to handle any nested subexpressions * that might ruin our parse. */ while (')' != start[i]) { sz = strcspn(&start[i], ")\\"); i += (int)sz; if ('\0' == start[i]) return(-1); else if ('\\' != start[i]) continue; cp = &start[++i]; if (GLYPH_ERROR == mandoc_glyph(&cp, NULL, NULL)) return(-1); i += cp - &start[i]; } /* Read past the terminating ')'. */ return(++i); } /* * Handle an escaped sequeence. This should be called with any * string subsequent a `\'. Pass a pointer to this substring as "end"; * it will be set to the supremum of the parsed escape sequence. If * this returns GLYPH_ERROR, the string is bogus and should be thrown * away. If not GLYPH_ERROR or GLYPH_IGNORE, "start" is set to the * first relevant character of the substring (font, glyph, whatever) of * length sz. Both "start" and "sz" may be NULL. */ enum mandoc_gly mandoc_glyph(const char **end, const char **start, int *sz) { char c, term, numeric; int i, lim, ssz; const char *cp; enum mandoc_gly gly; cp = *end; if (start) *start = cp; i = 0; gly = GLYPH_ERROR; term = '\0'; numeric = 0; switch ((c = cp[i++])) { /* * First the glyphs. There are several different forms of * these, but each eventually returns a substring of the glyph * name. */ case ('('): gly = GLYPH_SPECIAL; lim = 2; break; case ('['): gly = GLYPH_SPECIAL; term = ']'; break; case ('C'): if ('\'' != cp[i]) return(GLYPH_ERROR); gly = GLYPH_SPECIAL; term = '\''; break; /* * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where * 'X' is the trigger. These have opaque sub-strings. */ case ('g'): /* FALLTHROUGH */ case ('k'): /* FALLTHROUGH */ case ('M'): /* FALLTHROUGH */ case ('m'): /* FALLTHROUGH */ case ('n'): /* FALLTHROUGH */ case ('V'): /* FALLTHROUGH */ case ('Y'): if (GLYPH_ERROR == gly) gly = GLYPH_IGNORE; /* FALLTHROUGH */ case ('*'): if (GLYPH_ERROR == gly) gly = GLYPH_PREDEF; /* FALLTHROUGH */ case ('F'): if (GLYPH_ERROR == gly) gly = GLYPH_FONTFAM; /* FALLTHROUGH */ case ('f'): if (GLYPH_ERROR == gly) gly = GLYPH_FONT; if (start) *start = &cp[i]; switch (cp[i++]) { case ('('): lim = 2; break; case ('['): term = ']'; break; default: lim = 1; i--; break; } break; /* * These escapes are of the form \X'Y', where 'X' is the trigger * and 'Y' is any string. These have opaque sub-strings. */ case ('A'): /* FALLTHROUGH */ case ('b'): /* FALLTHROUGH */ case ('D'): /* FALLTHROUGH */ case ('o'): /* FALLTHROUGH */ case ('R'): /* FALLTHROUGH */ case ('X'): /* FALLTHROUGH */ case ('Z'): if ('\'' != cp[i++]) return(GLYPH_ERROR); gly = GLYPH_IGNORE; term = '\''; break; /* * These escapes are of the form \X'N', where 'X' is the trigger * and 'N' resolves to a numerical expression. */ case ('B'): /* FALLTHROUGH */ case ('h'): /* FALLTHROUGH */ case ('H'): /* FALLTHROUGH */ case ('L'): /* FALLTHROUGH */ case ('l'): /* FALLTHROUGH */ case ('N'): /* FALLTHROUGH */ case ('S'): /* FALLTHROUGH */ case ('v'): /* FALLTHROUGH */ case ('w'): /* FALLTHROUGH */ case ('x'): if ('\'' != cp[i++]) return(GLYPH_ERROR); gly = GLYPH_IGNORE; term = numeric = '\''; break; /* * Sizes get a special category of their own. */ case ('s'): gly = GLYPH_IGNORE; if (start) *start = &cp[i]; /* See +/- counts as a sign. */ c = cp[i]; if ('+' == c || '-' == c || ASCII_HYPH == c) ++i; switch (cp[i++]) { case ('('): lim = 2; break; case ('['): term = numeric = ']'; break; case ('\''): term = numeric = '\''; break; default: lim = 1; i--; break; } /* See +/- counts as a sign. */ c = cp[i]; if ('+' == c || '-' == c || ASCII_HYPH == c) ++i; break; /* * Anything else is assumed to be a glyph. */ default: gly = GLYPH_SPECIAL; lim = 1; i--; break; } assert(GLYPH_ERROR != gly); if (start) *start = &cp[i]; /* * If a terminating block has been specified, we need to * handle the case of recursion, which could have their * own terminating blocks that mess up our parse. This, by the * way, means that the "start" and "size" values will be * effectively meaningless. */ ssz = 0; if (numeric && -1 == (ssz = mandoc_glyphexp(&cp[i]))) return(GLYPH_ERROR); i += ssz; /* * We have a character terminator. Try to read up to that * character. If we can't (i.e., we hit the nil), then return * an error; if we can, calculate our length, read past the * terminating character, and exit. */ if ('\0' != term) { *end = strchr(&cp[i], term); if ('\0' == *end) return(GLYPH_ERROR); if (sz) *sz = *end - &cp[i]; (*end)++; return(gly); } assert(lim > 0); /* * We have a numeric limit. If the string is shorter than that, * stop and return an error. Else adjust our endpoint, length, * and return the current glyph. */ if ((size_t)lim > strlen(&cp[i])) return(GLYPH_ERROR); if (sz) *sz = lim; *end = &cp[i] + lim; return(gly); } int main(int argc, char *argv[]) { const char *v, *start; enum mandoc_gly gly; int sz; if (2 != argc) return(EXIT_FAILURE); v = argv[1]; printf("input: %s\n", v); gly = mandoc_glyph(&v, &start, &sz); printf("glyph=%s, end=%s, start=%s, sz=%d\n", glyphs[gly], v, start, sz); return(EXIT_SUCCESS); } --------------060800060003060802060404-- -- To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv