From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from krisdoz.my.domain (kristaps@localhost [127.0.0.1]) by krisdoz.my.domain (8.14.3/8.14.3) with ESMTP id o6KEuhc8003923 for ; Tue, 20 Jul 2010 10:56:43 -0400 (EDT) Received: (from kristaps@localhost) by krisdoz.my.domain (8.14.3/8.14.3/Submit) id o6KEuge2006688; Tue, 20 Jul 2010 10:56:42 -0400 (EDT) Date: Tue, 20 Jul 2010 10:56:42 -0400 (EDT) Message-Id: <201007201456.o6KEuge2006688@krisdoz.my.domain> X-Mailinglist: mdocml-source Reply-To: source@mdocml.bsd.lv MIME-Version: 1.0 From: kristaps@mdocml.bsd.lv To: source@mdocml.bsd.lv Subject: mdocml: Strip non-graphable input characters from input. X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/ Content-Type: text/plain; charset=utf-8 Log Message: ----------- Strip non-graphable input characters from input. The manuals specifically say that this is not allowed, and were it allowed, output would be inconsistent across output media (-Tps will puke, non-your-charset terminals will puke, etc.). With this done, simplify check_text() to only check escapes and for tabs. Add in a new tab warning, too. Modified Files: -------------- mdocml: main.c man_validate.c mandoc.h mdoc_validate.c Revision Data ------------- Index: mdoc_validate.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v retrieving revision 1.111 retrieving revision 1.112 diff -Lmdoc_validate.c -Lmdoc_validate.c -u -p -r1.111 -r1.112 --- mdoc_validate.c +++ mdoc_validate.c @@ -453,26 +453,29 @@ check_argv(struct mdoc *m, struct mdoc_n static int -check_text(struct mdoc *mdoc, int line, int pos, char *p) +check_text(struct mdoc *m, int ln, int pos, char *p) { int c; - - /* - * FIXME: we absolutely cannot let \b get through or it will - * destroy some assumptions in terms of format. - */ + size_t sz; for ( ; *p; p++, pos++) { + sz = strcspn(p, "\t\\"); + p += (int)sz; + + if ('\0' == *p) + break; + + pos += (int)sz; + if ('\t' == *p) { - if ( ! (MDOC_LITERAL & mdoc->flags)) - if ( ! mdoc_pmsg(mdoc, line, pos, MANDOCERR_BADCHAR)) - return(0); - } else if ( ! isprint((u_char)*p) && ASCII_HYPH != *p) - if ( ! mdoc_pmsg(mdoc, line, pos, MANDOCERR_BADCHAR)) - return(0); + if (MDOC_LITERAL & m->flags) + continue; + if (mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB)) + continue; + return(0); + } - if ('\\' != *p) - continue; + /* Check the special character. */ c = mandoc_special(p); if (c) { @@ -481,8 +484,8 @@ check_text(struct mdoc *mdoc, int line, continue; } - c = mdoc_pmsg(mdoc, line, pos, MANDOCERR_BADESCAPE); - if ( ! (MDOC_IGN_ESCAPE & mdoc->pflags) && ! c) + c = mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE); + if ( ! (MDOC_IGN_ESCAPE & m->pflags) && ! c) return(c); } @@ -490,8 +493,6 @@ check_text(struct mdoc *mdoc, int line, } - - static int check_parent(PRE_ARGS, enum mdoct tok, enum mdoc_type t) { @@ -507,7 +508,6 @@ check_parent(PRE_ARGS, enum mdoct tok, e mdoc_macronames[tok]); return(0); } - static int Index: mandoc.h =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v retrieving revision 1.16 retrieving revision 1.17 diff -Lmandoc.h -Lmandoc.h -u -p -r1.16 -r1.17 --- mandoc.h +++ mandoc.h @@ -39,6 +39,7 @@ enum mandocerr { MANDOCERR_LISTFIRST, /* list type must come first */ MANDOCERR_BADSTANDARD, /* bad standard */ MANDOCERR_BADLIB, /* bad library */ + MANDOCERR_BADTAB, /* tab in non-literal context */ MANDOCERR_BADESCAPE, /* bad escape sequence */ MANDOCERR_BADQUOTE, /* unterminated quoted string */ MANDOCERR_NOWIDTHARG, /* argument requires the width argument */ Index: man_validate.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v retrieving revision 1.45 retrieving revision 1.46 diff -Lman_validate.c -Lman_validate.c -u -p -r1.45 -r1.46 --- man_validate.c +++ man_validate.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "mandoc.h" #include "libman.h" @@ -206,32 +207,37 @@ check_text(CHKARGS) { char *p; int pos, c; - - assert(n->string); + size_t sz; for (p = n->string, pos = n->pos + 1; *p; p++, pos++) { - if ('\\' == *p) { - c = mandoc_special(p); - if (c) { - p += c - 1; - pos += c - 1; - continue; - } + sz = strcspn(p, "\t\\"); + p += (int)sz; + + if ('\0' == *p) + break; - c = man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE); - if ( ! (MAN_IGN_ESCAPE & m->pflags) && ! c) - return(c); + pos += (int)sz; + + if ('\t' == *p) { + if (MAN_LITERAL & m->flags) + continue; + if (man_pmsg(m, n->line, pos, MANDOCERR_BADTAB)) + continue; + return(0); } - /* - * FIXME: we absolutely cannot let \b get through or it - * will destroy some assumptions in terms of format. - */ + /* Check the special character. */ - if ('\t' == *p || isprint((u_char)*p) || ASCII_HYPH == *p) + c = mandoc_special(p); + if (c) { + p += c - 1; + pos += c - 1; continue; - if ( ! man_pmsg(m, n->line, pos, MANDOCERR_BADCHAR)) - return(0); + } + + c = man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE); + if ( ! (MAN_IGN_ESCAPE & m->pflags) && ! c) + return(c); } return(1); Index: main.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/main.c,v retrieving revision 1.98 retrieving revision 1.99 diff -Lmain.c -Lmain.c -u -p -r1.98 -r1.99 --- main.c +++ main.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ static const char * const mandocerrs[MAN "list type must come first", "bad standard", "bad library", + "tab in non-literal context", "bad escape sequence", "unterminated quoted string", "argument requires the width argument", @@ -491,6 +493,26 @@ fdesc(struct curparse *curp) ++lnn; break; } + + /* + * Warn about bogus characters. If you're using + * non-ASCII encoding, you're screwing your + * readers. Since I'd rather this not happen, + * I'll be helpful and drop these characters so + * we don't display gibberish. Note to manual + * writers: use special characters. + */ + + if ( ! isgraph((u_char)blk.buf[i]) && + ! isblank((u_char)blk.buf[i])) { + if ( ! mmsg(MANDOCERR_BADCHAR, curp, + lnn_start, pos, + "ignoring byte")) + goto bailout; + i++; + continue; + } + /* Trailing backslash is like a plain character. */ if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) { if (pos >= (int)ln.sz) -- To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv