From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from krisdoz.my.domain (schwarze@localhost [127.0.0.1]) by krisdoz.my.domain (8.14.5/8.14.5) with ESMTP id s9QHC85l010968 for ; Sun, 26 Oct 2014 13:12:08 -0400 (EDT) Received: (from schwarze@localhost) by krisdoz.my.domain (8.14.5/8.14.3/Submit) id s9QHC3Wl019594; Sun, 26 Oct 2014 13:12:03 -0400 (EDT) Date: Sun, 26 Oct 2014 13:12:03 -0400 (EDT) Message-Id: <201410261712.s9QHC3Wl019594@krisdoz.my.domain> X-Mailinglist: mdocml-source Reply-To: source@mdocml.bsd.lv MIME-Version: 1.0 From: schwarze@mdocml.bsd.lv To: source@mdocml.bsd.lv Subject: mdocml: Improve -Tascii output for Unicode escape sequences: For the X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/ Content-Type: text/plain; charset=utf-8 Log Message: ----------- Improve -Tascii output for Unicode escape sequences: For the first 512 code points, provide ASCII approximations. This is already much better than what groff does, which prints nothing for most code points. A few minor fixes while here: * Handle Unicode escape sequences in the ASCII range. * In case of errors, use the REPLACEMENT CHARACTER U+FFFD for -Tutf8 and the string "<?>" for -Tascii output. * Handle all one-character escape sequences in mchars_spec2{cp,str}() and remove the workarounds on the higher level. 
Modified Files: -------------- mdocml: chars.c html.c term.c term.h term_ascii.c Revision Data ------------- Index: term.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v retrieving revision 1.228 retrieving revision 1.229 diff -Lterm.c -Lterm.c -u -p -r1.228 -r1.229 --- term.c +++ term.c @@ -444,27 +444,14 @@ term_word(struct termp *p, const char *w if (ESCAPE_ERROR == esc) continue; - if (TERMENC_ASCII != p->enc) - switch (esc) { - case ESCAPE_UNICODE: - uc = mchars_num2uc(seq + 1, sz - 1); - if ('\0' == uc) - break; - encode1(p, uc); - continue; - case ESCAPE_SPECIAL: - uc = mchars_spec2cp(p->symtab, seq, sz); - if (uc <= 0) - break; - encode1(p, uc); - continue; - default: - break; - } - switch (esc) { case ESCAPE_UNICODE: - encode1(p, '?'); + uc = mchars_num2uc(seq + 1, sz - 1); + if (p->enc == TERMENC_ASCII) { + cp = ascii_uc2str(uc); + encode(p, cp, strlen(cp)); + } else + encode1(p, uc); break; case ESCAPE_NUMBERED: c = mchars_num2char(seq, sz); @@ -472,11 +459,19 @@ term_word(struct termp *p, const char *w encode(p, &c, 1); break; case ESCAPE_SPECIAL: - cp = mchars_spec2str(p->symtab, seq, sz, &ssz); - if (NULL != cp) - encode(p, cp, ssz); - else if (1 == ssz) - encode(p, seq, sz); + if (p->enc == TERMENC_ASCII) { + cp = mchars_spec2str(p->symtab, + seq, sz, &ssz); + if (cp == NULL) + encode(p, "", 3); + else + encode(p, cp, ssz); + } else { + uc = mchars_spec2cp(p->symtab, seq, sz); + if (uc <= 0) + uc = 0xFFFD; + encode1(p, uc); + } break; case ESCAPE_FONTBOLD: term_fontrepl(p, TERMFONT_BOLD); @@ -683,31 +678,16 @@ term_strlen(const struct termp *p, const if (ESCAPE_ERROR == esc) continue; - if (TERMENC_ASCII != p->enc) - switch (esc) { - case ESCAPE_UNICODE: - c = mchars_num2uc(seq + 1, - ssz - 1); - if ('\0' == c) - break; - sz += cond_width(p, c, &skip); - continue; - case ESCAPE_SPECIAL: - c = mchars_spec2cp(p->symtab, - seq, ssz); - if (c <= 0) - break; - sz += cond_width(p, 
c, &skip); - continue; - default: - break; - } - rhs = NULL; switch (esc) { case ESCAPE_UNICODE: - sz += cond_width(p, '?', &skip); + c = mchars_num2uc(seq + 1, sz - 1); + if (p->enc == TERMENC_ASCII) { + rhs = ascii_uc2str(c); + rsz = strlen(rhs); + } else + sz += cond_width(p, c, &skip); break; case ESCAPE_NUMBERED: c = mchars_num2char(seq, ssz); @@ -715,14 +695,20 @@ term_strlen(const struct termp *p, const sz += cond_width(p, c, &skip); break; case ESCAPE_SPECIAL: - rhs = mchars_spec2str(p->symtab, - seq, ssz, &rsz); - - if (ssz != 1 || rhs) - break; - - rhs = seq; - rsz = ssz; + if (p->enc == TERMENC_ASCII) { + rhs = mchars_spec2str(p->symtab, + seq, ssz, &rsz); + if (rhs == NULL) { + rhs = ""; + rsz = 3; + } + } else { + c = mchars_spec2cp(p->symtab, + seq, ssz); + if (c <= 0) + c = 0xFFFD; + sz += cond_width(p, c, &skip); + } break; case ESCAPE_SKIPCHAR: skip = 1; Index: chars.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/chars.c,v retrieving revision 1.59 retrieving revision 1.60 diff -Lchars.c -Lchars.c -u -p -r1.59 -r1.60 --- chars.c +++ chars.c @@ -1,7 +1,7 @@ /* $Id$ */ /* * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons - * Copyright (c) 2011 Ingo Schwarze + * Copyright (c) 2011, 2014 Ingo Schwarze * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -104,9 +104,7 @@ mchars_spec2cp(const struct mchars *arg, const struct ln *ln; ln = find(arg, p, sz); - if (NULL == ln) - return(-1); - return(ln->unicode); + return(ln != NULL ? ln->unicode : sz == 1 ? *p : -1); } char @@ -126,20 +124,13 @@ mchars_num2uc(const char *p, size_t sz) int i; if ((i = mandoc_strntoi(p, sz, 16)) < 0) - return('\0'); + return(0xFFFD); /* - * Security warning: - * Never extend the range of accepted characters - * to overlap with the ASCII range, 0x00-0x7F - * without re-auditing the callers of this function. 
- * Some callers might relay on the fact that we never - * return ASCII characters for their escaping decisions. - * * XXX Code is missing here to exclude bogus ranges. */ - return(i > 0x80 && i <= 0x10FFFF ? i : '\0'); + return(i <= 0x10FFFF ? i : 0xFFFD); } const char * @@ -149,9 +140,9 @@ mchars_spec2str(const struct mchars *arg const struct ln *ln; ln = find(arg, p, sz); - if (NULL == ln) { + if (ln == NULL) { *rsz = 1; - return(NULL); + return(sz == 1 ? p : NULL); } *rsz = strlen(ln->ascii); Index: term.h =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.h,v retrieving revision 1.103 retrieving revision 1.104 diff -Lterm.h -Lterm.h -u -p -r1.103 -r1.104 --- term.h +++ term.h @@ -104,6 +104,8 @@ struct termp { struct termp_ps *ps; }; +const char *ascii_uc2str(int); + void term_eqn(struct termp *, const struct eqn *); void term_tbl(struct termp *, const struct tbl_span *); void term_free(struct termp *); Index: term_ascii.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term_ascii.c,v retrieving revision 1.33 retrieving revision 1.34 diff -Lterm_ascii.c -Lterm_ascii.c -u -p -r1.33 -r1.34 --- term_ascii.c +++ term_ascii.c @@ -166,6 +166,81 @@ ascii_setwidth(struct termp *p, int iop, p->rmargin = p->maxrmargin = p->defrmargin; } +const char * +ascii_uc2str(int uc) +{ + static const char nbrsp[2] = { ASCII_NBRSP, '\0' }; + static const char *tab[] = { + "","","","","","","","", + "", "\t", "", "", "", "", "", "", + "","","","","","","","", + "","", "","","", "", "", "", + " ", "!", "\"", "#", "$", "%", "&", "'", + "(", ")", "*", "+", ",", "-", ".", "/", + "0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", ":", ";", "<", "=", ">", "?", + "@", "A", "B", "C", "D", "E", "F", "G", + "H", "I", "J", "K", "L", "M", "N", "O", + "P", "Q", "R", "S", "T", "U", "V", "W", + "X", "Y", "Z", "[", "\\", "]", "^", "_", + "`", "a", "b", "c", "d", 
"e", "f", "g", + "h", "i", "j", "k", "l", "m", "n", "o", + "p", "q", "r", "s", "t", "u", "v", "w", + "x", "y", "z", "{", "|", "}", "~", "", + "<80>", "<81>", "<82>", "<83>", "<84>", "<85>", "<86>", "<87>", + "<88>", "<89>", "<8A>", "<8B>", "<8C>", "<8D>", "<8E>", "<8F>", + "<90>", "<91>", "<92>", "<93>", "<94>", "<95>", "<96>", "<97>", + "<99>", "<99>", "<9A>", "<9B>", "<9C>", "<9D>", "<9E>", "<9F>", + nbrsp, "!", "c", "GBP", "$?", "Y=", "|", "", + "\"", "(C)", "a.", "<<", "","", "(R)", "-", + "","+-", "^2", "^3", "'", "", "","*", + ",", "^1", "o.", ">>", "1/4", "1/2", "3/4", "?", + "A", "A", "A", "A", "Ae", "Aa", "AE", "C", + "E", "E", "E", "E", "I", "I", "I", "I", + "D", "N", "O", "O", "O", "O", "Oe", "*", + "Oe", "U", "U", "U", "Ue", "Y", "Th", "ss", + "a", "a", "a", "a", "ae", "aa", "ae", "c", + "e", "e", "e", "e", "i", "i", "i", "i", + "d", "n", "o", "o", "o", "o", "oe", "/", + "oe", "u", "u", "u", "ue", "y", "th", "y", + "A", "a", "A", "a", "A", "a", "C", "c", + "C", "c", "C", "c", "C", "c", "D", "d", + "D", "d", "E", "e", "E", "e", "E", "e", + "E", "e", "E", "e", "G", "g", "G", "g", + "G", "g", "G", "g", "H", "h", "H", "h", + "I", "i", "I", "i", "I", "i", "I", "i", + "I", "i", "IJ", "ij", "J", "j", "K", "k", + "q", "L", "l", "L", "l", "L", "l", "L", + "l", "L", "l", "N", "n", "N", "n", "N", + "n", "'n", "Ng", "ng", "O", "o", "O", "o", + "O", "o", "OE", "oe", "R", "r", "R", "r", + "R", "r", "S", "s", "S", "s", "S", "s", + "S", "s", "T", "t", "T", "t", "T", "t", + "U", "u", "U", "u", "U", "u", "U", "u", + "U", "u", "U", "u", "W", "w", "Y", "y", + "Y", "Z", "z", "Z", "z", "Z", "z", "s", + "b", "B", "B", "b", "6", "6", "O", "C", + "c", "D", "D", "D", "d", "d", "3", "@", + "E", "F", "f", "G", "G", "hv", "I", "I", + "K", "k", "l", "l", "W", "N", "n", "O", + "O", "o", "OI", "oi", "P", "p", "YR", "2", + "2", "SH", "sh", "t", "T", "t", "T", "U", + "u", "Y", "V", "Y", "y", "Z", "z", "ZH", + "ZH", "zh", "zh", "2", "5", "5", "ts", "w", + "|", "||", "|=", "!", "DZ", 
"Dz", "dz", "LJ", + "Lj", "lj", "NJ", "Nj", "nj", "A", "a", "I", + "i", "O", "o", "U", "u", "U", "u", "U", + "u", "U", "u", "U", "u", "@", "A", "a", + "A", "a", "AE", "ae", "G", "g", "G", "g", + "K", "k", "O", "o", "O", "o", "ZH", "zh", + "j", "DZ", "D", "dz", "G", "g", "HV", "W", + "N", "n", "A", "a", "AE", "ae", "O", "o"}; + + if (uc < 0 || (size_t)uc >= sizeof(tab)/sizeof(tab[0])) + return(""); + return(tab[uc]); +} + static size_t ascii_width(const struct termp *p, int c) { Index: html.c =================================================================== RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v retrieving revision 1.176 retrieving revision 1.177 diff -Lhtml.c -Lhtml.c -u -p -r1.176 -r1.177 --- html.c +++ html.c @@ -437,8 +437,18 @@ print_encode(struct html *h, const char case ESCAPE_UNICODE: /* Skip past "u" header. */ c = mchars_num2uc(seq + 1, len - 1); - if ('\0' != c) - printf("&#x%x;", c); + + /* + * XXX Security warning: + * For now, forbid Unicode obfuscation of ASCII + * characters. An audit of the callers is + * required before this can be removed. + */ + + if (c < 0x80) + c = 0xFFFD; + + printf("&#x%x;", c); break; case ESCAPE_NUMBERED: c = mchars_num2char(seq, len); -- To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv