* mandoc mishandles tzfile(5)'s .IP \(bu "\w'\(bu 'u" [not found] ` <ZTWO6/0aqdxtS6Vg@asta-kit.de> @ 2023-10-23 0:41 ` Paul Eggert [not found] ` <20231023083059.h43j6g2cse3e55en@illithid> 2023-10-23 21:23 ` Ingo Schwarze 0 siblings, 2 replies; 4+ messages in thread From: Paul Eggert @ 2023-10-23 0:41 UTC (permalink / raw) To: tech; +Cc: g.branden.robinson, Ingo Schwarze, Alejandro Colomar [-- Attachment #1: Type: text/plain, Size: 781 bytes --] On 2023-10-22 14:06, Ingo Schwarze wrote: > mandoc only supports > ASCII strings as arguments to \w, not escape sequences or formatting > instructions. For the TZDB man pages mandoc need not support all that, just \(bu. Just to make sure we're on the same page, I reproduced the problem by running the command "mandoc -man -Tascii t.5", where t.5 contains the following lines: .TH tzfile 5 .SH NAME .IP \(bu "\w'\(bu 'u" xxx .PP yyy The output should contain two spaces between the bullet's "o" and the "x", but with current mandoc it contains five spaces. Proposed mandoc patch attached. This isn't a perfect emulation of groff, nor have I tested with fancy constructs, but it should be good enough for tzfile(5). [-- Attachment #2: mandoc-tzfile-fix.txt --] [-- Type: text/plain, Size: 5541 bytes --] Index: roff.c =================================================================== RCS file: /cvs/mandoc/roff.c,v retrieving revision 1.398 diff -u -r1.398 roff.c --- roff.c 22 Oct 2023 16:02:01 -0000 1.398 +++ roff.c 22 Oct 2023 20:59:52 -0000 @@ -1367,6 +1367,7 @@ int iarg; /* index beginning the argument */ int iendarg; /* index right after the argument */ int iend; /* index right after the sequence */ + int icols; /* output columns of sequence */ int isrc, idst; /* to reduce \\ and \. 
in names */ int deftype; /* type of definition to paste */ int argi; /* macro argument index */ @@ -1404,7 +1405,7 @@ */ if (roff_escape(buf->buf, ln, pos, &iesc, &inam, - &iarg, &iendarg, &iend) != ESCAPE_EXPAND) { + &iarg, &iendarg, &iend, &icols) != ESCAPE_EXPAND) { while (pos < iend) { if (buf->buf[pos] == ec) { buf->buf[pos] = '\\'; @@ -1552,7 +1553,7 @@ break; case 'w': (void)snprintf(ubuf, sizeof(ubuf), - "%d", (iendarg - iarg) * 24); + "%d", icols * 24); res = ubuf; break; default: @@ -4030,7 +4031,7 @@ if (cp[1] == '{' || cp[1] == '}') break; if (roff_escape(cp, 0, 0, NULL, &inam, - NULL, NULL, &iend) != ESCAPE_UNDEF) { + NULL, NULL, &iend, NULL) != ESCAPE_UNDEF) { mandoc_msg(MANDOCERR_NAMESC, ln, pos, "%.*s%.*s", namesz, name, iend, cp); cp += iend; Index: roff_escape.c =================================================================== RCS file: /cvs/mandoc/roff_escape.c,v retrieving revision 1.14 diff -u -r1.14 roff_escape.c --- roff_escape.c 8 Jun 2022 13:23:57 -0000 1.14 +++ roff_escape.c 22 Oct 2023 20:59:52 -0000 @@ -42,7 +42,7 @@ enum mandoc_esc rval; rval = roff_escape(--*rendarg, 0, 0, - NULL, NULL, &iarg, &iendarg, &iend); + NULL, NULL, &iarg, &iendarg, &iend, NULL); assert(rval != ESCAPE_EXPAND); if (rarg != NULL) *rarg = *rendarg + iarg; @@ -64,14 +64,16 @@ */ enum mandoc_esc roff_escape(const char *buf, const int ln, const int aesc, - int *resc, int *rnam, int *rarg, int *rendarg, int *rend) + int *resc, int *rnam, int *rarg, int *rendarg, int *rend, int *rcols) { int iesc; /* index of leading escape char */ int inam; /* index of escape name */ int iarg; /* index beginning the argument */ int iendarg; /* index right after the argument */ int iend; /* index right after the sequence */ - int sesc, snam, sarg, sendarg, send; /* for sub-escape */ + int icols; /* column width of sequence */ + int sesc, snam, sarg, sendarg, send, scols; + /* for sub-escape */ int escterm; /* whether term is escaped */ int maxl; /* expected length of the argument */ 
int argl; /* actual length of the argument */ @@ -98,6 +100,7 @@ */ iarg = iendarg = iend = inam + 1; + icols = 0; maxl = INT_MAX; term = '\0'; err = MANDOCERR_OK; @@ -141,11 +144,13 @@ case '\'': case '-': case '0': - case ':': case '_': case '`': case 'e': case '~': + icols++; + /* FALLTHROUGH */ + case ':': iarg--; argl = 1; rval = ESCAPE_SPECIAL; @@ -179,6 +184,7 @@ break; case '(': case '[': + icols++; rval = ESCAPE_SPECIAL; iendarg = iend = --iarg; break; @@ -208,6 +214,7 @@ term = '\b'; break; case 'C': + icols++; rval = ESCAPE_SPECIAL; term = '\b'; break; @@ -224,6 +231,7 @@ term = '\b'; break; case 'o': + icols++; rval = ESCAPE_OVERSTRIKE; term = '\b'; break; @@ -271,7 +279,7 @@ if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) && buf[iarg] == buf[iesc]) { stype = roff_escape(buf, ln, iendarg, - &sesc, &snam, &sarg, &sendarg, &send); + &sesc, &snam, &sarg, &sendarg, &send, &scols); if (stype == ESCAPE_EXPAND) goto out_sub; } @@ -285,11 +293,13 @@ buf[snam]) != NULL) { err = MANDOCERR_ESC_DELIM; iend = send; + icols += scols; iarg = iendarg = sesc; goto out; } escterm = 1; iarg = send; + icols += scols; term = buf[snam]; } else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL && strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) { @@ -347,10 +357,11 @@ } if (buf[iendarg] == buf[iesc]) { stype = roff_escape(buf, ln, iendarg, - &sesc, &snam, &sarg, &sendarg, &send); + &sesc, &snam, &sarg, &sendarg, &send, &scols); if (stype == ESCAPE_EXPAND) goto out_sub; iend = send; + icols += scols; if (escterm == 1 && (buf[snam] == term || buf[inam] == 'N')) break; @@ -366,6 +377,8 @@ valid_A = 0; if (maxl != INT_MAX) maxl--; + if (term == '\'') + icols++; iend = ++iendarg; } } @@ -502,6 +515,7 @@ iarg = sarg; iendarg = sendarg; iend = send; + icols = scols; rval = ESCAPE_EXPAND; out: @@ -515,6 +529,8 @@ *rendarg = iendarg; if (rend != NULL) *rend = iend; + if (rcols != NULL) + *rcols = icols; if (ln == 0) return rval; Index: roff_int.h 
=================================================================== RCS file: /cvs/mandoc/roff_int.h,v retrieving revision 1.20 diff -u -r1.20 roff_int.h --- roff_int.h 2 Jun 2022 11:29:07 -0000 1.20 +++ roff_int.h 22 Oct 2023 20:59:52 -0000 @@ -83,7 +83,7 @@ void roffhash_free(struct ohash *); enum mandoc_esc roff_escape(const char *, const int, const int, - int *, int *, int *, int *, int *); + int *, int *, int *, int *, int *, int *); void roff_state_reset(struct roff_man *); void roff_validate(struct roff_man *); ^ permalink raw reply [flat|nested] 4+ messages in thread
[parent not found: <20231023083059.h43j6g2cse3e55en@illithid>]
* Re: mandoc mishandles tzfile(5)'s .IP \(bu "\w'\(bu 'u" [not found] ` <20231023083059.h43j6g2cse3e55en@illithid> @ 2023-10-23 10:33 ` Alejandro Colomar 0 siblings, 0 replies; 4+ messages in thread From: Alejandro Colomar @ 2023-10-23 10:33 UTC (permalink / raw) To: G. Branden Robinson; +Cc: Paul Eggert, tech, Ingo Schwarze, Alejandro Colomar [-- Attachment #1: Type: text/plain, Size: 2119 bytes --] Hi Branden, On Mon, Oct 23, 2023 at 03:30:59AM -0500, G. Branden Robinson wrote: > Hi Paul, > > At 2023-10-22T17:41:28-0700, Paul Eggert wrote: > > On 2023-10-22 14:06, Ingo Schwarze wrote: > > > mandoc only supports ASCII strings as arguments to \w, not escape > > > sequences or formatting instructions. > > > > For the TZDB man pages mandoc need not support all that, just \(bu. > > > > Just to make sure we're on the same page, I reproduced the problem by > > running the command "mandoc -man -Tascii t.5", where t.5 contains the > > following lines: > > > > .TH tzfile 5 > > .SH NAME > > .IP \(bu "\w'\(bu 'u" > > xxx > > .PP > > yyy > > At the risk of being simplistic, why not just give `IP` an explicit > measurement as an argument? > > .IP \(bu 2n He feels that IP \(bu 3n is too long of a space in PDF. "\w'\(bu 'u" has the benefit of being 3n in terminals, but shorter in PDF. This was triggered after my suggestion of using 3[n] instead of 2[n] to clearly separate the bullet from the bulleted text, as documented in man-pages(7). Cheers, Alex > > (Or 3n, or 4n, or whatever looks best to you.) > > > The output should contain two spaces between the bullet's "o" and the > > "x", but with current mandoc it contains five spaces. > > If you're viewing on a terminal, `.IP \(bu 3n` should achieve this.[1] > > (Typesetters are a different story because how wide a bullet is depends > on the output device and the font.) > > I'm not saying that better mandoc(1) support for `\w` would be an awful > thing to have, but it doesn't seem necessary, to me, to solve this > specific problem. 
> > Regards, > Branden > > [1] Strictly, you can leave the "n" off, but I consider that slightly > sloppy, and I think that the explicit scaling unit is also helpful > as a reminder to the man page author that `IP`'s second argument, > unlike most arguments to man(7) macros, will _not_ be formatted as > text. I might take that. -- <https://www.alejandro-colomar.es/> [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: mandoc mishandles tzfile(5)'s .IP \(bu "\w'\(bu 'u" 2023-10-23 0:41 ` mandoc mishandles tzfile(5)'s .IP \(bu "\w'\(bu 'u" Paul Eggert [not found] ` <20231023083059.h43j6g2cse3e55en@illithid> @ 2023-10-23 21:23 ` Ingo Schwarze 2023-10-24 19:36 ` Paul Eggert 1 sibling, 1 reply; 4+ messages in thread From: Ingo Schwarze @ 2023-10-23 21:23 UTC (permalink / raw) To: Paul Eggert; +Cc: tech, g.branden.robinson, Alejandro Colomar Hi Paul, Paul Eggert wrote on Sun, Oct 22, 2023 at 05:41:28PM -0700: > On 2023-10-22 14:06, Ingo Schwarze wrote: >> mandoc only supports ASCII strings as arguments to \w, not escape >> sequences or formatting instructions. > For the TZDB man pages mandoc need not support all that, just \(bu. Thank you for identifying a subset of the functionality that is both useful and feasible. That's so much better than the TODO item for \w i have lying around - that one is much less important than what you suggested but horrifically difficult. > Just to make sure we're on the same page, I reproduced the problem by > running the command "mandoc -man -Tascii t.5", where t.5 contains the > following lines: > > .TH tzfile 5 > .SH NAME > .IP \(bu "\w'\(bu 'u" > xxx > .PP > yyy > > The output should contain two spaces between the bullet's "o" and the > "x", but with current mandoc it contains five spaces. Yes, that matches my understanding. > Proposed mandoc patch attached. This isn't a perfect emulation of groff, In this case, that's a virtue, even though in many other cases, compatibility with groff is indeed among the main goals. > nor have I tested with fancy constructs, Not a major roadblock; such testing becomes significantly easier with the existing test suite, and by adding to it. > but it should be good enough for tzfile(5). And likely for some other pages, too. Regarding the implementation: I did not like that you did most of the work in roff_escape.c. 
That function is called each and every time any parser or formatter wants to deal with any escape sequence. Your patch did additional work for almost every kind of escape sequence, even though \w is the only escape sequence needing that kind of work. On top of that, while changing the internal API mandoc_escape(3) is not unheard of and can be done when it is really important, the need for an API break did not seem very urgent to me here. Consequently, i chose to do all the work at one local place in the internal function roff_expand(). Even though that requires iterating the argument of every \w escape sequence twice, it causes less work regarding the grand total because \w is a rare escape sequence, occurring much less frequently than other sequences, and even for \w relevant extra work is only done if the argument contains embedded escape sequences, which will occur still less frequently. Encountering long, complicated, deeply nested escape sequences inside \w would definitely be unusual. Doing extra work for every parsing and every formatting of each and every escape sequence sounds clearly worse. Besides, roff_expand() is really the place where the logic belongs. The function roff_escape() is a parser. Its job is to figure out where the various syntax elements (sequence, name, argument) begin and end and what the class of the sequence is. Doing calculations based on the content of the argument is out of scope there, and considering that it's a parser, doing calculations related to formatting even more so. Calculating the information needed for interpolation really belongs into the string interpolation function roff_expand(), and given that \w in particular is related to formatting, the normal internal ESCAPE_* API that the formatters use for such purposes should be used here, too. Potentially, that might also make future refinements of the functionality easier: the more this code already resembles typical formatter code, the better. 
To summarize, i committed the following patch. Does that patch work for you as well as your own? Yours, Ingo Log Message: ----------- Support some escape sequences, in particular character escape sequences, inside \w arguments, and skip most other escape sequences when measuring the output length in this way because most escape sequences contribute little or nothing to text width: for example, consider font escapes in terminal output. This implementation is very rudimentary. In particular, it assumes that every character has the same width. No attempt is made to detect double-width or zero-width Unicode characters or to take dependencies on output devices or fonts into account. These limitations are hard to avoid because mandoc has to interpolate \w at the parsing stage when the output device is not yet known. I really do not want the content of the syntax tree to depend on the output device. Feature requested by Paul <Eggert at cs dot ucla dot edu>, who also submitted a patch, but i chose to commit this very different patch with almost the same functionality. His input was still very valuable because complete support for \w is out of the question, and consequently, the main task is identifying subsets of the feature that are needed for real-world manual pages and can be supported without uprooting the whole forest. 
Modified Files: -------------- mandoc: roff.7 roff.c mandoc/regress/roff/esc: w.in w.out_ascii w.out_lint Revision Data ------------- Index: roff.7 =================================================================== RCS file: /home/cvs/mandoc/mandoc/roff.7,v retrieving revision 1.120 retrieving revision 1.121 diff -Lroff.7 -Lroff.7 -u -p -r1.120 -r1.121 --- roff.7 +++ roff.7 @@ -1,6 +1,6 @@ .\" $Id$ .\" -.\" Copyright (c) 2010-2019, 2022 Ingo Schwarze <schwarze@openbsd.org> +.\" Copyright (c) 2010-2019, 2022-2023 Ingo Schwarze <schwarze@openbsd.org> .\" Copyright (c) 2010, 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv> .\" .\" Permission to use, copy, modify, and distribute this software for any @@ -2224,7 +2224,8 @@ The .Xr mandoc 1 implementation assumes that after expansion of user-defined strings, the .Ar string -only contains normal characters, no escape sequences, and that each +only contains normal characters, characters expressed as escape sequences, +and zero-width escape sequences, and that each character has a width of 24 basic units. 
.It Ic \eX\(aq Ns Ar string Ns Ic \(aq Output Index: roff.c =================================================================== RCS file: /home/cvs/mandoc/mandoc/roff.c,v retrieving revision 1.398 retrieving revision 1.399 diff -Lroff.c -Lroff.c -u -p -r1.398 -r1.399 --- roff.c +++ roff.c @@ -1,6 +1,6 @@ /* $Id$ */ /* - * Copyright (c) 2010-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org> + * Copyright (c) 2010-2015, 2017-2023 Ingo Schwarze <schwarze@openbsd.org> * Copyright (c) 2008-2012, 2014 Kristaps Dzonsons <kristaps@bsd.lv> * * Permission to use, copy, modify, and distribute this software for any @@ -1362,6 +1362,7 @@ roff_expand(struct roff *r, struct buf * const char *res; /* the string to be pasted */ const char *src; /* source for copying */ char *dst; /* destination for copying */ + enum mandoc_esc subtype; /* return value from roff_escape */ int iesc; /* index of leading escape char */ int inam; /* index of the escape name */ int iarg; /* index beginning the argument */ @@ -1551,8 +1552,34 @@ roff_expand(struct roff *r, struct buf * res = ubuf; break; case 'w': - (void)snprintf(ubuf, sizeof(ubuf), - "%d", (iendarg - iarg) * 24); + rsz = 0; + subtype = ESCAPE_UNDEF; + while (iarg < iendarg) { + asz = subtype == ESCAPE_SKIPCHAR ? 
0 : 1; + if (buf->buf[iarg] != '\\') { + rsz += asz; + iarg++; + continue; + } + switch ((subtype = roff_escape(buf->buf, 0, + iarg, NULL, NULL, NULL, NULL, &iarg))) { + case ESCAPE_SPECIAL: + case ESCAPE_NUMBERED: + case ESCAPE_UNICODE: + case ESCAPE_OVERSTRIKE: + case ESCAPE_UNDEF: + break; + case ESCAPE_DEVICE: + asz *= 8; + break; + case ESCAPE_EXPAND: + abort(); + default: + continue; + } + rsz += asz; + } + (void)snprintf(ubuf, sizeof(ubuf), "%d", rsz * 24); res = ubuf; break; default: Index: w.out_ascii =================================================================== RCS file: /home/cvs/mandoc/mandoc/regress/roff/esc/w.out_ascii,v retrieving revision 1.3 retrieving revision 1.4 diff -Lregress/roff/esc/w.out_ascii -Lregress/roff/esc/w.out_ascii -u -p -r1.3 -r1.4 --- regress/roff/esc/w.out_ascii +++ regress/roff/esc/w.out_ascii @@ -8,6 +8,13 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN character: 24 blank: 24 text: 96 + special: 24 + numbered: 24 + Unicode: 24 + overstrike: 24 + undefined: 24 + zero-width: 0 + skipchar: 48 A\bAr\brg\bgu\bum\bme\ben\bnt\bt d\bde\bel\bli\bim\bmi\bit\bte\ber\brs\bs unsupported \r: 24u @@ -27,4 +34,4 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN overstrike: 24u unterminated: 72 -OpenBSD June 8, 2022 OpenBSD +OpenBSD October 23, 2023 OpenBSD Index: w.out_lint =================================================================== RCS file: /home/cvs/mandoc/mandoc/regress/roff/esc/w.out_lint,v retrieving revision 1.7 retrieving revision 1.8 diff -Lregress/roff/esc/w.out_lint -Lregress/roff/esc/w.out_lint -u -p -r1.7 -r1.8 --- regress/roff/esc/w.out_lint +++ regress/roff/esc/w.out_lint @@ -1,4 +1,5 @@ -mandoc: w.in:17:20: UNSUPP: unsupported escape sequence: \r -mandoc: w.in:17:23: UNSUPP: unsupported escape sequence: \r -mandoc: w.in:23:16: WARNING: undefined escape, printing literally: \G -mandoc: w.in:51:15: ERROR: incomplete escape sequence: \w'foo +mandoc: w.in:25:15: WARNING: undefined escape, printing literally: \G 
+mandoc: w.in:31:20: UNSUPP: unsupported escape sequence: \r +mandoc: w.in:31:23: UNSUPP: unsupported escape sequence: \r +mandoc: w.in:37:16: WARNING: undefined escape, printing literally: \G +mandoc: w.in:65:15: ERROR: incomplete escape sequence: \w'foo Index: w.in =================================================================== RCS file: /home/cvs/mandoc/mandoc/regress/roff/esc/w.in,v retrieving revision 1.3 retrieving revision 1.4 diff -Lregress/roff/esc/w.in -Lregress/roff/esc/w.in -u -p -r1.3 -r1.4 --- regress/roff/esc/w.in +++ regress/roff/esc/w.in @@ -1,4 +1,4 @@ -.\" $OpenBSD: w.in,v 1.4 2022/06/08 13:08:00 schwarze Exp $ +.\" $OpenBSD: w.in,v 1.5 2023/10/23 20:07:19 schwarze Exp $ .Dd $Mdocdate$ .Dt ESC-W 1 .Os @@ -13,6 +13,20 @@ character: \w'n' blank: \w' ' .br text: \w'text' +.br +special: \w'\(bu' +.br +numbered: \w'\N'100'' +.br +Unicode: \w'\[u2013]' +.br +overstrike: \w'\o'ab'' +.br +undefined: \w'\G' +.br +zero-width: \w'\fB\&\fP' +.br +skipchar: \w'a\zb\z\(buc' .Ss Argument delimiters unsupported \er: \w\rM\ru .br -- To unsubscribe send an email to tech+unsubscribe@mandoc.bsd.lv ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: mandoc mishandles tzfile(5)'s .IP \(bu "\w'\(bu 'u" 2023-10-23 21:23 ` Ingo Schwarze @ 2023-10-24 19:36 ` Paul Eggert 0 siblings, 0 replies; 4+ messages in thread From: Paul Eggert @ 2023-10-24 19:36 UTC (permalink / raw) To: tech; +Cc: g.branden.robinson, Alejandro Colomar On 10/23/23 14:23, Ingo Schwarze wrote: > Does that patch work for you as well as your own? Yes, thanks, it works for tzfile(5). -- To unsubscribe send an email to tech+unsubscribe@mandoc.bsd.lv ^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-10-24 19:36 UTC | newest] Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <884cb5d0-27ce-a5ca-b449-972021e62e92@gmail.com> [not found] ` <7c3294cf-e3d0-c716-d1c0-5b6c5c757d7e@cs.ucla.edu> [not found] ` <7eb92df5-6c87-8384-c4a8-2a00eabf1c8e@gmail.com> [not found] ` <66ef5b92-6e19-8bea-2840-6c2f0240d225@cs.ucla.edu> [not found] ` <ZTWO6/0aqdxtS6Vg@asta-kit.de> 2023-10-23 0:41 ` mandoc mishandles tzfile(5)'s .IP \(bu "\w'\(bu 'u" Paul Eggert [not found] ` <20231023083059.h43j6g2cse3e55en@illithid> 2023-10-23 10:33 ` Alejandro Colomar 2023-10-23 21:23 ` Ingo Schwarze 2023-10-24 19:36 ` Paul Eggert
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).