* pod2mdoc: Proper translation of POD escape sequences into mdoc macros,
@ 2014-07-18 5:09 schwarze
0 siblings, 0 replies; only message in thread
From: schwarze @ 2014-07-18 5:09 UTC (permalink / raw)
To: source
Log Message:
-----------
Proper translation of POD escape sequences into mdoc macros,
including correct whitespace handling.
This is surprisingly hard to get right.
The main idea is to not write Ns at the end of formatcode(),
but instead clear the wantws state and let the caller parse
ahead (in the manner required by the context) to see what is
following, then break the line or print Ns as appropriate.
Also use the following logic throughout:
* Clear wantws after writing a non-whitespace character.
* Switch to text state after flushing text output.
* Switch back to OUST_NL/wantws after a line break.
Modified Files:
--------------
pod2mdoc:
pod2mdoc.c
pod2mdoc/Regress:
Makefile.inc
pod2mdoc/Regress/formatcode:
Makefile
invalid.pod
textline.pod
Added Files:
-----------
pod2mdoc/Regress/formatcode:
code.pod
macroline.pod
Revision Data
-------------
Index: pod2mdoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/pod2mdoc/pod2mdoc.c,v
retrieving revision 1.31
retrieving revision 1.32
diff -Lpod2mdoc.c -Lpod2mdoc.c -u -p -r1.31 -r1.32
--- pod2mdoc.c
+++ pod2mdoc.c
@@ -52,6 +52,12 @@ enum sect {
SECT_SYNOPSIS, /* SYNOPSIS section */
};
+enum outstate {
+ OUST_NL = 0, /* just started a new output line */
+ OUST_TXT, /* text line output in progress */
+ OUST_MAC /* macro line output in progress */
+};
+
struct state {
const char *fname; /* file being parsed */
int parsing; /* after =cut of before command */
@@ -61,7 +67,8 @@ struct state {
enum list lstack[LIST_STACKSZ]; /* open lists */
size_t lpos; /* where in list stack */
int haspar; /* in paragraph: do we need Pp? */
- int hasnl; /* in text: just started a new line */
+ enum outstate oust; /* state of the mdoc output stream */
+ int wantws; /* let mdoc(7) output whitespace here */
char *outbuf; /* text buffered for output */
size_t outbufsz; /* allocated size of outbuf */
size_t outbuflen; /* current length of outbuf */
@@ -150,6 +157,7 @@ outbuf_addchar(struct state *st)
if ('\\' == last)
st->outbuf[st->outbuflen++] = 'e';
st->outbuf[st->outbuflen] = '\0';
+ st->wantws = 0;
}
static void
@@ -162,6 +170,7 @@ outbuf_addstr(struct state *st, const ch
outbuf_grow(st, slen);
memcpy(st->outbuf + st->outbuflen, str, slen+1);
last = str[slen - 1];
+ st->wantws = 0;
}
static void
@@ -174,25 +183,32 @@ outbuf_flush(struct state *st)
fputs(st->outbuf, stdout);
*st->outbuf = '\0';
st->outbuflen = 0;
- st->hasnl = 0;
+
+ if (OUST_NL == st->oust)
+ st->oust = OUST_TXT;
}
static void
-outbuf_newln(struct state *st)
+mdoc_newln(struct state *st)
{
- if ('\n' == last)
+ if (OUST_NL == st->oust)
return;
- outbuf_flush(st);
+
putchar('\n');
last = '\n';
- st->hasnl = 1;
+ st->oust = OUST_NL;
+ st->wantws = 1;
}
/*
* Given buf[*start] is at the start of an escape name, read til the end
* of the escape ('>') then try to do something with it.
* Sets start to be one after the '>'.
+ *
+ * This function does not care about output modes,
+ * it merely appends text to the output buffer,
+ * which can then be used in any mode.
*/
static void
formatescape(struct state *st, const char *buf, size_t *start, size_t end)
@@ -237,6 +253,9 @@ formatescape(struct state *st, const cha
* I set "start" to be the end of the sequence (last right-carrot) so
* that the caller can safely just continue processing.
* If this is just an empty tag, I'll return 0.
+ *
+ * Always operates in OUST_MAC mode.
+ * Mode handling is done by the caller.
*/
static int
trylink(const char *buf, size_t *start, size_t end, size_t dsz)
@@ -371,6 +390,9 @@ trylink(const char *buf, size_t *start,
* Our flag might be followed by an argument, so make sure that we're
* accounting for that, too.
* If we don't have a flag at all, however, then assume we're an "Ar".
+ *
+ * Always operates in OUST_MAC mode.
+ * Mode handling is done by the caller.
*/
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
@@ -436,24 +458,23 @@ again:
* like X<...> and can contain nested format codes.
* This consumes the whole format code, and any nested format codes, til
* the end of matched production.
- * If "reentrant", then we're being called after a macro has already
- * been printed to the current line.
* If "nomacro", then we don't print any macros, just contained data
* (e.g., following "Sh" or "Nm").
* "pos" is only significant in SYNOPSIS, and should be 0 when invoked
* as the first format code on a line (for decoration as an "Nm"),
* non-zero otherwise.
- * Return whether we've printed a macro or not--in other words, whether
- * this should trigger a subsequent newline (this should be ignored when
- * reentrant).
+ *
+ * Output mode handling is most complicated here.
+ * We may enter in any mode.
+ * We usually exit in OUST_MAC mode, except when
+ * entering without OUST_MAC and the code is invalid.
*/
-static int
+static void
formatcode(struct state *st, const char *buf, size_t *start,
- size_t end, int reentrant, int nomacro, int pos)
+ size_t end, int nomacro, int pos)
{
enum fmt fmt;
size_t i, j, dsz;
- int white;
assert(*start + 1 < end);
assert('<' == buf[*start + 1]);
@@ -491,7 +512,7 @@ formatcode(struct state *st, const char
*/
if (FMT_ESCAPE == fmt) {
formatescape(st, buf, start, end);
- return(0);
+ return;
} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
/*
* Just consume til the end delimiter, accounting for
@@ -521,7 +542,7 @@ formatcode(struct state *st, const char
if (isspace(last))
while (*start < end && isspace((int)buf[*start]))
(*start)++;
- return(0);
+ return;
}
/*
@@ -529,17 +550,26 @@ formatcode(struct state *st, const char
* suppressed in, e.g., "Nm" and "Sh" macros).
*/
if (FMT__MAX != fmt && !nomacro) {
- white = ' ' == last || '\n' == last ||
- ' ' == buf[*start];
+
+ /*
+ * We may already have wantws if there was whitespace
+ * before the code ("text B<text"), but initial
+ * whitespace inside our scope ("textB< text")
+ * allows breaking at this point as well.
+ */
+
+ st->wantws |= ' ' == buf[*start];
/*
* If we are on a text line and there is no
* whitespace before our content, we have to make
* the previous word a prefix to the macro line.
+ * In the following, mdoc_newln() must not be used
+ * lest we clobber our output state.
*/
- if ( ! white && ! reentrant) {
- if ( ! st->hasnl)
+ if (OUST_MAC != st->oust && !st->wantws) {
+ if (OUST_NL != st->oust)
putchar('\n');
printf(".Pf ");
}
@@ -548,19 +578,22 @@ formatcode(struct state *st, const char
/* Whitespace is easier to suppress on macro lines. */
- if ( ! white && reentrant)
- printf(" Ns");
+ if (OUST_MAC == st->oust && !st->wantws)
+ printf(" Ns ");
/* Unless we are on a macro line, start one. */
- if (white && ! reentrant) {
- if (last != '\n')
+ if (OUST_MAC != st->oust && st->wantws) {
+ if (OUST_NL != st->oust)
putchar('\n');
putchar('.');
} else
putchar(' ');
- /* Print the macro corresponding to this format code. */
+ /*
+ * Print the macro corresponding to this format code,
+ * and update the output state afterwards.
+ */
switch (fmt) {
case (FMT_ITALIC):
@@ -600,6 +633,8 @@ formatcode(struct state *st, const char
default:
abort();
}
+ st->oust = OUST_MAC;
+ st->wantws = 1;
} else
outbuf_flush(st);
@@ -631,74 +666,86 @@ formatcode(struct state *st, const char
}
}
if (*start + 1 < end && '<' == buf[*start + 1]) {
- formatcode(st, buf, start, end, 1, nomacro, 1);
+ formatcode(st, buf, start, end, nomacro, 1);
continue;
}
- /*
- * Make sure that any macro-like words (or
- * really any word starting with a capital
- * letter) is assumed to be a macro that must be
- * escaped.
- * This matches "Xx " and "XxEOLN".
- */
- if ((' ' == last || '\n' == last) &&
- end - *start > 1 &&
- isupper((int)buf[*start]) &&
- islower((int)buf[*start + 1]) &&
- (end - *start == 2 ||
- ' ' == buf[*start + 2]))
- printf("\\&");
+ /* Suppress newlines and multiple spaces. */
- /* Suppress newline. */
- if ('\n' == buf[*start])
- putchar(last = ' ');
- else
- putchar(last = buf[*start]);
+ last = buf[(*start)++];
+ if (' ' == last || '\n' == last) {
+ putchar(' ');
+ while (*start < end && ' ' == buf[*start])
+ (*start)++;
+ continue;
+ }
+
+ if (OUST_MAC == st->oust) {
+ if ( ! st->wantws) {
+ printf(" Ns ");
+ st->wantws = 1;
+ }
+
+ /*
+ * Escape macro-like words.
+ * This matches "Xx " and "XxEOLN".
+ */
+
+ if (end - *start > 0 &&
+ isupper((unsigned char)last) &&
+ islower((unsigned char)buf[*start]) &&
+ (end - *start == 1 ||
+ ' ' == buf[*start + 1] ||
+ '>' == buf[*start + 1]))
+ printf("\\&");
+ }
+
+ putchar(last);
/* Protect against character escapes. */
+
if ('\\' == last)
putchar('e');
-
- (*start)++;
-
- if (' ' == last)
- while (*start < end && ' ' == buf[*start])
- (*start)++;
}
- if (FMT__MAX == fmt)
- return(0);
-
if ( ! nomacro && FMT_CODE == fmt)
printf(" Qc ");
- /*
- * We're now subsequent the format code.
- * If there isn't a space (or newline) here, and we haven't just
- * printed a space, then suppress space.
- */
- if ( ! nomacro && ' ' != last)
- if (' ' != buf[*start] && '\n' != buf[*start])
- printf(" Ns ");
-
- return(1);
+ if (FMT__MAX != fmt)
+ st->wantws = ' ' == last;
}
/*
* Calls formatcode() til the end of a paragraph.
+ * Goes to OUST_MAC mode and stays there when returning,
+ * such that the caller can add arguments to the macro line
+ * before closing it out.
*/
static void
-formatcodeln(struct state *st, const char *buf,
- size_t *start, size_t end, int nomacro)
+formatcodeln(struct state *st, const char *linemac,
+ const char *buf, size_t *start, size_t end, int nomacro)
{
+ assert(OUST_NL == st->oust);
+ assert(st->wantws);
+ printf(".%s ", linemac);
+ st->oust = OUST_MAC;
+
last = ' ';
while (*start < end) {
if (*start + 1 < end && '<' == buf[*start + 1]) {
- formatcode(st, buf, start, end, 1, nomacro, 1);
+ formatcode(st, buf, start, end, nomacro, 1);
continue;
}
+
+ if (OUST_MAC == st->oust) {
+ if ( ! st->wantws &&
+ ' ' != buf[*start] &&
+ '\n' != buf[*start])
+ printf(" Ns ");
+ st->wantws = 1;
+ }
+
/*
* Since we're already on a macro line, we want to make
* sure that we don't inadvertently invoke a macro.
@@ -752,6 +799,9 @@ listguess(const char *buf, size_t start,
* A command paragraph, as noted in the perlpod manual, just indicates
* that we should do something, optionally with some text to print as
* well.
+ * From the perspective of external callers,
+ * always stays in OUST_NL/wantws mode,
+ * but its children do use OUST_MAC.
*/
static void
command(struct state *st, const char *buf, size_t start, size_t end)
@@ -795,7 +845,6 @@ command(struct state *st, const char *bu
* The behaviour of head= follows from a quick glance at
* how pod2man handles it.
*/
- printf(".Sh ");
st->sect = SECT_NONE;
if (end - start == 4) {
if (0 == memcmp(&buf[start], "NAME", 4))
@@ -804,29 +853,26 @@ command(struct state *st, const char *bu
if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
st->sect = SECT_SYNOPSIS;
}
- formatcodeln(st, buf, &start, end, 1);
- putchar(last = '\n');
+ formatcodeln(st, "Sh", buf, &start, end, 1);
+ mdoc_newln(st);
st->haspar = 1;
break;
case (CMD_HEAD2):
- printf(".Ss ");
- formatcodeln(st, buf, &start, end, 1);
- putchar(last = '\n');
+ formatcodeln(st, "Ss", buf, &start, end, 1);
+ mdoc_newln(st);
st->haspar = 1;
break;
case (CMD_HEAD3):
puts(".Pp");
- printf(".Em ");
- formatcodeln(st, buf, &start, end, 0);
- putchar(last = '\n');
+ formatcodeln(st, "Em", buf, &start, end, 0);
+ mdoc_newln(st);
puts(".Pp");
st->haspar = 1;
break;
case (CMD_HEAD4):
puts(".Pp");
- printf(".No ");
- formatcodeln(st, buf, &start, end, 0);
- putchar(last = '\n');
+ formatcodeln(st, "No", buf, &start, end, 0);
+ mdoc_newln(st);
puts(".Pp");
st->haspar = 1;
break;
@@ -878,9 +924,8 @@ command(struct state *st, const char *bu
}
switch (st->lstack[st->lpos - 1]) {
case (LIST_TAG):
- printf(".It ");
- formatcodeln(st, buf, &start, end, 0);
- putchar(last = '\n');
+ formatcodeln(st, "It", buf, &start, end, 0);
+ mdoc_newln(st);
break;
case (LIST_ENUM):
/* FALLTHROUGH */
@@ -932,6 +977,8 @@ command(struct state *st, const char *bu
/*
* Just pump out the line in a verbatim block.
+ * From the perspective of external callers,
+ * always stays in OUST_NL/wantws mode.
*/
static void
verbatim(struct state *st, const char *buf, size_t start, size_t end)
@@ -1020,22 +1067,21 @@ hasmatch(const char *buf, size_t start,
* If we're an ending bracket, see if we have a stack already.
*/
static int
-dosynopsisop(const char *buf, size_t *start, size_t end, size_t *opstack)
+dosynopsisop(struct state *st, const char *buf,
+ size_t *start, size_t end, size_t *opstack)
{
assert('[' == buf[*start] || ']' == buf[*start]);
if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
- if ('\n' != last)
- putchar('\n');
+ mdoc_newln(st);
puts(".Oo");
(*opstack)++;
} else if ('[' == buf[*start])
return(0);
if (']' == buf[*start] && *opstack > 0) {
- if ('\n' != last)
- putchar('\n');
+ mdoc_newln(st);
puts(".Oc");
(*opstack)--;
} else if (']' == buf[*start])
@@ -1050,12 +1096,18 @@ dosynopsisop(const char *buf, size_t *st
/*
* Format multiple "Nm" manpage names in the NAME section.
+ * From the perspective of external callers,
+ * always stays in OUST_NL/wantws mode,
+ * but its children do use OUST_MAC.
*/
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
size_t word;
+ assert(OUST_NL == st->oust);
+ assert(st->wantws);
+
while (*start < end && ' ' == buf[*start])
(*start)++;
@@ -1065,17 +1117,17 @@ donamenm(struct state *st, const char *b
}
while (*start < end) {
- fputs(".Nm ", stdout);
for (word = *start; word < end; word++)
if (',' == buf[word])
break;
- formatcodeln(st, buf, start, word, 1);
+ formatcodeln(st, "Nm", buf, start, word, 1);
if (*start == end) {
- putchar(last = '\n');
- continue;
+ mdoc_newln(st);
+ break;
}
assert(',' == buf[*start]);
- puts(" ,");
+ printf(" ,");
+ mdoc_newln(st);
(*start)++;
while (*start < end && ' ' == buf[*start])
(*start)++;
@@ -1089,6 +1141,11 @@ donamenm(struct state *st, const char *b
* Lots of other snakes in the grass: escaping a newline followed by a
* period (accidental mdoc(7) control), double-newlines after macro
* passages, etc.
+ *
+ * Uses formatcode() to go to OUST_MAC mode
+ * and outbuf_flush() to go to OUST_TXT mode.
+ * Main text mode wantws handling is in this function.
+ * Must make sure to go back to OUST_NL/wantws mode before returning.
*/
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
@@ -1119,9 +1176,8 @@ ordinary(struct state *st, const char *b
start = j + 1;
while (start < end && ' ' == buf[start])
start++;
- fputs(".Nd ", stdout);
- formatcodeln(st, buf, &start, end, 1);
- putchar(last = '\n');
+ formatcodeln(st, "Nd", buf, &start, end, 1);
+ mdoc_newln(st);
return;
}
}
@@ -1130,7 +1186,6 @@ ordinary(struct state *st, const char *b
puts(".Pp");
st->haspar = 0;
- st->hasnl = 1;
last = '\n';
opstack = 0;
@@ -1153,21 +1208,32 @@ ordinary(struct state *st, const char *b
* brackets indicate that we're opening and
* closing an optional context.
*/
+
if (SECT_SYNOPSIS == st->sect &&
('[' == buf[start] ||
']' == buf[start]) &&
- dosynopsisop(buf, &start, end, &opstack))
+ dosynopsisop(st, buf,
+ &start, end, &opstack))
continue;
+
+ /*
+ * On whitespace, flush the output buffer
+ * and allow breaking to a macro line.
+ * Otherwise, buffer text and clear wantws.
+ */
+
last = buf[start++];
if (' ' == last) {
outbuf_flush(st);
putchar(' ');
+ st->wantws = 1;
} else
outbuf_addchar(st);
}
if (start < end - 1 && '<' == buf[start + 1]) {
- if (formatcode(st, buf, &start, end, 0, 0, seq)) {
+ formatcode(st, buf, &start, end, 0, seq);
+ if (OUST_MAC == st->oust) {
/*
* Let mdoc(7) handle trailing punctuation.
* XXX Some punctuation characters
@@ -1182,19 +1248,35 @@ ordinary(struct state *st, const char *b
putchar(' ');
putchar(buf[start++]);
}
- /* End the macro line. */
- putchar(last = '\n');
- st->hasnl = 1;
+
+ if (st->wantws ||
+ ' ' == buf[start] ||
+ '\n' == buf[start])
+ mdoc_newln(st);
+
/*
* Consume all whitespace
* so we don't accidentally start
* an implicit literal line.
*/
+
while (start < end && ' ' == buf[start])
start++;
+
+ /*
+ * Some text is following.
+ * Implement requested spacing.
+ */
+
+ if ( ! st->wantws && start < end &&
+ '<' != buf[start + 1]) {
+ printf(" Ns ");
+ st->wantws = 1;
+ }
}
} else if (start < end && '\n' == buf[start]) {
- outbuf_newln(st);
+ outbuf_flush(st);
+ mdoc_newln(st);
if (++start >= end)
continue;
/*
@@ -1212,7 +1294,8 @@ ordinary(struct state *st, const char *b
break;
}
}
- outbuf_newln(st);
+ outbuf_flush(st);
+ mdoc_newln(st);
}
/*
@@ -1224,6 +1307,9 @@ static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{
+ assert(OUST_NL == st->oust);
+ assert(st->wantws);
+
if (end == start)
return;
if (' ' == buf[start] || '\t' == buf[start])
@@ -1306,6 +1392,9 @@ dofile(const struct args *args, const ch
free(title);
memset(&st, 0, sizeof(struct state));
+ st.oust = OUST_NL;
+ st.wantws = 1;
+
assert(sz > 0);
/* Main loop over file contents. */
Index: Makefile.inc
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/pod2mdoc/Regress/Makefile.inc,v
retrieving revision 1.1
retrieving revision 1.2
diff -LRegress/Makefile.inc -LRegress/Makefile.inc -u -p -r1.1 -r1.2
--- Regress/Makefile.inc
+++ Regress/Makefile.inc
@@ -87,6 +87,7 @@ ${t}.diff_mdoc: ${t}.groff_mdoc ${t}.man
${DIFF} $?
${t}.diff: ${t}.mandoc_manm ${t}.mandoc_mdoc
${DIFF} $?
+${t}:${t}.diff
.endfor
# --- suffix rules ---
--- /dev/null
+++ Regress/formatcode/macroline.pod
@@ -0,0 +1,13 @@
+=head1 NAME
+
+macroline - escape sequences on macro lines
+
+=head1 DESCRIPTION
+
+B<bold>I<italic Em>normal
+
+B<bold> I<Em italic> normal
+
+B<bold>I< Em italic >normal
+
+B<bold >I<italic>
Index: invalid.pod
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/pod2mdoc/Regress/formatcode/invalid.pod,v
retrieving revision 1.1
retrieving revision 1.2
diff -LRegress/formatcode/invalid.pod -LRegress/formatcode/invalid.pod -u -p -r1.1 -r1.2
--- Regress/formatcode/invalid.pod
+++ Regress/formatcode/invalid.pod
@@ -7,3 +7,7 @@ invalid - invalid escape sequence
beforeY<invalid>after
before B<inY<val>id> after
+
+=head3 containsY<invalid>sequence
+
+plain text
--- /dev/null
+++ Regress/formatcode/code.pod
@@ -0,0 +1,17 @@
+=head1 NAME
+
+code - code escape sequences
+
+=head1 DESCRIPTION
+
+beforeC<code>after
+
+before C<code> after
+
+first secondC<code>
+
+B<bold>C<code>normal
+
+B<bold> C<code> normal
+
+B<bold >C<code>
Index: textline.pod
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/pod2mdoc/Regress/formatcode/textline.pod,v
retrieving revision 1.1
retrieving revision 1.2
diff -LRegress/formatcode/textline.pod -LRegress/formatcode/textline.pod -u -p -r1.1 -r1.2
--- Regress/formatcode/textline.pod
+++ Regress/formatcode/textline.pod
@@ -4,8 +4,10 @@ textline - escape sequences on text line
=head1 DESCRIPTION
-beforeB<bold>
+beforeB<bold>after
-before B<bold>
+beforeB< bold >after
+
+before B<bold> after
first secondI<italic>
Index: Makefile
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/pod2mdoc/Regress/formatcode/Makefile,v
retrieving revision 1.1
retrieving revision 1.2
diff -LRegress/formatcode/Makefile -LRegress/formatcode/Makefile -u -p -r1.1 -r1.2
--- Regress/formatcode/Makefile
+++ Regress/formatcode/Makefile
@@ -1,3 +1,3 @@
-TARGETS = invalid textline
+TARGETS = code invalid macroline textline
.include "../Makefile.inc"
--
To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2014-07-18 5:09 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-18 5:09 pod2mdoc: Proper translation of POD escape sequences into mdoc macros, schwarze
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).