source@mandoc.bsd.lv
 help / color / mirror / Atom feed
From: schwarze@mandoc.bsd.lv
To: source@mandoc.bsd.lv
Subject: mandoc: Distinguish between escape sequences that produce no output
Date: Mon, 15 Aug 2022 13:12:32 -0500 (EST)	[thread overview]
Message-ID: <336a1d8dbd80c513@mandoc.bsd.lv> (raw)

Log Message:
-----------
Distinguish between escape sequences that produce no output
whatsoever (for example \fR) and escape sequences that produce
invisible zero-width output (for example \&).  No, I'm not joking,
groff does make that distinction, and it has consequences in some
situations, for example for vertical spacing in no-fill mode.
Heirloom and Plan 9 behaviour is subtly different, but in case of
doubt, we want to follow groff.

While this fixes the behaviour for the majority of escape sequences,
in particular for those most likely to occur in practice, it is not
perfect yet because some of the more exotic ESCAPE_IGNORE sequences
are actually of the "no output whatsoever" type but are treated
as "invisible zero-width" for now.  With the new ASCII_NBRZW mechanism
in place, switching them over one by one when the need arises will
no longer be very difficult.

Modified Files:
--------------
    mandoc:
        mandoc.h
        term.c
        term_ascii.c
    mandoc/regress/mdoc/Bd:
        blank.in
        blank.out_ascii
        blank.out_lint
        blank.out_markdown

Revision Data
-------------
Index: mandoc.h
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mandoc.h,v
retrieving revision 1.279
retrieving revision 1.280
diff -Lmandoc.h -Lmandoc.h -u -p -r1.279 -r1.280
--- mandoc.h
+++ mandoc.h
@@ -20,8 +20,9 @@
  */
 
 #define ASCII_NBRSP	 31  /* non-breaking space */
-#define	ASCII_HYPH	 30  /* breakable hyphen */
-#define	ASCII_BREAK	 29  /* breakable zero-width space */
+#define ASCII_NBRZW	 30  /* non-breaking zero-width space */
+#define ASCII_BREAK	 29  /* breakable zero-width space */
+#define ASCII_HYPH	 28  /* breakable hyphen */
 
 /*
  * Status level.  This refers to both internal status (i.e., whilst
Index: term_ascii.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/term_ascii.c,v
retrieving revision 1.66
retrieving revision 1.67
diff -Lterm_ascii.c -Lterm_ascii.c -u -p -r1.66 -r1.67
--- term_ascii.c
+++ term_ascii.c
@@ -196,7 +196,7 @@ terminal_sepline(void *arg)
 static size_t
 ascii_width(const struct termp *p, int c)
 {
-	return c != ASCII_BREAK;
+	return c != ASCII_BREAK && c != ASCII_NBRZW;
 }
 
 void
Index: term.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/term.c,v
retrieving revision 1.288
retrieving revision 1.289
diff -Lterm.c -Lterm.c -u -p -r1.288 -r1.289
--- term.c
+++ term.c
@@ -208,7 +208,6 @@ term_flushln(struct termp *p)
 			return;
 
 		endline(p);
-		p->viscol = 0;
 
 		/*
 		 * Normally, start the next line at the same indentation
@@ -314,6 +313,8 @@ term_fill(struct termp *p, size_t *nbr, 
 				vis = term_tab_next(vis);
 				vis -= p->tcol->taboff;
 				break;
+			case ASCII_NBRZW:  /* Non-breakable zero-width. */
+				break;
 			case ASCII_NBRSP:  /* Non-breakable space. */
 				p->tcol->buf[ic] = ' ';
 				/* FALLTHROUGH */
@@ -365,6 +366,7 @@ term_field(struct termp *p, size_t vbl, 
 		switch (p->tcol->buf[ic]) {
 		case '\n':
 		case ASCII_BREAK:
+		case ASCII_NBRZW:
 			continue;
 		case '\t':
 		case ' ':
@@ -571,18 +573,23 @@ term_word(struct termp *p, const char *w
 			break;
 		case ESCAPE_NUMBERED:
 			uc = mchars_num2char(seq, sz);
-			if (uc < 0)
-				continue;
-			break;
+			if (uc >= 0)
+				break;
+			bufferc(p, ASCII_NBRZW);
+			continue;
 		case ESCAPE_SPECIAL:
 			if (p->enc == TERMENC_ASCII) {
 				cp = mchars_spec2str(seq, sz, &ssz);
 				if (cp != NULL)
 					encode(p, cp, ssz);
+				else
+					bufferc(p, ASCII_NBRZW);
 			} else {
 				uc = mchars_spec2cp(seq, sz);
 				if (uc > 0)
 					encode1(p, uc);
+				else
+					bufferc(p, ASCII_NBRZW);
 			}
 			continue;
 		case ESCAPE_UNDEF:
@@ -744,6 +751,9 @@ term_word(struct termp *p, const char *w
 			if (p->col > p->tcol->lastcol)
 				p->col = p->tcol->lastcol;
 			continue;
+		case ESCAPE_IGNORE:
+			bufferc(p, ASCII_NBRZW);
+			continue;
 		default:
 			continue;
 		}
@@ -935,8 +945,8 @@ term_strlen(const struct termp *p, const
 	int		 ssz, skip, uc;
 	const char	*seq, *rhs;
 	enum mandoc_esc	 esc;
-	static const char rej[] = { '\\', ASCII_NBRSP, ASCII_HYPH,
-			ASCII_BREAK, '\0' };
+	static const char rej[] = { '\\', ASCII_NBRSP, ASCII_NBRZW,
+		ASCII_BREAK, ASCII_HYPH, '\0' };
 
 	/*
 	 * Account for escaped sequences within string length
Index: blank.out_markdown
===================================================================
RCS file: /home/cvs/mandoc/mandoc/regress/mdoc/Bd/blank.out_markdown,v
retrieving revision 1.2
retrieving revision 1.3
diff -Lregress/mdoc/Bd/blank.out_markdown -Lregress/mdoc/Bd/blank.out_markdown -u -p -r1.2 -r1.3
--- regress/mdoc/Bd/blank.out_markdown
+++ regress/mdoc/Bd/blank.out_markdown
@@ -14,6 +14,14 @@ BD-BLANK(1) - General Commands Manual
 	
 	line containing space tab space:
 	
+	line containing a zero-width space:
+	
+	line containing an invalid numbered character escape:
+	<?>
+	line containing an invalid named character escape:
+	<?>
+	line containing a font escape:
+	
 	line starting with a blank character:
 	 x
 	line starting with two blank characters:
@@ -30,4 +38,4 @@ An empty one-line literal display:
 
 end of test document
 
-OpenBSD - July 4, 2017
+OpenBSD - August 15, 2022
Index: blank.in
===================================================================
RCS file: /home/cvs/mandoc/mandoc/regress/mdoc/Bd/blank.in,v
retrieving revision 1.2
retrieving revision 1.3
diff -Lregress/mdoc/Bd/blank.in -Lregress/mdoc/Bd/blank.in -u -p -r1.2 -r1.3
--- regress/mdoc/Bd/blank.in
+++ regress/mdoc/Bd/blank.in
@@ -1,4 +1,4 @@
-.\" $OpenBSD: blank.in,v 1.6 2017/07/04 14:53:24 schwarze Exp $
+.\" $OpenBSD: blank.in,v 1.7 2022/08/15 17:59:00 schwarze Exp $
 .Dd $Mdocdate$
 .Dt BD-BLANK 1
 .Os
@@ -15,6 +15,14 @@ line containing two blank characters:
   
 line containing space tab space:
  	 
+line containing a zero-width space:
+\&
+line containing an invalid numbered character escape:
+\N'257'
+line containing an invalid named character escape:
+\[foobar]
+line containing a font escape:
+\fR
 line starting with a blank character:
  x
 line starting with two blank characters:
Index: blank.out_lint
===================================================================
RCS file: /home/cvs/mandoc/mandoc/regress/mdoc/Bd/blank.out_lint,v
retrieving revision 1.6
retrieving revision 1.7
diff -Lregress/mdoc/Bd/blank.out_lint -Lregress/mdoc/Bd/blank.out_lint -u -p -r1.6 -r1.7
--- regress/mdoc/Bd/blank.out_lint
+++ regress/mdoc/Bd/blank.out_lint
@@ -1,8 +1,9 @@
 mandoc: blank.in:13:1: STYLE: whitespace at end of input line
 mandoc: blank.in:15:1: STYLE: whitespace at end of input line
 mandoc: blank.in:17:1: STYLE: whitespace at end of input line
-mandoc: blank.in:22:36: STYLE: whitespace at end of input line
-mandoc: blank.in:23:37: STYLE: whitespace at end of input line
-mandoc: blank.in:24:32: STYLE: whitespace at end of input line
-mandoc: blank.in:31:8: STYLE: whitespace at end of input line
-mandoc: blank.in:31:2: WARNING: empty block: Dl
+mandoc: blank.in:23:1: ERROR: unknown special character: \[foobar]
+mandoc: blank.in:30:36: STYLE: whitespace at end of input line
+mandoc: blank.in:31:37: STYLE: whitespace at end of input line
+mandoc: blank.in:32:32: STYLE: whitespace at end of input line
+mandoc: blank.in:39:8: STYLE: whitespace at end of input line
+mandoc: blank.in:39:2: WARNING: empty block: Dl
Index: blank.out_ascii
===================================================================
RCS file: /home/cvs/mandoc/mandoc/regress/mdoc/Bd/blank.out_ascii,v
retrieving revision 1.2
retrieving revision 1.3
diff -Lregress/mdoc/Bd/blank.out_ascii -Lregress/mdoc/Bd/blank.out_ascii -u -p -r1.2 -r1.3
--- regress/mdoc/Bd/blank.out_ascii
+++ regress/mdoc/Bd/blank.out_ascii
@@ -12,6 +12,13 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
 
      line containing space tab space:
 
+     line containing a zero-width space:
+
+     line containing an invalid numbered character escape:
+
+     line containing an invalid named character escape:
+
+     line containing a font escape:
      line starting with a blank character:
       x
      line starting with two blank characters:
@@ -26,4 +33,4 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
      An empty one-line literal display:
      end of test document
 
-OpenBSD                          July 4, 2017                          OpenBSD
+OpenBSD                         August 15, 2022                        OpenBSD
--
 To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv


                 reply	other threads:[~2022-08-15 18:12 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=336a1d8dbd80c513@mandoc.bsd.lv \
    --to=schwarze@mandoc.bsd.lv \
    --cc=source@mandoc.bsd.lv \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).