source@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mdocml: Locale support.
@ 2011-05-17 22:32 kristaps
  0 siblings, 0 replies; only message in thread
From: kristaps @ 2011-05-17 22:32 UTC (permalink / raw)
  To: source

Log Message:
-----------
Locale support.  I'm checking this in to clean up fall-out in-tree, but
it looks pretty good.  Basically, the -Tlocale option propagates into
term_ascii.c, where we set locale-specific console call-backs IFF (1)
setlocale() works; (2) locale support is compiled in (see Makefile for
-DUSE_WCHAR); (3) the internal structure of wchar_t maps directly to
Unicode codepoints as defined by __STDC_ISO_10646__; and (4) the console
supports multi-byte characters.

To date, this configuration only supports GNU/Linux.  OpenBSD doesn't
export __STDC_ISO_10646__ although I'm told by stsp@openbsd.org that it
should (it has the correct map).  Apparently FreeBSD is the same way.
NetBSD?  Don't know.  Apple also supports this, but doesn't define the
macro.  Special-casing!

Benchmark: -Tlocale incurs an overhead factor of less than 0.2 when run
through several thousand manuals with UTF-8 output enabled.  Native
mode (whether directly -Tascii or through no locale or whatever) is
UNCHANGED: the function callbacks are the same as before.

Note.  If the underlying system does NOT support STDC_ISO_10646, there
is a "slow" version possible with iconv or other means of flipping from
a Unicode codepoint to a wchar_t.

Modified Files:
--------------
    mdocml:
        Makefile
        mandoc.1
        term.c
        term_ascii.c

Revision Data
-------------
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.193
retrieving revision 1.194
diff -Lterm.c -Lterm.c -u -p -r1.193 -r1.194
--- term.c
+++ term.c
@@ -36,6 +36,7 @@
 static	void		 adjbuf(struct termp *p, int);
 static	void		 bufferc(struct termp *, char);
 static	void		 encode(struct termp *, const char *, size_t);
+static	void		 encode1(struct termp *, int);
 
 void
 term_free(struct termp *p)
@@ -403,7 +404,7 @@ term_word(struct termp *p, const char *w
 {
 	const char	*seq, *cp;
 	char		 c;
-	int		 sz;
+	int		 sz, uc;
 	size_t		 ssz;
 	enum mandoc_esc	 esc;
 
@@ -440,7 +441,13 @@ term_word(struct termp *p, const char *w
 
 		switch (esc) {
 		case (ESCAPE_UNICODE):
-			encode(p, "?", 1);
+			if (TERMENC_ASCII == p->enc) {
+				encode1(p, '?');
+				break;
+			}
+			uc = mchars_num2uc(seq + 1, sz - 1);
+			if ('\0' != uc)
+				encode1(p, uc);
 			break;
 		case (ESCAPE_NUMBERED):
 			if ('\0' != (c = mchars_num2char(seq, sz)))
@@ -503,6 +510,33 @@ bufferc(struct termp *p, char c)
 	p->buf[p->col++] = c;
 }
 
+/*
+ * See encode().
+ * Do this for a single (probably unicode) value.
+ * Does not check for non-decorated glyphs.
+ */
+static void
+encode1(struct termp *p, int c)
+{
+	enum termfont	  f;
+
+	if (p->col + 4 >= p->maxcols)
+		adjbuf(p, p->col + 4);
+
+	f = term_fonttop(p);
+
+	if (TERMFONT_NONE == f) {
+		p->buf[p->col++] = c;
+		return;
+	} else if (TERMFONT_UNDER == f) {
+		p->buf[p->col++] = '_';
+	} else
+		p->buf[p->col++] = c;
+
+	p->buf[p->col++] = 8;
+	p->buf[p->col++] = c;
+}
+
 static void
 encode(struct termp *p, const char *word, size_t sz)
 {
@@ -584,11 +618,16 @@ term_strlen(const struct termp *p, const
 			case (ESCAPE_ERROR):
 				return(sz);
 			case (ESCAPE_UNICODE):
-				c = '?';
-				/* FALLTHROUGH */
-			case (ESCAPE_NUMBERED):
+				if (TERMENC_ASCII != p->enc) {
+					sz += (*p->width)(p, '?');
+					break;
+				}
+				c = mchars_num2uc(seq + 1, ssz - 1);
 				if ('\0' != c)
-					c = mchars_num2char(seq, ssz);
+					sz += (*p->width)(p, c);
+				break;
+			case (ESCAPE_NUMBERED):
+				c = mchars_num2char(seq, ssz);
 				if ('\0' != c)
 					sz += (*p->width)(p, c);
 				break;
Index: mandoc.1
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.1,v
retrieving revision 1.86
retrieving revision 1.87
diff -Lmandoc.1 -Lmandoc.1 -u -p -r1.86 -r1.87
--- mandoc.1
+++ mandoc.1
@@ -158,6 +158,12 @@ utility accepts the following
 .Fl T
 arguments, which correspond to output modes:
 .Bl -tag -width Ds
+.It Fl T Ns Cm locale
+This option encodes output characters using the current
+.Xr locale 1
+configuration.
+See
+.Sx Locale Output .
 .It Fl T Ns Cm ascii
 Produce 7-bit ASCII output.
 This is the default.
@@ -189,6 +195,16 @@ See
 .Pp
 If multiple input files are specified, these will be processed by the
 corresponding filter in-order.
+.Ss Locale Output
+Locale-dependent output encoding is triggered with
+.Fl T Ns Cm locale .
+This option is not available on all systems: systems without locale
+support, or those whose internal representation is not natively UCS-4,
+will fall back to
+.Fl T Ns Cm ascii .
+See
+.Sx ASCII Output
+for font style specification and available command-line arguments.
 .Ss ASCII Output
 Output produced by
 .Fl T Ns Cm ascii ,
@@ -209,6 +225,9 @@ Emboldened characters are rendered as
 The special characters documented in
 .Xr mandoc_char 7
 are rendered best-effort in an ASCII equivalent.
+If no equivalent is found,
+.Sq \&?
+is used instead.
 .Pp
 Output width is limited to 78 visible columns unless literal input lines
 exceed this limit.
@@ -460,7 +479,7 @@ Each input and output format is separate
 .Ss ASCII Compatibility
 .Bl -bullet -compact
 .It
-Unicode codepoints specified with
+Unrenderable Unicode codepoints specified with
 .Sq \e[uNNNN]
 escapes are printed as
 .Sq \&?
Index: term_ascii.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term_ascii.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -Lterm_ascii.c -Lterm_ascii.c -u -p -r1.14 -r1.15
--- term_ascii.c
+++ term_ascii.c
@@ -21,16 +21,26 @@
 #include <sys/types.h>
 
 #include <assert.h>
+#ifdef USE_WCHAR
+# include <locale.h>
+#endif
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
+#ifdef USE_WCHAR
+# include <wchar.h>
+#endif
 
 #include "mandoc.h"
 #include "out.h"
 #include "term.h"
 #include "main.h"
 
+#if ! defined(__STDC_ISO_10646__)
+# undef USE_WCHAR
+#endif
+
 static	struct termp	 *ascii_init(enum termenc, char *);
 static	double		  ascii_hspan(const struct termp *,
 				const struct roffsu *);
@@ -41,6 +51,13 @@ static	void		  ascii_end(struct termp *)
 static	void		  ascii_endline(struct termp *);
 static	void		  ascii_letter(struct termp *, int);
 
+#ifdef	USE_WCHAR
+static	void		  locale_advance(struct termp *, size_t);
+static	void		  locale_endline(struct termp *);
+static	void		  locale_letter(struct termp *, int);
+static	size_t		  locale_width(const struct termp *, int);
+#endif
+
 static struct termp *
 ascii_init(enum termenc enc, char *outopts)
 {
@@ -54,15 +71,28 @@ ascii_init(enum termenc enc, char *outop
 	p->tabwidth = 5;
 	p->defrmargin = 78;
 
-	p->advance = ascii_advance;
 	p->begin = ascii_begin;
 	p->end = ascii_end;
-	p->endline = ascii_endline;
 	p->hspan = ascii_hspan;
-	p->letter = ascii_letter;
 	p->type = TERMTYPE_CHAR;
+
+	p->enc = TERMENC_ASCII;
+	p->advance = ascii_advance;
+	p->endline = ascii_endline;
+	p->letter = ascii_letter;
 	p->width = ascii_width;
 
+#if defined (USE_WCHAR)
+	if (TERMENC_LOCALE == enc)
+		if (setlocale(LC_ALL, "") && MB_CUR_MAX > 1) {
+			p->enc = enc;
+			p->advance = locale_advance;
+			p->endline = locale_endline;
+			p->letter = locale_letter;
+			p->width = locale_width;
+		}
+#endif
+
 	toks[0] = "width";
 	toks[1] = NULL;
 
@@ -104,7 +134,6 @@ ascii_width(const struct termp *p, int c
 	return(1);
 }
 
-
 void
 ascii_free(void *arg)
 {
@@ -112,17 +141,14 @@ ascii_free(void *arg)
 	term_free((struct termp *)arg);
 }
 
-
 /* ARGSUSED */
 static void
 ascii_letter(struct termp *p, int c)
 {
 	
-	/* LINTED */
 	putchar(c);
 }
 
-
 static void
 ascii_begin(struct termp *p)
 {
@@ -130,7 +156,6 @@ ascii_begin(struct termp *p)
 	(*p->headf)(p, p->argf);
 }
 
-
 static void
 ascii_end(struct termp *p)
 {
@@ -138,7 +163,6 @@ ascii_end(struct termp *p)
 	(*p->footf)(p, p->argf);
 }
 
-
 /* ARGSUSED */
 static void
 ascii_endline(struct termp *p)
@@ -147,19 +171,16 @@ ascii_endline(struct termp *p)
 	putchar('\n');
 }
 
-
 /* ARGSUSED */
 static void
 ascii_advance(struct termp *p, size_t len)
 {
 	size_t	 	i;
 
-	/* Just print whitespace on the terminal. */
 	for (i = 0; i < len; i++)
 		putchar(' ');
 }
 
-
 /* ARGSUSED */
 static double
 ascii_hspan(const struct termp *p, const struct roffsu *su)
@@ -198,3 +219,39 @@ ascii_hspan(const struct termp *p, const
 	return(r);
 }
 
+#ifdef USE_WCHAR
+/* ARGSUSED */
+static size_t
+locale_width(const struct termp *p, int c)
+{
+	int		rc;
+
+	return((rc = wcwidth(c)) < 0 ? 0 : rc);
+}
+
+/* ARGSUSED */
+static void
+locale_advance(struct termp *p, size_t len)
+{
+	size_t	 	i;
+
+	for (i = 0; i < len; i++)
+		putwchar(L' ');
+}
+
+/* ARGSUSED */
+static void
+locale_endline(struct termp *p)
+{
+
+	putwchar(L'\n');
+}
+
+/* ARGSUSED */
+static void
+locale_letter(struct termp *p, int c)
+{
+	
+	putwchar(c);
+}
+#endif
Index: Makefile
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/Makefile,v
retrieving revision 1.338
retrieving revision 1.339
diff -LMakefile -LMakefile -u -p -r1.338 -r1.339
--- Makefile
+++ Makefile
@@ -13,7 +13,10 @@
 
 VERSION		 = 1.11.2
 VDATE		 = 12 May 2011
-CFLAGS		+= -g -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
+# If your system doesn't support multi-byte functions (specifically
+# setlocale(), wcwidth(), putwchar()), then remove -DUSE_WCHAR.  You'll
+# still be able to use -Tlocale, but it becomes a synonym for -Tascii.
+CFLAGS		+= -g -DUSE_WCHAR -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
 CFLAGS     	+= -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings
 PREFIX		 = /usr/local
 BINDIR		 = $(PREFIX)/bin
--
 To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2011-05-17 22:32 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-17 22:32 mdocml: Locale support kristaps

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).