zsh-workers
 help / color / mirror / code / Atom feed
From: "Jun T." <takimoto-j@kba.biglobe.ne.jp>
To: zsh-workers@zsh.org
Subject: Re: Cannot paste unicode <0221>, <0234> - <024f>
Date: Tue, 2 May 2017 00:52:03 +0900	[thread overview]
Message-ID: <C181A2BD-02D5-409A-ABF4-C608B6767CBD@kba.biglobe.ne.jp> (raw)
In-Reply-To: <1B66A5C4-6855-4013-93F9-57857BCE0C45@kba.biglobe.ne.jp>

Here is a quick (maybe too simple) patch.

wcwidth() on MacOSX was broken for combining characters,
but Apple fixed *this* problem a few years ago,
probably in OSX 10.8 (Mavericks). So BROKEN_WCWIDTH is
NOT defined on recent macOS.

In the patch below, I added a test in configure.ac using U+0234
for both wcwidth() and iswprint() (both are broken on macOS;
wcwidth() returns -1 and iswprint() returns 0=false).

As a replacement for the broken iswprint(), I added a very (or
too) simple function wc_isprint(), which returns false only for
those characters for which mk_wcwidth() returns -1, i.e.,
0 <= wc <= 0x1f and 0x7f <= wc <= 0x9f (8bit control chars).

Another possibility is to use --enable-unicode9 if wcwidth()
and/or iswprint() are broken (--enable-unicode9 works fine
without any additional libraries). There is no iswprint-replacement
in wcwidth9.h, but implementing it would be easy if we can use the
array wcwidth9_nonprint in wcwidth9.h.
# But I must say I couldn't understand the array; for example,
# why is U+00ad not printable while U+2028 is printable?


diff --git a/Src/Zle/zle_refresh.c b/Src/Zle/zle_refresh.c
index 8391739..d0dd1ef 100644
--- a/Src/Zle/zle_refresh.c
+++ b/Src/Zle/zle_refresh.c
@@ -1278,7 +1278,7 @@ zrefresh(void)
 #ifdef __STDC_ISO_10646__
 		 !ZSH_INVALID_WCHAR_TEST(*t) &&
 #endif
-		 iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
+		 WC_ISPRINT(*t) && (width = WCWIDTH(*t)) > 0) {
 	    int ichars;
 	    if (width > rpms.sen - rpms.s) {
 		int started = 0;
@@ -1460,7 +1460,7 @@ zrefresh(void)
 	u = outputline;
 	for (; u < outputline + outll; u++) {
 #ifdef MULTIBYTE_SUPPORT
-	    if (iswprint(*u)) {
+	    if (WC_ISPRINT(*u)) {
 		int width = WCWIDTH(*u);
 		/* Handle wide characters as above */
 		if (width > rpms.sen - rpms.s) {
@@ -2468,7 +2468,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
 	if (tmpline[t0] == ZWC('\t'))
 	    vsiz = (vsiz | 7) + 2;
 #ifdef MULTIBYTE_SUPPORT
-	else if (iswprint(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
+	else if (WC_ISPRINT(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
 	    vsiz += width;
 	    if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
 		while (t0 < tmpll-1 && IS_COMBINING(tmpline[t0+1]))
@@ -2556,7 +2556,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
 	    vp->atr = all_atr_on | all_atr_off;
 	    vp++;
 #ifdef MULTIBYTE_SUPPORT
-	} else if (iswprint(tmpline[t0]) &&
+	} else if (WC_ISPRINT(tmpline[t0]) &&
 		   (width = WCWIDTH(tmpline[t0])) > 0) {
 	    int ichars;
 	    if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
diff --git a/Src/compat.c b/Src/compat.c
index a295694..ca9713b 100644
--- a/Src/compat.c
+++ b/Src/compat.c
@@ -1017,3 +1017,20 @@ isprint_ascii(int c)
 
 /**/
 #endif /* __APPLE__ && BROKEN_ISPRINT */
+
+/**/
+#if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+
+/**/
+int
+wc_isprint(wint_t ucs)
+{
+    if (ucs <= 0)
+	return 0;
+    if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
+	return 0;
+    return 1;
+}
+
+/**/
+#endif /* __APPLE__ && BROKEN_ISWPRINT */
diff --git a/Src/pattern.c b/Src/pattern.c
index 75db016..fc7c737 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -3625,7 +3625,7 @@ mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp)
 		    return 1;
 		break;
 	    case PP_PRINT:
-		if (iswprint(ch))
+		if (WC_ISPRINT(ch))
 		    return 1;
 		break;
 	    case PP_PUNCT:
diff --git a/Src/utils.c b/Src/utils.c
index ea4b34b..8aceb79 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -629,7 +629,7 @@ wcs_nicechar_sel(wchar_t c, size_t *widthp, char **swidep, int quotable)
     }
 
     s = buf;
-    if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+    if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
 	if (c == 0x7f) {
 	    if (quotable) {
 		*s++ = '\\';
@@ -734,7 +734,7 @@ wcs_nicechar(wchar_t c, size_t *widthp, char **swidep)
 /**/
 mod_export int is_wcs_nicechar(wchar_t c)
 {
-    if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+    if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
 	if (c == 0x7f || c == L'\n' || c == L'\t' || c < 0x20)
 	    return 1;
 	if (c >= 0x80) {
diff --git a/Src/ztype.h b/Src/ztype.h
index 76589b1..a8f5fe5 100644
--- a/Src/ztype.h
+++ b/Src/ztype.h
@@ -72,7 +72,11 @@
 
 #ifdef MULTIBYTE_SUPPORT
 #define WC_ZISTYPE(X,Y) wcsitype((X),(Y))
-#define WC_ISPRINT(X)	iswprint(X)
+# if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+#  define WC_ISPRINT(X)	wc_isprint(X)
+# else
+#  define WC_ISPRINT(X)	iswprint(X)
+# endif
 #else
 #define WC_ZISTYPE(X,Y)	zistype((X),(Y))
 #define WC_ISPRINT(X)	isprint(X)
diff --git a/configure.ac b/configure.ac
index 911cc45..d2f418d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2591,14 +2591,18 @@ fi])
 
 AH_TEMPLATE([BROKEN_WCWIDTH],
 [Define to 1 if the wcwidth() function is present but broken.])
+AH_TEMPLATE([BROKEN_ISWPRINT],
+[Define to 1 if the iswprint() function is present but broken.])
 AH_TEMPLATE([BROKEN_ISPRINT],
 [Define to 1 if the isprint() function is broken under UTF-8 locale.])
 if test x$zsh_cv_c_unicode_support = xyes; then
   AC_DEFINE(MULTIBYTE_SUPPORT)
 
-  dnl Test for a wcwidth() implementation that gives the wrong width for
-  dnl zero-width combining characters.
-  dnl For the test we use a combining acute accent (\u0301).
+  dnl Test for a wcwidth() implementation that gives the wrong width for either
+  dnl   zero-width combining characters, or
+  dnl   some characters in the Latin Extended-B.
+  dnl For the test we use a combining acute accent (\u0301) or
+  dnl a LATIN SMALL LETTER L WITH CURL (\u0234).
   dnl We input it as UTF-8 since that is the standard we can rely
   dnl upon most:  we can't rely on a wchar_t being stored as a
   dnl Unicode code point on all systems.
@@ -2607,9 +2611,8 @@ if test x$zsh_cv_c_unicode_support = xyes; then
   dnl - the programme compiled, linked and ran
   dnl - we successfully set a UTF-8 locale
   dnl - the locale we set plausibly converted the UTF-8 string
-  dnl   for a zero-width combining character (the only way to be
-  dnl   100% sure would be to output it and ask if it looked right)
-  dnl - the converted wide character gave a non-zero width.
+  dnl   into the correct wide character
+  dnl - but the converted wide character gave a wrong width.
   dnl locale -a is a fallback; on most systems we should find en_US.UTF-8.
   [locale_prog='char *my_locales[] = {
   "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
@@ -2625,17 +2628,19 @@ if test x$zsh_cv_c_unicode_support = xyes; then
   int main() {
     char **localep;
     char comb_acute_mb[] = { (char)0xcc, (char)0x81 };
+    char u_0234[] = { (char)0xc8, (char)0xb4 };
     wchar_t wc;
 
     for (localep = my_locales; *localep; localep++)
-      if (setlocale(LC_ALL, *localep) &&
-          mbtowc(&wc, comb_acute_mb, 2) == 2)
+      if (setlocale(LC_ALL, *localep))
 	  break;
     if (!*localep)
       return 1;
-    if (wcwidth(wc) == 0)
-      return 1;
-    return 0;
+    if (mbtowc(&wc, comb_acute_mb, 2) == 2 && wcwidth(wc) != 0)
+      return 0;
+    if (mbtowc(&wc, u_0234, 2) == 2 && wcwidth(wc) != 1)
+      return 0;
+    return 1;
   }
   "]
 
@@ -2649,6 +2654,43 @@ if test x$zsh_cv_c_unicode_support = xyes; then
     AC_DEFINE(BROKEN_WCWIDTH)
   fi
 
+  dnl Check if iswprint() is broken.
+  [locale_prog='char *my_locales[] = {
+  "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
+  locale_prog="$locale_prog"`locale -a 2>/dev/null | \
+    sed -e 's/utf8/UTF-8/' | grep UTF-8 | \
+    while read line; do echo " \"$line\","; done;`
+  locale_prog="$locale_prog 0 };
+  #include <stdlib.h>
+  #include <locale.h>
+  #include <wchar.h>
+  #include <wctype.h>
+
+  int main() {
+    char **localep;
+    char u_0234[] = { (char)0xc8, (char)0xb4 };
+    wchar_t wc;
+    for (localep = my_locales; *localep; localep++)
+      if (setlocale(LC_ALL, *localep))
+	break;
+    if (!*localep)
+      return 1;
+    if (mbtowc(&wc, u_0234, 2) == 2 && !iswprint(wc))
+      return 0;
+    return 1;
+  }
+  "]
+
+  AC_CACHE_CHECK(if the iswprint() function is broken,
+  zsh_cv_c_broken_iswprint,
+  [AC_TRY_RUN([$locale_prog],
+  zsh_cv_c_broken_iswprint=yes,
+  zsh_cv_c_broken_iswprint=no,
+  zsh_cv_c_broken_iswprint=no)])
+  if test x$zsh_cv_c_broken_iswprint = xyes; then
+    AC_DEFINE(BROKEN_ISWPRINT)
+  fi
+
   dnl Check if isprint() behaves correctly under UTF-8 locale.
   dnl On some platform (maybe only on Mac OS X), isprint() returns
   dnl true for all characters in the range from 0xa0 to 0xff if




  reply	other threads:[~2017-05-01 16:37 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20170428111102epcas3p1cc3d86dc54fdafd8cd0e613bbaeba69b@epcas3p1.samsung.com>
2017-04-28 10:55 ` Sebastian Gniazdowski
2017-04-28 11:44   ` Peter Stephenson
2017-04-28 12:11     ` Sebastian Gniazdowski
2017-04-28 13:16       ` Peter Stephenson
2017-04-28 13:54         ` Sebastian Gniazdowski
2017-04-28 14:10           ` Peter Stephenson
2017-04-28 14:41           ` Peter Stephenson
2017-04-28 15:27             ` Sebastian Gniazdowski
2017-04-28 15:43               ` Peter Stephenson
2017-04-28 16:11                 ` Sebastian Gniazdowski
2017-04-29  6:39             ` Sebastian Gniazdowski
2017-05-01  2:53               ` Jun T.
2017-05-01 15:52                 ` Jun T. [this message]
2017-05-02 13:15                   ` Peter Stephenson
2017-05-06 15:58                   ` Sebastian Gniazdowski
2017-05-06 18:11                     ` Bart Schaefer
2017-05-07  9:50                       ` Sebastian Gniazdowski
2017-05-08  2:44                         ` Daniel Shahaf
2017-05-08  4:04                           ` Gmail patch extraction (was: Cannot paste unicode <0221>, <0234> - <024f>) Sebastian Gniazdowski
2017-05-08  4:33                             ` Sebastian Gniazdowski
2017-05-08 11:07                               ` Daniel Shahaf
2017-05-06 18:39                     ` Cannot paste unicode <0221>, <0234> - <024f> Daniel Shahaf
2017-05-07 10:13                   ` Sebastian Gniazdowski
2017-05-10 11:29                   ` Jun T.
2017-05-12  7:50                     ` Sebastian Gniazdowski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=C181A2BD-02D5-409A-ABF4-C608B6767CBD@kba.biglobe.ne.jp \
    --to=takimoto-j@kba.biglobe.ne.jp \
    --cc=zsh-workers@zsh.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).