Re: Cannot paste unicode <0221>, <0234> - <024f>

zsh-workers
 help / color / mirror / code / Atom feed

From: "Jun T." <takimoto-j@kba.biglobe.ne.jp>
To: zsh-workers@zsh.org
Subject: Re: Cannot paste unicode <0221>, <0234> - <024f>
Date: Tue, 2 May 2017 00:52:03 +0900	[thread overview]
Message-ID: <C181A2BD-02D5-409A-ABF4-C608B6767CBD@kba.biglobe.ne.jp> (raw)
In-Reply-To: <1B66A5C4-6855-4013-93F9-57857BCE0C45@kba.biglobe.ne.jp>

Here is a quick (maybe too simple) patch.

wcwidth() on MacOSX was broken for combining characters,
but Apple fixed *this* problem a few years ago,
probably in OSX 10.8 (Mavericks). So BROKEN_WCWIDTH is
NOT defined on recent macOS.

In the patch below, I added a test in configure.ac using U+0234
for both wcwidth() and iswprint() (both are broken on macOS;
wcwidth() returns -1 and iswprint() returns 0=false).

As a replacement for the broken iswprint(), I added a very (perhaps
too) simple function wc_isprint(), which returns false only for
those characters for which mk_wcwidth() returns -1, i.e.,
0 <= wc <= 0x1f and 0x7f <= wc <= 0x9f (8-bit control chars).

Another possibility is to use --enable-unicode9 if wcwidth()
and/or iswprint() are broken (--enable-unicode9 works fine
without any additional libraries). There is no iswprint-replacement
in wcwidth9.h, but implementing it would be easy if we can use the
array wcwidth9_nonprint in wcwidth9.h.
# But I must say I couldn't understand the array; for example,
# why is U+00ad not printable while U+2028 is printable?


diff --git a/Src/Zle/zle_refresh.c b/Src/Zle/zle_refresh.c
index 8391739..d0dd1ef 100644
--- a/Src/Zle/zle_refresh.c
+++ b/Src/Zle/zle_refresh.c
@@ -1278,7 +1278,7 @@ zrefresh(void)
 #ifdef __STDC_ISO_10646__
 		 !ZSH_INVALID_WCHAR_TEST(*t) &&
 #endif
-		 iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
+		 WC_ISPRINT(*t) && (width = WCWIDTH(*t)) > 0) {
 	    int ichars;
 	    if (width > rpms.sen - rpms.s) {
 		int started = 0;
@@ -1460,7 +1460,7 @@ zrefresh(void)
 	u = outputline;
 	for (; u < outputline + outll; u++) {
 #ifdef MULTIBYTE_SUPPORT
-	    if (iswprint(*u)) {
+	    if (WC_ISPRINT(*u)) {
 		int width = WCWIDTH(*u);
 		/* Handle wide characters as above */
 		if (width > rpms.sen - rpms.s) {
@@ -2468,7 +2468,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
 	if (tmpline[t0] == ZWC('\t'))
 	    vsiz = (vsiz | 7) + 2;
 #ifdef MULTIBYTE_SUPPORT
-	else if (iswprint(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
+	else if (WC_ISPRINT(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
 	    vsiz += width;
 	    if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
 		while (t0 < tmpll-1 && IS_COMBINING(tmpline[t0+1]))
@@ -2556,7 +2556,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
 	    vp->atr = all_atr_on | all_atr_off;
 	    vp++;
 #ifdef MULTIBYTE_SUPPORT
-	} else if (iswprint(tmpline[t0]) &&
+	} else if (WC_ISPRINT(tmpline[t0]) &&
 		   (width = WCWIDTH(tmpline[t0])) > 0) {
 	    int ichars;
 	    if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
diff --git a/Src/compat.c b/Src/compat.c
index a295694..ca9713b 100644
--- a/Src/compat.c
+++ b/Src/compat.c
@@ -1017,3 +1017,20 @@ isprint_ascii(int c)
 
 /**/
 #endif /* __APPLE__ && BROKEN_ISPRINT */
+
+/**/
+#if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+
+/**/
+int
+wc_isprint(wint_t ucs)
+{
+    if (ucs <= 0)
+	return 0;
+    if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
+	return 0;
+    return 1;
+}
+
+/**/
+#endif /* __APPLE__ && BROKEN_ISWPRINT */
diff --git a/Src/pattern.c b/Src/pattern.c
index 75db016..fc7c737 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -3625,7 +3625,7 @@ mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp)
 		    return 1;
 		break;
 	    case PP_PRINT:
-		if (iswprint(ch))
+		if (WC_ISPRINT(ch))
 		    return 1;
 		break;
 	    case PP_PUNCT:
diff --git a/Src/utils.c b/Src/utils.c
index ea4b34b..8aceb79 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -629,7 +629,7 @@ wcs_nicechar_sel(wchar_t c, size_t *widthp, char **swidep, int quotable)
     }
 
     s = buf;
-    if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+    if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
 	if (c == 0x7f) {
 	    if (quotable) {
 		*s++ = '\\';
@@ -734,7 +734,7 @@ wcs_nicechar(wchar_t c, size_t *widthp, char **swidep)
 /**/
 mod_export int is_wcs_nicechar(wchar_t c)
 {
-    if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+    if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
 	if (c == 0x7f || c == L'\n' || c == L'\t' || c < 0x20)
 	    return 1;
 	if (c >= 0x80) {
diff --git a/Src/ztype.h b/Src/ztype.h
index 76589b1..a8f5fe5 100644
--- a/Src/ztype.h
+++ b/Src/ztype.h
@@ -72,7 +72,11 @@
 
 #ifdef MULTIBYTE_SUPPORT
 #define WC_ZISTYPE(X,Y) wcsitype((X),(Y))
-#define WC_ISPRINT(X)	iswprint(X)
+# if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+#  define WC_ISPRINT(X)	wc_isprint(X)
+# else
+#  define WC_ISPRINT(X)	iswprint(X)
+# endif
 #else
 #define WC_ZISTYPE(X,Y)	zistype((X),(Y))
 #define WC_ISPRINT(X)	isprint(X)
diff --git a/configure.ac b/configure.ac
index 911cc45..d2f418d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2591,14 +2591,18 @@ fi])
 
 AH_TEMPLATE([BROKEN_WCWIDTH],
 [Define to 1 if the wcwidth() function is present but broken.])
+AH_TEMPLATE([BROKEN_ISWPRINT],
+[Define to 1 if the iswprint() function is present but broken.])
 AH_TEMPLATE([BROKEN_ISPRINT],
 [Define to 1 if the isprint() function is broken under UTF-8 locale.])
 if test x$zsh_cv_c_unicode_support = xyes; then
   AC_DEFINE(MULTIBYTE_SUPPORT)
 
-  dnl Test for a wcwidth() implementation that gives the wrong width for
-  dnl zero-width combining characters.
-  dnl For the test we use a combining acute accent (\u0301).
+  dnl Test for a wcwidth() implementation that gives the wrong width for either
+  dnl   zero-width combining characters, or
+  dnl   some characters in the Latin Extended-B.
+  dnl For the test we use a combining acute accent (\u0301) or
+  dnl a LATIN SMALL LETTER L WITH CURL (\u0234).
   dnl We input it as UTF-8 since that is the standard we can rely
   dnl upon most:  we can't rely on a wchar_t being stored as a
   dnl Unicode code point on all systems.
@@ -2607,9 +2611,8 @@ if test x$zsh_cv_c_unicode_support = xyes; then
   dnl - the programme compiled, linked and ran
   dnl - we successfully set a UTF-8 locale
   dnl - the locale we set plausibly converted the UTF-8 string
-  dnl   for a zero-width combining character (the only way to be
-  dnl   100% sure would be to output it and ask if it looked right)
-  dnl - the converted wide character gave a non-zero width.
+  dnl   into the correct wide character
+  dnl - but the converted wide character gave a wrong width.
   dnl locale -a is a fallback; on most systems we should find en_US.UTF-8.
   [locale_prog='char *my_locales[] = {
   "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
@@ -2625,17 +2628,19 @@ if test x$zsh_cv_c_unicode_support = xyes; then
   int main() {
     char **localep;
     char comb_acute_mb[] = { (char)0xcc, (char)0x81 };
+    char u_0234[] = { (char)0xc8, (char)0xb4 };
     wchar_t wc;
 
     for (localep = my_locales; *localep; localep++)
-      if (setlocale(LC_ALL, *localep) &&
-          mbtowc(&wc, comb_acute_mb, 2) == 2)
+      if (setlocale(LC_ALL, *localep))
 	  break;
     if (!*localep)
       return 1;
-    if (wcwidth(wc) == 0)
-      return 1;
-    return 0;
+    if (mbtowc(&wc, comb_acute_mb, 2) == 2 && wcwidth(wc) != 0)
+      return 0;
+    if (mbtowc(&wc, u_0234, 2) == 2 && wcwidth(wc) != 1)
+      return 0;
+    return 1;
   }
   "]
 
@@ -2649,6 +2654,43 @@ if test x$zsh_cv_c_unicode_support = xyes; then
     AC_DEFINE(BROKEN_WCWIDTH)
   fi
 
+  dnl Check if iswprint() is broken.
+  [locale_prog='char *my_locales[] = {
+  "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
+  locale_prog="$locale_prog"`locale -a 2>/dev/null | \
+    sed -e 's/utf8/UTF-8/' | grep UTF-8 | \
+    while read line; do echo " \"$line\","; done;`
+  locale_prog="$locale_prog 0 };
+  #include <stdlib.h>
+  #include <locale.h>
+  #include <wchar.h>
+  #include <wctype.h>
+
+  int main() {
+    char **localep;
+    char u_0234[] = { (char)0xc8, (char)0xb4 };
+    wchar_t wc;
+    for (localep = my_locales; *localep; localep++)
+      if (setlocale(LC_ALL, *localep))
+	break;
+    if (!*localep)
+      return 1;
+    if (mbtowc(&wc, u_0234, 2) == 2 && !iswprint(wc))
+      return 0;
+    return 1;
+  }
+  "]
+
+  AC_CACHE_CHECK(if the iswprint() function is broken,
+  zsh_cv_c_broken_iswprint,
+  [AC_TRY_RUN([$locale_prog],
+  zsh_cv_c_broken_iswprint=yes,
+  zsh_cv_c_broken_iswprint=no,
+  zsh_cv_c_broken_iswprint=no)])
+  if test x$zsh_cv_c_broken_iswprint = xyes; then
+    AC_DEFINE(BROKEN_ISWPRINT)
+  fi
+
   dnl Check if isprint() behaves correctly under UTF-8 locale.
   dnl On some platform (maybe only on Mac OS X), isprint() returns
   dnl true for all characters in the range from 0xa0 to 0xff if

next prev parent reply	other threads:[~2017-05-01 16:37 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20170428111102epcas3p1cc3d86dc54fdafd8cd0e613bbaeba69b@epcas3p1.samsung.com>
2017-04-28 10:55 ` Sebastian Gniazdowski
2017-04-28 11:44   ` Peter Stephenson
2017-04-28 12:11     ` Sebastian Gniazdowski
2017-04-28 13:16       ` Peter Stephenson
2017-04-28 13:54         ` Sebastian Gniazdowski
2017-04-28 14:10           ` Peter Stephenson
2017-04-28 14:41           ` Peter Stephenson
2017-04-28 15:27             ` Sebastian Gniazdowski
2017-04-28 15:43               ` Peter Stephenson
2017-04-28 16:11                 ` Sebastian Gniazdowski
2017-04-29  6:39             ` Sebastian Gniazdowski
2017-05-01  2:53               ` Jun T.
2017-05-01 15:52                 ` Jun T. [this message]
2017-05-02 13:15                   ` Peter Stephenson
2017-05-06 15:58                   ` Sebastian Gniazdowski
2017-05-06 18:11                     ` Bart Schaefer
2017-05-07  9:50                       ` Sebastian Gniazdowski
2017-05-08  2:44                         ` Daniel Shahaf
2017-05-08  4:04                           ` Gmail patch extraction (was: Cannot paste unicode <0221>, <0234> - <024f>) Sebastian Gniazdowski
2017-05-08  4:33                             ` Sebastian Gniazdowski
2017-05-08 11:07                               ` Daniel Shahaf
2017-05-06 18:39                     ` Cannot paste unicode <0221>, <0234> - <024f> Daniel Shahaf
2017-05-07 10:13                   ` Sebastian Gniazdowski
2017-05-10 11:29                   ` Jun T.
2017-05-12  7:50                     ` Sebastian Gniazdowski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=C181A2BD-02D5-409A-ABF4-C608B6767CBD@kba.biglobe.ne.jp \
    --to=takimoto-j@kba.biglobe.ne.jp \
    --cc=zsh-workers@zsh.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).