From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 27045 invoked by alias); 1 May 2017 16:37:14 -0000 Mailing-List: contact zsh-workers-help@zsh.org; run by ezmlm Precedence: bulk X-No-Archive: yes List-Id: Zsh Workers List List-Post: List-Help: X-Seq: 41037 Received: (qmail 22244 invoked from network); 1 May 2017 16:37:13 -0000 X-Qmail-Scanner-Diagnostics: from rcpt-mqugw.biglobe.ne.jp by f.primenet.com.au (envelope-from , uid 7791) with qmail-scanner-2.11 (clamdscan: 0.99.2/21882. spamassassin: 3.4.1. Clear:RC:0(133.208.100.1):SA:0(-0.7/5.0):. Processed in 6.519648 secs); 01 May 2017 16:37:13 -0000 X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on f.primenet.com.au X-Spam-Level: X-Spam-Status: No, score=-0.7 required=5.0 tests=RCVD_IN_DNSWL_LOW, RP_MATCHES_RCVD,SPF_PASS autolearn=unavailable autolearn_force=no version=3.4.1 X-Envelope-From: takimoto-j@kba.biglobe.ne.jp X-Qmail-Scanner-Mime-Attachments: | X-Qmail-Scanner-Zip-Files: | Received-SPF: pass (ns1.primenet.com.au: SPF record at spf01.biglobe.ne.jp designates 133.208.100.1 as permitted sender) X-Biglobe-Sender: Content-Type: text/plain; charset=us-ascii Mime-Version: 1.0 (Mac OS X Mail 7.3 \(1878.6\)) Subject: Re: Cannot paste unicode <0221>, <0234> - <024f> From: "Jun T." In-Reply-To: <1B66A5C4-6855-4013-93F9-57857BCE0C45@kba.biglobe.ne.jp> Date: Tue, 2 May 2017 00:52:03 +0900 Content-Transfer-Encoding: quoted-printable Message-Id: References: <20170428124439.73447db2@pwslap01u.europe.root.pri> <20170428141650.7ed174d6@pwslap01u.europe.root.pri> <20170428154135.2e2b5626@pwslap01u.europe.root.pri> <1B66A5C4-6855-4013-93F9-57857BCE0C45@kba.biglobe.ne.jp> To: zsh-workers@zsh.org X-Mailer: Apple Mail (2.1878.6) X-Biglobe-Spnum: 52034 Here is a quick (maybe too simple) patch. wcwidth() on MacOSX was broken for combining characters, but Apple fixed *this* problem a few years ago, probably in OS X 10.8 (Mountain Lion) or 10.9 (Mavericks). So BROKEN_WCWIDTH is NOT defined on recent macOS. 
In the patch below, I added a test in configure.ac using U+0234 for both wcwidth() and iswprint() (both are broken on macOS; wcwidth() returns -1 and iswprint() returns 0=3Dfalse). As a replacement for the broken iswprint(), I added a very (or too) simple function wc_isprint(), which returns false only for those characters for which mk_wcwidth() returns -1, i.e., 0 <=3D wc <=3D 0x1f and 0x7f <=3D wc <=3D 0x9f (8bit control chars). Another possibility is to use --enable-unicode9 if wcwidth() and/or iswprint() are broken (--enable-unicode9 works fine without any additional libraries). There is no iswprint-replacement in wcwidth.h, but implementing it would be easy if we can use the array wcwidth9_nonprint in wcwidth9.h. # But I must say I couldn't understand the array; for example, # why U+00ad is not printable while U+2028 is printable? diff --git a/Src/Zle/zle_refresh.c b/Src/Zle/zle_refresh.c index 8391739..d0dd1ef 100644 --- a/Src/Zle/zle_refresh.c +++ b/Src/Zle/zle_refresh.c @@ -1278,7 +1278,7 @@ zrefresh(void) #ifdef __STDC_ISO_10646__ !ZSH_INVALID_WCHAR_TEST(*t) && #endif - iswprint(*t) && (width =3D WCWIDTH(*t)) > 0) { + WC_ISPRINT(*t) && (width =3D WCWIDTH(*t)) > 0) { int ichars; if (width > rpms.sen - rpms.s) { int started =3D 0; @@ -1460,7 +1460,7 @@ zrefresh(void) u =3D outputline; for (; u < outputline + outll; u++) { #ifdef MULTIBYTE_SUPPORT - if (iswprint(*u)) { + if (WC_ISPRINT(*u)) { int width =3D WCWIDTH(*u); /* Handle wide characters as above */ if (width > rpms.sen - rpms.s) { @@ -2468,7 +2468,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int = tmpcs) if (tmpline[t0] =3D=3D ZWC('\t')) vsiz =3D (vsiz | 7) + 2; #ifdef MULTIBYTE_SUPPORT - else if (iswprint(tmpline[t0]) && ((width =3D = WCWIDTH(tmpline[t0])) > 0)) { + else if (WC_ISPRINT(tmpline[t0]) && ((width =3D = WCWIDTH(tmpline[t0])) > 0)) { vsiz +=3D width; if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) { while (t0 < tmpll-1 && IS_COMBINING(tmpline[t0+1])) @@ -2556,7 +2556,7 @@ 
singlerefresh(ZLE_STRING_T tmpline, int tmpll, int = tmpcs) vp->atr =3D all_atr_on | all_atr_off; vp++; #ifdef MULTIBYTE_SUPPORT - } else if (iswprint(tmpline[t0]) && + } else if (WC_ISPRINT(tmpline[t0]) && (width =3D WCWIDTH(tmpline[t0])) > 0) { int ichars; if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) { diff --git a/Src/compat.c b/Src/compat.c index a295694..ca9713b 100644 --- a/Src/compat.c +++ b/Src/compat.c @@ -1017,3 +1017,20 @@ isprint_ascii(int c) =20 /**/ #endif /* __APPLE__ && BROKEN_ISPRINT */ + +/**/ +#if defined(__APPLE__) && defined(BROKEN_ISWPRINT) + +/**/ +int +wc_isprint(wint_t ucs) +{ + if (ucs <=3D 0) + return 0; + if (ucs < 32 || (ucs >=3D 0x7f && ucs < 0xa0)) + return 0; + return 1; +} + +/**/ +#endif /* __APPLE__ && BROKEN_ISWPRINT */ diff --git a/Src/pattern.c b/Src/pattern.c index 75db016..fc7c737 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -3625,7 +3625,7 @@ mb_patmatchrange(char *range, wchar_t ch, int = zmb_ind, wint_t *indptr, int *mtp) return 1; break; case PP_PRINT: - if (iswprint(ch)) + if (WC_ISPRINT(ch)) return 1; break; case PP_PUNCT: diff --git a/Src/utils.c b/Src/utils.c index ea4b34b..8aceb79 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -629,7 +629,7 @@ wcs_nicechar_sel(wchar_t c, size_t *widthp, char = **swidep, int quotable) } =20 s =3D buf; - if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) { + if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) { if (c =3D=3D 0x7f) { if (quotable) { *s++ =3D '\\'; @@ -734,7 +734,7 @@ wcs_nicechar(wchar_t c, size_t *widthp, char = **swidep) /**/ mod_export int is_wcs_nicechar(wchar_t c) { - if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) { + if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) { if (c =3D=3D 0x7f || c =3D=3D L'\n' || c =3D=3D L'\t' || c < = 0x20) return 1; if (c >=3D 0x80) { diff --git a/Src/ztype.h b/Src/ztype.h index 76589b1..a8f5fe5 100644 --- a/Src/ztype.h +++ b/Src/ztype.h @@ -72,7 +72,11 @@ =20 #ifdef MULTIBYTE_SUPPORT 
#define WC_ZISTYPE(X,Y) wcsitype((X),(Y)) -#define WC_ISPRINT(X) iswprint(X) +# if defined(__APPLE__) && defined(BROKEN_ISWPRINT) +# define WC_ISPRINT(X) wc_isprint(X) +# else +# define WC_ISPRINT(X) iswprint(X) +# endif #else #define WC_ZISTYPE(X,Y) zistype((X),(Y)) #define WC_ISPRINT(X) isprint(X) diff --git a/configure.ac b/configure.ac index 911cc45..d2f418d 100644 --- a/configure.ac +++ b/configure.ac @@ -2591,14 +2591,18 @@ fi]) =20 AH_TEMPLATE([BROKEN_WCWIDTH], [Define to 1 if the wcwidth() function is present but broken.]) +AH_TEMPLATE([BROKEN_ISWPRINT], +[Define to 1 if the iswprint() function is present but broken.]) AH_TEMPLATE([BROKEN_ISPRINT], [Define to 1 if the isprint() function is broken under UTF-8 locale.]) if test x$zsh_cv_c_unicode_support =3D xyes; then AC_DEFINE(MULTIBYTE_SUPPORT) =20 - dnl Test for a wcwidth() implementation that gives the wrong width = for - dnl zero-width combining characters. - dnl For the test we use a combining acute accent (\u0301). + dnl Test for a wcwidth() implementation that gives the wrong width = for either + dnl zero-width combining characters, or + dnl some characters in the Latin Extended-B. + dnl For the test we use a combining acute accent (\u0301) or + dnl a LATIN SMALL LETTER L WITH CURL (\u0234). dnl We input it as UTF-8 since that is the standard we can rely dnl upon most: we can't rely on a wchar_t being stored as a dnl Unicode code point on all systems. @@ -2607,9 +2611,8 @@ if test x$zsh_cv_c_unicode_support =3D xyes; then dnl - the programme compiled, linked and ran dnl - we successfully set a UTF-8 locale dnl - the locale we set plausibly converted the UTF-8 string - dnl for a zero-width combining character (the only way to be - dnl 100% sure would be to output it and ask if it looked right) - dnl - the converted wide character gave a non-zero width. + dnl into the correct wide character + dnl - but the converted wide character gave a wrong width. 
dnl locale -a is a fallback; on most systems we should find = en_US.UTF-8. [locale_prog=3D'char *my_locales[] =3D { "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", ' @@ -2625,17 +2628,19 @@ if test x$zsh_cv_c_unicode_support =3D xyes; = then int main() { char **localep; char comb_acute_mb[] =3D { (char)0xcc, (char)0x81 }; + char u_0234[] =3D { (char)0xc8, (char)0xb4 }; wchar_t wc; =20 for (localep =3D my_locales; *localep; localep++) - if (setlocale(LC_ALL, *localep) && - mbtowc(&wc, comb_acute_mb, 2) =3D=3D 2) + if (setlocale(LC_ALL, *localep)) break; if (!*localep) return 1; - if (wcwidth(wc) =3D=3D 0) - return 1; - return 0; + if (mbtowc(&wc, comb_acute_mb, 2) =3D=3D 2 && wcwidth(wc) !=3D 0) + return 0; + if (mbtowc(&wc, u_0234, 2) =3D=3D 2 && wcwidth(wc) !=3D 1) + return 0; + return 1; } "] =20 @@ -2649,6 +2654,43 @@ if test x$zsh_cv_c_unicode_support =3D xyes; then AC_DEFINE(BROKEN_WCWIDTH) fi =20 + dnl Check if iswprint() is broken. + [locale_prog=3D'char *my_locales[] =3D { + "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", ' + locale_prog=3D"$locale_prog"`locale -a 2>/dev/null | \ + sed -e 's/utf8/UTF-8/' | grep UTF-8 | \ + while read line; do echo " \"$line\","; done;` + locale_prog=3D"$locale_prog 0 }; + #include <stdlib.h> + #include <locale.h> + #include <wchar.h> + #include <wctype.h> + + int main() { + char **localep; + char u_0234[] =3D { (char)0xc8, (char)0xb4 }; + wchar_t wc; + for (localep =3D my_locales; *localep; localep++) + if (setlocale(LC_ALL, *localep)) + break; + if (!*localep) + return 1; + if (mbtowc(&wc, u_0234, 2) =3D=3D 2 && !iswprint(wc)) + return 0; + return 1; + } + "] + + AC_CACHE_CHECK(if the iswprint() function is broken, + zsh_cv_c_broken_iswprint, + [AC_TRY_RUN([$locale_prog], + zsh_cv_c_broken_iswprint=3Dyes, + zsh_cv_c_broken_iswprint=3Dno, + zsh_cv_c_broken_iswprint=3Dno)]) + if test x$zsh_cv_c_broken_iswprint =3D xyes; then + AC_DEFINE(BROKEN_ISWPRINT) + fi + dnl Check if isprint() behaves correctly under UTF-8 locale. 
dnl On some platform (maybe only on Mac OS X), isprint() returns dnl true for all characters in the range from 0xa0 to 0xff if