zsh-workers
 help / color / mirror / code / Atom feed
From: Peter Stephenson <p.w.stephenson@ntlworld.com>
To: zsh-workers@zsh.org (Zsh hackers list)
Subject: PATCH: count glyphs in multibyte strings
Date: Thu, 25 Mar 2010 21:35:03 +0000	[thread overview]
Message-ID: <22950.1269552903@pws-pc> (raw)

I noticed we were missing this capability; not sure how useful it is in
practice, but it was straightforward to add.

You might want to check that my terminology and assumptions about the
way Unicode works aren't gibberish.

--- ../zsh-git/zsh/Doc/Zsh/expn.yo	2010-03-25 21:01:19.000000000 +0000
+++ Doc/Zsh/expn.yo	2010-03-25 21:23:29.000000000 +0000
@@ -1004,6 +1004,12 @@
 length of the string.  Most printable characters have a width of one
 unit, however certain Asian character sets and certain special effects
 use wider characters; combining characters have zero width.
+
+If the tt(m) is repeated, each character counts zero if it has zero
+width, else one.  For printable character strings this has the
+effect of counting the number of glyphs (visibly separate characters),
+except for the case where combining characters themselves have non-zero
+width (true in certain alphabets).
 )
 item(tt(r:)var(expr)tt(::)var(string1)tt(::)var(string2)tt(:))(
 As tt(l), but pad the words on the right and insert var(string2)
--- ../zsh-git/zsh/Src/subst.c	2010-03-25 21:01:19.000000000 +0000
+++ Src/subst.c	2010-03-25 21:15:21.000000000 +0000
@@ -675,6 +675,35 @@
     return dest;
 }
 
+#ifdef MULTIBYTE_SUPPORT
+#define WCPADWIDTH(cchar, mw)	wcpadwidth(cchar, mw)
+
+/*
+ * Width of character for padding purposes.
+ * 0: all characters count 1.
+ * 1: use width of multibyte character.
+ * 2 or more: non-zero width characters count 1, zero width 0.
+ */
+static int
+wcpadwidth(wchar_t wc, int multi_width)
+{
+    switch (multi_width)
+    {
+    case 0:
+	return 1;
+
+    case 1:
+	return WCWIDTH(wc);
+
+    default:
+	return WCWIDTH(wc) ? 1 : 0;
+    }
+}
+
+#else
+#define WCPADWIDTH(cchar, mw)	(1)
+#endif
+
 /*
  * Pad the string str, returning a result from the heap (or str itself,
  * if it didn't need padding).  If str is too large, it will be truncated.
@@ -703,12 +732,6 @@
 #endif
     )
 {
-#ifdef MULTIBYTE_SUPPORT
-#define WCPADWIDTH(cchar)	(multi_width ? WCWIDTH(cchar) : 1)
-#else
-#define WCPADWIDTH(cchar)	(1)
-#endif
-
     char *def, *ret, *t, *r;
     int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc, cl;
     convchar_t cchar;
@@ -775,14 +798,14 @@
 		MB_METACHARINIT();
 		while (f > 0) {
 		    str += MB_METACHARLENCONV(str, &cchar);
-		    f -= WCPADWIDTH(cchar);
+		    f -= WCPADWIDTH(cchar, multi_width);
 		}
 		/* Now finish the first half. */
 		for (c = prenum; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
 		    while (cl--)
 			*r++ = *str++;
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		}
 	    } else {
 		if (f <= lpreone) {
@@ -796,7 +819,7 @@
 			/* So skip. */
 			for (t = preone; f > 0; ) {
 			    t += MB_METACHARLENCONV(t, &cchar);
-			    f -= WCPADWIDTH(cchar);
+			    f -= WCPADWIDTH(cchar, multi_width);
 			}
 			/* Then copy the entire remainder. */
 			while (*t)
@@ -814,7 +837,7 @@
 			    m = lpremul - m;
 			    for (t = premul; m > 0; ) {
 				t += MB_METACHARLENCONV(t, &cchar);
-				m -= WCPADWIDTH(cchar);
+				m -= WCPADWIDTH(cchar, multi_width);
 			    }
 			    /* Output the rest. */
 			    while (*t)
@@ -827,7 +850,7 @@
 				cl = MB_METACHARLENCONV(t, &cchar);
 				while (cl--)
 				    *r++ = *t++;
-				c -= WCPADWIDTH(cchar);
+				c -= WCPADWIDTH(cchar, multi_width);
 			    }
 			}
 		    }
@@ -840,7 +863,7 @@
 		/* Output the first half width of the original string. */
 		for (c = ls2; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		    while (cl--)
 			*r++ = *str++;
 		}
@@ -854,7 +877,7 @@
 		MB_METACHARINIT();
 		for (c = postnum; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		    while (cl--)
 			*r++ = *str++;
 		}
@@ -867,7 +890,7 @@
 			/* Can't fit unrepeated string, truncate it */
 			for (c = f; c > 0; ) {
 			    cl = MB_METACHARLENCONV(postone, &cchar);
-			    c -= WCPADWIDTH(cchar);
+			    c -= WCPADWIDTH(cchar, multi_width);
 			    while (cl--)
 				*r++ = *postone++;
 			}
@@ -890,7 +913,7 @@
 			    MB_METACHARINIT();
 			    while (m > 0) {
 				cl = MB_METACHARLENCONV(postmul, &cchar);
-				m -= WCPADWIDTH(cchar);
+				m -= WCPADWIDTH(cchar, multi_width);
 				while (cl--)
 				    *r++ = *postmul++;
 			    }
@@ -914,14 +937,14 @@
 		MB_METACHARINIT();
 		while (f > 0) {
 		    str += MB_METACHARLENCONV(str, &cchar);
-		    f -= WCPADWIDTH(cchar);
+		    f -= WCPADWIDTH(cchar, multi_width);
 		}
 		/* Copy the rest of the original string */
 		for (c = prenum; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
 		    while (cl--)
 			*r++ = *str++;
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		}
 	    } else {
 		/*
@@ -942,7 +965,7 @@
 			MB_METACHARINIT();
 			for (t = preone; f > 0; ) {
 			    t += MB_METACHARLENCONV(t, &cchar);
-			    f -= WCPADWIDTH(cchar);
+			    f -= WCPADWIDTH(cchar, multi_width);
 			}
 			/* Copy the rest of preone */
 			while (*t)
@@ -966,14 +989,14 @@
 			    MB_METACHARINIT();
 			    for (t = premul; m > 0; ) {
 				t += MB_METACHARLENCONV(t, &cchar);
-				m -= WCPADWIDTH(cchar);
+				m -= WCPADWIDTH(cchar, multi_width);
 			    }
 			    /* Now the rest of the repeated string. */
 			    while (c > 0) {
 				cl = MB_METACHARLENCONV(t, &cchar);
 				while (cl--)
 				    *r++ = *t++;
-				c -= WCPADWIDTH(cchar);
+				c -= WCPADWIDTH(cchar, multi_width);
 			    }
 			}
 			for (cc = f / lpremul; cc--;) {
@@ -985,7 +1008,7 @@
 				cl = MB_METACHARLENCONV(t, &cchar);
 				while (cl--)
 				    *r++ = *t++;
-				c -= WCPADWIDTH(cchar);
+				c -= WCPADWIDTH(cchar, multi_width);
 			    }
 			}
 		    }
@@ -1023,7 +1046,7 @@
 		cl = MB_METACHARLENCONV(str, &cchar);
 		while (cl--)
 		    *r++ = *str++;
-		c -= WCPADWIDTH(cchar);
+		c -= WCPADWIDTH(cchar, multi_width);
 	    }
 	} else {
 	    /*
@@ -1035,7 +1058,7 @@
 		cl = MB_METACHARLENCONV(str, &cchar);
 		while (cl--)
 		    *r++ = *str++;
-		c -= WCPADWIDTH(cchar);
+		c -= WCPADWIDTH(cchar, multi_width);
 	    }
 	    MB_METACHARINIT();
 	    if (f <= lpostone) {
@@ -1048,7 +1071,7 @@
 			cl = MB_METACHARLENCONV(postone, &cchar);
 			while (cl--)
 			    *r++ = *postone++;
-			c -= WCPADWIDTH(cchar);
+			c -= WCPADWIDTH(cchar, multi_width);
 		    }
 		}
 	    } else {
@@ -1059,7 +1082,7 @@
 			cl = MB_METACHARLENCONV(postone, &cchar);
 			while (cl--)
 			    *r++ = *postone++;
-			c -= WCPADWIDTH(cchar);
+			c -= WCPADWIDTH(cchar, multi_width);
 		    }
 		}
 		if (lpostmul) {
@@ -1070,7 +1093,7 @@
 			    cl = MB_METACHARLENCONV(t, &cchar);
 			    while (cl--)
 				*r++ = *t++;
-			    c -= WCPADWIDTH(cchar);
+			    c -= WCPADWIDTH(cchar, multi_width);
 			}
 		    }
 		    /*
@@ -1083,7 +1106,7 @@
 			    cl = MB_METACHARLENCONV(postmul, &cchar);
 			    while (cl--)
 				*r++ = *postmul++;
-			    m -= WCPADWIDTH(cchar);
+			    m -= WCPADWIDTH(cchar, multi_width);
 			}
 		    }
 		}
@@ -1782,7 +1805,7 @@
 
 		case 'm':
 #ifdef MULTIBYTE_SUPPORT
-		    multi_width = 1;
+		    multi_width++;
 #endif
 		    break;
 
--- ../zsh-git/zsh/Src/utils.c	2010-03-25 21:01:19.000000000 +0000
+++ Src/utils.c	2010-03-25 21:14:17.000000000 +0000
@@ -4406,6 +4406,8 @@
  * until end of string.
  *
  * If width is 1, return total character width rather than number.
+ * If width is greater than 1, count 1 for each character with non-zero
+ * width, else 0.
  */
 
 /**/
@@ -4447,9 +4449,12 @@
 		 * turn this into 1 for backward compatibility.
 		 */
 		int wcw = WCWIDTH(wc);
-		if (wcw >= 0)
-		    num += wcw;
-		else
+		if (wcw >= 0) {
+		    if (width == 1)
+			num += wcw;
+		    else
+			num += (wcw > 0);
+		} else
 		    num++;
 	    } else
 		num++;

-- 
Peter Stephenson <p.w.stephenson@ntlworld.com>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/


             reply	other threads:[~2010-03-25 22:05 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-03-25 21:35 Peter Stephenson [this message]
2010-03-26  1:02 ` Phil Pennock
2010-03-26  9:53   ` Peter Stephenson
2010-03-26 20:50     ` Peter Stephenson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=22950.1269552903@pws-pc \
    --to=p.w.stephenson@ntlworld.com \
    --cc=zsh-workers@zsh.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).