zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: count glyphs in multibyte strings
@ 2010-03-25 21:35 Peter Stephenson
  2010-03-26  1:02 ` Phil Pennock
  0 siblings, 1 reply; 4+ messages in thread
From: Peter Stephenson @ 2010-03-25 21:35 UTC (permalink / raw)
  To: Zsh hackers list

I noticed we were missing this capability; not sure how useful it is in
practice, but it was straightforward to add.

You might want to check my terminology and assumptions about the way
Unicode works aren't gibberish.

--- ../zsh-git/zsh/Doc/Zsh/expn.yo	2010-03-25 21:01:19.000000000 +0000
+++ Doc/Zsh/expn.yo	2010-03-25 21:23:29.000000000 +0000
@@ -1004,6 +1004,12 @@
 length of the string.  Most printable characters have a width of one
 unit, however certain Asian character sets and certain special effects
 use wider characters; combining characters have zero width.
+
+If the tt(m) is repeated, each character counts zero if it has zero
+width, and one otherwise.  For printable character strings this has the
+effect of counting the number of glyphs (visibly separate characters),
+except for the case where combining characters themselves have non-zero
+width (true in certain alphabets).
 )
 item(tt(r:)var(expr)tt(::)var(string1)tt(::)var(string2)tt(:))(
 As tt(l), but pad the words on the right and insert var(string2)
--- ../zsh-git/zsh/Src/subst.c	2010-03-25 21:01:19.000000000 +0000
+++ Src/subst.c	2010-03-25 21:15:21.000000000 +0000
@@ -675,6 +675,35 @@
     return dest;
 }
 
+#ifdef MULTIBYTE_SUPPORT
+#define WCPADWIDTH(cchar, mw)	wcpadwidth(cchar, mw)
+
+/*
+ * Width of character for padding purposes.
+ * 0: all characters count 1.
+ * 1: use width of multibyte character.
+ * 2: non-zero width characters count 1, zero width 0.
+ */
+static int
+wcpadwidth(wchar_t wc, int multi_width)
+{
+    switch (multi_width)
+    {
+    case 0:
+	return 1;
+
+    case 1:
+	return WCWIDTH(wc);
+
+    default:
+	return WCWIDTH(wc) ? 1 : 0;
+    }
+}
+
+#else
+#define WCPADWIDTH(cchar, mw)	(1)
+#endif
+
 /*
  * Pad the string str, returning a result from the heap (or str itself,
  * if it didn't need padding).  If str is too large, it will be truncated.
@@ -703,12 +732,6 @@
 #endif
     )
 {
-#ifdef MULTIBYTE_SUPPORT
-#define WCPADWIDTH(cchar)	(multi_width ? WCWIDTH(cchar) : 1)
-#else
-#define WCPADWIDTH(cchar)	(1)
-#endif
-
     char *def, *ret, *t, *r;
     int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc, cl;
     convchar_t cchar;
@@ -775,14 +798,14 @@
 		MB_METACHARINIT();
 		while (f > 0) {
 		    str += MB_METACHARLENCONV(str, &cchar);
-		    f -= WCPADWIDTH(cchar);
+		    f -= WCPADWIDTH(cchar, multi_width);
 		}
 		/* Now finish the first half. */
 		for (c = prenum; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
 		    while (cl--)
 			*r++ = *str++;
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		}
 	    } else {
 		if (f <= lpreone) {
@@ -796,7 +819,7 @@
 			/* So skip. */
 			for (t = preone; f > 0; ) {
 			    t += MB_METACHARLENCONV(t, &cchar);
-			    f -= WCPADWIDTH(cchar);
+			    f -= WCPADWIDTH(cchar, multi_width);
 			}
 			/* Then copy the entire remainder. */
 			while (*t)
@@ -814,7 +837,7 @@
 			    m = lpremul - m;
 			    for (t = premul; m > 0; ) {
 				t += MB_METACHARLENCONV(t, &cchar);
-				m -= WCPADWIDTH(cchar);
+				m -= WCPADWIDTH(cchar, multi_width);
 			    }
 			    /* Output the rest. */
 			    while (*t)
@@ -827,7 +850,7 @@
 				cl = MB_METACHARLENCONV(t, &cchar);
 				while (cl--)
 				    *r++ = *t++;
-				c -= WCPADWIDTH(cchar);
+				c -= WCPADWIDTH(cchar, multi_width);
 			    }
 			}
 		    }
@@ -840,7 +863,7 @@
 		/* Output the first half width of the original string. */
 		for (c = ls2; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		    while (cl--)
 			*r++ = *str++;
 		}
@@ -854,7 +877,7 @@
 		MB_METACHARINIT();
 		for (c = postnum; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		    while (cl--)
 			*r++ = *str++;
 		}
@@ -867,7 +890,7 @@
 			/* Can't fit unrepeated string, truncate it */
 			for (c = f; c > 0; ) {
 			    cl = MB_METACHARLENCONV(postone, &cchar);
-			    c -= WCPADWIDTH(cchar);
+			    c -= WCPADWIDTH(cchar, multi_width);
 			    while (cl--)
 				*r++ = *postone++;
 			}
@@ -890,7 +913,7 @@
 			    MB_METACHARINIT();
 			    while (m > 0) {
 				cl = MB_METACHARLENCONV(postmul, &cchar);
-				m -= WCPADWIDTH(cchar);
+				m -= WCPADWIDTH(cchar, multi_width);
 				while (cl--)
 				    *r++ = *postmul++;
 			    }
@@ -914,14 +937,14 @@
 		MB_METACHARINIT();
 		while (f > 0) {
 		    str += MB_METACHARLENCONV(str, &cchar);
-		    f -= WCPADWIDTH(cchar);
+		    f -= WCPADWIDTH(cchar, multi_width);
 		}
 		/* Copy the rest of the original string */
 		for (c = prenum; c > 0; ) {
 		    cl = MB_METACHARLENCONV(str, &cchar);
 		    while (cl--)
 			*r++ = *str++;
-		    c -= WCPADWIDTH(cchar);
+		    c -= WCPADWIDTH(cchar, multi_width);
 		}
 	    } else {
 		/*
@@ -942,7 +965,7 @@
 			MB_METACHARINIT();
 			for (t = preone; f > 0; ) {
 			    t += MB_METACHARLENCONV(t, &cchar);
-			    f -= WCPADWIDTH(cchar);
+			    f -= WCPADWIDTH(cchar, multi_width);
 			}
 			/* Copy the rest of preone */
 			while (*t)
@@ -966,14 +989,14 @@
 			    MB_METACHARINIT();
 			    for (t = premul; m > 0; ) {
 				t += MB_METACHARLENCONV(t, &cchar);
-				m -= WCPADWIDTH(cchar);
+				m -= WCPADWIDTH(cchar, multi_width);
 			    }
 			    /* Now the rest of the repeated string. */
 			    while (c > 0) {
 				cl = MB_METACHARLENCONV(t, &cchar);
 				while (cl--)
 				    *r++ = *t++;
-				c -= WCPADWIDTH(cchar);
+				c -= WCPADWIDTH(cchar, multi_width);
 			    }
 			}
 			for (cc = f / lpremul; cc--;) {
@@ -985,7 +1008,7 @@
 				cl = MB_METACHARLENCONV(t, &cchar);
 				while (cl--)
 				    *r++ = *t++;
-				c -= WCPADWIDTH(cchar);
+				c -= WCPADWIDTH(cchar, multi_width);
 			    }
 			}
 		    }
@@ -1023,7 +1046,7 @@
 		cl = MB_METACHARLENCONV(str, &cchar);
 		while (cl--)
 		    *r++ = *str++;
-		c -= WCPADWIDTH(cchar);
+		c -= WCPADWIDTH(cchar, multi_width);
 	    }
 	} else {
 	    /*
@@ -1035,7 +1058,7 @@
 		cl = MB_METACHARLENCONV(str, &cchar);
 		while (cl--)
 		    *r++ = *str++;
-		c -= WCPADWIDTH(cchar);
+		c -= WCPADWIDTH(cchar, multi_width);
 	    }
 	    MB_METACHARINIT();
 	    if (f <= lpostone) {
@@ -1048,7 +1071,7 @@
 			cl = MB_METACHARLENCONV(postone, &cchar);
 			while (cl--)
 			    *r++ = *postone++;
-			c -= WCPADWIDTH(cchar);
+			c -= WCPADWIDTH(cchar, multi_width);
 		    }
 		}
 	    } else {
@@ -1059,7 +1082,7 @@
 			cl = MB_METACHARLENCONV(postone, &cchar);
 			while (cl--)
 			    *r++ = *postone++;
-			c -= WCPADWIDTH(cchar);
+			c -= WCPADWIDTH(cchar, multi_width);
 		    }
 		}
 		if (lpostmul) {
@@ -1070,7 +1093,7 @@
 			    cl = MB_METACHARLENCONV(t, &cchar);
 			    while (cl--)
 				*r++ = *t++;
-			    c -= WCPADWIDTH(cchar);
+			    c -= WCPADWIDTH(cchar, multi_width);
 			}
 		    }
 		    /*
@@ -1083,7 +1106,7 @@
 			    cl = MB_METACHARLENCONV(postmul, &cchar);
 			    while (cl--)
 				*r++ = *postmul++;
-			    m -= WCPADWIDTH(cchar);
+			    m -= WCPADWIDTH(cchar, multi_width);
 			}
 		    }
 		}
@@ -1782,7 +1805,7 @@
 
 		case 'm':
 #ifdef MULTIBYTE_SUPPORT
-		    multi_width = 1;
+		    multi_width++;
 #endif
 		    break;
 
--- ../zsh-git/zsh/Src/utils.c	2010-03-25 21:01:19.000000000 +0000
+++ Src/utils.c	2010-03-25 21:14:17.000000000 +0000
@@ -4406,6 +4406,8 @@
  * until end of string.
  *
  * If width is 1, return total character width rather than number.
+ * If width is greater than 1, count 1 for each character of non-zero
+ * width and 0 for each character of zero width.
  */
 
 /**/
@@ -4447,9 +4449,12 @@
 		 * turn this into 1 for backward compatibility.
 		 */
 		int wcw = WCWIDTH(wc);
-		if (wcw >= 0)
-		    num += wcw;
-		else
+		if (wcw >= 0) {
+		    if (width == 1)
+			num += wcw;
+		    else
+			num += (wcw > 0);
+		} else
 		    num++;
 	    } else
 		num++;

-- 
Peter Stephenson <p.w.stephenson@ntlworld.com>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PATCH: count glyphs in multibyte strings
  2010-03-25 21:35 PATCH: count glyphs in multibyte strings Peter Stephenson
@ 2010-03-26  1:02 ` Phil Pennock
  2010-03-26  9:53   ` Peter Stephenson
  0 siblings, 1 reply; 4+ messages in thread
From: Phil Pennock @ 2010-03-26  1:02 UTC (permalink / raw)
  To: Peter Stephenson; +Cc: Zsh hackers list

On 2010-03-25 at 21:35 +0000, Peter Stephenson wrote:
> I noticed we were missing this capability; not sure how useful it is in
> practice, but it was straightforward to add.
> 
> You might want to check my terminology and assumptions about the way
> Unicode works aren't gibberish.

What about -1 being returned from wcwidth() for non-printable
characters?  Looks like this is an existing issue.

-Phil


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PATCH: count glyphs in multibyte strings
  2010-03-26  1:02 ` Phil Pennock
@ 2010-03-26  9:53   ` Peter Stephenson
  2010-03-26 20:50     ` Peter Stephenson
  0 siblings, 1 reply; 4+ messages in thread
From: Peter Stephenson @ 2010-03-26  9:53 UTC (permalink / raw)
  To: Zsh hackers list

Phil Pennock wrote:
> On 2010-03-25 at 21:35 +0000, Peter Stephenson wrote:
> > I noticed we were missing this capability; not sure how useful it is in
> > practice, but it was straightforward to add.
> > 
> > You might want to check my terminology and assumptions about the way
> > Unicode works aren't gibberish.
> 
> What about -1 being returned from wcwidth() for non-printable
> characters?  Looks like this is an existing issue.

Yes, it would make a mess of the existing code although the new capability
would handle it better.  I'm not sure whether the right answer is 1 or 0,
but it's not -1.

-- 
Peter Stephenson <pws@csr.com>            Software Engineer
Tel: +44 (0)1223 692070                   Cambridge Silicon Radio Limited
Churchill House, Cambridge Business Park, Cowley Road, Cambridge, CB4 0WZ, UK


Member of the CSR plc group of companies. CSR plc registered in England and Wales, registered number 4187346, registered office Churchill House, Cambridge Business Park, Cowley Road, Cambridge, CB4 0WZ, United Kingdom


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PATCH: count glyphs in multibyte strings
  2010-03-26  9:53   ` Peter Stephenson
@ 2010-03-26 20:50     ` Peter Stephenson
  0 siblings, 0 replies; 4+ messages in thread
From: Peter Stephenson @ 2010-03-26 20:50 UTC (permalink / raw)
  To: Zsh hackers list

On Fri, 26 Mar 2010 09:53:58 +0000
Peter Stephenson <pws@csr.com> wrote:
> Phil Pennock wrote:
> > What about -1 being returned from wcwidth() for non-printable
> > characters?  Looks like this is an existing issue.
> 
> Yes, it would make a mess of the existing code although the new capability
> would handle it better.  I'm not sure whether the right answer is 1 or 0,
> but it's not -1.

The code was inconsistent; some places handled unprintable characters as
width 0, others as 1, some didn't handle them specially.  I'll
rationalise them all to 0.  CVS is still down at the moment (see notice
from https://sourceforge.net/apps/wordpress/sourceforge/ below).


CVS outage 2010-03-25
March 25th, 2010 

Hello folks,

Update on the current CVS outage that is affecting projects whose UNIX
names start with the letters  a, e, h, i, m, o, r, s, w, z.

The work being done on this server may take up to two days to
resolve. We’re working to resolve this with alacrity and assure maximum
stability and performance.

Please bear with us as we work to get this effort completed.

Best regards,

Daniel Hinojosa – Sr. Manager Support, SourceForge.net


-- 
Peter Stephenson <p.w.stephenson@ntlworld.com>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-03-26 20:51 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-25 21:35 PATCH: count glyphs in multibyte strings Peter Stephenson
2010-03-26  1:02 ` Phil Pennock
2010-03-26  9:53   ` Peter Stephenson
2010-03-26 20:50     ` Peter Stephenson

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).