From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 28972 invoked from network); 19 Oct 2005 08:34:21 -0000 Received: from news.dotsrc.org (HELO a.mx.sunsite.dk) (130.225.247.88) by ns1.primenet.com.au with SMTP; 19 Oct 2005 08:34:21 -0000 Received: (qmail 85065 invoked from network); 19 Oct 2005 08:34:15 -0000 Received: from sunsite.dk (130.225.247.90) by a.mx.sunsite.dk with SMTP; 19 Oct 2005 08:34:15 -0000 Received: (qmail 4861 invoked by alias); 19 Oct 2005 08:34:12 -0000 Mailing-List: contact zsh-workers-help@sunsite.dk; run by ezmlm Precedence: bulk X-No-Archive: yes X-Seq: 21882 Received: (qmail 4852 invoked from network); 19 Oct 2005 08:34:11 -0000 Received: from news.dotsrc.org (HELO a.mx.sunsite.dk) (130.225.247.88) by sunsite.dk with SMTP; 19 Oct 2005 08:34:11 -0000 Received: (qmail 84764 invoked from network); 19 Oct 2005 08:34:11 -0000 Received: from cluster-c.mailcontrol.com (HELO rly26c.srv.mailcontrol.com) (168.143.177.190) by a.mx.sunsite.dk with SMTP; 19 Oct 2005 08:34:08 -0000 Received: from exchange03.csr.com (mailhost1.csr.com [81.105.217.43]) by rly26c.srv.mailcontrol.com (MailControl) with ESMTP id j9J8XxCG019191 for ; Wed, 19 Oct 2005 09:33:59 +0100 Received: from news01.csr.com ([10.103.143.38]) by exchange03.csr.com with Microsoft SMTPSVC(5.0.2195.6713); Wed, 19 Oct 2005 09:36:08 +0100 Received: from news01.csr.com (localhost.localdomain [127.0.0.1]) by news01.csr.com (8.13.1/8.12.11) with ESMTP id j9J8Xusw015443 for ; Wed, 19 Oct 2005 09:33:56 +0100 Received: from csr.com (pws@localhost) by news01.csr.com (8.13.1/8.13.1/Submit) with ESMTP id j9J8XuPX015440 for ; Wed, 19 Oct 2005 09:33:56 +0100 Message-Id: <200510190833.j9J8XuPX015440@news01.csr.com> X-Authentication-Warning: news01.csr.com: pws owned process doing -bs To: zsh-workers@sunsite.dk (Zsh hackers list) Subject: PATCH: prompt truncation with multibyte characters Date: Wed, 19 Oct 2005 09:33:55 +0100 From: Peter Stephenson X-OriginalArrivalTime: 19 Oct 2005 
08:36:09.0868 (UTC) FILETIME=[27D294C0:01C5D488] Content-Type: text/plain MIME-Version: 1.0 X-Scanned-By: MailControl A-05-40-01 (www.mailcontrol.com) on 10.67.0.136 X-Spam-Checker-Version: SpamAssassin 3.0.4 (2005-06-05) on f.primenet.com.au X-Spam-Level: X-Spam-Status: No, score=-2.5 required=5.0 tests=AWL,BAYES_00 autolearn=ham version=3.0.4 Here's a go at extending prompt truncation to work with multibyte characters in the prompt string. It doesn't look very much like the existing code in the other branch because I didn't really understand that (possibly I wrote bits of it). It seems to do roughly the right thing; if anyone finds any more obscure uses, they can be added to the tests. It's hard to write tests that depend on a given locale, however. If anyone wants to try doing this in a fail-safe manner, maybe printing a message if the locale isn't available, please do. I didn't alter the documentation since it already seems intuitively obvious that truncation should work on displayed characters, not bytes. Index: Src/prompt.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/prompt.c,v retrieving revision 1.26 diff -u -r1.26 prompt.c --- Src/prompt.c 13 Oct 2005 13:19:30 -0000 1.26 +++ Src/prompt.c 19 Oct 2005 08:24:43 -0000 @@ -87,7 +87,7 @@ /* Non-zero if truncating the current segment of the buffer. */ -static int trunclen; +static int truncwidth; /* Current level of nesting of %{ / %} sequences. 
*/ @@ -179,7 +179,7 @@ fm = s; bp = bufline = buf = zshcalloc(bufspc = 256); bp1 = NULL; - trunclen = 0; + truncwidth = 0; putpromptchar(1, '\0'); addbufspc(1); if(dontcount) @@ -229,7 +229,7 @@ } else if (minus) arg = -1; if (*fm == '(') { - int tc, otrunclen; + int tc, otruncwidth; if (idigit(*++fm)) { arg = zstrtol(fm, &fm, 10); @@ -334,14 +334,14 @@ return 0; fm++; /* Don't do the current truncation until we get back */ - otrunclen = trunclen; - trunclen = 0; + otruncwidth = truncwidth; + truncwidth = 0; if (!putpromptchar(test == 1 && doprint, sep) || !*++fm || !putpromptchar(test == 0 && doprint, ')')) { - trunclen = otrunclen; + truncwidth = otruncwidth; return 0; } - trunclen = otrunclen; + truncwidth = otruncwidth; continue; } if (!doprint) @@ -973,14 +973,14 @@ * can be finished, backing up so that the new truncation * can be started afterwards. */ - if (trunclen) { + if (truncwidth) { while (*--fm != '%') ; fm--; return 0; } - trunclen = arg; + truncwidth = arg; if (*fm != ']') fm++; while (*fm && *fm != truncchar) { @@ -996,6 +996,12 @@ *bp++ = '<'; } ptr = buf + w; /* addbufspc() may have realloc()'d buf */ + /* + * Now: + * buf is the start of the output prompt buffer + * ptr is the start of the truncation string + * bp is the end of the truncation string + */ truncstr = ztrduppfx(ptr, bp - ptr); bp = ptr; @@ -1006,24 +1012,237 @@ trunccount = 0; ptr = buf + w; /* putpromptchar() may have realloc()'d */ *bp = '\0'; + /* + * Now: + * ptr is the start of the truncation string and also + * where we need to start putting any truncated output + * bp is the end of the string we have just added, which + * may need truncating. + */ + /* + * w below is screen width if multibyte support is enabled + * (note that above it was a raw string pointer difference). + * It's the full width of the string we may need to truncate. + * + * truncwidth has come from the user, so we interpret this + * as a screen width, too. 
+ */ countprompt(ptr, &w, 0, -1); - if (w > trunclen) { + if (w > truncwidth) { /* - * We need to truncate. t points to the truncation string -- - * which is inserted literally, without nice representation. - * tlen is its length, and maxlen is the amount of the main - * string that we want to keep. Note that if the truncation - * string is longer than the truncation length (tlen > - * trunclen), the truncation string is used in full. + * We need to truncate. t points to the truncation string + * -- which is inserted literally, without nice + * representation. twidth is its printing width, and maxwidth + * is the amount of the main string that we want to keep. + * Note that if the truncation string is longer than the + * truncation length (twidth > truncwidth), the truncation + * string is used in full. * * TODO: we don't take account of multibyte characters * in the string we're truncating. */ char *t = truncstr; int fullen = bp - ptr; - int tlen = ztrlen(t), maxlen; - maxlen = tlen < trunclen ? trunclen - tlen : 0; + int twidth, maxwidth; +#ifdef ZLE_UNICODE_SUPPORT + int ntrunc = strlen(t); + + /* Use screen width of string */ + twidth = mb_width(t); + if (twidth < truncwidth) { + maxwidth = truncwidth - twidth; + /* + * It's not safe to assume there are no invisible substrings + * just because the width is less than the full string + * length since there may be multibyte characters. + */ + addbufspc(ntrunc+1); + /* may have realloc'd */ + ptr = bp - fullen; + + if (truncatleft) { + /* + * To truncate at the left, selectively copy + * maxwidth bytes from the main prompt, preceded + * by the truncation string in full. + * + * We're overwriting the string containing the + * text to be truncated, so copy it. We've + * just ensured there's sufficient space at the + * end of the prompt string. + * + * Pointer into text to be truncated. 
+ */ + char *fulltextptr, *fulltext; + int remw; + mbstate_t mbs; + + fulltextptr = fulltext = bp; + memmove(fulltext, ptr, fullen); + fulltext[fullen] = '\0'; + + /* Copy the truncstr into place. */ + while (*t) + *ptr++ = *t++; + + memset(&mbs, 0, sizeof(mbstate_t)); + + /* + * Find the point in the text at which we should + * start copying, i.e. when the remaining width + * is less than or equal to the maximum width. + */ + remw = w; + while (remw > maxwidth && *fulltextptr) { + if (*fulltextptr == Inpar) { + /* + * Text marked as invisible: copy + * regardless, since we don't know what + * this does but it shouldn't affect + * the width. + */ + for (;;) { + *ptr++ = *fulltextptr; + if (*fulltextptr == Outpar || + *fulltextptr == '\0') + break; + fulltextptr++; + } + } else { + /* + * Normal text: build up a multibyte character. + */ + char inchar; + wchar_t cc; + int ret; + + /* + * careful: string is still metafied (we + * need that because we don't know a + * priori when to stop and the resulting + * string must be metafied). + */ + if (*fulltextptr == Meta) + inchar = *++fulltextptr ^ 32; + else + inchar = *fulltextptr; + fulltextptr++; + ret = mbrtowc(&cc, &inchar, 1, &mbs); + + if (ret != -2) { + /* complete */ + if (ret <= 0) { + /* assume a single-byte character */ + remw--; + if (ret < 0) { + /* need to reset invalid state */ + memset(&mbs, 0, sizeof(mbstate_t)); + } + } else { + remw -= wcwidth(cc); + } + } + } + } + + /* + * Now simply copy the rest of the text. Still + * metafied, so this is easy. + */ + while (*fulltextptr) + *ptr++ = *fulltextptr++; + /* Mark the end of copying */ + bp = ptr; + } else { + /* + * Truncating at the right is easier: just leave + * enough characters until we have reached the + * maximum width. 
+ */ + char *skiptext = ptr; + mbstate_t mbs; + + memset(&mbs, 0, sizeof(mbstate_t)); + + while (maxwidth > 0 && *skiptext) { + if (*skiptext == Inpar) { + for (; *skiptext != Outpar && *skiptext; + skiptext++); + } else { + char inchar; + wchar_t cc; + int ret; + + if (*skiptext == Meta) + inchar = *++skiptext ^ 32; + else + inchar = *skiptext; + skiptext++; + ret = mbrtowc(&cc, &inchar, 1, &mbs); + + if (ret != -2) { + /* complete or invalid character */ + if (ret <= 0) { + /* assume single byte */ + maxwidth--; + if (ret < 0) { + /* need to reset invalid state */ + memset(&mbs, 0, sizeof(mbstate_t)); + } + } else { + maxwidth -= wcwidth(cc); + } + } + } + } + /* + * We don't need the visible text from now on, + * but we'd better copy any invisible bits. + * History dictates that these go after the + * truncation string. This is sensible since + * they may, for example, turn off an effect which + * should apply to all text at this point. + * + * Copy the truncstr. + */ + ptr = skiptext; + while (*t) + *ptr++ = *t++; + bp = ptr; + if (*skiptext) { + /* Move remaining text so we don't overwrite it */ + memmove(bp, skiptext, strlen(skiptext)+1); + skiptext = bp; + + /* + * Copy anything we want, updating bp + */ + while (*skiptext) { + if (*skiptext == Inpar) { + for (;;) { + *bp++ = *skiptext; + if (*skiptext == Outpar || + *skiptext == '\0') + break; + skiptext++; + } + } + else + skiptext++; + } + } + } + } else { + /* Just copy truncstr; no other text appears. */ + while (*t) + *ptr++ = *t++; + bp = ptr; + } + *bp = '\0'; +#else + twidth = ztrlen(t); + maxwidth = twidth < truncwidth ? truncwidth - twidth : 0; if (w < fullen) { /* Invisible substrings, lots of shuffling. 
*/ int n = strlen(t); @@ -1035,6 +1254,13 @@ p = ptr + n; q = p; + /* + * I don't think we need n and the test below since + * we must have enough space (we are using a subset + * of the existing text with no repetition) and the + * string is null-terminated, so I haven't copied it + * to the ZLE_UNICODE_SUPPORT section. + */ n = fullen - w; /* Shift the whole string right, then * @@ -1047,7 +1273,7 @@ --n; } while (*p++ != Outpar && *p && n); else if (w) { - if (--w < maxlen) + if (--w < maxwidth) *q++ = *p; ++p; } @@ -1058,11 +1284,11 @@ q = ptr + fullen; /* First skip over as much as will "fit". */ - while (w > 0 && maxlen > 0) { + while (w > 0 && maxwidth > 0) { if (*ptr == Inpar) while (*ptr++ != Outpar && *ptr) {;} else - ++ptr, --w, --maxlen; + ++ptr, --w, --maxwidth; } if (ptr < q) { /* We didn't reach the end of the string. * @@ -1087,25 +1313,26 @@ } } else { /* No invisible substrings. */ - if (tlen > fullen) { - addbufspc(tlen - fullen); + if (twidth > fullen) { + addbufspc(twidth - fullen); ptr = bp; /* addbufspc() may have realloc()'d buf */ - bp += tlen - fullen; + bp += twidth - fullen; } else - bp -= fullen - trunclen; + bp -= fullen - truncwidth; if (truncatleft) { - if (maxlen) - memmove(ptr + strlen(t), ptr + fullen - maxlen, - maxlen); + if (maxwidth) + memmove(ptr + strlen(t), ptr + fullen - maxwidth, + maxwidth); } else - ptr += maxlen; + ptr += maxwidth; } /* Finally, copy the truncstr into place. */ while (*t) *ptr++ = *t++; +#endif } zsfree(truncstr); - trunclen = 0; + truncwidth = 0; /* * We may have returned early from the previous putpromptchar * * because we found another truncation following this one. * @@ -1116,7 +1343,7 @@ if (*fm != endchar) { fm++; /* - * With trunclen set to zero, we always reach endchar * + * With truncwidth set to zero, we always reach endchar * * (or the terminating NULL) this time round. 
* */ if (!putpromptchar(doprint, endchar)) @@ -1132,7 +1359,7 @@ fm++; fm++; } - if (trunclen || !*fm) + if (truncwidth || !*fm) return 0; } return 1; Index: Src/utils.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/utils.c,v retrieving revision 1.99 diff -u -r1.99 utils.c --- Src/utils.c 13 Oct 2005 16:30:14 -0000 1.99 +++ Src/utils.c 19 Oct 2005 08:24:43 -0000 @@ -3520,6 +3520,49 @@ return retstr; } +/* + * Return the screen width of a multibyte string. The input + * string is metafied. + */ +/**/ +mod_export int +mb_width(const char *s) +{ + char *ums = ztrdup(s), *umptr; + int umlen; + int width = 0; + mbstate_t mbs; + + memset(&mbs, 0, sizeof(mbs)); + umptr = unmetafy(ums, &umlen); + /* + * Convert one wide character at a time. We could convert + * the entire string using mbsrtowcs(), but that terminates on + * a NUL and we might have embedded NULs. + */ + while (umlen > 0) { + wchar_t cc; + int ret = mbrtowc(&cc, umptr, umlen, &mbs); + + if (ret <= 0) { + /* Assume a single-width character. */ + width++; + ret = 1; + } else { + int wret = wcwidth(cc); + if (wret > 0) + width += wret; + } + + umlen -= ret; + umptr += ret; + } + + free(ums); + + return width; +} + /**/ #endif /* ZLE_UNICODE_SUPPORT */ Index: Test/D01prompt.ztst =================================================================== RCS file: /cvsroot/zsh/zsh/Test/D01prompt.ztst,v retrieving revision 1.2 diff -u -r1.2 D01prompt.ztst --- Test/D01prompt.ztst 9 Jul 2001 18:31:25 -0000 1.2 +++ Test/D01prompt.ztst 19 Oct 2005 08:24:43 -0000 @@ -68,11 +68,11 @@ >true >false - print -P '%10<......>truncated at 10%>> Not truncated' + print -P 'start %10<......>truncated at 10%>> Not truncated%3> ...>Not shown' 0:prompt truncation ->...d at 10 Not truncated ->truncat... Not truncated +>start ...d at 10 Not truncated ... +>start truncat... Not truncated ... # It's hard to check the time and date as they are moving targets. 
# We therefore just check that various forms of the date are consistent. -- Peter Stephenson Software Engineer CSR PLC, Churchill House, Cambridge Business Park, Cowley Road Cambridge, CB4 0WZ, UK Tel: +44 (0)1223 692070 This message has been scanned for viruses by BlackSpider MailControl - www.blackspider.com