From mboxrd@z Thu Jan 1 00:00:00 1970 From: quanstro@speakeasy.net To: 9fans@cse.psu.edu Message-ID: Date: Wed, 31 Aug 2005 10:51:32 +0000 Subject: [9fans] tcs bug. MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit Topicbox-Message-UUID: 81dce4c0-ead0-11e9-9d60-3106f5b1d025 I just had a similar problem a day or two ago. I needed to change some capitalization, and the tr 'A-Z' 'a-z' idiom doesn't work on arbitrary UTF-8 text. I solved it a bit differently -- by lifting the fullrune() check into the main loop, so I don't have a readu() function. Also (unlike tcs), at the cost of one extra check at end-of-input, the output buffer is dumped only when full. On Japanese, Greek or other text with >1 byte/char, this will save calls to OUT() -- or in my case print(). Okay, total overkill, I know, but it was more interesting to do it that way. Here's upper.c, which converts to upper/lower/title case: #include #include enum { BLOCK = 1024*4 }; typedef Rune (*Rconv)(Rune); void output(Rune* r, int nrunes, Rconv R){ int i; for(i=0; i BLOCK){ output(r, j, R); j=0; } } } if (rem_len){ // non unicode garbage. 
fprint(2, "non-utf8 garbage %.*s at eof\n", rem_len, in); } if (j){ output(r, j, R); } if (blen>0){ return 0; } return "read"; } void main(int argc, /* pfft const */ char** argv){ Rconv R; const char* v; const char* status; const char* s; int fd; v = strrchr(argv[0], '/'); if (v){ v++; } else { v = argv[0]; } if (0 == strcmp(v, "tolower")){ R = tolowerrune; } else if (0 == strcmp(v, "totitle")){ R = totitlerune; } else { R = toupperrune; } ARGBEGIN{ case 'u': R = toupperrune; break; case 'l': R = tolowerrune; break; case 't': R = totitlerune; break; default: fprint(2, "%s: bad option %c\n", argv0, ARGC()); fprint(2, "usage: %s -[ult]\n", argv0); exits("usage"); } ARGEND if (!*argv){ s = casify(0, R); } else { for(status = 0; *argv; argv++){ fd = open(*argv, OREAD); if (-1 == fd){ if (s && !status){ status = "open"; } continue; } s = casify(fd, R); if (s && !status){ status = s; } close(fd); } } exits(status ? status : ""); }