From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Russ Cox" To: 9fans@cse.psu.edu, help@vitanuova.com MIME-Version: 1.0 Content-Type: text/plain; charset="US-ASCII" Content-Transfer-Encoding: 7bit Message-Id: <20010202210516.04B41199D7@mail.cse.psu.edu> Subject: [9fans] charon fixes for utf8 Date: Fri, 2 Feb 2001 16:04:51 -0500 Topicbox-Message-UUID: 578cbe68-eac9-11e9-9e20-41e7f4b1d025 The attached diffs (to the 3rd edition free Inferno release) fix charon to correctly handle UTF8 documents when the only indication of being UTF8 is in the HTML header (rather than the HTTP header), as is the case for most UTF8 documents on the web (including the wiki ones, now). It looks like there is still a bug in the lexer, as viewing http://plan9.bell-labs.com/wiki/plan9/22 misparses one of the UTF8 sequences, but I don't think I did that. http://www.columbia.edu/kermit/utf8.html displays nicely too. Russ diff -n build.b old.build.b build.b:148,189 d old.build.b:147 < # must track Charsets in chutils.m < metacharsetnames := array[] of { < "unknown", < "us-ascii", < "iso-8859-1", < "utf-8" < }; < < # Return document's media type and chset (if found). < # If can't find either type, return old ones. 
< parsecontent(mtype, chset : int, s: string) : (int, int) < { < if(s == "") < return (mtype, chset); < < (ty, parms) := S->splitl(S->tolower(s), ";"); < mediatable := CU->makestrinttab(CU->mnames); < (fnd, val) := T->lookup(mediatable, trim_white(ty)); < if(fnd) { < mtype = val; < (n, l) := sys->tokenize(trim_white(parms[1:]), " \t"); < for(; l != nil; l = tl l) { < t := hd l; < if(len t > 8 && t[0:8] == "charset=") { < cval := -1; < for(i:=0; i<len metacharsetnames; i++) < if(metacharsetnames[i] == t[8:]) < cval = i; < if(cval >= 0) < chset = cval; < else if(warn) < sys->print("warning: unknown character set in %s\n", s); < } < } < } < else { < if(warn) < sys->print("warning: unknown media type in %s\n", s); < } < return (mtype, chset); < } < build.b:204,205 d old.build.b:161 < chset := di.chset; < mtype := is.ts.mtype; build.b:734,736 d old.build.b:689 < # change character set if specified in html header < is.ts.chset = di.chset = chset; < is.ts.mtype = mtype; build.b:974,975 d old.build.b:926 < "content-type" => < (mtype, chset) = parsecontent(mtype, chset, v); diff -n chutils.m old.chutils.m chutils.m:40 c old.chutils.m:40 < # Charsets (must track chsetnames in chutils.b, metacharsetnames in build.b) --- > # Charsets (must track chsetnames in chutils.b) diff -n lex.b old.lex.b lex.b:475,476 d old.lex.b:474 < if (tok.tag == Thead+RBRA) < break; lex.b:1130,1133 c old.lex.b:1128,1131 < if(unicodechar!=-1) { < ts.i=index; < return unicodechar; < } --- > if(unicodechar!=-1) { > ts.i=index; > return unicodechar; > }