From mboxrd@z Thu Jan  1 00:00:00 1970
From: "Russ Cox" <rsc@plan9.bell-labs.com>
To: 9fans@cse.psu.edu, help@vitanuova.com
MIME-Version: 1.0
Content-Type: text/plain; charset="US-ASCII"
Content-Transfer-Encoding: 7bit
Message-Id: <20010202210516.04B41199D7@mail.cse.psu.edu>
Subject: [9fans] charon fixes for utf8
Date: Fri,  2 Feb 2001 16:04:51 -0500
Topicbox-Message-UUID: 578cbe68-eac9-11e9-9e20-41e7f4b1d025

The attached diffs (against the 3rd edition free Inferno
release) fix charon so that it correctly handles
UTF-8 documents whose only indication of being UTF-8
is in the HTML header (rather than the HTTP header),
as is the case for most UTF-8 documents on the web
(including the wiki ones, now).  Each diff runs from the fixed
file to the old one, so the lines marked `<' are the new code.

It looks like there is still a bug in the lexer:
viewing http://plan9.bell-labs.com/wiki/plan9/22
misparses one of the UTF-8 sequences, but I don't think
these changes introduced it.

http://www.columbia.edu/kermit/utf8.html displays nicely too.

Russ

diff -n build.b old.build.b
build.b:148,189 d old.build.b:147
< # Charset names matched by parsecontent; the array index becomes the chset value, so the order must track Charsets in chutils.m
< metacharsetnames := array[] of {
< 	"unknown",
< 	"us-ascii",
< 	"iso-8859-1",
< 	"utf-8"
< };
<
< # Parse a content-type value s ("type/subtype; charset=name") and return the (media type, chset) pair.
< # An unrecognized media type leaves both unchanged; a missing or unknown charset leaves chset unchanged.
< parsecontent(mtype, chset : int, s: string) : (int, int)
< {
< 	if(s == "")
< 		return (mtype, chset);
<
< 	(ty, parms) := S->splitl(S->tolower(s), ";");	# lower-cased once so the comparisons below ignore case
< 	mediatable := CU->makestrinttab(CU->mnames);
< 	(fnd, val) := T->lookup(mediatable, trim_white(ty));
< 	if(fnd) {
< 		mtype = val;
< 		(nil, l) := sys->tokenize(trim_white(parms), "; \t");	# ';' is a delimiter: parms is "" when s has no ';', so parms[1:] must not be sliced
< 		for(; l != nil; l = tl l) {
< 			t := hd l;
< 			if(len t > 8 && t[0:8] == "charset=") {
< 				cval := -1;
< 				for(i:=0; i<len metacharsetnames; i++)
< 					if(t[8:] == metacharsetnames[i])
< 						cval = i;	# index into metacharsetnames is used as the chset value
< 				if(cval >= 0)
< 					chset = cval;
< 				else if(warn)
< 					sys->print("warning: unknown character set in %s\n", s);
< 			}
< 		}
< 	}
< 	else {
< 		if(warn)
< 			sys->print("warning: unknown media type in %s\n", s);
< 	}
< 	return (mtype, chset);
< }
<
build.b:204,205 d old.build.b:161
< 	chset := di.chset;
< 	mtype := is.ts.mtype;
build.b:734,736 d old.build.b:689
< 			# change character set if specified in html header
< 			is.ts.chset = di.chset = chset;
< 			is.ts.mtype = mtype;
build.b:974,975 d old.build.b:926
< 				"content-type" =>
< 					(mtype, chset) = parsecontent(mtype, chset, v);
diff -n chutils.m old.chutils.m
chutils.m:40 c old.chutils.m:40
< 	# Charsets  (must track chsetnames in chutils.b, metacharsetnames in build.b)
---
> 	# Charsets  (must track chsetnames in chutils.b)
diff -n lex.b old.lex.b
lex.b:475,476 d old.lex.b:474
< 					if (tok.tag == Thead+RBRA)
< 						break;
lex.b:1130,1133 c old.lex.b:1128,1131
< 	if(unicodechar!=-1) {
< 		ts.i=index;
< 		return unicodechar;
< 	}
---
>         if(unicodechar!=-1) {
>                         ts.i=index;
>                         return unicodechar;
>         }