* [9fans] minimal changes to make byron's rc utf-8 compatable.
@ 2005-10-14 0:15 erik quanstrom
0 siblings, 0 replies; only message in thread
From: erik quanstrom @ 2005-10-14 0:15 UTC (permalink / raw)
To: 9fans
[-- Attachment #1: Type: text/plain, Size: 538 bytes --]
9fans,
i realize that this is not the ideal forum but it seems that
rc-fans@hawkwind.utcs.toronto.ca is defunct and tim goodwin
hasn't (yet) answered my email...
this is all you need to make byron's rc utf-8 compatible.
you should be able to do this
; α = 1; echo $α
and this
; ~ α [αβ] &&
~ α . &&
~ β [α-γ] && echo works
with this patch.
i have not tested this with 4-byte utf-8 sequences, due to the fact
that p9p doesn't support 32bit unicode. but, barring a typo they should work.
erik
[-- Attachment #2: Type: text/plain, Size: 4058 bytes --]
/* match.c: pattern matching routines */
#include "rc.h"
/* Forward declaration: match() calls rangematch(), which is defined further down. */
static int rangematch(const char*, const char*);
/*
Negative results of rangematch(); any other result is an offset into the pattern.
RANGE_FAIL: the class is well formed but the character is not accepted by it.
RANGE_ERROR: the class is malformed (no closing ']', or a utf-8 sequence cut short by NUL).
*/
enum { RANGE_FAIL = -1, RANGE_ERROR = -2 };
/* match() matches a single pattern against a single string. */
/* utf-8 support copyright © 2005 erik quanstrom, under the same
licensing terms as the rest of rc.
rc doesn't really do utf-8, so we are going to pretend, relying
on two properties of the encoding:
1. byte-wise comparisons are enough as long as we do not insist
that the next byte is the next character; only ranges and the ?
match operator need to be utf-8-aware.
2. two utf-8 characters can be compared without converting to
unicode (PITA) by comparing their encoded lengths first (longer
is greater) and then their bytes. all we require is utf8len.
*/
/*
utf8len() returns the number of bytes occupied by the utf-8 character
that starts at ss, judged from its first byte only:
  0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
Any byte that cannot start a character (a stray continuation byte
10xxxxxx, or 11111xxx) counts as a 1-byte character so that callers
resynchronise on the very next byte instead of swallowing a valid one.
The result is never 0, which the callers' "do ... while (--l)" loops
rely on.  The continuation bytes themselves are not inspected here;
callers must check that the string does not end early.
*/
static int utf8len(const char* ss){
	const unsigned char *s = (const unsigned char *)ss;
	int c = *s;

	if (c < 0x80)
		return 1;		/* ascii, including the terminating NUL */
	if ((c & 0xe0) == 0xc0)
		return 2;
	if ((c & 0xf0) == 0xe0)
		return 3;
	if ((c & 0xf8) == 0xf0)
		return 4;
	return 1;			/* bad lead byte: treat it as a single byte */
}
/*
utf8cmp() orders two utf-8 characters given as (pointer, byte length).
The result is positive when the second character sorts after the first,
negative when it sorts before, and 0 when they are identical.  A longer
encoding always sorts after a shorter one; encodings of equal length
are ordered by their first differing byte, compared as unsigned values.
*/
static int utf8cmp(const char* s1, int l1, const char* s2, int l2){
	const unsigned char *a = (const unsigned char *)s1;
	const unsigned char *b = (const unsigned char *)s2;
	int left;
	int diff;

	if (l1 != l2)
		return l2 - l1;
	for (left = l1; left > 0; left--, a++, b++) {
		diff = *b - *a;
		if (diff != 0)
			return diff;
	}
	return 0;
}
/*
nextc() steps s past one (possibly multi-byte) utf-8 character, using
the local l as scratch.  If the string ends before the character is
complete it makes the enclosing function return FALSE.  The body is
wrapped in do { } while (0) so the expansion is a single statement and
stays correct under an unbraced if/else or loop.
*/
#define nextc() do { \
		l = utf8len(s); \
		do { \
			if (!*s++) \
				return FALSE; \
		} while (--l); \
	} while (0)

/*
match() decides whether the whole string s matches the pattern p.

p  the pattern text.
m  flags parallel to p: a nonzero m[i] means p[i] is a live
   metacharacter ('?', '*' or '['); a zero entry means p[i] must match
   literally.  A NULL m means p holds no metacharacters, and the result
   is simply streq(p, s).
s  the subject string.

Returns TRUE on a match and FALSE otherwise.  Literal bytes are compared
one at a time; '?', '*' and '[...]' consume one whole utf-8 character
per step.  A subject that ends in the middle of a multi-byte sequence
never matches a metacharacter.
*/
extern bool match(char *p, char *m, char *s) {
	int i, j;
	int l;

	if (m == NULL)
		return streq(p, s);
	i = 0;
	while (1) {
		if (p[i] == '\0')
			return *s == '\0';
		else if (m[i]) {
			switch (p[i++]) {
			case '?':
				nextc();
				break;
			case '*':
				while (p[i] == '*' && m[i] == 1)	/* collapse multiple stars */
					i++;
				if (p[i] == '\0')	/* star at end of pattern? */
					return TRUE;
				/* try the rest of the pattern at every character boundary of s */
				while (*s != '\0') {
					if (match(p + i, m + i, s)) {
						return TRUE;
					}
					nextc();
				}
				return FALSE;
			case '[':
				if (*s == '\0')
					return FALSE;
				switch (j = rangematch(p + i, s)) {
				default:
					i += j;		/* skip the class, including its ']' */
					break;
				case RANGE_FAIL:
					return FALSE;
				case RANGE_ERROR:
					/* malformed class: the '[' only matches a literal '[' */
					if (*s != '[')
						return FALSE;
					break;
				}
				s += utf8len(s);
				break;
			default:
				panic("bad metacharacter in match");
				/* NOTREACHED */
				return FALSE;	/* hush up gcc -Wall */
			}
		} else if (p[i++] != *s++)
			return FALSE;
	}
}
/*
From the ed(1) man pages (on ranges):
The `-' is treated as an ordinary character if it occurs first
(or first after an initial ^) or last in the string.
The right square bracket does not terminate the enclosed string
if it is the first character (after an initial `^', if any), in
the bracketed string.
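rangematch() matches a single (possibly multi-byte utf-8) character
against a class, and returns an integer offset to the end of the range
on success, RANGE_FAIL (-1) if the character is not in the class, or
RANGE_ERROR (-2) if the class is malformed. Note that rc negates a
class with a leading `~' where ed uses `^'.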
*/
/*
rc never validates its input, so a multi-byte sequence may be cut short
by the terminating NUL.  cutshort() reports (nonzero) whether any of the
first n bytes at s is NUL.
*/
static int cutshort(const char *s, int n) {
	int k;

	for (k = 0; k < n; k++)
		if (s[k] == '\0')
			return 1;
	return 0;
}

/*
rangematch() tests the single utf-8 character at c against the character
class whose text starts at p, just after the opening '['.

A leading '~' negates the class; a ']' in first position (after any '~')
is an ordinary member; "x-y" is an inclusive range unless the '-' is
immediately followed by the closing ']'.

Returns the offset from p to just past the closing ']' when the
character is accepted, RANGE_FAIL when it is not, and RANGE_ERROR when
the class has no closing ']' or when the subject or a class member is a
truncated multi-byte sequence.
*/
static int rangematch(const char* p, const char* c){
	const char *start = p;
	bool negate = (*p == '~');
	bool hit = FALSE;
	int clen, plen;
	int lo;
	int n;

	if (negate)
		p++;
	if (*p == ']') {
		p++;
		hit = (*c == ']');
	}
	clen = utf8len(c);
	if (cutshort(c, clen))
		return RANGE_ERROR;
	while (*p != ']') {
		if (*p == '\0')
			return RANGE_ERROR;
		plen = utf8len(p);
		if (cutshort(p, plen))
			return RANGE_ERROR;
		if (p[plen] == '-' && p[plen + 1] != ']') {	/* [..-..] but not [..-] */
			lo = utf8cmp(p, plen, c, clen);		/* >= 0 when c is at or above the low end */
			p += plen + 1;
			plen = utf8len(p);
			if (cutshort(p, plen))
				return RANGE_ERROR;
			if (lo >= 0 && utf8cmp(p, plen, c, clen) <= 0)
				hit = 1;
		} else if (clen == plen) {
			/* single member: equal only if every byte agrees */
			for (n = 0; n != plen && p[n] == c[n]; n++)
				;
			if (n == plen)
				hit = 1;
		}
		p += plen;
	}
	if (hit != negate)
		return p - start + 1;	/* skip the right-bracket */
	return RANGE_FAIL;
}
[-- Attachment #3: Type: text/plain, Size: 11000 bytes --]
/* lex.c: rc's lexical analyzer */
#include "rc.h"
#include "parse.h"
/*
Special characters (i.e., "non-word") in rc:
\t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \
The lexical analyzer is fairly straightforward. The only really
unclean part concerns backslash continuation and "double
backslashes". A backslash followed by a newline is treated as a
space, otherwise backslash is not a special character (i.e.,
it can be part of a word). This introduces a host of unwanted
special cases. In our case, \ cannot be a word character, since
we wish to read in all word characters in a tight loop.
Note: to save the trouble of declaring these arrays with TRUEs
and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is
it declared in rc.h)
*/
#define BUFSIZE ((size_t) 1000) /* malloc hates power of 2 buffers? */
#define BUFMAX (8 * BUFSIZE) /* How big the buffer can get before we re-allocate the
space at BUFSIZE again. Premature optimization? Maybe.
*/
typedef enum wordstates {
NW, RW, KW /* "nonword", "realword", "keyword" */
} wordstates;
static void getpair(int);
int lineno;
/*
nw[]: "non-word" table with one entry per byte value, laid out as 8 rows of 32.
A nonzero entry marks a byte that cannot be part of an ordinary word.
Every entry for the bytes 0x80-0xff is 0, so the bytes of a multi-byte utf-8
sequence are all accepted as word characters.
*/
/* does not check for valid utf-8; alternative is changing gchar() to return a Rune */
const char nw[] = {
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
dnw[]: the "non-word" table used while scanning a variable name (the token
right after a '$'). Same layout as nw[]: one entry per byte value, 8 rows of 32,
nonzero meaning "not allowed in the name".
In the ascii half only the digits, the letters, '*' and '_' are 0.
Every entry for the bytes 0x80-0xff is 0, so utf-8 multi-byte characters are
accepted in variable names.
*/
const char dnw[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Current capacity of realbuf, in bytes. */
static size_t bufsize = BUFSIZE;
/* Token buffer, allocated by inityy() and grown by yylex(). */
static char *realbuf = NULL;
/* Set when yylex() returns a newline token; makes the next call print a prompt first. */
static bool newline = FALSE;
/* Set by scanerror(); makes the next call to yylex() return '\n' once. */
static bool errset = FALSE;
/* Set by scanerror(); makes the next call to yyerror() stay silent. */
static bool prerror = FALSE;
/* Classification (NW/RW/KW) of the token scanned most recently. */
static wordstates w = NW;
/* Results of getpair(): the two numbers of a [left=right] specifier, or UNSET/CLOSED. */
static int fd_left, fd_right;
/*
If the previous token was a word or keyword (w != NW), push the current
character c back and return '^' from the enclosing function.
*/
#define checkfreecaret {if (w != NW) { w = NW; ugchar(c); return '^'; }}
/* Special values for fd_left/fd_right: UNSET = no number was given, CLOSED = "[n=]". */
enum filedescriptors {
UNSET = -9, CLOSED = -1
};
/*
quotep() reports whether the string s contains any byte that the scanner
would not accept inside a bare word, i.e. whether s needs quoting.
When dollar is true the stricter variable-name table (dnw) is consulted
instead of the ordinary word table (nw).
*/
extern bool quotep(char *s, bool dollar) {
	const char *table = dnw;
	const unsigned char *cp;

	if (!dollar)
		table = nw;
	/* walk the bytes as unsigned values so those >= 0x80 index the table correctly */
	for (cp = (const unsigned char *) s; *cp != '\0'; cp++)
		if (table[*cp])
			return TRUE;
	return FALSE;
}
/*
yylex() scans the next token from the input (read one character at a time
with gchar()) and returns its token code; a token's value, if any, is stored
in yylval.

Results visible in this function:
END at end of input;
WORD, with yylval.word.w = a copy of the text, .m = per-byte metacharacter
flags (NULL if the word has no '?', '[' or '*') and .q = TRUE only for a
quoted string;
a keyword token (IF, FN, IN, FOR, ELSE, SWITCH, WHILE, CASE);
'^' whenever a word is directly followed by another word-like token;
PIPE, REDIR, SREDIR or DUP with the descriptors stored in yylval;
HUH after scanerror() has reported a problem;
otherwise the character itself.

State carried between calls: the static `dollar' (the previous token was '$',
so this token is scanned with the variable-name table dnw) and the file-scope
w, newline and errset.
*/
extern int yylex() {
static bool dollar = FALSE;
bool saw_meta = FALSE;
int c;
size_t i; /* The purpose of all these local assignments is to */
const char *meta; /* allow optimizing compilers like gcc to load these */
char *buf = realbuf; /* values into registers. On a sparc this is a */
YYSTYPE *y = &yylval; /* win, in code size *and* execution time */
/* a scan error was reported on the previous call: hand back a newline token once */
if (errset) {
errset = FALSE;
return '\n';
}
/* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */
meta = (dollar ? dnw : nw);
dollar = FALSE;
if (newline) {
--lineno; /* slight space optimization; print_prompt2() always increments lineno */
print_prompt2();
newline = FALSE;
}
/* skip blanks; white space also ends any pending word, so no '^' is inserted across it */
top: while ((c = gchar()) == ' ' || c == '\t')
w = NW;
if (c == EOF)
return END;
/* the cast keeps bytes >= 0x80 (utf-8) from becoming negative table indices */
if (!meta[(unsigned char) c]) { /* it's a word or keyword. */
checkfreecaret;
w = RW;
i = 0;
/* collect word bytes, doubling the buffer whenever it fills up */
read: do {
buf[i++] = c;
if (c == '?' || c == '[' || c == '*')
saw_meta = TRUE;
if (i >= bufsize)
buf = realbuf = erealloc(buf, bufsize *= 2);
} while ((c = gchar()) != EOF && !meta[(unsigned char) c]);
/* a backslash either continues the line (backslash-newline) or is part of the word */
while (c == '\\') {
if ((c = gchar()) == '\n') {
print_prompt2();
c = ' '; /* Pretend a space was read */
break;
} else {
/* also entered from the '\\' case of the switch below, with i == 0 */
bs: if (meta != dnw) { /* all words but varnames may have a bslash */
buf[i++] = '\\';
if (i >= bufsize)
buf = realbuf = erealloc(buf, bufsize *= 2);
if (!meta[(unsigned char) c])
goto read;
} else {
ugchar(c);
c = '\\';
break;
}
}
}
/* push back the character that ended the word */
ugchar(c);
buf[i] = '\0';
/* assume a keyword; w is reset to RW below if the text is not one */
w = KW;
if (i == 2) {
if (*buf == 'i' && buf[1] == 'f') return IF;
if (*buf == 'f' && buf[1] == 'n') return FN;
if (*buf == 'i' && buf[1] == 'n') return IN;
}
if (streq(buf, "for")) return FOR;
if (streq(buf, "else")) return ELSE;
if (streq(buf, "switch")) return SWITCH;
if (streq(buf, "while")) return WHILE;
if (streq(buf, "case")) return CASE;
w = RW;
y->word.w = ncpy(buf);
if (saw_meta) {
char *r, *s;
/* build the flag array that parallels the word: 1 where the byte is '?', '[' or '*' */
y->word.m = nalloc(strlen(buf) + 1);
for (r = buf, s = y->word.m; *r != '\0'; r++, s++)
*s = (*r == '?' || *r == '[' || *r == '*');
} else {
y->word.m = NULL;
}
y->word.q = FALSE;
return WORD;
}
/* these characters start a token that concatenates with a preceding word, so a '^' may be due first */
if (c == '`' || c == '!' || c == '@' || c == '~' || c == '$' || c == '\'') {
checkfreecaret;
if (c == '!' || c == '@' || c == '~')
w = KW;
}
switch (c) {
case '!':
return BANG;
case '@':
return SUBSHELL;
case '~':
return TWIDDLE;
case '`':
c = gchar();
if (c == '`')
return BACKBACK;
ugchar(c);
return '`';
case '$':
/* remember that the next token is a variable name */
dollar = TRUE;
c = gchar();
if (c == '#')
return COUNT;
if (c == '^')
return FLAT;
ugchar(c);
return '$';
case '\'':
w = RW;
i = 0;
/* double ' to quote it, like this: 'how''s it going?' */
while ((c = gchar()) != '\'' || (c = gchar()) == '\'') {
buf[i++] = c;
if (c == '\n')
print_prompt2();
if (c == EOF) {
w = NW;
scanerror("eof in quoted string");
return HUH;
}
if (i >= bufsize)
buf = realbuf = erealloc(buf, bufsize *= 2);
}
/* the loop read one character past the closing quote; give it back */
ugchar(c);
buf[i] = '\0';
y->word.w = ncpy(buf);
y->word.m = NULL;
y->word.q = TRUE;
return WORD;
case '\\':
if ((c = gchar()) == '\n') {
print_prompt2();
goto top; /* Pretend it was just another space. */
}
/* a word that starts with a backslash: emit a pending '^' first, then join the word scanner at bs */
ugchar(c);
c = '\\';
checkfreecaret;
c = gchar();
i = 0;
goto bs;
case '(':
if (w == RW) /* SUB's happen only after real words, not keyowrds, so if () and while () work */
c = SUB;
w = NW;
return c;
case '#':
while ((c = gchar()) != '\n') /* skip comment until newline */
if (c == EOF)
return END;
/* FALLTHROUGH */
case '\n':
lineno++;
newline = TRUE;
/* FALLTHROUGH */
case ';':
case '^':
case ')':
case '=':
case '{': case '}':
w = NW;
return c;
case '&':
w = NW;
c = gchar();
if (c == '&')
return ANDAND;
ugchar(c);
return '&';
case '|':
w = NW;
c = gchar();
if (c == '|')
return OROR;
/* optional [left=right] descriptor pair after the '|' */
getpair(c);
if (errset)
return HUH;
if ((y->pipe.left = fd_left) == UNSET)
y->pipe.left = 1; /* default to fd 1 */
if ((y->pipe.right = fd_right) == UNSET)
y->pipe.right = 0; /* default to fd 0 */
if (y->pipe.right == CLOSED) {
scanerror("expected digit after '='"); /* can't close a pipe */
return HUH;
}
return PIPE;
case '>':
c = gchar();
if (c == '>') {
c = gchar();
y->redir.type = rAppend;
} else
y->redir.type = rCreate;
y->redir.fd = 1;
goto common;
case '<':
c = gchar();
if (c == '<') {
c = gchar();
if (c == '<') {
c = gchar();
y->redir.type = rHerestring;
} else {
y->redir.type = rHeredoc;
}
} else
y->redir.type = rFrom;
y->redir.fd = 0;
/* shared tail of the '>' and '<' cases: parse an optional [n] or [n=m] suffix */
common:
w = NW;
getpair(c);
if (errset)
return HUH;
if (fd_right == UNSET) { /* redirection, not dup */
if (fd_left != UNSET) {
y->redir.fd = fd_left;
return SREDIR;
}
return (y->redir.type == rFrom || y->redir.type == rCreate) ? REDIR : SREDIR;
} else { /* dup; recast yylval */
y->dup.type = y->redir.type;
y->dup.left = fd_left;
y->dup.right = fd_right;
return DUP;
}
default:
w = NW;
return c; /* don't know what it is, let yacc barf on it */
}
}
/*
yyerror() prints the error message s on file descriptor 2.

If prerror is set (scanerror() has already printed something more
informative) the call only clears the flag and prints nothing.
In interactive mode just the message is printed.  Otherwise the message
is prefixed with the line number and followed by a description of where
the scanner was: the current token text, "eof", "end of line", or the
last character read.
*/
extern void yyerror(const char *s) {
	char *where;

	if (prerror) {	/* don't print "syntax error" if there's a more informative scanerror */
		prerror = FALSE;
		return;
	}
	if (interactive) {
		fprint(2, "%s\n", s);
		return;
	}
	if (w != NW)
		where = realbuf;
	else if (lastchar == EOF)
		where = "eof";
	else if (lastchar == '\n')
		where = "end of line";
	else if (lastchar < 32 || lastchar > 126)
		where = nprint("(decimal %d)", lastchar);	/* not printable ascii */
	else
		where = nprint("'%c'", lastchar);
	/* when the last character read was the newline, report the line it ended */
	fprint(2, "line %d: %s near %s\n", lineno - (lastchar == '\n'), s, where);
}
/*
scanerror() reports a lexical error found by the scanner itself.
The message is printed through yyerror() before prerror is raised, so
this report is shown while the next call to yyerror() is silenced.
errset makes the next call to yylex() return a newline token.
*/
extern void scanerror(char *s) {
	flushu();	/* flush upto newline */
	yyerror(s);
	prerror = TRUE;
	errset = TRUE;
}
/*
inityy() resets the scanner state (newline, w, hq) and makes sure the
token buffer exists.  A buffer that has grown beyond BUFMAX is released
and replaced by a fresh one of BUFSIZE bytes; a buffer within the limit
is kept as it is.
*/
extern void inityy() {
	newline = FALSE;
	w = NW;
	hq = NULL;
	if (realbuf != NULL) {
		if (bufsize <= BUFMAX)
			return;		/* existing buffer is fine */
		/* return memory to the system if the buffer got too large */
		efree(realbuf);
		bufsize = BUFSIZE;
	}
	realbuf = ealloc(bufsize);
}
/*
Scan in a pair of integers for redirections like >[2=1].  c is the
character that follows the redirection operator; if it is not '[' it is
pushed back and both descriptors are left UNSET.

On return fd_left and fd_right hold the result: CLOSED represents a
closed file descriptor (i.e., >[2=]) and UNSET represents an
undesignated file descriptor (e.g., >[2] is represented as (2,UNSET)).
Syntax errors are reported through scanerror().
*/
static void getpair(int c) {
	int num;
	int ch;

	fd_left = UNSET;
	fd_right = UNSET;
	if (c != '[') {
		ugchar(c);
		return;
	}
	/* left-hand number: at least one digit is required */
	ch = gchar();
	if (ch < '0' || ch > '9') {
		scanerror("expected digit after '['");
		return;
	}
	num = ch - '0';
	while ((ch = gchar()) >= '0' && ch <= '9')
		num = num * 10 + (ch - '0');
	fd_left = num;
	if (ch == ']')
		return;
	if (ch != '=') {
		scanerror("expected '=' or ']' after digit");
		return;
	}
	/* right-hand side: either nothing (close the descriptor) or a number */
	ch = gchar();
	if (ch < '0' || ch > '9') {
		if (ch != ']') {
			scanerror("expected digit or ']' after '='");
			return;
		}
		fd_right = CLOSED;
		return;
	}
	num = ch - '0';
	while ((ch = gchar()) >= '0' && ch <= '9')
		num = num * 10 + (ch - '0');
	if (ch != ']') {
		scanerror("expected ']' after digit");
		return;
	}
	fd_right = num;
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2005-10-14 0:15 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-10-14 0:15 [9fans] minimal changes to make byron's rc utf-8 compatable erik quanstrom
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).