/* match.c: pattern matching routines */

#include "rc.h"

/* Tests the character at c against the class at p; defined further down. */
static int rangematch(const char *p, const char *c);

/* Negative results of rangematch(); any other result is an offset into the pattern. */
enum {
	RANGE_FAIL = -1,	/* the character is not in the class */
	RANGE_ERROR = -2	/* no closing bracket, or a truncated utf-8 sequence */
};

/* match() matches a single pattern against a single string. */

/* utf-8 support copyright © 2005 erik quanstrom with the same
    licencing terms as the rest of rc.

    since rc doesn't really do utf-8, we are going to pretend.
    relying on the properties of utf-8, we know that we can:

    1.  get away with byte-wise comparisons as long as we are not
        insisting that the next byte is the next character.
        ranges and the ? match operator need to be utf-8-aware.

    2.  we can compare 2 utf-8 characters without converting to
        unicode (PITA) by comparing length (longer is greater) and
        then bytewise. all we require is utf8len.

*/

/*
 * utf8len() returns the number of bytes in the utf-8 sequence that starts
 * at ss, judged from the lead byte alone:
 *
 *	0xxxxxxx	1 (ascii, including the terminating NUL)
 *	110xxxxx	2
 *	1110xxxx	3
 *	11110xxx	4
 *
 * A byte that cannot start a character -- a stray continuation byte
 * (10xxxxxx) or anything from 0xf8 up -- counts as 1, so that callers step
 * over garbage one byte at a time instead of swallowing the byte after it.
 *
 * The trailing bytes are not inspected here; the result may therefore reach
 * past the end of a truncated string.
 */
static int utf8len(const char* ss){
	const unsigned char c = *(const unsigned char*)ss;

	if (c < 0x80)
		return 1;
	if ((c & 0xe0) == 0xc0)
		return 2;
	if ((c & 0xf0) == 0xe0)
		return 3;
	if ((c & 0xf8) == 0xf0)
		return 4;
	return 1; /* bad: continuation byte or invalid lead byte */
}

/*
 * utf8cmp() orders two utf-8 characters without decoding them.
 * s1/l1 and s2/l2 are each a pointer to the first byte and a byte count.
 *
 * The shorter encoding sorts first; encodings of equal length are
 * compared byte by byte as unsigned values.
 *
 * Returns zero if the two are identical, a positive value if the second
 * character is the greater one, and a negative value if the first is.
 */
static int utf8cmp(const char* s1, int l1, const char* s2, int l2){
	const unsigned char *a = (const unsigned char *)s1;
	const unsigned char *b = (const unsigned char *)s2;
	int left;

	if (l1 != l2)
		return l2 - l1;

	for (left = l1; left > 0; left--, a++, b++) {
		if (*a != *b)
			return *b - *a;
	}
	return 0;
}

/*
 * nextc() steps s past one utf-8 character. It makes the enclosing
 * function return FALSE if the string ends before the character is
 * complete, which includes s already sitting on the terminating NUL.
 * Relies on the caller's locals `s' and `l'.
 */
#define nextc()  do { l = utf8len(s); do { if (!*s++) return FALSE; } while (--l); } while (0)

/*
 * match() tests the string s against the pattern p.
 *
 * m runs parallel to p: where m[i] is non-zero, p[i] is a metacharacter
 * (one of ?, * or [); where it is zero, p[i] must equal the next byte of s.
 * When m is NULL the pattern is compared literally with streq().
 *
 * Returns TRUE only if the pattern accounts for all of s.
 */
extern bool match(char *p, char *m, char *s) {
	int i = 0;	/* index into both p and m */
	int j;		/* result of rangematch() */
	int l;		/* scratch byte count for nextc() */

	if (m == NULL)
		return streq(p, s);

	for (;;) {
		/* pattern used up: it matches only if the string is too */
		if (p[i] == '\0')
			return *s == '\0';

		if (!m[i]) {
			/* ordinary byte: must be matched exactly */
			if (p[i] != *s)
				return FALSE;
			i++;
			s++;
			continue;
		}

		switch (p[i++]) {
		case '?':
			/* any one character */
			nextc();
			break;
		case '*':
			/* collapse multiple stars */
			while (p[i] == '*' && m[i] == 1)
				i++;
			/* star at end of pattern? */
			if (p[i] == '\0')
				return TRUE;
			/* try the rest of the pattern at each successive character of s */
			while (*s != '\0') {
				if (match(p + i, m + i, s))
					return TRUE;
				nextc();
			}
			return FALSE;
		case '[':
			if (*s == '\0')
				return FALSE;
			j = rangematch(p + i, s);
			if (j == RANGE_FAIL)
				return FALSE;
			if (j == RANGE_ERROR) {
				/* unusable class: the '[' only matches itself */
				if (*s != '[')
					return FALSE;
			} else {
				i += j;	/* step over the class and its closing bracket */
			}
			s += utf8len(s);
			break;
		default:
			panic("bad metacharacter in match");
			/* NOTREACHED */
			return FALSE; /* hush up gcc -Wall */
		}
	}
}

/*
   From the ed(1) man pages (on ranges):

	The `-' is treated as an ordinary character if it occurs first
	(or first after an initial ^) or last in the string.

	The right square bracket does not terminate the enclosed string
	if it is the first character (after an initial `^', if any), in
	the bracketed string.

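   rangematch() matches a single (possibly multi-byte) character against
   a class, and returns an integer offset to the end of the range on
   success, RANGE_FAIL (-1) if the character is not in the class, or
   RANGE_ERROR (-2) if the class has no closing bracket or a utf-8
   sequence is cut short by the end of the string.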
*/

/*
 * rc does not validate its strings, so a lead byte may promise more bytes
 * than remain before the terminating NUL; let's be paranoid. check() makes
 * the enclosing function return RANGE_ERROR if any of the first l bytes of
 * s is NUL. Relies on the caller's local `k'.
 */
#define check(s,l)  do { for (k = 0; k < (l); k++) if (!(s)[k]) return RANGE_ERROR; } while (0)

/*
 * rangematch() tests the single utf-8 character at c against a bracketed
 * class. p points just past the opening '['.
 *
 * A leading '~' negates the class. A ']' in first position (after the
 * optional '~') is an ordinary member. "x-y" is a range unless the '-' is
 * immediately followed by the closing ']'.
 *
 * Returns the number of pattern bytes to skip, closing bracket included,
 * when the character is accepted; RANGE_FAIL when it is rejected; and
 * RANGE_ERROR when the class is unterminated or a utf-8 sequence in either
 * argument is cut short.
 */
static int rangematch(const char* p, const char* c){
	const char *start = p;
	bool neg = (*p == '~');
	bool matched = FALSE;
	int cl;		/* byte length of the character at c */
	int l;		/* byte length of the current class member */
	int lo;		/* how c compares with the low end of a range */
	int n;
	int k;		/* used by check() */

	if (neg)
		p++;
	if (*p == ']') {
		matched = (*c == ']');
		p++;
	}

	cl = utf8len(c);
	check(c, cl);

	while (*p != ']') {
		if (*p == '\0')
			return RANGE_ERROR;

		l = utf8len(p);
		check(p, l);

		if (p[l] == '-' && p[l + 1] != ']') {
			/* [..-..]: accept when low <= c <= high */
			lo = utf8cmp(p, l, c, cl);
			p += l + 1;
			l = utf8len(p);
			check(p, l);
			matched |= (lo >= 0 && utf8cmp(p, l, c, cl) <= 0);
		} else if (cl == l) {
			/* single member: same length, so compare every byte */
			for (n = 0; n < l && p[n] == c[n]; n++)
				;
			matched |= (n == l);
		}

		p += l;
	}

	/* on success, also skip the right-bracket */
	return (matched ^ neg) ? (int)(p - start) + 1 : RANGE_FAIL;
}