From mboxrd@z Thu Jan 1 00:00:00 1970 From: erik quanstrom Date: Fri, 15 Jul 2011 12:28:27 -0400 To: 9fans@9fans.net Message-ID: In-Reply-To: <2142d47b30d21e30e6d05a708a0907a8@quintile.net> References: <2142d47b30d21e30e6d05a708a0907a8@quintile.net> MIME-Version: 1.0 Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: 8bit Subject: Re: [9fans] awk, big REs Topicbox-Message-UUID: 01b45880-ead7-11e9-9d60-3106f5b1d025 > it mostly works but part of the configure script fails where it runs > awk over a header file and uses a rather silly RE to find all defines/undefines. > > /^[\t ]*#[\t ]*(define|undef)[\t ]+[_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]*([\t (]|$)// { print; } > > For some reason they don't like character class ranges, and if I > replace the lists with ranges the RE works as I expect, > > /^[\t ]*#[\t ]*(define|undef)[\t ]+[_a-zA-Z][_a-zA-Z0-9]*([\t (]|$)/ { print; } > > I looked at the awk source and even upped MAXRE - though its > existing value of 512 looks fine to me. > > anyone any ideas why awk might not match with an RE like this? this is something i fixed when it came up on the list some time ago. i believe the patch was rejected, and done another way. so the diff is noisy. sorry. one remaining problem with the code on sources is NSPANS isn't appropriately sized for an a through z class for a reasonable alphabet. you also might try the awk in my contrib—it has a large number of fixes from bwk and is much more careful about converting between floating point and decimal so as not to cause spurious exceptions. 
- erik ; cd /sys/include ; 9diff regexp.h /n/sources/plan9//sys/include/regexp.h:1,6 - regexp.h:1,11 #pragma src "/sys/src/libregexp" #pragma lib "libregexp.a" + enum { + NSPANS = 128, /* max rune ranges per character class */ + NCLASS = 16, /* max character classes per program */ + }; + typedef struct Resub Resub; typedef struct Reclass Reclass; typedef struct Reinst Reinst; /n/sources/plan9//sys/include/regexp.h:27,33 - regexp.h:32,38 */ struct Reclass{ Rune *end; - Rune spans[64]; + Rune spans[NSPANS*2]; }; /* /n/sources/plan9//sys/include/regexp.h:52,58 - regexp.h:57,63 */ struct Reprog{ Reinst *startinst; /* start pc */ - Reclass class[16]; /* .data */ + Reclass class[NCLASS]; /* .data */ Reinst firstinst[5]; /* .text */ }; ; cd /sys/src/libregexp ; 9diff * /n/sources/plan9//sys/src/libregexp/regcomp.c:16,27 - regcomp.c:16,21 Reinst* last; }Node; - /* max character classes per program is nelem(reprog->class) */ - static Reprog *reprog; - - /* max rune ranges per character class is nelem(classp->spans)/2 */ - #define NCCRUNE nelem(classp->spans) - #define NSTACK 20 static Node andstack[NSTACK]; static Node *andp; /n/sources/plan9//sys/src/libregexp/regcomp.c:328,335 - regcomp.c:322,329 static Reclass* newclass(void) { - if(nclass >= nelem(reprog->class)) - rcerror("too many character classes; increase Reprog.class size"); + if(nclass >= NCLASS) + regerr2("too many character classes; limit", NCLASS+'0'); return &(classp[nclass++]); } /n/sources/plan9//sys/src/libregexp/regcomp.c:389,399 - regcomp.c:383,408 return RUNE; } + static void + debugspan(void) + { + #ifdef DEBUG + int i, nspan; + Rune r; + + nspan = yyclassp->end - yyclassp->spans >>1; + fprint(2, "nspan = %d\n", nspan); + p = yyclassp->spans; + for(i = 0; i < nspan; i++) + print("%C %C %.4ux %.4ux\n", p[2*i], p[2*i+1], p[2*i], p[2*i+1]); + #endif + } + static int bldcclass(void) { int type; - Rune r[NCCRUNE]; + Rune r[NSPANS*2]; Rune *p, *ep, *np; Rune rune; int quoted; 
/n/sources/plan9//sys/src/libregexp/regcomp.c:414,420 - regcomp.c:423,433 } /* parse class into a set of spans */ - while(ep < &r[NCCRUNE-1]){ + for(;;){ + if(ep == r + nelem(r)){ + rcerror("class too large"); + return 0; + } if(rune == 0){ rcerror("malformed '[]'"); return 0; /n/sources/plan9//sys/src/libregexp/regcomp.c:438,447 - regcomp.c:451,456 } quoted = nextc(&rune); } - if(ep >= &r[NCCRUNE-1]) { - rcerror("char class too large; increase Reclass.spans size"); - return 0; - } /* sort on span start */ for(p = r; p < ep; p += 2){ /n/sources/plan9//sys/src/libregexp/regcomp.c:465,474 - regcomp.c:474,482 np[0] = *p++; np[1] = *p++; for(; p < ep; p += 2) - /* overlapping or adjacent ranges? */ - if(p[0] <= np[1] + 1){ + if(p[0] <= np[1]+1){ if(p[1] >= np[1]) - np[1] = p[1]; /* coalesce */ + np[1] = p[1]; } else { np += 2; np[0] = p[0]; /n/sources/plan9//sys/src/libregexp/regcomp.c:475,480 - regcomp.c:483,489 np[1] = p[1]; } yyclassp->end = np+2; + debugspan(); } return type; /n/sources/plan9//sys/src/ape/lib/regexp/regcomp.h:1,23 - regcomp.h:1,17 /* * substitution list */ + enum { + NSUBEXP = 32, + LISTINCREMENT = 8, + }; + typedef struct Resublist Resublist; struct Resublist { - Resub m[32]; + Resub m[NSUBEXP]; }; - /* max subexpressions per program */ - Resublist ReSuBlIsT; - #define NSUBEXP (sizeof(ReSuBlIsT.m)/sizeof(Resub)) - - /* max character classes per program */ - Reprog RePrOg; - #define NCLASS (sizeof(RePrOg.class)/sizeof(Reclass)) - - /* max rune ranges per character class */ - #define NCCRUNE (sizeof(Reclass)/sizeof(wchar_t)) - /* * Actions and Tokens (Reinst types) * /n/sources/plan9//sys/src/ape/lib/regexp/regcomp.h:46,52 - regcomp.h:40,45 /* * regexec execution lists */ - #define LISTINCREMENT 8 typedef struct Relist Relist; struct Relist { ; lc mkfile regcomp.c regerror.c regsub.c rregsub.c regaux.c regcomp.h regexec.c rregexec.c ; find /sys/include|grep reg /sys/include/ape/regexp.h /sys/include/regexp.h ; 9diff /sys/include/ape/regexp.h 
/n/sources/plan9/sys/include/ape/regexp.h:35,43 - /sys/include/ape/regexp.h:35,50 /* * character class, each pair of rune's defines a range */ + enum{ + NCCRUNE = 256, + NCLASS = 16, + NINST = 5, + + }; + struct Reclass{ wchar_t *end; - wchar_t spans[64]; + wchar_t spans[NCCRUNE]; }; /* /n/sources/plan9/sys/include/ape/regexp.h:62,69 - /sys/include/ape/regexp.h:69,76 */ struct Reprog{ Reinst *startinst; /* start pc */ - Reclass class[16]; /* .data */ - Reinst firstinst[5]; /* .text */ + Reclass class[NCLASS]; /* .data */ + Reinst firstinst[NINST]; /* .text */ }; extern Reprog *regcomp(char*);