From: erik quanstrom <quanstro@labs.coraid.com>
To: 9fans@9fans.net
Subject: Re: [9fans] awk, big REs
Date: Fri, 15 Jul 2011 12:28:27 -0400 [thread overview]
Message-ID: <a696271bce408b638bc9fd29683d5244@coraid.com> (raw)
In-Reply-To: <2142d47b30d21e30e6d05a708a0907a8@quintile.net>
> it mostly works but part of the configure script fails where it runs
> awk over a header file and uses a rather silly RE to find all defines/undefines.
>
> /^[\t ]*#[\t ]*(define|undef)[\t ]+[_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]*([\t (]|$)// { print; }
>
> For some reason they don't like character class ranges, and if I
> replace the lists with ranges the RE works as I expect,
>
> /^[\t ]*#[\t ]*(define|undef)[\t ]+[_a-zA-Z][_a-zA-Z0-9]*([\t (]|$)/ { print; }
>
> I looked at the awk source and even upped MAXRE - though its
> existing value of 512 looks fine to me.
>
> anyone any ideas why awk might not match with an RE like this?
this is something i fixed when it came up on the list some time
ago. i believe the patch was rejected, and done another way.
so the diff is noisy. sorry.
one remaining problem with the code on sources is NSPANS
isn't appropriately sized for an a through z class for a reasonable
alphabet. you also might try the awk in my contrib—it has a large
number of fixes from bwk and is much more careful about converting
between floating point and decimal so as not to cause spurious
exceptions.
- erik
; cd /sys/include
; 9diff regexp.h
/n/sources/plan9//sys/include/regexp.h:1,6 - regexp.h:1,11
#pragma src "/sys/src/libregexp"
#pragma lib "libregexp.a"
+ enum {
+ NSPANS = 128, /* max rune ranges per character class */
+ NCLASS = 16, /* max character classes per program */
+ };
+
typedef struct Resub Resub;
typedef struct Reclass Reclass;
typedef struct Reinst Reinst;
/n/sources/plan9//sys/include/regexp.h:27,33 - regexp.h:32,38
*/
struct Reclass{
Rune *end;
- Rune spans[64];
+ Rune spans[NSPANS*2];
};
/*
/n/sources/plan9//sys/include/regexp.h:52,58 - regexp.h:57,63
*/
struct Reprog{
Reinst *startinst; /* start pc */
- Reclass class[16]; /* .data */
+ Reclass class[NCLASS]; /* .data */
Reinst firstinst[5]; /* .text */
};
; cd /sys/src/libregexp
; 9diff *
/n/sources/plan9//sys/src/libregexp/regcomp.c:16,27 - regcomp.c:16,21
Reinst* last;
}Node;
- /* max character classes per program is nelem(reprog->class) */
- static Reprog *reprog;
-
- /* max rune ranges per character class is nelem(classp->spans)/2 */
- #define NCCRUNE nelem(classp->spans)
-
#define NSTACK 20
static Node andstack[NSTACK];
static Node *andp;
/n/sources/plan9//sys/src/libregexp/regcomp.c:328,335 - regcomp.c:322,329
static Reclass*
newclass(void)
{
- if(nclass >= nelem(reprog->class))
- rcerror("too many character classes; increase Reprog.class size");
+ if(nclass >= NCLASS)
+ regerr2("too many character classes; limit", NCLASS+'0');
return &(classp[nclass++]);
}
/n/sources/plan9//sys/src/libregexp/regcomp.c:389,399 - regcomp.c:383,408
return RUNE;
}
+ static void
+ debugspan(void)
+ {
+ #ifdef DEBUG
+ int i, nspan;
+ Rune r;
+
+ nspan = yyclassp->end - yyclassp->spans >>1;
+ fprint(2, "nspan = %d\n", nspan);
+ p = yyclassp->spans;
+ for(i = 0; i < nspan; i++)
+ print("%C %C %.4ux %.4ux\n", p[2*i], p[2*i+1], p[2*i], p[2*i+1]);
+ #endif
+ }
+
static int
bldcclass(void)
{
int type;
- Rune r[NCCRUNE];
+ Rune r[NSPANS*2];
Rune *p, *ep, *np;
Rune rune;
int quoted;
/n/sources/plan9//sys/src/libregexp/regcomp.c:414,420 - regcomp.c:423,433
}
/* parse class into a set of spans */
- while(ep < &r[NCCRUNE-1]){
+ for(;;){
+ if(ep == r + nelem(r)){
+ rcerror("class too large");
+ return 0;
+ }
if(rune == 0){
rcerror("malformed '[]'");
return 0;
/n/sources/plan9//sys/src/libregexp/regcomp.c:438,447 - regcomp.c:451,456
}
quoted = nextc(&rune);
}
- if(ep >= &r[NCCRUNE-1]) {
- rcerror("char class too large; increase Reclass.spans size");
- return 0;
- }
/* sort on span start */
for(p = r; p < ep; p += 2){
/n/sources/plan9//sys/src/libregexp/regcomp.c:465,474 - regcomp.c:474,482
np[0] = *p++;
np[1] = *p++;
for(; p < ep; p += 2)
- /* overlapping or adjacent ranges? */
- if(p[0] <= np[1] + 1){
+ if(p[0] <= np[1]+1){
if(p[1] >= np[1])
- np[1] = p[1]; /* coalesce */
+ np[1] = p[1];
} else {
np += 2;
np[0] = p[0];
/n/sources/plan9//sys/src/libregexp/regcomp.c:475,480 - regcomp.c:483,489
np[1] = p[1];
}
yyclassp->end = np+2;
+ debugspan();
}
return type;
/n/sources/plan9//sys/src/ape/lib/regexp/regcomp.h:1,23 - regcomp.h:1,17
/*
* substitution list
*/
+ enum {
+ NSUBEXP = 32,
+ LISTINCREMENT = 8,
+ };
+
typedef struct Resublist Resublist;
struct Resublist
{
- Resub m[32];
+ Resub m[NSUBEXP];
};
- /* max subexpressions per program */
- Resublist ReSuBlIsT;
- #define NSUBEXP (sizeof(ReSuBlIsT.m)/sizeof(Resub))
-
- /* max character classes per program */
- Reprog RePrOg;
- #define NCLASS (sizeof(RePrOg.class)/sizeof(Reclass))
-
- /* max rune ranges per character class */
- #define NCCRUNE (sizeof(Reclass)/sizeof(wchar_t))
-
/*
* Actions and Tokens (Reinst types)
*
/n/sources/plan9//sys/src/ape/lib/regexp/regcomp.h:46,52 - regcomp.h:40,45
/*
* regexec execution lists
*/
- #define LISTINCREMENT 8
typedef struct Relist Relist;
struct Relist
{
; lc
mkfile regcomp.c regerror.c regsub.c rregsub.c
regaux.c regcomp.h regexec.c rregexec.c
; find /sys/include|grep reg
/sys/include/ape/regexp.h
/sys/include/regexp.h
; 9diff /sys/include/ape/regexp.h
/n/sources/plan9/sys/include/ape/regexp.h:35,43 - /sys/include/ape/regexp.h:35,50
/*
* character class, each pair of rune's defines a range
*/
+ enum{
+ NCCRUNE = 256,
+ NCLASS = 16,
+ NINST = 5,
+
+ };
+
struct Reclass{
wchar_t *end;
- wchar_t spans[64];
+ wchar_t spans[NCCRUNE];
};
/*
/n/sources/plan9/sys/include/ape/regexp.h:62,69 - /sys/include/ape/regexp.h:69,76
*/
struct Reprog{
Reinst *startinst; /* start pc */
- Reclass class[16]; /* .data */
- Reinst firstinst[5]; /* .text */
+ Reclass class[NCLASS]; /* .data */
+ Reinst firstinst[NINST]; /* .text */
};
extern Reprog *regcomp(char*);
prev parent reply other threads:[~2011-07-15 16:28 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-07-15 13:05 Steve Simon
2011-07-15 13:37 ` Charles Forsyth
2011-07-15 16:28 ` erik quanstrom [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=a696271bce408b638bc9fd29683d5244@coraid.com \
--to=quanstro@labs.coraid.com \
--cc=9fans@9fans.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).