9fans - fans of the OS Plan 9 from Bell Labs
 help / color / mirror / Atom feed
* [9fans] awk, big REs
@ 2011-07-15 13:05 Steve Simon
  2011-07-15 13:37 ` Charles Forsyth
  2011-07-15 16:28 ` erik quanstrom
  0 siblings, 2 replies; 3+ messages in thread
From: Steve Simon @ 2011-07-15 13:05 UTC (permalink / raw)
  To: 9fans

I am trying to run configure for a GNU project on plan9.

yes I know, but stay with me...

it mostly works but part of the configure script fails where it runs
awk over a header file and uses a rather silly RE to find all defines/undefines.

/^[\t ]*#[\t ]*(define|undef)[\t ]+[_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]*([\t (]|$)// { print; }

For some reason they don't like character class ranges, and if I
replace the lists with ranges the RE works as I expect,

/^[\t ]*#[\t ]*(define|undef)[\t ]+[_a-zA-Z][_a-zA-Z0-9]*([\t (]|$)/ { print; }

I looked at the awk source and even upped MAXRE - though its
existing value of 512 looks fine to me.

anyone any ideas why awk might not match with an RE like this?

-Steve



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [9fans] awk, big REs
  2011-07-15 13:05 [9fans] awk, big REs Steve Simon
@ 2011-07-15 13:37 ` Charles Forsyth
  2011-07-15 16:28 ` erik quanstrom
  1 sibling, 0 replies; 3+ messages in thread
From: Charles Forsyth @ 2011-07-15 13:37 UTC (permalink / raw)
  To: 9fans

>For some reasone they don't like character class ranges, and if I

writing them out explicitly might avoid trouble with some locales.



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [9fans] awk, big REs
  2011-07-15 13:05 [9fans] awk, big REs Steve Simon
  2011-07-15 13:37 ` Charles Forsyth
@ 2011-07-15 16:28 ` erik quanstrom
  1 sibling, 0 replies; 3+ messages in thread
From: erik quanstrom @ 2011-07-15 16:28 UTC (permalink / raw)
  To: 9fans

> it mostly works but part of the configure script fails where it runs
> awk over a header file and uses a rather sill RE to find all defines/undefines.
>
> /^[\t ]*#[\t ]*(define|undef)[\t ]+[_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]*([\t (]|$)// { print; }
>
> For some reasone they don't like character class ranges, and if I
> replaces the lists with ranges the RE works as I expect,
>
> /^[\t ]*#[\t ]*(define|undef)[\t ]+[_a-zA-Z][_a-zA-Z0-9]*([\t (]|$)/ { print; }
>
> I looked at the awk source and even upped MAXRE - though its
> existing value of 512 looks fine to me.
>
> anyone any ideas why awk might not match with an RE like this?

this is something i fixed when it came up on the list some time
ago.  i believe the patch was rejected, and done another way.
so the diff is noisy.  sorry.

one remaining problem with the code on sources is NSPANS
isn't appropriately sized for an a through z class for a reasonable
alphabet.  you also might try the awk in my contrib—it has a large
number of fixes from bwk and is much more careful about converting
between floating point and decimal so as not to cause spurious
exceptions.

- erik




; cd /sys/include
; 9diff regexp.h
/n/sources/plan9//sys/include/regexp.h:1,6 - regexp.h:1,11
  #pragma	src	"/sys/src/libregexp"
  #pragma	lib	"libregexp.a"

+ enum {
+ 	NSPANS	= 128,		/* max rune ranges per character class */
+ 	NCLASS	= 16,		/* max character classes per program */
+ };
+
  typedef struct Resub		Resub;
  typedef struct Reclass		Reclass;
  typedef struct Reinst		Reinst;
/n/sources/plan9//sys/include/regexp.h:27,33 - regexp.h:32,38
   */
  struct Reclass{
  	Rune	*end;
- 	Rune	spans[64];
+ 	Rune	spans[NSPANS*2];
  };

  /*
/n/sources/plan9//sys/include/regexp.h:52,58 - regexp.h:57,63
   */
  struct Reprog{
  	Reinst	*startinst;	/* start pc */
- 	Reclass	class[16];	/* .data */
+ 	Reclass	class[NCLASS];	/* .data */
  	Reinst	firstinst[5];	/* .text */
  };

; cd /sys/src/libregexp
; 9diff *
/n/sources/plan9//sys/src/libregexp/regcomp.c:16,27 - regcomp.c:16,21
  	Reinst*	last;
  }Node;

- /* max character classes per program is nelem(reprog->class) */
- static Reprog	*reprog;
-
- /* max rune ranges per character class is nelem(classp->spans)/2 */
- #define NCCRUNE	nelem(classp->spans)
-
  #define	NSTACK	20
  static	Node	andstack[NSTACK];
  static	Node	*andp;
/n/sources/plan9//sys/src/libregexp/regcomp.c:328,335 - regcomp.c:322,329
  static	Reclass*
  newclass(void)
  {
- 	if(nclass >= nelem(reprog->class))
- 		rcerror("too many character classes; increase Reprog.class size");
+ 	if(nclass >= NCLASS)
+ 		regerr2("too many character classes; limit", NCLASS+'0');
  	return &(classp[nclass++]);
  }

/n/sources/plan9//sys/src/libregexp/regcomp.c:389,399 - regcomp.c:383,408
  	return RUNE;
  }

+ static void
+ debugspan(void)
+ {
+ #ifdef DEBUG
+ 	int i, nspan;
+ 	Rune r;
+
+ 	nspan = yyclassp->end - yyclassp->spans >>1;
+ 	fprint(2, "nspan = %d\n", nspan);
+ 	p = yyclassp->spans;
+ 	for(i = 0; i < nspan; i++)
+ 		print("%C %C	%.4ux %.4ux\n", p[2*i], p[2*i+1], p[2*i], p[2*i+1]);
+ #endif
+ }
+
  static int
  bldcclass(void)
  {
  	int type;
- 	Rune r[NCCRUNE];
+ 	Rune r[NSPANS*2];
  	Rune *p, *ep, *np;
  	Rune rune;
  	int quoted;
/n/sources/plan9//sys/src/libregexp/regcomp.c:414,420 - regcomp.c:423,433
  	}

  	/* parse class into a set of spans */
- 	while(ep < &r[NCCRUNE-1]){
+ 	for(;;){
+ 		if(ep == r + nelem(r)){
+ 			rcerror("class too large");
+ 			return 0;
+ 		}
  		if(rune == 0){
  			rcerror("malformed '[]'");
  			return 0;
/n/sources/plan9//sys/src/libregexp/regcomp.c:438,447 - regcomp.c:451,456
  		}
  		quoted = nextc(&rune);
  	}
- 	if(ep >= &r[NCCRUNE-1]) {
- 		rcerror("char class too large; increase Reclass.spans size");
- 		return 0;
- 	}

  	/* sort on span start */
  	for(p = r; p < ep; p += 2){
/n/sources/plan9//sys/src/libregexp/regcomp.c:465,474 - regcomp.c:474,482
  		np[0] = *p++;
  		np[1] = *p++;
  		for(; p < ep; p += 2)
- 			/* overlapping or adjacent ranges? */
- 			if(p[0] <= np[1] + 1){
+ 			if(p[0] <= np[1]+1){
  				if(p[1] >= np[1])
- 					np[1] = p[1];	/* coalesce */
+ 					np[1] = p[1];
  			} else {
  				np += 2;
  				np[0] = p[0];
/n/sources/plan9//sys/src/libregexp/regcomp.c:475,480 - regcomp.c:483,489
  				np[1] = p[1];
  			}
  		yyclassp->end = np+2;
+ 		debugspan();
  	}

  	return type;


/n/sources/plan9//sys/src/ape/lib/regexp/regcomp.h:1,23 - regcomp.h:1,17
  /*
   *  substitution list
   */
+ enum {
+ 	NSUBEXP	= 32,
+ 	LISTINCREMENT	= 8,
+ };
+
  typedef struct Resublist	Resublist;
  struct	Resublist
  {
- 	Resub	m[32];
+ 	Resub	m[NSUBEXP];
  };

- /* max subexpressions per program */
- Resublist ReSuBlIsT;
- #define NSUBEXP (sizeof(ReSuBlIsT.m)/sizeof(Resub))
-
- /* max character classes per program */
- Reprog	RePrOg;
- #define	NCLASS	(sizeof(RePrOg.class)/sizeof(Reclass))
-
- /* max rune ranges per character class */
- #define NCCRUNE	(sizeof(Reclass)/sizeof(wchar_t))
-
  /*
   * Actions and Tokens (Reinst types)
   *
/n/sources/plan9//sys/src/ape/lib/regexp/regcomp.h:46,52 - regcomp.h:40,45
  /*
   *  regexec execution lists
   */
- #define LISTINCREMENT 8
  typedef struct Relist	Relist;
  struct Relist
  {
; lc
mkfile		regcomp.c	regerror.c	regsub.c		rregsub.c
regaux.c		regcomp.h	regexec.c		rregexec.c
; find /sys/include|grep reg
/sys/include/ape/regexp.h
/sys/include/regexp.h
; 9diff /sys/include/ape/regexp.h
/n/sources/plan9/sys/include/ape/regexp.h:35,43 - /sys/include/ape/regexp.h:35,50
  /*
   *	character class, each pair of rune's defines a range
   */
+ enum{
+ 	NCCRUNE	= 256,
+ 	NCLASS	= 16,
+ 	NINST		= 5,
+
+ };
+
  struct Reclass{
  	wchar_t	*end;
- 	wchar_t	spans[64];
+ 	wchar_t	spans[NCCRUNE];
  };

  /*
/n/sources/plan9/sys/include/ape/regexp.h:62,69 - /sys/include/ape/regexp.h:69,76
   */
  struct Reprog{
  	Reinst	*startinst;	/* start pc */
- 	Reclass	class[16];	/* .data */
- 	Reinst	firstinst[5];	/* .text */
+ 	Reclass	class[NCLASS];	/* .data */
+ 	Reinst	firstinst[NINST];	/* .text */
  };

  extern Reprog	*regcomp(char*);



^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2011-07-15 16:28 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-15 13:05 [9fans] awk, big REs Steve Simon
2011-07-15 13:37 ` Charles Forsyth
2011-07-15 16:28 ` erik quanstrom

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).