mailing list of musl libc
 help / color / mirror / code / Atom feed
* [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND
@ 2023-04-20 21:01 наб
  2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: наб @ 2023-04-20 21:01 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 8166 bytes --]

This extension originates from the BSDs, and is also available in the
illumos gate and in glibc (though glibc is currently afflicted
by a few bugs, notably that its ^ doesn't work).

REG_STARTEND affects regexec() in the following way:
the string to be matched is
  [string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
  [string, string + strlen(string))

This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(and potentially matching unterminated strings).

The match offsets written to pmatch are still relative to string
(not to string + pmatch->rm_so).

As an example, the following program:
	#include <locale.h>
	#include <regex.h>
	#include <stdio.h>
	int main(int c, char ** v) {
		setlocale(LC_ALL, "");
		regex_t r;
		regcomp(&r, v[1], 0);
		regmatch_t dt[2] = {{1, 4}};
		printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
		printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
	}
produces
	$ ./a.out '^a'  # matching in "a\0c"
	0
	1, 2; -1, -1
	$ ./a.out 'c$'
	0
	3, 4; -1, -1
	$ ./a.out 'c$' '_ac'  # matching in "ac\0"
	1
	1, 4; 0, 0
	$ ./a.out '^\(a\).\1$' _abad  # matching in "aba"
	0
	1, 4; 1, 2
	$ ./a.out 'ać' '_aaćdef'  # ć is two bytes in UTF-8
	1                         # matching in "aa\xC4"
	1, 4; 0, 0
	$ ./a.out 'ać' '_aćdef'   # matching in "ać"
	0
	1, 4; -1, -1

Under NetBSD, additionally, the following holds:
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
however this is not the case in glibc or musl yet;
although glibc supports
	$ ./a.out 'a[^-]c$'
	0
	1, 4; -1, -1
so that's probably another glibc bug.

The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over NULs by replacing them
with (wchar_t)-1 (this is the reason . and [^] don't work),
and limit how many bytes may be consumed by mbtowc()
when getting to the end.

To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string.
---
Please keep me in CC, as I'm not subscribed.

 include/regex.h     |  1 +
 src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..2a2bded5 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
+
 #define GET_NEXT_WCHAR() do {                                                 \
+    size_t max_len = startend ?                                               \
+        MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) :  \
+        MB_LEN_MAX;                                                           \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
+    if(!max_len) { next_c = '\0'; pos_add_next = 1; }                         \
+    else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+        else { pos_add_next++; if (startend) next_c = -1; };                  \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
@@ -169,11 +175,11 @@ typedef struct {
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
+		      regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = -1;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+		       regoff_t *match_tags, int eflags,
+		       regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);;
   regoff_t pos = 0;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* Get the substring we need to match against.  Remember to
 	     turn off REG_NOSUB temporarily. */
 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
+			  tnfa, tags, pos, startend);
 	  so = pmatch[bt].rm_so;
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend)
 {
   tre_submatch_data_t *submatch_data;
+  regoff_t offset = startend ? startend->rm_so : 0;
   unsigned int i, j;
   int *parents;
 
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 	     was not part of the match. */
 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+	  else
+	    { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
 
 	  i++;
 	}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
+  const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+                                      startend);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+                                     startend);
     }
 
   if (status == REG_OK)
     /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
   if (tags)
     xfree(tags);
   return status;
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-05-14 15:18 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-20 21:01 [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND наб
2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
2023-04-21 15:48 ` [musl] REG_STARTEND tests наб
2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
2023-05-14 15:17   ` [musl] [PATCH v3 " наб
2023-05-14 15:17   ` [musl] [PATCH v3 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
2023-04-28 11:40 ` [musl] [PATCH v2 " наб

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).