mailing list of musl libc
 help / color / mirror / code / Atom feed
* [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND
@ 2023-04-20 21:01 наб
  2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: наб @ 2023-04-20 21:01 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 8166 bytes --]

This extension originates from the BSDs, and is available under the
illumos gate as well as glibc (but glibc is currently afflicted
by a few bugs, notably that its ^ doesn't work).

REG_STARTEND affects regexec() in the following way:
the string to be matched is
  [string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
  [string, string + strlen(string))

This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(and potentially matching unterminated strings).

The matches written to pmatch are still referenced to string
(not string + pmatch->rm_so).

As an example, the following program:
	#include <locale.h>
	#include <regex.h>
	#include <stdio.h>
	int main(int c, char ** v) {
		setlocale(LC_ALL, "");
		regex_t r;
		regcomp(&r, v[1], 0);
		regmatch_t dt[2] = {{1, 4}};
		printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
		printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
	}
produces
	$ ./a.out '^a'  # matching in "a\0c"
	0
	1, 2; -1, -1
	$ ./a.out 'c$'
	0
	3, 4; -1, -1
	$ ./a.out 'c$' '_ac'  # matching in "ac\0"
	1
	1, 4; 0, 0
	$ ./a.out '^\(a\).\1$' _abad  # matching in "aba"
	0
	1, 4; 1, 2
	$ ./a.out 'ać' '_aaćdef'  # ć is two bytes in UTF-8
	1                         # matching in "aa\xC4"
	1, 4; 0, 0
	$ ./a.out 'ać' '_aćdef'   # matching in "ać"
	0
	1, 4; -1, -1

Under NetBSD, additionally, the following holds:
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
however this is not the case in glibc or musl yet;
although glibc supports
	$ ./a.out 'a[^-]c$'
	0
	1, 4; -1, -1
so that's probably another glibc bug.

The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over NULs by replacing them
with (wchar_t)-1 (this is the reason . and [^] don't work),
and limit how many bytes may be consumed by mbtowc()
when getting to the end.

To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string.
---
Please keep me in CC, as I'm not subscribed.

 include/regex.h     |  1 +
 src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..2a2bded5 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
+
 #define GET_NEXT_WCHAR() do {                                                 \
+    size_t max_len = startend ?                                               \
+        MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) :  \
+        MB_LEN_MAX;                                                           \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
+    if(!max_len) { next_c = '\0'; pos_add_next = 1; }                         \
+    else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+        else { pos_add_next++; if (startend) next_c = -1; };                  \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
@@ -169,11 +175,11 @@ typedef struct {
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
+		      regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = -1;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+		       regoff_t *match_tags, int eflags,
+		       regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);;
   regoff_t pos = 0;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* Get the substring we need to match against.  Remember to
 	     turn off REG_NOSUB temporarily. */
 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
+			  tnfa, tags, pos, startend);
 	  so = pmatch[bt].rm_so;
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend)
 {
   tre_submatch_data_t *submatch_data;
+  regoff_t offset = startend ? startend->rm_so : 0;
   unsigned int i, j;
   int *parents;
 
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 	     was not part of the match. */
 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+	  else
+	    { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
 
 	  i++;
 	}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
+  const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+                                      startend);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+                                     startend);
     }
 
   if (status == REG_OK)
     /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
   if (tags)
     xfree(tags);
   return status;
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND
  2023-04-20 21:01 [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND наб
@ 2023-04-20 21:04 ` наб
  2023-04-21 15:48 ` [musl] REG_STARTEND tests наб
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: наб @ 2023-04-20 21:04 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 2022 bytes --]

This character cannot be named normally, but can be matched with
catch-alls like . and [^]

This brings us to feature parity with NetBSD:
	$ ./a.out '^a[^w]c$'  # matching "a\0c"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out '.c$'
	0
	2, 4; -1, -1
	$ ./a.out '.*'
	0
	1, 4; -1, -1

	$ sed -i 's/cdef/adef/' a.c
	$ ./a.out '^\(a\).\1$'  # matching "a\0a"
	0
	1, 4; 1, 2
---
Please keep me in CC, as I'm not subscribed.

I haven't encountered an issue with this, and TRE_CHAR_MAX seems to be
"domain of characters from GET_NEXT_WCHAR()", not
"real characters in the current locale's encoding",
so expanding the domain with a special character for NUL seems fine.

 src/regex/regexec.c | 2 +-
 src/regex/tre.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 2a2bded5..f09fdae1 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -60,7 +60,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
     if(!max_len) { next_c = '\0'; pos_add_next = 1; }                         \
     else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else { pos_add_next++; if (startend) next_c = -1; };                  \
+        else { pos_add_next++; if (startend) next_c = TRE_CHAR_MAX; };        \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
diff --git a/src/regex/tre.h b/src/regex/tre.h
index 9aae851f..e913899a 100644
--- a/src/regex/tre.h
+++ b/src/regex/tre.h
@@ -50,7 +50,7 @@ typedef wchar_t tre_char_t;
 
 /* Wide characters. */
 typedef wint_t tre_cint_t;
-#define TRE_CHAR_MAX 0x10ffff
+#define TRE_CHAR_MAX (0x10ffff + 1)
 
 #define tre_isalnum iswalnum
 #define tre_isalpha iswalpha
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [musl] REG_STARTEND tests
  2023-04-20 21:01 [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND наб
  2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
@ 2023-04-21 15:48 ` наб
  2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
  2023-04-28 11:40 ` [musl] [PATCH v2 " наб
  3 siblings, 0 replies; 7+ messages in thread
From: наб @ 2023-04-21 15:48 UTC (permalink / raw)
  To: musl


[-- Attachment #1.1: Type: text/plain, Size: 783 bytes --]

I didn't formalise the tests last night, but see the attached file,
which passes cleanly on NetBSD and the illumos gate.

On musl with 1/2 I get
$ ~/store/code/musl/prefix/bin/musl-clang tst-reg-startend.c -o \
    /tmp/tst-reg-startend  -DSTANDALONE  -static &&
	/tmp/tst-reg-startend 2>&1 | cat -A
tst-reg-startend.c: ^a.c$: ac: no match
tst-reg-startend.c: ^a.*c$: ac: no match
tst-reg-startend.c: ^a[^c]c$: ac: no match
tst-reg-startend.c: ^a..: ac: no match
tst-reg-startend.c: ..c: ac: no match
tst-reg-startend.c: [^z]c: ac: no match
tst-reg-startend.c: [^z]c: ac: wanted {2, 4}, got {1, 4}

And with 2/2 it passes cleanly.


glibc gives me a host of errors, but I'll post fixes and include this
test there later.

Best,
наб

Please keep me in CC.

[-- Attachment #1.2: Type: text/x-csrc, Size: 4029 bytes --]

/* Permission to use, copy, modify, and/or distribute this software for any
   purpose with or without fee is hereby granted.

   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.  */

#include <assert.h>
#include <locale.h>
#include <string.h>
#include <regex.h>
#include <stdio.h>
#include <stdbool.h>


/* Brace initializer for a regmatch_t with start offset S and end offset E.
   Deliberately not a compound literal: M() is only used to initialize
   objects with static storage duration, where ISO C requires constant
   expressions and a compound literal is accepted only as a compiler
   extension.  */
#define M(s, e) {.rm_so = (s), .rm_eo = (e)}
/* Nonzero iff both regmatch_t values have the same start and end offsets.
   Each argument is evaluated twice.  */
#define MEQ(l, r) ((l).rm_so == (r).rm_so && (l).rm_eo == (r).rm_eo)

/* Byte range of each subject string that is handed to regexec() as the
   REG_STARTEND window.  */
static const regmatch_t bound = M(1, 4);

/* Patterns for the "ac" and "aa" subject groups.  Each list is
   NULL-terminated so it can be walked without a separate length.  */
static const char *const regex_ac[] =
  {"^a", "c$", "^a.c$", "^a.*c$", "^a[^c]c$", "^a..", "..c", "[^z]c", NULL};
static const char *const regex_aa[] =
  {"^a", "a$", "^\\(a\\).\\1$", "^a[^a]*", NULL};
/* Two subjects per group: the first has an embedded NUL inside the window,
   the second has 'b' at the same position.  */
static const char *const data_ac[] = {"_a\0cdef", "_abcdef"};
static const char *const data_aa[] = {"_a\0adef", "_abadef"};
/* Expected pmatch[0] for each pattern, in the same order as the pattern
   lists.  The same result is expected for both subjects of a group.  */
static const regmatch_t results_ac[] =
  {M(1, 2), M(3, 4), M(1, 4), M(1, 4), M(1, 4), M(1, 4), M(1, 4), M(2, 4)};
static const regmatch_t results_aa[] =
  {M(1, 2), M(3, 4), M(1, 4), M(1, 3)};
/* Each result table must have exactly one entry per pattern; the pattern
   lists carry one extra element for the NULL terminator.  */
static_assert(sizeof(regex_ac) / sizeof(*regex_ac) - 1 ==
              sizeof(results_ac) / sizeof(*results_ac), "");
static_assert(sizeof(regex_aa) / sizeof(*regex_aa) - 1 ==
              sizeof(results_aa) / sizeof(*results_aa), "");


/* Compile every pattern in the NULL-terminated list REGEXES and run it with
   REG_STARTEND, using the window `bound', against both subjects in DATA.
   RESULTS holds the expected pmatch[0] for each pattern, in the same order
   as REGEXES.  Every failure is reported on stderr.
   Returns true if anything failed, false otherwise.  */
static bool
testbunch (const char *const *regexes, const char *const data[static 2],
           const regmatch_t *results)
{
/* Flag the failure and print the "file: regex: window" prefix of a
   diagnostic.  Expands to a comma expression and relies on locals named
   err, regexes and i at the point of use.  It is not #undef'd because
   later code in this file uses it as well.  */
#define BASEERR(data)                              \
  err = true,                                      \
    fprintf (stderr, __FILE__ ": %s: ", *regexes), \
    fwrite (data[i] + bound.rm_so, 1, bound.rm_eo - bound.rm_so, stderr)

  bool err = false;
  for (; *regexes; ++regexes, ++results)
    {
      regex_t rgx;
      /* Do not compile inside assert(): with NDEBUG the call would vanish
         and rgx would be used uninitialized.  */
      if (regcomp (&rgx, *regexes, 0) != 0)
        {
          err = true;
          fprintf (stderr, __FILE__ ": %s: regcomp failed\n", *regexes);
          continue;
        }

      for (size_t i = 0; i < 2; ++i)
        {
          regmatch_t match = bound;
          if (regexec (&rgx, data[i], 1, &match, REG_STARTEND))
            BASEERR(data), fputs (": no match\n", stderr);
          /* Only a successful match leaves a meaningful range to compare,
             so a failed match is reported once, not twice.  */
          else if (!MEQ(match, *results))
            BASEERR(data), fprintf (stderr, ": wanted {%d, %d}, got {%d, %d}\n",
                                    (int)results->rm_so, (int)results->rm_eo,
                                    (int)match.rm_so, (int)match.rm_eo);
        }

      regfree (&rgx);
    }

  return err;
}


/* Subjects for the multibyte test.  Assuming this file is UTF-8 encoded,
   ć takes two bytes, so the window `bound' cuts the first subject in the
   middle of ć and covers exactly "ać" in the second.  */
static const char *const ać_data[2] = {"_aaćdef", "_aćdef"};
/* Whether a match is expected for the subject at the same index.  */
static const bool ać_exp[] = {false, true};

/* Match the pattern "ać" with REG_STARTEND, using the window `bound',
   against each subject in ać_data and check the outcome against ać_exp.
   In both cases the test expects pmatch[0] to equal `bound' afterwards.
   Failures are reported on stderr through BASEERR, which is defined earlier
   in this file and needs the locals err, regexes and i.
   Returns true if anything failed, false otherwise.  */
static bool
testać (void)
{
  bool err = false;
  regex_t rgx;
  const char *const regexes[] = {"ać"};
  /* Do not compile inside assert(): with NDEBUG the call would vanish and
     rgx would be used uninitialized.  */
  if (regcomp (&rgx, *regexes, 0) != 0)
    {
      fprintf (stderr, __FILE__ ": %s: regcomp failed\n", *regexes);
      return true;
    }

  for (size_t i = 0; i < 2; ++i)
    {
      regmatch_t match = bound;
      /* Treat every nonzero regexec() result as "did not match", not only
         REG_NOMATCH, so other error codes cannot slip through unnoticed.  */
      bool matched = regexec (&rgx, ać_data[i], 1, &match, REG_STARTEND) == 0;
      if (matched != ać_exp[i])
        BASEERR(ać_data), fprintf (stderr, ": %s match\n",
                                   ać_exp[i] ? "no" : "yes");

      if (!MEQ(match, bound))
        BASEERR(ać_data), fprintf (stderr, ": wanted {%d, %d}, got {%d, %d}\n",
                                   (int)bound.rm_so, (int)bound.rm_eo,
                                   (int)match.rm_so, (int)match.rm_eo);
    }

  regfree (&rgx);
  return err;
}


/* Test driver: select the C.UTF-8 locale, then run every test group.
   ARGC and ARGV are unused.
   Returns 0 if all groups passed and 1 if the locale could not be set or
   any group reported a failure.  */
static int
do_test (int argc, char **argv)
{
  (void) argc, (void) argv;
  /* Do not call setlocale() inside assert(): with NDEBUG the call would
     vanish and the tests would run in whatever locale is current.  */
  if (!setlocale (LC_ALL, "C.UTF-8"))
    {
      fputs (__FILE__ ": cannot set locale C.UTF-8\n", stderr);
      return 1;
    }

  /* Run every group even after a failure so that all diagnostics are
     printed in a single run.  */
  bool err = testbunch (regex_ac, data_ac, results_ac);
  err |= testbunch (regex_aa, data_aa, results_aa);
  err |= testać ();
  return err;
}


#if !STANDALONE
#include "../test-skeleton.c"
#else
/* Standalone entry point: the exit status is whatever do_test reports.  */
int
main(int argc, char **argv)
{
  const int status = do_test(argc, argv);
  return status;
}
#endif

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND
  2023-04-20 21:01 [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND наб
  2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
  2023-04-21 15:48 ` [musl] REG_STARTEND tests наб
@ 2023-04-28 11:39 ` наб
  2023-05-14 15:17   ` [musl] [PATCH v3 " наб
  2023-05-14 15:17   ` [musl] [PATCH v3 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
  2023-04-28 11:40 ` [musl] [PATCH v2 " наб
  3 siblings, 2 replies; 7+ messages in thread
From: наб @ 2023-04-28 11:39 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 8299 bytes --]

This extension originates from the BSDs, and is available under the
illumos gate as well as glibc (but is buggy there).

REG_STARTEND affects regexec() in the following way:
the string to be matched is
  [string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
  [string, string + strlen(string))

This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(thus potentially matching unterminated strings).

The matches written to pmatch are still referenced to string
(not string + pmatch->rm_so).

As an example, the following program:
	#include <locale.h>
	#include <regex.h>
	#include <stdio.h>
	int main(int c, char ** v) {
		setlocale(LC_ALL, "");
		regex_t r;
		regcomp(&r, v[1], 0);
		regmatch_t dt[2] = {{1, 4}};
		printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
		printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
	}
produces
	$ ./a.out '^a'  # matching in "a\0c"
	0
	1, 2; -1, -1
	$ ./a.out 'c$'
	0
	3, 4; -1, -1
	$ ./a.out 'c$' '_ac'  # matching in "ac\0"
	1
	1, 4; 0, 0
	$ ./a.out '^\(a\).\1$' _abad  # matching in "aba"
	0
	1, 4; 1, 2
	$ ./a.out 'ać' '_aaćdef'  # ć is two bytes in UTF-8
	1                         # matching in "aa\xC4"
	1, 4; 0, 0
	$ ./a.out 'ać' '_aćdef'   # matching in "ać"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out 'a[^-]c$'
	0
	1, 4; -1, -1
the last two don't hold in musl with just this patch, though.

The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over NULs by replacing them
with (wchar_t)-1, and limit how many bytes may be consumed by mbtowc()
when getting to the end, and, if 0, return L'\0'.

To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string; this is heavily baked
into the matchers, and embedded NULs are unnameable within the regex
anyway.
---
v2: fixed style and made the message probably a bit saner; NFC

Series tested with the v2 tst-reg-startend.c available at
  https://sourceware.org/pipermail/libc-alpha/2023-April/147564.html
and which should port to more compilers.

Keep me in CC: please.

 include/regex.h     |  1 +
 src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..763dde58 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
+
 #define GET_NEXT_WCHAR() do {                                                 \
+    size_t max_len = startend ?                                               \
+        MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) :  \
+        MB_LEN_MAX;                                                           \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
+    if (!max_len) { next_c = L'\0'; pos_add_next = 1; }                       \
+    else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+        else { pos_add_next++; if (startend) next_c = -1; };                  \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
@@ -169,11 +175,11 @@ typedef struct {
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
+		      regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = -1;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+		       regoff_t *match_tags, int eflags,
+		       regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = 0;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* Get the substring we need to match against.  Remember to
 	     turn off REG_NOSUB temporarily. */
 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
+			  tnfa, tags, pos, startend);
 	  so = pmatch[bt].rm_so;
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend)
 {
   tre_submatch_data_t *submatch_data;
+  regoff_t offset = startend ? startend->rm_so : 0;
   unsigned int i, j;
   int *parents;
 
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 	     was not part of the match. */
 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+	  else
+	    { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
 
 	  i++;
 	}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
+  const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+                                      startend);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+                                     startend);
     }
 
   if (status == REG_OK)
     /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
   if (tags)
     xfree(tags);
   return status;
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [musl] [PATCH v2 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND
  2023-04-20 21:01 [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND наб
                   ` (2 preceding siblings ...)
  2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
@ 2023-04-28 11:40 ` наб
  3 siblings, 0 replies; 7+ messages in thread
From: наб @ 2023-04-28 11:40 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 1814 bytes --]

This character cannot be named normally, much like the NUL it's standing
in for, but can be matched with catch-alls like . and [^].

This brings us to feature parity with NetBSD:
	$ ./a.out '^a[^w]c$'  # matching "a\0c"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out '.c$'
	0
	2, 4; -1, -1
	$ ./a.out '.*'
	0
	1, 4; -1, -1

	$ sed -i 's/cdef/adef/' a.c
	$ ./a.out '^\(a\).\1$'  # matching "a\0a"
	0
	1, 4; 1, 2
---
v2: only message changed

Keep me in CC: please.

 src/regex/regexec.c | 2 +-
 src/regex/tre.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 763dde58..f493a703 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -60,7 +60,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
     if (!max_len) { next_c = L'\0'; pos_add_next = 1; }                       \
     else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else { pos_add_next++; if (startend) next_c = -1; };                  \
+        else { pos_add_next++; if (startend) next_c = TRE_CHAR_MAX; };        \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
diff --git a/src/regex/tre.h b/src/regex/tre.h
index 9aae851f..e913899a 100644
--- a/src/regex/tre.h
+++ b/src/regex/tre.h
@@ -50,7 +50,7 @@ typedef wchar_t tre_char_t;
 
 /* Wide characters. */
 typedef wint_t tre_cint_t;
-#define TRE_CHAR_MAX 0x10ffff
+#define TRE_CHAR_MAX (0x10ffff + 1)
 
 #define tre_isalnum iswalnum
 #define tre_isalpha iswalpha
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [musl] [PATCH v3 1/2] regex: add BSD-style REG_STARTEND
  2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
@ 2023-05-14 15:17   ` наб
  2023-05-14 15:17   ` [musl] [PATCH v3 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
  1 sibling, 0 replies; 7+ messages in thread
From: наб @ 2023-05-14 15:17 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 8294 bytes --]

This extension originates from the BSDs, and is available under the
illumos gate as well as glibc (but is buggy there).

REG_STARTEND affects regexec() in the following way:
the string to be matched is
  [string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
  [string, string + strlen(string))

This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(thus potentially matching unterminated strings).

The matches written to pmatch are still referenced to string
(not string + pmatch->rm_so).

As an example, the following program:
	#include <locale.h>
	#include <regex.h>
	#include <stdio.h>
	int main(int c, char ** v) {
		setlocale(LC_ALL, "");
		regex_t r;
		regcomp(&r, v[1], 0);
		regmatch_t dt[2] = {{1, 4}};
		printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
		printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
	}
produces
	$ ./a.out '^a'  # matching in "a\0c"
	0
	1, 2; -1, -1
	$ ./a.out 'c$'
	0
	3, 4; -1, -1
	$ ./a.out 'c$' '_ac'  # matching in "ac\0"
	1
	1, 4; 0, 0
	$ ./a.out '^\(a\).\1$' _abad  # matching in "aba"
	0
	1, 4; 1, 2
	$ ./a.out 'ać' '_aaćdef'  # ć is two bytes in UTF-8
	1                         # matching in "aa\xC4"
	1, 4; 0, 0
	$ ./a.out 'ać' '_aćdef'   # matching in "ać"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out 'a[^-]c$'
	0
	1, 4; -1, -1
the last two don't hold in musl with just this patch, though.

The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over NULs by replacing them
with (wchar_t)-1, and limit how many bytes may be consumed by mbtowc()
when getting to the end, and, if 0, return L'\0'.

To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string; this is heavily baked
into the matchers, and embedded NULs are unnameable within the regex
anyway.
---
v2: fixed style and made the message probably a bit saner; NFC
v3: no-change clean rebase

Series tested with the v4 (same as v2) tst-reg-startend.c available at
  https://sourceware.org/pipermail/libc-alpha/2023-May/147882.html

Keep me in CC: please.

 include/regex.h     |  1 +
 src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..763dde58 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
+
 #define GET_NEXT_WCHAR() do {                                                 \
+    size_t max_len = startend ?                                               \
+        MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) :  \
+        MB_LEN_MAX;                                                           \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
+    if (!max_len) { next_c = L'\0'; pos_add_next = 1; }                       \
+    else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+        else { pos_add_next++; if (startend) next_c = -1; };                  \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
@@ -169,11 +175,11 @@ typedef struct {
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
+		      regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = -1;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+		       regoff_t *match_tags, int eflags,
+		       regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = 0;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* Get the substring we need to match against.  Remember to
 	     turn off REG_NOSUB temporarily. */
 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
+			  tnfa, tags, pos, startend);
 	  so = pmatch[bt].rm_so;
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend)
 {
   tre_submatch_data_t *submatch_data;
+  regoff_t offset = startend ? startend->rm_so : 0;
   unsigned int i, j;
   int *parents;
 
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 	     was not part of the match. */
 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+	  else
+	    { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
 
 	  i++;
 	}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
+  const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+                                      startend);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+                                     startend);
     }
 
   if (status == REG_OK)
     /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
   if (tags)
     xfree(tags);
   return status;
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [musl] [PATCH v3 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND
  2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
  2023-05-14 15:17   ` [musl] [PATCH v3 " наб
@ 2023-05-14 15:17   ` наб
  1 sibling, 0 replies; 7+ messages in thread
From: наб @ 2023-05-14 15:17 UTC (permalink / raw)
  Cc: musl

[-- Attachment #1: Type: text/plain, Size: 1761 bytes --]

The stand-in character (TRE_CHAR_MAX, now one past the last valid code
point) cannot be named in a pattern, much like the NUL it's standing in
for, but can be matched with catch-alls like . and negated brackets ([^w]).

This brings us to feature parity with NetBSD:
	$ ./a.out '^a[^w]c$'  # matching "a\0c"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out '.c$'
	0
	2, 4; -1, -1
	$ ./a.out '.*'
	0
	1, 4; -1, -1

	$ sed -i 's/cdef/adef/' a.c
	$ ./a.out '^\(a\).\1$'  # matching "a\0a"
	0
	1, 4; 1, 2
---
 src/regex/regexec.c | 2 +-
 src/regex/tre.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 763dde58..f493a703 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -60,7 +60,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
     if (!max_len) { next_c = L'\0'; pos_add_next = 1; }                       \
     else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else { pos_add_next++; if (startend) next_c = -1; };                  \
+        else { pos_add_next++; if (startend) next_c = TRE_CHAR_MAX; };        \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
diff --git a/src/regex/tre.h b/src/regex/tre.h
index 9aae851f..e913899a 100644
--- a/src/regex/tre.h
+++ b/src/regex/tre.h
@@ -50,7 +50,7 @@ typedef wchar_t tre_char_t;
 
 /* Wide characters. */
 typedef wint_t tre_cint_t;
-#define TRE_CHAR_MAX 0x10ffff
+#define TRE_CHAR_MAX (0x10ffff + 1)
 
 #define tre_isalnum iswalnum
 #define tre_isalpha iswalpha
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-05-14 15:18 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-20 21:01 [musl] [PATCH 1/2] regex: add BSD-style REG_STARTEND наб
2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
2023-04-21 15:48 ` [musl] REG_STARTEND tests наб
2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
2023-05-14 15:17   ` [musl] [PATCH v3 " наб
2023-05-14 15:17   ` [musl] [PATCH v3 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
2023-04-28 11:40 ` [musl] [PATCH v2 " наб

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).