mailing list of musl libc
 help / color / mirror / code / Atom feed
From: наб <nabijaczleweli@nabijaczleweli.xyz>
Cc: musl@lists.openwall.com
Subject: [musl] [PATCH v3 1/2] regex: add BSD-style REG_STARTEND
Date: Sun, 14 May 2023 17:17:29 +0200	[thread overview]
Message-ID: <c543d09605b7b9f8efda2ab76a12f954b0d55393.1684077278.git.nabijaczleweli@nabijaczleweli.xyz> (raw)
In-Reply-To: <007f7c7408aa227ee7b0fb3d82e44b8142db6ff4.1682681245.git.nabijaczleweli@nabijaczleweli.xyz>

[-- Attachment #1: Type: text/plain, Size: 8294 bytes --]

This extension originates from the BSDs, and is also available in the
illumos gate as well as in glibc (where it is buggy).

REG_STARTEND affects regexec() in the following way:
the string to be matched is
  [string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
  [string, string + strlen(string))

This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(thus potentially matching unterminated strings).

The matches written to pmatch are still referenced to string
(not string + pmatch->rm_so).

As an example, the following program:
	#include <locale.h>
	#include <regex.h>
	#include <stdio.h>
	/* Demo driver: compiles v[1] as a regex and runs regexec() with
	   REG_STARTEND over bytes [1, 4) of v[2] (or of a built-in default
	   string containing an embedded NUL when v[2] is absent), then prints
	   the status and both match slots. */
	int main(int c, char ** v) {
		/* Take the locale from the environment; the multibyte example relies on a UTF-8 locale. */
		setlocale(LC_ALL, "");
		regex_t r;
		/* cflags of 0; the return value is not checked in this demo. */
		regcomp(&r, v[1], 0);
		/* dt[0] = {rm_so 1, rm_eo 4} is the input range read by REG_STARTEND; dt[1] starts zeroed. */
		regmatch_t dt[2] = {{1, 4}};
		/* Prints the regexec() status: 0 is REG_OK, 1 is REG_NOMATCH. */
		printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
		/* Offsets are printed as int; on no match the slots still hold their initial values. */
		printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
	}
produces
	$ ./a.out '^a'  # matching in "a\0c"
	0
	1, 2; -1, -1
	$ ./a.out 'c$'
	0
	3, 4; -1, -1
	$ ./a.out 'c$' '_ac'  # matching in "ac\0"
	1
	1, 4; 0, 0
	$ ./a.out '^\(a\).\1$' _abad  # matching in "aba"
	0
	1, 4; 1, 2
	$ ./a.out 'ać' '_aaćdef'  # ć is two bytes in UTF-8
	1                         # matching in "aa\xC4"
	1, 4; 0, 0
	$ ./a.out 'ać' '_aćdef'   # matching in "ać"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out 'a[^-]c$'
	0
	1, 4; -1, -1
the last two don't hold in musl with just this patch, though.

The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over embedded NULs by replacing
them with (wchar_t)-1, limit how many bytes mbtowc() may consume as we
approach the end of the range, and, once no bytes remain, return L'\0'.

To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string; this is heavily baked
into the matchers, and embedded NULs are unnameable within the regex
anyway.
---
v2: fixed style and made the message probably a bit saner; NFC
v3: no-change clean rebase

Series tested with the v4 (same as v2) tst-reg-startend.c available at
  https://sourceware.org/pipermail/libc-alpha/2023-May/147882.html

Keep me in CC: please.

 include/regex.h     |  1 +
 src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..763dde58 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
+
 #define GET_NEXT_WCHAR() do {                                                 \
+    size_t max_len = startend ?                                               \
+        MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) :  \
+        MB_LEN_MAX;                                                           \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
+    if (!max_len) { next_c = L'\0'; pos_add_next = 1; }                       \
+    else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+        else { pos_add_next++; if (startend) next_c = -1; };                  \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
@@ -169,11 +175,11 @@ typedef struct {
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
+		      regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = -1;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+		       regoff_t *match_tags, int eflags,
+		       regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = 0;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* Get the substring we need to match against.  Remember to
 	     turn off REG_NOSUB temporarily. */
 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
+			  tnfa, tags, pos, startend);
 	  so = pmatch[bt].rm_so;
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend)
 {
   tre_submatch_data_t *submatch_data;
+  regoff_t offset = startend ? startend->rm_so : 0;
   unsigned int i, j;
   int *parents;
 
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 	     was not part of the match. */
 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+	  else
+	    { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
 
 	  i++;
 	}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
+  const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+                                      startend);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+                                     startend);
     }
 
   if (status == REG_OK)
     /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
   if (tags)
     xfree(tags);
   return status;
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  reply	other threads:[~2023-05-14 15:17 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-20 21:01 [musl] [PATCH " наб
2023-04-20 21:04 ` [musl] [PATCH 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
2023-04-21 15:48 ` [musl] REG_STARTEND tests наб
2023-04-28 11:39 ` [musl] [PATCH v2 1/2] regex: add BSD-style REG_STARTEND наб
2023-05-14 15:17   ` наб [this message]
2023-05-14 15:17   ` [musl] [PATCH v3 2/2] regex: increase TRE_CHAR_MAX and use it for NUL with REG_STARTEND наб
2023-04-28 11:40 ` [musl] [PATCH v2 " наб

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c543d09605b7b9f8efda2ab76a12f954b0d55393.1684077278.git.nabijaczleweli@nabijaczleweli.xyz \
    --to=nabijaczleweli@nabijaczleweli.xyz \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).