mailing list of musl libc
 help / color / mirror / code / Atom feed
* [PATCH] regex: REG_STARTEND support
@ 2016-10-05 12:19 Julien Ramseier
  2016-10-05 16:23 ` Rich Felker
  0 siblings, 1 reply; 2+ messages in thread
From: Julien Ramseier @ 2016-10-05 12:19 UTC (permalink / raw)
  To: musl; +Cc: Johannes.Schindelin

[-- Attachment #1: Type: text/plain, Size: 196 bytes --]

Here's my REG_STARTEND patch, mostly copied from the original tre[1] implementation.
It's only lightly tested.



[1] https://github.com/laurikari/tre/ <https://github.com/laurikari/tre/>


[-- Attachment #2.1: Type: text/html, Size: 408 bytes --]

[-- Attachment #2.2: reg_startend.diff --]
[-- Type: application/octet-stream, Size: 6267 bytes --]

diff --git a/include/regex.h b/include/regex.h
index dce2177..449e606 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
@@ -46,6 +47,7 @@ typedef struct {
 #define REG_ERANGE      11
 #define REG_ESPACE      12
 #define REG_BADRPT      13
+#define REG_INVARG      14
 
 #define REG_ENOSYS      -1
 
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 16c5d0a..ae65726 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -29,6 +29,7 @@
 
 */
 
+#include <sys/types.h>
 #include <stdlib.h>
 #include <string.h>
 #include <wchar.h>
@@ -51,11 +52,15 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 
 #define GET_NEXT_WCHAR() do {                                                 \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
-        if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+    if (len >= 0 && pos >= len)                                               \
+        next_c = L'\0';                                                       \
+    else {                                                                    \
+        if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {    \
+            if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }     \
+            else pos_add_next++;                                              \
+        }                                                                     \
+        str_byte += pos_add_next;                                             \
     }                                                                         \
-    str_byte += pos_add_next;                                                 \
   } while (0)
 
 #define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum(c))
@@ -166,7 +171,7 @@ typedef struct {
 
 
 static reg_errcode_t
-tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
+tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
 		      int *match_tags, int eflags,
 		      int *match_end_ofs)
 {
@@ -306,7 +311,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 	}
 
       /* Check for end of string. */
-      if (!next_c) break;
+      if (len < 0) {
+        if (!next_c) break;
+      } else if (pos >= len) break;
 
       GET_NEXT_WCHAR();
 
@@ -577,7 +584,7 @@ typedef struct tre_backtrack_struct {
 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
 
 static reg_errcode_t
-tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
+tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
 		       int *match_tags, int eflags, int *match_end_ofs)
 {
   /* State variables required by GET_NEXT_WCHAR. */
@@ -767,8 +774,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
 
-	  result = strncmp((const char*)string + so, str_byte - 1,
-				 (size_t)bt_len);
+	  if (so < 0)
+	      result = 1; /* Back reference of nomatch doesn't match */
+	  else if (len < 0)
+	      result = strncmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
+	  else if (len - pos < bt_len)
+	      result = 1;
+	  else
+	      result = memcmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
 
 	  if (result == 0)
 	    {
@@ -796,8 +809,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
       else
 	{
 	  /* Check for end of string. */
-	  if (next_c == L'\0')
-		goto backtrack;
+	  if (len < 0) {
+		if (!next_c) goto backtrack;
+	  } else if (pos >= len) goto backtrack;
 
 	  /* Read the next character. */
 	  GET_NEXT_WCHAR();
@@ -870,10 +884,10 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	    {
 	      /* Try starting from a later position in the input string. */
 	      /* Check for end of string. */
-	      if (next_c == L'\0')
-		    {
-		      break;
-		    }
+		  if (len < 0) {
+		    if (!next_c) break;
+		  } else if (pos >= len) break;
+
 	      next_c = next_c_start;
 #ifdef TRE_MBSTATE
 	      mbstate = mbstate_start;
@@ -984,6 +998,18 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   int *tags = NULL, eo;
+  size_t offset = 0;
+  ssize_t len = -1;
+
+	if ((eflags & REG_STARTEND) && pmatch) {
+		if (pmatch->rm_so < 0 || pmatch->rm_eo < 0 ||
+		    pmatch->rm_so > pmatch->rm_eo)
+			return REG_INVARG;
+
+		offset = pmatch->rm_so;
+		len = pmatch->rm_eo - pmatch->rm_so;
+	}
+
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -996,17 +1022,34 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string+offset, len, tags, eflags, &eo);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string+offset, len, tags, eflags, &eo);
     }
 
-  if (status == REG_OK)
+  if (status == REG_OK) {
     /* A match was found, so fill the submatch registers. */
     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+
+	/*
+	 * If doing REG_STARTEND, adjust the pmatch array (we can't build
+	 * this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
+	 * tre_fill_pmatch itself).
+	 */
+	if (!(tnfa->cflags & REG_NOSUB) && (eflags & REG_STARTEND)
+	    && pmatch && nmatch > 0) {
+		size_t i;
+		regmatch_t *p;
+		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
+			if (p->rm_so >= 0) p->rm_so += offset;
+			if (p->rm_eo >= 0) p->rm_eo += offset;
+		}
+	}
+  }
+
   if (tags)
     xfree(tags);
   return status;

[-- Attachment #2.3: Type: text/html, Size: 438 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] regex: REG_STARTEND support
  2016-10-05 12:19 [PATCH] regex: REG_STARTEND support Julien Ramseier
@ 2016-10-05 16:23 ` Rich Felker
  0 siblings, 0 replies; 2+ messages in thread
From: Rich Felker @ 2016-10-05 16:23 UTC (permalink / raw)
  To: Julien Ramseier; +Cc: musl, Johannes.Schindelin

On Wed, Oct 05, 2016 at 02:19:35PM +0200, Julien Ramseier wrote:
>    Here's my REG_STARTEND patch, mostly copied from the original tre[1]
>    implementation.
>    It's only lightly tested.
> [...]
> diff --git a/src/regex/regexec.c b/src/regex/regexec.c
> index 16c5d0a..ae65726 100644
> --- a/src/regex/regexec.c
> +++ b/src/regex/regexec.c
> @@ -29,6 +29,7 @@
>  
>  */
>  
> +#include <sys/types.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <wchar.h>
> @@ -51,11 +52,15 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
>  
>  #define GET_NEXT_WCHAR() do {                                                 \
>      prev_c = next_c; pos += pos_add_next;                                     \
> -    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
> -        if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
> -        else pos_add_next++;                                                  \
> +    if (len >= 0 && pos >= len)                                               \
> +        next_c = L'\0';                                                       \

As caught discussing this on #musl yesterday, pos (int) here has the
wrong type, int, which is a big problem. I'm going to work on a test
case to show it and confirm that changing the type fixes it.

> +    else {                                                                    \
> +        if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {    \
> +            if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }     \
> +            else pos_add_next++;                                              \
> +        }                                                                     \
> +        str_byte += pos_add_next;                                             \
>      }                                                                         \
> -    str_byte += pos_add_next;                                                 \

There also seems to be a bug, which was also present in the original
TRE I think, whereby read past len can happen if the buffer up to len
ends with a partial multibyte character. Avoiding this seems rather
costly.

Otherwise this doesn't look too bad. I'll see if we can get some
figures for how it affects performance.

Rich


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2016-10-05 16:23 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-05 12:19 [PATCH] regex: REG_STARTEND support Julien Ramseier
2016-10-05 16:23 ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).