* [PATCH] regex: REG_STARTEND support
@ 2016-10-05 12:19 Julien Ramseier
2016-10-05 16:23 ` Rich Felker
0 siblings, 1 reply; 2+ messages in thread
From: Julien Ramseier @ 2016-10-05 12:19 UTC (permalink / raw)
To: musl; +Cc: Johannes.Schindelin
[-- Attachment #1: Type: text/plain, Size: 196 bytes --]
Here's my REG_STARTEND patch, mostly copied from the original tre[1] implementation.
It's only lightly tested.
[1] https://github.com/laurikari/tre/ <https://github.com/laurikari/tre/>
[-- Attachment #2.1: Type: text/html, Size: 408 bytes --]
[-- Attachment #2.2: reg_startend.diff --]
[-- Type: application/octet-stream, Size: 6267 bytes --]
diff --git a/include/regex.h b/include/regex.h
index dce2177..449e606 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
#define REG_NOTBOL 1
#define REG_NOTEOL 2
+#define REG_STARTEND 4
#define REG_OK 0
#define REG_NOMATCH 1
@@ -46,6 +47,7 @@ typedef struct {
#define REG_ERANGE 11
#define REG_ESPACE 12
#define REG_BADRPT 13
+#define REG_INVARG 14
#define REG_ENOSYS -1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 16c5d0a..ae65726 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -29,6 +29,7 @@
*/
+#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
@@ -51,11 +52,15 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
#define GET_NEXT_WCHAR() do { \
prev_c = next_c; pos += pos_add_next; \
- if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
- if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
- else pos_add_next++; \
+ if (len >= 0 && pos >= len) \
+ next_c = L'\0'; \
+ else { \
+ if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
+ if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
+ else pos_add_next++; \
+ } \
+ str_byte += pos_add_next; \
} \
- str_byte += pos_add_next; \
} while (0)
#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
@@ -166,7 +171,7 @@ typedef struct {
static reg_errcode_t
-tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
+tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
int *match_tags, int eflags,
int *match_end_ofs)
{
@@ -306,7 +311,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
}
/* Check for end of string. */
- if (!next_c) break;
+ if (len < 0) {
+ if (!next_c) break;
+ } else if (pos >= len) break;
GET_NEXT_WCHAR();
@@ -577,7 +584,7 @@ typedef struct tre_backtrack_struct {
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
static reg_errcode_t
-tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
+tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
int *match_tags, int eflags, int *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
@@ -767,8 +774,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
eo = pmatch[bt].rm_eo;
bt_len = eo - so;
- result = strncmp((const char*)string + so, str_byte - 1,
- (size_t)bt_len);
+ if (so < 0)
+ result = 1; /* Back reference of nomatch doesn't match */
+ else if (len < 0)
+ result = strncmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
+ else if (len - pos < bt_len)
+ result = 1;
+ else
+ result = memcmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
if (result == 0)
{
@@ -796,8 +809,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
else
{
/* Check for end of string. */
- if (next_c == L'\0')
- goto backtrack;
+ if (len < 0) {
+ if (!next_c) goto backtrack;
+ } else if (pos >= len) goto backtrack;
/* Read the next character. */
GET_NEXT_WCHAR();
@@ -870,10 +884,10 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
{
/* Try starting from a later position in the input string. */
/* Check for end of string. */
- if (next_c == L'\0')
- {
- break;
- }
+ if (len < 0) {
+ if (!next_c) break;
+ } else if (pos >= len) break;
+
next_c = next_c_start;
#ifdef TRE_MBSTATE
mbstate = mbstate_start;
@@ -984,6 +998,18 @@ regexec(const regex_t *restrict preg, const char *restrict string,
tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
reg_errcode_t status;
int *tags = NULL, eo;
+ size_t offset = 0;
+ ssize_t len = -1;
+
+ if ((eflags & REG_STARTEND) && pmatch) {
+ if (pmatch->rm_so < 0 || pmatch->rm_eo < 0 ||
+ pmatch->rm_so > pmatch->rm_eo)
+ return REG_INVARG;
+
+ offset = pmatch->rm_so;
+ len = pmatch->rm_eo - pmatch->rm_so;
+ }
+
if (tnfa->cflags & REG_NOSUB) nmatch = 0;
if (tnfa->num_tags > 0 && nmatch > 0)
{
@@ -996,17 +1022,34 @@ regexec(const regex_t *restrict preg, const char *restrict string,
if (tnfa->have_backrefs)
{
/* The regex has back references, use the backtracking matcher. */
- status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+ status = tre_tnfa_run_backtrack(tnfa, string+offset, len, tags, eflags, &eo);
}
else
{
/* Exact matching, no back references, use the parallel matcher. */
- status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+ status = tre_tnfa_run_parallel(tnfa, string+offset, len, tags, eflags, &eo);
}
- if (status == REG_OK)
+ if (status == REG_OK) {
/* A match was found, so fill the submatch registers. */
tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+
+ /*
+ * If doing REG_STARTEND, adjust the pmatch array (we can't build
+ * this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
+ * tre_fill_pmatch itself).
+ */
+ if (!(tnfa->cflags & REG_NOSUB) && (eflags & REG_STARTEND)
+ && pmatch && nmatch > 0) {
+ size_t i;
+ regmatch_t *p;
+ for (i = nmatch, p = pmatch; i > 0; p++, i--) {
+ if (p->rm_so >= 0) p->rm_so += offset;
+ if (p->rm_eo >= 0) p->rm_eo += offset;
+ }
+ }
+ }
+
if (tags)
xfree(tags);
return status;
[-- Attachment #2.3: Type: text/html, Size: 438 bytes --]
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] regex: REG_STARTEND support
2016-10-05 12:19 [PATCH] regex: REG_STARTEND support Julien Ramseier
@ 2016-10-05 16:23 ` Rich Felker
0 siblings, 0 replies; 2+ messages in thread
From: Rich Felker @ 2016-10-05 16:23 UTC (permalink / raw)
To: Julien Ramseier; +Cc: musl, Johannes.Schindelin
On Wed, Oct 05, 2016 at 02:19:35PM +0200, Julien Ramseier wrote:
> Here's my REG_STARTEND patch, mostly copied from the original tre[1]
> implementation.
> It's only lightly tested.
> [...]
> diff --git a/src/regex/regexec.c b/src/regex/regexec.c
> index 16c5d0a..ae65726 100644
> --- a/src/regex/regexec.c
> +++ b/src/regex/regexec.c
> @@ -29,6 +29,7 @@
>
> */
>
> +#include <sys/types.h>
> #include <stdlib.h>
> #include <string.h>
> #include <wchar.h>
> @@ -51,11 +52,15 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
>
> #define GET_NEXT_WCHAR() do { \
> prev_c = next_c; pos += pos_add_next; \
> - if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
> - if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
> - else pos_add_next++; \
> + if (len >= 0 && pos >= len) \
> + next_c = L'\0'; \
As caught discussing this on #musl yesterday, pos (int) here has the
wrong type, int, which is a big problem. I'm going to work on a test
case to show it and confirm that changing the type fixes it.
> + else { \
> + if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
> + if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
> + else pos_add_next++; \
> + } \
> + str_byte += pos_add_next; \
> } \
> - str_byte += pos_add_next; \
There also seems to be a bug, which was also present in the original
TRE I think, whereby read past len can happen if the buffer up to len
ends with a partial multibyte character. Avoiding this seems rather
costly.
Otherwise this doesn't look too bad. I'll see if we can get some
figures for how it affects performance.
Rich
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2016-10-05 16:23 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-05 12:19 [PATCH] regex: REG_STARTEND support Julien Ramseier
2016-10-05 16:23 ` Rich Felker
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).