From: Julien Ramseier <j.ramseier@gmail.com>
To: musl@lists.openwall.com
Cc: Johannes.Schindelin@gmx.de
Subject: [PATCH] regex: REG_STARTEND support
Date: Wed, 5 Oct 2016 14:19:35 +0200 [thread overview]
Message-ID: <B6143A4F-1AB7-46E9-9476-305FF1A2CD49@gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 196 bytes --]
Here's my REG_STARTEND patch, mostly copied from the original tre[1] implementation.
It's only lightly tested.
[1] https://github.com/laurikari/tre/
[-- Attachment #2.1: Type: text/html, Size: 408 bytes --]
[-- Attachment #2.2: reg_startend.diff --]
[-- Type: application/octet-stream, Size: 6267 bytes --]
diff --git a/include/regex.h b/include/regex.h
index dce2177..449e606 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
#define REG_NOTBOL 1
#define REG_NOTEOL 2
+#define REG_STARTEND 4
#define REG_OK 0
#define REG_NOMATCH 1
@@ -46,6 +47,7 @@ typedef struct {
#define REG_ERANGE 11
#define REG_ESPACE 12
#define REG_BADRPT 13
+#define REG_INVARG 14
#define REG_ENOSYS -1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 16c5d0a..ae65726 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -29,6 +29,7 @@
*/
+#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
@@ -51,11 +52,15 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
#define GET_NEXT_WCHAR() do { \
prev_c = next_c; pos += pos_add_next; \
- if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
- if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
- else pos_add_next++; \
+ if (len >= 0 && pos >= len) \
+ next_c = L'\0'; \
+ else { \
+ if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
+ if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
+ else pos_add_next++; \
+ } \
+ str_byte += pos_add_next; \
} \
- str_byte += pos_add_next; \
} while (0)
#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
@@ -166,7 +171,7 @@ typedef struct {
static reg_errcode_t
-tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
+tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
int *match_tags, int eflags,
int *match_end_ofs)
{
@@ -306,7 +311,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
}
/* Check for end of string. */
- if (!next_c) break;
+ if (len < 0) {
+ if (!next_c) break;
+ } else if (pos >= len) break;
GET_NEXT_WCHAR();
@@ -577,7 +584,7 @@ typedef struct tre_backtrack_struct {
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
static reg_errcode_t
-tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
+tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
int *match_tags, int eflags, int *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
@@ -767,8 +774,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
eo = pmatch[bt].rm_eo;
bt_len = eo - so;
- result = strncmp((const char*)string + so, str_byte - 1,
- (size_t)bt_len);
+ if (so < 0)
+ result = 1; /* Back reference of nomatch doesn't match */
+ else if (len < 0)
+ result = strncmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
+ else if (len - pos < bt_len)
+ result = 1;
+ else
+ result = memcmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
if (result == 0)
{
@@ -796,8 +809,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
else
{
/* Check for end of string. */
- if (next_c == L'\0')
- goto backtrack;
+ if (len < 0) {
+ if (!next_c) goto backtrack;
+ } else if (pos >= len) goto backtrack;
/* Read the next character. */
GET_NEXT_WCHAR();
@@ -870,10 +884,10 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
{
/* Try starting from a later position in the input string. */
/* Check for end of string. */
- if (next_c == L'\0')
- {
- break;
- }
+ if (len < 0) {
+ if (!next_c) break;
+ } else if (pos >= len) break;
+
next_c = next_c_start;
#ifdef TRE_MBSTATE
mbstate = mbstate_start;
@@ -984,6 +998,18 @@ regexec(const regex_t *restrict preg, const char *restrict string,
tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
reg_errcode_t status;
int *tags = NULL, eo;
+ size_t offset = 0;
+ ssize_t len = -1;
+
+ if ((eflags & REG_STARTEND) && pmatch) {
+ if (pmatch->rm_so < 0 || pmatch->rm_eo < 0 ||
+ pmatch->rm_so > pmatch->rm_eo)
+ return REG_INVARG;
+
+ offset = pmatch->rm_so;
+ len = pmatch->rm_eo - pmatch->rm_so;
+ }
+
if (tnfa->cflags & REG_NOSUB) nmatch = 0;
if (tnfa->num_tags > 0 && nmatch > 0)
{
@@ -996,17 +1022,34 @@ regexec(const regex_t *restrict preg, const char *restrict string,
if (tnfa->have_backrefs)
{
/* The regex has back references, use the backtracking matcher. */
- status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+ status = tre_tnfa_run_backtrack(tnfa, string+offset, len, tags, eflags, &eo);
}
else
{
/* Exact matching, no back references, use the parallel matcher. */
- status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+ status = tre_tnfa_run_parallel(tnfa, string+offset, len, tags, eflags, &eo);
}
- if (status == REG_OK)
+ if (status == REG_OK) {
/* A match was found, so fill the submatch registers. */
tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+
+ /*
+ * If doing REG_STARTEND, adjust the pmatch array (we can't build
+ * this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
+ * tre_fill_pmatch itself).
+ */
+ if (!(tnfa->cflags & REG_NOSUB) && (eflags & REG_STARTEND)
+ && pmatch && nmatch > 0) {
+ size_t i;
+ regmatch_t *p;
+ for (i = nmatch, p = pmatch; i > 0; p++, i--) {
+ if (p->rm_so >= 0) p->rm_so += offset;
+ if (p->rm_eo >= 0) p->rm_eo += offset;
+ }
+ }
+ }
+
if (tags)
xfree(tags);
return status;
[-- Attachment #2.3: Type: text/html, Size: 438 bytes --]
next reply other threads:[~2016-10-05 12:19 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-10-05 12:19 Julien Ramseier [this message]
2016-10-05 16:23 ` Rich Felker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=B6143A4F-1AB7-46E9-9476-305FF1A2CD49@gmail.com \
--to=j.ramseier@gmail.com \
--cc=Johannes.Schindelin@gmx.de \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).