From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/10574 Path: news.gmane.org!.POSTED!not-for-mail From: Julien Ramseier Newsgroups: gmane.linux.lib.musl.general Subject: [PATCH] regex: REG_STARTEND support Date: Wed, 5 Oct 2016 14:19:35 +0200 Message-ID: Reply-To: musl@lists.openwall.com NNTP-Posting-Host: blaine.gmane.org Mime-Version: 1.0 (Mac OS X Mail 9.3 \(3124\)) Content-Type: multipart/alternative; boundary="Apple-Mail=_788B596F-01AD-4717-B4BF-618F67F4C17A" X-Trace: blaine.gmane.org 1475670017 32182 195.159.176.226 (5 Oct 2016 12:20:17 GMT) X-Complaints-To: usenet@blaine.gmane.org NNTP-Posting-Date: Wed, 5 Oct 2016 12:20:17 +0000 (UTC) Cc: Johannes.Schindelin@gmx.de To: musl@lists.openwall.com Original-X-From: musl-return-10587-gllmg-musl=m.gmane.org@lists.openwall.com Wed Oct 05 14:20:13 2016 Return-path: Envelope-to: gllmg-musl@m.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by blaine.gmane.org with smtp (Exim 4.84_2) (envelope-from ) id 1brlAu-0005iY-Ec for gllmg-musl@m.gmane.org; Wed, 05 Oct 2016 14:19:52 +0200 Original-Received: (qmail 30371 invoked by uid 550); 5 Oct 2016 12:19:52 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Original-Received: (qmail 30338 invoked from network); 5 Oct 2016 12:19:48 -0000 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=from:subject:date:message-id:cc:to:mime-version; bh=21+DxIiwegMDpdwkOiar6wL/ImXV9E0hfS/XGNanVWA=; b=TS7Dq/47y6RboWoKFKnm9MJ4X00dwH7DAPT6kEQ7cCBDk8Tzp1MyfRRS/THxeyoMzQ g7/MTgAwHprLEYnKexycRBySUBEzJLIz9mAo5RWZFBrXREoNMhF9UqEO7IiwBmiFmCJ3 etzLlPRL4ZGCbOGyVZhhi9Lv3FH2TMkij4YpyJWRPnrp7ag/ruvMTnjLJ9B2DhrMemfj WyDhyN0mwHvnhLkNmQcZaTRi4koLk7UmKWBY4sE0O0ST9jLaMNNx+zNMvUN1ukMuy+fq 4KYmqRXbaIljKwDS+DwNnrPjB6DS2kR9wypBYN10R/k0udNt8pjKSzjS1RTogy623P6d tu+A== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:from:subject:date:message-id:cc:to:mime-version; bh=21+DxIiwegMDpdwkOiar6wL/ImXV9E0hfS/XGNanVWA=; b=a0bmcYsZL48DL1S1W86dAQ4GWN3e1PEcYvgIxpWXCxL20L4h+D2dH9vy0+RZFrH3zB eQZn9GMVsfDyw81UObPg492qL789mUQSGFrhBXUOzd2ivO4osrDvebhcnIEpb0At1hqy ztNoyBC41Fdo5rpJxJa6ucf36Z/8Ipkbj5Ff4WUfYS/T/YQTGT8CWY86jJdY8sVnRWyx Mlr87MjYDFSO4FC4EcEFYbgrQKeMXN/7L8uxvGFr5+swCY+5pgZD6DqCfwJPKlKo3bcC Pubeo4d+j7xcJMR/1ZNqKar2zWquc9MCQc+0INpGv7TbhC09Osv7RZK4o2b6d5pqdLT+ gFIw== X-Gm-Message-State: AA6/9RnNmPjJrp250sYaR8+FMAJuBnoi4V4e1C5jOT+0ddlBOIMp+iWUy5RtJf/jxX1VrA== X-Received: by 10.194.231.99 with SMTP id tf3mr7290721wjc.61.1475669976918; Wed, 05 Oct 2016 05:19:36 -0700 (PDT) X-Mailer: Apple Mail (2.3124) Xref: news.gmane.org gmane.linux.lib.musl.general:10574 Archived-At: --Apple-Mail=_788B596F-01AD-4717-B4BF-618F67F4C17A Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset=us-ascii Here's my REG_STARTEND patch, mostly copied from the original tre[1] = implementation. It's only lightly tested. [1] https://github.com/laurikari/tre/ = --Apple-Mail=_788B596F-01AD-4717-B4BF-618F67F4C17A Content-Type: multipart/mixed; boundary="Apple-Mail=_84D71815-5CEF-4942-B4BD-6EED665BB3C6" --Apple-Mail=_84D71815-5CEF-4942-B4BD-6EED665BB3C6 Content-Transfer-Encoding: 7bit Content-Type: text/html; charset=us-ascii Here's my REG_STARTEND patch, mostly copied from the original tre[1] implementation.
It's only lightly tested.

--Apple-Mail=_84D71815-5CEF-4942-B4BD-6EED665BB3C6 Content-Disposition: attachment; filename=reg_startend.diff Content-Type: application/octet-stream; name="reg_startend.diff" Content-Transfer-Encoding: 7bit diff --git a/include/regex.h b/include/regex.h index dce2177..449e606 100644 --- a/include/regex.h +++ b/include/regex.h @@ -31,6 +31,7 @@ typedef struct { #define REG_NOTBOL 1 #define REG_NOTEOL 2 +#define REG_STARTEND 4 #define REG_OK 0 #define REG_NOMATCH 1 @@ -46,6 +47,7 @@ typedef struct { #define REG_ERANGE 11 #define REG_ESPACE 12 #define REG_BADRPT 13 +#define REG_INVARG 14 #define REG_ENOSYS -1 diff --git a/src/regex/regexec.c b/src/regex/regexec.c index 16c5d0a..ae65726 100644 --- a/src/regex/regexec.c +++ b/src/regex/regexec.c @@ -29,6 +29,7 @@ */ +#include #include #include #include @@ -51,11 +52,15 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, #define GET_NEXT_WCHAR() do { \ prev_c = next_c; pos += pos_add_next; \ - if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \ - if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \ - else pos_add_next++; \ + if (len >= 0 && pos >= len) \ + next_c = L'\0'; \ + else { \ + if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \ + if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \ + else pos_add_next++; \ + } \ + str_byte += pos_add_next; \ } \ - str_byte += pos_add_next; \ } while (0) #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c)) @@ -166,7 +171,7 @@ typedef struct { static reg_errcode_t -tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, +tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, ssize_t len, int *match_tags, int eflags, int *match_end_ofs) { @@ -306,7 +311,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, } /* Check for end of string. */ - if (!next_c) break; + if (len < 0) { + if (!next_c) break; + } else if (pos >= len) break; GET_NEXT_WCHAR(); @@ -577,7 +584,7 @@ typedef struct tre_backtrack_struct { #define MIN(a, b) ((a) <= (b) ? (a) : (b)) static reg_errcode_t -tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, +tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, ssize_t len, int *match_tags, int eflags, int *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ @@ -767,8 +774,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, eo = pmatch[bt].rm_eo; bt_len = eo - so; - result = strncmp((const char*)string + so, str_byte - 1, - (size_t)bt_len); + if (so < 0) + result = 1; /* Back reference of nomatch doesn't match */ + else if (len < 0) + result = strncmp((const char *)string + so, str_byte - 1, (size_t)bt_len); + else if (len - pos < bt_len) + result = 1; + else + result = memcmp((const char *)string + so, str_byte - 1, (size_t)bt_len); if (result == 0) { @@ -796,8 +809,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, else { /* Check for end of string. */ - if (next_c == L'\0') - goto backtrack; + if (len < 0) { + if (!next_c) goto backtrack; + } else if (pos >= len) goto backtrack; /* Read the next character. */ GET_NEXT_WCHAR(); @@ -870,10 +884,10 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, { /* Try starting from a later position in the input string. */ /* Check for end of string. */ - if (next_c == L'\0') - { - break; - } + if (len < 0) { + if (!next_c) break; + } else if (pos >= len) break; + next_c = next_c_start; #ifdef TRE_MBSTATE mbstate = mbstate_start; @@ -984,6 +998,18 @@ regexec(const regex_t *restrict preg, const char *restrict string, tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; reg_errcode_t status; int *tags = NULL, eo; + size_t offset = 0; + ssize_t len = -1; + + if ((eflags & REG_STARTEND) && pmatch) { + if (pmatch->rm_so < 0 || pmatch->rm_eo < 0 || + pmatch->rm_so > pmatch->rm_eo) + return REG_INVARG; + + offset = pmatch->rm_so; + len = pmatch->rm_eo - pmatch->rm_so; + } + if (tnfa->cflags & REG_NOSUB) nmatch = 0; if (tnfa->num_tags > 0 && nmatch > 0) { @@ -996,17 +1022,34 @@ regexec(const regex_t *restrict preg, const char *restrict string, if (tnfa->have_backrefs) { /* The regex has back references, use the backtracking matcher. */ - status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo); + status = tre_tnfa_run_backtrack(tnfa, string+offset, len, tags, eflags, &eo); } else { /* Exact matching, no back references, use the parallel matcher. */ - status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo); + status = tre_tnfa_run_parallel(tnfa, string+offset, len, tags, eflags, &eo); } - if (status == REG_OK) + if (status == REG_OK) { /* A match was found, so fill the submatch registers. */ tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); + + /* + * If doing REG_STARTEND, adjust the pmatch array (we can't build + * this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls + * tre_fill_pmatch itself). + */ + if (!(tnfa->cflags & REG_NOSUB) && (eflags & REG_STARTEND) + && pmatch && nmatch > 0) { + size_t i; + regmatch_t *p; + for (i = nmatch, p = pmatch; i > 0; p++, i--) { + if (p->rm_so >= 0) p->rm_so += offset; + if (p->rm_eo >= 0) p->rm_eo += offset; + } + } + } + if (tags) xfree(tags); return status; --Apple-Mail=_84D71815-5CEF-4942-B4BD-6EED665BB3C6 Content-Transfer-Encoding: 7bit Content-Type: text/html; charset=us-ascii --Apple-Mail=_84D71815-5CEF-4942-B4BD-6EED665BB3C6-- --Apple-Mail=_788B596F-01AD-4717-B4BF-618F67F4C17A--