This extension originates from the BSDs, and is available under the illumos gate as well as glibc (but glibc is currently afflicted by a few bugs, notably that its ^ doesn't work). REG_STARTEND affects regexec() in the following way: the string to be matched is [string + pmatch->rm_so, string + pmatch->rm_eo) rather than [string, string + strlen(string)). This allows matching data with embedded NULs (and on other implementations avoids a strlen() over the input string), and limiting the far side of the matched string (and potentially matching unterminated strings). The matches written to pmatch are still referenced to string (not string + pmatch->rm_so). As an example, the following program: #include <locale.h> #include <regex.h> #include <stdio.h> int main(int c, char ** v) { setlocale(LC_ALL, ""); regex_t r; regcomp(&r, v[1], 0); regmatch_t dt[2] = {{1, 4}}; printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND)); printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo); } produces $ ./a.out '^a' # matching in "a\0c" 0 1, 2; -1, -1 $ ./a.out 'c$' 0 3, 4; -1, -1 $ ./a.out 'c$' '_ac' # matching in "ac\0" 1 1, 4; 0, 0 $ ./a.out '^\(a\).\1$' _abad # matching in "aba" 0 1, 4; 1, 2 $ ./a.out 'ać' '_aaćdef' # ć is two bytes in UTF-8 1 # matching in "aa\xC4" 1, 4; 0, 0 $ ./a.out 'ać' '_aćdef' # matching in "ać" 0 1, 4; -1, -1 Under NetBSD, additionally, the following holds: $ ./a.out '^a.c$' 0 1, 4; -1, -1 however this is not the case in glibc or musl yet; although glibc supports $ ./a.out 'a[^-]c$' 0 1, 4; -1, -1 so that's probably another glibc bug. The bulk of the implementation is concentrated in GET_NEXT_WCHAR(): if REG_STARTEND was requested, we smooth over NULs by replacing them with (wchar_t)-1 (this is the reason . and [^] don't work), and limit how many bytes may be consumed by mbtowc() when getting to the end. 
To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(), in that yielding an L'\0' means end-of-string. --- Please keep me in CC, as I'm not subscribed. include/regex.h | 1 + src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/include/regex.h b/include/regex.h index dce21771..01ab326e 100644 --- a/include/regex.h +++ b/include/regex.h @@ -31,6 +31,7 @@ typedef struct { #define REG_NOTBOL 1 #define REG_NOTEOL 2 +#define REG_STARTEND 4 #define REG_OK 0 #define REG_NOMATCH 1 diff --git a/src/regex/regexec.c b/src/regex/regexec.c index 253b0e14..2a2bded5 100644 --- a/src/regex/regexec.c +++ b/src/regex/regexec.c @@ -44,17 +44,23 @@ static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo); + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo, + const regmatch_t *startend); /*********************************************************************** from tre-match-utils.h ***********************************************************************/ + #define GET_NEXT_WCHAR() do { \ + size_t max_len = startend ? \ + MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) : \ + MB_LEN_MAX; \ prev_c = next_c; pos += pos_add_next; \ - if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \ + if(!max_len) { next_c = '\0'; pos_add_next = 1; } \ + else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) { \ if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \ - else pos_add_next++; \ + else { pos_add_next++; if (startend) next_c = -1; }; \ } \ str_byte += pos_add_next; \ } while (0) @@ -169,11 +175,11 @@ typedef struct { static reg_errcode_t tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, regoff_t *match_tags, int eflags, - regoff_t *match_end_ofs) + regoff_t *match_end_ofs, const regmatch_t *startend) { /* State variables required by GET_NEXT_WCHAR. 
*/ tre_char_t prev_c = 0, next_c = 0; - const char *str_byte = string; + const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0); regoff_t pos = -1; regoff_t pos_add_next = 1; #ifdef TRE_MBSTATE @@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct { static reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - regoff_t *match_tags, int eflags, regoff_t *match_end_ofs) + regoff_t *match_tags, int eflags, + regoff_t *match_end_ofs, const regmatch_t *startend) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c = 0, next_c = 0; - const char *str_byte = string; + const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);; regoff_t pos = 0; regoff_t pos_add_next = 1; #ifdef TRE_MBSTATE @@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* Get the substring we need to match against. Remember to turn off REG_NOSUB temporarily. */ tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, - tnfa, tags, pos); + tnfa, tags, pos, startend); so = pmatch[bt].rm_so; eo = pmatch[bt].rm_eo; bt_len = eo - so; @@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, endpoint values. */ static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo) + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo, + const regmatch_t *startend) { tre_submatch_data_t *submatch_data; + regoff_t offset = startend ? startend->rm_so : 0; unsigned int i, j; int *parents; @@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, was not part of the match. 
*/ if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) pmatch[i].rm_so = pmatch[i].rm_eo = -1; + else + { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; } i++; } @@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string, tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; reg_errcode_t status; regoff_t *tags = NULL, eo; + const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL; if (tnfa->cflags & REG_NOSUB) nmatch = 0; if (tnfa->num_tags > 0 && nmatch > 0) { @@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string, if (tnfa->have_backrefs) { /* The regex has back references, use the backtracking matcher. */ - status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo); + status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo, + startend); } else { /* Exact matching, no back references, use the parallel matcher. */ - status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo); + status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo, + startend); } if (status == REG_OK) /* A match was found, so fill the submatch registers. */ - tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); + tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend); if (tags) xfree(tags); return status; -- 2.30.2