From mboxrd@z Thu Jan 1 00:00:00 1970 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=0.4 required=5.0 tests=DKIM_INVALID,DKIM_SIGNED, MAILING_LIST_MULTI,MISSING_HEADERS,RCVD_IN_MSPIKE_H2, T_SCC_BODY_TEXT_LINE autolearn=no autolearn_force=no version=3.4.4 Received: (qmail 24327 invoked from network); 28 Apr 2023 11:41:17 -0000 Received: from second.openwall.net (193.110.157.125) by inbox.vuxu.org with ESMTPUTF8; 28 Apr 2023 11:41:17 -0000 Received: (qmail 9644 invoked by uid 550); 28 Apr 2023 11:41:12 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Reply-To: musl@lists.openwall.com Received: (qmail 7643 invoked from network); 28 Apr 2023 11:39:23 -0000 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=nabijaczleweli.xyz; s=202211; t=1682681948; bh=G0RIXTZ4g3/nPQp7WOTrJKSB5ZCfygbM6+VcwW4Qe4Y=; h=Date:From:Cc:Subject:References:In-Reply-To:From; b=S6Rn00DEi8eZCw5AySUSguvr1eIBxfOhtvO94qCpdMLu2gZseswyYgsDlIV1SttGZ HxYPIHkhS7c8K6bMsnP2ZLNvt0FJuV/vJyTyx7KfTgeIsgfmlUrv48yskC/T+NDNBX W5ShGakCzTfKXFOUd21zdROO5oN+7t45MGdzNxP1lysFe4tbneoVrsgjS9UmyosSHi 8fw4yCqF9K5qO+yd6kALxeeU4GMSPQe81qy0/Ww3uP+j8Ye4jZypD8g268OBbBvkVj O0vsaOblErlO3n13KjjpCp++/Bi6vWD9xsezWOqoEo92ve6BHTQYPcDUl1a++4rxSR fxIcrqPDOaXAw== Date: Fri, 28 Apr 2023 13:39:06 +0200 From: =?utf-8?B?0L3QsNCx?= Cc: musl@lists.openwall.com Message-ID: <007f7c7408aa227ee7b0fb3d82e44b8142db6ff4.1682681245.git.nabijaczleweli@nabijaczleweli.xyz> References: <73caac41e70db544c53b1aa947627206d3eb625b.1682024413.git.nabijaczleweli@nabijaczleweli.xyz> MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha512; protocol="application/pgp-signature"; boundary="nbqs6rzczht2q3m2" Content-Disposition: inline In-Reply-To: <73caac41e70db544c53b1aa947627206d3eb625b.1682024413.git.nabijaczleweli@nabijaczleweli.xyz> User-Agent: NeoMutt/20230407 Subject: [musl] 
[PATCH v2 1/2] regex: add BSD-style REG_STARTEND --nbqs6rzczht2q3m2 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline Content-Transfer-Encoding: quoted-printable This extension originates from the BSD, and is available under the illumos gate as well as glibc (but is buggy there). REG_STARTEND affects regexec() in the following way: the string to be matched is [string + pmatch->rm_so, string + pmatch->rm_eo) rather than [string, string + strlen(string)) This allows matching data with embedded NULs (and on other implementations avoids a strlen() over the input string), and limiting the far side of the matched string (thus potentially matching unterminated strings). The matches written to pmatch are still referenced to string (not string + pmatch->rm_so). As an example, the following program: #include <locale.h> #include <regex.h> #include <stdio.h> int main(int c, char ** v) { setlocale(LC_ALL, ""); regex_t r; regcomp(&r, v[1], 0); regmatch_t dt[2] =3D {{1, 4}}; printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt= , REG_STARTEND)); printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1]= =2Erm_so, (int)dt[1].rm_eo); } produces $ ./a.out '^a' # matching in "a\0c" 0 1, 2; -1, -1 $ ./a.out 'c$' 0 3, 4; -1, -1 $ ./a.out 'c$' '_ac' # matching in "ac\0" 1 1, 4; 0, 0 $ ./a.out '^\(a\).\1$' _abad # matching in "aba" 0 1, 4; 1, 2 $ ./a.out 'a=C4=87' '_aa=C4=87def' # =C4=87 is two bytes in UTF-8 1 # matching in "aa\xC4" 1, 4; 0, 0 $ ./a.out 'a=C4=87' '_a=C4=87def' # matching in "a=C4=87" 0 1, 4; -1, -1 $ ./a.out '^a.c$' 0 1, 4; -1, -1 $ ./a.out 'a[^-]c$' 0 1, 4; -1, -1 the last two don't hold in musl with just this patch, though. The bulk of the implementation is concentrated in GET_NEXT_WCHAR(): if REG_STARTEND was requested, we smooth over NULs by replacing them with (wchar_t)-1, and limit how many bytes may be consumed by mbtowc() when getting to the end, and, if 0, return L'\0'. 
To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(), in that yielding an L'\0' means end-of-string; this is heavily baked into the matchers, and embedded NULs are unnameable within the regex anyway. --- v2: fixed style and made the message probably a bit saner; NFC Series tested with the v2 tst-regex-startend.c available at https://sourceware.org/pipermail/libc-alpha/2023-April/147564.html and which should port to more compilers. Keep me in CC: please. include/regex.h | 1 + src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/include/regex.h b/include/regex.h index dce21771..01ab326e 100644 --- a/include/regex.h +++ b/include/regex.h @@ -31,6 +31,7 @@ typedef struct { =20 #define REG_NOTBOL 1 #define REG_NOTEOL 2 +#define REG_STARTEND 4 =20 #define REG_OK 0 #define REG_NOMATCH 1 diff --git a/src/regex/regexec.c b/src/regex/regexec.c index 253b0e14..763dde58 100644 --- a/src/regex/regexec.c +++ b/src/regex/regexec.c @@ -44,17 +44,23 @@ =20 static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo); + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo, + const regmatch_t *startend); =20 /*********************************************************************** from tre-match-utils.h ***********************************************************************/ =20 + #define GET_NEXT_WCHAR() do { = \ + size_t max_len =3D startend ? 
= \ + MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX)= : \ + MB_LEN_MAX; = \ prev_c =3D next_c; pos +=3D pos_add_next; = \ - if ((pos_add_next =3D mbtowc(&next_c, str_byte, MB_LEN_MAX)) <=3D 0) {= \ + if (!max_len) { next_c =3D L'\0'; pos_add_next =3D 1; } = \ + else if ((pos_add_next =3D mbtowc(&next_c, str_byte, max_len)) <=3D 0)= { \ if (pos_add_next < 0) { ret =3D REG_NOMATCH; goto error_exit; } = \ - else pos_add_next++; = \ + else { pos_add_next++; if (startend) next_c =3D -1; }; = \ } = \ str_byte +=3D pos_add_next; = \ } while (0) @@ -169,11 +175,11 @@ typedef struct { static reg_errcode_t tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, regoff_t *match_tags, int eflags, - regoff_t *match_end_ofs) + regoff_t *match_end_ofs, const regmatch_t *startend) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c =3D 0, next_c =3D 0; - const char *str_byte =3D string; + const char *str_byte =3D (const char *)string + (startend ? startend->rm= _so : 0); regoff_t pos =3D -1; regoff_t pos_add_next =3D 1; #ifdef TRE_MBSTATE @@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct { =20 static reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - regoff_t *match_tags, int eflags, regoff_t *match_end_ofs) + regoff_t *match_tags, int eflags, + regoff_t *match_end_ofs, const regmatch_t *startend) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c =3D 0, next_c =3D 0; - const char *str_byte =3D string; + const char *str_byte =3D (const char *)string + (startend ? startend->rm= _so : 0); regoff_t pos =3D 0; regoff_t pos_add_next =3D 1; #ifdef TRE_MBSTATE @@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const vo= id *string, /* Get the substring we need to match against. Remember to turn off REG_NOSUB temporarily. 
*/ tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, - tnfa, tags, pos); + tnfa, tags, pos, startend); so =3D pmatch[bt].rm_so; eo =3D pmatch[bt].rm_eo; bt_len =3D eo - so; @@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const v= oid *string, endpoint values. */ static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo) + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo, + const regmatch_t *startend) { tre_submatch_data_t *submatch_data; + regoff_t offset =3D startend ? startend->rm_so : 0; unsigned int i, j; int *parents; =20 @@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int= cflags, was not part of the match. */ if (pmatch[i].rm_so =3D=3D -1 || pmatch[i].rm_eo =3D=3D -1) pmatch[i].rm_so =3D pmatch[i].rm_eo =3D -1; + else + { pmatch[i].rm_so +=3D offset; pmatch[i].rm_eo +=3D offset; } =20 i++; } @@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *rest= rict string, tre_tnfa_t *tnfa =3D (void *)preg->TRE_REGEX_T_FIELD; reg_errcode_t status; regoff_t *tags =3D NULL, eo; + const regmatch_t *startend =3D (eflags & REG_STARTEND) ? pmatch : NULL; if (tnfa->cflags & REG_NOSUB) nmatch =3D 0; if (tnfa->num_tags > 0 && nmatch > 0) { @@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *r= estrict string, if (tnfa->have_backrefs) { /* The regex has back references, use the backtracking matcher. */ - status =3D tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo); + status =3D tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo, + startend); } else { /* Exact matching, no back references, use the parallel matcher. */ - status =3D tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo); + status =3D tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo, + startend); } =20 if (status =3D=3D REG_OK) /* A match was found, so fill the submatch registers. 
*/ - tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); + tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend= ); if (tags) xfree(tags); return status; --=20 2.30.2 --nbqs6rzczht2q3m2 Content-Type: application/pgp-signature; name="signature.asc" -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEfWlHToQCjFzAxEFjvP0LAY0mWPEFAmRLsFQACgkQvP0LAY0m WPFW4w/8D5GP9L1Zv1V5V9uIUy4WZ+x1WbhevVKNMzRP8BnyrHlqOKb/yNZPBhLd HxtXCM2mcpt913v2dn9EpJIoCc9xmucbkN2LXmRlZzzUIZS8ZvIBWxD1jjvfhpQQ FUv53rj257FquDTBtlIzaNUp4ep9nuu8KUpylZWiRPkmSDU/jcyvg5Nz2CqpE0JX rwl3EBNJFeY3qdRxJGFrpVgvEDuZNF6tjp9HrkkgIu/iaOEXdjf3nE7LD3GTiDwI V5aYSEDTW9OwAF2SZrOpZBl7aa2VZeOi/2kvWtjXx9KnaUHTVRPfYcZpV3UHFrfX BKt13MO1zCxjoUIVDe8hvBvybgt/60zJBITsNumqxt+bON6QXriD0VmBG6x+3IcB U50CGMAl5ja6Rgnf3RTvXMphkiFZVUaGQkd79nQ7F0/MEY5vfloMCZPjHPTAAUIf LNpZk9ecR2Z3aEPpCMjX7+WdXUDqxQ8N+gBcG+XaqrZv4CFsQzMOVoNExYUjlWlS 4LQMBWX9kGraI36puHJzlTnafCnPeCwsM0J3QrAK9ghMNGL1dt1ivNFh3KHi6zMe AcLvY4Z5LDI3PiYpn8eLrAUR+iUh540ZJtbx1Me1oVEIPtLMmOYR8ryLkZFYCy6r QXBlvOv6B9D7ftK2Pql2Tmj8gdnRq5NdQF9y9L+upLI1YrvUnvY= =JZwU -----END PGP SIGNATURE----- --nbqs6rzczht2q3m2--