* [PATCH] fix regexec with haystack strings longer than INT_MAX
@ 2016-10-06 6:02 Rich Felker
0 siblings, 0 replies; only message in thread
From: Rich Felker @ 2016-10-06 6:02 UTC (permalink / raw)
To: musl
[-- Attachment #1: Type: text/plain, Size: 648 bytes --]
We inherited from TRE regexec code that's utterly wrong with respect
to the integer types it's using; while it doesn't appear to be unsafe,
it fails to find matches past offset INT_MAX. This patch fixes the
type of all variables/fields used to store offsets in the string from
int to regoff_t, and seems to fix the problem, though it has not been
heavily tested yet.
I've also attached a test program suitable for demonstrating the bug
and at least one case where the fix works. It uses my (also attached)
alloc_huge function which allows testing >4GB inputs to string
functions without the need for huge amounts of physical memory or
swap.
Rich
[-- Attachment #2: regpos2.diff --]
[-- Type: text/plain, Size: 5841 bytes --]
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 16c5d0a..28b8a3d 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -43,7 +43,7 @@
static void
tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- const tre_tnfa_t *tnfa, int *tags, int match_eo);
+ const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
/***********************************************************************
from tre-match-utils.h
@@ -96,7 +96,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
/* Returns 1 if `t1' wins `t2', 0 otherwise. */
static int
tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
- int *t1, int *t2)
+ regoff_t *t1, regoff_t *t2)
{
int i;
for (i = 0; i < num_tags; i++)
@@ -156,25 +156,25 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
typedef struct {
tre_tnfa_transition_t *state;
- int *tags;
+ regoff_t *tags;
} tre_tnfa_reach_t;
typedef struct {
- int pos;
- int **tags;
+ regoff_t pos;
+ regoff_t **tags;
} tre_reach_pos_t;
static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
- int *match_tags, int eflags,
- int *match_end_ofs)
+ regoff_t *match_tags, int eflags,
+ regoff_t *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
const char *str_byte = string;
- int pos = -1;
- int pos_add_next = 1;
+ regoff_t pos = -1;
+ regoff_t pos_add_next = 1;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
@@ -190,10 +190,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
int *tag_i;
int num_tags, i;
- int match_eo = -1; /* end offset of match (-1 if no match found yet) */
+ regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
int new_match = 0;
- int *tmp_tags = NULL;
- int *tmp_iptr;
+ regoff_t *tmp_tags = NULL;
+ regoff_t *tmp_iptr;
#ifdef TRE_MBSTATE
memset(&mbstate, '\0', sizeof(mbstate));
@@ -209,22 +209,22 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
everything in a single large block from the stack frame using alloca()
or with malloc() if alloca is unavailable. */
{
- int tbytes, rbytes, pbytes, xbytes, total_bytes;
+ size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
char *tmp_buf;
/* Compute the length of the block we need. */
tbytes = sizeof(*tmp_tags) * num_tags;
rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
pbytes = sizeof(*reach_pos) * tnfa->num_states;
- xbytes = sizeof(int) * num_tags;
+ xbytes = sizeof(regoff_t) * num_tags;
total_bytes =
(sizeof(long) - 1) * 4 /* for alignment paddings */
+ (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
/* Allocate the memory. */
- buf = xmalloc((unsigned)total_bytes);
+ buf = xmalloc(total_bytes);
if (buf == NULL)
return REG_ESPACE;
- memset(buf, 0, (size_t)total_bytes);
+ memset(buf, 0, total_bytes);
/* Get the various pointers within tmp_buf (properly aligned). */
tmp_tags = (void *)buf;
@@ -477,12 +477,12 @@ error_exit:
*/
typedef struct {
- int pos;
+ regoff_t pos;
const char *str_byte;
tre_tnfa_transition_t *state;
int state_id;
int next_c;
- int *tags;
+ regoff_t *tags;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
@@ -578,13 +578,13 @@ typedef struct tre_backtrack_struct {
static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- int *match_tags, int eflags, int *match_end_ofs)
+ regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
const char *str_byte = string;
- int pos = 0;
- int pos_add_next = 1;
+ regoff_t pos = 0;
+ regoff_t pos_add_next = 1;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
@@ -597,15 +597,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
started from. */
int next_c_start;
const char *str_byte_start;
- int pos_start = -1;
+ regoff_t pos_start = -1;
#ifdef TRE_MBSTATE
mbstate_t mbstate_start;
#endif /* TRE_MBSTATE */
/* End offset of best match so far, or -1 if no match found yet. */
- int match_eo = -1;
+ regoff_t match_eo = -1;
/* Tag arrays. */
- int *next_tags, *tags = NULL;
+ int *next_tags;
+ regoff_t *tags = NULL;
/* Current TNFA state. */
tre_tnfa_transition_t *state;
int *states_seen = NULL;
@@ -755,8 +756,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
/* This is a back reference state. All transitions leaving from
this state have the same back reference "assertion". Instead
of reading the next character, we match the back reference. */
- int so, eo, bt = trans_i->u.backref;
- int bt_len;
+ regoff_t so, eo;
+ int bt = trans_i->u.backref;
+ regoff_t bt_len;
int result;
/* Get the substring we need to match against. Remember to
@@ -913,7 +915,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
endpoint values. */
static void
tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- const tre_tnfa_t *tnfa, int *tags, int match_eo)
+ const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
{
tre_submatch_data_t *submatch_data;
unsigned int i, j;
@@ -983,7 +985,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
{
tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
reg_errcode_t status;
- int *tags = NULL, eo;
+ regoff_t *tags = NULL, eo;
if (tnfa->cflags & REG_NOSUB) nmatch = 0;
if (tnfa->num_tags > 0 && nmatch > 0)
{
[-- Attachment #3: regexec_huge_haystack.c --]
[-- Type: text/plain, Size: 344 bytes --]
#include <regex.h>
#include <stdio.h>
void *alloc_huge(size_t n, int c);
int main()
{
size_t len = (1ULL<<32) + 65536;
char *h = alloc_huge(len, 'a');
h[len-100] = 'b';
h[len-1] = 0;
regex_t re;
regmatch_t rm;
regcomp(&re, "b", REG_EXTENDED);
int ret = regexec(&re, h, 1, &rm, 0);
printf("%d %zd %zd\n", ret, rm.rm_so, rm.rm_eo);
}
[-- Attachment #4: huge.c --]
[-- Type: text/plain, Size: 1108 bytes --]
#define _POSIX_C_SOURCE 200809L
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#ifndef PAGE_SIZE
#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
#endif
#define BLOCK (128*PAGE_SIZE)
void *alloc_huge(size_t n, int c)
{
char *vm;
size_t i;
int fd;
FILE *f;
f = tmpfile();
if (!f) return 0;
fd = fileno(f);
if (n & (BLOCK-1)) n = (n & -BLOCK) + BLOCK;
vm = mmap(0, n, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (vm == MAP_FAILED) goto fail1;
if (ftruncate(fd, 128*PAGE_SIZE) < 0)
goto fail2;
for (i=0; i<n; i+=BLOCK)
if (mmap(vm+i, BLOCK, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED)
goto fail2;
if (mmap(vm, BLOCK, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED)
goto fail2;
if (n>BLOCK && mmap(vm+n-2*BLOCK, 2*BLOCK, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED)
goto fail2;
memset(vm, c, BLOCK);
if (n > BLOCK) {
memset(vm+BLOCK, c, BLOCK);
memset(vm+n-2*BLOCK, c, 2*BLOCK);
}
return vm;
fail2:
munmap(vm, n);
fail1:
fclose(f);
return 0;
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2016-10-06 6:02 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-06 6:02 [PATCH] fix regexec with haystack strings longer than INT_MAX Rich Felker
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).