From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/10589 Path: news.gmane.org!.POSTED!not-for-mail From: Rich Felker Newsgroups: gmane.linux.lib.musl.general Subject: [PATCH] fix regexec with haystack strings longer than INT_MAX Date: Thu, 6 Oct 2016 02:02:07 -0400 Message-ID: <20161006060207.GA7634@brightrain.aerifal.cx> Reply-To: musl@lists.openwall.com NNTP-Posting-Host: blaine.gmane.org Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="8t9RHnE3ZwKMSgU+" X-Trace: blaine.gmane.org 1475733757 17657 195.159.176.226 (6 Oct 2016 06:02:37 GMT) X-Complaints-To: usenet@blaine.gmane.org NNTP-Posting-Date: Thu, 6 Oct 2016 06:02:37 +0000 (UTC) User-Agent: Mutt/1.5.21 (2010-09-15) To: musl@lists.openwall.com Original-X-From: musl-return-10602-gllmg-musl=m.gmane.org@lists.openwall.com Thu Oct 06 08:02:33 2016 Return-path: Envelope-to: gllmg-musl@m.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by blaine.gmane.org with smtp (Exim 4.84_2) (envelope-from ) id 1bs1l8-00036G-Rb for gllmg-musl@m.gmane.org; Thu, 06 Oct 2016 08:02:22 +0200 Original-Received: (qmail 14145 invoked by uid 550); 6 Oct 2016 06:02:22 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Original-Received: (qmail 14097 invoked from network); 6 Oct 2016 06:02:20 -0000 Content-Disposition: inline Original-Sender: Rich Felker Xref: news.gmane.org gmane.linux.lib.musl.general:10589 Archived-At: --8t9RHnE3ZwKMSgU+ Content-Type: text/plain; charset=us-ascii Content-Disposition: inline We inherited from TRE regexec code that's utterly wrong with respect to the integer types it's using; while it doesn't appear to be unsafe, it fails to find matches past offset INT_MAX. This patch fixes the type of all variables/fields used to store offsets in the string from int to regoff_t, and seems to fix the problem, though it has not been heavily tested yet. I've also attached a test program suitable for demonstrating the bug and at least one case where the fix works. It uses my (also attached) alloc_huge function which allows testing >4GB inputs to string functions without the need for huge amounts of physical memory or swap. Rich --8t9RHnE3ZwKMSgU+ Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="regpos2.diff" diff --git a/src/regex/regexec.c b/src/regex/regexec.c index 16c5d0a..28b8a3d 100644 --- a/src/regex/regexec.c +++ b/src/regex/regexec.c @@ -43,7 +43,7 @@ static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, int *tags, int match_eo); + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo); /*********************************************************************** from tre-match-utils.h @@ -96,7 +96,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, /* Returns 1 if `t1' wins `t2', 0 otherwise. */ static int tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, - int *t1, int *t2) + regoff_t *t1, regoff_t *t2) { int i; for (i = 0; i < num_tags; i++) @@ -156,25 +156,25 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) typedef struct { tre_tnfa_transition_t *state; - int *tags; + regoff_t *tags; } tre_tnfa_reach_t; typedef struct { - int pos; - int **tags; + regoff_t pos; + regoff_t **tags; } tre_reach_pos_t; static reg_errcode_t tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, - int *match_tags, int eflags, - int *match_end_ofs) + regoff_t *match_tags, int eflags, + regoff_t *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c = 0, next_c = 0; const char *str_byte = string; - int pos = -1; - int pos_add_next = 1; + regoff_t pos = -1; + regoff_t pos_add_next = 1; #ifdef TRE_MBSTATE mbstate_t mbstate; #endif /* TRE_MBSTATE */ @@ -190,10 +190,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int *tag_i; int num_tags, i; - int match_eo = -1; /* end offset of match (-1 if no match found yet) */ + regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */ int new_match = 0; - int *tmp_tags = NULL; - int *tmp_iptr; + regoff_t *tmp_tags = NULL; + regoff_t *tmp_iptr; #ifdef TRE_MBSTATE memset(&mbstate, '\0', sizeof(mbstate)); @@ -209,22 +209,22 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, everything in a single large block from the stack frame using alloca() or with malloc() if alloca is unavailable. */ { - int tbytes, rbytes, pbytes, xbytes, total_bytes; + size_t tbytes, rbytes, pbytes, xbytes, total_bytes; char *tmp_buf; /* Compute the length of the block we need. */ tbytes = sizeof(*tmp_tags) * num_tags; rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); pbytes = sizeof(*reach_pos) * tnfa->num_states; - xbytes = sizeof(int) * num_tags; + xbytes = sizeof(regoff_t) * num_tags; total_bytes = (sizeof(long) - 1) * 4 /* for alignment paddings */ + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; /* Allocate the memory. */ - buf = xmalloc((unsigned)total_bytes); + buf = xmalloc(total_bytes); if (buf == NULL) return REG_ESPACE; - memset(buf, 0, (size_t)total_bytes); + memset(buf, 0, total_bytes); /* Get the various pointers within tmp_buf (properly aligned). */ tmp_tags = (void *)buf; @@ -477,12 +477,12 @@ error_exit: */ typedef struct { - int pos; + regoff_t pos; const char *str_byte; tre_tnfa_transition_t *state; int state_id; int next_c; - int *tags; + regoff_t *tags; #ifdef TRE_MBSTATE mbstate_t mbstate; #endif /* TRE_MBSTATE */ @@ -578,13 +578,13 @@ typedef struct tre_backtrack_struct { static reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - int *match_tags, int eflags, int *match_end_ofs) + regoff_t *match_tags, int eflags, regoff_t *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c = 0, next_c = 0; const char *str_byte = string; - int pos = 0; - int pos_add_next = 1; + regoff_t pos = 0; + regoff_t pos_add_next = 1; #ifdef TRE_MBSTATE mbstate_t mbstate; #endif /* TRE_MBSTATE */ @@ -597,15 +597,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, started from. */ int next_c_start; const char *str_byte_start; - int pos_start = -1; + regoff_t pos_start = -1; #ifdef TRE_MBSTATE mbstate_t mbstate_start; #endif /* TRE_MBSTATE */ /* End offset of best match so far, or -1 if no match found yet. */ - int match_eo = -1; + regoff_t match_eo = -1; /* Tag arrays. */ - int *next_tags, *tags = NULL; + int *next_tags; + regoff_t *tags = NULL; /* Current TNFA state. */ tre_tnfa_transition_t *state; int *states_seen = NULL; @@ -755,8 +756,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* This is a back reference state. All transitions leaving from this state have the same back reference "assertion". Instead of reading the next character, we match the back reference. */ - int so, eo, bt = trans_i->u.backref; - int bt_len; + regoff_t so, eo; + int bt = trans_i->u.backref; + regoff_t bt_len; int result; /* Get the substring we need to match against. Remember to @@ -913,7 +915,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, endpoint values. */ static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, int *tags, int match_eo) + const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo) { tre_submatch_data_t *submatch_data; unsigned int i, j; @@ -983,7 +985,7 @@ regexec(const regex_t *restrict preg, const char *restrict string, { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; reg_errcode_t status; - int *tags = NULL, eo; + regoff_t *tags = NULL, eo; if (tnfa->cflags & REG_NOSUB) nmatch = 0; if (tnfa->num_tags > 0 && nmatch > 0) { --8t9RHnE3ZwKMSgU+ Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="regexec_huge_haystack.c" #include #include void *alloc_huge(size_t n, int c); int main() { size_t len = (1ULL<<32) + 65536; char *h = alloc_huge(len, 'a'); h[len-100] = 'b'; h[len-1] = 0; regex_t re; regmatch_t rm; regcomp(&re, "b", REG_EXTENDED); int ret = regexec(&re, h, 1, &rm, 0); printf("%d %zd %zd\n", ret, rm.rm_so, rm.rm_eo); } --8t9RHnE3ZwKMSgU+ Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="huge.c" #define _POSIX_C_SOURCE 200809L #include #include #include #include #ifndef PAGE_SIZE #define PAGE_SIZE sysconf(_SC_PAGE_SIZE) #endif #define BLOCK (128*PAGE_SIZE) void *alloc_huge(size_t n, int c) { char *vm; size_t i; int fd; FILE *f; f = tmpfile(); if (!f) return 0; fd = fileno(f); if (n & (BLOCK-1)) n = (n & -BLOCK) + BLOCK; vm = mmap(0, n, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (vm == MAP_FAILED) goto fail1; if (ftruncate(fd, 128*PAGE_SIZE) < 0) goto fail2; for (i=0; iBLOCK && mmap(vm+n-2*BLOCK, 2*BLOCK, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED) goto fail2; memset(vm, c, BLOCK); if (n > BLOCK) { memset(vm+BLOCK, c, BLOCK); memset(vm+n-2*BLOCK, c, 2*BLOCK); } return vm; fail2: munmap(vm, n); fail1: fclose(f); return 0; } --8t9RHnE3ZwKMSgU+--