mailing list of musl libc
 help / color / mirror / code / Atom feed
* [PATCH] fix regexec with haystack strings longer than INT_MAX
@ 2016-10-06  6:02 Rich Felker
  0 siblings, 0 replies; only message in thread
From: Rich Felker @ 2016-10-06  6:02 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 648 bytes --]

We inherited from TRE regexec code that's utterly wrong with respect
to the integer types it's using; while it doesn't appear to be unsafe,
it fails to find matches past offset INT_MAX. This patch fixes the
type of all variables/fields used to store offsets in the string from
int to regoff_t, and seems to fix the problem, though it has not been
heavily tested yet.

I've also attached a test program suitable for demonstrating the bug
and at least one case where the fix works. It uses my (also attached)
alloc_huge function which allows testing >4GB inputs to string
functions without the need for huge amounts of physical memory or
swap.

Rich

[-- Attachment #2: regpos2.diff --]
[-- Type: text/plain, Size: 5841 bytes --]

diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 16c5d0a..28b8a3d 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -43,7 +43,7 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, int *tags, int match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
 
 /***********************************************************************
  from tre-match-utils.h
@@ -96,7 +96,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
 static int
 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
-	      int *t1, int *t2)
+	      regoff_t *t1, regoff_t *t2)
 {
   int i;
   for (i = 0; i < num_tags; i++)
@@ -156,25 +156,25 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
 
 typedef struct {
   tre_tnfa_transition_t *state;
-  int *tags;
+  regoff_t *tags;
 } tre_tnfa_reach_t;
 
 typedef struct {
-  int pos;
-  int **tags;
+  regoff_t pos;
+  regoff_t **tags;
 } tre_reach_pos_t;
 
 
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
-		      int *match_tags, int eflags,
-		      int *match_end_ofs)
+		      regoff_t *match_tags, int eflags,
+		      regoff_t *match_end_ofs)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
   const char *str_byte = string;
-  int pos = -1;
-  int pos_add_next = 1;
+  regoff_t pos = -1;
+  regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
   mbstate_t mbstate;
 #endif /* TRE_MBSTATE */
@@ -190,10 +190,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
   int *tag_i;
   int num_tags, i;
 
-  int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
+  regoff_t match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
   int new_match = 0;
-  int *tmp_tags = NULL;
-  int *tmp_iptr;
+  regoff_t *tmp_tags = NULL;
+  regoff_t *tmp_iptr;
 
 #ifdef TRE_MBSTATE
   memset(&mbstate, '\0', sizeof(mbstate));
@@ -209,22 +209,22 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
      everything in a single large block from the stack frame using alloca()
      or with malloc() if alloca is unavailable. */
   {
-    int tbytes, rbytes, pbytes, xbytes, total_bytes;
+    size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
     char *tmp_buf;
     /* Compute the length of the block we need. */
     tbytes = sizeof(*tmp_tags) * num_tags;
     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
     pbytes = sizeof(*reach_pos) * tnfa->num_states;
-    xbytes = sizeof(int) * num_tags;
+    xbytes = sizeof(regoff_t) * num_tags;
     total_bytes =
       (sizeof(long) - 1) * 4 /* for alignment paddings */
       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
 
     /* Allocate the memory. */
-    buf = xmalloc((unsigned)total_bytes);
+    buf = xmalloc(total_bytes);
     if (buf == NULL)
       return REG_ESPACE;
-    memset(buf, 0, (size_t)total_bytes);
+    memset(buf, 0, total_bytes);
 
     /* Get the various pointers within tmp_buf (properly aligned). */
     tmp_tags = (void *)buf;
@@ -477,12 +477,12 @@ error_exit:
 */
 
 typedef struct {
-  int pos;
+  regoff_t pos;
   const char *str_byte;
   tre_tnfa_transition_t *state;
   int state_id;
   int next_c;
-  int *tags;
+  regoff_t *tags;
 #ifdef TRE_MBSTATE
   mbstate_t mbstate;
 #endif /* TRE_MBSTATE */
@@ -578,13 +578,13 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       int *match_tags, int eflags, int *match_end_ofs)
+		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
   const char *str_byte = string;
-  int pos = 0;
-  int pos_add_next = 1;
+  regoff_t pos = 0;
+  regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
   mbstate_t mbstate;
 #endif /* TRE_MBSTATE */
@@ -597,15 +597,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
      started from. */
   int next_c_start;
   const char *str_byte_start;
-  int pos_start = -1;
+  regoff_t pos_start = -1;
 #ifdef TRE_MBSTATE
   mbstate_t mbstate_start;
 #endif /* TRE_MBSTATE */
 
   /* End offset of best match so far, or -1 if no match found yet. */
-  int match_eo = -1;
+  regoff_t match_eo = -1;
   /* Tag arrays. */
-  int *next_tags, *tags = NULL;
+  int *next_tags;
+  regoff_t *tags = NULL;
   /* Current TNFA state. */
   tre_tnfa_transition_t *state;
   int *states_seen = NULL;
@@ -755,8 +756,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* This is a back reference state.  All transitions leaving from
 	     this state have the same back reference "assertion".  Instead
 	     of reading the next character, we match the back reference. */
-	  int so, eo, bt = trans_i->u.backref;
-	  int bt_len;
+	  regoff_t so, eo;
+	  int bt = trans_i->u.backref;
+	  regoff_t bt_len;
 	  int result;
 
 	  /* Get the substring we need to match against.  Remember to
@@ -913,7 +915,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, int *tags, int match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
 {
   tre_submatch_data_t *submatch_data;
   unsigned int i, j;
@@ -983,7 +985,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
-  int *tags = NULL, eo;
+  regoff_t *tags = NULL, eo;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {

[-- Attachment #3: regexec_huge_haystack.c --]
[-- Type: text/plain, Size: 344 bytes --]

#include <regex.h>
#include <stdio.h>

void *alloc_huge(size_t n, int c);

int main()
{
	size_t len = (1ULL<<32) + 65536;
	char *h = alloc_huge(len, 'a');
	h[len-100] = 'b';
	h[len-1] = 0;

	regex_t re;
	regmatch_t rm;
	regcomp(&re, "b", REG_EXTENDED);
	int ret = regexec(&re, h, 1, &rm, 0);
	printf("%d %zd %zd\n", ret, rm.rm_so, rm.rm_eo);
}

[-- Attachment #4: huge.c --]
[-- Type: text/plain, Size: 1108 bytes --]

#define _POSIX_C_SOURCE 200809L
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
#endif

#define BLOCK (128*PAGE_SIZE)

void *alloc_huge(size_t n, int c)
{
	char *vm;
	size_t i;
	int fd;
	FILE *f;

	f = tmpfile();
	if (!f) return 0;
	fd = fileno(f);

	if (n & (BLOCK-1)) n = (n & -BLOCK) + BLOCK;

	vm = mmap(0, n, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	if (vm == MAP_FAILED) goto fail1;

	if (ftruncate(fd, 128*PAGE_SIZE) < 0)
		goto fail2;
	for (i=0; i<n; i+=BLOCK)
		if (mmap(vm+i, BLOCK, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED)
			goto fail2;
	if (mmap(vm, BLOCK, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED)
		goto fail2;
	if (n>BLOCK && mmap(vm+n-2*BLOCK, 2*BLOCK, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED)
		goto fail2;

	memset(vm, c, BLOCK);
	if (n > BLOCK) {
		memset(vm+BLOCK, c, BLOCK);
		memset(vm+n-2*BLOCK, c, 2*BLOCK);
	}

	return vm;

fail2:
	munmap(vm, n);
fail1:
	fclose(f);
	return 0;
}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2016-10-06  6:02 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-06  6:02 [PATCH] fix regexec with haystack strings longer than INT_MAX Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).