Re: Issues in mbsnrtowcs and wcsnrtombs

mailing list of musl libc
 help / color / mirror / code / Atom feed

From: Mikhail Kremnyov <mkremnyov@gmail.com>
To: musl@lists.openwall.com
Subject: Re: Issues in mbsnrtowcs and wcsnrtombs
Date: Wed, 9 Aug 2017 20:57:27 +0300	[thread overview]
Message-ID: <dfc33583-c665-05cd-9847-9c06c4708c8c@gmail.com> (raw)
In-Reply-To: <c23a73f5-34b4-99e8-786f-622ae42d41e8@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2380 bytes --]

Hi,

Here are two patches, one (for libc_test) adds a couple of tests to
reproduce the issues, the other one fixes them.

Mikhail.

On 07/18/2017 11:05 PM, Mikhail Kremnyov wrote:
> Hi,
>
> It looks like there are some bugs in the implementations of mbsnrtowcs
> and wcsnrtombs.
> E.g. inside mbsnrtowcs there is this code:
>
>     while ( s && wn && ( (n2=n/4)>=wn || n2>32 ) ) {
>         if (n2>=wn) n2=wn;
>         n -= n2;
>         l = mbsrtowcs(ws, &s, n2, st);
>
> Here "n" is the number of source bytes to convert and "n2" is the number
> of wide chars that may be put to the destination, so it's incorrect to
> subtract one from another. And indeed a simple test shows that the
> function doesn't work correctly if long enough non-ascii string is
> passed to it. E.g.:
>
>     const std::string origStr =
> u8"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ";
>     const std::string srcStr = origStr + u8"їґіє";
>
>     std::mbstate_t st = {};
>     const char* srcPtr = &srcStr[0];
>     std::wstring dest(srcStr.length() + 1, wchar_t(0));
>
>     auto res = mbsnrtowcs(&dest[0], &srcPtr, origStr.length(),
> dest.length(), &st);
>
>     std::cout << "res = " << res << ", srcPtr = " << (void*)srcPtr <<
> std::endl;
>
> And the output is:
>     res = 70, srcPtr = 0
>
> Here mbsnrtowcs was told to convert only "origStr.length()" number of
> bytes, which contain 66 2-byte characters, but it converted 70, stopping
> only after the zero char was met.
>
> A similar problem happens with wcsnrtombs using a slightly longer string:
>
>     std::wstring srcStr =
> L"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя";
>
>     const wchar_t* srcPtr = &srcStr[0];
>     std::mbstate_t st = {};
>     std::string dest(srcStr.length() * 4 + 1, char(0));
>
>     auto res = wcsnrtombs(&dest[0], &srcPtr, srcStr.length(),
> dest.length(), &st);
>
>     std::cout << "res = " << res << ", dest = " << dest << std::endl;
>
> The output:
>     res = 98, dest = абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНО
>    
> I.e. it only converted 49 characters instead of 99.
>
>
> Mikhail.
>
>


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: libc_test.diff --]
[-- Type: text/x-patch; name="libc_test.diff", Size: 2822 bytes --]

--- ./src/regression/mbsnrtowcs-overread.c	1970-01-01 03:00:00.000000000 +0300
+++ ./src/regression/mbsnrtowcs-overread.c	2017-08-09 20:20:29.472003066 +0300
@@ -0,0 +1,45 @@
+// mbsnrtowcs issue, reported in www.openwall.com/lists/musl/2017/07/18/3
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+#include "test.h"
+
+int main(void)
+{
+	const char *const chr = "\u044B";
+	const int chr_size = strlen(chr);
+	// The passed length of the source string in bytes should be bigger than
+	// 32*4 to force mbsnrtowcs to use the optimization based on mbsrtowcs.
+	const int chr_count_to_convert = 1000;
+	// Make sure that the source string has more characters after the passed
+	// length.
+	const int chr_count = chr_count_to_convert + 10;
+	
+	char src[chr_count * chr_size + 1];
+	// dest should also have extra space
+	wchar_t dest[chr_count + 1];
+	size_t r;
+	const char *str_ptr = src;
+	mbstate_t mbs;
+
+	for (int i = 0; i < chr_count; ++i)
+	{
+		memcpy(src + i * chr_size, chr, chr_size);
+	}
+	src[chr_count * chr_size] = 0;
+ 
+	setlocale(LC_CTYPE, "en_US.UTF-8");
+
+	memset(&mbs, 0, sizeof(mbs));
+	r = mbsnrtowcs(dest, &str_ptr, chr_count_to_convert * chr_size,
+	               sizeof(dest)/sizeof(dest[0]), &mbs);
+
+	if (r != chr_count_to_convert)
+	{
+		t_error("Expected to convert %d characters, but converted %d\n",
+			chr_count_to_convert, r);
+	}
+
+	return t_status;
+}
--- ./src/regression/wcsnrtombs_underread.c	1970-01-01 03:00:00.000000000 +0300
+++ ./src/regression/wcsnrtombs_underread.c	2017-08-09 20:24:57.575995227 +0300
@@ -0,0 +1,46 @@
+// wcsnrtombs issue, reported in www.openwall.com/lists/musl/2017/07/18/3
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+#include "test.h"
+
+#define TEST_CHR "\u044B"
+
+#define CAT_IMPL(x, y) x##y
+#define CAT(x, y) CAT_IMPL(x, y)
+
+int main(void)
+{
+	const wchar_t *const chr = CAT(L, TEST_CHR);
+	const int chr_len_in_utf_8 = strlen(TEST_CHR);
+	const int chr_size = wcslen(chr);
+	// The number of characters should be greater than 32 to force wcsnrtombs
+	// to use the optimization based on wcsrtombs.
+	const int chr_count = 1000;
+	wchar_t src[chr_count];
+	char dest[chr_count * 4];
+	size_t r;
+	const wchar_t *str_ptr = src;
+	mbstate_t mbs;
+
+	for (int i = 0; i < chr_count; ++i)
+	{
+		memcpy(src + i, chr, sizeof(*chr));
+	}
+	src[chr_count] = 0;
+
+	setlocale(LC_CTYPE, "en_US.UTF-8");
+
+	memset(&mbs, 0, sizeof(mbs));
+	r = wcsnrtombs(dest, &str_ptr, chr_count,
+				   sizeof(dest), &mbs);
+
+	if (r != chr_count * chr_len_in_utf_8)
+	{
+		t_error("Expected to fill %d bytes, but filled %d\n",
+			chr_count * chr_len_in_utf_8, r);
+	}
+
+	return t_status;
+}

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: musl.diff --]
[-- Type: text/x-patch; name="musl.diff", Size: 1284 bytes --]

--- ./src/multibyte/mbsnrtowcs.c	2017-08-08 16:19:29.311584832 +0300
+++ ./src/multibyte/mbsnrtowcs.c	2017-08-09 20:33:27.515980317 +0300
@@ -5,6 +5,7 @@
 	size_t l, cnt=0, n2;
 	wchar_t *ws, wbuf[256];
 	const char *s = *src;
+	const char *tmp_s;
 
 	if (!wcs) ws = wbuf, wn = sizeof wbuf / sizeof *wbuf;
 	else ws = wcs;
@@ -15,7 +16,7 @@
 
 	while ( s && wn && ( (n2=n/4)>=wn || n2>32 ) ) {
 		if (n2>=wn) n2=wn;
-		n -= n2;
+		tmp_s = s;
 		l = mbsrtowcs(ws, &s, n2, st);
 		if (!(l+1)) {
 			cnt = l;
@@ -26,6 +27,7 @@
 			ws += l;
 			wn -= l;
 		}
+		n = s ? n - (s - tmp_s) : 0;
 		cnt += l;
 	}
 	if (s) while (wn && n) {
--- ./src/multibyte/wcsnrtombs.c	2017-08-08 16:19:29.311584832 +0300
+++ ./src/multibyte/wcsnrtombs.c	2017-08-09 20:34:17.479978856 +0300
@@ -5,13 +5,14 @@
 	size_t l, cnt=0, n2;
 	char *s, buf[256];
 	const wchar_t *ws = *wcs;
+	const wchar_t *tmp_ws;
 
 	if (!dst) s = buf, n = sizeof buf;
 	else s = dst;
 
 	while ( ws && n && ( (n2=wn)>=n || n2>32 ) ) {
 		if (n2>=n) n2=n;
-		wn -= n2;
+		tmp_ws = ws;
 		l = wcsrtombs(s, &ws, n2, 0);
 		if (!(l+1)) {
 			cnt = l;
@@ -22,6 +23,7 @@
 			s += l;
 			n -= l;
 		}
+		wn = ws ? wn - (ws - tmp_ws) : 0;
 		cnt += l;
 	}
 	if (ws) while (n && wn) {

next prev parent reply	other threads:[~2017-08-09 17:57 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-07-18 20:05 Mikhail Kremnyov
2017-08-09 17:57 ` Mikhail Kremnyov [this message]
2017-08-12  0:31   ` Rich Felker
2017-08-31 18:28 ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=dfc33583-c665-05cd-9847-9c06c4708c8c@gmail.com \
    --to=mkremnyov@gmail.com \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).