mailing list of musl libc
 help / color / mirror / code / Atom feed
* [musl] UTF-7 decoder in iconv
@ 2024-08-31  8:52 Joakim Sindholt
  0 siblings, 0 replies; only message in thread
From: Joakim Sindholt @ 2024-08-31  8:52 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1093 bytes --]

I'm not sure how common this is, but I have a correspondent who sends
mail in UTF-7, a truly magnificent piece of horror, and my claws-mail
can't render it on musl. I've written this patch based on RFC 2152,
though I'm not entirely certain I've covered all bases. I also find the
code to be extremely kludgy, as I was trying not to add any more
variables to the top of iconv().
The basic premise is that scd->state contains a single bit to determine
whether it's currently in a base64-encoded escape sequence, as well as
how many (decoded) bits it has consumed from the byte it's currently
looking at.
If I understand correctly, encodings with an ID above 0330 are only
supported for decoding. I have no need for encoding UTF-7, but I can
write an encoder if that's desired. However, I don't think this
decoder is in good enough condition to be committed as-is, so I'm
submitting it here for comments. I'm guessing that Rich might like it if
the base64 table had all the -1 values at both ends chopped off.
It does work with claws-mail and some more contrived and evil tests I've
thrown at it.

[-- Attachment #2: iconv.diff --]
[-- Type: text/x-patch, Size: 3807 bytes --]

diff --git a/src/locale/base64.h b/src/locale/base64.h
new file mode 100644
index 00000000..866df450
--- /dev/null
+++ b/src/locale/base64.h
@@ -0,0 +1,8 @@
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
+52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
+-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
+-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 7fb2e1ef..50ec69a1 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -27,6 +27,7 @@
 #define GB2312      0332
 #define BIG5        0340
 #define EUC_KR      0350
+#define UTF_7       0360
 
 /* Definitions of charmaps. Each charmap consists of:
  * 1. Empty-string-terminated list of null-terminated aliases.
@@ -56,6 +57,7 @@ static const unsigned char charmaps[] =
 "gb2312\0\0\332"
 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
+"utf7\0\0\360"
 #include "codepages.h"
 ;
 
@@ -90,6 +92,14 @@ static const unsigned short rev_jis[] = {
 #include "revjis.h"
 };
 
+static const unsigned int utf7[] = {
+#include "utf7.h"
+};
+
+static const signed char base64[] = {
+#include "base64.h"
+};
+
 static int fuzzycmp(const unsigned char *a, const unsigned char *b)
 {
 	for (; *a && *b; a++, b++) {
@@ -151,6 +161,7 @@ iconv_t iconv_open(const char *to, const char *from)
 	iconv_t cd = combine_to_from(t, f);
 
 	switch (charmaps[f]) {
+	case UTF_7:
 	case UTF_16:
 	case UTF_32:
 	case UCS2:
@@ -224,6 +235,17 @@ static unsigned uni_to_jis(unsigned c)
 	}
 }
 
+static unsigned base64d(const char *s, unsigned n) /* decode n base64 chars at s, 6 bits each, first char most significant; -1U if any char is invalid */
+{
+	unsigned c, r = 0;
+	while (n--) {
+		c = *(unsigned char *)s++;
+		if (c >= 128 || base64[c] < 0) return -1U; /* base64[] has 128 entries, so 128 itself must be rejected too */
+		r = r<<6|base64[c];
+	}
+	return r;
+}
+
 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
 {
 	size_t x=0;
@@ -319,6 +341,56 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			}
 			type = scd->state;
 			continue;
+		case UTF_7:
+			l = 1;
+			if (c >= 128) goto ilseq;
+			if (!scd->state) {
+				if (!(utf7[c/32]&(1<<c%32))) goto ilseq;
+				if (c != '+') break;
+				if (*inb < 2) goto starved;
+				if ((*in)[1] == '-') {
+					l = 2;
+					break;
+				}
+			}
+			if (base64[c] < 0) {
+				scd->state = 0;
+				if (c != '-') l = 0;
+				continue;
+			}
+			if (*inb < 2) goto starved;
+			d = *((unsigned char *)*in+1);
+			if (d >= 128) goto ilseq;
+			if (base64[d] < 0) {
+				k = scd->state>>1;
+				if (k == 0 || (base64[c]&(1<<6-k)-1)) goto ilseq;
+				scd->state = 0;
+				if (d == '-') l = 2;
+				continue;
+			}
+			l = 3+(6*3-(scd->state>>1)<16);
+			if (*inb < l+!scd->state) goto starved;
+			c = base64d(*in+!scd->state, l);
+			if (c == -1U) goto ilseq;
+			k = 6*l-(scd->state>>1)-16;
+			c = c>>k&0xffffu;
+			if (k) l--; else k = 6;
+			if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
+			if ((unsigned)(c-0xd800) < 0x400) {
+				k = 3+(6*3-(6-k)<16);
+				if (*inb < !scd->state+l+k) goto starved;
+				d = base64d(*in+!scd->state+l, k);
+				if (d == -1U) goto ilseq;
+				l += k;
+				k = 6*l-(scd->state>>1)-32;
+				d = d>>k&0xffffu;
+				if (k) l--; else k = 6;
+				if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
+				c = ((c-0xd7c0)<<10) + (d-0xdc00);
+			}
+			l += !scd->state;
+			scd->state = (6-k)<<1|1;
+			break;
 		case SHIFT_JIS:
 			if (c < 128) break;
 			if (c-0xa1 <= 0xdf-0xa1) {
diff --git a/src/locale/utf7.h b/src/locale/utf7.h
new file mode 100644
index 00000000..b04a7490
--- /dev/null
+++ b/src/locale/utf7.h
@@ -0,0 +1 @@
+9728,4294967295,4026531839,1073741823

[-- Attachment #3: utf7.c --]
[-- Type: text/x-c++src, Size: 839 bytes --]

#include <stdio.h>

/*
 * Character codes, besides the letters, digits and '+' that main() adds
 * itself, whose bits get set in the generated 128-bit map.
 * NOTE(review): the group labels below presumably refer to RFC 2152
 * (UTF-7) -- confirm the membership of each set against the RFC.
 */
static const char add[] = {
	/* Rule 3 */
	32, // SPACE
	 9, // TAB
	13, // CR
	10, // LF
	/* Set D */
	39, // '
	40, // (
	41, // )
	44, // ,
	45, // -
	46, // .
	47, // /
	58, // :
	63, // ?
	/* Set O */
	33, // !
	34, // "
	35, // #
	36, // $
	37, // %
	38, // &
	42, // *
	59, // ;
	60, // <
	61, // =
	62, // >
	64, // @
	91, // [
	93, // ]
	94, // ^
	95, // _
	96, // ` (grave accent)
	123,// {
	124,// |
	125,// }
};

/* Set the bit for character code ch in a bitmap made of 32-bit words. */
static void mark(unsigned int *bits, int ch)
{
	bits[ch/32] |= 1U<<ch%32;
}

/*
 * Build a 128-bit map with one bit per 7-bit character code: set for
 * ASCII letters, digits, every entry of add[], and '+'. Print the four
 * 32-bit words as unsigned decimals, each followed by a comma (no
 * trailing newline). The command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
	unsigned int bits[128/32] = {0};
	size_t n;
	int ch;

	ch = 'A';
	while (ch <= 'Z')
		mark(bits, ch++);
	ch = 'a';
	while (ch <= 'z')
		mark(bits, ch++);
	ch = '0';
	while (ch <= '9')
		mark(bits, ch++);

	/* Punctuation and whitespace listed in the add[] table. */
	for (n = 0; n < sizeof(add); n++)
		mark(bits, add[n]);

	mark(bits, '+');

	for (n = 0; n < 128/32; n++)
		printf("%u,", bits[n]);
	return 0;
}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-08-31  8:53 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-31  8:52 [musl] UTF-7 decoder in iconv Joakim Sindholt

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).