From: Joakim Sindholt <opensource@zhasha.com>
To: musl@lists.openwall.com
Subject: [musl] UTF-7 decoder in iconv
Date: Sat, 31 Aug 2024 10:52:43 +0200 [thread overview]
Message-ID: <20240831105243.2a396d84@eclair> (raw)
[-- Attachment #1: Type: text/plain, Size: 1093 bytes --]
I'm not sure how common this is, but I have a correspondent who sends
mail in UTF-7, a truly magnificent piece of horror, and my claws-mail
can't render it on musl. I've written this patch based on RFC 2152,
though I'm not entirely certain I've covered all bases. I also find the
code to be extremely kludgy, as I was trying not to add any more
variables to the top of iconv().
The basic premise is that scd->state contains a single bit to determine
whether it's currently in a base64-encoded escape sequence, as well as
how many (decoded) bits it has consumed from the byte it's currently
looking at.
If I understand correctly, encodings with an ID above 0330 are only
supported for decoding. I have no need for encoding UTF-7, but I can
write an encoder if that's desired. However, I don't even think this
decoder is in good enough condition to be committed as-is, so I'm
submitting it here for comments. I'm guessing that Rich might like it if
the base64 table had all the -1 values at both ends chopped off.
It does work with claws-mail and some more contrived and evil tests I've
thrown at it.
[-- Attachment #2: iconv.diff --]
[-- Type: text/x-patch, Size: 3807 bytes --]
diff --git a/src/locale/base64.h b/src/locale/base64.h
new file mode 100644
index 00000000..866df450
--- /dev/null
+++ b/src/locale/base64.h
@@ -0,0 +1,8 @@
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
+52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
+-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
+-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 7fb2e1ef..50ec69a1 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -27,6 +27,7 @@
#define GB2312 0332
#define BIG5 0340
#define EUC_KR 0350
+#define UTF_7 0360
/* Definitions of charmaps. Each charmap consists of:
* 1. Empty-string-terminated list of null-terminated aliases.
@@ -56,6 +57,7 @@ static const unsigned char charmaps[] =
"gb2312\0\0\332"
"big5\0bigfive\0cp950\0big5hkscs\0\0\340"
"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
+"utf7\0\0\360"
#include "codepages.h"
;
@@ -90,6 +92,14 @@ static const unsigned short rev_jis[] = {
#include "revjis.h"
};
+static const unsigned int utf7[] = {
+#include "utf7.h"
+};
+
+static const signed char base64[] = {
+#include "base64.h"
+};
+
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
for (; *a && *b; a++, b++) {
@@ -151,6 +161,7 @@ iconv_t iconv_open(const char *to, const char *from)
iconv_t cd = combine_to_from(t, f);
switch (charmaps[f]) {
+ case UTF_7:
case UTF_16:
case UTF_32:
case UCS2:
@@ -224,6 +235,17 @@ static unsigned uni_to_jis(unsigned c)
}
}
+/* Decode n consecutive base64 digits starting at s into one integer,
+ * first digit most significant (6 bits per digit). Returns -1U as soon
+ * as a byte that is not a base64 digit is encountered. */
+static unsigned base64d(const char *s, unsigned n)
+{
+	unsigned c, r = 0;
+	while (n--) {
+		c = *(const unsigned char *)s++;
+		/* base64[] has exactly 128 entries (indices 0..127), so
+		 * 128 itself must be rejected before indexing the table. */
+		if (c >= 128 || base64[c] < 0) return -1U;
+		r = r<<6|base64[c];
+	}
+	return r;
+}
+
size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
{
size_t x=0;
@@ -319,6 +341,56 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
}
type = scd->state;
continue;
+ case UTF_7:
+ l = 1;
+ if (c >= 128) goto ilseq;
+ if (!scd->state) {
+ if (!(utf7[c/32]&(1<<c%32))) goto ilseq;
+ if (c != '+') break;
+ if (*inb < 2) goto starved;
+ if ((*in)[1] == '-') {
+ l = 2;
+ break;
+ }
+ }
+ if (base64[c] < 0) {
+ scd->state = 0;
+ if (c != '-') l = 0;
+ continue;
+ }
+ if (*inb < 2) goto starved;
+ d = *((unsigned char *)*in+1);
+ if (d >= 128) goto ilseq;
+ if (base64[d] < 0) {
+ k = scd->state>>1;
+ if (k == 0 || (base64[c]&(1<<6-k)-1)) goto ilseq;
+ scd->state = 0;
+ if (d == '-') l = 2;
+ continue;
+ }
+ l = 3+(6*3-(scd->state>>1)<16);
+ if (*inb < l+!scd->state) goto starved;
+ c = base64d(*in+!scd->state, l);
+ if (c == -1U) goto ilseq;
+ k = 6*l-(scd->state>>1)-16;
+ c = c>>k&0xffffu;
+ if (k) l--; else k = 6;
+ if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
+ if ((unsigned)(c-0xd800) < 0x400) {
+ k = 3+(6*3-(6-k)<16);
+ if (*inb < !scd->state+l+k) goto starved;
+ d = base64d(*in+!scd->state+l, k);
+ if (d == -1U) goto ilseq;
+ l += k;
+ k = 6*l-(scd->state>>1)-32;
+ d = d>>k&0xffffu;
+ if (k) l--; else k = 6;
+ if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
+ c = ((c-0xd7c0)<<10) + (d-0xdc00);
+ }
+ l += !scd->state;
+ scd->state = (6-k)<<1|1;
+ break;
case SHIFT_JIS:
if (c < 128) break;
if (c-0xa1 <= 0xdf-0xa1) {
diff --git a/src/locale/utf7.h b/src/locale/utf7.h
new file mode 100644
index 00000000..b04a7490
--- /dev/null
+++ b/src/locale/utf7.h
@@ -0,0 +1 @@
+9728,4294967295,4026531839,1073741823
[-- Attachment #3: utf7.c --]
[-- Type: text/x-c++src, Size: 839 bytes --]
#include <stdio.h>
/* Characters other than letters, digits and '+' whose bits main() sets in
 * the generated bitmap. Grouped as in the original list: "Rule 3",
 * "Set D" and "Set O". The decimal code of each entry is given in the
 * trailing comment. */
static const char add[] = {
	/* Rule 3 */
	' ',  /* 32  SPACE */
	'\t', /* 9   TAB */
	'\r', /* 13  CR */
	'\n', /* 10  LF */

	/* Set D */
	'\'', /* 39 */
	'(',  /* 40 */
	')',  /* 41 */
	',',  /* 44 */
	'-',  /* 45 */
	'.',  /* 46 */
	'/',  /* 47 */
	':',  /* 58 */
	'?',  /* 63 */

	/* Set O */
	'!',  /* 33 */
	'"',  /* 34 */
	'#',  /* 35 */
	'$',  /* 36 */
	'%',  /* 37 */
	'&',  /* 38 */
	'*',  /* 42 */
	';',  /* 59 */
	'<',  /* 60 */
	'=',  /* 61 */
	'>',  /* 62 */
	'@',  /* 64 */
	'[',  /* 91 */
	']',  /* 93 */
	'^',  /* 94 */
	'_',  /* 95 */
	'`',  /* 96  backtick (grave accent), not an apostrophe */
	'{',  /* 123 */
	'|',  /* 124 */
	'}',  /* 125 */
};
/* Set the bit for character code ch in a bitmap made of 32-bit words. */
static void mark(unsigned int *words, int ch)
{
	words[ch / 32] |= 1U << (ch % 32);
}

/* Build a 128-bit bitmap (four 32-bit words, bit ch%32 of word ch/32)
 * with a bit set for every ASCII letter, digit, entry of add[], and '+',
 * then print the four words in decimal, each followed by a comma.
 * NOTE(review): the output is presumably what utf7.h was generated
 * from -- confirm. The command-line arguments are not used. */
int main(int argc, char *argv[])
{
	enum { NWORDS = 128 / 32 };
	unsigned int bitmap[NWORDS] = {0};
	int ch, w;
	size_t k;

	(void)argc;
	(void)argv;

	/* '+' itself, then everything listed in add[] */
	mark(bitmap, '+');
	for (k = 0; k < sizeof(add); k++)
		mark(bitmap, add[k]);

	/* alphanumerics */
	for (ch = '0'; ch <= '9'; ch++)
		mark(bitmap, ch);
	for (ch = 'a'; ch <= 'z'; ch++)
		mark(bitmap, ch);
	for (ch = 'A'; ch <= 'Z'; ch++)
		mark(bitmap, ch);

	w = 0;
	while (w < NWORDS) {
		printf("%u,", bitmap[w]);
		w++;
	}
	return 0;
}
reply other threads:[~2024-08-31 8:53 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240831105243.2a396d84@eclair \
--to=opensource@zhasha.com \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).