/*
** Copyright 2000-2018 Double Precision, Inc.
** See COPYING for distribution information.
**
*/

#include	"unicode_config.h"
#include	"courier-unicode.h"
/*
** NOTE(review): the names of the system headers were missing from the text
** under review (only bare #include directives remained). The names below were
** reconstructed from the identifiers this file uses (strcmp, malloc, uint16_t,
** errno, setlocale, iconv, locale_charset, nl_langinfo) -- confirm against
** version control.
*/
#include	<string.h>
#include	<stdlib.h>
#include	<stdint.h>
#include	<errno.h>
#include	<locale.h>
#include	<iconv.h>
#if	USE_LIBCHARSET
#if	HAVE_LOCALCHARSET_H
#include	<localcharset.h>
#elif	HAVE_LIBCHARSET_H
#include	<libcharset.h>
#endif	/* HAVE_LOCALCHARSET_H */
#else
#include	<langinfo.h>
#endif	/* USE_LIBCHARSET */

/* Cached result of unicode_default_chset(); empty until first use */
static char default_chset_buf[32];

static const char *fix_charset(const char *);

/*
** Return the character set name of the current locale, as reported by
** locale_charset() or nl_langinfo(CODESET), after fix_charset() cleanup.
*/
const char *unicode_locale_chset()
{
	const char *c;

#if	USE_LIBCHARSET
	c=locale_charset();
#else
	c=nl_langinfo(CODESET);
#endif
	return fix_charset(c);
}

#if	HAVE_LANGINFO_L
/* Same as unicode_locale_chset(), but for an explicit locale object. */
const char *unicode_locale_chset_l(locale_t l)
{
	return fix_charset(nl_langinfo_l(CODESET, l));
}
#endif

/*
** Normalize a character set name: a null pointer and the "ANSI_X3.4" /
** "ANSI_X3.4-*" names both become "US-ASCII". Anything else is returned
** unchanged.
*/
static const char *fix_charset(const char *c)
{
	if (!c)
		return "US-ASCII";

	/* Map GNU libc iconv oddity to us-ascii */
	if (strcmp(c, "ANSI_X3.4") == 0 ||
	    strncmp(c, "ANSI_X3.4-", 10) == 0)
		return "US-ASCII";

	return c;
}

/*
** Populate default_chset_buf from MM_CHARSET, then CHARSET, then the
** environment's locale.
**
** To consult the environment's locale, the process locale is temporarily
** switched to "". The locale that was in effect beforehand is queried with
** setlocale(LC_ALL, NULL) *before* the switch and restored afterwards.
** (Saving the return value of setlocale(LC_ALL, "") would save the new
** locale, and the restore would do nothing.)
*/
static void init_default_chset()
{
	const char *chset;
	char *saved_locale=NULL;
	char buf[sizeof(default_chset_buf)];

	chset=getenv("MM_CHARSET");

	if (chset == NULL)
		chset=getenv("CHARSET");

	if (chset == NULL)
	{
		const char *current_locale=setlocale(LC_ALL, NULL);

		/*
		** The string returned by setlocale() may be overwritten by
		** the next call, so keep a private copy.
		*/
		saved_locale=current_locale ? strdup(current_locale):NULL;

		setlocale(LC_ALL, "");
		chset=unicode_locale_chset();
	}

	/* Truncating, always-terminated copy, taken before the locale is restored */
	memset(buf, 0, sizeof(buf));
	strncat(buf, chset, sizeof(buf)-1);
	memcpy(default_chset_buf, buf, sizeof(buf));

	if (saved_locale)
	{
		setlocale(LC_ALL, saved_locale);
		free(saved_locale);
	}
}

/*
** Return the default character set name. The name is computed by
** init_default_chset() whenever the cached buffer is empty.
*/
const char *unicode_default_chset()
{
	if (default_chset_buf[0] == 0)
		init_default_chset();

	return default_chset_buf;
}

/*****************************************************************************/

/* iconv name for UCS-4 in this platform's byte order */
const char unicode_u_ucs4_native[]=
#if WORDS_BIGENDIAN
	"UCS-4BE"
#else
	"UCS-4LE"
#endif
	;

/* iconv name for UCS-2 in this platform's byte order */
const char unicode_u_ucs2_native[]=
#if WORDS_BIGENDIAN
	"UCS-2BE"
#else
	"UCS-2LE"
#endif
	;

/* A stack of conversion modules */

struct unicode_convert_hdr {

	/* Feed cnt bytes of text into this module */
	int (*convert_handler)(void *ptr,
			       const char *text, size_t cnt);

	/* Flush and destroy this module */
	int (*deinit_handler)(void *ptr, int *errptr);

	/* Argument passed to the handlers */
	void *ptr;

	/* Next module in the stack, if any */
	struct unicode_convert_hdr *next;
};

/*
** Decoding table for modified UTF7-encoding as used in imap
**
** Indexed by octet; -1 marks octets that are not part of the alphabet. The
** element type is explicitly signed: with a plain char the -1 entries read
** back as 255 on platforms where char is unsigned, and a "< 0" validity
** check never triggers.
*/

static const signed char mbase64_lookup[]={
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,63,-1,-1,-1,
	52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
	-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
	-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
	41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};

/* Encoding alphabet: the inverse of mbase64_lookup */
static const char mbase64[]=
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

/*
** Conversion wrapper for converting to modified-utf7 IMAP encoding.
** ** This is done by converting to UCS2, then stacking on a module that ** takes that and converts UCS2 to modified-UTF7. ** ** init_nottoimaputf7() returns an opaque stack for converting to ucs2. */ static unicode_convert_handle_t init_nottoimaputf7(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg); /* ** The to modified UTF7 module */ struct unicode_convert_toimaputf7 { struct unicode_convert_hdr hdr; /* Accumulated output buffer */ char utf7encodebuf[1024]; size_t utf7encodebuf_cnt; /* Accumulated bits for base64 encoding */ uint32_t utf7bits; /* How many bits in utf7bits */ uint16_t utf7bitcount; /* Flag: in base64mode */ uint16_t utfmode; int errflag; /* Any extra characters that should be munged */ char smapmunge[16]; /* Remembered output function */ int (*output_func)(const char *, size_t, void *); /* Remembered arg to the output function */ void *convert_arg; }; /* Macro - flush the output buffer */ #define toimaputf7_encode_flush(p) do { \ int rc; \ \ rc=(*(p)->output_func)((p)->utf7encodebuf, \ (p)->utf7encodebuf_cnt, \ (p)->convert_arg); \ if (rc) \ return ((p)->errflag=(rc)); \ \ (p)->utf7encodebuf_cnt=0; \ } while (0) static int toimaputf7_encode_flushfinal(struct unicode_convert_toimaputf7 *p) { if (p->utf7encodebuf_cnt > 0) toimaputf7_encode_flush(p); return 0; } /* Macro - add one char to the output buffer */ #define toimaputf7_encode_add(p,c) do { \ if ((p)->utf7encodebuf_cnt >= sizeof((p)->utf7encodebuf)) \ toimaputf7_encode_flush((p)); \ \ (p)->utf7encodebuf[(p)->utf7encodebuf_cnt++]=(c); \ } while (0); static int deinit_toimaputf7(void *ptr, int *errptr); static int do_convert_toutf7(const char *text, size_t cnt, void *arg); static int convert_utf7_handler(void *ptr, const char *text, size_t cnt); /* ** Conversion wrapper for converting to modified-utf8 SMAP encoding. 
** ** This is done by converting to UTF-8, then stacking on a module that ** takes that and converts UTF-8 to modified-UTF8. ** ** init_nottosmaputf8() returns an opaque stack for converting to modified ** UTF-8. */ static unicode_convert_handle_t init_nottosmaputf8(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg); /* ** The to modified UTF8 module */ struct unicode_convert_tosmaputf8 { struct unicode_convert_hdr hdr; int errflag; /* Remembered output function */ int (*output_func)(const char *, size_t, void *); /* Remembered arg to the output function */ void *convert_arg; }; /* ** Create a conversion module stack */ unicode_convert_handle_t unicode_convert_init(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) { struct unicode_convert_toimaputf7 *toutf7; unicode_convert_handle_t h; const char *smapmunge; size_t l=strlen(unicode_x_imap_modutf7); if (strncmp(dst_chset, unicode_x_imap_modutf7, l) == 0 && (dst_chset[l] == 0 || dst_chset[l] == ' ')) { smapmunge=dst_chset + l; if (*smapmunge) ++smapmunge; } else return init_nottoimaputf7(src_chset, dst_chset, output_func, convert_arg); toutf7=malloc(sizeof(struct unicode_convert_toimaputf7)); if (!toutf7) return NULL; memset(toutf7, 0, sizeof(*toutf7)); h=unicode_convert_init(src_chset, unicode_u_ucs2_native, do_convert_toutf7, toutf7); if (!h) { free(toutf7); return (NULL); } toutf7->output_func=output_func; toutf7->convert_arg=convert_arg; strncat(toutf7->smapmunge, smapmunge, sizeof(toutf7->smapmunge)-1); toutf7->hdr.convert_handler=convert_utf7_handler; toutf7->hdr.deinit_handler=deinit_toimaputf7; toutf7->hdr.ptr=toutf7; toutf7->hdr.next=h; return &toutf7->hdr; } /* Passthrough to the wrapped stack */ static int convert_utf7_handler(void *ptr, const char *text, size_t cnt) { struct unicode_convert_toimaputf7 *toutf7= (struct unicode_convert_toimaputf7 *)ptr; return 
(*toutf7->hdr.next->convert_handler)(toutf7->hdr.next->ptr, text, cnt); } static int utf7off(struct unicode_convert_toimaputf7 *toutf7) { if (!toutf7->utfmode) return 0; toutf7->utfmode=0; if (toutf7->utf7bitcount > 0) toimaputf7_encode_add(toutf7, mbase64[(toutf7->utf7bits << (6-toutf7->utf7bitcount)) & 63]); toimaputf7_encode_add(toutf7, '-'); return 0; } static int do_convert_toutf7(const char *text, size_t cnt, void *arg) { struct unicode_convert_toimaputf7 *toutf7= (struct unicode_convert_toimaputf7 *)arg; /* We better be getting UCS-2 here! */ const uint16_t *utext=(const uint16_t *)text; cnt /= 2; while (cnt) { if (toutf7->errflag) return toutf7->errflag; if (*utext >= 0x20 && *utext <= 0x7F && strchr( toutf7->smapmunge, (char)*utext) == NULL) /* && (!toutf7->smapmunge || (*utext != '.' && *utext != '/' && *utext != '~' && *utext != ':'))) */ { if (utf7off(toutf7)) return toutf7->errflag; toimaputf7_encode_add(toutf7, *utext); if (*utext == '&') toimaputf7_encode_add(toutf7, '-'); ++utext; --cnt; continue; } if (!toutf7->utfmode) { toutf7->utfmode=1; toutf7->utf7bitcount=0; toimaputf7_encode_add(toutf7, '&'); continue; } toutf7->utf7bits = (toutf7->utf7bits << 16) | (((uint32_t)*utext) & 0xFFFF); toutf7->utf7bitcount += 16; ++utext; --cnt; /* If there's at least 6 bits, output base64-encoded char */ while (toutf7->utf7bitcount >= 6) { uint32_t v; int n; if (toutf7->errflag) return toutf7->errflag; v=toutf7->utf7bits; n=toutf7->utf7bitcount-6; toutf7->utf7bitcount -= 6; if (n > 0) v >>= n; toimaputf7_encode_add(toutf7, mbase64[v & 63]); } } return 0; } static int deinit_toimaputf7(void *ptr, int *errptr) { int rc; struct unicode_convert_toimaputf7 *toutf7= (struct unicode_convert_toimaputf7 *)ptr; /* Flush out the downstream stack */ rc=(*toutf7->hdr.next->deinit_handler)(toutf7->hdr.next->ptr, errptr); /* Make sure we're out of modified base64 */ if (rc == 0) rc=utf7off(toutf7); if (rc == 0 && toutf7->utf7encodebuf_cnt > 0) 
rc=toimaputf7_encode_flushfinal(toutf7); free(toutf7); return rc; } /* ** Convert to unicode_x_smap_modutf8. */ static int deinit_tosmaputf8(void *ptr, int *errptr); static int do_convert_tosmaputf8(const char *text, size_t cnt, void *arg); static int convert_utf8_handler(void *ptr, const char *text, size_t cnt); static unicode_convert_handle_t init_nottoimaputf7(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) { struct unicode_convert_tosmaputf8 *toutf8; unicode_convert_handle_t h; if (strcmp(dst_chset, unicode_x_smap_modutf8)) return init_nottosmaputf8(src_chset, dst_chset, output_func, convert_arg); toutf8=malloc(sizeof(struct unicode_convert_tosmaputf8)); if (!toutf8) return NULL; memset(toutf8, 0, sizeof(*toutf8)); h=unicode_convert_init(src_chset, "utf-8", do_convert_tosmaputf8, toutf8); if (!h) { free(toutf8); return (NULL); } toutf8->output_func=output_func; toutf8->convert_arg=convert_arg; toutf8->hdr.convert_handler=convert_utf8_handler; toutf8->hdr.deinit_handler=deinit_tosmaputf8; toutf8->hdr.ptr=toutf8; toutf8->hdr.next=h; return &toutf8->hdr; } static int deinit_tosmaputf8(void *ptr, int *errptr) { int rc; struct unicode_convert_tosmaputf8 *toutf8= (struct unicode_convert_tosmaputf8 *)ptr; /* Flush out the downstream stack */ rc=(*toutf8->hdr.next->deinit_handler)(toutf8->hdr.next->ptr, errptr); if (toutf8->errflag && rc == 0) rc=toutf8->errflag; free(toutf8); return rc; } static int do_convert_tosmaputf8(const char *text, size_t cnt, void *arg) { struct unicode_convert_tosmaputf8 *toutf8= (struct unicode_convert_tosmaputf8 *)arg; int rc; size_t i; char octal[4]; while (cnt) { if (toutf8->errflag) return toutf8->errflag; for (i=0; ioutput_func)(text, i, toutf8->convert_arg); if (rc) { toutf8->errflag=rc; return rc; } text += i; cnt -= i; } if (cnt) { char c= *text; octal[0]='\\'; octal[3]= (c & 7)+'0'; c /= 8; octal[2]= (c & 7)+'0'; c /= 8; octal[1]= (c & 7)+'0'; rc= 
(*toutf8->output_func)(octal, 4, toutf8->convert_arg); if (rc) { toutf8->errflag=rc; return rc; } ++text; --cnt; } } return 0; } static int convert_utf8_handler(void *ptr, const char *text, size_t cnt) { struct unicode_convert_tosmaputf8 *toutf8= (struct unicode_convert_tosmaputf8 *)ptr; return (*toutf8->hdr.next->convert_handler)(toutf8->hdr.next->ptr, text, cnt); } /************/ /* ** Convert from modified-utf7 IMAP encoding. ** ** This module converts it to UCS-2, then this is attached to a stack that ** converts UCS-2 to the requested charset. */ static unicode_convert_handle_t init_notfromimaputf7(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg); struct unicode_convert_fromimaputf7 { struct unicode_convert_hdr hdr; /* Accumulated UCS-2 stream */ uint16_t convbuf[512]; size_t convbuf_cnt; /* Accumulated base64 bits */ uint32_t modbits; /* How many bits extracted from a base64 stream */ short modcnt; /* Flag: seen the & */ char seenamp; /* Flag: seen the &, and the next char wasn't - */ char inmod; int errflag; int converr; }; /* Flush the accumulated UCS-2 stream */ #define convert_fromutf7_flush(p) do { \ int rc=(*(p)->hdr.next->convert_handler) \ ((p)->hdr.next->ptr, \ (const char *)(p)->convbuf, \ (p)->convbuf_cnt * \ sizeof((p)->convbuf[0])); \ \ if (rc) \ (p)->errflag=rc; \ \ (p)->convbuf_cnt=0; \ } while (0) /* Accumulated a UCS-2 char */ #define convert_fromutf7_add(p,c) do { \ if ((p)->convbuf_cnt >= \ sizeof((p)->convbuf)/sizeof((p)->convbuf[0])) \ convert_fromutf7_flush((p)); \ (p)->convbuf[(p)->convbuf_cnt++]=(c); \ } while (0) static int convert_fromutf7(void *ptr, const char *text, size_t cnt); static int deinit_fromutf7(void *ptr, int *errptr); static unicode_convert_handle_t init_nottosmaputf8(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) { struct unicode_convert_fromimaputf7 *fromutf7; unicode_convert_handle_t 
h; size_t l=strlen(unicode_x_imap_modutf7); if (strncmp(src_chset, unicode_x_imap_modutf7, l) == 0 && (src_chset[l] == 0 || src_chset[l] == ' ')) ; else return init_notfromimaputf7(src_chset, dst_chset, output_func, convert_arg); fromutf7=(struct unicode_convert_fromimaputf7 *) malloc(sizeof(struct unicode_convert_fromimaputf7)); if (!fromutf7) return NULL; memset(fromutf7, 0, sizeof(*fromutf7)); /* Create a stack for converting UCS-2 to the dest charset */ h=unicode_convert_init(unicode_u_ucs2_native, dst_chset, output_func, convert_arg); if (!h) { free(fromutf7); return (NULL); } fromutf7->hdr.next=h; fromutf7->hdr.convert_handler=convert_fromutf7; fromutf7->hdr.deinit_handler=deinit_fromutf7; fromutf7->hdr.ptr=fromutf7; return &fromutf7->hdr; } static int convert_fromutf7(void *ptr, const char *text, size_t cnt) { struct unicode_convert_fromimaputf7 *fromutf7= (struct unicode_convert_fromimaputf7 *)ptr; int bits; while (cnt) { if (fromutf7->errflag) return fromutf7->errflag; if (!fromutf7->seenamp && *text == '&') { fromutf7->seenamp=1; fromutf7->inmod=0; fromutf7->modcnt=0; ++text; --cnt; continue; } if (fromutf7->seenamp) { if (*text == '-') { convert_fromutf7_add(fromutf7, '&'); ++text; --cnt; fromutf7->seenamp=0; continue; } fromutf7->seenamp=0; fromutf7->inmod=1; } if (!fromutf7->inmod) { /* Not in the base64 encoded stream */ convert_fromutf7_add(fromutf7, ((uint16_t)*text) & 0xFFFF); ++text; --cnt; continue; } if (*text == '-') { /* End of the base64 encoded stream */ fromutf7->inmod=0; ++text; --cnt; continue; } /* Got 6 more bits */ bits=mbase64_lookup[(unsigned char)*text]; ++text; --cnt; if (bits < 0) { errno=EILSEQ; return fromutf7->errflag=-1; } fromutf7->modbits = (fromutf7->modbits << 6) | bits; fromutf7->modcnt += 6; if (fromutf7->modcnt >= 16) { /* Got a UCS-2 char */ int shiftcnt=fromutf7->modcnt - 16; uint32_t v=fromutf7->modbits; if (shiftcnt) v >>= shiftcnt; fromutf7->modcnt -= 16; convert_fromutf7_add(fromutf7, v); } } return 0; } static 
int deinit_fromutf7(void *ptr, int *errptr) { struct unicode_convert_fromimaputf7 *fromutf7= (struct unicode_convert_fromimaputf7 *)ptr; int rc; if (fromutf7->seenamp || fromutf7->inmod) { if (fromutf7->errflag == 0) { fromutf7->errflag= -1; errno=EILSEQ; } } if (fromutf7->convbuf_cnt) convert_fromutf7_flush(fromutf7); rc=fromutf7->hdr.next->deinit_handler(fromutf7->hdr.next->ptr, errptr); if (fromutf7->errflag && rc == 0) rc=fromutf7->errflag; if (errptr && fromutf7->converr) *errptr=1; free(fromutf7); return rc; } /************/ /* ** Convert from modified-utf8 SMAP encoding. ** ** This module converts it to UTF-8, then this is attached to a stack that ** converts UTF-8 to the requested charset. */ static unicode_convert_handle_t init_notfromsmaputf8(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg); struct unicode_convert_fromsmaputf8 { struct unicode_convert_hdr hdr; /* Convert a backslash escape */ int in_escape; /* The escaped character */ unsigned char escape_char; int errflag; int converr; }; static int convert_fromutf8(void *ptr, const char *text, size_t cnt); static int deinit_fromutf8(void *ptr, int *errptr); static unicode_convert_handle_t init_notfromimaputf7(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) { struct unicode_convert_fromsmaputf8 *fromutf8; unicode_convert_handle_t h; if (strcmp(src_chset, unicode_x_smap_modutf8)) return init_notfromsmaputf8(src_chset, dst_chset, output_func, convert_arg); fromutf8=(struct unicode_convert_fromsmaputf8 *) malloc(sizeof(struct unicode_convert_fromsmaputf8)); if (!fromutf8) return NULL; memset(fromutf8, 0, sizeof(*fromutf8)); /* Create a stack for converting UTF-8 to the dest charset */ h=unicode_convert_init("utf-8", dst_chset, output_func, convert_arg); if (!h) { free(fromutf8); return (NULL); } fromutf8->hdr.next=h; fromutf8->hdr.convert_handler=convert_fromutf8; 
fromutf8->hdr.deinit_handler=deinit_fromutf8; fromutf8->hdr.ptr=fromutf8; return &fromutf8->hdr; } static int convert_fromutf8(void *ptr, const char *text, size_t cnt) { struct unicode_convert_fromsmaputf8 *fromutf8= (struct unicode_convert_fromsmaputf8 *)ptr; size_t i; while (cnt) { if (fromutf8->errflag) return fromutf8->errflag; if (fromutf8->in_escape) { if (*text < '0' || *text > '7') { errno=EILSEQ; return fromutf8->errflag=-1; } fromutf8->escape_char <<= 3; fromutf8->escape_char |= *text - '0'; if (--fromutf8->in_escape == 0) { fromutf8->errflag=(*fromutf8->hdr.next ->convert_handler) (fromutf8->hdr.next->ptr, (const char *)&fromutf8->escape_char, 1); } ++text; --cnt; continue; } for (i=0; ierrflag=(*fromutf8->hdr.next ->convert_handler) (fromutf8->hdr.next->ptr, text, i); text += i; cnt -= i; } if (cnt) { fromutf8->escape_char=0; fromutf8->in_escape=3; ++text; --cnt; } } return 0; } static int deinit_fromutf8(void *ptr, int *errptr) { struct unicode_convert_fromsmaputf8 *fromutf8= (struct unicode_convert_fromsmaputf8 *)ptr; int rc; if (fromutf8->in_escape) { fromutf8->errflag= -1; errno=EILSEQ; } rc=fromutf8->hdr.next->deinit_handler(fromutf8->hdr.next->ptr, errptr); if (fromutf8->errflag && rc == 0) rc=fromutf8->errflag; if (errptr && fromutf8->converr) *errptr=1; free(fromutf8); return rc; } /************/ /* A real conversion module, via iconv */ struct unicode_convert_iconv { struct unicode_convert_hdr hdr; iconv_t h; int errflag; /* Accumulated errors */ int (*output_func)(const char *, size_t, void *); void *convert_arg; char buffer[1024]; /* Input buffer */ size_t bufcnt; /* Accumulated input in buffer */ char skipcnt; /* Skip this many bytes upon encountering EILSEQ */ char skipleft; /* How many bytes are currently left to skip */ char converr; /* Flag - an EILSEQ was encountered */ } ; static int init_iconv(struct unicode_convert_iconv *h, const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void 
*convert_arg); static unicode_convert_handle_t init_notfromsmaputf8(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) { struct unicode_convert_iconv *h= malloc(sizeof(struct unicode_convert_iconv)); if (!h) return NULL; memset(h, 0, sizeof(*h)); if (init_iconv(h, src_chset, dst_chset, output_func, convert_arg)) { free(h); return NULL; } return &h->hdr; } /* Run the stack */ int unicode_convert(unicode_convert_handle_t h, const char *text, size_t cnt) { return (*h->convert_handler)(h->ptr, text, cnt); } /* Destroy the stack */ int unicode_convert_deinit(unicode_convert_handle_t h, int *errptr) { return (*h->deinit_handler)(h, errptr); } static int deinit_iconv(void *ptr, int *errptr); static int convert_iconv(void *ptr, const char *text, size_t cnt); /* Initialize a single conversion module, in the stack */ static int init_iconv(struct unicode_convert_iconv *h, const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) { if ((h->h=iconv_open(dst_chset, src_chset)) == (iconv_t)-1) return -1; h->hdr.convert_handler=convert_iconv; h->hdr.deinit_handler=deinit_iconv; h->hdr.ptr=h; h->output_func=output_func; h->convert_arg=convert_arg; /* Heuristically determine how many octets to skip upon an EILSEQ */ h->skipcnt=1; switch (src_chset[0]) { case 'u': case 'U': switch (src_chset[1]) { case 'c': case 'C': switch (src_chset[2]) { case 's': case 'S': if (src_chset[3] == '-') switch (src_chset[4]) { case '4': /* UCS-4 */ h->skipcnt=4; break; case '2': /* UCS-2 */ h->skipcnt=2; break; } } break; case 't': case 'T': switch (src_chset[2]) { case 'f': case 'F': if (src_chset[3] == '-') switch (src_chset[4]) { case '3': /* UTF-32 */ h->skipcnt=4; break; case '1': /* UTF-16 */ h->skipcnt=2; break; } } } } return 0; } static void convert_flush(struct unicode_convert_iconv *); static void convert_flush_iconv(struct unicode_convert_iconv *, const char **, size_t 
*); /* ** iconv conversion module. Accumulate input in an input buffer. When the ** input buffer is full, invoke convert_flush(). */ static int convert_iconv(void *ptr, const char *text, size_t cnt) { struct unicode_convert_iconv *h=(struct unicode_convert_iconv *)ptr; while (cnt && h->errflag == 0) { if (h->bufcnt >= sizeof(h->buffer)-1) { convert_flush(h); if (h->errflag) break; } h->buffer[h->bufcnt++]= *text++; --cnt; } return h->errflag; } /* ** Finish an iconv conversion module. Invoke convert_flush() to flush any ** buffered input. Invoke convert_flush_iconv() to return state to the initial ** conversion state. */ static int deinit_iconv(void *ptr, int *errptr) { int rc; int converr; struct unicode_convert_iconv *h=(struct unicode_convert_iconv *)ptr; unicode_convert_handle_t next; if (h->errflag == 0) convert_flush(h); if (h->bufcnt && h->errflag == 0) h->converr=1; if (h->errflag == 0) convert_flush_iconv(h, NULL, NULL); rc=h->errflag; converr=h->converr != 0; iconv_close(h->h); next=h->hdr.next; free(h); if (errptr) *errptr=converr; /* If there's another module in the stack, clean that up */ if (next) { int converrnext; int rcnext=unicode_convert_deinit(next, &converrnext); if (converrnext && errptr && *errptr == 0) *errptr=converr; if (rcnext && rc == 0) rc=rcnext; } return rc; } /* ** Invoke convert_flush_iconv() to flush the input buffer. If there's ** unconverted text remaining, reposition it at the beginning of the input ** buffer. */ static void convert_flush(struct unicode_convert_iconv *h) { const char *p; size_t n; if (h->bufcnt == 0 || h->errflag) return; p=h->buffer; n=h->bufcnt; convert_flush_iconv(h, &p, &n); if (h->errflag) return; if (h->bufcnt == n) n=0; /* Unexpected error, dunno what to do, punt */ h->bufcnt=0; while (n) { h->buffer[h->bufcnt]= *p; ++h->bufcnt; ++p; --n; } } /* ** Convert text via iconv. 
*/

/*
** h           - the iconv module.
** inbuf       - address of the pointer to the text to convert; advanced past
**               whatever gets consumed or skipped.
** inbytesleft - address of the number of octets at *inbuf; reduced
**               accordingly.
**
** Both inbuf and inbytesleft are NULL when the caller only wants iconv reset
** to its initial state. Converted text is handed to h->output_func; a
** non-zero result from it is stored in h->errflag. Input that cannot be
** converted sets h->converr.
*/
static void convert_flush_iconv(struct unicode_convert_iconv *h,
				const char **inbuf,
				size_t *inbytesleft)
{
	int save_errno;

	while (1)
	{
		char outbuf[1024];
		char *outp;
		size_t outleft;
		size_t n;
		size_t origin=0;

		if (inbytesleft)
		{
			/* Nothing (more) to convert */
			if ((origin=*inbytesleft) == 0)
				return;

			if (inbuf && h->skipleft && origin)
			{
				/* Skipping after an EILSEQ */

				--h->skipleft;
				--*inbytesleft;
				++*inbuf;
				continue;
			}
		}

		if (h->errflag)
		{
			/* Quietly eat everything after a previous error */

			if (inbytesleft)
				*inbytesleft=0;

			return;
		}

		outp=outbuf;
		outleft=sizeof(outbuf);

		n=iconv(h->h, (char **)inbuf, inbytesleft, &outp, &outleft);

		/* The output function, below, may change errno */
		save_errno=errno;

		/* Anything produced by iconv() gets pushed down the stack */

		if (outp > outbuf)
		{
			int rc=(*h->output_func)(outbuf, outp-outbuf,
						 h->convert_arg);
			if (rc)
			{
				h->errflag=rc;
				return;
			}
		}

		if (n != (size_t)-1)
		{
			/* iconv(3) reason #2 */

			break;
		}

		if (inbytesleft == 0)
		{
			/*
			** An error when generating the shift sequence to
			** return to the initial state. We don't know what to
			** do, now.
			*/

			errno=EINVAL;
			h->errflag= -1;
			return;
		}

		/*
		** convert_flush() gets invoked when the 1024 char input buffer
		** fills or to convert input that has been buffered when
		** the module gets deinitialized.
		**
		** A return code of EINVAL from iconv() is iconv() encountering
		** an incomplete multibyte sequence.
		**
		** If iconv() failed without consuming any input:
		**
		** - iconv(3) reason #1, EILSEQ, invalid multibyte sequence
		** that starts at the beginning of the string we wish to
		** convert. Discard one character, and try again.
		**
		** - iconv(3) reason #3, EINVAL, incomplete multibyte sequence.
		** If it's possible to have an incomplete 1024 character long
		** multibyte sequence, we're in trouble. Or we've encountered
		** an EINVAL when flushing out the remaining buffered input,
		** at deinitialization time. In either case, it's ok to
		** discard one character at a time, until we either reach the
		** end, or get some other result.
		**
		** - iconv(3) reason #4, E2BIG. If the 1024 character output
		** buffer, above, is insufficient to produce the output from a
		** single converted character, we're in trouble.
		*/

		if (*inbytesleft == origin)
		{
			/* Arm the skip handled at the top of the loop */
			h->skipleft=h->skipcnt;
			h->converr=1;
		}

		/*
		** Stopped at an incomplete multibyte sequence, try again on
		** the next round.
		*/
		else if (save_errno == EINVAL)
			break;

		if (save_errno == EILSEQ)
			h->converr=1; /* Another possibility this can happen */

		/*
		** If we get here because of iconv(3) reason #4, filled out
		** the output buffer, we should continue with the conversion.
		** Otherwise, upon encountering any other error condition,
		** reset the conversion state.
		*/
		if (save_errno != E2BIG)
			iconv(h->h, NULL, NULL, NULL, NULL);
	}
}

/*****************************************************************************/

/*
** A wrapper for unicode_convert() that collects the converted character
** text into a buffer. This is done by passing an output function to
** unicode_convert() that saves converted text in a linked-list
** of buffers.
**
** Then, in the deinitialization function, the buffers get concatenated into
** the final character buffer.
*/ struct unicode_convert_cbuf { struct unicode_convert_cbuf *next; char *fragment; size_t fragment_size; }; struct unicode_convert_tocbuf { struct unicode_convert_hdr hdr; char **cbufptr_ret; size_t *cbufsize_ret; int errflag; size_t tot_size; int nullterminate; struct unicode_convert_cbuf *first, **last; }; static int save_tocbuf(const char *, size_t, void *); static int convert_tocbuf(void *ptr, const char *text, size_t cnt); static int deinit_tocbuf(void *ptr, int *errptr); unicode_convert_handle_t unicode_convert_tocbuf_init(const char *src_chset, const char *dst_chset, char **cbufptr_ret, size_t *cbufsize_ret, int nullterminate ) { struct unicode_convert_tocbuf *p= malloc(sizeof(struct unicode_convert_tocbuf)); unicode_convert_handle_t h; if (!p) return NULL; memset(p, 0, sizeof(*p)); h=unicode_convert_init(src_chset, dst_chset, save_tocbuf, p); if (!h) { free(p); return NULL; } p->cbufptr_ret=cbufptr_ret; p->cbufsize_ret=cbufsize_ret; p->last= &p->first; p->nullterminate=nullterminate; p->hdr.next=h; p->hdr.convert_handler=convert_tocbuf; p->hdr.deinit_handler=deinit_tocbuf; p->hdr.ptr=p; return &p->hdr; } /* Capture the output of the conversion stack */ static int save_tocbuf(const char *text, size_t cnt, void *ptr) { struct unicode_convert_tocbuf *p= (struct unicode_convert_tocbuf *)ptr; struct unicode_convert_cbuf *fragment= malloc(sizeof(struct unicode_convert_cbuf)+cnt); size_t tot_size; if (!fragment) { p->errflag=1; return 1; } fragment->next=NULL; fragment->fragment=(char *)(fragment+1); if ((fragment->fragment_size=cnt) > 0) memcpy(fragment->fragment, text, cnt); *(p->last)=fragment; p->last=&fragment->next; tot_size=p->tot_size + cnt; /* Keep track of the total size saved */ if (tot_size < p->tot_size) /* Overflow? 
*/ { errno=E2BIG; return 1; } p->tot_size=tot_size; return 0; } /* Punt converted text down the stack */ static int convert_tocbuf(void *ptr, const char *text, size_t cnt) { struct unicode_convert_tocbuf *p= (struct unicode_convert_tocbuf *)ptr; return unicode_convert(p->hdr.next, text, cnt); } /* ** Destroy the conversion stack. Destroy the downstream, then assemble the ** final array. */ static int deinit_tocbuf(void *ptr, int *errptr) { struct unicode_convert_tocbuf *p= (struct unicode_convert_tocbuf *)ptr; int rc=unicode_convert_deinit(p->hdr.next, errptr); struct unicode_convert_cbuf *bufptr; if (rc == 0 && p->nullterminate) { char zero=0; rc=save_tocbuf( &zero, sizeof(zero), p->hdr.ptr); } if (rc == 0) { if (((*p->cbufptr_ret)=malloc(p->tot_size ? p->tot_size:1)) != NULL) { size_t i=0; for (bufptr=p->first; bufptr; bufptr=bufptr->next) { if (bufptr->fragment_size) memcpy(&(*p->cbufptr_ret)[i], bufptr->fragment, bufptr->fragment_size); i += bufptr->fragment_size; } (*p->cbufsize_ret)=i; } else { rc= -1; } } for (bufptr=p->first; bufptr; ) { struct unicode_convert_cbuf *b=bufptr; bufptr=bufptr->next; free(b); } free(p); return rc; } unicode_convert_handle_t unicode_convert_tocbuf_toutf8_init(const char *src_chset, char **cbufptr_ret, size_t *cbufsize_ret, int nullterminate ) { return unicode_convert_tocbuf_init(src_chset, "utf-8", cbufptr_ret, cbufsize_ret, nullterminate); } unicode_convert_handle_t unicode_convert_tocbuf_fromutf8_init(const char *dst_chset, char **cbufptr_ret, size_t *cbufsize_ret, int nullterminate ) { return unicode_convert_tocbuf_init("utf-8", dst_chset, cbufptr_ret, cbufsize_ret, nullterminate); } char *unicode_convert_toutf8(const char *text, const char *charset, int *error) { char *cbufptr; size_t cbufsize; unicode_convert_handle_t h= unicode_convert_tocbuf_toutf8_init(charset, &cbufptr, &cbufsize, 1); if (!h) return NULL; unicode_convert(h, text, strlen(text)); if (unicode_convert_deinit(h, error) == 0) return cbufptr; return NULL; } 
char *unicode_convert_fromutf8(const char *text, const char *charset, int *error) { char *cbufptr; size_t cbufsize; unicode_convert_handle_t h= unicode_convert_tocbuf_fromutf8_init(charset, &cbufptr, &cbufsize, 1); if (!h) return NULL; unicode_convert(h, text, strlen(text)); if (unicode_convert_deinit(h, error) == 0) return cbufptr; return NULL; } char *unicode_convert_tobuf(const char *text, const char *charset, const char *dstcharset, int *error) { char *cbufptr; size_t cbufsize; unicode_convert_handle_t h= unicode_convert_tocbuf_init(charset, dstcharset, &cbufptr, &cbufsize, 1); if (!h) return NULL; unicode_convert(h, text, strlen(text)); if (unicode_convert_deinit(h, error) == 0) return cbufptr; return NULL; } /*****************************************************************************/ /* ** Convert text to char32_ts. Same basic approach as ** unicode_convert_tocbuf_init(). The output character set gets specified ** as UCS-4, the final output size is divided by 4, and the output buffer gets ** typed as a char32_t array. 
*/

/* One saved piece of converted text; the characters follow the structure */
struct unicode_convert_buf {
	struct unicode_convert_buf *next;
	char32_t *fragment;
	size_t fragment_size;		/* Characters stored */
	size_t max_fragment_size;	/* Characters there's room for */
};

struct unicode_convert_tou {
	struct unicode_convert_hdr hdr;

	/* Where the final array and its size get stored */
	char32_t **ucptr_ret;
	size_t *ucsize_ret;
	int errflag;
	size_t tot_size;	/* Total size saved, in octets */
	int nullterminate;

	/*
	** List of saved fragments: tail is the newest one, last points at
	** the final next pointer.
	*/
	struct unicode_convert_buf *first, *tail, **last;
};

static int save_unicode(const char *, size_t, void *);
static int convert_tounicode(void *ptr,
			     const char *text, size_t cnt);
static int deinit_tounicode(void *ptr, int *errptr);

/*
** Create a stack that converts src_chset to native-endian UCS-4 and collects
** the result. When the stack is deinitialized without an error, *ucptr_ret
** receives a malloc()ed array and *ucsize_ret the number of characters in
** it. A non-zero nullterminate appends a 0 character, which is included in
** the count.
**
** Returns NULL on failure.
*/
unicode_convert_handle_t
unicode_convert_tou_init(const char *src_chset,
			 char32_t **ucptr_ret,
			 size_t *ucsize_ret,
			 int nullterminate
			 )
{
	struct unicode_convert_tou *p=
		malloc(sizeof(struct unicode_convert_tou));
	unicode_convert_handle_t h;

	if (!p)
		return NULL;

	memset(p, 0, sizeof(*p));

	h=unicode_convert_init(src_chset, unicode_u_ucs4_native,
			       save_unicode, p);

	if (!h)
	{
		free(p);
		return NULL;
	}

	p->ucptr_ret=ucptr_ret;
	p->ucsize_ret=ucsize_ret;
	p->last= &p->first;
	p->nullterminate=nullterminate;
	p->hdr.next=h;
	p->hdr.convert_handler=convert_tounicode;
	p->hdr.deinit_handler=deinit_tounicode;
	p->hdr.ptr=p;
	return &p->hdr;
}

/*
** Create a stack that converts native-endian UCS-4 to dst_chset and collects
** the result; see unicode_convert_tocbuf_init().
*/
unicode_convert_handle_t
unicode_convert_fromu_init(const char *dst_chset,
			   char **cbufptr_ret,
			   size_t *csize_ret,
			   int nullterminate
			   )
{
	return unicode_convert_tocbuf_init(unicode_u_ucs4_native,
					   dst_chset,
					   cbufptr_ret,
					   csize_ret,
					   nullterminate);
}

/* unicode_convert() for char32_t input; cnt is a number of characters */
int unicode_convert_uc(unicode_convert_handle_t handle,
		       const char32_t *text,
		       size_t cnt)
{
	return unicode_convert(handle, (const char *)text,
			       cnt * sizeof(*text));
}

/*
** Capture the output of the conversion stack
**
** cnt arrives in octets and is turned into a number of characters. Free room
** in the newest fragment is used up first; what does not fit goes into a new
** fragment with room for at least 16 characters.
**
** Returns 0, or 1 on failure.
*/

static int save_unicode(const char *text, size_t cnt, void *ptr)
{
	struct unicode_convert_tou *p=
		(struct unicode_convert_tou *)ptr;
	struct unicode_convert_buf *fragment;
	size_t tot_size;

	cnt /= sizeof(char32_t);

	tot_size=p->tot_size + cnt*sizeof(char32_t);
	/* Keep track of the total size saved */

	if (p->tail)
	{
		size_t n=p->tail->max_fragment_size-p->tail->fragment_size;

		if (n > cnt)
			n=cnt;

		if (n)
		{
			memcpy(p->tail->fragment+p->tail->fragment_size,
			       text, n*sizeof(char32_t));

			cnt -= n;
			text += n*sizeof(char32_t);
			p->tail->fragment_size += n;
		}
	}

	if (cnt > 0)
	{
		size_t cnt_alloc=cnt;

		if (cnt_alloc < 16)
			cnt_alloc=16;

		if ((fragment=malloc(sizeof(struct unicode_convert_buf)
				     +cnt_alloc*sizeof(char32_t)))
		    == NULL)
		{
			p->errflag=1;
			return 1;
		}

		fragment->next=NULL;
		fragment->fragment=(char32_t *)(fragment+1);
		fragment->max_fragment_size=cnt_alloc;
		fragment->fragment_size=cnt;
		memcpy(fragment->fragment, text, cnt*sizeof(char32_t));

		*(p->last)=fragment;
		p->last=&fragment->next;
		p->tail=fragment;
	}

	if (tot_size < p->tot_size) /* Overflow? */
	{
		errno=E2BIG;
		return 1;
	}
	p->tot_size=tot_size;
	return 0;
}

/* Punt converted text down the stack */

static int convert_tounicode(void *ptr,
			     const char *text, size_t cnt)
{
	struct unicode_convert_tou *p=
		(struct unicode_convert_tou *)ptr;

	return unicode_convert(p->hdr.next, text, cnt);
}

/*
** Destroy the conversion stack. Destroy the downstream, then assemble the
** final array.
**
** The array is assembled only when nothing failed; a failed allocation of the
** array gives -1. The saved fragments and the module are always released.
*/

static int deinit_tounicode(void *ptr, int *errptr)
{
	struct unicode_convert_tou *p=
		(struct unicode_convert_tou *)ptr;
	int rc=unicode_convert_deinit(p->hdr.next, errptr);
	struct unicode_convert_buf *bufptr;

	if (rc == 0 && p->nullterminate)
	{
		char32_t zero=0;

		rc=save_unicode( (const char *)&zero, sizeof(zero),
				 p->hdr.ptr);
	}

	if (rc == 0)
	{
		/* tot_size is in octets; never ask malloc() for zero */
		if (((*p->ucptr_ret)=malloc(p->tot_size ? p->tot_size:1)) !=
		    NULL)
		{
			/* i counts characters, not octets */
			size_t i=0;

			for (bufptr=p->first; bufptr; bufptr=bufptr->next)
			{
				if (bufptr->fragment_size)
					memcpy(&(*p->ucptr_ret)[i],
					       bufptr->fragment,
					       bufptr->fragment_size
					       *sizeof(*bufptr->fragment));
				i += bufptr->fragment_size;
			}
			(*p->ucsize_ret)=i;
		}
		else
		{
			rc= -1;
		}
	}

	for (bufptr=p->first; bufptr; )
	{
		struct unicode_convert_buf *b=bufptr;

		bufptr=bufptr->next;

		free(b);
	}
	free(p);

	return rc;
}

/*
** Convert text_l octets of text, in charset, to a malloc()ed char32_t array
** that is not 0-terminated.
**
** Returns 0 with *uc and *ucsize set, or -1. err is passed to
** unicode_convert_deinit().
*/
int unicode_convert_tou_tobuf(const char *text,
			      size_t text_l,
			      const char *charset,
			      char32_t **uc,
			      size_t *ucsize,
			      int *err)
{
	unicode_convert_handle_t h;

	if ((h=unicode_convert_tou_init(charset, uc, ucsize, 0)) == NULL)
		return -1;

	if (unicode_convert(h, text, text_l) < 0)
	{
		/* Release the stack; its own result does not matter now */
		unicode_convert_deinit(h, NULL);
		return -1;
	}

	if (unicode_convert_deinit(h, err))
		return -1;

	return 0;
}

/*
** Convert utext_l char32_t characters to a malloc()ed, 0-terminated buffer
** in charset. A utext_l of (size_t)-1 means that utext is 0-terminated and
** its length gets counted here.
**
** Returns 0 with *c and *csize set, or -1. err is passed to
** unicode_convert_deinit().
*/
int unicode_convert_fromu_tobuf(const char32_t *utext,
				size_t utext_l,
				const char *charset,
				char **c,
				size_t *csize,
				int *err)
{
	unicode_convert_handle_t h;

	if (utext_l == (size_t)-1)
	{
		for (utext_l=0; utext[utext_l]; ++utext_l)
			;
	}

	if ((h=unicode_convert_fromu_init(charset, c, csize, 1)) == NULL)
		return -1;

	if (unicode_convert_uc(h, utext, utext_l) < 0)
	{
		/* Release the stack; its own result does not matter now */
		unicode_convert_deinit(h, NULL);
		return -1;
	}

	if (unicode_convert_deinit(h, err))
		return -1;

	return 0;
}

/*
** NOTE(review): the text under review ends in the middle of this function;
** what is visible is reproduced as is.
*/
char *unicode_convert_tocase(const char *str,
			     const char *charset,
			     char32_t (*first_char_func)(char32_t),
			     char32_t (*char_func)(char32_t))
{
	char32_t *uc;
	size_t ucsize;
	size_t i;
	int err;
	char *c;
	size_t csize;

	if (unicode_convert_tou_tobuf(str, strlen(str),
				      charset, &uc, &ucsize, &err))
		return NULL;

	if (err)
	{
		free(uc);
		return NULL;
	}

	for (i=0; i