From 4fc91797ada09e9f8e3bd0a3cdbe0c78edf2530d Mon Sep 17 00:00:00 2001 From: Sam Varshavchik Date: Wed, 11 Jul 2018 22:16:22 -0400 Subject: courier-unicode: implement unicode_x_smap_modutf8 pseudo-encoding --- unicode/unicode.c | 321 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 318 insertions(+), 3 deletions(-) (limited to 'unicode/unicode.c') diff --git a/unicode/unicode.c b/unicode/unicode.c index 71e6439..2cf5856 100644 --- a/unicode/unicode.c +++ b/unicode/unicode.c @@ -1,5 +1,5 @@ /* -** Copyright 2000-2011 Double Precision, Inc. +** Copyright 2000-2018 Double Precision, Inc. ** See COPYING for distribution information. ** */ @@ -238,6 +238,42 @@ static int deinit_toimaputf7(void *ptr, int *errptr); static int do_convert_toutf7(const char *text, size_t cnt, void *arg); static int convert_utf7_handler(void *ptr, const char *text, size_t cnt); +/* +** Conversion wrapper for converting to modified-utf8 SMAP encoding. +** +** This is done by converting to UTF-8, then stacking on a module that +** takes that and converts UTF-8 to modified-UTF8. +** +** init_nottosmaputf8() returns an opaque stack for converting to modified +** UTF-8. +*/ + +static unicode_convert_handle_t +init_nottosmaputf8(const char *src_chset, + const char *dst_chset, + int (*output_func)(const char *, size_t, void *), + void *convert_arg); + +/* +** The to modified UTF8 module +*/ + +struct unicode_convert_tosmaputf8 { + + struct unicode_convert_hdr hdr; + + int errflag; + + /* Remembered output function */ + + int (*output_func)(const char *, size_t, void *); + + /* Remembered arg to the output function */ + void *convert_arg; +}; + + + /* ** Create a conversion module stack */ @@ -417,6 +453,129 @@ static int deinit_toimaputf7(void *ptr, int *errptr) return rc; } +/* +** Convert to unicode_x_smap_modutf8. +*/ + +static int deinit_tosmaputf8(void *ptr, int *errptr); +static int do_convert_tosmaputf8(const char *text, size_t cnt, void *arg); +static int convert_utf8_handler(void *ptr, const char *text, size_t cnt); + +static unicode_convert_handle_t +init_nottoimaputf7(const char *src_chset, + const char *dst_chset, + int (*output_func)(const char *, size_t, void *), + void *convert_arg) +{ + struct unicode_convert_tosmaputf8 *toutf8; + unicode_convert_handle_t h; + + if (strcmp(dst_chset, unicode_x_smap_modutf8)) + return init_nottosmaputf8(src_chset, dst_chset, + output_func, + convert_arg); + + toutf8=malloc(sizeof(struct unicode_convert_tosmaputf8)); + + if (!toutf8) + return NULL; + + memset(toutf8, 0, sizeof(*toutf8)); + + h=init_nottosmaputf8(src_chset, "utf-8", + do_convert_tosmaputf8, + toutf8); + if (!h) + { + free(toutf8); + return (NULL); + } + + toutf8->output_func=output_func; + toutf8->convert_arg=convert_arg; + + toutf8->hdr.convert_handler=convert_utf8_handler; + toutf8->hdr.deinit_handler=deinit_tosmaputf8; + toutf8->hdr.ptr=toutf8; + toutf8->hdr.next=h; + return &toutf8->hdr; +} + +static int deinit_tosmaputf8(void *ptr, int *errptr) +{ + int rc; + + struct unicode_convert_tosmaputf8 *toutf8= + (struct unicode_convert_tosmaputf8 *)ptr; + + /* Flush out the downstream stack */ + rc=(*toutf8->hdr.next->deinit_handler)(toutf8->hdr.next->ptr, errptr); + + free(toutf8); + return rc; +} + +static int do_convert_tosmaputf8(const char *text, size_t cnt, void *arg) +{ + struct unicode_convert_tosmaputf8 *toutf8= + (struct unicode_convert_tosmaputf8 *)arg; + int rc; + size_t i; + char octal[4]; + + while (cnt) + { + if (toutf8->errflag) + return toutf8->errflag; + + for (i=0; ioutput_func)(text, i, + toutf8->convert_arg); + + if (rc) + { + toutf8->errflag=rc; + return rc; + } + text += i; + cnt -= i; + } + + if (cnt) + { + char c= *text; + + octal[0]='\\'; + octal[3]= (c & 7)+'0'; c /= 8; + octal[2]= (c & 7)+'0'; c /= 8; + octal[1]= (c & 7)+'0'; + rc= (*toutf8->output_func)(octal, 4, + toutf8->convert_arg); + if (rc) + { + toutf8->errflag=rc; + return rc; + } + ++text; + --cnt; + } + } + return 0; +} + +static int convert_utf8_handler(void *ptr, const char *text, size_t cnt) +{ + struct unicode_convert_tosmaputf8 *toutf8= + (struct unicode_convert_tosmaputf8 *)ptr; + + return (*toutf8->hdr.next->convert_handler)(toutf8->hdr.next->ptr, + text, cnt); +} + /************/ /* @@ -483,7 +642,7 @@ static int convert_fromutf7(void *ptr, static int deinit_fromutf7(void *ptr, int *errptr); static unicode_convert_handle_t -init_nottoimaputf7(const char *src_chset, +init_nottosmaputf8(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) @@ -648,6 +807,162 @@ static int deinit_fromutf7(void *ptr, int *errptr) /************/ +/* +** Convert from modified-utf8 SMAP encoding. +** +** This module converts it to UTF-8, then this is attached to a stack that +** converts UTF-8 to the requested charset. +*/ + +static unicode_convert_handle_t +init_notfromsmaputf8(const char *src_chset, + const char *dst_chset, + int (*output_func)(const char *, size_t, void *), + void *convert_arg); + +struct unicode_convert_fromsmaputf8 { + + struct unicode_convert_hdr hdr; + + /* Convert a backslash escape */ + + int in_escape; + + /* The escaped character */ + + unsigned char escape_char; + + int errflag; + int converr; +}; + +static int convert_fromutf8(void *ptr, + const char *text, size_t cnt); +static int deinit_fromutf8(void *ptr, int *errptr); + +static unicode_convert_handle_t +init_notfromimaputf7(const char *src_chset, + const char *dst_chset, + int (*output_func)(const char *, size_t, void *), + void *convert_arg) +{ + struct unicode_convert_fromsmaputf8 *fromutf8; + unicode_convert_handle_t h; + + if (strcmp(src_chset, unicode_x_smap_modutf8)) + return init_notfromsmaputf8(src_chset, dst_chset, + output_func, convert_arg); + + fromutf8=(struct unicode_convert_fromsmaputf8 *) + malloc(sizeof(struct unicode_convert_fromsmaputf8)); + + if (!fromutf8) + return NULL; + + memset(fromutf8, 0, sizeof(*fromutf8)); + + /* Create a stack for converting UTF-8 to the dest charset */ + + h=init_notfromimaputf7("utf-8", dst_chset, + output_func, convert_arg); + + if (!h) + { + free(fromutf8); + return (NULL); + } + + fromutf8->hdr.next=h; + fromutf8->hdr.convert_handler=convert_fromutf8; + fromutf8->hdr.deinit_handler=deinit_fromutf8; + fromutf8->hdr.ptr=fromutf8; + return &fromutf8->hdr; +} + +static int convert_fromutf8(void *ptr, + const char *text, size_t cnt) +{ + struct unicode_convert_fromsmaputf8 *fromutf8= + (struct unicode_convert_fromsmaputf8 *)ptr; + size_t i; + + while (cnt) + { + if (fromutf8->errflag) + return fromutf8->errflag; + + if (fromutf8->in_escape) + { + if (*text < '0' || *text > '7') + { + errno=EILSEQ; + return fromutf8->errflag=-1; + } + fromutf8->escape_char <<= 3; + fromutf8->escape_char |= *text - '0'; + if (--fromutf8->in_escape == 0) + { + fromutf8->errflag=(*fromutf8->hdr.next + ->convert_handler) + (fromutf8->hdr.next->ptr, + (const char *)&fromutf8->escape_char, + 1); + } + ++text; + --cnt; + continue; + } + + for (i=0; ierrflag=(*fromutf8->hdr.next + ->convert_handler) + (fromutf8->hdr.next->ptr, text, i); + text += i; + cnt -= i; + } + + if (cnt) + { + fromutf8->escape_char=0; + fromutf8->in_escape=3; + ++text; + --cnt; + } + } + return 0; +} + +static int deinit_fromutf8(void *ptr, int *errptr) +{ + struct unicode_convert_fromsmaputf8 *fromutf8= + (struct unicode_convert_fromsmaputf8 *)ptr; + int rc; + + if (fromutf8->in_escape) + { + fromutf8->errflag= -1; + errno=EILSEQ; + } + + rc=fromutf8->hdr.next->deinit_handler(fromutf8->hdr.next->ptr, errptr); + + if (fromutf8->errflag && rc == 0) + rc=fromutf8->errflag; + + if (errptr && fromutf8->converr) + *errptr=1; + + free(fromutf8); + return rc; +} + +/************/ + /* A real conversion module, via iconv */ struct unicode_convert_iconv { @@ -674,7 +989,7 @@ static int init_iconv(struct unicode_convert_iconv *h, void *convert_arg); static unicode_convert_handle_t -init_notfromimaputf7(const char *src_chset, +init_notfromsmaputf8(const char *src_chset, const char *dst_chset, int (*output_func)(const char *, size_t, void *), void *convert_arg) -- cgit v1.2.3