courier-unicode: implement unicode_x_smap_modutf8 pseudo-encoding

author: Sam Varshavchik 2018-07-11 22:16:22 -0400
committer: Sam Varshavchik 2018-07-11 22:16:22 -0400
commit: 4fc91797ada09e9f8e3bd0a3cdbe0c78edf2530d (patch)
tree: 0f4417dcffecd8b8e8f060a8a79ca42bc236484a
parent: 03050820bf8e22adb2ec88d079cdde01d7fa6a29 (diff)
download: courier-libs-4fc91797ada09e9f8e3bd0a3cdbe0c78edf2530d.tar.bz2
5 files changed, 345 insertions, 4 deletions
diff --git a/unicode/ChangeLog b/unicode/ChangeLog
index 42278a8..f29176e 100644
--- a/unicode/ChangeLog
+++ b/unicode/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-11  Sam Varshavchik  <mrsam@courier-mta.com>
+
+	* unicode.c: Implement unicode_x_smap_modutf8 pseudo-encoding.
+
 2018-04-27  Sam Varshavchik  <mrsam@courier-mta.com>
 
 	* gcc 8 update, fix assertions. libtool and toolchain updates.
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index a7a0783..bb46ae6 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -167,6 +167,9 @@ check-am: unicodetest
 	test "`./unicodetest 'foobааааааr'`" = "foob&BDAEMAQwBDAEMAQw-r"
 	test "`./unicodetest 'foobаaаr'`" = "foob&BDA-a&BDA-r"
 	test "`./unicodetest 'foobааaааr'`" = "foob&BDAEMA-a&BDAEMA-r"
+	test "`./unicodetest --smaputf8 'hello world'`" = 'hello\040world'
+	test "`./unicodetest --smaputf8 'hello\\\\world'`" = 'hello\134\134world'
+	test "`./unicodetest --smaputf8 ':hello:world:'`" = '\072hello\072world\072'
 	n="aaaaaaaa"; n="$$n$$n$$n$$n"; n="$$n$$n$$n$$n"; n="$$n$$n$$n$$n"; n="$$n$$n"; n="`echo $$n | cut -c1-1023`"; test "`./unicodetest $$n`" = "$$n"
 	n="aaaaaaaa"; n="$$n$$n$$n$$n"; n="$$n$$n$$n$$n"; n="$$n$$n$$n$$n"; n="$$n$$n"; test "`./unicodetest $$n`" = "$$n"
 	n="aaaaaaaa"; n="$$n$$n$$n$$n"; n="$$n$$n$$n$$n"; n="$$n$$n$$n$$n"; n="$$n$$n"; test "`./unicodetest a$$n`" = "a$$n"
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index 6bf9c17..7c6a00d 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -2,7 +2,7 @@
 #define	courier_unicode_h
 
 /*
-** Copyright 2000-2017 Double Precision, Inc.
+** Copyright 2000-2018 Double Precision, Inc.
 ** See COPYING for distribution information.
 **
 */
@@ -987,6 +987,19 @@ extern const char unicode_u_ucs2_native[];
 
 #define unicode_x_imap_modutf7 "x-imap-modutf7"
 
+/*
+** EAI-capable Courier-IMAP does not use modified-UTF7, and uses UTF-8.
+**
+** However, to support SMAP we will still need to encode/decode some
+** special characters.
+**
+** The special characters are ./~:\ and U+0000-U+0020 (including space).
+**
+** They are encoded as a backslash followed by three octal digits.
+*/
+
+#define unicode_x_smap_modutf8 "x-smap-modutf8"
+
 #if 0
 {
 #endif
diff --git a/unicode/unicode.c b/unicode/unicode.c
index 71e6439..2cf5856 100644
--- a/unicode/unicode.c
+++ b/unicode/unicode.c
@@ -1,5 +1,5 @@
 /*
-** Copyright 2000-2011 Double Precision, Inc.
+** Copyright 2000-2018 Double Precision, Inc.
 ** See COPYING for distribution information.
 **
 */
@@ -239,6 +239,42 @@ static int do_convert_toutf7(const char *text, size_t cnt, void *arg);
 static int convert_utf7_handler(void *ptr, const char *text, size_t cnt);
 
 /*
+** Conversion wrapper for converting to modified-utf8 SMAP encoding.
+**
+** This is done by converting to UTF-8, then stacking on a module that
+** takes that and converts UTF-8 to modified-UTF8.
+**
+** init_nottosmaputf8() is the next link in the chain: it builds the stack
+** for every conversion other than the one to modified UTF-8.
+*/
+
+static unicode_convert_handle_t
+init_nottosmaputf8(const char *src_chset,
+		   const char *dst_chset,
+		   int (*output_func)(const char *, size_t, void *),
+		   void *convert_arg);
+
+/*
+** The module that converts to modified UTF-8
+*/
+
+struct unicode_convert_tosmaputf8 {
+	/* Handlers and next-module link, filled in by init_nottoimaputf7() */
+	struct unicode_convert_hdr hdr;
+	/* First non-zero output_func result; once set, conversion stops */
+	int errflag;
+
+	/* Remembered output function */
+
+	int (*output_func)(const char *, size_t, void *);
+
+	/* Remembered arg to the output function */
+	void *convert_arg;
+};
+
+
+
+/*
 ** Create a conversion module stack
 */
 
@@ -417,6 +453,129 @@ static int deinit_toimaputf7(void *ptr, int *errptr)
 	return rc;
 }
 
+/*
+** Convert to unicode_x_smap_modutf8.
+*/
+
+static int deinit_tosmaputf8(void *ptr, int *errptr);
+static int do_convert_tosmaputf8(const char *text, size_t cnt, void *arg);
+static int convert_utf8_handler(void *ptr, const char *text, size_t cnt);
+
+static unicode_convert_handle_t
+init_nottoimaputf7(const char *src_chset,
+		   const char *dst_chset,
+		   int (*output_func)(const char *, size_t, void *),
+		   void *convert_arg)
+{
+	struct unicode_convert_tosmaputf8 *stage;
+	unicode_convert_handle_t downstream;
+
+	/* Any other destination is handled by the rest of the chain */
+	if (strcmp(dst_chset, unicode_x_smap_modutf8) != 0)
+		return init_nottosmaputf8(src_chset, dst_chset,
+					  output_func, convert_arg);
+
+	/* calloc() hands back the stage already zeroed */
+	stage=calloc(1, sizeof(*stage));
+	if (stage == NULL)
+		return NULL;
+
+	/*
+	** Convert the source charset to plain UTF-8 first; that stack's
+	** output is then escaped by do_convert_tosmaputf8().
+	*/
+	downstream=init_nottosmaputf8(src_chset, "utf-8",
+				      do_convert_tosmaputf8, stage);
+	if (downstream == NULL)
+	{
+		free(stage);
+		return NULL;
+	}
+
+	stage->hdr.next=downstream;
+	stage->hdr.ptr=stage;
+	stage->hdr.deinit_handler=deinit_tosmaputf8;
+	stage->hdr.convert_handler=convert_utf8_handler;
+	stage->output_func=output_func;
+	stage->convert_arg=convert_arg;
+	return &stage->hdr;
+}
+
+static int deinit_tosmaputf8(void *ptr, int *errptr)
+{
+	struct unicode_convert_tosmaputf8 *stage=ptr;
+	int status;
+
+	/*
+	** Flush and destroy the downstream stack first; its output still
+	** arrives via do_convert_tosmaputf8(), which uses this stage.
+	*/
+	status=stage->hdr.next->deinit_handler(stage->hdr.next->ptr, errptr);
+	free(stage);
+	return status;
+}
+
+static int do_convert_tosmaputf8(const char *text, size_t cnt, void *arg)
+{
+	struct unicode_convert_tosmaputf8 *toutf8=
+		(struct unicode_convert_tosmaputf8 *)arg;
+	int rc;
+	size_t i;
+	char octal[4];
+
+	/*
+	** Receives UTF-8 from the downstream stack. Runs of ordinary bytes
+	** are passed through; U+0000-U+0020 and ./~:\ are each replaced by
+	** a backslash and three octal digits. Returns 0, or the first
+	** non-zero value returned by output_func (also kept in errflag).
+	*/
+	while (cnt)
+	{
+		if (toutf8->errflag)
+			return toutf8->errflag;
+
+		/* unsigned compare: UTF-8 bytes >= 0x80 must not look small */
+		for (i=0; i<cnt; ++i)
+			if ((unsigned char)text[i] <= ' ' ||
+			    strchr("./~:\\", text[i]))
+				break;
+		if (i)
+		{
+			rc= (*toutf8->output_func)(text, i,
+						   toutf8->convert_arg);
+			if (rc)
+				return toutf8->errflag=rc;
+			text += i;
+			cnt -= i;
+		}
+		if (cnt)
+		{
+			unsigned char c= (unsigned char)*text;
+
+			octal[0]='\\';
+			octal[3]= (c & 7)+'0'; c /= 8;
+			octal[2]= (c & 7)+'0'; c /= 8;
+			octal[1]= (c & 7)+'0';
+			rc= (*toutf8->output_func)(octal, 4,
+						   toutf8->convert_arg);
+			if (rc)
+				return toutf8->errflag=rc;
+			++text;
+			--cnt;
+		}
+	}
+	return 0;
+}
+
+static int convert_utf8_handler(void *ptr, const char *text, size_t cnt)
+{
+	struct unicode_convert_tosmaputf8 *stage=ptr;
+
+	/* Feed the input, untouched, to the stack that converts to UTF-8 */
+	return stage->hdr.next->convert_handler(stage->hdr.next->ptr,
+						text, cnt);
+}
+
 /************/
 
 /*
@@ -483,7 +642,7 @@ static int convert_fromutf7(void *ptr,
 static int deinit_fromutf7(void *ptr, int *errptr);
 
 static unicode_convert_handle_t
-init_nottoimaputf7(const char *src_chset,
+init_nottosmaputf8(const char *src_chset,
 		   const char *dst_chset,
 		   int (*output_func)(const char *, size_t, void *),
 		   void *convert_arg)
@@ -648,6 +807,162 @@ static int deinit_fromutf7(void *ptr, int *errptr)
 
 /************/
 
+/*
+** Convert from modified-utf8 SMAP encoding.
+**
+** This module converts it to UTF-8, then this is attached to a stack that
+** converts UTF-8 to the requested charset.
+*/
+
+static unicode_convert_handle_t
+init_notfromsmaputf8(const char *src_chset,
+		     const char *dst_chset,
+		     int (*output_func)(const char *, size_t, void *),
+		     void *convert_arg);
+
+struct unicode_convert_fromsmaputf8 {
+
+	struct unicode_convert_hdr hdr;
+
+	/* Number of octal digits still expected after a backslash, or 0 */
+
+	int in_escape;
+
+	/* The escaped character, accumulated three bits per digit */
+
+	unsigned char escape_char;
+	/* Sticky error from a bad escape or from the next module */
+	int errflag;
+	int converr; /* read by deinit_fromutf8(); never set in the code shown */
+};
+
+static int convert_fromutf8(void *ptr,
+			    const char *text, size_t cnt);
+static int deinit_fromutf8(void *ptr, int *errptr);
+
+static unicode_convert_handle_t
+init_notfromimaputf7(const char *src_chset,
+		     const char *dst_chset,
+		     int (*output_func)(const char *, size_t, void *),
+		     void *convert_arg)
+{
+	struct unicode_convert_fromsmaputf8 *fromutf8;
+	unicode_convert_handle_t h;
+
+	/* Not decoding modified UTF-8: the rest of the chain handles it */
+	if (strcmp(src_chset, unicode_x_smap_modutf8))
+		return init_notfromsmaputf8(src_chset, dst_chset,
+					    output_func, convert_arg);
+
+	/* Zeroed state: not in an escape, no error recorded */
+	fromutf8=calloc(1, sizeof(*fromutf8));
+	if (!fromutf8)
+		return NULL;
+
+	/*
+	** Create a stack for converting UTF-8 to the dest charset. The
+	** source is now plain UTF-8, so go straight to the next stage
+	** instead of re-entering this function.
+	*/
+	h=init_notfromsmaputf8("utf-8", dst_chset,
+			       output_func, convert_arg);
+	if (!h)
+	{
+		free(fromutf8);
+		return NULL;
+	}
+
+	fromutf8->hdr.next=h;
+	fromutf8->hdr.convert_handler=convert_fromutf8;
+	fromutf8->hdr.deinit_handler=deinit_fromutf8;
+	fromutf8->hdr.ptr=fromutf8;
+	return &fromutf8->hdr;
+}
+
+static int convert_fromutf8(void *ptr, const char *text, size_t cnt)
+{
+	struct unicode_convert_fromsmaputf8 *fromutf8=ptr;
+	size_t i;
+
+	/*
+	** Decode \ooo escapes (an escape may span calls); pass all other
+	** input to the next module. Returns 0 or the sticky error code.
+	*/
+	while (cnt)
+	{
+		if (fromutf8->errflag)
+			return fromutf8->errflag;
+
+		if (fromutf8->in_escape)
+		{
+			/* A first digit above 3 would overflow one byte */
+			if (*text < '0' ||
+			    *text > (fromutf8->in_escape == 3 ? '3':'7'))
+			{
+				errno=EILSEQ;
+				return fromutf8->errflag=-1;
+			}
+			fromutf8->escape_char <<= 3;
+			fromutf8->escape_char |= *text - '0';
+			if (--fromutf8->in_escape == 0)
+				fromutf8->errflag=(*fromutf8->hdr.next
+						   ->convert_handler)
+					(fromutf8->hdr.next->ptr,
+					 (const char *)&fromutf8->escape_char,
+					 1);
+			++text;
+			--cnt;
+			continue;
+		}
+
+		for (i=0; i<cnt; ++i)
+			if (text[i] == '\\')
+				break;
+		if (i)
+		{
+			fromutf8->errflag=(*fromutf8->hdr.next
+					   ->convert_handler)
+				(fromutf8->hdr.next->ptr, text, i);
+			text += i;
+			cnt -= i;
+		}
+		if (cnt)
+		{
+			fromutf8->escape_char=0;
+			fromutf8->in_escape=3;
+			++text;
+			--cnt;
+		}
+	}
+	return fromutf8->errflag; /* report a failed final chunk now */
+}
+
+static int deinit_fromutf8(void *ptr, int *errptr)
+{
+	struct unicode_convert_fromsmaputf8 *fromutf8=ptr;
+	int truncated=fromutf8->in_escape != 0; /* input ended mid-escape */
+	int rc;
+
+	if (truncated)
+		fromutf8->errflag= -1;
+
+	rc=fromutf8->hdr.next->deinit_handler(fromutf8->hdr.next->ptr, errptr);
+	/* The next module's own failure takes precedence over ours */
+	if (rc || !fromutf8->errflag)
+		truncated=0;
+	else
+		rc=fromutf8->errflag;
+	if (errptr && fromutf8->converr)
+		*errptr=1;
+
+	free(fromutf8);
+	if (truncated)
+		errno=EILSEQ; /* set last: teardown may have clobbered it */
+	return rc;
+}
+
+/************/
+
 /* A real conversion module, via iconv */
 
 struct unicode_convert_iconv {
@@ -674,7 +989,7 @@ static int init_iconv(struct unicode_convert_iconv *h,
 		      void *convert_arg);
 
 static unicode_convert_handle_t
-init_notfromimaputf7(const char *src_chset,
+init_notfromsmaputf8(const char *src_chset,
 		     const char *dst_chset,
 		     int (*output_func)(const char *, size_t, void *),
 		     void *convert_arg)
diff --git a/unicode/unicodetest.c b/unicode/unicodetest.c
index b59fab3..b309ad6 100644
--- a/unicode/unicodetest.c
+++ b/unicode/unicodetest.c
@@ -134,6 +134,12 @@ int main(int argc, char **argv)
 		++argn;
 	}
 
+	if (argn < argc && strcmp(argv[argn], "--smaputf8") == 0)
+	{
+		chset=unicode_x_smap_modutf8;
+		++argn;
+	}
+
 	if (argn < argc && strcmp(argv[argn], "--totitle") == 0)
 	{
 		++argn;
author	Sam Varshavchik	2018-07-11 22:16:22 -0400
committer	Sam Varshavchik	2018-07-11 22:16:22 -0400
commit	4fc91797ada09e9f8e3bd0a3cdbe0c78edf2530d (patch)
tree	0f4417dcffecd8b8e8f060a8a79ca42bc236484a
parent	03050820bf8e22adb2ec88d079cdde01d7fa6a29 (diff)
download	courier-libs-4fc91797ada09e9f8e3bd0a3cdbe0c78edf2530d.tar.bz2