diff options
Diffstat (limited to 'unicode/unicode.h')
| -rw-r--r-- | unicode/unicode.h | 1692 | 
1 files changed, 1692 insertions, 0 deletions
| diff --git a/unicode/unicode.h b/unicode/unicode.h new file mode 100644 index 0000000..4e018b7 --- /dev/null +++ b/unicode/unicode.h @@ -0,0 +1,1692 @@ +#ifndef	unicode_h +#define	unicode_h + +/* +** Copyright 2000-2011 Double Precision, Inc. +** See COPYING for distribution information. +** +*/ + +#ifdef	__cplusplus + +#include <string> +#include <vector> +#include <list> + +extern "C" { +#endif + +#if 0 +} +#endif + +#include	"unicode/unicode_config.h" + +#include	<stdlib.h> + +#include	<stdio.h> +#if HAVE_WCHAR_H +#include	<wchar.h> +#endif + +#if HAVE_STDDEF_H +#include	<stddef.h> +#endif +#include	<stdint.h> + +#include	<sys/types.h> + +typedef uint32_t unicode_char; + +/* +** The system default character set, from the locale. +*/ + +extern const char *unicode_default_chset(); + +/* Unicode upper/lower/title case conversion functions */ + +extern unicode_char unicode_uc(unicode_char); +extern unicode_char unicode_lc(unicode_char); +extern unicode_char unicode_tc(unicode_char); + +/* +** Look up HTML 4.0/XHTML entity. +** +** n="amp", etc... +** +** Returns the unicode entity value, or 0 if no such entity is defined. +*/ + +unicode_char unicode_html40ent_lookup(const char *n); + +/* +** +** Return "width" of unicode character. +** +** This is defined as follows: for characters having the F or W property in +** tr11 (EastAsianWidth), unicode_wcwidth() returns 2. +** +** Otherwise, for characters having the BK, CR, LF, CM, NL, WJ, and ZW line +** breaking property as per tr14, unicode_wcwidth() returns 0. For all other +** cases, 1. +** +** This provides a rough estimate of the "width" of the character if it is +** shown on a text console. 
+*/ + +extern int unicode_wcwidth(unicode_char c); +extern size_t unicode_wcwidth_str(const unicode_char *c); + +/* +** The unicode-ish isspace() +*/ +extern int unicode_isspace(unicode_char ch); + +/* Internal unicode table lookup function */ + +extern uint8_t unicode_tab_lookup(unicode_char ch, +				  const size_t *unicode_indextab, +				  size_t unicode_indextab_sizeof, +				  const uint8_t (*unicode_rangetab)[2], +				  const uint8_t *unicode_classtab, +				  uint8_t uclass); + +/* +** Implementation of grapheme cluster boundary rules, as per tr29, +** including GB9a and GB9b. +** +** Returns non-zero if there's a grapheme break between the two referenced +** characters. +*/ + +int unicode_grapheme_break(unicode_char a, unicode_char b); + +/* +** Implementation of line break rules, as per tr14. +** +** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The +** first parameter is a callback function that gets invoked with two +** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument. +** The second parameter to unicode_lb_init() is the opaque passthrough +** pointer, that is passed as the second argument to the callback function +** with no further interpretation. +** +** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(), +** passing the handle and one unicode character. Repeatedly invoke +** unicode_lb_next() to specify the input string for the linebreaking +** algorithm, then invoke unicode_lb_end() to finish calculating the +** linebreaking algorithm, and deallocate the opaque linebreaking handle. +** +** The callback function gets invoked once for each invocation of +** unicode_lb_next(). The contract is that before unicode_lb_end() returns, +** the callback function will get invoked the exact number of times that +** unicode_lb_next() was invoked, as long as each invocation of the callback function +** returned 0; nothing more, nothing less. 
The first parameter to the callback +** function will be one of the following values: +** +** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding +** character. +** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding +** character. +** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding +** character (the preceding character is a space, or an equivalent). +** +** The callback function should return 0. A non-zero value indicates an +** error, which gets propagated up to the caller. The contract that the +** callback function gets invoked the same number of times that +** unicode_lb_next() gets invoked no longer holds in that case. +*/ + +#define UNICODE_LB_MANDATORY	-1 +#define UNICODE_LB_NONE		0 +#define UNICODE_LB_ALLOWED	1 + +struct unicode_lb_info; + +typedef struct unicode_lb_info *unicode_lb_info_t; + +/* +** Allocate a linebreaking handle. +*/ +extern unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *), +					 void *cb_arg); + +/* +** Feed the next character through the linebreaking algorithm. +** A non-zero return code indicates that the callback function was invoked +** and it returned a non-zero return code (which is propagated as a return +** value). unicode_lb_end() must still be invoked, in this case. +** +** A zero return code indicates that if the callback function was invoked, +** it returned 0. +*/ + +extern int unicode_lb_next(unicode_lb_info_t i, unicode_char ch); + +/* +** Convenience function that invokes unicode_lb_next() with an array of +** cnt unicode chars. Returns 0 if all invocations of unicode_lb_next() returned +** 0, or the first non-zero return value from unicode_lb_next(). +*/ + +extern int unicode_lb_next_cnt(unicode_lb_info_t i, +			       const unicode_char *chars, +			       size_t cnt); + +/* +** Finish the linebreaking algorithm. 
+** +** A non-zero return code indicates that the callback function was invoked +** and it returned a non-zero return code (which is propagated as a return +** value). +** +** A zero return code indicates that if the callback function was invoked, +** it returned 0, and that the callback function was invoked exactly the same +** number of times that unicode_lb_next() was invoked. +** +** In all cases, the linebreak handle will no longer be valid when this +** function returns. +*/ + +extern int unicode_lb_end(unicode_lb_info_t i); + +/* +** An alternative linebreak API where the callback function receives the +** original unicode character in addition to its linebreak value. +** +** Use unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose +** semantics are the same as their _lb_ counterparts. +*/ + +struct unicode_lbc_info; + +typedef struct unicode_lbc_info *unicode_lbc_info_t; + +extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, +							  void *), +					   void *cb_arg); +extern int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch); +extern int unicode_lbc_end(unicode_lbc_info_t i); + +/* +** Set linebreaking options. +** +** OPTIONS SUBJECT TO CHANGE. +*/ + +extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts); + +extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts); + +/* +** Tailoring of LB24: Prevent pluses, as in "C++", from breaking. +** +** Adds the following to LB24: +** +**            PR x PR +** +**            AL x PR +** +**            ID x PR +**/ +#define UNICODE_LB_OPT_PRBREAK 0x0001 + + +/* +** Tailored / breaking rules. +** +** Adds the following rules to LB13: +** +**            SY x EX +** +**            SY x AL +** +**            SY x ID +** +**            SP ÷ SY, which takes precedence over "x SY". +*/ +#define UNICODE_LB_OPT_SYBREAK 0x0002 + +/* +** Tailored / breaking rules. 
+** +** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before +** and after mdash and ndash. +*/ +#define UNICODE_LB_OPT_DASHWJ 0x0004 + +/* +** Implementation of word break rules, as per tr29. +** +** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The +** first parameter is a callback function that gets invoked with two +** arguments: an int flag, and a passthrough argument. The second parameter to +** unicode_wb_init() is the opaque passthrough pointer, that is passed as the +** second argument to the callback function with no further interpretation. +** +** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(), +** passing the handle and one unicode character. Repeatedly invoke +** unicode_wb_next() to specify the input string for the wordbreaking +** algorithm, then invoke unicode_wb_end() to finish calculating the +** wordbreaking algorithm, and deallocate the opaque wordbreaking handle. +** +** The callback function gets invoked once for each invocation of +** unicode_wb_next(). The contract is that before unicode_wb_end() returns, +** the callback function will get invoked the exact number of times that +** unicode_wb_next() was invoked, as long as each invocation of the callback function +** returned 0; nothing more, nothing less. The first parameter to the callback +** function will be an int. A non-zero value indicates that there is a word +** break between this character and the preceding one. +** +** The callback function should return 0. A non-zero value indicates an +** error, which gets propagated up to the caller. The contract that the +** callback function gets invoked the same number of times that +** unicode_wb_next() gets invoked is now broken. +*/ + +struct unicode_wb_info; + +typedef struct unicode_wb_info *unicode_wb_info_t; + +/* +** Allocate a wordbreaking handle. 
+*/ +extern unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *), +					 void *cb_arg); + +/* +** Feed the next character through the wordbreaking algorithm. +** A non-zero return code indicates that the callback function was invoked +** and it returned a non-zero return code (which is propagated as a return +** value). unicode_wb_end() must still be invoked, in this case. +** +** A zero return code indicates that if the callback function was invoked, +** it returned 0. +*/ + +extern int unicode_wb_next(unicode_wb_info_t i, unicode_char ch); + +/* +** Convenience function that invokes unicode_wb_next() with an array of +** cnt unicode chars. Returns 0 if all invocations of unicode_wb_next() returned +** 0, or the first non-zero return value from unicode_wb_next(). +*/ + +extern int unicode_wb_next_cnt(unicode_wb_info_t i, +			       const unicode_char *chars, +			       size_t cnt); + +/* +** Finish the wordbreaking algorithm. +** +** A non-zero return code indicates that the callback function was invoked +** and it returned a non-zero return code (which is propagated as a return +** value). +** +** A zero return code indicates that if the callback function was invoked, +** it returned 0, and that the callback function was invoked exactly the same +** number of times that unicode_wb_next() was invoked. +** +** In all cases, the wordbreak handle will no longer be valid when this +** function returns. +*/ + +extern int unicode_wb_end(unicode_wb_info_t i); + +/* +** Search for a word boundary. +** +** Obtain a handle by calling unicode_wbscan_init(), then invoke +** unicode_wbscan_next() to provide a unicode stream, then invoke +** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode +** characters from the beginning of the stream until the first word boundary. 
+** +** You may prematurely stop calling unicode_wbscan_next() once it returns a +** non-0 value, which means that there is sufficient context to compute the +** first word boundary, and all further calls to unicode_wbscan_next() will +** be internal no-ops. +*/ + +struct unicode_wbscan_info; + +typedef struct unicode_wbscan_info *unicode_wbscan_info_t; + +unicode_wbscan_info_t unicode_wbscan_init(); + +int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch); + +size_t unicode_wbscan_end(unicode_wbscan_info_t i); + +/* +** A buffer that holds unicode characters, and dynamically grows as needed. +*/ + +struct unicode_buf { +	unicode_char *ptr;	/* The unicode characters */ +	size_t size,		/* Buffer size */ +		len,		/* How many characters in ptr are initialized */ +		max;		/* Maximum size the buffer can grow to */ +}; + +/* +** Initialize a buffer. Constructor. +*/ + +void unicode_buf_init(/* Initialize this structure. ptr, size, len cleared */ +		      struct unicode_buf *p, + +		      /* +		      ** Maximum size the buffer can grow to. (size_t)-1 +		      ** means unlimited. +		      */ +		      size_t max); +/* +** Like unicode_buf_init(), but also initializes the new buffer with the +** contents of another buffer. The maximum size of the initialized buffer is +** exactly the number of characters in the existing buffer. This copies a +** buffer using the minimum amount of heap space. +** +** This is a macro: both arguments are evaluated more than once, so they +** must not have side effects. The result of the append is discarded. +*/ + +#define unicode_buf_init_copy(a,b)				\ +	do {							\ +		unicode_buf_init((a), unicode_buf_len(b));	\ +		unicode_buf_append_buf((a),(b));		\ +	} while (0) + +/* +** Deinitialize the buffer. Destructor. Frees memory. +*/ + +void unicode_buf_deinit(struct unicode_buf *p); + +/* +** Official way to access the characters in the unicode buffer. +*/ +#define unicode_buf_ptr(p) ((p)->ptr) + +/* +** Official way of obtaining the number of characters in the unicode buffer. +*/ +#define unicode_buf_len(p) ((p)->len) + +/* +** Remove all existing characters from an initialized buffer. 
Sets len to 0. +*/ + +#define unicode_buf_clear(p) ((p)->len=0) + +/* +** Append characters to the existing characters in the unicode buffer. +** The buffer grows, if needed. If the buffer would exceed its maximum size, +** the extra characters get truncated. +** +** Returns 0 if the characters were appended. -1 for a malloc failure. +*/ + +int unicode_buf_append(struct unicode_buf *p,	/* The buffer */ +		       const unicode_char *uc,	/* Characters to append */ +		       size_t l);		/* How many of them */ + +/* +** Convert an iso-8859-1 char string and invoke unicode_buf_append(). +*/ + +void unicode_buf_append_char(struct unicode_buf *dst, +			     const char *str, +			     size_t cnt); + +/* +** Remove some portion of the unicode buffer +*/ + +void unicode_buf_remove(struct unicode_buf *p, /* The buffer */ +			size_t pos, /* Offset in buffer */ +			size_t cnt); /* How many to remove */ + +/* +** Append the contents of an existing buffer to another one. +** +** This is a macro that expands to a unicode_buf_append() call, and yields +** its return value. The source buffer argument, b, is evaluated twice. +*/ + +#define unicode_buf_append_buf(a,b)					\ +	unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b)) + + +/* +** The equivalent of strcmp() for unicode buffers. +*/ + +int unicode_buf_cmp(const struct unicode_buf *a, +		    const struct unicode_buf *b); + +/* +** The equivalent of unicode_buf_cmp, except that the second buffer is an +** iso-8859-1 string. +*/ + +int unicode_buf_cmp_str(const struct unicode_buf *p, +			const char *c,	/* iso-8859-1 string */ +			size_t cl);	/* Number of chars in c */ + +/* +** A wrapper for iconv(3). This wrapper provides a different API for iconv(3). +** A handle gets created by libmail_u_convert_init(). +** libmail_u_convert_init() receives a pointer to the output function +** which receives converted character text. +** +** The output function receives a pointer to the converted character text, and +** the number of characters in the converted text. +** +** The character text to convert gets passed, repeatedly, to +** libmail_u_convert(). 
Each call to libmail_u_convert() results in +** the output function being invoked, zero or more times, with the converted +** text. Finally, libmail_u_convert_deinit() stops the conversion and +** deallocates the conversion handle. +** +** Internal buffering takes place. libmail_u_convert_deinit() may result +** in the output function being called one or more times, to receive the final +** part of the converted character stream. +** +** The output function should return 0. A non-0 value causes +** libmail_u_convert() and/or libmail_u_convert_deinit() to return +** non-0. +*/ + +struct libmail_u_convert_hdr; + +typedef struct libmail_u_convert_hdr *libmail_u_convert_handle_t; + +/* +** libmail_u_convert_init() returns a non-NULL handle for the requested +** conversion, or NULL if the requested conversion is not available. +*/ + +libmail_u_convert_handle_t +libmail_u_convert_init(/* Convert from this chset */ +		       const char *src_chset, + +		       /* Convert to this chset */ +		       const char *dst_chset, + +		       /* The output function */ + +		       int (*output_func)(const char *, size_t, void *), + +		       /* Passthrough arg */ +		       void *convert_arg); + +/* +** Repeatedly pass the character text to convert to libmail_u_convert(). +** +** Returns non-0 if the output function returned non-0, or 0 if all invocations +** of the output function returned 0. +*/ + +int libmail_u_convert(/* The conversion handle */ +		      libmail_u_convert_handle_t handle, + +		      /* Text to convert */ +		      const char *text, + +		      /* Number of bytes to convert */ +		      size_t cnt); + +/* +** Finish character set conversion. The handle gets deallocated. +** +** May still result in one or more invocations of the output function. 
+** Returns non-zero if any previous invocation of the output function returned +** non-zero (this includes any invocations of the output function resulting +** from this call, or prior libmail_u_convert() calls), or 0 if all +** invocations of the output function returned 0. +** +** If the errptr is not NULL, *errptr is set to non-zero if there were any +** conversion errors -- if there was any text that could not be converted to +** the destination character text. +*/ + +int libmail_u_convert_deinit(libmail_u_convert_handle_t handle, +			     int *errptr); + + +/* +** Specialization: save converted character text in a buffer. +** +** Implementation: call libmail_u_convert_tocbuf_init() instead of +** libmail_u_convert_init(), then call libmail_u_convert() and +** libmail_u_convert_deinit(), as usual. +** +** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a +** malloc()ed buffer, and the number of converted characters (the size of the +** malloc()ed buffer) is placed into *cbufsize_ret. Both are the arguments +** that were passed to libmail_u_convert_tocbuf_init(). +** +** Note: if the converted string is an empty string, *cbufsize_ret is set to 0, +** but *cbufptr_ret still gets initialized (to a dummy malloced buffer). +** +** The optional nullterminate places a trailing \0 character after the +** converted string (this is included in *cbufsize_ret). +*/ + +libmail_u_convert_handle_t +libmail_u_convert_tocbuf_init(/* Convert from this chset */ +			      const char *src_chset, + +			      /* Convert to this chset */ +			      const char *dst_chset, + +			      /* malloced buffer */ +			      char **cbufptr_ret, + +			      /* size of the malloced buffer */ +			      size_t *cbufsize_ret, + +			      /* null terminate the resulting string */ +			      int nullterminate +			      ); + + +/* +** Specialization: convert some character text to a unicode_char array. +** +** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char +** array. 
+** +** The returned *ucsize_ret is initialized with the number of unicode_chars, +** rather than the byte count. +** +** In all other ways, this function behaves identically to +** libmail_u_convert_tocbuf_init(). +*/ + +libmail_u_convert_handle_t +libmail_u_convert_tou_init(/* Convert from this chset */ +			   const char *src_chset, + +			   /* malloc()ed buffer pointer, on exit. */ +			   unicode_char **ucptr_ret, + +			   /* size of the malloc()ed buffer, upon exit */ +			   size_t *ucsize_ret, + +			   /* If true, terminate with U+0000, for convenience */ +			   int nullterminate +			   ); + +/* +** Specialization: convert a unicode_char array to some character text. +** +** This is the opposite of libmail_u_convert_tou_init(). Call this to +** initialize the conversion handle, then use libmail_u_convert_uc() +** instead of libmail_u_convert(). +*/ + +libmail_u_convert_handle_t +libmail_u_convert_fromu_init(/* Convert to this chset */ +			     const char *dst_chset, + +			     /* malloc()ed buffer pointer, on exit. */ +			     char **cbufptr_ret, + +			     /* size of the malloc()ed buffer, upon exit */ +			     size_t *cbufsize_ret, + +			     /* If true, null-terminate the result, for convenience */ +			     int nullterminate +			     ); + +int libmail_u_convert_uc(/* The conversion handle */ +			 libmail_u_convert_handle_t handle, + +			 /* Text to convert */ +			 const unicode_char *text, + +			 /* +			 ** How much of text to convert. NOTE(review): this +			 ** was documented as a byte count, but is presumably +			 ** the number of unicode_chars -- TODO confirm. +			 */ +			 size_t cnt); + +/* +** Initialize conversion to UTF-8. +** +** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the +** destination charset as UTF-8. +*/ + +libmail_u_convert_handle_t +libmail_u_convert_tocbuf_toutf8_init(const char *src_chset, +				     char **cbufptr_ret, +				     size_t *cbufsize_ret, +				     int nullterminate); + +/* +** Initialize conversion from UTF-8. +** +** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the +** source charset as UTF-8. 
+*/ + +libmail_u_convert_handle_t +libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset, +				       char **cbufptr_ret, +				       size_t *cbufsize_ret, +				       int nullterminate); + +/* +** Convert a character string to UTF-8. +** +** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an +** error occurred. +*/ +char *libmail_u_convert_toutf8(/* Text to convert to UTF-8 */ +			       const char *text, + +			       /* Character set to convert to UTF-8 */ +			       const char *charset, + +			       /* +			       ** If non-NULL, and a non-NULL pointer is +			       ** returned, *error is set to non-zero if +			       ** a character conversion error has occurred. +			       */ +			       int *error); + +/* +** Convert UTF-8 text to another character set. +** +** Returns a malloc-ed buffer holding the string converted to the specified +** character set, or NULL if an error occurred. +*/ + +char *libmail_u_convert_fromutf8(/* A UTF-8 string */ +				 const char *text, + +				 /* +				 ** Convert the UTF-8 string to this character +				 ** set. +				 */ + +				 const char *charset, + +				 /* +				 ** If non-NULL, and a non-NULL pointer is +				 ** returned, *error is set to non-zero if +				 ** a character conversion error has occurred. +				 */ +				 int *error); + +/* +** Convert one charset to another charset, placing the result in a malloc-ed +** buffer. +** +** Returns a malloc-ed buffer holding the string converted to the specified +** character set, or NULL if an error occurred. +*/ + +char *libmail_u_convert_tobuf(/* A string to convert */ +			      const char *text, + +			      /* +			      ** String's charset. +			      */ + +			      const char *charset, + +			      /* +			      ** Destination charset +			      */ +			      const char *dstcharset, + +			      /* +			      ** If non-NULL, and a non-NULL pointer is +			      ** returned, *error is set to non-zero if +			      ** a character conversion error has occurred. 
+			      */ +			      int *error); + +/* +** Convenience function: call libmail_u_convert_tou_init(), feed the +** character string through libmail_u_convert(), then call +** libmail_u_convert_deinit(). +** +** If this function returns 0, *uc and *ucsize are set to a malloced buffer+size +** holding the unicode char array. +*/ + +int libmail_u_convert_tou_tobuf(/* Character text to convert */ +				const char *text, + +				/* Number of characters */ +				size_t text_l, + +				/* text's charset */ +				const char *charset, + +				/* +				** If this function returns 0, this gets +				** initialized +				*/ +				unicode_char **uc, + +				/* +				** Size of the allocated buffer +				*/ +				size_t *ucsize, + +				/* +				** If not null and this function returns 0, +				** this is set to non-0 if there +				** was a conversion error (but the output +				** buffer still gets allocated and +				** initialized) +				*/ +				int *err); + +/* +** Convenience function: call libmail_u_convert_fromu_init(), feed the +** unicode_array through libmail_u_convert_uc(), then call +** libmail_u_convert_deinit(). +** +** If this function returns 0, *c and *csize are set to a malloced buffer+size +** holding the converted character string +*/ + +int libmail_u_convert_fromu_tobuf(/* Unicode array to convert to a char str */ +				  const unicode_char *utext, + +				  /* +				  ** Size of the unicode array. +				  ** If this is (size_t)-1, utext is a +				  ** 0-terminated array. +				  */ +				  size_t utext_l, + +				  /* +				  ** Convert the unicode array to this charset. +				  */ +				  const char *charset, + +				  /* +				  ** If libmail_u_convert_fromu_tobuf() +				  ** returns 0, this is initialized to a +				  ** malloced buffer holding a 0-terminated +				  ** string. +				  */ +				  char **c, + +				  /* +				  ** Size of the initialized array, including +				  ** the 0-terminator. 
+				  */ +				  size_t *csize, + +				  /* +				  ** If libmail_u_convert_fromu_tobuf() +				  ** returns 0 and this is not NULL, +				  ** *err is set to non-0 if there was a +				  ** conversion error to the requested +				  ** character set. +				  */ +				  int *err); + +/* +** Convenience function: convert a string in a given character set +** to/from uppercase, lowercase, or something else. +** +** This is done by calling libmail_u_convert_tou_tobuf() first, +** applying first_char_func and char_func, then using +** libmail_u_convert_fromu_tobuf(). +** +** A NULL return indicates that the requested conversion cannot be performed. +*/ + +char *libmail_u_convert_tocase( /* String to convert */ +			       const char *str, + +			       /* String's character set */ + +			       const char *charset, + +			       /* +			       ** Conversion of the first character in +			       ** str: unicode_uc, unicode_lc, or unicode_tc: +			       */ + +			       unicode_char (*first_char_func)(unicode_char), + +			       /* +			       ** Conversion of the second and the remaining +			       ** characters in str. If NULL, same as +			       ** first_char_func. +			       */ +			       unicode_char (*char_func)(unicode_char)); + + + +/* Either UCS-4BE or UCS-4LE, matching the native unicode_char endianness */ + +extern const char libmail_u_ucs4_native[]; + +/* Either UCS-2BE or UCS-2LE, matching the native unicode_char endianness */ + +extern const char libmail_u_ucs2_native[]; + +/* +** Modified-UTF7 encoding used for IMAP folder names. Pass it for a charset +** parameter. +** +** This can be followed by a " " and up to 15 characters to be escaped in +** addition to unicode chars. +*/ + +#define unicode_x_imap_modutf7 "x-imap-modutf7" + +#if 0 +{ +#endif + +#ifdef	__cplusplus +} + +extern size_t unicode_wcwidth(const std::vector<unicode_char> &uc); + +namespace mail { + +	/* +	** Interface to iconv. +	** +	** Subclass converted(). 
Invoke begin(), then operator(), repeatedly, +	** then end(). +	** +	** converted() receives the converted text. +	*/ + +	class iconvert { + +		libmail_u_convert_handle_t handle; /* The libmail_u_convert handle */ + +	public: +		iconvert(); +		virtual ~iconvert(); + +		/* Start conversion. +		** Returns false if the requested conversion cannot be done. +		**/ + +		bool begin(/* Convert from */ +			   const std::string &src_chset, + +			   /* Convert to */ +			   const std::string &dst_chset); + +		/* Feed iconv(3). Returns false if the conversion was aborted. +		 */ + +		bool operator()(const char *, size_t); + +		bool operator()(const unicode_char *, size_t); /* Same, for unicode_char input */ + +		/* +		** Get the results here. If the subclass returns a non-0 +		** value, the conversion is aborted. +		*/ + +		virtual int converted(const char *, size_t); + +		/* +		** End of conversion. +		** +		** Returns true if all calls to converted() returned 0, +		** false if the conversion was aborted. +		** +		** errflag is set to true if there was a character that could +		** not be converted, and passed to converted(). 
+		*/ + +		bool end(bool &errflag) +		{ +			return end(&errflag); +		} + +		bool end() +		{ +			return end(NULL); +		} + +		/* Convert between two different charsets */ + +		static std::string convert(const std::string &text, +					   const std::string &charset, +					   const std::string &dstcharset, +					   bool &errflag); + +		/* Convert between two different charsets, discarding the error flag */ + +		static std::string convert(const std::string &text, +					   const std::string &charset, +					   const std::string &dstcharset) +		{ +			bool dummy; + +			return convert(text, charset, dstcharset, dummy); +		} + +		/* Convert from unicode to a charset */ + +		static std::string convert(const std::vector<unicode_char> &uc, +					   const std::string &dstcharset, +					   bool &errflag); + +		/* Convert from unicode to a charset, discarding the error flag */ + +		static std::string convert(const std::vector<unicode_char> &uc, +					   const std::string &dstcharset) +		{ +			bool dummy; + +			return convert(uc, dstcharset, dummy); +		} + +		/* Convert charset to unicode */ + +		static bool convert(const std::string &text, +				    const std::string &charset, +				    std::vector<unicode_char> &uc); + + +		/* Convert to upper/lower/title case, discarding the error flag */ + +		static std::string +			convert_tocase(/* Text string */ +				       const std::string &text, + +				       /* Its charset */ +				       const std::string &charset, + +				       /* First character: unicode_uc, unicode_lc, or unicode_tc */ +				       unicode_char (*first_char_func)(unicode_char), + +				       /* If not NULL, second and subsequent chars */ +				       unicode_char (*char_func)(unicode_char) +				       =NULL) +		{ +			bool dummy; + +			return convert_tocase(text, charset, dummy, +					      first_char_func, +					      char_func); +		} + +		/* Convert to upper/lower/title case */ + +		static std::string +			convert_tocase(/* Text string */ +				       const std::string &text, + +				       /* Its charset */ +				       const std::string &charset, + +				  
     /* Set if there's a conversion error */ +				       bool &err, + +				       /* First character: unicode_uc, unicode_lc, or unicode_tc */ +				       unicode_char (*first_char_func)(unicode_char), + +				       /* If not NULL, second and subsequent chars */ +				       unicode_char (*char_func)(unicode_char) +				       =NULL); +	private: +		bool end(bool *); + +	public: +		class tou; +		class fromu; +	}; + +	/* Convert output of iconvert to unicode_chars. */ + +	class iconvert::tou : public iconvert { + +	public: +		bool begin(const std::string &chset); + +		virtual int converted(const unicode_char *, size_t); + +		using iconvert::operator(); +	private: +		int converted(const char *ptr, size_t cnt); + +	public: +		template<typename iter_t> class to_iter_class; + +		template<typename input_iter_t, +			typename output_iter_t> +			static output_iter_t convert(input_iter_t from_iter, +						     input_iter_t to_iter, +						     const std::string &chset, +						     output_iter_t out_iter); + +		template<typename input_iter_t> +			static void convert(input_iter_t from_iter, +					    input_iter_t to_iter, +					    const std::string &chset, +					    std::vector<unicode_char> &out_buf) +		{ +			out_buf.clear(); /* Previous contents are discarded */ +			std::back_insert_iterator<std::vector<unicode_char> > +				insert_iter(out_buf); + +			convert(from_iter, to_iter, chset, insert_iter); +		} + +		static void convert(const std::string &str, +				    const std::string &chset, +				    std::vector<unicode_char> &out_buf); +	}; + +	/* Helper class that saves unicode output into an output iterator */ + +	template<typename iter_t> +		class iconvert::tou::to_iter_class : public iconvert::tou { + +		iter_t iter; +	public: + +	to_iter_class(iter_t iterValue) +		: iter(iterValue) {} + +		using tou::operator(); + +		operator iter_t() const { return iter; } /* Current output iterator */ + +	private: +		int converted(const unicode_char *ptr, size_t cnt) +		{ +			while (cnt) +			{ +				*iter=*ptr; + +				++iter; +				++ptr; +				--cnt; +			} +		
 	return 0; +		} +	}; +		 +	template<typename input_iter_t, +		typename output_iter_t> +		output_iter_t iconvert::tou::convert(input_iter_t from_iter, +						     input_iter_t to_iter, +						     const std::string &chset, +						     output_iter_t out_iter) +		{ +			class to_iter_class<output_iter_t> out(out_iter); + +			if (!out.begin(chset)) +				return out; + +			std::vector<char> string; + +			while (from_iter != to_iter) +			{ +				string.push_back(*from_iter++); + +				if (string.size() > 31) /* Feed the converter 32 chars at a time */ +				{ +					out(&string[0], string.size()); +					string.clear(); +				} +			} + +			if (string.size() > 0) /* Leftover partial chunk */ +				out(&string[0], string.size()); + +			out.end(); +			return out; +		} +		 +	/* Convert output of iconvert from unicode_chars. */ + +	class iconvert::fromu : public iconvert { + +	public: +		bool begin(const std::string &chset); + +		using iconvert::operator(); + +		template<typename iter_t> class to_iter_class; + +		template<typename input_iter_t, +			typename output_iter_t> +			static output_iter_t convert(input_iter_t from_iter, +						     input_iter_t to_iter, +						     const std::string &chset, +						     output_iter_t out_iter); + +		template<typename input_iter_t> +			static void convert(input_iter_t from_iter, +					    input_iter_t to_iter, +					    const std::string &chset, +					    std::string &out_buf) +		{ +			out_buf=""; /* Previous contents are discarded */ +			std::back_insert_iterator<std::string> +				insert_iter(out_buf); + +			convert(from_iter, to_iter, chset, insert_iter); +		} + +		static void convert(const std::vector<unicode_char> &ubuf, +				    const std::string &chset, +				    std::string &out_buf); + +		static std::string convert(const std::vector<unicode_char> +					   &ubuf, +					   const std::string &chset); +	}; + +	/* Helper class that saves converted char output into an output iterator */ + +	template<typename iter_t> +		class iconvert::fromu::to_iter_class : public iconvert::fromu { + +		iter_t iter; +	public: + +	to_iter_class(iter_t iterValue) +		: 
iter(iterValue) {} + +		using fromu::operator(); + +		operator iter_t() const { return iter; } + +	private: +		int converted(const char *ptr, size_t cnt) +		{ +			while (cnt) +			{ +				*iter=*ptr; + +				++iter; +				++ptr; +				--cnt; +			} +			return 0; +		} +	}; +		 +	template<typename input_iter_t, +		typename output_iter_t> +		output_iter_t iconvert::fromu::convert(input_iter_t from_iter, +						       input_iter_t to_iter, +						       const std::string &chset, +						       output_iter_t out_iter) +		{ +			class to_iter_class<output_iter_t> out(out_iter); + +			if (!out.begin(chset)) +				return out; + +			std::vector<unicode_char> string; + +			while (from_iter != to_iter) +			{ +				string.push_back(*from_iter++); + +				if (string.size() > 31) +				{ +					out(&string[0], string.size()); +					string.clear(); +				} +			} + +			if (string.size() > 0) +				out(&string[0], string.size()); + +			out.end(); +			return out; +		} + +	/* +	** Unicode linebreaking algorithm, tr14. +	*/ + +	extern "C" int linebreak_trampoline(int value, void *ptr); +	extern "C" int linebreakc_trampoline(int value, unicode_char ch, +					     void *ptr); + +	/* +	** Subclass linebreak_callback_base, implement operator()(int). +	** +	** Use operator<< or operator()(iterator, iterator) to feed +	** unicode_chars into the linebreaking algorithm. The subclass receives +	** UNICODE_LB values, as they become available. 
+	*/ + +	class linebreak_callback_base { + +		unicode_lb_info_t handle; + +		int opts; + +		linebreak_callback_base(const linebreak_callback_base &); +		/* NOT IMPLEMENTED */ + +		linebreak_callback_base &operator==(const +						    linebreak_callback_base &); +		/* NOT IMPLEMENTED */ + +	public: +		linebreak_callback_base(); +		virtual ~linebreak_callback_base(); + +		void finish(); + +		void set_opts(int opts); + +		friend int linebreak_trampoline(int, void *); + +		linebreak_callback_base &operator<<(unicode_char uc); + +		template<typename iter_type> +			linebreak_callback_base &operator()(iter_type beg_iter, +							    iter_type end_iter) +		{ +			while (beg_iter != end_iter) +				operator<<(*beg_iter++); +			return *this; +		} + +		linebreak_callback_base &operator<<(const +						    std::vector<unicode_char> +						    &vec) +		{ +			return operator()(vec.begin(), vec.end()); +		} +	private: +		virtual int operator()(int); +	}; + +	class linebreak_callback_save_buf : public linebreak_callback_base { + +	public: +		std::list<int> lb_buf; + +		linebreak_callback_save_buf(); +		~linebreak_callback_save_buf(); + +	private: +		int operator()(int value); +	}; + +	/* +	** Convert an input iterator sequence over unicode_chars into +	** an input iterator sequence over linebreak values. 
+	*/ + +	template<typename input_t> class linebreak_iter +		: public std::iterator<std::input_iterator_tag, int, void> +	{ +		mutable input_t iter_value, end_iter_value; + +		mutable linebreak_callback_save_buf *buf; + +		void fill() const +		{ +			if (buf == NULL) +				return; + +			while (buf->lb_buf.empty()) +			{ +				if (iter_value == end_iter_value) +				{ +					buf->finish(); +					if (buf->lb_buf.empty()) +					{ +						delete buf; +						buf=NULL; +					} +					break; +				} + +				buf->operator<<(*iter_value++); +			} +		} + +		mutable value_type bufvalue; + +	public: +		linebreak_iter(const input_t &iter_valueArg, +			       const input_t &iter_endvalueArg) +			: iter_value(iter_valueArg), +			end_iter_value(iter_endvalueArg), +			buf(new linebreak_callback_save_buf) +			{ +			} + +		linebreak_iter() : buf(NULL) +		{ +		} + +		void set_opts(int opts) +		{ +			if (buf) +				buf->set_opts(opts); +		} + +		~linebreak_iter() +		{ +			if (buf) +				delete buf; +		} + +		linebreak_iter(const linebreak_iter<input_t> &v) +			: buf(NULL) +		{ +			operator=(v); +		} + +		linebreak_iter<input_t> &operator=(const +						   linebreak_iter<input_t> &v) +		{ +			if (buf) +				delete buf; +			buf=v.buf; +			iter_value=v.iter_value; +			end_iter_value=v.end_iter_value; +			v.buf=NULL; +			return *this; +		} + +		bool operator==(const linebreak_iter<input_t> &v) const +		{ +			fill(); +			v.fill(); + +			return buf == NULL && v.buf == NULL; +		} + +		bool operator!=(const linebreak_iter<input_t> &v) const +		{ +			return !operator==(v); +		} + +		value_type operator*() const +		{ +			fill(); +			return buf == NULL ? 
UNICODE_LB_MANDATORY: +				buf->lb_buf.front(); +		} + +		linebreak_iter<input_t> &operator++() +		{ +			bufvalue=operator*(); + +			if (buf) +				buf->lb_buf.pop_front(); +			return *this; +		} + +		const value_type *operator++(int) +		{ +			operator++(); +			return &bufvalue; +		} +	}; + +	/* +	** Like linebreak_callback_base, except the subclass receives both +	** the linebreaking value, and the unicode character. +	*/ + +	class linebreakc_callback_base { + +		unicode_lbc_info_t handle; + +		int opts; + +		linebreakc_callback_base(const linebreakc_callback_base &); +		/* NOT IMPLEMENTED */ + +		linebreakc_callback_base &operator==(const +						     linebreakc_callback_base +						     &); +		/* NOT IMPLEMENTED */ + + +	public: +		linebreakc_callback_base(); +		virtual ~linebreakc_callback_base(); + +		void finish(); + +		void set_opts(int opts); + +		friend int linebreakc_trampoline(int, unicode_char, void *); + +		linebreakc_callback_base &operator<<(unicode_char uc); + +		template<typename iter_type> +			linebreakc_callback_base &operator()(iter_type beg_iter, +							    iter_type end_iter) +		{ +			while (beg_iter != end_iter) +				operator<<(*beg_iter++); +			return *this; +		} + +		linebreakc_callback_base &operator<<(const +						    std::vector<unicode_char> +						    &vec) +		{ +			return operator()(vec.begin(), vec.end()); +		} +	private: +		virtual int operator()(int, unicode_char); +	}; + +	class linebreakc_callback_save_buf : public linebreakc_callback_base { + +	public: +		std::list<std::pair<int, unicode_char> > lb_buf; + +		linebreakc_callback_save_buf(); +		~linebreakc_callback_save_buf(); + +	private: +		int operator()(int, unicode_char); +	}; + + +	/* +	** Convert an input iterator sequence over unicode_chars into +	** an input iterator sequence over std::pair<int, unicode_char>, +	** the original unicode character, and the linebreaking value before +	** the character. 
+	*/ + +	template<typename input_t> class linebreakc_iter +		: public std::iterator<std::input_iterator_tag, +		std::pair<int, unicode_char>, void> +	{ +		mutable input_t iter_value, end_iter_value; + +		mutable linebreakc_callback_save_buf *buf; + +		void fill() const +		{ +			if (buf == NULL) +				return; + +			while (buf->lb_buf.empty()) +			{ +				if (iter_value == end_iter_value) +				{ +					buf->finish(); +					if (buf->lb_buf.empty()) +					{ +						delete buf; +						buf=NULL; +					} +					break; +				} + +				buf->operator<<(*iter_value); +				++iter_value; +			} +		} + +		mutable value_type bufvalue; + +	public: +		linebreakc_iter(const input_t &iter_valueArg, +				const input_t &iter_endvalueArg) +			: iter_value(iter_valueArg), +			end_iter_value(iter_endvalueArg), +			buf(new linebreakc_callback_save_buf) +			{ +			} + +		linebreakc_iter() : buf(NULL) +		{ +		} + +		~linebreakc_iter() +		{ +			if (buf) +				delete buf; +		} + +		linebreakc_iter(const linebreakc_iter<input_t> &v) +			: buf(NULL) +		{ +			operator=(v); +		} + +		linebreakc_iter<input_t> &operator=(const +						   linebreakc_iter<input_t> &v) +		{ +			if (buf) +				delete buf; +			buf=v.buf; +			iter_value=v.iter_value; +			end_iter_value=v.end_iter_value; +			v.buf=NULL; +			return *this; +		} + +		bool operator==(const linebreakc_iter<input_t> &v) const +		{ +			fill(); +			v.fill(); + +			return buf == NULL && v.buf == NULL; +		} + +		bool operator!=(const linebreakc_iter<input_t> &v) const +		{ +			return !operator==(v); +		} + +		value_type operator*() const +		{ +			fill(); +			return buf == NULL ? 
+				std::make_pair(UNICODE_LB_MANDATORY, +					       (unicode_char)0): +				buf->lb_buf.front(); +		} + +		linebreakc_iter<input_t> &operator++() +		{ +			bufvalue=operator*(); + +			if (buf) +				buf->lb_buf.pop_front(); +			return *this; +		} + +		const value_type *operator++(int) +		{ +			operator++(); +			return &bufvalue; +		} +	}; + + +	/* +	** Subclass wordbreak_callback_base, implement operator()(int). +	** +	** Use operator<< or operator()(iterator, iterator) to feed +	** unicode_chars into the wordbreaking algorithm. The subclass receives +	** word flags, as they become available. +	*/ + +	extern "C" int wordbreak_trampoline(int value, void *ptr); + +	class wordbreak_callback_base { + +		unicode_wb_info_t handle; + +		wordbreak_callback_base(const wordbreak_callback_base &); +		/* NOT IMPLEMENTED */ + +		wordbreak_callback_base &operator==(const +						    wordbreak_callback_base &); +		/* NOT IMPLEMENTED */ + +	public: +		wordbreak_callback_base(); +		virtual ~wordbreak_callback_base(); + +		void finish(); + +		friend int wordbreak_trampoline(int, void *); + +		wordbreak_callback_base &operator<<(unicode_char uc); + +		template<typename iter_type> +			wordbreak_callback_base &operator()(iter_type beg_iter, +							    iter_type end_iter) +		{ +			while (beg_iter != end_iter) +				operator<<(*beg_iter++); +			return *this; +		} + +		wordbreak_callback_base &operator<<(const +						    std::vector<unicode_char> +						    &vec) +		{ +			return operator()(vec.begin(), vec.end()); +		} +	private: +		virtual int operator()(bool); +	}; + +	/* +	** A C++ wrapper for unicode_wbscan. +	*/ + +	class wordbreakscan { + +		unicode_wbscan_info_t handle; + +		wordbreakscan(const wordbreakscan &); +		/* NOT IMPLEMENTED */ + +		wordbreakscan &operator==(const wordbreakscan &); +		/* NOT IMPLEMENTED */ +	public: + +		wordbreakscan(); +		~wordbreakscan(); + +		bool operator<<(unicode_char uc); + +		size_t finish(); +	}; +		 +} +#endif + +#endif | 
