diff options
| -rw-r--r-- | imap/ChangeLog | 4 | ||||
| -rw-r--r-- | imap/configure.ac | 2 | ||||
| -rw-r--r-- | imap/fetch.c | 50 | ||||
| -rw-r--r-- | imap/imapd.c | 2 | ||||
| -rw-r--r-- | imap/imapscanclient.c | 1 | ||||
| -rw-r--r-- | imap/imapscanclient.h | 1 | ||||
| -rw-r--r-- | rfc2045/testrfc3676parsersuite.txt | 2 | ||||
| -rw-r--r-- | unicode/Makefile.am | 5 | ||||
| -rw-r--r-- | unicode/biditest2.C | 13 | ||||
| -rw-r--r-- | unicode/book.xml | 179 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 93 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 38 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 47 | 
13 files changed, 239 insertions, 198 deletions
| diff --git a/imap/ChangeLog b/imap/ChangeLog index d011c22..abb298a 100644 --- a/imap/ChangeLog +++ b/imap/ChangeLog @@ -1,3 +1,7 @@ +2020-11-30  Sam Varshavchik  <mrsam@courier-mta.com> + +	* imap: send corrupted Unicode alerts only for new messages. +  2020-11-04  Sam Varshavchik  <mrsam@courier-mta.com>  	* spec file: add BuildRequires: %{__make} (will be required in F34). diff --git a/imap/configure.ac b/imap/configure.ac index e538983..a85bb04 100644 --- a/imap/configure.ac +++ b/imap/configure.ac @@ -4,7 +4,7 @@ dnl  dnl Copyright 1998 - 2019 Double Precision, Inc.  See COPYING for  dnl distribution information. -AC_INIT(courier-imap, 5.0.11, [courier-users@lists.sourceforge.net]) +AC_INIT(courier-imap, 5.0.11.20201130, [courier-users@lists.sourceforge.net])  >confdefs.h  # Kill PACKAGE_ macros diff --git a/imap/fetch.c b/imap/fetch.c index 5daf150..257b295 100644 --- a/imap/fetch.c +++ b/imap/fetch.c @@ -61,8 +61,8 @@ extern void get_message_flags(struct imapscanmessageinfo *,  extern void append_flags(char *, struct imapflags *);  static int fetchitem(FILE **, int *, struct fetchinfo *, -	struct imapscaninfo *,  unsigned long, -	struct rfc2045 **); +		     struct imapscaninfo *,  unsigned long, +		     struct rfc2045 **, int *);  static void bodystructure(FILE *, struct fetchinfo *,  	struct imapscaninfo *,  unsigned long, @@ -250,6 +250,7 @@ int do_fetch(unsigned long n, int byuid, void *p)  	int	seen;  	int	open_err;  	int	unicode_err=0; +	int	report_unicode_err=0;  	fp=NULL;  	open_err=0; @@ -278,15 +279,11 @@ int do_fetch(unsigned long n, int byuid, void *p)  	while (fi)  	{  		int rc=fetchitem(&fp, &open_err, fi, &current_maildir_info, n-1, -				 &rfc2045p); +				 &rfc2045p, &unicode_err);  		if (rc > 0)  			seen=1; -		if (rc < 0) -		{ -			rc=0; -			unicode_err=1; -		} +  		if ((fi=fi->next) != 0)	writes(" ");  	}  	writes(")\r\n"); @@ -299,22 +296,6 @@ int do_fetch(unsigned long n, int byuid, void *p)  		return (0);  	} -	if 
(current_maildir_info.msgs[n-1].err8bitflag) -		unicode_err=0; - -	if (unicode_err) -	{ -		current_maildir_info.msgs[n-1].err8bitflag=1; - -		writes("* OK [ALERT] Message "); -		writen(n); -		writes(" appears to be a Unicode message and your" -		       " E-mail reader did not enable Unicode support." -		       " Please use an E-mail reader that supports" -		       " IMAP with UTF-8 (see" -		       " https://tools.ietf.org/html/rfc6855.html)\r\n"); -	} -  #if SMAP  	if (!smapflag)  #endif @@ -334,17 +315,31 @@ int do_fetch(unsigned long n, int byuid, void *p)  			reflag_filename(&current_maildir_info.msgs[n-1],&flags,  				fileno(fp));  			current_maildir_info.msgs[n-1].changedflags=1; + +			report_unicode_err=unicode_err;  		}  	} +	if (report_unicode_err) +	{ +		writes("* OK [ALERT] Message "); +		writen(n); +		writes(" appears to be a Unicode message and your" +		       " E-mail reader did not enable Unicode support." +		       " Please use an E-mail reader that supports" +		       " IMAP with UTF-8 (see" +		       " https://tools.ietf.org/html/rfc6855.html)\r\n"); +	} +  	if (current_maildir_info.msgs[n-1].changedflags)  		fetchflags(n-1);  	return (0);  }  static int fetchitem(FILE **fp, int *open_err, struct fetchinfo *fi, -	struct imapscaninfo *i, unsigned long msgnum, -	struct rfc2045 **mimep) +		     struct imapscaninfo *i, unsigned long msgnum, +		     struct rfc2045 **mimep, +		     int *unicode_err)  {  	void (*fetchfunc)(FILE *, struct fetchinfo *,  			  struct imapscaninfo *, unsigned long, @@ -460,8 +455,7 @@ static int fetchitem(FILE **fp, int *open_err, struct fetchinfo *fi,  	if (mimecorrectness && !enabled_utf8 &&  	    ((*mimep)->rfcviolation & RFC2045_ERR8BITHEADER))  	{ -		/* Still return -1, in order to [ALERT] the client */ -		rc= -1; +		*unicode_err=1;  	}  	(*fetchfunc)(*fp, fi, i, msgnum, *mimep); diff --git a/imap/imapd.c b/imap/imapd.c index 095defb..c0b5908 100644 --- a/imap/imapd.c +++ b/imap/imapd.c @@ -1334,8 +1334,6 @@ void doNoop(int 
real_noop)  #endif  		new_maildir_info.msgs[j].copiedflag=  			current_maildir_info.msgs[i].copiedflag; -		new_maildir_info.msgs[j].err8bitflag= -			current_maildir_info.msgs[i].err8bitflag;  		++j;  	} diff --git a/imap/imapscanclient.c b/imap/imapscanclient.c index 515abc2..5ab7148 100644 --- a/imap/imapscanclient.c +++ b/imap/imapscanclient.c @@ -868,7 +868,6 @@ int	dowritecache=0;  		scaninfo->msgs[i].filename=tempinfo_array[i]->filename;  		scaninfo->msgs[i].keywordMsg=NULL;  		scaninfo->msgs[i].copiedflag=0; -		scaninfo->msgs[i].err8bitflag=0;  #if SMAP  		if (smapflag)  			scaninfo->msgs[i].recentflag=0; diff --git a/imap/imapscanclient.h b/imap/imapscanclient.h index 373bc83..2e516cb 100644 --- a/imap/imapscanclient.h +++ b/imap/imapscanclient.h @@ -23,7 +23,6 @@ struct imapscanmessageinfo {  	char storeflag;  /* Used by imap_addRemoveKeywords() */ -	char err8bitflag;       /* Invalid 8 bit header error was reported */  	/* When reading keywords, hash messages by filename */  	struct imapscanmessageinfo *firstBucket, *nextBucket; diff --git a/rfc2045/testrfc3676parsersuite.txt b/rfc2045/testrfc3676parsersuite.txt index f523981..93c87d5 100644 --- a/rfc2045/testrfc3676parsersuite.txt +++ b/rfc2045/testrfc3676parsersuite.txt @@ -109,5 +109,5 @@  [0: 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 ...1234567890 1234567890 1234567890 1234567890 1234567890 1234567890] -[0: 12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234...1678901234567890123456789012345678901234567890123456789012345678901234567...190123456789012345678901234567890123456789012345678901234567890] +[0: 12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234...5678901234567890123456789012345678901234567890123456789012345678901234567...890123456789012345678901234567890123456789012345678901234567890] diff --git a/unicode/Makefile.am b/unicode/Makefile.am index f864e2d..dbc71aa 
100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -89,11 +89,11 @@ include_HEADERS=courier-unicode.h \  man_MANS= \          $(srcdir)/man/courier-unicode.7 \ +        $(srcdir)/man/unicode\:\:bidi.3 \          $(srcdir)/man/unicode\:\:bidi_calc.3 \          $(srcdir)/man/unicode\:\:bidi_cleanup.3 \          $(srcdir)/man/unicode\:\:bidi_embed.3 \          $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ -        $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \          $(srcdir)/man/unicode\:\:bidi_logical_order.3 \          $(srcdir)/man/unicode\:\:bidi_reorder.3 \          $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \ @@ -118,7 +118,6 @@ man_MANS= \          $(srcdir)/man/unicode_bidi_cleanup.3 \          $(srcdir)/man/unicode_bidi_embed.3 \          $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ -        $(srcdir)/man/unicode_bidi_extra_cleanup.3 \          $(srcdir)/man/unicode_bidi_logical_order.3 \          $(srcdir)/man/unicode_bidi_mirror.3 \          $(srcdir)/man/unicode_bidi_reorder.3 \ @@ -515,4 +514,4 @@ distrelease:  	$(MAKE) dist  www: -	rsync -a html/. $$HOME/www/hostrocket/courier-mta.org/unicode +	rsync -a --delete-after html/. 
$$HOME/www/hostrocket/courier-mta.org/unicode diff --git a/unicode/biditest2.C b/unicode/biditest2.C index a9ab87d..ded76be 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -307,7 +307,9 @@ void character_test()  			exit(1);  		} -		unicode::bidi_extra_cleanup(s, levels); +		unicode::bidi_cleanup(s, levels, +				      [](size_t) {}, +				      UNICODE_BIDI_CLEANUP_CANONICAL);  		auto dump_ls=  			[&] @@ -371,8 +373,13 @@ void character_test()  			}  			unicode::bidi_reorder(new_string, std::get<0>(ret)); -			unicode::bidi_extra_cleanup(new_string, -						    std::get<0>(ret)); +			unicode::bidi_cleanup(new_string, +					      std::get<0>(ret), +					      [] +					      (size_t) +					      { +					      }, +					      UNICODE_BIDI_CLEANUP_CANONICAL);  			/* New string is now back in logical order */ diff --git a/unicode/book.xml b/unicode/book.xml index c8948ba..b0342ea 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -304,7 +304,6 @@ See COPYING for distribution information.  	  <refname>unicode_bidi_calc</refname>  	  <refname>unicode_bidi_reorder</refname>  	  <refname>unicode_bidi_cleanup</refname> -	  <refname>unicode_bidi_extra_cleanup</refname>  	  <refname>unicode_bidi_logical_order</refname>  	  <refname>unicode_bidi_embed</refname>  	  <refname>unicode_bidi_embed_paragraph_level</refname> @@ -341,15 +340,7 @@ See COPYING for distribution information.                
<paramdef>char32_t *<parameter>string</parameter></paramdef>                <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>                <paramdef>size_t <parameter>n</parameter></paramdef> -              <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> -	      <paramdef>void *<parameter>arg</parameter></paramdef> -	    </funcprototype> - -	    <funcprototype> -	      <funcdef>size_t <function>unicode_bidi_extra_cleanup</function></funcdef> -              <paramdef>char32_t *<parameter>string</parameter></paramdef> -              <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> -              <paramdef>size_t <parameter>n</parameter></paramdef> +	      <paramdef>int <parameter>options</parameter></paramdef>                <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef>  	      <paramdef>void *<parameter>arg</parameter></paramdef>  	    </funcprototype> @@ -450,8 +441,7 @@ See COPYING for distribution information.  	      </listitem>  	      <listitem>  		<para> -		  Use <function>unicode_bidi_cleanup</function>() or -		  <function>unicode_bidi_extra_cleanup</function>(), +		  Use <function>unicode_bidi_cleanup</function>()  		  to remove the characters from the string which are used  		  by the bi-directional algorithm, and are not needed for  		  rendering the text. @@ -585,28 +575,12 @@ See COPYING for distribution information.  	      <quote>rendering order</quote>, but still contain bi-directional  	      embedding, override, boundary-neutral, isolate, and marker  	      characters. -	      <function>unicode_bidi_cleanup</function>() and -	      <function>unicode_bidi_extra_cleanup</function>() remove these -	      characters and directional markers from the unicode string. 
-	      <function>unicode_bidi_cleanup</function> removes only the -	      embedding, override, and  boundry-neutral characters (as -	      specified by step X9 of the bi-directional algorithm). -	      <function>unicode_bidi_extra_cleanup</function>() -	      additionally removes the isolation markers, implicit markers; -	      and all characters -	      classified as paragraph separators get replaced by a newline. -            </para> -	    <para> -	      A non-null pointer to the directional embedding level buffer, -	      of the same size as the string, also removes the corresponding -	      values from the buffer, and the remaining values in the -	      embedding level buffer get reset to -	      levels <literal>UNICODE_BIDI_LR</literal> and -	      <literal> UNICODE_BIDI_RL</literal>, only. -            </para> +	      <function>unicode_bidi_cleanup</function> +	      removes these characters and directional markers. +	    </para>  	    <para> -	      The parameters to <function>unicode_bidi_cleanup</function>() and -	      <function>unicode_bidi_extra_cleanup</function>() are: +	      The parameters to <function>unicode_bidi_cleanup</function>() +	      are:              </para>  	    <itemizedlist> @@ -617,15 +591,66 @@ See COPYING for distribution information.                </listitem>  	      <listitem>  		<para> -		  The pointer to the directional embedding buffer. -                </para> +		  A non-null pointer to the directional embedding level buffer, +		  of the same size as the string, also removes the corresponding +		  values from the buffer, and the remaining values in the +		  embedding level buffer get reset to +		  levels <literal>UNICODE_BIDI_LR</literal> and +		  <literal> UNICODE_BIDI_RL</literal>, only. +		</para>                </listitem> +  	      <listitem>  		<para>  		  The size of the unicode string and the directional embedding -		  buffer. +		  buffer (if not NULL).                  
</para>                </listitem> + +	      <listitem> +		<para> +		  A bitmask that selects the following options +		  (or 0 if no options): +		</para> + +		<variablelist> +		  <varlistentry> +		    <term><literal>UNICODE_BIDI_CLEANUP_EXTRA</literal></term> +		    <listitem> +		      <para> +			In addition to removing all embedding, override, and +			boundary-neutral characters as +			specified by step X9 of the bi-directional algorithm +			(the default behavior without this flag), also +			remove all isolation markers and implicit markers. +		      </para> +		    </listitem> +		  </varlistentry> + +		  <varlistentry> +		    <term><literal>UNICODE_BIDI_CLEANUP_BNL</literal></term> +		    <listitem> +		      <para> +			Replace all characters classified as paragraph +			separators with a newline character. +		      </para> +		    </listitem> +		  </varlistentry> + +		  <varlistentry> +		    <term><literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal></term> +		    <listitem> +		      <para> +			A combined set of +			<literal>UNICODE_BIDI_CLEANUP_EXTRA</literal> +			and +			<literal>UNICODE_BIDI_CLEANUP_BNL</literal>. +		      </para> +		    </listitem> +		  </varlistentry> +		</variablelist> +	      </listitem> +  	      <listitem>  		<para>  		  A pointer to a function that gets repeatedly invoked with the @@ -647,17 +672,17 @@ See COPYING for distribution information.  	      from the first to  	      the last removed character (if any).              </para> -	    <para> -	      Multiple calls to <function>unicode_bidi_cleanup</function>() or -	      <function>unicode_bidi_extra_cleanup</function>() do no harm; -	      except that <function>unicode_bidi_extra_cleanup</function>() -	      always removes all the additional characters that -	      <function>unicode_bidi_cleanup</function>() does not remove. 
-            </para> +  	    <para>  	      The character string and the embedding level values resulting -	      from <function>unicode_bidi_extra_cleanup</function>() are in +	      from <function>unicode_bidi_cleanup</function>() +	      with the <literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal> +	      are in  	      <quote>canonical rendering order</quote>. +	      <function>unicode_bidi_logical_order</function>() and +	      <function>unicode_bidi_embed</function>() require the +	      canonical rendering order for their string and embedding level +	      values.              </para>  	  </refsect2> @@ -675,7 +700,8 @@ See COPYING for distribution information.  	      canonical rendering order after applying  	      <function>unicode_bidi_calc()</function>,  	      <function>unicode_reorder()</function> and -	      <function>unicode_bidi_extra_cleanup()</function>, +	      <function>unicode_bidi_cleanup()</function> +	      (with the canonical option),  	      with the same paragraph_embedding level.              </para> @@ -2628,15 +2654,15 @@ See COPYING for distribution information.  	<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>  	<refmeta> -	  <refentrytitle>unicode::bidi::calc</refentrytitle> +	  <refentrytitle>unicode::bidi</refentrytitle>  	  <manvolnum>3</manvolnum>  	</refmeta>  	<refnamediv> +	  <refname>unicode::bidi</refname>  	  <refname>unicode::bidi_calc</refname>  	  <refname>unicode::bidi_reorder</refname>  	  <refname>unicode::bidi_cleanup</refname> -	  <refname>unicode::bidi_extra_cleanup</refname>  	  <refname>unicode::bidi_logical_order</refname>  	  <refname>unicode::bidi_embed</refname>  	  <refname>unicode::bidi_embed_paragraph_level</refname> @@ -2674,6 +2700,7 @@ See COPYING for distribution information.                
<funcdef>void <function>unicode::bidi_cleanup</function></funcdef>  	      <paramdef>std::u32string &<parameter>string</parameter></paramdef>  	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +	      <paramdef>int <parameter>cleanup_options</parameter></paramdef>              </funcprototype>  	    <funcprototype> @@ -2681,19 +2708,7 @@ See COPYING for distribution information.  	      <paramdef>std::u32string &<parameter>string</parameter></paramdef>  	      <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef>  	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> -            </funcprototype> - -	    <funcprototype> -              <funcdef>void <function>unicode::bidi_extra_cleanup</function></funcdef> -	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> -	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> -            </funcprototype> - -	    <funcprototype> -              <funcdef>int <function>unicode::bidi_extra_cleanup</function></funcdef> -	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> -	      <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> -	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +	      <paramdef>int <parameter>cleanup_options</parameter></paramdef>              </funcprototype>  	    <funcprototype> @@ -2789,7 +2804,51 @@ See COPYING for distribution information.                
</para>              </listitem>            </itemizedlist> + +	  <refsect2 id="unicode_cpp_bidi_literals"> +	    <title><literal>unicode::literals</literal> namespace</title> + +	    <blockquote> +	      <informalexample> +		<programlisting><![CDATA[ +using namespace unicode::literals; + +std::u32string foo(std::u32string bar) +{ +	return bar + LRO; +} +]]></programlisting> +	      </informalexample> +	    </blockquote> + +	    <para> +	      This namespace contains the following <literal>constexpr</literal> +	      definitions: +	    </para> + +	    <itemizedlist> +	      <listitem> +		<para> +		  <classname>char32_t</classname> arrays with literal +		  Unicode character strings containing Unicode directional, +		  isolate, and override markers, like +		  <literal>LRO</literal>, +		  <literal>RLO</literal> and others. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  <literal>CLEANUP_EXTRA</literal>, +		  <literal>CLEANUP_BNL</literal>, and +		  <literal>CLEANUP_CANONICAL</literal> options for +		  <function>unicode::bidi_cleanup</function>(). 
+		</para> +	      </listitem> +	    </itemizedlist> + +	  </refsect2>  	</refsect1> +  	<refsect1 id="unicode_cpp_bidi_seealso">  	  <title>SEE ALSO</title>  	  <para> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index cc9dbbb..3de76d3 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -548,6 +548,24 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i);  #define UNICODE_LRO	0x202d /* Left-to-right override */  #define UNICODE_PDF	0x202c /* Pop directional override */ +#ifdef __cplusplus +#if __cplusplus >= 201103L +namespace unicode { +	namespace literals { + +		constexpr char32_t LRM[]={UNICODE_LRM, 0}; +		constexpr char32_t RLM[]={UNICODE_RLM, 0}; +		constexpr char32_t ALM[]={UNICODE_ALM, 0}; +		constexpr char32_t LRI[]={UNICODE_LRI, 0}; +		constexpr char32_t RLI[]={UNICODE_RLI, 0}; +		constexpr char32_t PDI[]={UNICODE_PDI, 0}; +		constexpr char32_t RLO[]={UNICODE_RLO, 0}; +		constexpr char32_t LRO[]={UNICODE_LRO, 0}; +		constexpr char32_t PDF[]={UNICODE_PDF, 0}; +	} +} +#endif +#endif  typedef char unicode_bidi_bracket_type_t; @@ -608,19 +626,50 @@ typedef enum {  extern enum_bidi_type_t unicode_bidi_type(char32_t c); +/* Bitmask options to unicode_bidi_cleanup */ + +/* + In addition to removing embedding, override, and boundary-neutral + characters also remove isolation markers and implicit markers. +*/ + +#define UNICODE_BIDI_CLEANUP_EXTRA	1 + +/* +  Replace all characters classified as paragraph separators by a newline +  character. +*/ + +#define UNICODE_BIDI_CLEANUP_BNL	2 + +/* +  Options for canonical rendering order. 
+*/ + +#define UNICODE_BIDI_CLEANUP_CANONICAL				\ +	(UNICODE_BIDI_CLEANUP_EXTRA | UNICODE_BIDI_CLEANUP_BNL) + +#ifdef __cplusplus +#if __cplusplus >= 201103L +namespace unicode { +	namespace literals { +		constexpr int CLEANUP_EXTRA=UNICODE_BIDI_CLEANUP_EXTRA; + +		constexpr int CLEANUP_BNL=UNICODE_BIDI_CLEANUP_BNL; + +		constexpr int CLEANUP_CANONICAL=UNICODE_BIDI_CLEANUP_CANONICAL; +	} +} +#endif +#endif +  extern size_t unicode_bidi_cleanup(char32_t *string,  				   unicode_bidi_level_t *levels,  				   size_t n, +				   int options,  				   void (*removed_callback)(size_t, void *),  				   void *); -extern size_t unicode_bidi_extra_cleanup(char32_t *string, -					 unicode_bidi_level_t *levels, -					 size_t n, -					 void (*removed_callback)(size_t, -								  void *), -					 void *); -  extern void unicode_bidi_logical_order(char32_t *string,  				       unicode_bidi_level_t *levels,  				       size_t n, @@ -2147,7 +2196,8 @@ void bidi_reorder(std::vector<unicode_bidi_level_t> &levels,  void bidi_cleanup(std::u32string &string,  		  const std::function<void (size_t)> &removed_callback= -		  [](size_t) {}); +		  [](size_t) {}, +		  int cleanup_options=0);  //! Also remove them from the embedding direction level buffer. @@ -2156,28 +2206,8 @@ void bidi_cleanup(std::u32string &string,  int bidi_cleanup(std::u32string &string,  		 std::vector<unicode_bidi_level_t> &levels,  		 const std::function<void (size_t)> &removed_callback= -		  [](size_t) {}); - - -//! Remove directional markers and isolation markers. - -//! Removes them from the string, in place. Optional lambda gets notified -//! of the index (in the original string, of each removed marker. - -void bidi_extra_cleanup(std::u32string &string, -			const std::function<void (size_t)> -			&removed_callback= -			[](size_t) {}); - -//! Also remove them from the embedding direction level buffer. - -//! Returns non-0 in case of non-matching level buffer size. 
- -int bidi_extra_cleanup(std::u32string &string, -		       std::vector<unicode_bidi_level_t> &levels, -		       const std::function<void (size_t)> -		       &removed_callback= -		       [](size_t) {}); +		 [](size_t) {}, +		 int cleanup_options=0);  //! Convert Unicode string from canonical rendering order to logical order.  int bidi_logical_order(std::u32string &string, @@ -2189,8 +2219,7 @@ int bidi_logical_order(std::u32string &string,  //! Convert Unicode string from canonical rendering order to logical order.  void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,  			unicode_bidi_level_t paragraph_embedding, -			const std::function<void (size_t, size_t)> -			&lambda); +			const std::function<void (size_t, size_t)> &lambda);  //! Embed directional and isolation markers diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 79c4db5..cfae12f 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -2032,6 +2032,7 @@ void unicode_bidi_reorder(char32_t *p,  size_t unicode_bidi_cleanup(char32_t *string,  			    unicode_bidi_level_t *levels,  			    size_t n, +			    int cleanup_options,  			    void (*removed_callback)(size_t, void *),  			    void *arg)  { @@ -2040,7 +2041,13 @@ size_t unicode_bidi_cleanup(char32_t *string,  	{  		enum_bidi_type_t cl=unicode_bidi_type(string[j]); -		if (IS_X9(cl)) +		if (cleanup_options & UNICODE_BIDI_CLEANUP_EXTRA +		    ? 
( +		       is_explicit_indicator_except_b(cl) || +		       (string[j] == UNICODE_LRM || +			string[j] == UNICODE_RLM || +			string[j] == UNICODE_ALM)) +		    : IS_X9(cl))  		{  			if (removed_callback)  				(*removed_callback)(j, arg); @@ -2048,34 +2055,9 @@ size_t unicode_bidi_cleanup(char32_t *string,  		}  		if (levels)  			levels[i]=levels[j] & 1; -		++i; -	} -	return i; -} - -size_t unicode_bidi_extra_cleanup(char32_t *string, -				  unicode_bidi_level_t *levels, -				  size_t n, -				  void (*removed_callback)(size_t, void *), -				  void *arg) -{ -	size_t i=0; -	for (size_t j=0; j<n; ++j) -	{ -		enum_bidi_type_t cl=unicode_bidi_type(string[j]); -		if (is_explicit_indicator_except_b(cl) || -		    (string[j] == UNICODE_LRM || -		     string[j] == UNICODE_RLM || -		     string[j] == UNICODE_ALM)) -		{ -			if (removed_callback) -				(*removed_callback)(j, arg); -			continue; -		} -		string[i]=cl == UNICODE_BIDI_TYPE_B ? '\n' : string[j]; -		if (levels) -			levels[i]=levels[j] & 1; +		string[i]=(cleanup_options & UNICODE_BIDI_CLEANUP_BNL) +			&& cl == UNICODE_BIDI_TYPE_B ? 
'\n' : string[j];  		++i;  	}  	return i; diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index 4217630..a0d5ac4 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -691,7 +691,8 @@ extern "C" {  }  void unicode::bidi_cleanup(std::u32string &string, -			   const std::function<void (size_t)> &lambda) +			   const std::function<void (size_t)> &lambda, +			   int cleanup_options)  {  	if (string.empty())  		return; @@ -701,6 +702,7 @@ void unicode::bidi_cleanup(std::u32string &string,  	size_t n=unicode_bidi_cleanup(&string[0],  				      0,  				      string.size(), +				      cleanup_options,  				      removed_callback,  				      reinterpret_cast<void *>(&cb));  	cb.rethrow(); @@ -709,15 +711,20 @@ void unicode::bidi_cleanup(std::u32string &string,  int unicode::bidi_cleanup(std::u32string &string,  			  std::vector<unicode_bidi_level_t> &levels, -			  const std::function<void (size_t)> &lambda) +			  const std::function<void (size_t)> &lambda, +			  int cleanup_options)  {  	if (levels.size() != string.size())  		return -1; +	if (levels.size() == 0) +		return 0; +  	cb_wrapper<void (size_t)> cb{lambda};  	size_t n=unicode_bidi_cleanup(&string[0],  				      &levels[0],  				      string.size(), +				      cleanup_options,  				      removed_callback,  				      reinterpret_cast<void *>(&cb));  	cb.rethrow(); @@ -727,42 +734,6 @@ int unicode::bidi_cleanup(std::u32string &string,  	return 0;  } - -void unicode::bidi_extra_cleanup(std::u32string &string, -				 const std::function<void (size_t)> &lambda) -{ -	if (string.empty()) -		return; - -	cb_wrapper<void (size_t)> cb{lambda}; -	size_t n=unicode_bidi_extra_cleanup(&string[0], -					    0, -					    string.size(), -					    removed_callback, -					    reinterpret_cast<void *>(&cb)); -	cb.rethrow(); -	string.resize(n); -} - -int unicode::bidi_extra_cleanup(std::u32string &string, -				std::vector<unicode_bidi_level_t> &levels, -				const std::function<void (size_t)> &lambda) -{ -	if 
(levels.size() != string.size()) -		return -1; - -	cb_wrapper<void (size_t)> cb{lambda}; -	size_t n=unicode_bidi_extra_cleanup(&string[0], -					    &levels[0], -					    string.size(), -					    removed_callback, -					    reinterpret_cast<void *>(&cb)); -	cb.rethrow(); -	string.resize(n); -	levels.resize(n); -	return 0; -} -  int unicode::bidi_logical_order(std::u32string &string,  				std::vector<unicode_bidi_level_t> &levels,  				unicode_bidi_level_t paragraph_embedding, | 
