diff options
| author | Sam Varshavchik | 2021-02-24 07:48:49 -0500 | 
|---|---|---|
| committer | Sam Varshavchik | 2021-02-24 07:48:49 -0500 | 
| commit | 7f29205e16403e5c86613ca6a80366969b6c6e7f (patch) | |
| tree | 3ba27e549001a3c10a4f9b0a70ad15fd21747623 | |
| parent | 6e8ce4696bf8c05272a01dc55081fcc186e9e6ac (diff) | |
| download | courier-libs-7f29205e16403e5c86613ca6a80366969b6c6e7f.tar.bz2 | |
More unicode functions.
| -rw-r--r-- | unicode/ChangeLog | 5 | ||||
| -rw-r--r-- | unicode/Makefile.am | 2 | ||||
| -rw-r--r-- | unicode/biditest2.C | 32 | ||||
| -rw-r--r-- | unicode/book.xml | 91 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 14 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 43 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 23 | 
7 files changed, 201 insertions, 9 deletions
| diff --git a/unicode/ChangeLog b/unicode/ChangeLog index fcb1c10..35cffe6 100644 --- a/unicode/ChangeLog +++ b/unicode/ChangeLog @@ -1,3 +1,8 @@ +2021-02-24  Sam Varshavchik  <mrsam@courier-mta.com> + +	* Implement unicode_bidi_needs_embed(), unicode_bidi_cleaned_size(), +	unicode::bidi_override, +  2.2.1  2021-02-14  Sam Varshavchik  <mrsam@courier-mta.com> diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 5877d22..dc502b3 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -97,6 +97,7 @@ man_MANS= \          $(srcdir)/man/unicode[\:][\:]bidi_embed_paragraph_level.3 \          $(srcdir)/man/unicode[\:][\:]bidi_get_direction.3 \          $(srcdir)/man/unicode[\:][\:]bidi_logical_order.3 \ +        $(srcdir)/man/unicode[\:][\:]bidi_needs_embed.3 \          $(srcdir)/man/unicode[\:][\:]bidi_override.3 \          $(srcdir)/man/unicode[\:][\:]bidi_reorder.3 \          $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert.3 \ @@ -127,6 +128,7 @@ man_MANS= \          $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \          $(srcdir)/man/unicode_bidi_logical_order.3 \          $(srcdir)/man/unicode_bidi_mirror.3 \ +        $(srcdir)/man/unicode_bidi_needs_embed.3 \          $(srcdir)/man/unicode_bidi_reorder.3 \          $(srcdir)/man/unicode_bidi_setbnl.3 \          $(srcdir)/man/unicode_bidi_type.3 \ diff --git a/unicode/biditest2.C b/unicode/biditest2.C index 6ab347b..a14b3ea 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -597,29 +597,47 @@ void null_character_test()  void direction_test()  {  	static const struct { -		const char32_t *str; +		std::u32string str;  		unicode_bidi_level_t direction;  		int is_explicit; +		bool needs_embed;  	} tests[]={  		{  			U"Hello",  			UNICODE_BIDI_LR,  			1, +			true,  		},  		{  			U" ",  			UNICODE_BIDI_LR,  			0, +			true,  		},  		{  			U"",  			UNICODE_BIDI_LR,  			0, +			true,  		},  		{  			U"שלום",  			UNICODE_BIDI_RL,  			1, +			true, +		}, +		{ +			U"Helloש", +			UNICODE_BIDI_LR, 
+			1, +			true, +		}, +		{ +			U"Hello" + std::u32string{unicode::literals::LRO} +			+ U"ש", +			UNICODE_BIDI_LR, +			1, +			false,  		},  	}; @@ -633,6 +651,18 @@ void direction_test()  			std::cerr << "direction_test failed\n";  			exit(1);  		} + +		std::u32string s=t.str; +		auto levels=std::get<0>(unicode::bidi_calc(s, t.direction)); +		unicode::bidi_reorder(s, levels); +		unicode::bidi_cleanup(s, levels); + +		if (unicode::bidi_needs_embed(s, levels, &t.direction) +		    != t.needs_embed) +		{ +			std::cerr << "needs embed failed\n"; +			exit(1); +		}  	}  } diff --git a/unicode/book.xml b/unicode/book.xml index 0b45433..4f0fd71 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -336,6 +336,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	  <refname>unicode_bidi_cleanup</refname>  	  <refname>unicode_bidi_cleaned_size</refname>  	  <refname>unicode_bidi_logical_order</refname> +	  <refname>unicode_bidi_needs_embed</refname>  	  <refname>unicode_bidi_embed</refname>  	  <refname>unicode_bidi_embed_paragraph_level</refname> @@ -403,7 +404,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	    </funcprototype>  	    <funcprototype> -	      <funcdef>size_t <function>unicode_bidi_logical_order</function></funcdef> +	      <funcdef>void <function>unicode_bidi_logical_order</function></funcdef>                <paramdef>char32_t *<parameter>string</parameter></paramdef>                <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>                <paramdef>size_t <parameter>n</parameter></paramdef> @@ -413,6 +414,14 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	    </funcprototype>  	    <funcprototype> +	      <funcdef>int <function>unicode_bidi_needs_embed</function></funcdef> +              <paramdef>const char32_t *<parameter>string</parameter></paramdef> +              <paramdef>const unicode_bidi_level_t 
*<parameter>levels</parameter></paramdef> +              <paramdef>size_t <parameter>n</parameter></paramdef> +              <paramdef>const unicode_bidi_level_t <parameter>*paragraph_embedding</parameter></paramdef> +	    </funcprototype> + +	    <funcprototype>  	      <funcdef>size_t <function>unicode_bidi_embed</function></funcdef>                <paramdef>const char32_t *<parameter>string</parameter></paramdef>                <paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef> @@ -871,7 +880,8 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	      with the <literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal>  	      are in  	      <quote>canonical rendering order</quote>. -	      <function>unicode_bidi_logical_order</function>() and +	      <function>unicode_bidi_logical_order</function>(), +	      <function>unicode_bidi_needs_embed</function>() and  	      <function>unicode_bidi_embed</function>() require the  	      canonical rendering order for their string and embedding level  	      values. @@ -886,8 +896,9 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	  <refsect2 id="unicode_bidi_embed">  	    <title>Embedding bi-directional markers in Unicode text strings</title>              <para> -	      <function>unicode_bidi_logical_order</function>() and -	      <function>unicode_bidi_embed</function>() add various +	      <function>unicode_bidi_logical_order</function>() rearranges +	      the string from rendering to its logical order. +	      <function>unicode_bidi_embed</function>() adds various  	      bi-directional markers to a Unicode string in canonical rendering  	      order. 
The resulting string is not guaranteed to be  	      identical to the @@ -901,12 +912,18 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	      <function>unicode_bidi_cleanup()</function>  	      (with the canonical option),  	      with the same paragraph_embedding level. +	      <function>unicode_bidi_needs_embed</function>() attempts to +	      heuristically determine whether +	      <function>unicode_bidi_embed</function>() is required.              </para>  	    <para>  	      <function>unicode_bidi_logical_order</function>() gets called  	      first, followed by -	      <function>unicode_bidi_embed</function>(). +	      <function>unicode_bidi_embed</function>() +	      (or +	      <function>unicode_bidi_needs_embed</function>() in order to +	      determine whether bi-directional markers are required).  	      Finally, <function>unicode_bidi_embed_paragraph_level</function>()  	      optionally determines whether the resulting string's default  	      paragraph embedding level matches the one used for the actual @@ -963,12 +980,12 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti              <itemizedlist>  	      <listitem>  		<para> -		  The Unicode string, and … +		  The Unicode string.                  </para>                </listitem>  	      <listitem>  		<para> -		  … the directional embedding buffer, in canonical +		  The directional embedding buffer, in canonical  		  rendering order.                  </para>                </listitem> @@ -1080,6 +1097,53 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti                  </para>                </listitem>  	    </itemizedlist> + +	    <para> +	      <function>unicode_bidi_needs_embed</function>() attempts to +	      heuristically determine whether the Unicode string, in logical +	      order, requires bi-directional markers. 
+	      The parameters to +	      <function>unicode_bidi_needs_embed</function>() are: +	    </para> +	    <itemizedlist> +	      <listitem> +		<para> +		  The Unicode string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The directional embedding buffer, in logical +		  order. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The size of the string and the embedding level buffer. +                </para> +              </listitem> +	      <listitem> +		<para> +		  A pointer to an explicit paragraph embedding level, either +		  <literal>UNICODE_BIDI_LR</literal> or +		  <literal>UNICODE_BIDI_RL</literal>; or a +		  <literal>NULL</literal> pointer (see +		  <function>unicode_bidi_calc_types</function>()'s +		  explanation for this parameter). +                </para> +              </listitem> +	    </itemizedlist> + +	    <para> +	      <function>unicode_bidi_needs_embed</function>() returns 0 +	      if the Unicode string does not need explicit directional +	      markers, or 1 if it does. This is done by using +	      <function>unicode_bidi_calc()</function>, +	      <function>unicode_bidi_reorder()</function>, +	      <function>unicode_bidi_logical_order()</function> and then +	      checking if the end result is different from what was passed +	      in. 
+	    </para>            </refsect2>  	  <refsect2 id="unicode_bidi_misc">  	    <title>Miscellaneous utility functions</title> @@ -2919,6 +2983,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti  	  <refname>unicode::bidi_reorder</refname>  	  <refname>unicode::bidi_cleanup</refname>  	  <refname>unicode::bidi_logical_order</refname> +	  <refname>unicode::bidi_needs_embed</refname>  	  <refname>unicode::bidi_embed</refname>  	  <refname>unicode::bidi_embed_paragraph_level</refname>  	  <refname>unicode::bidi_get_direction</refname> @@ -3026,6 +3091,15 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti              </funcprototype>  	    <funcprototype> +              <funcdef>bool <function>unicode::bidi_needs_embed</function></funcdef> +	      <paramdef>const std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> +	      <paramdef>const unicode_bidi_level_t *<parameter>paragraph_embedding</parameter>=NULL</paramdef> +	      <paramdef>size_t <parameter>starting_pos</parameter>=0</paramdef> +	      <paramdef>size_t <parameter>n</parameter>=(size_t)-1</paramdef> +            </funcprototype> + +	    <funcprototype>                <funcdef>int <function>unicode::bidi_embed</function></funcdef>  	      <paramdef>const std::u32string &<parameter>string</parameter></paramdef>  	      <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> @@ -3196,7 +3270,8 @@ auto [levels, direction]=unicode::bidi_calc(types);  	      <para>  		<function>unicode::bidi_reorder</function>,  		<function>unicode::bidi_cleanup</function>, -		<function>unicode::bidi_logical_order</function> and +		<function>unicode::bidi_logical_order</function>, +		<function>unicode::bidi_needs_embed</function> and  		<function>unicode::bidi_get_direction</function>  		take two optional  		parameters 
(defaulted values or overloaded) specifying diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index a1a502c..2999ee3 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -717,6 +717,12 @@ extern void unicode_bidi_logical_order(char32_t *string,  								void *),  				       void *arg); +extern int unicode_bidi_needs_embed(const char32_t *string, +				    const unicode_bidi_level_t *levels, +				    size_t n, +				    const unicode_bidi_level_t * +				    paragraph_embedding); +  extern void unicode_bidi_embed(const char32_t *string,  			       const unicode_bidi_level_t *levels,  			       size_t n, @@ -2328,6 +2334,14 @@ void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,  			size_t starting_pos=0,  			size_t n=(size_t)-1); +//! Whether directional and isolation markers are needed. + +bool bidi_needs_embed(const std::u32string &string, +		      const std::vector<unicode_bidi_level_t> &levels, +		      const unicode_bidi_level_t *paragraph_embedding=0, +		      size_t starting_pos=0, +		      size_t n=(size_t)-1); +  //! Embed directional and isolation markers  //! Non-0 return value indicates the string and levels' sizes do not match. 
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 1aa4a88..772f9fe 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -2310,6 +2310,49 @@ static void emit_marker(struct bidi_embed_levelrun *p,  	}  } +int unicode_bidi_needs_embed(const char32_t *string, +			     const unicode_bidi_level_t *levels, +			     size_t n, +			     const unicode_bidi_level_t *paragraph_level) +{ +	char32_t *string_cpy=(char32_t *)malloc(n * sizeof(char32_t)); +	unicode_bidi_level_t *levels_cpy=(unicode_bidi_level_t *) +		malloc(n * sizeof(unicode_bidi_level_t)); +	size_t nn; +	int ret; + +	if (!string_cpy || !levels_cpy) +		abort(); + +	memcpy(string_cpy, string, n * sizeof(char32_t)); + +	struct unicode_bidi_direction direction= +		unicode_bidi_calc(string_cpy, n, +				  levels_cpy, paragraph_level); + +	unicode_bidi_reorder(string_cpy, levels_cpy, n, NULL, NULL); +	nn=unicode_bidi_cleanup(string_cpy, levels_cpy, n, 0, +				NULL, NULL); + +	ret=0; +	if (n == nn && (paragraph_level == NULL || +			direction.direction == *paragraph_level)) +	{ +		unicode_bidi_logical_order(string_cpy, levels_cpy, nn, +					   direction.direction, +					   NULL, NULL); +		if (memcmp(string_cpy, string, n * sizeof(char32_t)) == 0 && +		    memcmp(levels_cpy, levels, n * sizeof(unicode_bidi_level_t)) +		    == 0) +		{ +			ret=1; +		} +	} +	free(string_cpy); +	free(levels_cpy); +	return ret; +} +  void unicode_bidi_embed(const char32_t *string,  			const unicode_bidi_level_t *levels,  			size_t n, diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index 04d9879..7bb6edc 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -950,6 +950,29 @@ unicode_bidi_direction unicode::bidi_get_direction(const std::u32string &string,  	return unicode_bidi_get_direction(string.c_str()+starting_pos, n);  } +bool unicode::bidi_needs_embed(const std::u32string &string, +			       const std::vector<unicode_bidi_level_t> &levels, +			       const unicode_bidi_level_t 
*paragraph_embedding, +			       size_t starting_pos, +			       size_t n) +{ +	if (string.size() != levels.size()) +		return false; + +	auto s=levels.size(); + +	if (starting_pos >= s) +		return false; + +	if (n > s-starting_pos) +		n=s-starting_pos; + +	return unicode_bidi_needs_embed(string.c_str(), +					n == 0 ? NULL : &levels[starting_pos], +					n, +					paragraph_embedding) != 0; +} +  std::u32string unicode::bidi_override(const std::u32string &s,  				      unicode_bidi_level_t direction,  				      int cleanup_options) | 
