diff options
| author | Sam Varshavchik | 2021-02-24 07:48:49 -0500 |
|---|---|---|
| committer | Sam Varshavchik | 2021-02-24 07:48:49 -0500 |
| commit | 7f29205e16403e5c86613ca6a80366969b6c6e7f (patch) | |
| tree | 3ba27e549001a3c10a4f9b0a70ad15fd21747623 /unicode | |
| parent | 6e8ce4696bf8c05272a01dc55081fcc186e9e6ac (diff) | |
| download | courier-libs-7f29205e16403e5c86613ca6a80366969b6c6e7f.tar.bz2 | |
More unicode functions.
Diffstat (limited to 'unicode')
| -rw-r--r-- | unicode/ChangeLog | 5 | ||||
| -rw-r--r-- | unicode/Makefile.am | 2 | ||||
| -rw-r--r-- | unicode/biditest2.C | 32 | ||||
| -rw-r--r-- | unicode/book.xml | 91 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 14 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 43 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 23 |
7 files changed, 201 insertions, 9 deletions
diff --git a/unicode/ChangeLog b/unicode/ChangeLog index fcb1c10..35cffe6 100644 --- a/unicode/ChangeLog +++ b/unicode/ChangeLog @@ -1,3 +1,8 @@ +2021-02-24 Sam Varshavchik <mrsam@courier-mta.com> + + * Implement unicode_bidi_needs_embed(), unicode_bidi_cleaned_size(), + unicode::bidi_override, + 2.2.1 2021-02-14 Sam Varshavchik <mrsam@courier-mta.com> diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 5877d22..dc502b3 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -97,6 +97,7 @@ man_MANS= \ $(srcdir)/man/unicode[\:][\:]bidi_embed_paragraph_level.3 \ $(srcdir)/man/unicode[\:][\:]bidi_get_direction.3 \ $(srcdir)/man/unicode[\:][\:]bidi_logical_order.3 \ + $(srcdir)/man/unicode[\:][\:]bidi_needs_embed.3 \ $(srcdir)/man/unicode[\:][\:]bidi_override.3 \ $(srcdir)/man/unicode[\:][\:]bidi_reorder.3 \ $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert.3 \ @@ -127,6 +128,7 @@ man_MANS= \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ $(srcdir)/man/unicode_bidi_logical_order.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ + $(srcdir)/man/unicode_bidi_needs_embed.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ $(srcdir)/man/unicode_bidi_setbnl.3 \ $(srcdir)/man/unicode_bidi_type.3 \ diff --git a/unicode/biditest2.C b/unicode/biditest2.C index 6ab347b..a14b3ea 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -597,29 +597,47 @@ void null_character_test() void direction_test() { static const struct { - const char32_t *str; + std::u32string str; unicode_bidi_level_t direction; int is_explicit; + bool needs_embed; } tests[]={ { U"Hello", UNICODE_BIDI_LR, 1, + true, }, { U" ", UNICODE_BIDI_LR, 0, + true, }, { U"", UNICODE_BIDI_LR, 0, + true, }, { U"שלום", UNICODE_BIDI_RL, 1, + true, + }, + { + U"Helloש", + UNICODE_BIDI_LR, + 1, + true, + }, + { + U"Hello" + std::u32string{unicode::literals::LRO} + + U"ש", + UNICODE_BIDI_LR, + 1, + false, }, }; @@ -633,6 +651,18 @@ void direction_test() std::cerr << "direction_test failed\n"; exit(1); } + + 
std::u32string s=t.str; + auto levels=std::get<0>(unicode::bidi_calc(s, t.direction)); + unicode::bidi_reorder(s, levels); + unicode::bidi_cleanup(s, levels); + + if (unicode::bidi_needs_embed(s, levels, &t.direction) + != t.needs_embed) + { + std::cerr << "needs embed failed\n"; + exit(1); + } } } diff --git a/unicode/book.xml b/unicode/book.xml index 0b45433..4f0fd71 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -336,6 +336,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti <refname>unicode_bidi_cleanup</refname> <refname>unicode_bidi_cleaned_size</refname> <refname>unicode_bidi_logical_order</refname> + <refname>unicode_bidi_needs_embed</refname> <refname>unicode_bidi_embed</refname> <refname>unicode_bidi_embed_paragraph_level</refname> @@ -403,7 +404,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti </funcprototype> <funcprototype> - <funcdef>size_t <function>unicode_bidi_logical_order</function></funcdef> + <funcdef>void <function>unicode_bidi_logical_order</function></funcdef> <paramdef>char32_t *<parameter>string</parameter></paramdef> <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> <paramdef>size_t <parameter>n</parameter></paramdef> @@ -413,6 +414,14 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti </funcprototype> <funcprototype> + <funcdef>int <function>unicode_bidi_needs_embed</function></funcdef> + <paramdef>const char32_t *<parameter>string</parameter></paramdef> + <paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>const unicode_bidi_level_t <parameter>*paragraph_embedding</parameter></paramdef> + </funcprototype> + + <funcprototype> <funcdef>size_t <function>unicode_bidi_embed</function></funcdef> <paramdef>const char32_t *<parameter>string</parameter></paramdef> <paramdef>const unicode_bidi_level_t 
*<parameter>levels</parameter></paramdef> @@ -871,7 +880,8 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti with the <literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal> are in <quote>canonical rendering order</quote>. - <function>unicode_bidi_logical_order</function>() and + <function>unicode_bidi_logical_order</function>(), + <function>unicode_bidi_needs_embed</function>() and <function>unicode_bidi_embed</function>() require the canonical rendering order for their string and embedding level values. @@ -886,8 +896,9 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti <refsect2 id="unicode_bidi_embed"> <title>Embedding bi-directional markers in Unicode text strings</title> <para> - <function>unicode_bidi_logical_order</function>() and - <function>unicode_bidi_embed</function>() add various + <function>unicode_bidi_logical_order</function>() rearranges + the string from rendering to its logical order. + <function>unicode_bidi_embed</function>() adds various bi-directional markers to a Unicode string in canonical rendering order. The resulting string is not guaranteed to be identical to the @@ -901,12 +912,18 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti <function>unicode_bidi_cleanup()</function> (with the canonical option), with the same paragraph_embedding level. + <function>unicode_bidi_needs_embed</function>() attempts to + heuristically determine whether + <function>unicode_bidi_embed</function>() is required. </para> <para> <function>unicode_bidi_logical_order</function>() gets called first, followed by - <function>unicode_bidi_embed</function>(). + <function>unicode_bidi_embed</function>() + (or + <function>unicode_bidi_needs_embed</function>() in order to + determine whether bi-directional markers are required). 
Finally, <function>unicode_bidi_embed_paragraph_level</function>() optionally determines whether the resulting string's default paragraph embedding level matches the one used for the actual @@ -963,12 +980,12 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti <itemizedlist> <listitem> <para> - The Unicode string, and … + The Unicode string. </para> </listitem> <listitem> <para> - … the directional embedding buffer, in canonical + The directional embedding buffer, in canonical rendering order. </para> </listitem> @@ -1080,6 +1097,53 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti </para> </listitem> </itemizedlist> + + <para> + <function>unicode_bidi_needs_embed</function>() attempts to + heuristically determine whether the Unicode string, in logical + order, requires bi-directional markers. + The parameters to + <function>unicode_bidi_needs_embed</function>() are: + </para> + <itemizedlist> + <listitem> + <para> + The Unicode string. + </para> + </listitem> + <listitem> + <para> + The directional embedding buffer, in logical + rendering order. + </para> + </listitem> + <listitem> + <para> + The size of the string and the embedding level buffer. + </para> + </listitem> + <listitem> + <para> + A pointer to an explicit paragraph embedding level, either + <literal>UNICODE_BIDI_LR</literal> or + <literal>UNICODE_BIDI_RL</literal>; or a + <literal>NULL</literal> pointer (see + <function>unicode_bidi_calc_types</function>()'s + explanation for this parameter). + </para> + </listitem> + </itemizedlist> + + <para> + <function>unicode_bidi_needs_embed</function>() returns 0 + if the Unicode string does not need explicit directional + markers, or 1 if it does. This is done by using + <function>unicode_bidi_calc()</function>, + <function>unicode_bidi_reorder()</function>, + <function>unicode_bidi_logical_order</function> and then + checking if the end result is different from what was passed + in. 
+ </para> </refsect2> <refsect2 id="unicode_bidi_misc"> <title>Miscellaneous utility functions</title> @@ -2919,6 +2983,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti <refname>unicode::bidi_reorder</refname> <refname>unicode::bidi_cleanup</refname> <refname>unicode::bidi_logical_order</refname> + <refname>unicode::bidi_needs_embed</refname> <refname>unicode::bidi_embed</refname> <refname>unicode::bidi_embed_paragraph_level</refname> <refname>unicode::bidi_get_direction</refname> @@ -3026,6 +3091,15 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti </funcprototype> <funcprototype> + <funcdef>bool <function>unicode::bidi_needs_embed</function></funcdef> + <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + <paramdef>const unicode_bidi_level_t *<parameter>paragraph_embedding</parameter>=NULL</paramdef> + <paramdef>size_t <parameter>starting_pos</parameter>=0</paramdef> + <paramdef>size_t <parameter>n</parameter>=(size_t)-1</paramdef> + </funcprototype> + + <funcprototype> <funcdef>int <function>unicode::bidi_embed</function></funcdef> <paramdef>const std::u32string &<parameter>string</parameter></paramdef> <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> @@ -3196,7 +3270,8 @@ auto [levels, direction]=unicode::bidi_calc(types); <para> <function>unicode::bidi_reorder</function>, <function>unicode::bidi_cleanup</function>, - <function>unicode::bidi_logical_order</function> and + <function>unicode::bidi_logical_order</function>, + <function>unicode::bidi_needs_embed</function> and <function>unicode::bidi_get_direction</function> take two optional parameters (defaulted values or overloaded) specifying diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index a1a502c..2999ee3 100644 --- a/unicode/courier-unicode.h.in +++ 
b/unicode/courier-unicode.h.in @@ -717,6 +717,12 @@ extern void unicode_bidi_logical_order(char32_t *string, void *), void *arg); +extern int unicode_bidi_needs_embed(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + const unicode_bidi_level_t * + paragraph_embedding); + extern void unicode_bidi_embed(const char32_t *string, const unicode_bidi_level_t *levels, size_t n, @@ -2328,6 +2334,14 @@ void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels, size_t starting_pos=0, size_t n=(size_t)-1); +//! Whether directional and isolation markers are needed. + +bool bidi_needs_embed(const std::u32string &string, + const std::vector<unicode_bidi_level_t> &levels, + const unicode_bidi_level_t *paragraph_embedding=0, + size_t starting_pos=0, + size_t n=(size_t)-1); + //! Embed directional and isolation markers //! Non-0 return value indicates the string and levels' sizes do not match. diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 1aa4a88..772f9fe 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -2310,6 +2310,49 @@ static void emit_marker(struct bidi_embed_levelrun *p, } } +int unicode_bidi_needs_embed(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + const unicode_bidi_level_t *paragraph_level) +{ + char32_t *string_cpy=(char32_t *)malloc(n * sizeof(char32_t)); + unicode_bidi_level_t *levels_cpy=(unicode_bidi_level_t *) + malloc(n * sizeof(unicode_bidi_level_t)); + size_t nn; + int ret; + + if (!string_cpy || !levels_cpy) + abort(); + + memcpy(string_cpy, string, n * sizeof(char32_t)); + + struct unicode_bidi_direction direction= + unicode_bidi_calc(string_cpy, n, + levels_cpy, paragraph_level); + + unicode_bidi_reorder(string_cpy, levels_cpy, n, NULL, NULL); + nn=unicode_bidi_cleanup(string_cpy, levels_cpy, n, 0, + NULL, NULL); + + ret=0; + if (n == nn && (paragraph_level == NULL || + direction.direction == *paragraph_level)) + { + unicode_bidi_logical_order(string_cpy, 
levels_cpy, nn, + direction.direction, + NULL, NULL); + if (memcmp(string_cpy, string, n * sizeof(char32_t)) == 0 && + memcmp(levels_cpy, levels, n * sizeof(unicode_bidi_level_t)) + == 0) + { + ret=1; + } + } + free(string_cpy); + free(levels_cpy); + return ret; +} + void unicode_bidi_embed(const char32_t *string, const unicode_bidi_level_t *levels, size_t n, diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index 04d9879..7bb6edc 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -950,6 +950,29 @@ unicode_bidi_direction unicode::bidi_get_direction(const std::u32string &string, return unicode_bidi_get_direction(string.c_str()+starting_pos, n); } +bool unicode::bidi_needs_embed(const std::u32string &string, + const std::vector<unicode_bidi_level_t> &levels, + const unicode_bidi_level_t *paragraph_embedding, + size_t starting_pos, + size_t n) +{ + if (string.size() != levels.size()) + return false; + + auto s=levels.size(); + + if (starting_pos >= s) + return false; + + if (n > s-starting_pos) + n=s-starting_pos; + + return unicode_bidi_needs_embed(string.c_str(), + n == 0 ? NULL : &levels[starting_pos], + n, + paragraph_embedding) != 0; +} + std::u32string unicode::bidi_override(const std::u32string &s, unicode_bidi_level_t direction, int cleanup_options) |
