From 7f29205e16403e5c86613ca6a80366969b6c6e7f Mon Sep 17 00:00:00 2001 From: Sam Varshavchik Date: Wed, 24 Feb 2021 07:48:49 -0500 Subject: More unicode functions. --- unicode/ChangeLog | 5 +++ unicode/Makefile.am | 2 + unicode/biditest2.C | 32 +++++++++++++++- unicode/book.xml | 91 ++++++++++++++++++++++++++++++++++++++++---- unicode/courier-unicode.h.in | 14 +++++++ unicode/unicode_bidi.c | 43 +++++++++++++++++++++ unicode/unicodecpp.C | 23 +++++++++++ 7 files changed, 201 insertions(+), 9 deletions(-) diff --git a/unicode/ChangeLog b/unicode/ChangeLog index fcb1c10..35cffe6 100644 --- a/unicode/ChangeLog +++ b/unicode/ChangeLog @@ -1,3 +1,8 @@ +2021-02-24 Sam Varshavchik + + * Implement unicode_bidi_needs_embed(), unicode_bidi_cleaned_size(), + unicode::bidi_override, + 2.2.1 2021-02-14 Sam Varshavchik diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 5877d22..dc502b3 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -97,6 +97,7 @@ man_MANS= \ $(srcdir)/man/unicode[\:][\:]bidi_embed_paragraph_level.3 \ $(srcdir)/man/unicode[\:][\:]bidi_get_direction.3 \ $(srcdir)/man/unicode[\:][\:]bidi_logical_order.3 \ + $(srcdir)/man/unicode[\:][\:]bidi_needs_embed.3 \ $(srcdir)/man/unicode[\:][\:]bidi_override.3 \ $(srcdir)/man/unicode[\:][\:]bidi_reorder.3 \ $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert.3 \ @@ -127,6 +128,7 @@ man_MANS= \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ $(srcdir)/man/unicode_bidi_logical_order.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ + $(srcdir)/man/unicode_bidi_needs_embed.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ $(srcdir)/man/unicode_bidi_setbnl.3 \ $(srcdir)/man/unicode_bidi_type.3 \ diff --git a/unicode/biditest2.C b/unicode/biditest2.C index 6ab347b..a14b3ea 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -597,29 +597,47 @@ void null_character_test() void direction_test() { static const struct { - const char32_t *str; + std::u32string str; unicode_bidi_level_t direction; int is_explicit; + bool needs_embed; } tests[]={ { U"Hello", UNICODE_BIDI_LR, 1, + true, }, { U" ", UNICODE_BIDI_LR, 0, + true, }, { U"", UNICODE_BIDI_LR, 0, + true, }, { U"שלום", UNICODE_BIDI_RL, 1, + true, + }, + { + U"Helloש", + UNICODE_BIDI_LR, + 1, + true, + }, + { + U"Hello" + std::u32string{unicode::literals::LRO} + + U"ש", + UNICODE_BIDI_LR, + 1, + false, }, }; @@ -633,6 +651,18 @@ void direction_test() std::cerr << "direction_test failed\n"; exit(1); } + + std::u32string s=t.str; + auto levels=std::get<0>(unicode::bidi_calc(s, t.direction)); + unicode::bidi_reorder(s, levels); + unicode::bidi_cleanup(s, levels); + + if (unicode::bidi_needs_embed(s, levels, &t.direction) + != t.needs_embed) + { + std::cerr << "needs embed failed\n"; + exit(1); + } } } diff --git a/unicode/book.xml b/unicode/book.xml index 0b45433..4f0fd71 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -336,6 +336,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.unicode_bidi_cleanup unicode_bidi_cleaned_size unicode_bidi_logical_order + unicode_bidi_needs_embed unicode_bidi_embed unicode_bidi_embed_paragraph_level @@ -403,7 +404,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example. - size_t unicode_bidi_logical_order + void unicode_bidi_logical_order char32_t *string unicode_bidi_level_t *levels size_t n @@ -412,6 +413,14 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.void *arg + + int unicode_bidi_needs_embed + const char32_t *string + const unicode_bidi_level_t *levels + size_t n + const unicode_bidi_level_t *paragraph_embedding + + size_t unicode_bidi_embed const char32_t *string @@ -871,7 +880,8 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.UNICODE_BIDI_CLEANUP_CANONICAL are in canonical rendering order. - unicode_bidi_logical_order() and + unicode_bidi_logical_order(), + unicode_bidi_needs_embed() and unicode_bidi_embed() require the canonical rendering order for their string and embedding level values. @@ -886,8 +896,9 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example. Embedding bi-directional markers in Unicode text strings - unicode_bidi_logical_order() and - unicode_bidi_embed() add various + unicode_bidi_logical_order() rearranges + the string from rendering to its logical order. + unicode_bidi_embed() adds various bi-directional markers to a Unicode string in canonical rendering order. The resulting string is not guaranteed to be identical to the @@ -901,12 +912,18 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.unicode_bidi_cleanup() (with the canonical option), with the same paragraph_embedding level. + unicode_bidi_needs_embed() attempts to + heuristically determine whether + unicode_bidi_embed() is required. unicode_bidi_logical_order() gets called first, followed by - unicode_bidi_embed(). + unicode_bidi_embed() + (or + unicode_bidi_needs_embed() in order to + determine whether bi-directional markers are required). Finally, unicode_bidi_embed_paragraph_level() optionally determines whether the resulting string's default paragraph embedding level matches the one used for the actual @@ -963,12 +980,12 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example. - The Unicode string, and … + The Unicode string. - … the directional embedding buffer, in canonical + The directional embedding buffer, in canonical rendering order. @@ -1080,6 +1097,53 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example. + + + unicode_bidi_needs_embed() attempts to + heuristically determine whether the Unicode string, in logical + order, requires bi-directional markers. + The parameters to + unicode_bidi_embed_paragraph_level() are: + + + + + The Unicode string. + + + + + The directional embedding buffer, in logical + rendering order. + + + + + The size of the string and the embedding level buffer. + + + + + A pointer to an explicit paragraph embedding level, either + UNICODE_BIDI_LR or + UNICODE_BIDI_RL; or a + NULL pointer (see + unicode_bidi_calc_types()'s + explanation for this parameter). + + + + + + unicode_bidi_needs_embed() returns 0 + if the Unicode string does not need explicit directional + markers, or 1 if it does. This is done by using + unicode_bidi_calc(), + unicode_bidi_reorder(), + unicode_bidi_logical_order and then + checking if the end result is different from what was passed + in. + Miscellaneous utility functions @@ -2919,6 +2983,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.unicode::bidi_reorder unicode::bidi_cleanup unicode::bidi_logical_order + unicode::bidi_needs_embed unicode::bidi_embed unicode::bidi_embed_paragraph_level unicode::bidi_get_direction @@ -3025,6 +3090,15 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.size_t n=(size_t)-1 + + bool unicode::bidi_needs_embed + const std::u32string &string + const std::vector <unicode_bidi_level_t> &levels + const unicode_bidi_level_t (paragraph_embedding=NULL + size_t starting_pos=0 + size_t n=(size_t)-1 + + int unicode::bidi_embed const std::u32string &string @@ -3196,7 +3270,8 @@ auto [levels, direction]=unicode::bidi_calc(types); unicode::bidi_reorder, unicode::bidi_cleanup, - unicode::bidi_logical_order and + unicode::bidi_logical_order, + unicode::bidi_needs_embed and unicode::bidi_get_direction take two optional parameters (defaulted values or overloaded) specifying diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index a1a502c..2999ee3 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -717,6 +717,12 @@ extern void unicode_bidi_logical_order(char32_t *string, void *), void *arg); +extern int unicode_bidi_needs_embed(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + const unicode_bidi_level_t * + paragraph_embedding); + extern void unicode_bidi_embed(const char32_t *string, const unicode_bidi_level_t *levels, size_t n, @@ -2328,6 +2334,14 @@ void bidi_logical_order(std::vector &levels, size_t starting_pos=0, size_t n=(size_t)-1); +//! Whether directional and isolation markers are needed. + +bool bidi_needs_embed(const std::u32string &string, + const std::vector &levels, + const unicode_bidi_level_t *paragraph_embedding=0, + size_t starting_pos=0, + size_t n=(size_t)-1); + //! Embed directional and isolation markers //! Non-0 return value indicates the string and levels' sizes do not match. diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 1aa4a88..772f9fe 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -2310,6 +2310,49 @@ static void emit_marker(struct bidi_embed_levelrun *p, } } +int unicode_bidi_needs_embed(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + const unicode_bidi_level_t *paragraph_level) +{ + char32_t *string_cpy=(char32_t *)malloc(n * sizeof(char32_t)); + unicode_bidi_level_t *levels_cpy=(unicode_bidi_level_t *) + malloc(n * sizeof(unicode_bidi_level_t)); + size_t nn; + int ret; + + if (!string_cpy || !levels_cpy) + abort(); + + memcpy(string_cpy, string, n * sizeof(char32_t)); + + struct unicode_bidi_direction direction= + unicode_bidi_calc(string_cpy, n, + levels_cpy, paragraph_level); + + unicode_bidi_reorder(string_cpy, levels_cpy, n, NULL, NULL); + nn=unicode_bidi_cleanup(string_cpy, levels_cpy, n, 0, + NULL, NULL); + + ret=0; + if (n == nn && (paragraph_level == NULL || + direction.direction == *paragraph_level)) + { + unicode_bidi_logical_order(string_cpy, levels_cpy, nn, + direction.direction, + NULL, NULL); + if (memcmp(string_cpy, string, n * sizeof(char32_t)) == 0 && + memcmp(levels_cpy, levels, n * sizeof(unicode_bidi_level_t)) + == 0) + { + ret=1; + } + } + free(string_cpy); + free(levels_cpy); + return ret; +} + void unicode_bidi_embed(const char32_t *string, const unicode_bidi_level_t *levels, size_t n, diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index 04d9879..7bb6edc 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -950,6 +950,29 @@ unicode_bidi_direction unicode::bidi_get_direction(const std::u32string &string, return unicode_bidi_get_direction(string.c_str()+starting_pos, n); } +bool unicode::bidi_needs_embed(const std::u32string &string, + const std::vector &levels, + const unicode_bidi_level_t *paragraph_embedding, + size_t starting_pos, + size_t n) +{ + if (string.size() != levels.size()) + return false; + + auto s=levels.size(); + + if (starting_pos >= s) + return false; + + if (n > s-starting_pos) + n=s-starting_pos; + + return unicode_bidi_needs_embed(string.c_str(), + n == 0 ? NULL : &levels[starting_pos], + n, + paragraph_embedding) != 0; +} + std::u32string unicode::bidi_override(const std::u32string &s, unicode_bidi_level_t direction, int cleanup_options) -- cgit v1.2.3