From 844f6a9ef755c1c5826c9583b364af08b54a4dcc Mon Sep 17 00:00:00 2001 From: Sam Varshavchik Date: Sun, 29 Nov 2020 08:41:57 -0500 Subject: Combine cleanup functions, add unicode::literals namespace. --- unicode/Makefile.am | 5 +- unicode/biditest2.C | 13 +++- unicode/book.xml | 179 ++++++++++++++++++++++++++++--------------- unicode/courier-unicode.h.in | 93 ++++++++++++++-------- unicode/unicode_bidi.c | 38 +++------ unicode/unicodecpp.C | 47 +++--------- 6 files changed, 211 insertions(+), 164 deletions(-) diff --git a/unicode/Makefile.am b/unicode/Makefile.am index f864e2d..dbc71aa 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -89,11 +89,11 @@ include_HEADERS=courier-unicode.h \ man_MANS= \ $(srcdir)/man/courier-unicode.7 \ + $(srcdir)/man/unicode\:\:bidi.3 \ $(srcdir)/man/unicode\:\:bidi_calc.3 \ $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_embed.3 \ $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ - $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_logical_order.3 \ $(srcdir)/man/unicode\:\:bidi_reorder.3 \ $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \ @@ -118,7 +118,6 @@ man_MANS= \ $(srcdir)/man/unicode_bidi_cleanup.3 \ $(srcdir)/man/unicode_bidi_embed.3 \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ - $(srcdir)/man/unicode_bidi_extra_cleanup.3 \ $(srcdir)/man/unicode_bidi_logical_order.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ @@ -515,4 +514,4 @@ distrelease: $(MAKE) dist www: - rsync -a html/. $$HOME/www/hostrocket/courier-mta.org/unicode + rsync -a --delete-after html/. 
$$HOME/www/hostrocket/courier-mta.org/unicode diff --git a/unicode/biditest2.C b/unicode/biditest2.C index a9ab87d..ded76be 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -307,7 +307,9 @@ void character_test() exit(1); } - unicode::bidi_extra_cleanup(s, levels); + unicode::bidi_cleanup(s, levels, + [](size_t) {}, + UNICODE_BIDI_CLEANUP_CANONICAL); auto dump_ls= [&] @@ -371,8 +373,13 @@ void character_test() } unicode::bidi_reorder(new_string, std::get<0>(ret)); - unicode::bidi_extra_cleanup(new_string, - std::get<0>(ret)); + unicode::bidi_cleanup(new_string, + std::get<0>(ret), + [] + (size_t) + { + }, + UNICODE_BIDI_CLEANUP_CANONICAL); /* New string is now back in logical order */ diff --git a/unicode/book.xml b/unicode/book.xml index c8948ba..b0342ea 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -304,7 +304,6 @@ See COPYING for distribution information. unicode_bidi_calc unicode_bidi_reorder unicode_bidi_cleanup - unicode_bidi_extra_cleanup unicode_bidi_logical_order unicode_bidi_embed unicode_bidi_embed_paragraph_level @@ -341,15 +340,7 @@ See COPYING for distribution information. char32_t *string unicode_bidi_level_t *levels size_t n - void (*removed_callback)(size_t, size_t, void *) - void *arg - - - - size_t unicode_bidi_extra_cleanup - char32_t *string - unicode_bidi_level_t *levels - size_t n + int options void (*removed_callback)(size_t, size_t, void *) void *arg @@ -450,8 +441,7 @@ See COPYING for distribution information. - Use unicode_bidi_cleanup() or - unicode_bidi_extra_cleanup(), + Use unicode_bidi_cleanup() to remove the characters from the string which are used by the bi-directional algorithm, and are not needed for rendering the text. @@ -585,28 +575,12 @@ See COPYING for distribution information. rendering order, but still contain bi-directional embedding, override, boundary-neutral, isolate, and marker characters. 
- unicode_bidi_cleanup() and - unicode_bidi_extra_cleanup() remove these - characters and directional markers from the unicode string. - unicode_bidi_cleanup removes only the - embedding, override, and boundry-neutral characters (as - specified by step X9 of the bi-directional algorithm). - unicode_bidi_extra_cleanup() - additionally removes the isolation markers, implicit markers; - and all characters - classified as paragraph separators get replaced by a newline. - - - A non-null pointer to the directional embedding level buffer, - of the same size as the string, also removes the corresponding - values from the buffer, and the remaining values in the - embedding level buffer get reset to - levels UNICODE_BIDI_LR and - UNICODE_BIDI_RL, only. - + unicode_bidi_cleanup + removes these characters and directional markers. + - The parameters to unicode_bidi_cleanup() and - unicode_bidi_extra_cleanup() are: + The parameters to unicode_bidi_cleanup() + are: @@ -617,15 +591,66 @@ See COPYING for distribution information. - The pointer to the directional embedding buffer. - + A non-null pointer to the directional embedding level buffer, + of the same size as the string, also removes the corresponding + values from the buffer, and the remaining values in the + embedding level buffer get reset to + levels UNICODE_BIDI_LR and + UNICODE_BIDI_RL, only. + + The size of the unicode string and the directional embedding - buffer. + buffer (if not NULL). + + + + A bitmask that selects the following options + (or 0 if no options): + + + + + UNICODE_BIDI_CLEANUP_EXTRA + + + In addition to removing all embedding, override, and + boundary-neutral characters as + specified by step X9 of the bi-directional algorithm + (the default behavior without this flag), also + remove all isolation markers and implicit markers. + + + + + + UNICODE_BIDI_CLEANUP_BNL + + + Replace all characters classified as paragraph + separators with a newline character. 
+ + + + + + UNICODE_BIDI_CLEANUP_CANONICAL + + + A combined set of + UNICODE_BIDI_CLEANUP_EXTRA + and + UNICODE_BIDI_CLEANUP_BNL. + + + + + + A pointer to a function that gets repeatedly invoked with the @@ -647,17 +672,17 @@ See COPYING for distribution information. from the first to the last removed character (if any). - - Multiple calls to unicode_bidi_cleanup() or - unicode_bidi_extra_cleanup() do no harm; - except that unicode_bidi_extra_cleanup() - always removes all the additional characters that - unicode_bidi_cleanup() does not remove. - + The character string and the embedding level values resulting - from unicode_bidi_extra_cleanup() are in + from unicode_bidi_cleanup() + with the UNICODE_BIDI_CLEANUP_CANONICAL option + are in canonical rendering order. + unicode_bidi_logical_order() and + unicode_bidi_embed() require the + canonical rendering order for their string and embedding level + values. @@ -675,7 +700,8 @@ See COPYING for distribution information. canonical rendering order after applying unicode_bidi_calc(), unicode_reorder() and - unicode_bidi_extra_cleanup(), + unicode_bidi_cleanup() + (with the canonical option), with the same paragraph_embedding level. @@ -2628,15 +2654,15 @@ See COPYING for distribution information. SamVarshavchikAuthorCourier Unicode Library - unicode::bidi::calc + unicode::bidi 3 + unicode::bidi unicode::bidi_calc unicode::bidi_reorder unicode::bidi_cleanup - unicode::bidi_extra_cleanup unicode::bidi_logical_order unicode::bidi_embed unicode::bidi_embed_paragraph_level @@ -2674,6 +2700,7 @@ See COPYING for distribution information. void unicode::bidi_cleanup std::u32string &string const std::function<void (size_t) noexcept> &removed_callback + int cleanup_options @@ -2681,19 +2708,7 @@ See COPYING for distribution information. 
std::u32string &string std::vector <unicode_bidi_level_t> &levels const std::function<void (size_t) noexcept> &removed_callback - - - - void unicode::bidi_extra_cleanup - std::u32string &string - const std::function<void (size_t) noexcept> &removed_callback - - - - int unicode::bidi_extra_cleanup - std::u32string &string - std::vector <unicode_bidi_level_t> &levels - const std::function<void (size_t) noexcept> &removed_callback + int cleanup_options @@ -2789,7 +2804,51 @@ See COPYING for distribution information. + + + <literal>unicode::literals</literal> namespace + +
+ + + +
+ + + This namespace contains the following constexpr + definitions: + + + + + + char32_t arrays with literal + Unicode character strings containing Unicode directional, + isolate, and override markers, like + LRO, + RLO and others. + + + + + CLEANUP_EXTRA, + CLEANUP_BNL, and + CLEANUP_CANONICAL options for + unicode::bidi_cleanup(). + + + + +
+ SEE ALSO diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index cc9dbbb..3de76d3 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -548,6 +548,24 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i); #define UNICODE_LRO 0x202d /* Left-to-right override */ #define UNICODE_PDF 0x202c /* Pop directional override */ +#ifdef __cplusplus +#if __cplusplus >= 201103L +namespace unicode { + namespace literals { + + constexpr char32_t LRM[]={UNICODE_LRM, 0}; + constexpr char32_t RLM[]={UNICODE_RLM, 0}; + constexpr char32_t ALM[]={UNICODE_ALM, 0}; + constexpr char32_t LRI[]={UNICODE_LRI, 0}; + constexpr char32_t RLI[]={UNICODE_RLI, 0}; + constexpr char32_t PDI[]={UNICODE_PDI, 0}; + constexpr char32_t RLO[]={UNICODE_RLO, 0}; + constexpr char32_t LRO[]={UNICODE_LRO, 0}; + constexpr char32_t PDF[]={UNICODE_PDF, 0}; + } +} +#endif +#endif typedef char unicode_bidi_bracket_type_t; @@ -608,19 +626,50 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); +/* Bitmask options to unicode_bidi_cleanup */ + +/* + In addition to removing embedding, override, and boundary-neutral + characters also remove isolation markers and implicit markers. +*/ + +#define UNICODE_BIDI_CLEANUP_EXTRA 1 + +/* + Replace all characters classified as paragraph separators by a newline + character. +*/ + +#define UNICODE_BIDI_CLEANUP_BNL 2 + +/* + Options for canonical rendering order. 
+*/ + +#define UNICODE_BIDI_CLEANUP_CANONICAL \ + (UNICODE_BIDI_CLEANUP_EXTRA | UNICODE_BIDI_CLEANUP_BNL) + +#ifdef __cplusplus +#if __cplusplus >= 201103L +namespace unicode { + namespace literals { + constexpr int CLEANUP_EXTRA=UNICODE_BIDI_CLEANUP_EXTRA; + + constexpr int CLEANUP_BNL=UNICODE_BIDI_CLEANUP_BNL; + + constexpr int CLEANUP_CANONICAL=UNICODE_BIDI_CLEANUP_CANONICAL; + } +} +#endif +#endif + extern size_t unicode_bidi_cleanup(char32_t *string, unicode_bidi_level_t *levels, size_t n, + int options, void (*removed_callback)(size_t, void *), void *); -extern size_t unicode_bidi_extra_cleanup(char32_t *string, - unicode_bidi_level_t *levels, - size_t n, - void (*removed_callback)(size_t, - void *), - void *); - extern void unicode_bidi_logical_order(char32_t *string, unicode_bidi_level_t *levels, size_t n, @@ -2147,7 +2196,8 @@ void bidi_reorder(std::vector &levels, void bidi_cleanup(std::u32string &string, const std::function &removed_callback= - [](size_t) {}); + [](size_t) {}, + int cleanup_options=0); //! Also remove them from the embedding direction level buffer. @@ -2156,28 +2206,8 @@ void bidi_cleanup(std::u32string &string, int bidi_cleanup(std::u32string &string, std::vector &levels, const std::function &removed_callback= - [](size_t) {}); - - -//! Remove directional markers and isolation markers. - -//! Removes them from the string, in place. Optional lambda gets notified -//! of the index (in the original string, of each removed marker. - -void bidi_extra_cleanup(std::u32string &string, - const std::function - &removed_callback= - [](size_t) {}); - -//! Also remove them from the embedding direction level buffer. - -//! Returns non-0 in case of non-matching level buffer size. - -int bidi_extra_cleanup(std::u32string &string, - std::vector &levels, - const std::function - &removed_callback= - [](size_t) {}); + [](size_t) {}, + int cleanup_options=0); //! Convert Unicode string from canonical rendering order to logical order. 
int bidi_logical_order(std::u32string &string, @@ -2189,8 +2219,7 @@ int bidi_logical_order(std::u32string &string, //! Convert Unicode string from canonical rendering order to logical order. void bidi_logical_order(std::vector &levels, unicode_bidi_level_t paragraph_embedding, - const std::function - &lambda); + const std::function &lambda); //! Embed directional and isolation markers diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 79c4db5..cfae12f 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -2032,6 +2032,7 @@ void unicode_bidi_reorder(char32_t *p, size_t unicode_bidi_cleanup(char32_t *string, unicode_bidi_level_t *levels, size_t n, + int cleanup_options, void (*removed_callback)(size_t, void *), void *arg) { @@ -2040,7 +2041,13 @@ size_t unicode_bidi_cleanup(char32_t *string, { enum_bidi_type_t cl=unicode_bidi_type(string[j]); - if (IS_X9(cl)) + if (cleanup_options & UNICODE_BIDI_CLEANUP_EXTRA + ? ( + is_explicit_indicator_except_b(cl) || + (string[j] == UNICODE_LRM || + string[j] == UNICODE_RLM || + string[j] == UNICODE_ALM)) + : IS_X9(cl)) { if (removed_callback) (*removed_callback)(j, arg); @@ -2048,34 +2055,9 @@ size_t unicode_bidi_cleanup(char32_t *string, } if (levels) levels[i]=levels[j] & 1; - ++i; - } - return i; -} - -size_t unicode_bidi_extra_cleanup(char32_t *string, - unicode_bidi_level_t *levels, - size_t n, - void (*removed_callback)(size_t, void *), - void *arg) -{ - size_t i=0; - for (size_t j=0; j &lambda) + const std::function &lambda, + int cleanup_options) { if (string.empty()) return; @@ -701,6 +702,7 @@ void unicode::bidi_cleanup(std::u32string &string, size_t n=unicode_bidi_cleanup(&string[0], 0, string.size(), + cleanup_options, removed_callback, reinterpret_cast(&cb)); cb.rethrow(); @@ -709,15 +711,20 @@ void unicode::bidi_cleanup(std::u32string &string, int unicode::bidi_cleanup(std::u32string &string, std::vector &levels, - const std::function &lambda) + const std::function &lambda, + int 
cleanup_options) { if (levels.size() != string.size()) return -1; + if (levels.size() == 0) + return 0; + cb_wrapper cb{lambda}; size_t n=unicode_bidi_cleanup(&string[0], &levels[0], string.size(), + cleanup_options, removed_callback, reinterpret_cast(&cb)); cb.rethrow(); @@ -727,42 +734,6 @@ int unicode::bidi_cleanup(std::u32string &string, return 0; } - -void unicode::bidi_extra_cleanup(std::u32string &string, - const std::function &lambda) -{ - if (string.empty()) - return; - - cb_wrapper cb{lambda}; - size_t n=unicode_bidi_extra_cleanup(&string[0], - 0, - string.size(), - removed_callback, - reinterpret_cast(&cb)); - cb.rethrow(); - string.resize(n); -} - -int unicode::bidi_extra_cleanup(std::u32string &string, - std::vector &levels, - const std::function &lambda) -{ - if (levels.size() != string.size()) - return -1; - - cb_wrapper cb{lambda}; - size_t n=unicode_bidi_extra_cleanup(&string[0], - &levels[0], - string.size(), - removed_callback, - reinterpret_cast(&cb)); - cb.rethrow(); - string.resize(n); - levels.resize(n); - return 0; -} - int unicode::bidi_logical_order(std::u32string &string, std::vector &levels, unicode_bidi_level_t paragraph_embedding, -- cgit v1.2.3