diff options
| author | Sam Varshavchik | 2020-11-29 08:41:57 -0500 |
|---|---|---|
| committer | Sam Varshavchik | 2020-11-30 19:31:57 -0500 |
| commit | 844f6a9ef755c1c5826c9583b364af08b54a4dcc (patch) | |
| tree | 10f0af36c609cad9953f7a736e11a2f2e8d8b897 /unicode | |
| parent | f2db409949ad94d4fc175d04ebd72bda3bd1df4e (diff) | |
| download | courier-libs-844f6a9ef755c1c5826c9583b364af08b54a4dcc.tar.bz2 | |
Combine cleanup functions, add unicode::literals namespace.
Diffstat (limited to 'unicode')
| -rw-r--r-- | unicode/Makefile.am | 5 | ||||
| -rw-r--r-- | unicode/biditest2.C | 13 | ||||
| -rw-r--r-- | unicode/book.xml | 179 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 93 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 38 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 47 |
6 files changed, 211 insertions, 164 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am index f864e2d..dbc71aa 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -89,11 +89,11 @@ include_HEADERS=courier-unicode.h \ man_MANS= \ $(srcdir)/man/courier-unicode.7 \ + $(srcdir)/man/unicode\:\:bidi.3 \ $(srcdir)/man/unicode\:\:bidi_calc.3 \ $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_embed.3 \ $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ - $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_logical_order.3 \ $(srcdir)/man/unicode\:\:bidi_reorder.3 \ $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \ @@ -118,7 +118,6 @@ man_MANS= \ $(srcdir)/man/unicode_bidi_cleanup.3 \ $(srcdir)/man/unicode_bidi_embed.3 \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ - $(srcdir)/man/unicode_bidi_extra_cleanup.3 \ $(srcdir)/man/unicode_bidi_logical_order.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ @@ -515,4 +514,4 @@ distrelease: $(MAKE) dist www: - rsync -a html/. $$HOME/www/hostrocket/courier-mta.org/unicode + rsync -a --delete-after html/. 
$$HOME/www/hostrocket/courier-mta.org/unicode diff --git a/unicode/biditest2.C b/unicode/biditest2.C index a9ab87d..ded76be 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -307,7 +307,9 @@ void character_test() exit(1); } - unicode::bidi_extra_cleanup(s, levels); + unicode::bidi_cleanup(s, levels, + [](size_t) {}, + UNICODE_BIDI_CLEANUP_CANONICAL); auto dump_ls= [&] @@ -371,8 +373,13 @@ void character_test() } unicode::bidi_reorder(new_string, std::get<0>(ret)); - unicode::bidi_extra_cleanup(new_string, - std::get<0>(ret)); + unicode::bidi_cleanup(new_string, + std::get<0>(ret), + [] + (size_t) + { + }, + UNICODE_BIDI_CLEANUP_CANONICAL); /* New string is now back in logical order */ diff --git a/unicode/book.xml b/unicode/book.xml index c8948ba..b0342ea 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -304,7 +304,6 @@ See COPYING for distribution information. <refname>unicode_bidi_calc</refname> <refname>unicode_bidi_reorder</refname> <refname>unicode_bidi_cleanup</refname> - <refname>unicode_bidi_extra_cleanup</refname> <refname>unicode_bidi_logical_order</refname> <refname>unicode_bidi_embed</refname> <refname>unicode_bidi_embed_paragraph_level</refname> @@ -341,15 +340,7 @@ See COPYING for distribution information. 
<paramdef>char32_t *<parameter>string</parameter></paramdef> <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> <paramdef>size_t <parameter>n</parameter></paramdef> - <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> - <paramdef>void *<parameter>arg</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>size_t <function>unicode_bidi_extra_cleanup</function></funcdef> - <paramdef>char32_t *<parameter>string</parameter></paramdef> - <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> - <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>int <parameter>options</parameter></paramdef> <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> <paramdef>void *<parameter>arg</parameter></paramdef> </funcprototype> @@ -450,8 +441,7 @@ See COPYING for distribution information. </listitem> <listitem> <para> - Use <function>unicode_bidi_cleanup</function>() or - <function>unicode_bidi_extra_cleanup</function>(), + Use <function>unicode_bidi_cleanup</function>() to remove the characters from the string which are used by the bi-directional algorithm, and are not needed for rendering the text. @@ -585,28 +575,12 @@ See COPYING for distribution information. <quote>rendering order</quote>, but still contain bi-directional embedding, override, boundary-neutral, isolate, and marker characters. - <function>unicode_bidi_cleanup</function>() and - <function>unicode_bidi_extra_cleanup</function>() remove these - characters and directional markers from the unicode string. - <function>unicode_bidi_cleanup</function> removes only the - embedding, override, and boundry-neutral characters (as - specified by step X9 of the bi-directional algorithm). 
- <function>unicode_bidi_extra_cleanup</function>() - additionally removes the isolation markers, implicit markers; - and all characters - classified as paragraph separators get replaced by a newline. - </para> - <para> - A non-null pointer to the directional embedding level buffer, - of the same size as the string, also removes the corresponding - values from the buffer, and the remaining values in the - embedding level buffer get reset to - levels <literal>UNICODE_BIDI_LR</literal> and - <literal> UNICODE_BIDI_RL</literal>, only. - </para> + <function>unicode_bidi_cleanup</function> + removes these characters and directional markers. + </para> <para> - The parameters to <function>unicode_bidi_cleanup</function>() and - <function>unicode_bidi_extra_cleanup</function>() are: + The parameters to <function>unicode_bidi_cleanup</function>() + are: </para> <itemizedlist> @@ -617,15 +591,66 @@ See COPYING for distribution information. </listitem> <listitem> <para> - The pointer to the directional embedding buffer. - </para> + A non-null pointer to the directional embedding level buffer, + of the same size as the string, also removes the corresponding + values from the buffer, and the remaining values in the + embedding level buffer get reset to + levels <literal>UNICODE_BIDI_LR</literal> and + <literal> UNICODE_BIDI_RL</literal>, only. + </para> </listitem> + <listitem> <para> The size of the unicode string and the directional embedding - buffer. + buffer (if not NULL). </para> </listitem> + + <listitem> + <para> + A bitmask that selects the following options + (or 0 if no options): + </para> + + <variablelist> + <varlistentry> + <term><literal>UNICODE_BIDI_CLEANUP_EXTRA</literal></term> + <listitem> + <para> + In addition to removing all embedding, override, and + boundary-neutral characters as + specified by step X9 of the bi-directional algorithm + (the default behavior without this flag), also + remove all isolation markers and implicit markers. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>UNICODE_BIDI_CLEANUP_BNL</literal></term> + <listitem> + <para> + Replace all characters classified as paragraph + separators with a newline character. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal></term> + <listitem> + <para> + A combined set of + <literal>UNICODE_BIDI_CLEANUP_EXTRA</literal> + and + <literal>UNICODE_BIDI_CLEANUP_BNL</literal>. + </para> + </listitem> + </varlistentry> + </variablelist> + </listitem> + <listitem> <para> A pointer to a function that gets repeatedly invoked with the @@ -647,17 +672,17 @@ See COPYING for distribution information. from the first to the last removed character (if any). </para> - <para> - Multiple calls to <function>unicode_bidi_cleanup</function>() or - <function>unicode_bidi_extra_cleanup</function>() do no harm; - except that <function>unicode_bidi_extra_cleanup</function>() - always removes all the additional characters that - <function>unicode_bidi_cleanup</function>() does not remove. - </para> + <para> The character string and the embedding level values resulting - from <function>unicode_bidi_extra_cleanup</function>() are in + from <function>unicode_bidi_cleanup</function>() + with the <literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal> option + are in <quote>canonical rendering order</quote>. + <function>unicode_bidi_logical_order</function>() and + <function>unicode_bidi_embed</function>() require the + canonical rendering order for their string and embedding level + values. </para> </refsect2> @@ -675,7 +700,8 @@ See COPYING for distribution information. canonical rendering order after applying <function>unicode_bidi_calc()</function>, <function>unicode_reorder()</function> and - <function>unicode_bidi_extra_cleanup()</function>, + <function>unicode_bidi_cleanup()</function> + (with the canonical option), with the same paragraph_embedding level. 
</para> @@ -2628,15 +2654,15 @@ See COPYING for distribution information. <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> - <refentrytitle>unicode::bidi::calc</refentrytitle> + <refentrytitle>unicode::bidi</refentrytitle> <manvolnum>3</manvolnum> </refmeta> <refnamediv> + <refname>unicode::bidi</refname> <refname>unicode::bidi_calc</refname> <refname>unicode::bidi_reorder</refname> <refname>unicode::bidi_cleanup</refname> - <refname>unicode::bidi_extra_cleanup</refname> <refname>unicode::bidi_logical_order</refname> <refname>unicode::bidi_embed</refname> <refname>unicode::bidi_embed_paragraph_level</refname> @@ -2674,6 +2700,7 @@ See COPYING for distribution information. <funcdef>void <function>unicode::bidi_cleanup</function></funcdef> <paramdef>std::u32string &<parameter>string</parameter></paramdef> <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> + <paramdef>int <parameter>cleanup_options</parameter></paramdef> </funcprototype> <funcprototype> @@ -2681,19 +2708,7 @@ See COPYING for distribution information. 
<paramdef>std::u32string &<parameter>string</parameter></paramdef> <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>void <function>unicode::bidi_extra_cleanup</function></funcdef> - <paramdef>std::u32string &<parameter>string</parameter></paramdef> - <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode::bidi_extra_cleanup</function></funcdef> - <paramdef>std::u32string &<parameter>string</parameter></paramdef> - <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> - <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> + <paramdef>int <parameter>cleanup_options</parameter></paramdef> </funcprototype> <funcprototype> @@ -2789,7 +2804,51 @@ See COPYING for distribution information. </para> </listitem> </itemizedlist> + + <refsect2 id="unicode_cpp_bidi_literals"> + <title><literal>unicode::literals</literal> namespace</title> + + <blockquote> + <informalexample> + <programlisting><![CDATA[ +using namespace unicode::literals; + +std::u32string foo(std::u32string bar) +{ + return bar + LRO; +} +]]></programlisting> + </informalexample> + </blockquote> + + <para> + This namespace contains the following <literal>constexpr</literal> + definitions: + </para> + + <itemizedlist> + <listitem> + <para> + <classname>char32_t</classname> arrays with literal + Unicode character strings containing Unicode directional, + isolate, and override markers, like + <literal>LRO</literal>, + <literal>RLO</literal> and others. 
+ </para> + </listitem> + <listitem> + <para> + <literal>CLEANUP_EXTRA</literal>, + <literal>CLEANUP_BNL</literal>, and + <literal>CLEANUP_CANONICAL</literal> options for + <function>unicode::bidi_cleanup</function>(). + </para> + </listitem> + </itemizedlist> + + </refsect2> </refsect1> + <refsect1 id="unicode_cpp_bidi_seealso"> <title>SEE ALSO</title> <para> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index cc9dbbb..3de76d3 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -548,6 +548,24 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i); #define UNICODE_LRO 0x202d /* Left-to-right override */ #define UNICODE_PDF 0x202c /* Pop directional override */ +#ifdef __cplusplus +#if __cplusplus >= 201103L +namespace unicode { + namespace literals { + + constexpr char32_t LRM[]={UNICODE_LRM, 0}; + constexpr char32_t RLM[]={UNICODE_RLM, 0}; + constexpr char32_t ALM[]={UNICODE_ALM, 0}; + constexpr char32_t LRI[]={UNICODE_LRI, 0}; + constexpr char32_t RLI[]={UNICODE_RLI, 0}; + constexpr char32_t PDI[]={UNICODE_PDI, 0}; + constexpr char32_t RLO[]={UNICODE_RLO, 0}; + constexpr char32_t LRO[]={UNICODE_LRO, 0}; + constexpr char32_t PDF[]={UNICODE_PDF, 0}; + } +} +#endif +#endif typedef char unicode_bidi_bracket_type_t; @@ -608,19 +626,50 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); +/* Bitmask options to unicode_bidi_cleanup */ + +/* + In addition to removing embedding, override, and boundary-neutral + characters also remove isolation markers and implicit markers. +*/ + +#define UNICODE_BIDI_CLEANUP_EXTRA 1 + +/* + Replace all characters classified as paragraph separators by a newline + character. +*/ + +#define UNICODE_BIDI_CLEANUP_BNL 2 + +/* + Options for canonical rendering order. 
+*/ + +#define UNICODE_BIDI_CLEANUP_CANONICAL \ + (UNICODE_BIDI_CLEANUP_EXTRA | UNICODE_BIDI_CLEANUP_BNL) + +#ifdef __cplusplus +#if __cplusplus >= 201103L +namespace unicode { + namespace literals { + constexpr int CLEANUP_EXTRA=UNICODE_BIDI_CLEANUP_EXTRA; + + constexpr int CLEANUP_BNL=UNICODE_BIDI_CLEANUP_BNL; + + constexpr int CLEANUP_CANONICAL=UNICODE_BIDI_CLEANUP_CANONICAL; + } +} +#endif +#endif + extern size_t unicode_bidi_cleanup(char32_t *string, unicode_bidi_level_t *levels, size_t n, + int options, void (*removed_callback)(size_t, void *), void *); -extern size_t unicode_bidi_extra_cleanup(char32_t *string, - unicode_bidi_level_t *levels, - size_t n, - void (*removed_callback)(size_t, - void *), - void *); - extern void unicode_bidi_logical_order(char32_t *string, unicode_bidi_level_t *levels, size_t n, @@ -2147,7 +2196,8 @@ void bidi_reorder(std::vector<unicode_bidi_level_t> &levels, void bidi_cleanup(std::u32string &string, const std::function<void (size_t)> &removed_callback= - [](size_t) {}); + [](size_t) {}, + int cleanup_options=0); //! Also remove them from the embedding direction level buffer. @@ -2156,28 +2206,8 @@ void bidi_cleanup(std::u32string &string, int bidi_cleanup(std::u32string &string, std::vector<unicode_bidi_level_t> &levels, const std::function<void (size_t)> &removed_callback= - [](size_t) {}); - - -//! Remove directional markers and isolation markers. - -//! Removes them from the string, in place. Optional lambda gets notified -//! of the index (in the original string, of each removed marker. - -void bidi_extra_cleanup(std::u32string &string, - const std::function<void (size_t)> - &removed_callback= - [](size_t) {}); - -//! Also remove them from the embedding direction level buffer. - -//! Returns non-0 in case of non-matching level buffer size. 
- -int bidi_extra_cleanup(std::u32string &string, - std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t)> - &removed_callback= - [](size_t) {}); + [](size_t) {}, + int cleanup_options=0); //! Convert Unicode string from canonical rendering order to logical order. int bidi_logical_order(std::u32string &string, @@ -2189,8 +2219,7 @@ int bidi_logical_order(std::u32string &string, //! Convert Unicode string from canonical rendering order to logical order. void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels, unicode_bidi_level_t paragraph_embedding, - const std::function<void (size_t, size_t)> - &lambda); + const std::function<void (size_t, size_t)> &lambda); //! Embed directional and isolation markers diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 79c4db5..cfae12f 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -2032,6 +2032,7 @@ void unicode_bidi_reorder(char32_t *p, size_t unicode_bidi_cleanup(char32_t *string, unicode_bidi_level_t *levels, size_t n, + int cleanup_options, void (*removed_callback)(size_t, void *), void *arg) { @@ -2040,7 +2041,13 @@ size_t unicode_bidi_cleanup(char32_t *string, { enum_bidi_type_t cl=unicode_bidi_type(string[j]); - if (IS_X9(cl)) + if (cleanup_options & UNICODE_BIDI_CLEANUP_EXTRA + ? 
( + is_explicit_indicator_except_b(cl) || + (string[j] == UNICODE_LRM || + string[j] == UNICODE_RLM || + string[j] == UNICODE_ALM)) + : IS_X9(cl)) { if (removed_callback) (*removed_callback)(j, arg); @@ -2048,34 +2055,9 @@ size_t unicode_bidi_cleanup(char32_t *string, } if (levels) levels[i]=levels[j] & 1; - ++i; - } - return i; -} - -size_t unicode_bidi_extra_cleanup(char32_t *string, - unicode_bidi_level_t *levels, - size_t n, - void (*removed_callback)(size_t, void *), - void *arg) -{ - size_t i=0; - for (size_t j=0; j<n; ++j) - { - enum_bidi_type_t cl=unicode_bidi_type(string[j]); - if (is_explicit_indicator_except_b(cl) || - (string[j] == UNICODE_LRM || - string[j] == UNICODE_RLM || - string[j] == UNICODE_ALM)) - { - if (removed_callback) - (*removed_callback)(j, arg); - continue; - } - string[i]=cl == UNICODE_BIDI_TYPE_B ? '\n' : string[j]; - if (levels) - levels[i]=levels[j] & 1; + string[i]=(cleanup_options & UNICODE_BIDI_CLEANUP_BNL) + && cl == UNICODE_BIDI_TYPE_B ? '\n' : string[j]; ++i; } return i; diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index 4217630..a0d5ac4 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -691,7 +691,8 @@ extern "C" { } void unicode::bidi_cleanup(std::u32string &string, - const std::function<void (size_t)> &lambda) + const std::function<void (size_t)> &lambda, + int cleanup_options) { if (string.empty()) return; @@ -701,6 +702,7 @@ void unicode::bidi_cleanup(std::u32string &string, size_t n=unicode_bidi_cleanup(&string[0], 0, string.size(), + cleanup_options, removed_callback, reinterpret_cast<void *>(&cb)); cb.rethrow(); @@ -709,15 +711,20 @@ void unicode::bidi_cleanup(std::u32string &string, int unicode::bidi_cleanup(std::u32string &string, std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t)> &lambda) + const std::function<void (size_t)> &lambda, + int cleanup_options) { if (levels.size() != string.size()) return -1; + if (levels.size() == 0) + return 0; + cb_wrapper<void 
(size_t)> cb{lambda}; size_t n=unicode_bidi_cleanup(&string[0], &levels[0], string.size(), + cleanup_options, removed_callback, reinterpret_cast<void *>(&cb)); cb.rethrow(); @@ -727,42 +734,6 @@ int unicode::bidi_cleanup(std::u32string &string, return 0; } - -void unicode::bidi_extra_cleanup(std::u32string &string, - const std::function<void (size_t)> &lambda) -{ - if (string.empty()) - return; - - cb_wrapper<void (size_t)> cb{lambda}; - size_t n=unicode_bidi_extra_cleanup(&string[0], - 0, - string.size(), - removed_callback, - reinterpret_cast<void *>(&cb)); - cb.rethrow(); - string.resize(n); -} - -int unicode::bidi_extra_cleanup(std::u32string &string, - std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t)> &lambda) -{ - if (levels.size() != string.size()) - return -1; - - cb_wrapper<void (size_t)> cb{lambda}; - size_t n=unicode_bidi_extra_cleanup(&string[0], - &levels[0], - string.size(), - removed_callback, - reinterpret_cast<void *>(&cb)); - cb.rethrow(); - string.resize(n); - levels.resize(n); - return 0; -} - int unicode::bidi_logical_order(std::u32string &string, std::vector<unicode_bidi_level_t> &levels, unicode_bidi_level_t paragraph_embedding, |
