summaryrefslogtreecommitdiffstats
path: root/unicode
diff options
context:
space:
mode:
authorSam Varshavchik2020-11-29 08:41:57 -0500
committerSam Varshavchik2020-11-30 19:31:57 -0500
commit844f6a9ef755c1c5826c9583b364af08b54a4dcc (patch)
tree10f0af36c609cad9953f7a736e11a2f2e8d8b897 /unicode
parentf2db409949ad94d4fc175d04ebd72bda3bd1df4e (diff)
downloadcourier-libs-844f6a9ef755c1c5826c9583b364af08b54a4dcc.tar.bz2
Combine cleanup functions, add unicode::literals namespace.
Diffstat (limited to 'unicode')
-rw-r--r--unicode/Makefile.am5
-rw-r--r--unicode/biditest2.C13
-rw-r--r--unicode/book.xml179
-rw-r--r--unicode/courier-unicode.h.in93
-rw-r--r--unicode/unicode_bidi.c38
-rw-r--r--unicode/unicodecpp.C47
6 files changed, 211 insertions, 164 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index f864e2d..dbc71aa 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -89,11 +89,11 @@ include_HEADERS=courier-unicode.h \
man_MANS= \
$(srcdir)/man/courier-unicode.7 \
+ $(srcdir)/man/unicode\:\:bidi.3 \
$(srcdir)/man/unicode\:\:bidi_calc.3 \
$(srcdir)/man/unicode\:\:bidi_cleanup.3 \
$(srcdir)/man/unicode\:\:bidi_embed.3 \
$(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \
- $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \
$(srcdir)/man/unicode\:\:bidi_logical_order.3 \
$(srcdir)/man/unicode\:\:bidi_reorder.3 \
$(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \
@@ -118,7 +118,6 @@ man_MANS= \
$(srcdir)/man/unicode_bidi_cleanup.3 \
$(srcdir)/man/unicode_bidi_embed.3 \
$(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \
- $(srcdir)/man/unicode_bidi_extra_cleanup.3 \
$(srcdir)/man/unicode_bidi_logical_order.3 \
$(srcdir)/man/unicode_bidi_mirror.3 \
$(srcdir)/man/unicode_bidi_reorder.3 \
@@ -515,4 +514,4 @@ distrelease:
$(MAKE) dist
www:
- rsync -a html/. $$HOME/www/hostrocket/courier-mta.org/unicode
+ rsync -a --delete-after html/. $$HOME/www/hostrocket/courier-mta.org/unicode
diff --git a/unicode/biditest2.C b/unicode/biditest2.C
index a9ab87d..ded76be 100644
--- a/unicode/biditest2.C
+++ b/unicode/biditest2.C
@@ -307,7 +307,9 @@ void character_test()
exit(1);
}
- unicode::bidi_extra_cleanup(s, levels);
+ unicode::bidi_cleanup(s, levels,
+ [](size_t) {},
+ UNICODE_BIDI_CLEANUP_CANONICAL);
auto dump_ls=
[&]
@@ -371,8 +373,13 @@ void character_test()
}
unicode::bidi_reorder(new_string, std::get<0>(ret));
- unicode::bidi_extra_cleanup(new_string,
- std::get<0>(ret));
+ unicode::bidi_cleanup(new_string,
+ std::get<0>(ret),
+ []
+ (size_t)
+ {
+ },
+ UNICODE_BIDI_CLEANUP_CANONICAL);
/* New string is now back in logical order */
diff --git a/unicode/book.xml b/unicode/book.xml
index c8948ba..b0342ea 100644
--- a/unicode/book.xml
+++ b/unicode/book.xml
@@ -304,7 +304,6 @@ See COPYING for distribution information.
<refname>unicode_bidi_calc</refname>
<refname>unicode_bidi_reorder</refname>
<refname>unicode_bidi_cleanup</refname>
- <refname>unicode_bidi_extra_cleanup</refname>
<refname>unicode_bidi_logical_order</refname>
<refname>unicode_bidi_embed</refname>
<refname>unicode_bidi_embed_paragraph_level</refname>
@@ -341,15 +340,7 @@ See COPYING for distribution information.
<paramdef>char32_t *<parameter>string</parameter></paramdef>
<paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
<paramdef>size_t <parameter>n</parameter></paramdef>
- <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef>
- <paramdef>void *<parameter>arg</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>size_t <function>unicode_bidi_extra_cleanup</function></funcdef>
- <paramdef>char32_t *<parameter>string</parameter></paramdef>
- <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
- <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>int <parameter>options</parameter></paramdef>
<paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef>
<paramdef>void *<parameter>arg</parameter></paramdef>
</funcprototype>
@@ -450,8 +441,7 @@ See COPYING for distribution information.
</listitem>
<listitem>
<para>
- Use <function>unicode_bidi_cleanup</function>() or
- <function>unicode_bidi_extra_cleanup</function>(),
+ Use <function>unicode_bidi_cleanup</function>()
to remove the characters from the string which are used
by the bi-directional algorithm, and are not needed for
rendering the text.
@@ -585,28 +575,12 @@ See COPYING for distribution information.
<quote>rendering order</quote>, but still contain bi-directional
embedding, override, boundary-neutral, isolate, and marker
characters.
- <function>unicode_bidi_cleanup</function>() and
- <function>unicode_bidi_extra_cleanup</function>() remove these
- characters and directional markers from the unicode string.
- <function>unicode_bidi_cleanup</function> removes only the
- embedding, override, and boundry-neutral characters (as
- specified by step X9 of the bi-directional algorithm).
- <function>unicode_bidi_extra_cleanup</function>()
- additionally removes the isolation markers, implicit markers;
- and all characters
- classified as paragraph separators get replaced by a newline.
- </para>
- <para>
- A non-null pointer to the directional embedding level buffer,
- of the same size as the string, also removes the corresponding
- values from the buffer, and the remaining values in the
- embedding level buffer get reset to
- levels <literal>UNICODE_BIDI_LR</literal> and
- <literal> UNICODE_BIDI_RL</literal>, only.
- </para>
+ <function>unicode_bidi_cleanup</function>
+ removes these characters and directional markers.
+ </para>
<para>
- The parameters to <function>unicode_bidi_cleanup</function>() and
- <function>unicode_bidi_extra_cleanup</function>() are:
+ The parameters to <function>unicode_bidi_cleanup</function>()
+ are:
</para>
<itemizedlist>
@@ -617,15 +591,66 @@ See COPYING for distribution information.
</listitem>
<listitem>
<para>
- The pointer to the directional embedding buffer.
- </para>
+ A non-null pointer to the directional embedding level buffer,
+ of the same size as the string, also removes the corresponding
+ values from the buffer, and the remaining values in the
+ embedding level buffer get reset to
+ levels <literal>UNICODE_BIDI_LR</literal> and
+ <literal> UNICODE_BIDI_RL</literal>, only.
+ </para>
</listitem>
+
<listitem>
<para>
The size of the unicode string and the directional embedding
- buffer.
+ buffer (if not NULL).
</para>
</listitem>
+
+ <listitem>
+ <para>
+ A bitmask that selects the following options
+ (or 0 if no options):
+ </para>
+
+ <variablelist>
+ <varlistentry>
+ <term><literal>UNICODE_BIDI_CLEANUP_EXTRA</literal></term>
+ <listitem>
+ <para>
+ In addition to removing all embedding, override, and
+ boundary-neutral characters as
+ specified by step X9 of the bi-directional algorithm
+ (the default behavior without this flag), also
+ remove all isolation markers and implicit markers.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>UNICODE_BIDI_CLEANUP_BNL</literal></term>
+ <listitem>
+ <para>
+ Replace all characters classified as paragraph
+ separators with a newline character.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal></term>
+ <listitem>
+ <para>
+ A combined set of
+ <literal>UNICODE_BIDI_CLEANUP_EXTRA</literal>
+ and
+ <literal>UNICODE_BIDI_CLEANUP_BNL</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </listitem>
+
<listitem>
<para>
A pointer to a function that gets repeatedly invoked with the
@@ -647,17 +672,17 @@ See COPYING for distribution information.
from the first to
the last removed character (if any).
</para>
- <para>
- Multiple calls to <function>unicode_bidi_cleanup</function>() or
- <function>unicode_bidi_extra_cleanup</function>() do no harm;
- except that <function>unicode_bidi_extra_cleanup</function>()
- always removes all the additional characters that
- <function>unicode_bidi_cleanup</function>() does not remove.
- </para>
+
<para>
The character string and the embedding level values resulting
- from <function>unicode_bidi_extra_cleanup</function>() are in
+ from <function>unicode_bidi_cleanup</function>()
+ with the <literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal> option
+ are in
<quote>canonical rendering order</quote>.
+ <function>unicode_bidi_logical_order</function>() and
+ <function>unicode_bidi_embed</function>() require the
+ canonical rendering order for their string and embedding level
+ values.
</para>
</refsect2>
@@ -675,7 +700,8 @@ See COPYING for distribution information.
canonical rendering order after applying
<function>unicode_bidi_calc()</function>,
<function>unicode_reorder()</function> and
- <function>unicode_bidi_extra_cleanup()</function>,
+ <function>unicode_bidi_cleanup()</function>
+ (with the canonical option),
with the same paragraph_embedding level.
</para>
@@ -2628,15 +2654,15 @@ See COPYING for distribution information.
<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
- <refentrytitle>unicode::bidi::calc</refentrytitle>
+ <refentrytitle>unicode::bidi</refentrytitle>
<manvolnum>3</manvolnum>
</refmeta>
<refnamediv>
+ <refname>unicode::bidi</refname>
<refname>unicode::bidi_calc</refname>
<refname>unicode::bidi_reorder</refname>
<refname>unicode::bidi_cleanup</refname>
- <refname>unicode::bidi_extra_cleanup</refname>
<refname>unicode::bidi_logical_order</refname>
<refname>unicode::bidi_embed</refname>
<refname>unicode::bidi_embed_paragraph_level</refname>
@@ -2674,6 +2700,7 @@ See COPYING for distribution information.
<funcdef>void <function>unicode::bidi_cleanup</function></funcdef>
<paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
<paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
+ <paramdef>int <parameter>cleanup_options</parameter></paramdef>
</funcprototype>
<funcprototype>
@@ -2681,19 +2708,7 @@ See COPYING for distribution information.
<paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
<paramdef>std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
<paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>void <function>unicode::bidi_extra_cleanup</function></funcdef>
- <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
- <paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode::bidi_extra_cleanup</function></funcdef>
- <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
- <paramdef>std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
- <paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
+ <paramdef>int <parameter>cleanup_options</parameter></paramdef>
</funcprototype>
<funcprototype>
@@ -2789,7 +2804,51 @@ See COPYING for distribution information.
</para>
</listitem>
</itemizedlist>
+
+ <refsect2 id="unicode_cpp_bidi_literals">
+ <title><literal>unicode::literals</literal> namespace</title>
+
+ <blockquote>
+ <informalexample>
+ <programlisting><![CDATA[
+using namespace unicode::literals;
+
+std::u32string foo(std::u32string bar)
+{
+ return bar + LRO;
+}
+]]></programlisting>
+ </informalexample>
+ </blockquote>
+
+ <para>
+ This namespace contains the following <literal>constexpr</literal>
+ definitions:
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ <classname>char32_t</classname> arrays with literal
+ Unicode character strings containing Unicode directional,
+ isolate, and override markers, like
+ <literal>LRO</literal>,
+ <literal>RLO</literal> and others.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>CLEANUP_EXTRA</literal>,
+ <literal>CLEANUP_BNL</literal>, and
+ <literal>CLEANUP_CANONICAL</literal> options for
+ <function>unicode::bidi_cleanup</function>().
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ </refsect2>
</refsect1>
+
<refsect1 id="unicode_cpp_bidi_seealso">
<title>SEE ALSO</title>
<para>
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index cc9dbbb..3de76d3 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -548,6 +548,24 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i);
#define UNICODE_LRO 0x202d /* Left-to-right override */
#define UNICODE_PDF 0x202c /* Pop directional override */
+#ifdef __cplusplus
+#if __cplusplus >= 201103L
+namespace unicode {
+ namespace literals {
+
+ constexpr char32_t LRM[]={UNICODE_LRM, 0};
+ constexpr char32_t RLM[]={UNICODE_RLM, 0};
+ constexpr char32_t ALM[]={UNICODE_ALM, 0};
+ constexpr char32_t LRI[]={UNICODE_LRI, 0};
+ constexpr char32_t RLI[]={UNICODE_RLI, 0};
+ constexpr char32_t PDI[]={UNICODE_PDI, 0};
+ constexpr char32_t RLO[]={UNICODE_RLO, 0};
+ constexpr char32_t LRO[]={UNICODE_LRO, 0};
+ constexpr char32_t PDF[]={UNICODE_PDF, 0};
+ }
+}
+#endif
+#endif
typedef char unicode_bidi_bracket_type_t;
@@ -608,19 +626,50 @@ typedef enum {
extern enum_bidi_type_t unicode_bidi_type(char32_t c);
+/* Bitmask options to unicode_bidi_cleanup */
+
+/*
+ In addition to removing embedding, override, and boundary-neutral
+ characters also remove isolation markers and implicit markers.
+*/
+
+#define UNICODE_BIDI_CLEANUP_EXTRA 1
+
+/*
+ Replace all characters classified as paragraph separators by a newline
+ character.
+*/
+
+#define UNICODE_BIDI_CLEANUP_BNL 2
+
+/*
+ Options for canonical rendering order.
+*/
+
+#define UNICODE_BIDI_CLEANUP_CANONICAL \
+ (UNICODE_BIDI_CLEANUP_EXTRA | UNICODE_BIDI_CLEANUP_BNL)
+
+#ifdef __cplusplus
+#if __cplusplus >= 201103L
+namespace unicode {
+ namespace literals {
+ constexpr int CLEANUP_EXTRA=UNICODE_BIDI_CLEANUP_EXTRA;
+
+ constexpr int CLEANUP_BNL=UNICODE_BIDI_CLEANUP_BNL;
+
+ constexpr int CLEANUP_CANONICAL=UNICODE_BIDI_CLEANUP_CANONICAL;
+ }
+}
+#endif
+#endif
+
extern size_t unicode_bidi_cleanup(char32_t *string,
unicode_bidi_level_t *levels,
size_t n,
+ int options,
void (*removed_callback)(size_t, void *),
void *);
-extern size_t unicode_bidi_extra_cleanup(char32_t *string,
- unicode_bidi_level_t *levels,
- size_t n,
- void (*removed_callback)(size_t,
- void *),
- void *);
-
extern void unicode_bidi_logical_order(char32_t *string,
unicode_bidi_level_t *levels,
size_t n,
@@ -2147,7 +2196,8 @@ void bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
void bidi_cleanup(std::u32string &string,
const std::function<void (size_t)> &removed_callback=
- [](size_t) {});
+ [](size_t) {},
+ int cleanup_options=0);
//! Also remove them from the embedding direction level buffer.
@@ -2156,28 +2206,8 @@ void bidi_cleanup(std::u32string &string,
int bidi_cleanup(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
const std::function<void (size_t)> &removed_callback=
- [](size_t) {});
-
-
-//! Remove directional markers and isolation markers.
-
-//! Removes them from the string, in place. Optional lambda gets notified
-//! of the index (in the original string, of each removed marker.
-
-void bidi_extra_cleanup(std::u32string &string,
- const std::function<void (size_t)>
- &removed_callback=
- [](size_t) {});
-
-//! Also remove them from the embedding direction level buffer.
-
-//! Returns non-0 in case of non-matching level buffer size.
-
-int bidi_extra_cleanup(std::u32string &string,
- std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t)>
- &removed_callback=
- [](size_t) {});
+ [](size_t) {},
+ int cleanup_options=0);
//! Convert Unicode string from canonical rendering order to logical order.
int bidi_logical_order(std::u32string &string,
@@ -2189,8 +2219,7 @@ int bidi_logical_order(std::u32string &string,
//! Convert Unicode string from canonical rendering order to logical order.
void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,
unicode_bidi_level_t paragraph_embedding,
- const std::function<void (size_t, size_t)>
- &lambda);
+ const std::function<void (size_t, size_t)> &lambda);
//! Embed directional and isolation markers
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c
index 79c4db5..cfae12f 100644
--- a/unicode/unicode_bidi.c
+++ b/unicode/unicode_bidi.c
@@ -2032,6 +2032,7 @@ void unicode_bidi_reorder(char32_t *p,
size_t unicode_bidi_cleanup(char32_t *string,
unicode_bidi_level_t *levels,
size_t n,
+ int cleanup_options,
void (*removed_callback)(size_t, void *),
void *arg)
{
@@ -2040,7 +2041,13 @@ size_t unicode_bidi_cleanup(char32_t *string,
{
enum_bidi_type_t cl=unicode_bidi_type(string[j]);
- if (IS_X9(cl))
+ if (cleanup_options & UNICODE_BIDI_CLEANUP_EXTRA
+ ? (
+ is_explicit_indicator_except_b(cl) ||
+ (string[j] == UNICODE_LRM ||
+ string[j] == UNICODE_RLM ||
+ string[j] == UNICODE_ALM))
+ : IS_X9(cl))
{
if (removed_callback)
(*removed_callback)(j, arg);
@@ -2048,34 +2055,9 @@ size_t unicode_bidi_cleanup(char32_t *string,
}
if (levels)
levels[i]=levels[j] & 1;
- ++i;
- }
- return i;
-}
-
-size_t unicode_bidi_extra_cleanup(char32_t *string,
- unicode_bidi_level_t *levels,
- size_t n,
- void (*removed_callback)(size_t, void *),
- void *arg)
-{
- size_t i=0;
- for (size_t j=0; j<n; ++j)
- {
- enum_bidi_type_t cl=unicode_bidi_type(string[j]);
- if (is_explicit_indicator_except_b(cl) ||
- (string[j] == UNICODE_LRM ||
- string[j] == UNICODE_RLM ||
- string[j] == UNICODE_ALM))
- {
- if (removed_callback)
- (*removed_callback)(j, arg);
- continue;
- }
- string[i]=cl == UNICODE_BIDI_TYPE_B ? '\n' : string[j];
- if (levels)
- levels[i]=levels[j] & 1;
+ string[i]=(cleanup_options & UNICODE_BIDI_CLEANUP_BNL)
+ && cl == UNICODE_BIDI_TYPE_B ? '\n' : string[j];
++i;
}
return i;
diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C
index 4217630..a0d5ac4 100644
--- a/unicode/unicodecpp.C
+++ b/unicode/unicodecpp.C
@@ -691,7 +691,8 @@ extern "C" {
}
void unicode::bidi_cleanup(std::u32string &string,
- const std::function<void (size_t)> &lambda)
+ const std::function<void (size_t)> &lambda,
+ int cleanup_options)
{
if (string.empty())
return;
@@ -701,6 +702,7 @@ void unicode::bidi_cleanup(std::u32string &string,
size_t n=unicode_bidi_cleanup(&string[0],
0,
string.size(),
+ cleanup_options,
removed_callback,
reinterpret_cast<void *>(&cb));
cb.rethrow();
@@ -709,15 +711,20 @@ void unicode::bidi_cleanup(std::u32string &string,
int unicode::bidi_cleanup(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t)> &lambda)
+ const std::function<void (size_t)> &lambda,
+ int cleanup_options)
{
if (levels.size() != string.size())
return -1;
+ if (levels.size() == 0)
+ return 0;
+
cb_wrapper<void (size_t)> cb{lambda};
size_t n=unicode_bidi_cleanup(&string[0],
&levels[0],
string.size(),
+ cleanup_options,
removed_callback,
reinterpret_cast<void *>(&cb));
cb.rethrow();
@@ -727,42 +734,6 @@ int unicode::bidi_cleanup(std::u32string &string,
return 0;
}
-
-void unicode::bidi_extra_cleanup(std::u32string &string,
- const std::function<void (size_t)> &lambda)
-{
- if (string.empty())
- return;
-
- cb_wrapper<void (size_t)> cb{lambda};
- size_t n=unicode_bidi_extra_cleanup(&string[0],
- 0,
- string.size(),
- removed_callback,
- reinterpret_cast<void *>(&cb));
- cb.rethrow();
- string.resize(n);
-}
-
-int unicode::bidi_extra_cleanup(std::u32string &string,
- std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t)> &lambda)
-{
- if (levels.size() != string.size())
- return -1;
-
- cb_wrapper<void (size_t)> cb{lambda};
- size_t n=unicode_bidi_extra_cleanup(&string[0],
- &levels[0],
- string.size(),
- removed_callback,
- reinterpret_cast<void *>(&cb));
- cb.rethrow();
- string.resize(n);
- levels.resize(n);
- return 0;
-}
-
int unicode::bidi_logical_order(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
unicode_bidi_level_t paragraph_embedding,