summaryrefslogtreecommitdiffstats
path: root/unicode
diff options
context:
space:
mode:
authorSam Varshavchik2021-02-24 07:48:49 -0500
committerSam Varshavchik2021-02-24 07:48:49 -0500
commit7f29205e16403e5c86613ca6a80366969b6c6e7f (patch)
tree3ba27e549001a3c10a4f9b0a70ad15fd21747623 /unicode
parent6e8ce4696bf8c05272a01dc55081fcc186e9e6ac (diff)
downloadcourier-libs-7f29205e16403e5c86613ca6a80366969b6c6e7f.tar.bz2
More unicode functions.
Diffstat (limited to 'unicode')
-rw-r--r--unicode/ChangeLog5
-rw-r--r--unicode/Makefile.am2
-rw-r--r--unicode/biditest2.C32
-rw-r--r--unicode/book.xml91
-rw-r--r--unicode/courier-unicode.h.in14
-rw-r--r--unicode/unicode_bidi.c43
-rw-r--r--unicode/unicodecpp.C23
7 files changed, 201 insertions, 9 deletions
diff --git a/unicode/ChangeLog b/unicode/ChangeLog
index fcb1c10..35cffe6 100644
--- a/unicode/ChangeLog
+++ b/unicode/ChangeLog
@@ -1,3 +1,8 @@
+2021-02-24 Sam Varshavchik <mrsam@courier-mta.com>
+
+ * Implement unicode_bidi_needs_embed(), unicode_bidi_cleaned_size(),
+ unicode::bidi_override, unicode::bidi_needs_embed.
+
2.2.1
2021-02-14 Sam Varshavchik <mrsam@courier-mta.com>
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index 5877d22..dc502b3 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -97,6 +97,7 @@ man_MANS= \
$(srcdir)/man/unicode[\:][\:]bidi_embed_paragraph_level.3 \
$(srcdir)/man/unicode[\:][\:]bidi_get_direction.3 \
$(srcdir)/man/unicode[\:][\:]bidi_logical_order.3 \
+ $(srcdir)/man/unicode[\:][\:]bidi_needs_embed.3 \
$(srcdir)/man/unicode[\:][\:]bidi_override.3 \
$(srcdir)/man/unicode[\:][\:]bidi_reorder.3 \
$(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert.3 \
@@ -127,6 +128,7 @@ man_MANS= \
$(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \
$(srcdir)/man/unicode_bidi_logical_order.3 \
$(srcdir)/man/unicode_bidi_mirror.3 \
+ $(srcdir)/man/unicode_bidi_needs_embed.3 \
$(srcdir)/man/unicode_bidi_reorder.3 \
$(srcdir)/man/unicode_bidi_setbnl.3 \
$(srcdir)/man/unicode_bidi_type.3 \
diff --git a/unicode/biditest2.C b/unicode/biditest2.C
index 6ab347b..a14b3ea 100644
--- a/unicode/biditest2.C
+++ b/unicode/biditest2.C
@@ -597,29 +597,47 @@ void null_character_test()
void direction_test()
{
static const struct {
- const char32_t *str;
+ std::u32string str;
unicode_bidi_level_t direction;
int is_explicit;
+ bool needs_embed;
} tests[]={
{
U"Hello",
UNICODE_BIDI_LR,
1,
+ true,
},
{
U" ",
UNICODE_BIDI_LR,
0,
+ true,
},
{
U"",
UNICODE_BIDI_LR,
0,
+ true,
},
{
U"שלום",
UNICODE_BIDI_RL,
1,
+ true,
+ },
+ {
+ U"Helloש",
+ UNICODE_BIDI_LR,
+ 1,
+ true,
+ },
+ {
+ U"Hello" + std::u32string{unicode::literals::LRO}
+ + U"ש",
+ UNICODE_BIDI_LR,
+ 1,
+ false,
},
};
@@ -633,6 +651,18 @@ void direction_test()
std::cerr << "direction_test failed\n";
exit(1);
}
+
+ std::u32string s=t.str;
+ auto levels=std::get<0>(unicode::bidi_calc(s, t.direction));
+ unicode::bidi_reorder(s, levels);
+ unicode::bidi_cleanup(s, levels);
+
+ if (unicode::bidi_needs_embed(s, levels, &t.direction)
+ != t.needs_embed)
+ {
+ std::cerr << "needs embed failed\n";
+ exit(1);
+ }
}
}
diff --git a/unicode/book.xml b/unicode/book.xml
index 0b45433..4f0fd71 100644
--- a/unicode/book.xml
+++ b/unicode/book.xml
@@ -336,6 +336,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
<refname>unicode_bidi_cleanup</refname>
<refname>unicode_bidi_cleaned_size</refname>
<refname>unicode_bidi_logical_order</refname>
+ <refname>unicode_bidi_needs_embed</refname>
<refname>unicode_bidi_embed</refname>
<refname>unicode_bidi_embed_paragraph_level</refname>
@@ -403,7 +404,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
</funcprototype>
<funcprototype>
- <funcdef>size_t <function>unicode_bidi_logical_order</function></funcdef>
+ <funcdef>void <function>unicode_bidi_logical_order</function></funcdef>
<paramdef>char32_t *<parameter>string</parameter></paramdef>
<paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
<paramdef>size_t <parameter>n</parameter></paramdef>
@@ -413,6 +414,14 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
</funcprototype>
<funcprototype>
+ <funcdef>int <function>unicode_bidi_needs_embed</function></funcdef>
+ <paramdef>const char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>const unicode_bidi_level_t *<parameter>paragraph_embedding</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
<funcdef>size_t <function>unicode_bidi_embed</function></funcdef>
<paramdef>const char32_t *<parameter>string</parameter></paramdef>
<paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
@@ -871,7 +880,8 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
with the <literal>UNICODE_BIDI_CLEANUP_CANONICAL</literal>
are in
<quote>canonical rendering order</quote>.
- <function>unicode_bidi_logical_order</function>() and
+ <function>unicode_bidi_logical_order</function>(),
+ <function>unicode_bidi_needs_embed</function>() and
<function>unicode_bidi_embed</function>() require the
canonical rendering order for their string and embedding level
values.
@@ -886,8 +896,9 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
<refsect2 id="unicode_bidi_embed">
<title>Embedding bi-directional markers in Unicode text strings</title>
<para>
- <function>unicode_bidi_logical_order</function>() and
- <function>unicode_bidi_embed</function>() add various
+ <function>unicode_bidi_logical_order</function>() rearranges
+ the string from rendering to its logical order.
+ <function>unicode_bidi_embed</function>() adds various
bi-directional markers to a Unicode string in canonical rendering
order. The resulting string is not guaranteed to be
identical to the
@@ -901,12 +912,18 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
<function>unicode_bidi_cleanup()</function>
(with the canonical option),
with the same paragraph_embedding level.
+ <function>unicode_bidi_needs_embed</function>() attempts to
+ heuristically determine whether
+ <function>unicode_bidi_embed</function>() is required.
</para>
<para>
<function>unicode_bidi_logical_order</function>() gets called
first, followed by
- <function>unicode_bidi_embed</function>().
+ <function>unicode_bidi_embed</function>()
+ (or
+ <function>unicode_bidi_needs_embed</function>() in order to
+ determine whether bi-directional markers are required).
Finally, <function>unicode_bidi_embed_paragraph_level</function>()
optionally determines whether the resulting string's default
paragraph embedding level matches the one used for the actual
@@ -963,12 +980,12 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
<itemizedlist>
<listitem>
<para>
- The Unicode string, and &hellip;
+ The Unicode string.
</para>
</listitem>
<listitem>
<para>
- &hellip; the directional embedding buffer, in canonical
+ The directional embedding buffer, in canonical
rendering order.
</para>
</listitem>
@@ -1080,6 +1097,53 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
</para>
</listitem>
</itemizedlist>
+
+ <para>
+ <function>unicode_bidi_needs_embed</function>() attempts to
+ heuristically determine whether the Unicode string, in logical
+ order, requires bi-directional markers.
+ The parameters to
+ <function>unicode_bidi_needs_embed</function>() are:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ The Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The directional embedding buffer, in logical
+ order.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The size of the string and the embedding level buffer.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ A pointer to an explicit paragraph embedding level, either
+ <literal>UNICODE_BIDI_LR</literal> or
+ <literal>UNICODE_BIDI_RL</literal>; or a
+ <literal>NULL</literal> pointer (see
+ <function>unicode_bidi_calc_types</function>()'s
+ explanation for this parameter).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ <function>unicode_bidi_needs_embed</function>() returns 0
+ if the Unicode string does not need explicit directional
+ markers, or 1 if it does. This is done by using
+ <function>unicode_bidi_calc</function>(),
+ <function>unicode_bidi_reorder</function>(),
+ <function>unicode_bidi_cleanup</function>(),
+ <function>unicode_bidi_logical_order</function>() and then
+ checking if the end result is different from what was passed
+ in.
+ </para>
</refsect2>
<refsect2 id="unicode_bidi_misc">
<title>Miscellaneous utility functions</title>
@@ -2919,6 +2983,7 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
<refname>unicode::bidi_reorder</refname>
<refname>unicode::bidi_cleanup</refname>
<refname>unicode::bidi_logical_order</refname>
+ <refname>unicode::bidi_needs_embed</refname>
<refname>unicode::bidi_embed</refname>
<refname>unicode::bidi_embed_paragraph_level</refname>
<refname>unicode::bidi_get_direction</refname>
@@ -3026,6 +3091,15 @@ make install DESTDIR=/tmp/courier-unicode-instimage # For example.</programlisti
</funcprototype>
<funcprototype>
+ <funcdef>bool <function>unicode::bidi_needs_embed</function></funcdef>
+ <paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>const std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>const unicode_bidi_level_t *<parameter>paragraph_embedding</parameter>=NULL</paramdef>
+ <paramdef>size_t <parameter>starting_pos</parameter>=0</paramdef>
+ <paramdef>size_t <parameter>n</parameter>=(size_t)-1</paramdef>
+ </funcprototype>
+
+ <funcprototype>
<funcdef>int <function>unicode::bidi_embed</function></funcdef>
<paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
<paramdef>const std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
@@ -3196,7 +3270,8 @@ auto [levels, direction]=unicode::bidi_calc(types);
<para>
<function>unicode::bidi_reorder</function>,
<function>unicode::bidi_cleanup</function>,
- <function>unicode::bidi_logical_order</function> and
+ <function>unicode::bidi_logical_order</function>,
+ <function>unicode::bidi_needs_embed</function> and
<function>unicode::bidi_get_direction</function>
take two optional
parameters (defaulted values or overloaded) specifying
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index a1a502c..2999ee3 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -717,6 +717,12 @@ extern void unicode_bidi_logical_order(char32_t *string,
void *),
void *arg);
+/*
+** Heuristic check related to whether a string, in logical order, needs
+** explicit bi-directional markers.
+**
+** string, levels: n characters and their n embedding levels.
+** n: number of entries in both string and levels.
+** paragraph_embedding: pointer to an explicit paragraph embedding level
+** (UNICODE_BIDI_LR or UNICODE_BIDI_RL), or a null pointer.
+**
+** NOTE(review): the reference manual documents the return value as 0 when
+** no markers are needed and 1 when they are; the implementation appears to
+** return 1 when its recomputed string and levels match the input. Confirm
+** the intended polarity before relying on it.
+*/
+extern int unicode_bidi_needs_embed(const char32_t *string,
+ const unicode_bidi_level_t *levels,
+ size_t n,
+ const unicode_bidi_level_t *
+ paragraph_embedding);
+
extern void unicode_bidi_embed(const char32_t *string,
const unicode_bidi_level_t *levels,
size_t n,
@@ -2328,6 +2334,14 @@ void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,
size_t starting_pos=0,
size_t n=(size_t)-1);
+//! Whether directional and isolation markers are needed.
+//!
+//! C++ wrapper for unicode_bidi_needs_embed(). Returns false when the
+//! string and levels differ in size, or when starting_pos is at or past
+//! the end. n is clamped to what remains after starting_pos.
+//! paragraph_embedding optionally points to an explicit paragraph
+//! embedding level; a null pointer means none was given.
+
+bool bidi_needs_embed(const std::u32string &string,
+ const std::vector<unicode_bidi_level_t> &levels,
+ const unicode_bidi_level_t *paragraph_embedding=0,
+ size_t starting_pos=0,
+ size_t n=(size_t)-1);
+
//! Embed directional and isolation markers
//! Non-0 return value indicates the string and levels' sizes do not match.
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c
index 1aa4a88..772f9fe 100644
--- a/unicode/unicode_bidi.c
+++ b/unicode/unicode_bidi.c
@@ -2310,6 +2310,49 @@ static void emit_marker(struct bidi_embed_levelrun *p,
}
}
+/*
+** Round-trip check on a string and its embedding levels.
+**
+** A private copy of the string is run through unicode_bidi_calc() (using
+** paragraph_level, which may be a null pointer), unicode_bidi_reorder() and
+** unicode_bidi_cleanup() with no options. If cleanup removed nothing and
+** the calculated direction agrees with the explicit paragraph_level (when
+** one was given), the copy is converted back with
+** unicode_bidi_logical_order() and compared with the caller's string and
+** levels.
+**
+** Returns 1 when the recomputed string and levels are identical to the
+** input, 0 otherwise. Calls abort() if memory cannot be allocated.
+**
+** NOTE(review): the reference manual describes 0 as "no markers needed"
+** and 1 as "markers needed", which looks like the opposite of what this
+** comparison yields. direction_test currently expects the values produced
+** here, so the polarity is deliberately left unchanged; confirm intent.
+*/
+int unicode_bidi_needs_embed(const char32_t *string,
+			     const unicode_bidi_level_t *levels,
+			     size_t n,
+			     const unicode_bidi_level_t *paragraph_level)
+{
+	/*
+	** malloc(0) is allowed to return a null pointer. Always ask for at
+	** least one element, so that a null pointer only ever means that
+	** memory was exhausted, and an empty string does not abort().
+	*/
+	size_t alloc_n=n ? n:1;
+	char32_t *string_cpy=(char32_t *)malloc(alloc_n * sizeof(char32_t));
+	unicode_bidi_level_t *levels_cpy=(unicode_bidi_level_t *)
+		malloc(alloc_n * sizeof(unicode_bidi_level_t));
+	size_t nn;
+	int ret;
+
+	if (!string_cpy || !levels_cpy)
+		abort();
+
+	/* The caller's pointers may be null when n is 0, do not touch them. */
+	if (n)
+		memcpy(string_cpy, string, n * sizeof(char32_t));
+
+	struct unicode_bidi_direction direction=
+		unicode_bidi_calc(string_cpy, n,
+				  levels_cpy, paragraph_level);
+
+	unicode_bidi_reorder(string_cpy, levels_cpy, n, NULL, NULL);
+	nn=unicode_bidi_cleanup(string_cpy, levels_cpy, n, 0,
+				NULL, NULL);
+
+	ret=0;
+	if (n == nn && (paragraph_level == NULL ||
+			direction.direction == *paragraph_level))
+	{
+		unicode_bidi_logical_order(string_cpy, levels_cpy, nn,
+					   direction.direction,
+					   NULL, NULL);
+
+		/* An empty input trivially matches itself. */
+		if (n == 0 ||
+		    (memcmp(string_cpy, string,
+			    n * sizeof(char32_t)) == 0 &&
+		     memcmp(levels_cpy, levels,
+			    n * sizeof(unicode_bidi_level_t)) == 0))
+		{
+			ret=1;
+		}
+	}
+	free(string_cpy);
+	free(levels_cpy);
+	return ret;
+}
+
void unicode_bidi_embed(const char32_t *string,
const unicode_bidi_level_t *levels,
size_t n,
diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C
index 04d9879..7bb6edc 100644
--- a/unicode/unicodecpp.C
+++ b/unicode/unicodecpp.C
@@ -950,6 +950,29 @@ unicode_bidi_direction unicode::bidi_get_direction(const std::u32string &string,
return unicode_bidi_get_direction(string.c_str()+starting_pos, n);
}
+// C++ wrapper for unicode_bidi_needs_embed().
+//
+// Returns false if string and levels differ in size, or if starting_pos is
+// at or past the end. Otherwise n is clamped to the number of characters
+// remaining after starting_pos, and that range of both the string and the
+// levels is handed to unicode_bidi_needs_embed(); a non-zero result is
+// returned as true.
+
+bool unicode::bidi_needs_embed(const std::u32string &string,
+			       const std::vector<unicode_bidi_level_t> &levels,
+			       const unicode_bidi_level_t *paragraph_embedding,
+			       size_t starting_pos,
+			       size_t n)
+{
+	if (string.size() != levels.size())
+		return false;
+
+	auto s=levels.size();
+
+	if (starting_pos >= s)
+		return false;
+
+	if (n > s-starting_pos)
+		n=s-starting_pos;
+
+	// Both buffers must start at the same offset. starting_pos < s
+	// here, so both pointers are valid even when n is 0.
+	return unicode_bidi_needs_embed(string.c_str()+starting_pos,
+					levels.data()+starting_pos,
+					n,
+					paragraph_embedding) != 0;
+}
+
std::u32string unicode::bidi_override(const std::u32string &s,
unicode_bidi_level_t direction,
int cleanup_options)