summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSam Varshavchik2020-07-12 09:44:24 -0400
committerSam Varshavchik2020-08-02 14:56:50 -0400
commitd2915c9cadf6fbc5ae29ffc387cce987b88dbbe0 (patch)
treef76c8edf36fb84c6e082f2a4ae9798b10aeda70e
parent51471a4d8b177adfcd40c145a809193a4ab9bd8d (diff)
downloadcourier-libs-d2915c9cadf6fbc5ae29ffc387cce987b88dbbe0.tar.bz2
Add additional bi-directional related algorithm.
Cleanup, remove markers, via unicode_bidi_cleanup() and unicode_bidi_extra_cleanup(). Re-embed directional markers, via unicode_bidi_logical_order(), unicode_bidi_embed() and unicode_bidi_embed_paragraph_level().
-rw-r--r--unicode/Makefile.am11
-rw-r--r--unicode/README4
-rw-r--r--unicode/biditest.C16
-rw-r--r--unicode/biditest2.C289
-rw-r--r--unicode/book.xml796
-rw-r--r--unicode/courier-unicode.h.in203
-rw-r--r--unicode/docbook/book.css2
-rw-r--r--unicode/unicode_bidi.c919
-rw-r--r--unicode/unicodecpp.C197
9 files changed, 2108 insertions, 329 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index 8ac6fb1..f864e2d 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -90,6 +90,11 @@ include_HEADERS=courier-unicode.h \
man_MANS= \
$(srcdir)/man/courier-unicode.7 \
$(srcdir)/man/unicode\:\:bidi_calc.3 \
+ $(srcdir)/man/unicode\:\:bidi_cleanup.3 \
+ $(srcdir)/man/unicode\:\:bidi_embed.3 \
+ $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \
+ $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \
+ $(srcdir)/man/unicode\:\:bidi_logical_order.3 \
$(srcdir)/man/unicode\:\:bidi_reorder.3 \
$(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \
$(srcdir)/man/unicode\:\:iconvert\:\:convert_tocase.3 \
@@ -110,8 +115,14 @@ man_MANS= \
$(srcdir)/man/unicode_bidi.3 \
$(srcdir)/man/unicode_bidi_bracket_type.3 \
$(srcdir)/man/unicode_bidi_calc.3 \
+ $(srcdir)/man/unicode_bidi_cleanup.3 \
+ $(srcdir)/man/unicode_bidi_embed.3 \
+ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \
+ $(srcdir)/man/unicode_bidi_extra_cleanup.3 \
+ $(srcdir)/man/unicode_bidi_logical_order.3 \
$(srcdir)/man/unicode_bidi_mirror.3 \
$(srcdir)/man/unicode_bidi_reorder.3 \
+ $(srcdir)/man/unicode_bidi_type.3 \
$(srcdir)/man/unicode_canonical.3 \
$(srcdir)/man/unicode_category_lookup.3 \
$(srcdir)/man/unicode_convert.3 \
diff --git a/unicode/README b/unicode/README
index 926e004..9994cc9 100644
--- a/unicode/README
+++ b/unicode/README
@@ -16,12 +16,12 @@ Courier Unicode Library
COPYING
This library implements several algorithms related to the Unicode
- Standard:
+ Standard, notably:
* Look up uppercase, lowercase, and titlecase equivalents of a unicode
character.
- * Implementation of grapheme and work breaking rules.
+ * Implementation of grapheme and word breaking rules.
* Implementation of line breaking rules.
diff --git a/unicode/biditest.C b/unicode/biditest.C
index 2d2a6e5..1aa2c63 100644
--- a/unicode/biditest.C
+++ b/unicode/biditest.C
@@ -8,6 +8,7 @@
#include <utility>
#include <iomanip>
#include <numeric>
+#include <unistd.h>
std::vector<std::string> testcase;
@@ -53,11 +54,11 @@ int main(int argc, char **argv)
{
buf.clear();
- if (std::getline(fp, buf).eof() && buf.empty())
- break;
+ bool iseof=std::getline(fp, buf).eof() && buf.empty();
- if (++linenum >= nextlogline)
+ if (iseof || ++linenum >= nextlogline)
{
+ alarm(300);
std::cout << logmsg;
std::ostringstream o;
@@ -72,7 +73,8 @@ int main(int argc, char **argv)
nextlogline += 20000;
}
-
+ if (iseof)
+ break;
buf.erase(std::find(buf.begin(), buf.end(), '#'), buf.end());
if (buf.substr(0, 8) == "@Levels:")
@@ -334,11 +336,7 @@ int main(int argc, char **argv)
n >>= 1;
}
}
-
- std::cout << logmsg;
-
- std::fill(logmsg.begin(), logmsg.end(), ' ');
- std::cout << logmsg << std::endl;
+ std::cout << std::endl;
return 0;
}
diff --git a/unicode/biditest2.C b/unicode/biditest2.C
index f497bcf..cfa0e50 100644
--- a/unicode/biditest2.C
+++ b/unicode/biditest2.C
@@ -1,42 +1,110 @@
#include "unicode_config.h"
#include "courier-unicode.h"
#include <iostream>
+#include <iterator>
#include <sstream>
#include <fstream>
#include <cstdint>
#include <iomanip>
+#include <algorithm>
+#include <unistd.h>
FILE *DEBUGDUMP;
-int main(int argc, char **argv)
+#define BIDI_DEBUG
+
+extern "C" {
+#if 0
+}
+#endif
+
+#include "unicode_bidi.c"
+
+}
+
+void latin_test()
{
- std::ifstream fp("BidiCharacterTest.txt");
+ for (char32_t c=32; c<256; c++)
+ {
+ std::u32string s;
- if (!fp.is_open())
+ s += c;
+
+ std::vector<unicode_bidi_level_t> levels={UNICODE_BIDI_LR};
+
+ auto new_string=unicode::bidi_embed(s, levels,
+ UNICODE_BIDI_LR);
+
+ if (new_string != s)
+ {
+ std::cerr << "Character " << (int)c
+ << " does not work." << std::endl;
+ exit(1);
+ }
+ }
+
+ std::u32string s;
+ std::vector<unicode_bidi_level_t> levels;
+
+ for (char32_t c=32; c<256; c++)
{
- std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl;
+ s += c;
+ levels.push_back(UNICODE_BIDI_LR);
+ }
+
+ auto new_string=unicode::bidi_embed(s, levels,
+ UNICODE_BIDI_LR);
+
+ if (new_string != s)
+ {
+ std::cerr << "iso-8859-1 string does not work."
+ << std::endl;
exit(1);
}
+}
- DEBUGDUMP=fopen("/dev/null", "w");
- if (!DEBUGDUMP)
+void character_test()
+{
+ std::ifstream fp("BidiCharacterTest.txt");
+
+ if (!fp.is_open())
{
- perror("/dev/null");
+ std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl;
exit(1);
}
std::string buf;
size_t linenum=0;
+ size_t nextlogline=0;
+ std::string logmsg;
while (1)
{
buf.clear();
- if (std::getline(fp, buf).eof() && buf.empty())
- break;
- ++linenum;
+ bool iseof=std::getline(fp, buf).eof() && buf.empty();
+
+ if (iseof || ++linenum >= nextlogline)
+ {
+ alarm(300);
+ std::cout << logmsg;
+
+ std::ostringstream o;
+ o << std::setw(6) << linenum << " lines processed... ";
+
+ logmsg=o.str();
+
+ std::cout << logmsg << std::flush;
+
+ std::fill(logmsg.begin(), logmsg.end(), '\b');
+
+ nextlogline += 20000;
+ }
+
+ if (iseof)
+ break;
auto p=buf.find('#');
if (p != buf.npos)
@@ -187,17 +255,202 @@ int main(int argc, char **argv)
std::cerr << std::endl;
exit(1);
}
- }
- return 0;
-}
-#define BIDI_DEBUG
+ std::vector<size_t> actual_render_order;
+
+ size_t n=0;
+
+ std::generate_n(std::back_inserter(actual_render_order),
+ s.size(),
+ [&] { return n++; });
+
+ unicode::bidi_reorder
+ (s, levels,
+ [&]
+ (size_t index,
+ size_t n)
+ {
+ auto b=actual_render_order.begin();
+ std::reverse(b+index, b+index+n);
+ });
+
+ n=0;
+ unicode::bidi_cleanup
+ (s, levels,
+ [&]
+ (size_t i)
+ {
+ actual_render_order.erase
+ (actual_render_order.begin()+i-n);
+ ++n;
+ });
+
+ if (render_order != actual_render_order)
+ {
+ std::cerr << "Regression, line "
+ << linenum
+ << ": render order"
+ << std::endl
+ << " Expected:";
+ for (auto n:render_order)
+ {
+ std::cerr << " " << n;
+ }
+ std::cerr << std::endl
+ << " Actual:";
-extern "C" {
-#if 0
+ for (auto n:actual_render_order)
+ {
+ std::cerr << " " << n;
+ }
+ std::cerr << std::endl;
+ exit(1);
+ }
+
+ unicode::bidi_extra_cleanup(s, levels);
+
+ auto dump_ls=
+ [&]
+ (const std::u32string &s,
+ const std::vector<unicode_bidi_level_t> &l)
+ {
+ for (size_t i=0; i<s.size(); ++i)
+ {
+ std::cerr << " " << std::hex
+ << std::setw(4)
+ << std::setfill('0')
+ << s[i] << "/"
+ << std::dec
+ << (int)l[i];
+ }
+ };
+
+ for (int pass=0; pass<4; pass++)
+ {
+ int paragraph=pass & 1;
+ int use_default=pass & 2;
+
+ for (size_t i=0; i<s.size(); ++i)
+ {
+ /* L1 */
+ switch (unicode_bidi_type(s[i])) {
+ case UNICODE_BIDI_TYPE_S:
+ case UNICODE_BIDI_TYPE_B:
+ levels.at(i)=paragraph;
+ }
+ }
+
+ auto logical_string=s;
+ auto logical_levels=levels;
+
+ unicode::bidi_logical_order(logical_string,
+ logical_levels,
+ paragraph);
+
+ auto new_string=unicode::bidi_embed(logical_string,
+ logical_levels,
+ paragraph);
+
+ auto save_string=new_string;
+
+ if (use_default)
+ {
+ auto marker=unicode::bidi_embed_paragraph_level
+ (new_string, paragraph);
+
+ if (marker)
+ new_string.insert(0, 1, marker);
+
+ ret=unicode::bidi_calc(new_string);
+ }
+ else
+ {
+ ret=unicode::bidi_calc(new_string, paragraph);
+ }
+
+ unicode::bidi_reorder(new_string, std::get<0>(ret));
+ unicode::bidi_extra_cleanup(new_string,
+ std::get<0>(ret));
+
+ /* New string is now back in logical order */
+
+ if (new_string == s && std::get<0>(ret) == levels)
+ continue;
+
+ fclose(DEBUGDUMP);
+ DEBUGDUMP=stderr;
+
+ std::cerr << "Regression, line "
+ << linenum
+ << ": embedding markers"
+ << std::endl
+ << " Paragraph embedding level: "
+ << paragraph;
+
+ if (use_default)
+ std::cerr << " (defaulted)";
+
+ std::cerr << std::endl
+ << "String (1):";
+
+ dump_ls(s, levels);
+
+ std::cerr << std::endl << "String (2):";
+
+ dump_ls(new_string, std::get<0>(ret));
+ std::cerr << std::endl;
+
+ std::cerr << "Embedding:";
+ dump_ls(logical_string, logical_levels);
+ std::cerr << std::endl;
+
+ unicode::bidi_embed(logical_string,
+ logical_levels,
+ paragraph);
+
+ std::cerr << std::endl
+ << "Embedded string:";
+
+ for (auto c:save_string)
+ {
+ std::cerr << " ";
+
+ switch (c) {
+ case LRM: std::cerr << "LRM"; break;
+ case RLM: std::cerr << "RLM"; break;
+ case RLI: std::cerr << "RLI"; break;
+ case LRI: std::cerr << "LRI"; break;
+ case RLO: std::cerr << "RLO"; break;
+ case LRO: std::cerr << "LRO"; break;
+ case PDF: std::cerr << "PDF"; break;
+ case PDI: std::cerr << "PDI"; break;
+ default:
+ std::cerr << std::hex << std::setw(4)
+ << std::setfill('0')
+ << c;
+ break;
+ }
+ }
+ std::cerr << std::dec << std::endl << std::flush;
+
+ ret=unicode::bidi_calc(save_string, paragraph);
+ unicode::bidi_reorder(save_string, std::get<0>(ret));
+ exit(1);
+ }
+ }
+ std::cout << std::endl;
}
-#endif
-#include "unicode_bidi.c"
+int main(int argc, char **argv)
+{
+ DEBUGDUMP=fopen("/dev/null", "w");
+ if (!DEBUGDUMP)
+ {
+ perror("/dev/null");
+ exit(1);
+ }
+ latin_test();
+ character_test();
+ return 0;
}
diff --git a/unicode/book.xml b/unicode/book.xml
index ad0009a..c8948ba 100644
--- a/unicode/book.xml
+++ b/unicode/book.xml
@@ -12,7 +12,7 @@
<!--
-Copyright 2014-2017 Double Precision, Inc.
+Copyright 2014-2020 Double Precision, Inc.
See COPYING for distribution information.
-->
@@ -23,7 +23,7 @@ See COPYING for distribution information.
<para>
This library implements several algorithms related to the
<ulink url="https://www.unicode.org/standard/standard.html">Unicode
- Standard</ulink>:
+ Standard</ulink>, notably:
</para>
<itemizedlist>
@@ -36,22 +36,21 @@ See COPYING for distribution information.
<listitem>
<para>
Implementation of
- <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme
- and work breaking</ulink> rules.
+ <link linkend="unicode_grapheme_break">grapheme
+ and word breaking</link> rules.
</para>
</listitem>
<listitem>
<para>
Implementation of
- <ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line
- breaking</ulink> rules.
+ <link linkend="unicode_line_break">line breaking</link> rules.
</para>
</listitem>
<listitem>
<para>
Implementation of the
- <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">bi-directional
- algorithm</ulink>.
+ <link linkend="unicode_bidi">bi-directional
+ algorithm</link>.
</para>
</listitem>
<listitem>
@@ -69,15 +68,13 @@ See COPYING for distribution information.
</listitem>
<listitem>
<para>
- Look up the
- <ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode
- script property</ulink>.
+ Look up the <link linkend="unicode_script">Unicode
+ script property</link>.
</para>
</listitem>
<listitem>
<para>
- Look up the
- <ulink url="https://unicode.org/notes/tn36/">category</ulink>
+ Look up the <link linkend="unicode_category_lookup">category</link>
property.
</para>
</listitem>
@@ -192,7 +189,7 @@ See COPYING for distribution information.
<programlisting>
#include &lt;courier-unicode.h&gt;</programlisting>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="courier_unicode_descr">
<title>DESCRIPTION</title>
<para>
@@ -226,7 +223,7 @@ See COPYING for distribution information.
with this library.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="courier_unicode_seealso">
<title>SEE ALSO</title>
<para>
@@ -306,16 +303,22 @@ See COPYING for distribution information.
<refname>unicode_bidi</refname>
<refname>unicode_bidi_calc</refname>
<refname>unicode_bidi_reorder</refname>
+ <refname>unicode_bidi_cleanup</refname>
+ <refname>unicode_bidi_extra_cleanup</refname>
+ <refname>unicode_bidi_logical_order</refname>
+ <refname>unicode_bidi_embed</refname>
+ <refname>unicode_bidi_embed_paragraph_level</refname>
+
+ <refname>unicode_bidi_type</refname>
<refname>unicode_bidi_mirror</refname>
<refname>unicode_bidi_bracket_type</refname>
- <refpurpose>unicode bidirectional algorithm</refpurpose>
+ <refpurpose>unicode bi-directional algorithm</refpurpose>
</refnamediv>
<refsynopsisdiv>
<funcsynopsis>
- <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
- <funcsynopsisinfo>unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo>
+ <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;&#10;&#10;unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo>
<funcprototype>
<funcdef>void <function>unicode_bidi_calc</function></funcdef>
<paramdef>const char32_t *<parameter>p</parameter></paramdef>
@@ -334,6 +337,51 @@ See COPYING for distribution information.
</funcprototype>
<funcprototype>
+ <funcdef>size_t <function>unicode_bidi_cleanup</function></funcdef>
+ <paramdef>char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef>
+ <paramdef>void *<parameter>arg</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>size_t <function>unicode_bidi_extra_cleanup</function></funcdef>
+ <paramdef>char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef>
+ <paramdef>void *<parameter>arg</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>size_t <function>unicode_bidi_logical_order</function></funcdef>
+ <paramdef>char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ <paramdef>void (*<parameter>reorder_callback</parameter>)(size_t index, size_t n, void *arg)</paramdef>
+ <paramdef>void *<parameter>arg</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>size_t <function>unicode_bidi_embed</function></funcdef>
+ <paramdef>const char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ <paramdef>void (*<parameter>emit</parameter>)(const char32_t *string, size_t n, void *arg)</paramdef>
+ <paramdef>void *<parameter>arg</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>char32_t <function>unicode_bidi_embed_paragraph_level</function></funcdef>
+ <paramdef>const char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
<funcdef>char32_t <function>bidi_mirror</function></funcdef>
<paramdef>char32_t <parameter>c</parameter></paramdef>
</funcprototype>
@@ -350,63 +398,160 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_bidi_descr">
<title>DESCRIPTION</title>
<para>
- <function>unicode_bidi_calc</function>() and
- <function>unicode_bidi_reorder</function>() implement
- the
+ These functions are related to the
<ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-Directional algorithm</ulink>.
- </para>
- <para>
- The first two parameters to
- <function>unicode_bidi_calc</function>() are a unicode string
- and the number of characters in the Unicode string.
- <parameter>levels</parameter> points to a buffer of
- <classname>unicode_bidi_level_t</classname> values.
- The caller is responsible for allocating and deallocating this
- buffer, of
- size <parameter>n</parameter>,
- the same number of values as the number of characters in the
- Unicode string.
- </para>
- <para>
- <function>unicode_bidi_calc</function>() calculates the
- embedding level of each character and fills in the
- <parameter>levels</parameter> buffer (executes all steps of the
- bidirectional algorithm up to, and including, step L1).
- A <literal>NULL</literal> <parameter>initial_embedding</parameter>
- value calculates the default paragraph embedding value.
- A pointer to a <literal>UNICODE_BIDI_LR</literal> or
- <literal>UNICODE_BIDI_RL</literal> value explicitly sets a
- left-to-right or right-to-left paragraph embedding value.
+ They implement the algorithm up to and including step L2,
+ and provide additional functionality of returning miscellaneous
+ bi-directional-related metadata of Unicode characters. There's
+ also a basic algorithm that <quote>reverses</quote> the
+ bi-directional algorithm
+ and produces a Unicode string with bi-directional markers that
+ results in the same bi-directional string after reapplying the
+ algorithm.
</para>
- <para>
- <function>unicode_bidi_calc</function>() calculates each
- character's directional embedding value: an even value for
- left-to-right text or an odd value for right-to-left text.
- Unicode characters with an unspecified directional embedding
- value are specified by the
- <classname>UNICODE_BIDI_SKIP</classname> embedding level value.
- This indicates embedding and override markers, which can be
- removed from the string (together with this embedding value)
- from the string and the embedding value itself). This can be
- done before or after <function>unicode_bidi_reorder</function>().
- </para>
+ <refsect2 id="unicode_bidi_calc_reorder">
+ <title>Calculating bi-directional rendering order</title>
- <refsect2>
- <title>Reordering text</title>
+ <para>
+ The following process computes the rendering order of
+ characters according to the Unicode Bi-Directional algorithm:
+ </para>
+
+ <orderedlist>
+ <listitem>
+ <para>
+ Allocate an array of
+ <structname>unicode_bidi_level_t</structname> that's the
+ same size as the Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Use <function>unicode_bidi_calc</function>() to compute
+ the Unicode string's characters' bi-directional embedding
+ level (executes the Bi-Directional algorithm up to and
+ including step L1). This populates the
+ <structname>unicode_bidi_level_t</structname> buffer.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Use <function>unicode_bidi_reorder</function>() to reverse
+ any characters in the string, according to the
+ algorithm (step L2), with an optional
+ callback that reports which ranges of characters get
+ reversed.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Use <function>unicode_bidi_cleanup</function>() or
+ <function>unicode_bidi_extra_cleanup</function>(),
+ to remove the characters from the string which are used
+ by the bi-directional algorithm, and are not needed for
+ rendering the text.
+ </para>
+ </listitem>
+ </orderedlist>
+
+ <para>
+ The parameters to
+ <function>unicode_bidi_calc</function>() are:
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ A pointer to the Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Number of characters in the Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ A pointer to an array of
+ <structname>unicode_bidi_level_t</structname> values.
+ The caller is
+ responsible for allocating and deallocating this array,
+ which has the same size as the Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ An optional pointer to a
+ <literal>UNICODE_BIDI_LR</literal> or
+ <literal>UNICODE_BIDI_RL</literal> value. This sets
+ the default paragraph direction level.
+ A null pointer computes the default paragraph direction
+ level based on the string, as specified by the "P" rules
+ of the bi-directional algorithm.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ <function>unicode_bidi_calc</function>() fills in the
+ <structname>unicode_bidi_level_t</structname> array with the
+ values corresponding to the embedding level of the
+ corresponding character,
+ according to the Unicode Bi-Directional Algorithm (even values for
+ left-to-right ordering, and odd values for right-to-left
+ ordering).
+ A value of UNICODE_BIDI_SKIP designates directional markers
+ (from step X9).
+ </para>
<para>
- <function>unicode_bidi_reorder</function> takes the actual
+ <function>unicode_bidi_calc</function>() returns the resolved
+ paragraph direction level, which
+ always matches the passed in level, if specified, else it
+ reports the
+ derived one.
+ </para>
+
+ <para>
+ <function>unicode_bidi_reorder</function>() takes the actual
unicode string together with the embedding values from
<function>unicode_bidi_calc</function>, then reverses the
- bidirectional string, as specified by step L2 of the bidirectional
+ bi-directional string, as specified by step L2 of the bi-directional
algorithm.
+ The parameters to
+ <function>unicode_bidi_reorder</function>() are:
</para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ A pointer to the Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ A pointer to an array of
+ <structname>unicode_bidi_level_t</structname> values.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Number of characters in the Unicode string and the
+ <structname>unicode_bidi_level_t</structname> array.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ An optional <varname>reorder_callback</varname> function
+ pointer.
+ </para>
+ </listitem>
+ </itemizedlist>
<para>
A non-<literal>NULL</literal>
<parameter>reorder_callback</parameter> gets invoked to report
@@ -434,13 +579,280 @@ See COPYING for distribution information.
invokes the <parameter>reorder_callback</parameter> as if
the character string, and their embedding values, were reversed.
</para>
+
+ <para>
+ The resulting string and embedding levels are in
+ <quote>rendering order</quote>, but still contain bi-directional
+ embedding, override, boundary-neutral, isolate, and marker
+ characters.
+ <function>unicode_bidi_cleanup</function>() and
+ <function>unicode_bidi_extra_cleanup</function>() remove these
+ characters and directional markers from the unicode string.
+ <function>unicode_bidi_cleanup</function> removes only the
+ embedding, override, and boundary-neutral characters (as
+ specified by step X9 of the bi-directional algorithm).
+ <function>unicode_bidi_extra_cleanup</function>()
+ additionally removes the isolation markers and implicit markers,
+ and replaces all characters
+ classified as paragraph separators with a newline.
+ </para>
+ <para>
+ Passing a non-null pointer to the directional embedding level buffer,
+ of the same size as the string, also removes the corresponding
+ values from the buffer, and the remaining values in the
+ embedding level buffer get reset to
+ levels <literal>UNICODE_BIDI_LR</literal> and
+ <literal>UNICODE_BIDI_RL</literal>, only.
+ </para>
+ <para>
+ The parameters to <function>unicode_bidi_cleanup</function>() and
+ <function>unicode_bidi_extra_cleanup</function>() are:
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ The pointer to the unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The pointer to the directional embedding buffer.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The size of the unicode string and the directional embedding
+ buffer.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ A pointer to a function that gets repeatedly invoked with the
+ index of the character that gets removed from the Unicode
+ string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ An opaque pointer that gets forwarded to the callback.
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ The function pointer (if not <literal>NULL</literal>)
+ gets invoked to report the index of each
+ removed character. The reported index is the index from the
+ original string, and the callback gets invoked in strict order,
+ from the first to
+ the last removed character (if any).
+ </para>
+ <para>
+ Multiple calls to <function>unicode_bidi_cleanup</function>() or
+ <function>unicode_bidi_extra_cleanup</function>() do no harm;
+ except that <function>unicode_bidi_extra_cleanup</function>()
+ always removes all the additional characters that
+ <function>unicode_bidi_cleanup</function>() does not remove.
+ </para>
+ <para>
+ The character string and the embedding level values resulting
+ from <function>unicode_bidi_extra_cleanup</function>() are in
+ <quote>canonical rendering order</quote>.
+ </para>
</refsect2>
- <refsect2>
+
+ <refsect2 id="unicode_bidi_embed">
+ <title>Embedding bi-directional markers in Unicode text strings</title>
+ <para>
+ <function>unicode_bidi_logical_order</function>() and
+ <function>unicode_bidi_embed</function>() add various
+ bi-directional markers to a Unicode string in canonical rendering
+ order. The resulting string is not guaranteed to be
+ identical to the
+ original Unicode bi-directional string. The algorithm is fairly
+ basic,
+ but the resulting bi-directional string produces the same
+ canonical rendering order after applying
+ <function>unicode_bidi_calc()</function>,
+ <function>unicode_bidi_reorder()</function> and
+ <function>unicode_bidi_extra_cleanup()</function>,
+ with the same paragraph_embedding level.
+ </para>
+
+ <para>
+ <function>unicode_bidi_logical_order</function>() gets called
+ first, followed by
+ <function>unicode_bidi_embed</function>().
+ Finally, <function>unicode_bidi_embed_paragraph_level</function>()
+ optionally determines whether the resulting string's default
+ paragraph embedding level matches the one used for the actual
+ embedding direction, and if not returns a directional marker
+ to be prepended to the Unicode character string, as a hint.
+ </para>
+ <para>
+ <function>unicode_bidi_logical_order</function>() factors in the
+ characters' embedding values, and the provided paragraph
+ embedding value
+ (<literal>UNICODE_BIDI_LR</literal> or
+ <literal>UNICODE_BIDI_RL</literal>), and rearranges the characters
+ and the embedding levels in left-to-right order, while
+ simultaneously
+ invoking the supplied reorder_callback indicating each range of
+ characters whose relative order gets reversed. The
+ <function>reorder_callback</function>() receives, as
+ parameters:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ The starting index of the first reversed character, in the
+ string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Number of reversed characters.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Forwarded <parameter>arg</parameter> pointer value.
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ This specifies a consecutive range of characters (and
+ directional embedding values)
+ that get reversed (first character in the range becomes the
+ last character,
+ and the last character becomes the first character).
+ </para>
+
+ <para>
+ After
+ <function>unicode_bidi_logical_order</function>(),
+ <function>unicode_bidi_embed</function>() progressively invokes
+ the passed-in callback with
+ the contents of a bi-directional unicode string.
+ The parameters to <function>unicode_bidi_embed</function>() are:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ The Unicode string, and &hellip;
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ &hellip; the directional embedding buffer, in canonical
+ rendering order.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The size of the string and the embedding level buffer.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The paragraph embedding level, either
+ <literal>UNICODE_BIDI_LR</literal> or
+ <literal>UNICODE_BIDI_RL</literal>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The pointer to the callback function.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ An opaque pointer argument that gets forwarded to the
+ callback function.
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ The callback receives pointers to
+ various parts of the original string that gets passed to
+ <function>unicode_bidi_embed</function>(), intermixed with
+ bi-directional markers,
+ overrides, and isolates. The callback's parameters are:
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ The pointer to a Unicode string.
+ </para>
+ <note>
+ <para>
+ It is not a given that the callback receives pointers
+ to progressively increasing positions in the original
+ string that gets passed to
+ <function>unicode_bidi_embed</function>().
+ Some calls will be for individual bi-directional
+ markers, and
+ <function>unicode_bidi_embed</function>() also
+ performs some additional internal reordering, on the fly,
+ after <function>unicode_bidi_logical_order</function>()'s
+ big hammer.
+ </para>
+ </note>
+ </listitem>
+ <listitem>
+ <para>
+ Number of characters in the Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Forwarded <parameter>arg</parameter> pointer value.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ The assembled unicode string should produce the same
+ canonical rendering order, for the same paragraph embedding
+ level.
+ <function>unicode_bidi_embed_paragraph_level</function>()
+ checks if the specified Unicode string computes the given
+ default paragraph embedding level and returns 0 if it matches.
+ Otherwise it returns a directional marker that should be
+ <emphasis>prepended</emphasis> to the Unicode string to allow
+ <function>unicode_bidi_calc</function>'s optional paragraph
+ embedding level pointer's value to be <literal>NULL</literal>,
+ but derive the same default embedding level.
+ The parameters to
+ <function>unicode_bidi_embed_paragraph_level</function>() are:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ The Unicode string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The size of the string.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The paragraph embedding level, either
+ <literal>UNICODE_BIDI_LR</literal> or
+ <literal>UNICODE_BIDI_RL</literal>.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </refsect2>
+ <refsect2 id="unicode_bidi_misc">
<title>Miscellaneous utility functions</title>
<para>
<function>unicode_bidi_type</function>
- looks up each character's bidirectional character type.
+ looks up each character's bi-directional character type.
</para>
<para>
<function>unicode_bidi_mirror</function>
@@ -464,7 +876,7 @@ See COPYING for distribution information.
</para>
</refsect2>
</refsect1>
- <refsect1>
+ <refsect1 id="courier_unicode_bidi_seealso">
<title>SEE ALSO</title>
<para>
<ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">TR-9</ulink>,
@@ -502,7 +914,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_canonical_descr">
<title>DESCRIPTION</title>
<para>
@@ -552,7 +964,7 @@ See COPYING for distribution information.
equivalence.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_canonical_seealso">
<title>SEE ALSO</title>
<para>
<ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>,
@@ -641,7 +1053,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_category_descr">
<title>DESCRIPTION</title>
<para>
@@ -783,7 +1195,7 @@ See COPYING for distribution information.
</varlistentry>
</variablelist>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_category_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -950,7 +1362,7 @@ See COPYING for distribution information.
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_convert_descr">
<title>DESCRIPTION</title>
<para>
@@ -1040,7 +1452,7 @@ See COPYING for distribution information.
</para>
- <refsect2>
+ <refsect2 id="unicode_convert_collect">
<title>Collecting converted text into a buffer</title>
<para>
@@ -1097,7 +1509,7 @@ See COPYING for distribution information.
</para>
</refsect2>
- <refsect2>
+ <refsect2 id="unicode_convert_chset_unicode">
<title>Converting between character sets and unicode</title>
<para>
@@ -1126,7 +1538,7 @@ See COPYING for distribution information.
</para>
</refsect2>
- <refsect2>
+ <refsect2 id="unicode_convert_oneshot">
<title>One-shot conversions</title>
<para>
@@ -1175,7 +1587,7 @@ See COPYING for distribution information.
</para>
</refsect2>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_convert_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -1220,7 +1632,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_default_chset_descr">
<title>DESCRIPTION</title>
<para>
<function>unicode_default_chset</function>() returns the name of the
@@ -1231,7 +1643,7 @@ See COPYING for distribution information.
current application locale's character set.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_default_chset_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -1316,7 +1728,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_emoji_descr">
<title>DESCRIPTION</title>
<para>
<function>unicode_emoji_lookup</function>() returns the
@@ -1334,7 +1746,7 @@ See COPYING for distribution information.
character has the corresponding property.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_emoji_seealso">
<title>SEE ALSO</title>
<para>
<ulink url="https://www.unicode.org/reports/tr51/tr51-&tr51ver;.html">TR-51</ulink>,
@@ -1368,7 +1780,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_html40_descr">
<title>DESCRIPTION</title>
<para>
<function>unicode_html40ent_lookup</function>() returns the
@@ -1392,7 +1804,7 @@ See COPYING for distribution information.
a single unicode character.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_html40_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -1448,7 +1860,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_grapheme_descr">
<title>DESCRIPTION</title>
<para>
@@ -1489,7 +1901,7 @@ See COPYING for distribution information.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_grapheme_seealso">
<title>SEE ALSO</title>
<para>
@@ -1600,7 +2012,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_lb_descr">
<title>DESCRIPTION</title>
<para>
These functions implement the unicode line breaking algorithm.
@@ -1730,7 +2142,7 @@ See COPYING for distribution information.
line breaking handle is no longer valid.
</para>
- <refsect2>
+ <refsect2 id="unicode_lb_altcallback">
<title>Alternative callback function</title>
<para>
@@ -1745,7 +2157,7 @@ See COPYING for distribution information.
</para>
</refsect2>
- <refsect2>
+ <refsect2 id="unicode_lb_altcallback_opt">
<title>Options</title>
<para>
@@ -1822,7 +2234,7 @@ See COPYING for distribution information.
</refsect2>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_lb_seealso">
<title>SEE ALSO</title>
<para>
@@ -1859,7 +2271,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_script_descr">
<title>DESCRIPTION</title>
<para>
<function>unicode_script</function>() looks up the
@@ -1871,7 +2283,7 @@ See COPYING for distribution information.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_script_seealso">
<title>SEE ALSO</title>
<para>
@@ -1949,7 +2361,7 @@ See COPYING for distribution information.
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_wb_descr">
<title>DESCRIPTION</title>
<para>
These functions implement the unicode word breaking algorithm.
@@ -2046,7 +2458,7 @@ See COPYING for distribution information.
line breaking handle is no longer valid.
</para>
- <refsect2>
+ <refsect2 id="unicode_wb_scan">
<title>Word scan</title>
<para>
@@ -2075,7 +2487,7 @@ See COPYING for distribution information.
</refsect2>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_wb_seealso">
<title>SEE ALSO</title>
<para>
<ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>,
@@ -2144,7 +2556,7 @@ See COPYING for distribution information.
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_uc_descr">
<title>DESCRIPTION</title>
<para>
<function>unicode_uc</function>(),
@@ -2174,7 +2586,7 @@ See COPYING for distribution information.
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_uc_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -2223,94 +2635,162 @@ See COPYING for distribution information.
<refnamediv>
<refname>unicode::bidi_calc</refname>
<refname>unicode::bidi_reorder</refname>
- <refpurpose>unicode bidirectional algorithm</refpurpose>
+ <refname>unicode::bidi_cleanup</refname>
+ <refname>unicode::bidi_extra_cleanup</refname>
+ <refname>unicode::bidi_logical_order</refname>
+ <refname>unicode::bidi_embed</refname>
+ <refname>unicode::bidi_embed_paragraph_level</refname>
+ <refpurpose>unicode bi-directional algorithm</refpurpose>
</refnamediv>
<refsynopsisdiv>
<funcsynopsis>
<funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
<funcprototype>
- <funcdef>std::vector&lt;unicode_bidi_level_t&gt; <function>unicode::bidi_calc</function></funcdef>
+ <funcdef>std::tuple&lt;std::vector&lt;unicode_bidi_level_t&gt;, unicode_bidi_level_t&gt; <function>unicode::bidi_calc</function></funcdef>
<paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
</funcprototype>
- </funcsynopsis>
- <funcsynopsis>
<funcprototype>
- <funcdef>std::vector&lt;unicode_bidi_level_t&gt; <function>unicode::bidi_calc</function></funcdef>
+ <funcdef>std::tuple&lt;std::vector&lt;unicode_bidi_level_t&gt;, unicode_bidi_level_t&gt; <function>unicode::bidi_calc</function></funcdef>
<paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
<paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef>
</funcprototype>
- </funcsynopsis>
- <funcsynopsis>
<funcprototype>
<funcdef>int <function>unicode::bidi_reorder</function></funcdef>
<paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
<paramdef>std::vector&lt;unicode_bidi_level_t&gt; &amp;<parameter>embedding_level</parameter></paramdef>
- <paramdef>const std::function&lt;void (size_t, size_t)&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t, size_t) noexcept&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
</funcprototype>
- </funcsynopsis>
- <funcsynopsis>
<funcprototype>
- <funcdef>int <function>unicode::bidi_reorder</function></funcdef>
+ <funcdef>void <function>unicode::bidi_reorder</function></funcdef>
<paramdef>std::vector&lt;unicode_bidi_level_t&gt; &amp;<parameter>embedding_level</parameter></paramdef>
- <paramdef>const std::function&lt;void (size_t, size_t)&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t, size_t) noexcept&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
</funcprototype>
- </funcsynopsis>
+
+ <funcprototype>
+ <funcdef>void <function>unicode::bidi_cleanup</function></funcdef>
+ <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode::bidi_cleanup</function></funcdef>
+ <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>void <function>unicode::bidi_extra_cleanup</function></funcdef>
+ <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode::bidi_extra_cleanup</function></funcdef>
+ <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t) noexcept&gt; &amp;<parameter>removed_callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode::bidi_logical_order</function></funcdef>
+ <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t, size_t) noexcept&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>void <function>unicode::bidi_logical_order</function></funcdef>
+ <paramdef>std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t, size_t) noexcept&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode::bidi_embed</function></funcdef>
+ <paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>const std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ <paramdef>const std::function&lt;void (const char32_t *, size_t) noexcept&gt; &amp;<parameter>callback</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>std::u32string <function>unicode::bidi_embed</function></funcdef>
+ <paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>const std::vector &lt;unicode_bidi_level_t&gt; &amp;<parameter>levels</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>char32_t <function>unicode::bidi_embed_paragraph_level</function></funcdef>
+ <paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_bidi_descr">
<title>DESCRIPTION</title>
<para>
These functions implement the C++ interface for the
- <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-directional algorithm</ulink>.
+ <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-Directional algorithm</ulink>.
See the description of the underlying
<link linkend="unicode_bidi">
<citerefentry><refentrytitle>unicode_bidi</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link> C library
- API for more information.
+ API for more information. C++ specific notes:
</para>
- <para>
- <function>unicode::bidi_calc</function> computes and return a vector
- of bidirection embedding level values for the given Unicode string.
- An overload takes an additional parameter that override the
- paragraph embedding level, a <literal>UNICODE_BIDI_LR</literal> or
- an <literal>UNICODE_BIDI_RL</literal> value.
- </para>
- <para>
- <function>unicode::bidi_reorder</function> reverses the characters
- in the Unicode script, according to their embedding levels (and
- reverses the corresponding embedding level values too).
- As is with the C API, an optional parameter is a callable object
- that gets invoked to report each range of characters that gets
- reversed (specified as the starting position and a number of
- characters).
- </para>
- <para>
- An overloaded <function>unicode::bidi_reorder</function> without
- the string parameter goes through the motions, according to the
- embedded level vector parameter, but without actually reversing
- the values in the vector, but still invoking the callable object
- normally.
- </para>
- <para>
- This is comparable to the C API. Also comparable with the C API:
- the convention that even embedding levels specify left to right
- text and odd embedding values specify right to left text.
- An embedding value of <literal>UNICODE_BIDI_SKIP</literal>
- indicates an embedding or an override marker that has no
- specified embeded value. These markers may be removed from the
- Unicode string (together with the
- <literal>UNICODE_BIDI_SKIP</literal>
- values from the embedding values vector) either before or after
- they get reordered.
- </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <function>unicode::bidi_calc</function> returns the
+ directional embedding value buffer and the paragraph
+ embedding level.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Several C functions provide a <quote>dry-run</quote> mode
+ by passing a <literal>NULL</literal> pointer. The C++ API
+ provides separate overloads, with and without the nullable
+ parameter.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Several C functions accept a nullable function pointer, with
+ the <literal>NULL</literal> function pointer specifying no
+ callback. The C++ functions have a
+ <classname>std::function</classname> parameter with a
+ default do-nothing closure.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Several C functions accept two parameters, a Unicode character
+ pointer and the embedding level buffer, and a single parameter
+ that specifies the size of both.
+ The equivalent C++ function takes two discrete parameters,
+ a <classname>std::u32string</classname> and a
+ <classname>std::vector</classname> and returns an
+ <classname>int</classname>: a negative value if their sizes
+ differ, or 0 if their sizes match and the requested function
+ completes. The <function>unicode::bidi_embed</function> overload
+ that returns a <classname>std::u32string</classname> returns
+ an empty string in case of a mismatch.
+ </para>
+ </listitem>
+ </itemizedlist>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_bidi_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -2389,7 +2869,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_descr">
<title>DESCRIPTION</title>
<para>
@@ -2447,7 +2927,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -2505,7 +2985,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_tocase_descr">
<title>DESCRIPTION</title>
<para>
@@ -2537,7 +3017,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_tocase_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -2602,7 +3082,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_fromu_descr">
<title>DESCRIPTION</title>
<para>
@@ -2634,7 +3114,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_fromu_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -2698,7 +3178,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_tou_descr">
<title>DESCRIPTION</title>
<para>
@@ -2733,7 +3213,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_convert_tou_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -2846,7 +3326,7 @@ std::vector&lt;std::pair&lt;int, char32_t&gt;&gt; linebreaks;
std::copy(beg_iter, end_iter, std::back_insert_iterator&lt;std::vector&lt;int&gt;&gt;(linebreaks));</programlisting>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_lb_descr">
<title>DESCRIPTION</title>
<para>
@@ -2941,7 +3421,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator&lt;std::vector&lt;int&gt
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_lb_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -3012,7 +3492,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator&lt;std::vector&lt;int&gt
</funcsynopsis>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_tolower_descr">
<title>DESCRIPTION</title>
<para>
@@ -3040,7 +3520,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator&lt;std::vector&lt;int&gt
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_tolower_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
@@ -3104,7 +3584,7 @@ size_t nchars=scan.finish();
</programlisting>
</refsynopsisdiv>
- <refsect1>
+ <refsect1 id="unicode_cpp_wb_descr">
<title>DESCRIPTION</title>
<para>
@@ -3168,7 +3648,7 @@ size_t nchars=scan.finish();
</para>
</refsect1>
- <refsect1>
+ <refsect1 id="unicode_cpp_wb_seealso">
<title>SEE ALSO</title>
<para>
<link linkend="courier-unicode">
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index c8161ea..f6b4b8c 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -536,65 +536,6 @@ int unicode_wbscan_next(unicode_wbscan_info_t i, char32_t ch);
size_t unicode_wbscan_end(unicode_wbscan_info_t i);
-/*
-** Unicode Bidirectional bracket and mirroring lookup
-**
-** http://www.unicode.org/reports/tr9/tr9-42.html
-**
-** unicode_bidi_mirror() returns the Bidi_Mirroring_Glyph property.
-**
-** If there is no mirroring glyph for the given character, returns the
-** same character.
-**
-** unicode_bidi_bracket_type() looks up the Bidi_Paired_Bracket and
-** Bidi_Paired_Bracket_Type properties.
-**
-** unicode_bidi_bracket_type() returns the Bidi_Paired_Bracket property
-** value. If the ret parameter is not a null pointer, the pointed-to
-** value is set to Bidi_Paired_Bracket_Type value, one of the UNICODE_BIDI
-** values.
-**
-** unicode_bidi_bracket_type() returns the same character and
-** UNICODE_BIDI_n if the given character does not have these properties.
-**
-** unicode_bidi_type() looks up the bidirectional character type of the
-** given Unicode character.
-**
-** unicode_bidi_calc() implements the Unicode Bidirectional Algorithm up to
-** step L1.
-**
-** Parameters:
-**
-** - A pointer to char32_t, the Unicode string.
-**
-** - Number of characters in the char32_t string
-**
-** - A pointer to an array of unicode_bidi_level_t values. The caller is
-** responsible for allocating and deallocating this array, which has the
-** same size as the Unicode string (the second parameter).
-**
-** - An optional pointer to a unicode_bidi_level_t value, or a null pointer.
-** A pointer to UNICODE_BIDI_LR or UNICODE_BIDI_RL sets the default paragraph
-** direction level. A null pointer calculates the default paragraph direction
-** level based on the string, as specified by the "P" rules in the algorithm.
-**
-** unicode_bidi_calc() fills in the unicode_bidi_level_t array with the
-** values corresponding to the embedding level of the corresponding character,
-** as specified in the Unicode Bidirection Algorithm (even for left-to-right,
-** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates
-** directional markers (from step X9). These characters should be removed
-** before using unicode_bidi_reorder().
-**
-** unicode_bidi_calc() returns the resolved paragraph direction level, which
-** always matches the passed in level, if specified, else it reports the
-** derived one.
-**
-** unicode_bidi_reorder() reorders the characters according to the resolved
-** embedding levels. A non-null reorder_callback gets invoked repeatedly,
-** indicating the starting index and the number of characters reversed, so
-** that any related metadata can be updated accordingly.
-*/
-
typedef char unicode_bidi_bracket_type_t;
#define UNICODE_BIDI_n 'n'
@@ -654,6 +595,40 @@ typedef enum {
extern enum_bidi_type_t unicode_bidi_type(char32_t c);
+extern size_t unicode_bidi_cleanup(char32_t *string,
+ unicode_bidi_level_t *levels,
+ size_t n,
+ void (*removed_callback)(size_t, void *),
+ void *);
+
+extern size_t unicode_bidi_extra_cleanup(char32_t *string,
+ unicode_bidi_level_t *levels,
+ size_t n,
+ void (*removed_callback)(size_t,
+ void *),
+ void *);
+
+extern void unicode_bidi_logical_order(char32_t *string,
+ unicode_bidi_level_t *levels,
+ size_t n,
+ unicode_bidi_level_t paragraph_embedding,
+ void (*reorder_callback)(size_t, size_t,
+ void *),
+ void *arg);
+
+extern void unicode_bidi_embed(const char32_t *string,
+ const unicode_bidi_level_t *levels,
+ size_t n,
+ unicode_bidi_level_t paragraph_embedding,
+ void (*emit)(const char32_t *string,
+ size_t n,
+ void *arg),
+ void *arg);
+
+extern char32_t unicode_bidi_embed_paragraph_level(const char32_t *str,
+ size_t n,
+ unicode_bidi_level_t);
+
/*
** unicode_canonical() returns the canonical mapping of the given Unicode
** character. The returned structure specifies:
@@ -2117,24 +2092,124 @@ std::u32string tolower(const std::u32string &u);
std::u32string toupper(const std::u32string &u);
//! Calculate bidirectional embedding levels
+
+//! Returns the bidirectional embedding levels, and the paragraph
+//! embedding level.
+
std::tuple<std::vector<unicode_bidi_level_t>,
unicode_bidi_level_t> bidi_calc(const std::u32string &s);
//! Calculate bidirectional embedding levels
+
+//! Overload calculates the embedding levels using a predetermined
+//! paragraph embedding level.
+//!
+//! Returns the bidirectional embedding levels, and the same paragraph
+//! embedding level.
+
std::tuple<std::vector<unicode_bidi_level_t>,
unicode_bidi_level_t> bidi_calc(const std::u32string &s,
unicode_bidi_level_t level);
//! Reorder bidirectional text
+
+//! Reorders the string and levels in place.
+//!
+//! Non-0 return value indicates the string and levels' sizes do not match.
+
int bidi_reorder(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t, size_t)> &reorder_callback=
- [](size_t, size_t){});
+ const std::function<void (size_t, size_t) noexcept>
+ &reorder_callback=[](size_t, size_t) noexcept{});
-//! Reorder bidirectional text
+//! Dry-run reorder bidirectional text
void bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t, size_t)> &reorder_callback=
- [](size_t, size_t){});
+ const std::function<void (size_t, size_t) noexcept>
+ &reorder_callback=[](size_t, size_t) noexcept{});
+
+//! Remove directional markers
+
+//! Removes them from the string, in place. Optional lambda gets notified
+//! of the index (in the original string) of each removed marker.
+
+void bidi_cleanup(std::u32string &string,
+ const std::function<void (size_t) noexcept> &removed_callback=
+ [](size_t) noexcept {});
+
+//! Also remove them from the embedding direction level buffer.
+
+//! Returns non-0 in case of non-matching level buffer size.
+
+int bidi_cleanup(std::u32string &string,
+ std::vector<unicode_bidi_level_t> &levels,
+ const std::function<void (size_t) noexcept> &removed_callback=
+ [](size_t) noexcept {});
+
+
+//! Remove directional markers and isolation markers.
+
+//! Removes them from the string, in place. Optional lambda gets notified
+//! of the index (in the original string) of each removed marker.
+
+void bidi_extra_cleanup(std::u32string &string,
+ const std::function<void (size_t) noexcept>
+ &removed_callback=
+ [](size_t) noexcept {});
+
+//! Also remove them from the embedding direction level buffer.
+
+//! Returns non-0 in case of non-matching level buffer size.
+
+int bidi_extra_cleanup(std::u32string &string,
+ std::vector<unicode_bidi_level_t> &levels,
+ const std::function<void (size_t) noexcept>
+ &removed_callback=
+ [](size_t) noexcept {});
+
+//! Convert Unicode string from canonical rendering order to logical order.
+int bidi_logical_order(std::u32string &string,
+ std::vector<unicode_bidi_level_t> &levels,
+ unicode_bidi_level_t paragraph_embedding,
+ const std::function<void (size_t, size_t) noexcept>
+ &lambda=[](size_t,size_t){});
+
+//! Convert Unicode string from canonical rendering order to logical order.
+void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,
+ unicode_bidi_level_t paragraph_embedding,
+ const std::function<void (size_t, size_t) noexcept>
+ &lambda);
+
+//! Embed directional and isolation markers
+
+//! Non-0 return value indicates the string and levels' sizes do not match.
+//!
+//! The lambda gets called repeatedly, to specify the contents of the
+//! string with embedded direction markers.
+
+int bidi_embed(const std::u32string &string,
+ const std::vector<unicode_bidi_level_t> &levels,
+ unicode_bidi_level_t paragraph_embedding,
+ const std::function<void (const char32_t *string,
+ size_t n) noexcept> &lambda);
+
+//! Embed directional and isolation markers
+
+//! \overload
+//!
+//! Provides a lambda that collects the new string, and returns it. An
+//! empty string gets returned if the string and levels' sizes do not match.
+
+std::u32string bidi_embed(const std::u32string &string,
+ const std::vector<unicode_bidi_level_t> &levels,
+ unicode_bidi_level_t paragraph_embedding);
+
+//! Check if a directional marker needs to be inserted
+
+//! In order for the unicode string to have the specified default
+//! paragraph embedding level.
+
+extern char32_t bidi_embed_paragraph_level(const std::u32string &string,
+ unicode_bidi_level_t level);
#if 0
{
diff --git a/unicode/docbook/book.css b/unicode/docbook/book.css
index d1420cd..a133e82 100644
--- a/unicode/docbook/book.css
+++ b/unicode/docbook/book.css
@@ -44,7 +44,7 @@ code.computeroutput div.literallayout {
font-weight: bold;
}
-.command, .acronym, .symbol {
+.command, .acronym, .symbol, .structname {
font-family: "liberation mono", "courier new", monospace;
background-color: #eeeeee;
}
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c
index 055ee89..a35e9b5 100644
--- a/unicode/unicode_bidi.c
+++ b/unicode/unicode_bidi.c
@@ -112,6 +112,17 @@ typedef enum {
(c) == UNICODE_BIDI_TYPE_LRO || \
(c) == UNICODE_BIDI_TYPE_RLO)
+#define is_explicit_indicator_except_b(c) \
+ ( is_isolate_initiator(c) || \
+ is_embedding_initiator(c) || \
+ (c) == UNICODE_BIDI_TYPE_BN || \
+ (c) == UNICODE_BIDI_TYPE_PDF || \
+ (c) == UNICODE_BIDI_TYPE_PDI)
+
+#define is_explicit_indicator(c) \
+ ( is_explicit_indicator_except_b(c) || \
+ (c) == UNICODE_BIDI_TYPE_B)
+
/* BD13 implementation */
/* A level run, specified as indexes */
@@ -529,6 +540,8 @@ static void directional_status_stack_push
(struct directional_status_stack_entry *)
malloc(sizeof(struct directional_status_stack_entry));
+ if (!p)
+ abort();
#ifdef BIDI_DEBUG
fprintf(DEBUGDUMP, "BIDI: Push level %d, override: %s, isolate: %s\n",
(int)embedding_level,
@@ -548,16 +561,21 @@ static void directional_status_stack_push
}
static unicode_bidi_level_t
-compute_paragraph_embedding_level(const enum_bidi_type_t *p,
- size_t i, size_t j)
+compute_paragraph_embedding_level(size_t i, size_t j,
+ enum_bidi_type_t (*get)(size_t i,
+ void *arg),
+ void *arg)
+
{
unicode_bidi_level_t in_isolation=0;
for (; i<j; ++i)
{
- if (is_isolate_initiator(p[i]))
+ enum_bidi_type_t t=get(i, arg);
+
+ if (is_isolate_initiator(t))
++in_isolation;
- else if (p[i] == UNICODE_BIDI_TYPE_PDI)
+ else if (t == UNICODE_BIDI_TYPE_PDI)
{
if (in_isolation)
--in_isolation;
@@ -565,16 +583,43 @@ compute_paragraph_embedding_level(const enum_bidi_type_t *p,
if (in_isolation == 0)
{
- if (p[i] == UNICODE_BIDI_TYPE_AL ||
- p[i] == UNICODE_BIDI_TYPE_R)
+ if (t == UNICODE_BIDI_TYPE_AL ||
+ t == UNICODE_BIDI_TYPE_R)
{
- return 1;
+ return UNICODE_BIDI_RL;
}
- if (p[i] == UNICODE_BIDI_TYPE_L)
+ if (t == UNICODE_BIDI_TYPE_L)
break;
}
}
- return 0;
+ return UNICODE_BIDI_LR;
+}
+
+struct compute_paragraph_embedding_level_type_info {
+ const enum_bidi_type_t *p;
+};
+
+static enum_bidi_type_t
+get_enum_bidi_type_for_paragraph_embedding_level(size_t i,
+ void *arg)
+{
+ struct compute_paragraph_embedding_level_type_info *p=
+ (struct compute_paragraph_embedding_level_type_info *)arg;
+
+ return p->p[i];
+}
+
+static unicode_bidi_level_t
+compute_paragraph_embedding_level_from_types(const enum_bidi_type_t *p,
+ size_t i, size_t j)
+{
+ struct compute_paragraph_embedding_level_type_info info;
+ info.p=p;
+
+ return compute_paragraph_embedding_level
+ (i, j,
+ get_enum_bidi_type_for_paragraph_embedding_level,
+ &info);
}
static directional_status_stack_t
@@ -591,7 +636,7 @@ directional_status_stack_init(const char32_t *chars,
stack->paragraph_embedding_level=
initial_embedding_level
? *initial_embedding_level & 1
- : compute_paragraph_embedding_level(classes, 0, n);
+ : compute_paragraph_embedding_level_from_types(classes, 0, n);
stack->chars=chars;
stack->classes=classes;
@@ -676,6 +721,8 @@ unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
enum_bidi_type_t *buf=
(enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t));
+ if (!buf)
+ abort();
for (size_t i=0; i<n; ++i)
{
buf[i]=unicode_bidi_type(p[i]);
@@ -732,7 +779,7 @@ unicode_bidi_b(const char32_t *p,
} \
} while(0)
-static void unicode_bidi_w(directional_status_stack_t stack,
+static void unicode_bidi_w(enum_bidi_type_t *classes,
struct isolating_run_sequence_s *seq);
static void unicode_bidi_n(directional_status_stack_t stack,
struct isolating_run_sequence_s *seq);
@@ -900,7 +947,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack)
}
}
- cur_class=compute_paragraph_embedding_level
+ cur_class=compute_paragraph_embedding_level_from_types
(stack->classes, i+1, j) == 1
? UNICODE_BIDI_TYPE_RLI
: UNICODE_BIDI_TYPE_LRI;
@@ -955,24 +1002,11 @@ static void unicode_bidi_cl(directional_status_stack_t stack)
break;
}
- switch (stack->orig_classes[i]) {
- case UNICODE_BIDI_TYPE_BN:
- case UNICODE_BIDI_TYPE_B:
- case UNICODE_BIDI_TYPE_RLE:
- case UNICODE_BIDI_TYPE_LRE:
- case UNICODE_BIDI_TYPE_RLO:
- case UNICODE_BIDI_TYPE_LRO:
- case UNICODE_BIDI_TYPE_PDF:
- case UNICODE_BIDI_TYPE_RLI:
- case UNICODE_BIDI_TYPE_LRI:
- case UNICODE_BIDI_TYPE_FSI:
- case UNICODE_BIDI_TYPE_PDI:
- break;
- default:
+ if (!is_explicit_indicator(stack->orig_classes[i]))
+ {
/* X6 */
stack->levels[i]=stack->head->embedding_level;
RESET_CLASS(stack->classes[i],stack);
- break;
}
if (stack->classes[i] == UNICODE_BIDI_TYPE_PDI)
@@ -1210,7 +1244,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack)
dump_sequence("Contents before W", stack, p);
#endif
- unicode_bidi_w(stack, p);
+ unicode_bidi_w(stack->classes, p);
#ifdef BIDI_DEBUG
dump_sequence("Contents after W", stack, p);
@@ -1258,7 +1292,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack)
}
}
-static void unicode_bidi_w(directional_status_stack_t stack,
+static void unicode_bidi_w(enum_bidi_type_t *classes,
struct isolating_run_sequence_s *seq)
{
irs_iterator iter=irs_begin(seq), end=irs_end(seq);
@@ -1268,10 +1302,10 @@ static void unicode_bidi_w(directional_status_stack_t stack,
while (irs_compare(&iter, &end))
{
- if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_NSM)
+ if (classes[iter.i] == UNICODE_BIDI_TYPE_NSM)
{
/* W1 */
- stack->classes[iter.i] =
+ classes[iter.i] =
is_isolate_initiator(previous_type) ||
previous_type == UNICODE_BIDI_TYPE_PDI
? UNICODE_BIDI_TYPE_ON
@@ -1281,14 +1315,14 @@ static void unicode_bidi_w(directional_status_stack_t stack,
/* W2 */
- if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_EN &&
+ if (classes[iter.i] == UNICODE_BIDI_TYPE_EN &&
strong_type == UNICODE_BIDI_TYPE_AL)
{
- stack->classes[iter.i] = UNICODE_BIDI_TYPE_AN;
+ classes[iter.i] = UNICODE_BIDI_TYPE_AN;
}
/* W2 */
- previous_type=stack->classes[iter.i];
+ previous_type=classes[iter.i];
switch (previous_type) {
case UNICODE_BIDI_TYPE_R:
@@ -1312,12 +1346,12 @@ static void unicode_bidi_w(directional_status_stack_t stack,
while (not_eol)
{
/* W3 */
- if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_AL)
- stack->classes[iter.i] = UNICODE_BIDI_TYPE_R;
+ if (classes[iter.i] == UNICODE_BIDI_TYPE_AL)
+ classes[iter.i] = UNICODE_BIDI_TYPE_R;
/* W4 */
- enum_bidi_type_t this_type=stack->classes[iter.i];
+ enum_bidi_type_t this_type=classes[iter.i];
irs_incr(&iter);
not_eol=irs_compare(&iter, &end);
@@ -1332,13 +1366,13 @@ static void unicode_bidi_w(directional_status_stack_t stack,
previous_type == UNICODE_BIDI_TYPE_AN)
)
) &&
- stack->classes[iter.i] == previous_type)
+ classes[iter.i] == previous_type)
{
irs_iterator prev=iter;
irs_decr(&prev);
- stack->classes[prev.i]=previous_type;
+ classes[prev.i]=previous_type;
}
if (not_eol)
@@ -1353,9 +1387,9 @@ static void unicode_bidi_w(directional_status_stack_t stack,
while (irs_compare(&iter, &end))
{
- if (stack->classes[iter.i] != UNICODE_BIDI_TYPE_ET)
+ if (classes[iter.i] != UNICODE_BIDI_TYPE_ET)
{
- previous_type=stack->classes[iter.i];
+ previous_type=classes[iter.i];
irs_incr(&iter);
continue;
}
@@ -1363,7 +1397,7 @@ static void unicode_bidi_w(directional_status_stack_t stack,
/* ET after EN */
if (previous_type == UNICODE_BIDI_TYPE_EN)
{
- stack->classes[iter.i] = UNICODE_BIDI_TYPE_EN;
+ classes[iter.i] = UNICODE_BIDI_TYPE_EN;
irs_incr(&iter);
continue;
}
@@ -1374,7 +1408,7 @@ static void unicode_bidi_w(directional_status_stack_t stack,
while (irs_incr(&iter), irs_compare(&iter, &end))
{
- previous_type=stack->classes[iter.i];
+ previous_type=classes[iter.i];
if (previous_type == UNICODE_BIDI_TYPE_ET)
continue;
@@ -1383,7 +1417,7 @@ static void unicode_bidi_w(directional_status_stack_t stack,
{
while (irs_compare(&start, &iter))
{
- stack->classes[start.i]=
+ classes[start.i]=
UNICODE_BIDI_TYPE_EN;
irs_incr(&start);
}
@@ -1397,12 +1431,12 @@ static void unicode_bidi_w(directional_status_stack_t stack,
for (iter=irs_begin(seq);
irs_compare(&iter, &end); irs_incr(&iter))
{
- switch (stack->classes[iter.i]) {
+ switch (classes[iter.i]) {
case UNICODE_BIDI_TYPE_ET:
case UNICODE_BIDI_TYPE_ES:
case UNICODE_BIDI_TYPE_CS:
/* W6 */
- stack->classes[iter.i]=UNICODE_BIDI_TYPE_ON;
+ classes[iter.i]=UNICODE_BIDI_TYPE_ON;
break;
default:
break;
@@ -1416,14 +1450,14 @@ static void unicode_bidi_w(directional_status_stack_t stack,
while (irs_compare(&iter, &end))
{
- switch (stack->classes[iter.i]) {
+ switch (classes[iter.i]) {
case UNICODE_BIDI_TYPE_L:
case UNICODE_BIDI_TYPE_R:
- previous_type=stack->classes[iter.i];
+ previous_type=classes[iter.i];
break;
case UNICODE_BIDI_TYPE_EN:
if (previous_type == UNICODE_BIDI_TYPE_L)
- stack->classes[iter.i]=previous_type;
+ classes[iter.i]=previous_type;
break;
default:
break;
@@ -1573,13 +1607,13 @@ static void unicode_bidi_n(directional_status_stack_t stack,
ADJUST_EOCLASS(eoclass);
-#define E_CLASS (seq->embedding_level & 1 ? \
- UNICODE_BIDI_TYPE_R:UNICODE_BIDI_TYPE_L)
+#define E_CLASS(level) ((level) & 1 ? \
+ UNICODE_BIDI_TYPE_R:UNICODE_BIDI_TYPE_L)
-#define O_CLASS (seq->embedding_level & 1 ? \
- UNICODE_BIDI_TYPE_L:UNICODE_BIDI_TYPE_R)
+#define O_CLASS(level) ((level) & 1 ? \
+ UNICODE_BIDI_TYPE_L:UNICODE_BIDI_TYPE_R)
- if (eoclass == E_CLASS)
+ if (eoclass == E_CLASS(seq->embedding_level))
{
#ifdef BIDI_DEBUG
if (stackp)
@@ -1599,7 +1633,7 @@ static void unicode_bidi_n(directional_status_stack_t stack,
for (size_t i=0; i<stackp; ++i)
stack_iters[i]->has_e=1;
}
- else if (eoclass == O_CLASS)
+ else if (eoclass == O_CLASS(seq->embedding_level))
{
#ifdef BIDI_DEBUG
if (stackp)
@@ -1636,8 +1670,8 @@ static void unicode_bidi_n(directional_status_stack_t stack,
"Brackets: %d and %d: e=%s, o=%s",
(int)p->start.i,
(int)p->end.i,
- bidi_classname(E_CLASS),
- bidi_classname(O_CLASS));
+ bidi_classname(E_CLASS(seq->embedding_level)),
+ bidi_classname(O_CLASS(seq->embedding_level)));
fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n",
p->has_e,
@@ -1879,6 +1913,37 @@ static void level_run_layers_add(struct level_run_layers *p)
level_runs_init(p->lruns + (p->n_lruns++));
}
+/*
+** Reverse the half-open range [start, end) in place.
+**
+** When p is not a null pointer the characters and their embedding levels
+** get swapped pairwise, working from both ends towards the middle. When p
+** is a null pointer nothing gets swapped, levels included.
+** NOTE(review): confirm that leaving levels untouched is intended for
+** callers that pass a null string.
+**
+** reorder_callback, if not null, gets invoked once with the starting index
+** and the size of the range, but only when the range is longer than one.
+*/
+static void reverse_str(char32_t *p,
+			unicode_bidi_level_t *levels,
+			size_t start,
+			size_t end,
+			void (*reorder_callback)(size_t, size_t, void *),
+			void *arg)
+{
+	size_t lo=start;
+	size_t hi=end;
+
+	for ( ; hi > lo; ++lo)
+	{
+		--hi;
+
+		if (!p)
+			continue;
+
+		char32_t saved_char=p[lo];
+		unicode_bidi_level_t saved_level=levels[lo];
+
+		p[lo]=p[hi];
+		p[hi]=saved_char;
+		levels[lo]=levels[hi];
+		levels[hi]=saved_level;
+	}
+
+	if (reorder_callback && end-start > 1)
+		(*reorder_callback)(start, end-start, arg);
+}
+
void unicode_bidi_reorder(char32_t *p,
unicode_bidi_level_t *levels,
size_t n,
@@ -1887,6 +1952,15 @@ void unicode_bidi_reorder(char32_t *p,
{
/* L2 */
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP, "Before L2:");
+ for (size_t i=0; i<n; ++i)
+ fprintf(DEBUGDUMP, " %04x/%d",
+ (unsigned)p[i],
+ (int)levels[i]);
+ fprintf(DEBUGDUMP, "\n");
+#endif
+
struct level_run_layers layers;
unicode_bidi_level_t previous_level=0;
@@ -1920,39 +1994,738 @@ void unicode_bidi_reorder(char32_t *p,
}
}
}
-
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP, "L2:\n");
+#endif
for (size_t i=layers.n_lruns; i; )
{
struct level_runs *runs=layers.lruns+ --i;
+#ifdef BIDI_DEBUG
+ if (runs->n_level_runs)
+ fprintf(DEBUGDUMP, "Reverse %d:",
+ (int)i);
+#endif
+
for (size_t j=0; j<runs->n_level_runs; ++j)
{
size_t start=runs->runs[j].start;
size_t end=runs->runs[j].end;
- size_t right=end;
- size_t left=start;
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP, " %d-%d",
+ (int)start, (int)end-1);
+#endif
- while (right > left)
+ reverse_str(p, levels, start, end,
+ reorder_callback, arg);
+ }
+
+#ifdef BIDI_DEBUG
+ if (runs->n_level_runs)
+ fprintf(DEBUGDUMP, "\n");
+#endif
+ }
+
+ level_run_layers_deinit(&layers);
+}
+
+#define LRM 0x200E
+#define RLM 0x200F
+#define ALM 0x061C
+
+/*
+** Remove, in place, every character whose bidi class matches IS_X9().
+**
+** The remaining characters get compacted towards the start of string. If
+** levels is not a null pointer, the corresponding embedding levels get
+** compacted the same way and reduced to their low bit.
+**
+** removed_callback, if not null, gets invoked with the original index of
+** each removed character.
+**
+** Returns the number of characters that remain in string.
+*/
+size_t unicode_bidi_cleanup(char32_t *string,
+			    unicode_bidi_level_t *levels,
+			    size_t n,
+			    void (*removed_callback)(size_t, void *),
+			    void *arg)
+{
+	size_t i=0;
+	for (size_t j=0; j<n; ++j)
+	{
+		enum_bidi_type_t cl=unicode_bidi_type(string[j]);
+
+		if (IS_X9(cl))
+		{
+			if (removed_callback)
+				(*removed_callback)(j, arg);
+			continue;
+		}
+
+		/*
+		** Move the kept character down over the removed ones.
+		** Without this the first "i" characters are not the
+		** cleaned up string that the returned size describes.
+		*/
+		string[i]=string[j];
+		if (levels)
+			levels[i]=levels[j] & 1;
+		++i;
+	}
+	return i;
+}
+
+/*
+** A more aggressive in-place cleanup: removes characters for which
+** is_explicit_indicator_except_b() is true, as well as LRM, RLM and ALM.
+**
+** Characters of class B are kept but replaced by a newline. The kept
+** characters, and their levels' low bit if levels is not null, get
+** compacted towards the start of their buffers.
+**
+** removed_callback, if not null, gets invoked with the original index of
+** each removed character. Returns the number of characters kept.
+*/
+size_t unicode_bidi_extra_cleanup(char32_t *string,
+				  unicode_bidi_level_t *levels,
+				  size_t n,
+				  void (*removed_callback)(size_t, void *),
+				  void *arg)
+{
+	size_t kept=0;
+	size_t pos;
+
+	for (pos=0; pos<n; ++pos)
+	{
+		char32_t ch=string[pos];
+		enum_bidi_type_t cl=unicode_bidi_type(ch);
+		int drop=is_explicit_indicator_except_b(cl) ||
+			ch == LRM || ch == RLM || ch == ALM;
+
+		if (drop)
+		{
+			if (removed_callback)
+				(*removed_callback)(pos, arg);
+			continue;
+		}
+
+		/* Paragraph separators are normalized to a newline */
+		if (cl == UNICODE_BIDI_TYPE_B)
+			ch='\n';
+
+		string[kept]=ch;
+		if (levels)
+			levels[kept]=levels[pos] & 1;
+		++kept;
+	}
+	return kept;
+}
+
+/*
+** Reverse, via reverse_str(), each maximal sequence of characters whose
+** embedding level parity differs from paragraph_embedding's parity, then
+** reverse the whole string if paragraph_embedding is odd.
+**
+** reorder_callback and arg get passed through to reverse_str().
+*/
+void unicode_bidi_logical_order(char32_t *string,
+				unicode_bidi_level_t *levels,
+				size_t n,
+				unicode_bidi_level_t paragraph_embedding,
+				void (*reorder_callback)(size_t, size_t,
+							 void *),
+				void *arg)
+{
+	/*
+	** First pass:
+	**
+	** With an even paragraph_embedding, odd levels get reversed.
+	** With an odd paragraph_embedding, even levels get reversed.
+	*/
+
+#define LOGICAL_FLIP(n) ( ((n) ^ paragraph_embedding) & 1)
+
+	size_t pos=0;
+
+	while (pos < n)
+	{
+		if (!LOGICAL_FLIP(levels[pos]))
+		{
+			++pos;
+			continue;
+		}
+
+		/* levels[pos] needs flipping, find where the run ends */
+		size_t run_start=pos;
+
+		do
+		{
+			++pos;
+		} while (pos < n && LOGICAL_FLIP(levels[pos]));
+
+		reverse_str(string, levels, run_start, pos,
+			    reorder_callback, arg);
+	}
+
+	/* Second pass: an odd paragraph level flips everything */
+	if (paragraph_embedding & 1)
+		reverse_str(string, levels, 0, n, reorder_callback, arg);
+}
+
+/*
+** Track consecutive sequences of characters with the same embedding level.
+**
+** Each node describes one sequence. The singly-linked list gets created in
+** compute_bidi_embed_levelruns(), which only compares, and records, the
+** low bit of each embedding level.
+*/
+
+struct bidi_embed_levelrun {
+ /* Next sequence in the list, null for the last one */
+ struct bidi_embed_levelrun *next;
+ /* Index of the first character in this sequence */
+ size_t start;
+ /* Index one past the last character in this sequence */
+ size_t end;
+ /* Low bit of the embedding level shared by this sequence */
+ unicode_bidi_level_t level;
+};
+
+/*
+** Allocate a new level run node and link it at the end of the list.
+**
+** tailp points to a pointer that is either null (empty list) or points to
+** the current last node. The returned value is the tailp to pass to the
+** next call: it points to the pointer that refers to the new node.
+**
+** Calls abort() if the node cannot be allocated.
+*/
+static struct bidi_embed_levelrun **
+record_bidi_embed_levelrun(struct bidi_embed_levelrun **tailp,
+			   size_t start,
+			   size_t end,
+			   unicode_bidi_level_t level)
+{
+	struct bidi_embed_levelrun *node=
+		(struct bidi_embed_levelrun *)calloc(1, sizeof(*node));
+
+	if (!node)
+		abort();
+
+	node->level=level;
+	node->end=end;
+	node->start=start;
+
+	if (!*tailp)
+	{
+		/* First node in the list */
+		*tailp=node;
+		return tailp;
+	}
+
+	(*tailp)->next=node;
+	return &(*tailp)->next;
+}
+
+/*
+** Split the n embedding levels into maximal sequences that share the same
+** low bit, and record each one with record_bidi_embed_levelrun().
+**
+** The string parameter is not used here.
+*/
+static void compute_bidi_embed_levelruns(const char32_t *string,
+					 const unicode_bidi_level_t *levels,
+					 size_t n,
+					 struct bidi_embed_levelrun **tailp)
+{
+	size_t pos=0;
+
+	while (pos < n)
+	{
+		size_t run_start=pos;
+
+		/* Advance until the low bit of the level changes */
+		for (++pos; pos < n; ++pos)
+		{
+			if ((levels[pos] & 1) != (levels[run_start] & 1))
+				break;
+		}
+
+		tailp=record_bidi_embed_levelrun(tailp, run_start, pos,
+						 levels[run_start] & 1);
+	}
+}
+
+#define RLI 0x2067
+#define LRI 0x2066
+#define RLO 0x202e
+#define LRO 0x202d
+#define PDF 0x202c
+#define PDI 0x2069
+
+/*
+** Whether a directional marker and/or a PDI is required to be generated
+** after some subset of characters. Both flags get acted on by
+** emit_marker().
+*/
+
+struct need_marker_info {
+ /* Non-zero: emit an RLM or an LRM, chosen by the run's level */
+ int need_marker;
+ /* Non-zero: emit a PDI, after the marker if there is one */
+ int need_pdi;
+};
+
+/* Reset both flags: neither a marker nor a PDI is needed. */
+static void need_marker_info_init(struct need_marker_info *info)
+{
+	info->need_pdi=0;
+	info->need_marker=0;
+}
+
+/*
+** Turn on each flag in info that is set in other_info. Flags that are
+** already set in info are left alone.
+*/
+static void need_marker_info_merge(struct need_marker_info *info,
+				   const struct need_marker_info *other_info)
+{
+	if (other_info->need_pdi)
+	{
+		info->need_pdi=1;
+	}
+
+	if (other_info->need_marker)
+	{
+		info->need_marker=1;
+	}
+}
+
+/*
+** Forward declaration, the definition follows unicode_bidi_embed(), which
+** calls this for each sequence, or part of a sequence, of characters that
+** it emits.
+*/
+static void emit_bidi_embed_levelrun(const char32_t *string,
+ enum_bidi_type_t *classes,
+ struct bidi_embed_levelrun *run,
+ unicode_bidi_level_t paragraph_level,
+ unicode_bidi_level_t previous_level,
+ unicode_bidi_level_t next_level,
+ struct need_marker_info *need_marker,
+ void (*emit)(const char32_t *string,
+ size_t n,
+ void *arg),
+ void *arg);
+
+/* L1 */
+
+/*
+** Determine if rule L1 will apply starting at position i.
+**
+** Any WS classes get skipped. Returns 1 if the first class that is not WS
+** is S or B, 0 if it is anything else. If nothing but WS is left before
+** position n, the result is atend.
+*/
+static int is_l1_on_or_after(const enum_bidi_type_t *classes,
+			     size_t n,
+			     size_t i,
+			     int atend)
+{
+	for ( ; i<n; ++i)
+	{
+		switch (classes[i]) {
+		case UNICODE_BIDI_TYPE_WS:
+			continue;
+		case UNICODE_BIDI_TYPE_S:
+		case UNICODE_BIDI_TYPE_B:
+			return 1;
+		default:
+			return 0;
+		}
+	}
+	return atend;
+}
+
+/*
+** Emit what info asks for: first an RLM (odd level) or an LRM (even
+** level) if a marker is needed, then a PDI if one is needed. Each one gets
+** passed to emit as a single character.
+*/
+static void emit_marker(struct bidi_embed_levelrun *p,
+			struct need_marker_info *info,
+			void (*emit)(const char32_t *string,
+				     size_t n,
+				     void *arg),
+			void *arg)
+{
+	char32_t ch;
+
+	if (info->need_marker)
+	{
+		ch=(p->level & 1) ? RLM:LRM;
+		(*emit)(&ch, 1, arg);
+	}
+
+	if (info->need_pdi)
+	{
+		ch=PDI;
+		(*emit)(&ch, 1, arg);
+	}
+}
+
+/*
+** Emit the string together with directional markers that reflect the
+** given embedding levels.
+**
+** string and levels have n elements each. Only the low bit of each
+** level, and of paragraph_level, is compared to decide the direction of a
+** sequence. The output gets passed to emit, piece by piece, together with
+** arg.
+**
+** An empty string emits nothing. Calls abort() if memory cannot be
+** allocated.
+*/
+void unicode_bidi_embed(const char32_t *string,
+			const unicode_bidi_level_t *levels,
+			size_t n,
+			unicode_bidi_level_t paragraph_level,
+			void (*emit)(const char32_t *string,
+				     size_t n,
+				     void *arg),
+			void *arg)
+{
+	struct bidi_embed_levelrun *runs=0;
+	enum_bidi_type_t *classes;
+
+	/*
+	** Nothing to classify or emit. This also avoids mistaking a null
+	** pointer from a zero-sized calloc() for an allocation failure.
+	*/
+	if (n == 0)
+		return;
+
+	classes=(enum_bidi_type_t *)calloc(n, sizeof(enum_bidi_type_t));
+
+	if (!classes)
+		abort();
+
+	for (size_t i=0; i<n; ++i)
+		classes[i]=unicode_bidi_type(string[i]);
+
+	compute_bidi_embed_levelruns(string, levels,
+				     n,
+				     &runs);
+
+	/*
+	** Go through the sequences of consecutive characters with the
+	** same embedding level. Keep track of the preceding and the
+	** next embedding level, which is usually the opposite from the
+	** current sequence's embedding level. Except that the first and
+	** the last sequence of characters, in the string, are bound to
+	** the paragraph_level, which may be the same.
+	**
+	** NOTE(review): previous_level only ever gets assigned
+	** paragraph_level in this function, confirm that it is not
+	** meant to follow each emitted sequence's level.
+	*/
+
+	unicode_bidi_level_t previous_level=paragraph_level;
+
+	while (runs)
+	{
+		struct bidi_embed_levelrun *p=runs;
+
+		runs=runs->next;
+
+		unicode_bidi_level_t next_level=paragraph_level;
+
+		if (runs)
+			next_level=runs->level;
+
+#ifdef BIDI_DEBUG
+		fprintf(DEBUGDUMP, " Range %d-%d, level %d\n",
+			(int)p->start, (int)(p->end-1), p->level);
+#endif
+
+		if (((p->level ^ paragraph_level) & 1) == 0)
+		{
+			/*
+			** Sequence in the same direction as the paragraph
+			** embedding level.
+			**
+			** We'll definitely need a directional marker if
+			** rule L1 applies after this sequence.
+			*/
+
+			struct need_marker_info need_marker;
+
+			need_marker_info_init(&need_marker);
+
+			if (classes[p->end-1] == UNICODE_BIDI_TYPE_WS)
+			{
+				need_marker.need_marker=
+					is_l1_on_or_after(classes, n,
+							  p->end,
+							  0);
+#ifdef BIDI_DEBUG
+				fprintf(DEBUGDUMP, " need marker=%d\n",
+					need_marker.need_marker);
+#endif
+
+			}
+
+			emit_bidi_embed_levelrun(string, classes,
+						 p, paragraph_level,
+						 previous_level,
+						 next_level,
+						 &need_marker,
+						 emit, arg);
+
+			emit_marker(p, &need_marker, emit, arg);
+		}
+		else
+		{
+			struct need_marker_info need_marker;
+			size_t orig_end=p->end;
+
+			/*
+			** Sequence in the opposite direction. Because S and
+			** B reset to the paragraph level, no matter what,
+			** if we want things to render like that we will need
+			** to emit sequences on each side of S/B in reverse
+			** order. We start at the end of this sequence, then
+			** search towards the beginning, emit that sequence,
+			** emit the S and B, then go to the next sequence.
+			*/
+
+			need_marker_info_init(&need_marker);
+
+#ifdef BIDI_DEBUG
+			/* %d needs the int flag, not the whole structure */
+			fprintf(DEBUGDUMP, " need marker=%d\n",
+				need_marker.need_marker);
+#endif
+
+			while (p->start < p->end)
{
- --right;
+				size_t j=p->end;
- if (p)
+				int end_with_ws=
+					classes[j-1] == UNICODE_BIDI_TYPE_WS;
+				while (j > p->start)
{
- char32_t c=p[left];
- unicode_bidi_level_t l=levels[left];
+					--j;
- p[left]=p[right];
- levels[left]=levels[right];
- p[right]=c;
- levels[right]=l;
+					enum_bidi_type_t t=classes[j];
+
+					if (t == UNICODE_BIDI_TYPE_S ||
+					    t == UNICODE_BIDI_TYPE_B)
+					{
+						++j;
+						break;
+					}
+				}
+
+				if (j == p->end) /* Must be lone break */
+				{
+#ifdef BIDI_DEBUG
+					fprintf(DEBUGDUMP,
+						" break: %d\n",
+						(int)j);
+#endif
+					--p->end;
+
+					previous_level=paragraph_level;
+
+					(*emit)(string+p->end, 1, arg);
+					continue;
+				}
+
+				struct need_marker_info need_marker_partial;
+
+				need_marker_info_init(&need_marker_partial);
+
+				/*
+				** Rule L1, there's going to be an S or a B
+				** after we emit this sequence.
+				*/
+
+				if (j != p->start)
+					need_marker_partial.need_marker=1;
+
+				/*
+				** To emit this sequence, we monkey-patch
+				** the run level to indicate the sub-
+				** sequence to emit.
+				*/
+				size_t i=p->start;
+
+				p->start=j;
+
+				emit_bidi_embed_levelrun
+					(string, classes, p, paragraph_level,
+					 previous_level,
+
+					 j == i
+					 /* No more, this is next */
+					 ? next_level
+					 /* We'll emit a paragraph brk */
+					 : paragraph_level,
+					 &need_marker_partial,
+					 emit, arg);
+
+				/* Continue monkey-patching. */
+
+				p->end=p->start;
+				p->start=i;
+
+				if (p->start == p->end)
+					/* Do it below */
+				{
+					if (end_with_ws)
+						need_marker.need_marker=
+							is_l1_on_or_after
+							(classes, n,
+							 orig_end,
+							 0);
+					need_marker_info_merge
+						(&need_marker,
+						 &need_marker_partial);
+				}
+				else
+				{
+					emit_marker(p, &need_marker_partial,
+						    emit, arg);
}
- ++left;
}
+			emit_marker(p, &need_marker, emit, arg);
+		}
+		free(p);
+	}
+	free(classes);
+}
+
+/*
+** ADJUST_LR(t,e): modifies t in place. AL becomes R. ET, ES, AN and EN
+** become L. Any other class is left alone.
+**
+** NOTE(review): the "e" parameter is not referenced by the expansion,
+** confirm whether the number classes were meant to become "e" instead
+** of always L.
+*/
+#define ADJUST_LR(t,e) do { \
+ switch (t) { \
+ case UNICODE_BIDI_TYPE_AL: \
+ (t)=UNICODE_BIDI_TYPE_R; \
+ break; \
+ case UNICODE_BIDI_TYPE_ET: \
+ case UNICODE_BIDI_TYPE_ES: \
+ case UNICODE_BIDI_TYPE_AN: \
+ case UNICODE_BIDI_TYPE_EN: \
+ (t)=UNICODE_BIDI_TYPE_L; \
+ break; \
+ default: \
+ break; \
+ } \
+ } while (0)
+
+/*
+** ADJUST_LRSTRONG(t): modifies t in place. AL becomes R, every other
+** class is left alone. The AL case deliberately has no break of its own,
+** it falls into the default case's break.
+*/
+#define ADJUST_LRSTRONG(t) do { \
+ switch (t) { \
+ case UNICODE_BIDI_TYPE_AL: \
+ (t)=UNICODE_BIDI_TYPE_R; \
+ default: \
+ break; \
+ } \
+ } while (0)
+
+/*
+** Emit the characters in [run->start, run->end), adding directional
+** formatting characters where the code below decides they're needed.
+**
+** - Nothing happens if the run is empty.
+** - classes in that range get modified by unicode_bidi_w().
+** - An RLI or an LRI gets emitted first if run->level differs from
+** paragraph_level, and need_marker->need_pdi gets set in that case.
+** - An RLM or an LRM gets emitted if the first character's adjusted class
+** is the opposite of the run's direction.
+** - A "word" that contains a character of the opposite class gets wrapped
+** in RLO or LRO, and PDF.
+** - need_marker->need_marker gets set if next_level differs from
+** run->level and the last character's class, with AL treated as R, is
+** not the run's direction.
+**
+** This function does not emit what need_marker asks for, the caller is
+** expected to do that.
+*/
+static void emit_bidi_embed_levelrun(const char32_t *string,
+ enum_bidi_type_t *classes,
+ struct bidi_embed_levelrun *run,
+ unicode_bidi_level_t paragraph_level,
+ unicode_bidi_level_t previous_level,
+ unicode_bidi_level_t next_level,
+ struct need_marker_info *need_marker,
+ void (*emit)(const char32_t *string,
+ size_t n,
+ void *arg),
+ void *arg)
+{
+ /*
+ ** Our first order of business will be to apply rules W to this
+ ** sequence, to resolve weak types.
+ **
+ ** It's easy to simulate what unicode_bidi_w() expects: a sequence
+ ** with a single level run that covers this run, whose sos and eos
+ ** are the run's own direction.
+ */
+
+ struct level_run lrun;
+ struct isolating_run_sequence_s seq;
+ enum_bidi_type_t e_type=E_CLASS(run->level);
+ enum_bidi_type_t o_type=O_CLASS(run->level);
+
+ if (run->start == run->end)
+ return;
+
+ memset(&seq, 0, sizeof(seq));
+
+ seq.embedding_level=run->level;
+ seq.sos=seq.eos=e_type;
+ seq.runs.runs=&lrun;
+ seq.runs.n_level_runs=1;
+ seq.runs.cap_level_runs=1;
+ lrun.start=run->start;
+ lrun.end=run->end;
+ unicode_bidi_w(classes, &seq);
+
+ /*
+ ** Peek at the first character's class.
+ **
+ ** If the previous sequence's embedding level was the same, it
+ ** guarantees the persistence of the embedding direction. We can
+ ** accept classes that default to our embedding level.
+ **
+ ** Otherwise we recognize only strong classes.
+ */
+ enum_bidi_type_t t=classes[run->start];
+
+ if (previous_level == run->level)
+ {
+ ADJUST_LR(t, E_CLASS(previous_level));
+ }
+ else
+ {
+ ADJUST_LRSTRONG(t);
+ }
+
+ /*
+ ** Sequences in the opposite direction always get isolated.
+ */
+ char32_t override_start=run->level ? RLI:LRI;
+
+ if (run->level != paragraph_level)
+ (*emit)(&override_start, 1, arg);
+
+ /*
+ ** Make sure the character sequence has strong context.
+ */
+ if (t == o_type)
+ {
+ /*
+ ** This local structure shadows the need_marker parameter,
+ ** it is used only to emit a single marker right here.
+ */
+ struct need_marker_info need_marker;
+
+ need_marker_info_init(&need_marker);
+
+ need_marker.need_marker=1;
+
+ emit_marker(run, &need_marker, emit, arg);
+ }
+
+ /* From here on override_start is the override, not the isolate */
+ override_start=run->level ? RLO:LRO;
+ char32_t override_end=PDF;
+
+ size_t start=run->start;
+ size_t end=run->end;
+
+ while (start < end)
+ {
+ size_t i=start;
+ size_t word_start=i;
+
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP,
+ " examining, starting at: %d\n", (int)i);
+#endif
+
+ /*
+ ** Look for the next character with the opposite class.
+ ** While doing that, keep an eye out on any WS or ONs,
+ ** which will tell us where the most recent "word" starts,
+ ** before this character.
+ */
+ while (i < end)
+ {
+ enum_bidi_type_t t=classes[i];
+
+ ADJUST_LR(t, e_type);
+
+ if (t == o_type)
+ break;
+
+ switch (t) {
+ case UNICODE_BIDI_TYPE_WS:
+ case UNICODE_BIDI_TYPE_ON:
+ word_start=i+1;
+ break;
+ default:
+ break;
+ }
+
+ ++i;
+ }
+
+ if (i < end)
+ {
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP,
+ " override needed: %d,"
+ " start of word at %d, ",
+ (int)i, (int)word_start);
+#endif
+ /*
+ ** Found something to override. First, emit everything
+ ** up to the start of this "word".
+ **
+ ** Then emit the RLO or LRO, then look for the end
+ ** of the "word", and drop the PDF there.
+ */
+ if (word_start > start)
+ (*emit)(string+start,
+ word_start-start, arg);
+
+ (*emit)(&override_start, 1, arg);
+ while (++i < end)
+ {
+ enum_bidi_type_t t=classes[i];
- if (end-start > 1 && reorder_callback)
- (*reorder_callback)(start, end-start, arg);
+ switch (t) {
+ case UNICODE_BIDI_TYPE_WS:
+ case UNICODE_BIDI_TYPE_ON:
+ break;
+ default:
+ continue;
+ }
+ break;
+ }
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP, "end of word at %d\n",
+ (int)i);
+#endif
+ (*emit)(string+word_start, i-word_start, arg);
+ (*emit)(&override_end, 1, arg);
+ start=i;
+ continue;
}
+ /* No opposite class found, emit the rest of the run as is */
+ (*emit)(string+start, i-start, arg);
+ start=i;
}
- level_run_layers_deinit(&layers);
+ /*
+ ** Make sure that if a different embedding level follows we will
+ ** emit a marker, to ensure strong context.
+ */
+ t=classes[run->end-1];
+
+ if (next_level != run->level)
+ {
+ ADJUST_LRSTRONG(t);
+
+ if (e_type != t)
+ need_marker->need_marker=1;
+ }
+
+ /* The isolate emitted at the beginning needs to be closed */
+ if (run->level != paragraph_level)
+ need_marker->need_pdi=1;
+}
+
+/*
+** Callback argument for get_enum_bidi_type_for_embedding_paragraph_level(),
+** which looks up the bidi class of a character in str.
+*/
+struct compute_paragraph_embedding_level_char_info {
+ /* The string whose characters get classified */
+ const char32_t *str;
+};
+
+/*
+** Callback for compute_paragraph_embedding_level(): returns the bidi class
+** of the i-th character of the string that arg, a pointer to a
+** compute_paragraph_embedding_level_char_info, refers to.
+*/
+static enum_bidi_type_t
+get_enum_bidi_type_for_embedding_paragraph_level(size_t i,
+						 void *arg)
+{
+	const char32_t *str=
+		((struct compute_paragraph_embedding_level_char_info *)
+		 arg)->str;
+
+	return unicode_bidi_type(str[i]);
+}
+
+/*
+** Determine whether a directional marker must precede str in order for
+** its computed paragraph embedding level to match paragraph_level.
+**
+** Returns 0 if the paragraph embedding level computed from the first n
+** characters of str already has the same direction as paragraph_level.
+** Otherwise returns RLM for an odd paragraph_level and LRM for an even one.
+*/
+char32_t unicode_bidi_embed_paragraph_level(const char32_t *str,
+					    size_t n,
+					    unicode_bidi_level_t paragraph_level
+					    )
+{
+	struct compute_paragraph_embedding_level_char_info info;
+	info.str=str;
+
+	/*
+	** Only the direction, the low bit, matters. Comparing the whole
+	** paragraph_level would ask for a marker for any level above 1,
+	** even when its direction already matches.
+	*/
+	if (((compute_paragraph_embedding_level
+	      (0, n,
+	       get_enum_bidi_type_for_embedding_paragraph_level,
+	       &info) ^ paragraph_level) & 1) == 0)
+		return 0;
+
+	return (paragraph_level & 1) ? RLM:LRM;
+}
diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C
index ca139cc..04d2893 100644
--- a/unicode/unicodecpp.C
+++ b/unicode/unicodecpp.C
@@ -1,5 +1,5 @@
/*
-** Copyright 2011-2014 Double Precision, Inc.
+** Copyright 2011-2020 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
@@ -596,7 +596,8 @@ extern "C" {
void *arg)
{
auto p=reinterpret_cast<const std::function<void (size_t,
- size_t)> *>
+ size_t)
+ noexcept> *>
(arg);
(*p)(i, cnt);
@@ -605,7 +606,8 @@ extern "C" {
int unicode::bidi_reorder(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t, size_t)> &lambda)
+ const std::function<void (size_t, size_t)
+ noexcept> &lambda)
{
size_t s=string.size();
@@ -624,7 +626,8 @@ int unicode::bidi_reorder(std::u32string &string,
}
void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
- const std::function<void (size_t, size_t)> &lambda)
+ const std::function<void (size_t, size_t)
+ noexcept> &lambda)
{
size_t s=levels.size();
@@ -636,3 +639,189 @@ void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
(reinterpret_cast<const void *>(&lambda)));
}
+
+extern "C" {
+	// C-linkage trampoline: arg is the address of the std::function
+	// that the C++ wrappers pass to the C library, invoke it with the
+	// index reported by the library.
+	static void removed_callback(size_t i,
+				     void *arg)
+	{
+		const auto *callback=
+			reinterpret_cast<const std::function<void (size_t)
+							     noexcept> *>
+			(arg);
+
+		(*callback)(i);
+	}
+}
+
+// Run unicode_bidi_cleanup() on the string, without embedding levels, and
+// shrink the string to the size it returns. The lambda gets invoked, via
+// removed_callback, for each removed character. An empty string is left
+// alone.
+void unicode::bidi_cleanup(std::u32string &string,
+			   const std::function<void (size_t) noexcept> &lambda)
+{
+	if (string.empty())
+		return;
+
+	const void *callback_arg=reinterpret_cast<const void *>(&lambda);
+
+	size_t new_size=unicode_bidi_cleanup(&string[0],
+					     0,
+					     string.size(),
+					     removed_callback,
+					     const_cast<void *>(callback_arg));
+
+	string.resize(new_size);
+}
+
+// Run unicode_bidi_cleanup() on the string and its embedding levels, then
+// shrink both to the size it returns. The lambda gets invoked, via
+// removed_callback, for each removed character.
+//
+// Returns -1, without changing anything, if the string and the levels are
+// not of the same size, and 0 otherwise.
+int unicode::bidi_cleanup(std::u32string &string,
+			  std::vector<unicode_bidi_level_t> &levels,
+			  const std::function<void (size_t) noexcept> &lambda)
+{
+	if (levels.size() != string.size())
+		return -1;
+
+	// Nothing to clean up, and &levels[0] is not valid for an empty
+	// vector.
+	if (string.empty())
+		return 0;
+
+	size_t n=unicode_bidi_cleanup(&string[0],
+				      &levels[0],
+				      string.size(),
+				      removed_callback,
+				      const_cast<void *>
+				      (reinterpret_cast<const void *>
+				       (&lambda)));
+
+	string.resize(n);
+	levels.resize(n);
+	return 0;
+}
+
+
+// Run unicode_bidi_extra_cleanup() on the string, without embedding
+// levels, and shrink the string to the size it returns. The lambda gets
+// invoked, via removed_callback, for each removed character. An empty
+// string is left alone.
+void unicode::bidi_extra_cleanup(std::u32string &string,
+				 const std::function<void (size_t) noexcept>
+				 &lambda)
+{
+	if (string.empty())
+		return;
+
+	const void *callback_arg=reinterpret_cast<const void *>(&lambda);
+
+	size_t new_size=
+		unicode_bidi_extra_cleanup(&string[0],
+					   0,
+					   string.size(),
+					   removed_callback,
+					   const_cast<void *>(callback_arg));
+
+	string.resize(new_size);
+}
+
+// Run unicode_bidi_extra_cleanup() on the string and its embedding levels,
+// then shrink both to the size it returns. The lambda gets invoked, via
+// removed_callback, for each removed character.
+//
+// Returns -1, without changing anything, if the string and the levels are
+// not of the same size, and 0 otherwise.
+int unicode::bidi_extra_cleanup(std::u32string &string,
+				std::vector<unicode_bidi_level_t> &levels,
+				const std::function<void (size_t) noexcept>
+				&lambda)
+{
+	if (levels.size() != string.size())
+		return -1;
+
+	// Nothing to clean up, and &levels[0] is not valid for an empty
+	// vector.
+	if (string.empty())
+		return 0;
+
+	size_t n=unicode_bidi_extra_cleanup(&string[0],
+					    &levels[0],
+					    string.size(),
+					    removed_callback,
+					    const_cast<void *>
+					    (reinterpret_cast<const void *>
+					     (&lambda)));
+
+	string.resize(n);
+	levels.resize(n);
+	return 0;
+}
+
+// Run unicode_bidi_logical_order() on the string and its embedding levels.
+// The lambda gets invoked, via reorder_callback, for each reversed range.
+//
+// Returns -1 if the string and the levels are not of the same size, and 0
+// otherwise. An empty string is a no-op that returns 0.
+int unicode::bidi_logical_order(std::u32string &string,
+				std::vector<unicode_bidi_level_t> &levels,
+				unicode_bidi_level_t paragraph_embedding,
+				const std::function<void (size_t, size_t)
+				noexcept> &lambda)
+{
+	if (string.size() != levels.size())
+		return -1;
+
+	if (!string.empty())
+	{
+		const void *callback_arg=
+			reinterpret_cast<const void *>(&lambda);
+
+		unicode_bidi_logical_order(&string[0], &levels[0],
+					   string.size(),
+					   paragraph_embedding,
+					   &reorder_callback,
+					   const_cast<void *>(callback_arg));
+	}
+	return 0;
+}
+
+// Run unicode_bidi_logical_order() with a null string pointer, passing
+// only the embedding levels. The lambda gets invoked, via
+// reorder_callback, for each reversed range. Empty levels are a no-op.
+void unicode::bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,
+				 unicode_bidi_level_t paragraph_embedding,
+				 const std::function<void (size_t, size_t)
+				 noexcept> &lambda)
+{
+	if (levels.empty())
+		return;
+
+	const void *callback_arg=reinterpret_cast<const void *>(&lambda);
+
+	unicode_bidi_logical_order(nullptr, &levels[0], levels.size(),
+				   paragraph_embedding,
+				   &reorder_callback,
+				   const_cast<void *>(callback_arg));
+}
+
+extern "C" {
+	// C-linkage trampoline: arg is the address of the std::function
+	// that unicode::bidi_embed() passes to the C library, forward the
+	// emitted characters to it.
+	static void embed_callback(const char32_t *string,
+				   size_t n,
+				   void *arg)
+	{
+		const auto *callback=
+			reinterpret_cast<const std::function<void
+							     (const char32_t *,
+							      size_t n)
+							     noexcept> *>(arg);
+
+		(*callback)(string, n);
+	}
+}
+
+// Run unicode_bidi_embed() on the string and its embedding levels. The
+// lambda gets invoked, via embed_callback, with each emitted piece.
+//
+// Returns -1 if the string and the levels are not of the same size, and 0
+// otherwise. An empty string emits nothing and returns 0.
+int unicode::bidi_embed(const std::u32string &string,
+			const std::vector<unicode_bidi_level_t> &levels,
+			unicode_bidi_level_t paragraph_embedding,
+			const std::function<void (const char32_t *string,
+						  size_t n) noexcept>
+			&lambda)
+{
+	if (string.size() != levels.size())
+		return -1;
+
+	if (!string.empty())
+	{
+		const void *callback_arg=
+			reinterpret_cast<const void *>(&lambda);
+
+		unicode_bidi_embed(&string[0], &levels[0], string.size(),
+				   paragraph_embedding,
+				   embed_callback,
+				   const_cast<void *>(callback_arg));
+	}
+	return 0;
+}
+
+// Convenience overload: collect everything that the callback flavor of
+// bidi_embed() emits into a new string, and return it. Its error
+// indication is ignored, so mismatched sizes produce an empty string.
+std::u32string unicode::bidi_embed(const std::u32string &string,
+				   const std::vector<unicode_bidi_level_t
+				   > &levels,
+				   unicode_bidi_level_t paragraph_embedding)
+{
+	std::u32string embedded;
+
+	(void)bidi_embed(string, levels, paragraph_embedding,
+			 [&embedded]
+			 (const char32_t *chunk,
+			  size_t chunk_size)
+			 {
+				 embedded.append(chunk, chunk_size);
+			 });
+
+	return embedded;
+}
+
+// Thin wrapper: pass the string's characters and size, and the level, to
+// unicode_bidi_embed_paragraph_level() and return what it returns.
+char32_t unicode::bidi_embed_paragraph_level(const std::u32string &string,
+					     unicode_bidi_level_t level)
+{
+	const char32_t *chars=string.c_str();
+	size_t n_chars=string.size();
+
+	return unicode_bidi_embed_paragraph_level(chars, n_chars, level);
+}