summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSam Varshavchik2020-07-09 21:36:46 -0400
committerSam Varshavchik2020-07-12 15:56:45 -0400
commit7a9293cd28b293b793793368237d8856cfb0eff4 (patch)
tree3c19854a7869103405c78a97e40503db64fac7b6
parent2219f725acd0dc36fa00080c846a8982273a6f61 (diff)
downloadcourier-libs-7a9293cd28b293b793793368237d8856cfb0eff4.tar.bz2
Documentation, C++ bindings, reorder.
-rw-r--r--unicode/Makefile.am86
-rw-r--r--unicode/README4
-rw-r--r--unicode/biditest.C116
-rw-r--r--unicode/book.xml961
-rw-r--r--unicode/courier-unicode.h.in30
-rw-r--r--unicode/unicode_bidi.c231
-rw-r--r--unicode/unicodecpp.C76
7 files changed, 1111 insertions, 393 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index 397987c..081965e 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -85,7 +85,87 @@ include_HEADERS=courier-unicode.h \
courier-unicode-categories-tab.h \
courier-unicode-script-tab.h
-man_MANS=$(srcdir)/man/courier-unicode.7 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert.3 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert_tocase.3 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]fromu.3 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]tou.3 $(srcdir)/man/unicode[\:][\:]iso_8859_1.3 $(srcdir)/man/unicode[\:][\:]linebreak_callback_base.3 $(srcdir)/man/unicode[\:][\:]linebreak_callback_save_buf.3 $(srcdir)/man/unicode[\:][\:]linebreak_iter.3 $(srcdir)/man/unicode[\:][\:]linebreakc_callback_base.3 $(srcdir)/man/unicode[\:][\:]linebreakc_iter.3 $(srcdir)/man/unicode[\:][\:]tolower.3 $(srcdir)/man/unicode[\:][\:]toupper.3 $(srcdir)/man/unicode[\:][\:]ucs_2.3 $(srcdir)/man/unicode[\:][\:]ucs_4.3 $(srcdir)/man/unicode[\:][\:]utf_8.3 $(srcdir)/man/unicode[\:][\:]wordbreak_callback_base.3 $(srcdir)/man/unicode_category_lookup.3 $(srcdir)/man/unicode_convert.3 $(srcdir)/man/unicode_convert_deinit.3 $(srcdir)/man/unicode_convert_fromu_init.3 $(srcdir)/man/unicode_convert_fromu_tobuf.3 $(srcdir)/man/unicode_convert_fromutf8.3 $(srcdir)/man/unicode_convert_init.3 $(srcdir)/man/unicode_convert_tobuf.3 $(srcdir)/man/unicode_convert_tocase.3 $(srcdir)/man/unicode_convert_tocbuf_fromutf8_init.3 $(srcdir)/man/unicode_convert_tocbuf_init.3 $(srcdir)/man/unicode_convert_tocbuf_toutf8_init.3 $(srcdir)/man/unicode_convert_tou_init.3 $(srcdir)/man/unicode_convert_tou_tobuf.3 $(srcdir)/man/unicode_convert_toutf8.3 $(srcdir)/man/unicode_convert_uc.3 $(srcdir)/man/unicode_default_chset.3 $(srcdir)/man/unicode_grapheme_break.3 $(srcdir)/man/unicode_html40ent_lookup.3 $(srcdir)/man/unicode_isalnum.3 $(srcdir)/man/unicode_isalpha.3 $(srcdir)/man/unicode_isblank.3 $(srcdir)/man/unicode_isdigit.3 $(srcdir)/man/unicode_isgraph.3 $(srcdir)/man/unicode_islower.3 $(srcdir)/man/unicode_ispunct.3 $(srcdir)/man/unicode_isspace.3 $(srcdir)/man/unicode_isupper.3 $(srcdir)/man/unicode_lb_end.3 $(srcdir)/man/unicode_lb_init.3 $(srcdir)/man/unicode_lb_next.3 
$(srcdir)/man/unicode_lb_next_cnt.3 $(srcdir)/man/unicode_lb_set_opts.3 $(srcdir)/man/unicode_lbc_end.3 $(srcdir)/man/unicode_lbc_init.3 $(srcdir)/man/unicode_lbc_next.3 $(srcdir)/man/unicode_lbc_next_cnt.3 $(srcdir)/man/unicode_lbc_set_opts.3 $(srcdir)/man/unicode_lc.3 $(srcdir)/man/unicode_locale_chset.3 $(srcdir)/man/unicode_script.3 $(srcdir)/man/unicode_tc.3 $(srcdir)/man/unicode_u_ucs2_native.3 $(srcdir)/man/unicode_u_ucs4_native.3 $(srcdir)/man/unicode_uc.3 $(srcdir)/man/unicode_wb_end.3 $(srcdir)/man/unicode_wb_init.3 $(srcdir)/man/unicode_wb_next.3 $(srcdir)/man/unicode_wb_next_cnt.3 $(srcdir)/man/unicode_wbscan_end.3 $(srcdir)/man/unicode_wbscan_init.3 $(srcdir)/man/unicode_wbscan_next.3
+man_MANS= \
+ $(srcdir)/man/courier-unicode.7 \
+ $(srcdir)/man/unicode\:\:bidi_calc.3 \
+ $(srcdir)/man/unicode\:\:bidi_reorder.3 \
+ $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \
+ $(srcdir)/man/unicode\:\:iconvert\:\:convert_tocase.3 \
+ $(srcdir)/man/unicode\:\:iconvert\:\:fromu.3 \
+ $(srcdir)/man/unicode\:\:iconvert\:\:tou.3 \
+ $(srcdir)/man/unicode\:\:iso_8859_1.3 \
+ $(srcdir)/man/unicode\:\:linebreak_callback_base.3 \
+ $(srcdir)/man/unicode\:\:linebreak_callback_save_buf.3 \
+ $(srcdir)/man/unicode\:\:linebreak_iter.3 \
+ $(srcdir)/man/unicode\:\:linebreakc_callback_base.3 \
+ $(srcdir)/man/unicode\:\:linebreakc_iter.3 \
+ $(srcdir)/man/unicode\:\:tolower.3 \
+ $(srcdir)/man/unicode\:\:toupper.3 \
+ $(srcdir)/man/unicode\:\:ucs_2.3 \
+ $(srcdir)/man/unicode\:\:ucs_4.3 \
+ $(srcdir)/man/unicode\:\:utf_8.3 \
+ $(srcdir)/man/unicode\:\:wordbreak_callback_base.3 \
+ $(srcdir)/man/unicode_bidi.3 \
+ $(srcdir)/man/unicode_bidi_bracket_type.3 \
+ $(srcdir)/man/unicode_bidi_calc.3 \
+ $(srcdir)/man/unicode_bidi_mirror.3 \
+ $(srcdir)/man/unicode_bidi_reorder.3 \
+ $(srcdir)/man/unicode_category_lookup.3 \
+ $(srcdir)/man/unicode_convert.3 \
+ $(srcdir)/man/unicode_convert_deinit.3 \
+ $(srcdir)/man/unicode_convert_fromu_init.3 \
+ $(srcdir)/man/unicode_convert_fromu_tobuf.3 \
+ $(srcdir)/man/unicode_convert_fromutf8.3 \
+ $(srcdir)/man/unicode_convert_init.3 \
+ $(srcdir)/man/unicode_convert_tobuf.3 \
+ $(srcdir)/man/unicode_convert_tocase.3 \
+ $(srcdir)/man/unicode_convert_tocbuf_fromutf8_init.3 \
+ $(srcdir)/man/unicode_convert_tocbuf_init.3 \
+ $(srcdir)/man/unicode_convert_tocbuf_toutf8_init.3 \
+ $(srcdir)/man/unicode_convert_tou_init.3 \
+ $(srcdir)/man/unicode_convert_tou_tobuf.3 \
+ $(srcdir)/man/unicode_convert_toutf8.3 \
+ $(srcdir)/man/unicode_convert_uc.3 \
+ $(srcdir)/man/unicode_default_chset.3 \
+ $(srcdir)/man/unicode_grapheme_break.3 \
+ $(srcdir)/man/unicode_grapheme_break_deinit.3 \
+ $(srcdir)/man/unicode_grapheme_break_init.3 \
+ $(srcdir)/man/unicode_grapheme_break_next.3 \
+ $(srcdir)/man/unicode_html40ent_lookup.3 \
+ $(srcdir)/man/unicode_isalnum.3 \
+ $(srcdir)/man/unicode_isalpha.3 \
+ $(srcdir)/man/unicode_isblank.3 \
+ $(srcdir)/man/unicode_isdigit.3 \
+ $(srcdir)/man/unicode_isgraph.3 \
+ $(srcdir)/man/unicode_islower.3 \
+ $(srcdir)/man/unicode_ispunct.3 \
+ $(srcdir)/man/unicode_isspace.3 \
+ $(srcdir)/man/unicode_isupper.3 \
+ $(srcdir)/man/unicode_lb_end.3 \
+ $(srcdir)/man/unicode_lb_init.3 \
+ $(srcdir)/man/unicode_lb_next.3 \
+ $(srcdir)/man/unicode_lb_next_cnt.3 \
+ $(srcdir)/man/unicode_lb_set_opts.3 \
+ $(srcdir)/man/unicode_lbc_end.3 \
+ $(srcdir)/man/unicode_lbc_init.3 \
+ $(srcdir)/man/unicode_lbc_next.3 \
+ $(srcdir)/man/unicode_lbc_next_cnt.3 \
+ $(srcdir)/man/unicode_lbc_set_opts.3 \
+ $(srcdir)/man/unicode_lc.3 \
+ $(srcdir)/man/unicode_line_break.3 \
+ $(srcdir)/man/unicode_locale_chset.3 \
+ $(srcdir)/man/unicode_script.3 \
+ $(srcdir)/man/unicode_tc.3 \
+ $(srcdir)/man/unicode_u_ucs2_native.3 \
+ $(srcdir)/man/unicode_u_ucs4_native.3 \
+ $(srcdir)/man/unicode_uc.3 \
+ $(srcdir)/man/unicode_wb_end.3 \
+ $(srcdir)/man/unicode_wb_init.3 \
+ $(srcdir)/man/unicode_wb_next.3 \
+ $(srcdir)/man/unicode_wb_next_cnt.3 \
+ $(srcdir)/man/unicode_wbscan_end.3 \
+ $(srcdir)/man/unicode_wbscan_init.3 \
+ $(srcdir)/man/unicode_wbscan_next.3
libcourier_unicode_la_SOURCES=\
courier-unicode-categories-tab.h \
@@ -329,7 +409,7 @@ docs.stamp:
rm -f man/*.[123456789]
mv man.tmp/* man
rm -rf html.tmp man.tmp
- perl -p -e 's/:/[\\:]/g if s@^man_MANS=.*@"man_MANS=" . join(" ", map { "\$$(srcdir)/$$_" } glob("man/*.[123456789]"))@e' Makefile.am >Makefile.am.new
+ perl -e '$$f=join("",<STDIN>); $$p=join("", map { " \\\n \$$(srcdir)/$$_" } glob("man/*.[123456789]")); $$p=~s/:/\\:/g; $$f =~ s/\nman_MANS=([^\n]|\n[^\n])*/\nman_MANS=$$p/s; print $$f' <Makefile.am >Makefile.am.new
cmp Makefile.am Makefile.am.new || mv -f Makefile.am.new Makefile.am; rm -f Makefile.am.new
touch docs.stamp
@@ -405,4 +485,4 @@ distrelease:
$(MAKE) dist
www:
- rsync -a html/. $$HOME/www/www.courier-mta.org/unicode
+ rsync -a html/. $$HOME/www/hostrocket/courier-mta.org/unicode
diff --git a/unicode/README b/unicode/README
index 2aeb1f0..926e004 100644
--- a/unicode/README
+++ b/unicode/README
@@ -25,6 +25,8 @@ Courier Unicode Library
* Implementation of line breaking rules.
+ * Implementation of the bi-directional algorithm.
+
* Several ancillary functions, like looking up the unicode character
that corresponds to some HTML 4.0 entity (such as “&amp;”, for
example), and determining the normal width or a double-width status of
@@ -40,7 +42,7 @@ Courier Unicode Library
Current status
The current release of the Courier Unicode library is based on the Unicode
- 8.0.0 standard.
+ 13.0.0 standard.
--------------------------------------------------------------------------
diff --git a/unicode/biditest.C b/unicode/biditest.C
index c58da0d..61841a1 100644
--- a/unicode/biditest.C
+++ b/unicode/biditest.C
@@ -5,7 +5,9 @@
#include <sstream>
#include <string>
#include <algorithm>
+#include <utility>
#include <iomanip>
+#include <numeric>
std::vector<std::string> testcase;
@@ -43,6 +45,8 @@ int main(int argc, char **argv)
std::vector<unicode_bidi_level_t> expected_levels;
+ std::vector<size_t> expected_reorder;
+
while (1)
{
buf.clear();
@@ -99,6 +103,28 @@ int main(int argc, char **argv)
continue;
}
+
+
+ if (buf.substr(0, 9) == "@Reorder:")
+ {
+ expected_reorder.clear();
+
+ std::istringstream i(buf);
+
+ std::string word;
+
+ i >> word;
+
+ size_t n;
+
+ while (i >> n)
+ {
+ expected_reorder.push_back(n);
+ }
+ continue;
+ }
+
+
if (buf.substr(0, 1) == "@")
continue;
@@ -138,10 +164,9 @@ int main(int argc, char **argv)
std::vector<unicode_bidi_level_t> actual_levels;
- std::vector<char32_t> dummy_input;
+ std::u32string dummy_input;
dummy_input.resize(testcase.size());
- actual_levels.resize(testcase.size());
static const unicode_bidi_level_t level_0=0;
static const unicode_bidi_level_t level_1=1;
@@ -153,9 +178,9 @@ int main(int argc, char **argv)
{
if (n & 1)
{
- unicode_bidi_calc(&dummy_input[0],
- testcase.size(),
- &actual_levels[0], level);
+ actual_levels=level ?
+ unicode::bidi_calc(dummy_input,*level)
+ : unicode::bidi_calc(dummy_input);
int matched=0;
@@ -220,6 +245,87 @@ int main(int argc, char **argv)
std::cerr << std::endl;
exit(1);
}
+
+ std::vector<size_t> actual_reorder;
+
+ actual_reorder.resize(testcase.size());
+
+ std::iota(actual_reorder.begin(),
+ actual_reorder.end(), 0);
+
+ unicode::bidi_reorder
+ (dummy_input,
+ actual_levels,
+ [&]
+ (size_t s, size_t cnt)
+ {
+ auto *b=&actual_reorder[s];
+ auto *e=b+cnt;
+
+ while (b < e)
+ {
+ --e;
+ std::swap(*b, *e);
+ ++b;
+ }
+ });
+
+ auto b=actual_reorder.begin(), p=b,
+ e=actual_reorder.end();
+
+ auto q=actual_levels.begin();
+
+ while (b != e)
+ {
+ if (*q != UNICODE_BIDI_SKIP)
+ {
+ *p=*b;
+ ++p;
+ }
+ ++b;
+ ++q;
+ }
+ actual_reorder.erase(p, e);
+
+ if (actual_reorder != expected_reorder)
+ {
+ fclose(DEBUGDUMP);
+ DEBUGDUMP=stderr;
+ std::cout << std::endl
+ << std::flush;
+ unicode_bidi_calc(&dummy_input[0],
+ testcase.size(),
+ &actual_levels[0],
+ level);
+
+ std::cerr << "Regression, line "
+ << linenum;
+
+ if (!level)
+ {
+ std::cerr << ", auto";
+ }
+ else
+ {
+ std::cerr <<
+ (*level ? ", RTL"
+ : ", LTR");
+ }
+ std::cerr << ": expected reorder";
+
+ for (auto o:expected_reorder)
+ {
+ std::cerr << " " << o;
+ }
+ std::cerr << std::endl
+ << "Moved: ";
+ for (auto o:actual_reorder)
+ {
+ std::cerr << " " << o;
+ }
+ std::cerr << std::endl;
+ exit(1);
+ }
}
n >>= 1;
diff --git a/unicode/book.xml b/unicode/book.xml
index 41b8037..ee4b5e5 100644
--- a/unicode/book.xml
+++ b/unicode/book.xml
@@ -1,7 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
- "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
+ "https://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
+<!ENTITY tr9ver "42">
<!ENTITY tr14ver "45">
<!ENTITY tr24ver "31">
<!ENTITY tr29ver "37">
@@ -19,7 +20,7 @@ See COPYING for distribution information.
<para>
This library implements several algorithms related to the
- <ulink url="http://www.unicode.org/standard/standard.html">Unicode
+ <ulink url="https://www.unicode.org/standard/standard.html">Unicode
Standard</ulink>:
</para>
@@ -33,25 +34,32 @@ See COPYING for distribution information.
<listitem>
<para>
Implementation of
- <ulink url="http://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme
+ <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme
and work breaking</ulink> rules.
</para>
</listitem>
<listitem>
<para>
Implementation of
- <ulink url="http://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line
+ <ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line
breaking</ulink> rules.
</para>
</listitem>
<listitem>
<para>
+ Implementation of the
+ <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">bi-directional
+ algorithm</ulink>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
Several ancillary functions, like looking up
the unicode character that corresponds to some HTML 4.0
entity (such as <quote>&amp;amp;</quote>, for example), and
determining the normal width or a double-width status of a unicode
character. Also, an adaptation of the
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html">
<citerefentry><refentrytitle>iconv</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></ulink>
API for this unicode library.
@@ -60,14 +68,14 @@ See COPYING for distribution information.
<listitem>
<para>
Look up the
- <ulink url="http://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode
+ <ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode
script property</ulink>.
</para>
</listitem>
<listitem>
<para>
Look up the
- <ulink url="http://unicode.org/notes/tn36/">category</ulink>
+ <ulink url="https://unicode.org/notes/tn36/">category</ulink>
property.
</para>
</listitem>
@@ -82,7 +90,7 @@ See COPYING for distribution information.
<para>
The current release of the Courier Unicode library is based on the
- Unicode 8.0.0 standard.
+ Unicode 13.0.0 standard.
</para>
</section>
@@ -91,7 +99,7 @@ See COPYING for distribution information.
<para>
Download the current version of the library from
- <ulink url="/download.html#unicode">http://www.courier-mta.org/download.html#unicode</ulink>.
+ <ulink url="/download.html#unicode">https://www.courier-mta.org/download.html#unicode</ulink>.
After unpacking the tarball, run the configure script, which takes
the usual options, followed by <command>make</command>, then
<command>make install</command>.
@@ -154,7 +162,7 @@ See COPYING for distribution information.
<manvolnum>7</manvolnum></citerefentry></link>.
Refer to the included manual pages,
and
- <ulink url="http://www.courier-mta.org/unicode/manpages.html"> the HTML
+ <ulink url="https://www.courier-mta.org/unicode/manpages.html"> the HTML
version of the man pages</ulink> for more information.
</para>
</section>
@@ -166,7 +174,7 @@ See COPYING for distribution information.
<title>C manual pages</title>
<refentry id="courier-unicode">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>courier-unicode</refentrytitle>
<manvolnum>7</manvolnum>
@@ -187,12 +195,12 @@ See COPYING for distribution information.
<para>
This library implements several algorithms related to the
- <ulink url="http://www.unicode.org/standard/standard.html">Unicode
+ <ulink url="https://www.unicode.org/standard/standard.html">Unicode
Standard</ulink>.
This library uses
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
- <citerefentry><refentrytitle>iconv</refentrytitle>
- <manvolnum>3</manvolnum></citerefentry></ulink> to convert
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"
+ ><citerefentry><refentrytitle>iconv</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></ulink> to convert
text in a given character set to unicode. Any character set
displayed by <command>iconv --list</command> can be specified
for the corresponding character set parameter. Additionally,
@@ -229,6 +237,9 @@ See COPYING for distribution information.
<link linkend="unicode_html40ent_lookup">
<citerefentry><refentrytitle>unicode_html40ent_lookup</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
+ <link linkend="unicode_bidi">
+ <citerefentry><refentrytitle>unicode_bidi</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>,
<link linkend="unicode_category_lookup">
<citerefentry><refentrytitle>unicode_category_lookup</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
@@ -247,6 +258,9 @@ See COPYING for distribution information.
<link linkend="unicode_uc">
<citerefentry><refentrytitle>unicode_uc</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
+ <link linkend="unicode__bidi">
+ <citerefentry><refentrytitle>unicode::bidi</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>,
<link linkend="unicode__iconvert__convert">
<citerefentry><refentrytitle>unicode::iconvert::convert</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
@@ -272,8 +286,409 @@ See COPYING for distribution information.
</refsect1>
</refentry>
+ <refentry id="unicode_bidi">
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
+
+ <refmeta>
+ <refentrytitle>unicode_bidi</refentrytitle>
+ <manvolnum>3</manvolnum>
+ </refmeta>
+
+ <refnamediv>
+ <refname>unicode_bidi</refname>
+ <refname>unicode_bidi_calc</refname>
+ <refname>unicode_bidi_reorder</refname>
+ <refname>unicode_bidi_mirror</refname>
+ <refname>unicode_bidi_bracket_type</refname>
+
+ <refpurpose>unicode bidirectional algorithm</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+ <funcsynopsis>
+ <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
+ <funcsynopsisinfo>unicode_bidi_level_t lr=UNICODE_BIDI_LR</funcsynopsisinfo>
+ <funcprototype>
+ <funcdef>void <function>unicode_bidi_calc</function></funcdef>
+ <paramdef>const char32_t *<parameter>p</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>const unicode_bidi_level_t *<parameter>initial_embedding_level</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>void <function>unicode_bidi_reorder</function></funcdef>
+ <paramdef>char32_t *<parameter>string</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef>
+ <paramdef>size_t <parameter>n</parameter></paramdef>
+ <paramdef>void (*<parameter>reorder_callback</parameter>)(size_t, size_t, void *)</paramdef>
+ <paramdef>void *<parameter>arg</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>char32_t <function>unicode_bidi_mirror</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>char32_t <function>unicode_bidi_bracket_type</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ <paramdef>unicode_bracket_type_t *<parameter>ret</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+ </refsynopsisdiv>
+ <refsect1>
+ <title>DESCRIPTION</title>
+
+ <para>
+ <function>unicode_bidi_calc</function>() and
+ <function>unicode_bidi_reorder</function>() implement
+ the
+ <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-directional algorithm</ulink>.
+ </para>
+ <para>
+ The first two parameters to
+ <function>unicode_bidi_calc</function>() are a unicode string
+ and the number of characters in the Unicode string.
+ <parameter>levels</parameter> points to a buffer of
+ <classname>unicode_bidi_level_t</classname> values which the
+ caller is responsible for allocating and deallocating, and has
+ the same number of values as the number of characters in the
+ Unicode string.
+ </para>
+ <para>
+ <function>unicode_bidi_calc</function>() calculates the
+ embedding level of each character and fills in the
+ <parameter>levels</parameter> buffer (executes all steps of the
+ bidirectional algorithm up to step L1).
+ A <literal>NULL</literal> <parameter>initial_embedding_level</parameter>
+ value calculates the default paragraph embedding value.
+ A pointer to a <literal>UNICODE_BIDI_LR</literal> or
+ <literal>UNICODE_BIDI_RL</literal> value explicitly sets a
+ left-to-right or right-to-left paragraph embedding value.
+ </para>
+
+ <para>
+ <function>unicode_bidi_calc</function>() calculates each
+ character's embedding value; an even value for left-to-right text
+ or an odd value for right-to-left text. A
+ <classname>UNICODE_BIDI_SKIP</classname> embedding level value
+ specifies a character whose embedding value is unspecified.
+ This is used for embedding and override markers, which can be
+ removed from the string, together with their corresponding
+ <classname>UNICODE_BIDI_SKIP</classname> embedding values. This can be
+ done before or after <function>unicode_bidi_reorder</function>.
+ </para>
+
+ <refsect2>
+ <title>Reordering text</title>
+
+ <para>
+ <function>unicode_bidi_reorder</function> takes the actual
+ unicode string together with the embedding values from
+ <function>unicode_bidi_calc</function>, then reverses the
+ bidirectional string, as specified by step L2 of the bidirectional
+ algorithm.
+ </para>
+
+ <para>
+ A non-<literal>NULL</literal>
+ <parameter>reorder_callback</parameter> gets invoked to report
+ each reversed character range. The callback's first parameter
+ is the index of the first reversed character, the second parameter
+ is the number of reversed characters.
+ The third parameter is the <parameter>arg</parameter> passthrough
+ parameter.
+ </para>
+
+ <para>
+ <parameter>reorder_callback</parameter> gets invoked after
+ reversing each consecutive range of values in the
+ <parameter>string</parameter> and <parameter>levels</parameter>
+ buffers. For example: <quote>reorder_callback(5, 2, arg)</quote>
+ reports that character indexes #5 and #6 got reversed in the
+ string.
+ </para>
+
+ <para>
+ Specifying a NULL <parameter>string</parameter> leaves the
+ <parameter>levels</parameter> buffer unchanged, but still
+ invokes the <parameter>reorder_callback</parameter> as if
+ the character string, and their values, were reversed.
+ </para>
+ </refsect2>
+ <refsect2>
+ <title>Miscellaneous utility functions</title>
+
+ <para>
+ <function>unicode_bidi_mirror</function>
+ returns the glyph that's a mirror image of the parameter
+ (i.e. an open parenthesis for a close parenthesis, and vice
+ versa); or the same value if there is no mirror image.
+ </para>
+
+ <para>
+ <function>unicode_bidi_bracket_type</function>
+ looks up each bracket character and returns its opposite, or
+ the same value if the character is not a bracket that has an
+ opposing bracket character.
+ A non-NULL <parameter>ret</parameter> gets initialized to
+ either <literal>UNICODE_BIDI_o</literal>,
+ <literal>UNICODE_BIDI_c</literal> or
+ <literal>UNICODE_BIDI_n</literal>.
+ </para>
+ </refsect2>
+ </refsect1>
+ <refsect1>
+ <title>SEE ALSO</title>
+ <para>
+ <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">TR-9</ulink>,
+ <link linkend="unicode__bidi">
+ <citerefentry><refentrytitle>unicode::bidi</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>,
+ <link linkend="courier-unicode">
+ <citerefentry>
+ <refentrytitle>courier-unicode</refentrytitle>
+ <manvolnum>7</manvolnum></citerefentry></link>.
+ </para>
+ </refsect1>
+ </refentry>
+
+ <refentry id="unicode_category_lookup">
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
+
+ <refmeta>
+ <refentrytitle>unicode_category_lookup</refentrytitle>
+ <manvolnum>3</manvolnum>
+ </refmeta>
+
+ <refnamediv>
+ <refname>unicode_category_lookup</refname>
+ <refname>unicode_isalnum</refname>
+ <refname>unicode_isalpha</refname>
+ <refname>unicode_isblank</refname>
+ <refname>unicode_isdigit</refname>
+ <refname>unicode_isgraph</refname>
+ <refname>unicode_islower</refname>
+ <refname>unicode_ispunct</refname>
+ <refname>unicode_isspace</refname>
+ <refname>unicode_isupper</refname>
+
+ <refpurpose>unicode character categorization</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+ <funcsynopsis>
+ <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
+ <funcprototype>
+ <funcdef>uint32_t <function>unicode_category_lookup</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isalnum</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isalpha</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isblank</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isdigit</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isgraph</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_islower</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_ispunct</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isspace</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+
+ <funcprototype>
+ <funcdef>int <function>unicode_isupper</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+ </refsynopsisdiv>
+ <refsect1>
+ <title>DESCRIPTION</title>
+
+ <para>
+ <function>unicode_category_lookup</function>() looks up the
+ <ulink url="https://unicode.org/notes/tn36/">unicode character's
+ categorization</ulink>.
+ <function>unicode_category_lookup</function>() returns a 32 bit
+ value.
+ The value's
+ <symbol>UNICODE_CATEGORY_1</symbol> bits specify the first level
+ of the unicode character's category, with
+ <symbol>UNICODE_CATEGORY_2</symbol>,
+ <symbol>UNICODE_CATEGORY_3</symbol>, and
+ <symbol>UNICODE_CATEGORY_4</symbol> bits specifying the 2nd,
+ 3rd, and 4th level, if given. A value of 0 for each corresponding
+ bit set indicates that no category is specified for this level,
+ for this character; otherwise the possible values are defined
+ in <filename>&lt;courier-unicode.h&gt;</filename>.
+ </para>
+
+ <para>
+ The remaining functions implement comparable equivalents of
+ their non-unicode versions in the standard C library, as follows:
+ </para>
+
+ <variablelist>
+ <varlistentry>
+ <term><function>unicode_isalnum</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all
+ <function>unicode_isalpha</function>() or
+ <function>unicode_isdigit</function>().
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_isalpha</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all
+ <symbol>UNICODE_CATEGORY_1_LETTER</symbol>.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_isblank</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for
+ <symbol>TAB</symbol>, and all
+ <symbol>UNICODE_CATEGORY_2_SPACE</symbol>.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_isdigit</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all
+ <symbol>UNICODE_CATEGORY_1_NUMBER</symbol>
+ | <symbol>UNICODE_CATEGORY_2_DIGIT</symbol>,
+ only (no third categories).
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_isgraph</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all codepoints above
+ <symbol>SPACE</symbol> which are not
+ <function>unicode_isspace</function>().
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_islower</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all
+ <function>unicode_isalpha</function>() for which the
+ character is
+ equal to
+ <link linkend="unicode_uc">
+ <citerefentry><refentrytitle>unicode_lc</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>
+ of itself.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_ispunct</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all
+ <symbol>UNICODE_CATEGORY_1_PUNCTUATION</symbol>.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_isspace</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for <function>unicode_isblank</function>() or
+ for unicode characters
+ with linebreaking properties of
+ <symbol>BK</symbol>,
+ <symbol>CR</symbol>,
+ <symbol>LF</symbol>,
+ <symbol>NL</symbol>,
+ and
+ <symbol>SP</symbol>.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><function>unicode_isupper</function>()</term>
+ <listitem>
+ <para>
+ Returns non-0 for all
+ <function>unicode_isalpha</function>() for which the
+ character is
+ equal to
+ <link linkend="unicode_uc">
+ <citerefentry><refentrytitle>unicode_uc</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>
+ of itself.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </refsect1>
+ <refsect1>
+ <title>SEE ALSO</title>
+ <para>
+ <link linkend="courier-unicode">
+ <citerefentry>
+ <refentrytitle>courier-unicode</refentrytitle>
+ <manvolnum>7</manvolnum></citerefentry></link>,
+ <link linkend="unicode_uc">
+ <citerefentry><refentrytitle>unicode_convert_tocase</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>.
+ </para>
+ </refsect1>
+ </refentry>
+
<refentry id="unicode_convert">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_convert</refentrytitle>
@@ -444,7 +859,7 @@ See COPYING for distribution information.
<function>unicode_convert_init</function>(),
<function>unicode_convert</function>(), and
<function>unicode_convert_deinit</function>() are an adaption of th
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html">
<citerefentry><refentrytitle>iconv</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></ulink> API that uses the same
calling convention as the other algorithms in this unicode library,
@@ -668,7 +1083,7 @@ See COPYING for distribution information.
</refentry>
<refentry id="unicode_default_chset">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_default_chset</refentrytitle>
@@ -721,7 +1136,7 @@ See COPYING for distribution information.
</refentry>
<refentry id="unicode_html40ent_lookup">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_html40ent_lookup</refentrytitle>
@@ -780,251 +1195,18 @@ See COPYING for distribution information.
</refsect1>
</refentry>
- <refentry id="unicode_category_lookup">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
-
- <refmeta>
- <refentrytitle>unicode_category_lookup</refentrytitle>
- <manvolnum>3</manvolnum>
- </refmeta>
-
- <refnamediv>
- <refname>unicode_category_lookup</refname>
- <refname>unicode_isalnum</refname>
- <refname>unicode_isalpha</refname>
- <refname>unicode_isblank</refname>
- <refname>unicode_isdigit</refname>
- <refname>unicode_isgraph</refname>
- <refname>unicode_islower</refname>
- <refname>unicode_ispunct</refname>
- <refname>unicode_isspace</refname>
- <refname>unicode_isupper</refname>
-
- <refpurpose>unicode character categorization</refpurpose>
- </refnamediv>
-
- <refsynopsisdiv>
- <funcsynopsis>
- <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
- <funcprototype>
- <funcdef>uint32_t <function>unicode_category_lookup</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isalnum</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isalpha</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isblank</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isdigit</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isgraph</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_islower</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_ispunct</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isspace</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
-
- <funcprototype>
- <funcdef>int <function>unicode_isupper</function></funcdef>
- <paramdef>char32_t <parameter>c</parameter></paramdef>
- </funcprototype>
- </funcsynopsis>
- </refsynopsisdiv>
- <refsect1>
- <title>DESCRIPTION</title>
-
- <para>
- <function>unicode_category_lookup</function>() looks up the
- <ulink url="http://unicode.org/notes/tn36/">unicode character's
- categorization</ulink>.
- <function>unicode_category_lookup</function>() returns a 32 bit
- value.
- The value's
- <symbol>UNICODE_CATEGORY_1</symbol> bits specify the first level
- of the unicode character's category, with
- <symbol>UNICODE_CATEGORY_2</symbol>,
- <symbol>UNICODE_CATEGORY_3</symbol>, and
- <symbol>UNICODE_CATEGORY_4</symbol> bits specifying the 2nd,
- 3rd, and 4th level, if given. A value of 0 for each corresponding
- bit set indicates that no category is specified for this level,
- for this character; otherwise the possible values are defined
- in <filename>&lt;courier-unicode.h&gt;</filename>.
- </para>
-
- <para>
- The remaining functions implement comparable equivalents of
- their non-unicode versions in the standard C library, as follows:
- </para>
-
- <variablelist>
- <varlistentry>
- <term><function>unicode_isalnum</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all
- <function>unicode_isalpha</function>() or
- <function>unicode_isdigit</function>().
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_isalpha</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all
- <symbol>UNICODE_CATEGORY_1_LETTER</symbol>.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_isblank</function>()</term>
- <listitem>
- <para>
- Return non-0 for
- <symbol>TAB</symbol>, and all
- <symbol>UNICODE_CATEGORY_2_SPACE</symbol>.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_isdigit</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all
- <symbol>UNICODE_CATEGORY_1_NUMBER</symbol>
- | <symbol>UNICODE_CATEGORY_2_DIGIT</symbol>,
- only (no third categories).
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_isgraph</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all codepoints above
- <symbol>SPACE</symbol> which are not
- <function>unicode_isspace</function>().
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_islower</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all
- <function>unicode_isalpha</function>() for which the
- character is
- equal to
- <link linkend="unicode_uc">
- <citerefentry><refentrytitle>unicode_lc</refentrytitle>
- <manvolnum>3</manvolnum></citerefentry></link>
- of itself.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_ispunct</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all
- <symbol>UNICODE_CATEGORY_1_PUNCTUATION</symbol>.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_isspace</function>()</term>
- <listitem>
- <para>
- Returns non-0 for unicode_isblank() or
- for unicode characters
- with linebreaking properties of
- <symbol>BK</symbol>,
- <symbol>CR</symbol>,
- <symbol>LF</symbol>,
- <symbol>NL</symbol>,
- and
- <symbol>SP</symbol>.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term><function>unicode_isupper</function>()</term>
- <listitem>
- <para>
- Returns non-0 for all
- <function>unicode_isalpha</function>() for which the
- character is
- equal to
- <link linkend="unicode_uc">
- <citerefentry><refentrytitle>unicode_uc</refentrytitle>
- <manvolnum>3</manvolnum></citerefentry></link>
- of itself.
- </para>
- </listitem>
- </varlistentry>
- </variablelist>
- </refsect1>
- <refsect1>
- <title>SEE ALSO</title>
- <para>
- <link linkend="courier-unicode">
- <citerefentry>
- <refentrytitle>courier-unicode</refentrytitle>
- <manvolnum>7</manvolnum></citerefentry></link>,
- <link linkend="unicode_uc">
- <citerefentry><refentrytitle>unicode_convert_tocase</refentrytitle>
- <manvolnum>3</manvolnum></citerefentry></link>.
- </para>
- </refsect1>
- </refentry>
-
<refentry id="unicode_grapheme_break">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_grapheme_break</refentrytitle>
- <refentrytitle>unicode_grapheme_break_init</refentrytitle>
- <refentrytitle>unicode_grapheme_break_next</refentrytitle>
- <refentrytitle>unicode_grapheme_break_deinit</refentrytitle>
<manvolnum>3</manvolnum>
</refmeta>
<refnamediv>
<refname>unicode_grapheme_break</refname>
+ <refname>unicode_grapheme_break_init</refname>
+ <refname>unicode_grapheme_break_next</refname>
+ <refname>unicode_grapheme_break_deinit</refname>
<refpurpose>unicode grapheme cluster boundary rules</refpurpose>
</refnamediv>
@@ -1059,22 +1241,23 @@ See COPYING for distribution information.
<title>DESCRIPTION</title>
<para>
+ These functions implement the unicode grapheme cluster breaking
+ algorithm. Invoke
+ <function>unicode_grapheme_break_init</function>() to initialize
+ the grapheme cluster breaking algorithm.
<function>unicode_grapheme_break_init</function>() returns an
- opaque handle for an object that computes grapheme breaks.
- Each call to <function>unicode_grapheme_break_next</function>()
- passes one character of a unicode string, and returns a non-0
- value if there's a grapheme break before this character, in the
+ opaque handle. Each subsequent call to
+ <function>unicode_grapheme_break_next</function>() passes this
+ handle, and the next character.
+ <function>unicode_grapheme_break_next</function>() returns a non-0
+ value if there's a grapheme break before the character, in a
sequence of Unicode characters.
<function>unicode_grapheme_break_deinit</function>() releases
- all reosurces used by the grapheme breaking handle.
+ all resources used by the grapheme breaking handle, and the
+ <classname>unicode_grapheme_break_info_t</classname> handle
+ is no longer valid after this call.
</para>
<para>
- Call
- <function>unicode_grapheme_break_init</function>(), then call
- <function>unicode_grapheme_break_next</function>() for each
- character,
- then call
- <function>unicode_grapheme_break_deinit</function>().
The first call to <function>unicode_grapheme_break_next</function>()
always returns non-0, as per the GB1 rule.
</para>
@@ -1085,10 +1268,11 @@ See COPYING for distribution information.
<parameter>a</parameter> and
<parameter>b</parameter>.
This is is equivalent to calling
- <function>> unicode_grapheme_break_init</function>(),
+ <function>unicode_grapheme_break_init</function>(),
followed by two calls to
<function> unicode_grapheme_break_next</function>(), and finally
- <function>unicode_grapheme_break_deinit</function>(), and returns
+ <function>unicode_grapheme_break_deinit</function>(), then
+ returning
the result of the second
call to <function>unicode_grapheme_break_next</function>().
</para>
@@ -1098,7 +1282,7 @@ See COPYING for distribution information.
<title>SEE ALSO</title>
<para>
- <ulink url="http://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>,
+ <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>,
<link linkend="courier-unicode">
<citerefentry>
<refentrytitle>courier-unicode</refentrytitle>
@@ -1116,60 +1300,15 @@ See COPYING for distribution information.
</refsect1>
</refentry>
- <refentry id="unicode_script">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
- <refmeta>
- <refentrytitle>unicode_script</refentrytitle>
- <manvolnum>3</manvolnum>
- </refmeta>
-
- <refnamediv>
- <refname>unicode_script</refname>
- <refpurpose>unicode script property</refpurpose>
- </refnamediv>
-
- <refsynopsisdiv>
- <funcsynopsis>
- <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
- <funcprototype>
- <funcdef>unicode_script_t <function>unicode_script</function></funcdef>
- <paramdef>char32_t <parameter>ch</parameter></paramdef>
- </funcprototype>
- </funcsynopsis>
- </refsynopsisdiv>
- <refsect1>
- <title>DESCRIPTION</title>
- <para>
- <function>unicode_script</function>() looks up the
- <quote>script</quote> property of the specified unicode character,
- and returns it. The <classname>unicode_script_t</classname>
- enumeration encodes possible unicode script values.
- <literal>unicode_script_unknown</literal> gets returned for a
- unicode character with an unknown script property.
- </para>
- </refsect1>
-
- <refsect1>
- <title>SEE ALSO</title>
-
- <para>
- <ulink url="http://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">TR-24</ulink>,
- <link linkend="courier-unicode">
- <citerefentry>
- <refentrytitle>courier-unicode</refentrytitle>
- <manvolnum>7</manvolnum></citerefentry></link>.
- </para>
- </refsect1>
- </refentry>
-
<refentry id="unicode_line_break">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_line_break</refentrytitle>
<manvolnum>3</manvolnum>
</refmeta>
<refnamediv>
+ <refname>unicode_line_break</refname>
<refname>unicode_lb_init</refname>
<refname>unicode_lb_set_opts</refname>
<refname>unicode_lb_next</refname>
@@ -1483,13 +1622,59 @@ See COPYING for distribution information.
<link linkend="unicode__linebreak">
<citerefentry><refentrytitle>unicode::linebreak</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
- <ulink url="http://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">TR-14</ulink>
+ <ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">TR-14</ulink>
+ </para>
+ </refsect1>
+ </refentry>
+
+ <refentry id="unicode_script">
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
+ <refmeta>
+ <refentrytitle>unicode_script</refentrytitle>
+ <manvolnum>3</manvolnum>
+ </refmeta>
+
+ <refnamediv>
+ <refname>unicode_script</refname>
+ <refpurpose>unicode script property</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+ <funcsynopsis>
+ <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
+ <funcprototype>
+ <funcdef>unicode_script_t <function>unicode_script</function></funcdef>
+ <paramdef>char32_t <parameter>ch</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+ </refsynopsisdiv>
+ <refsect1>
+ <title>DESCRIPTION</title>
+ <para>
+ <function>unicode_script</function>() looks up the
+ <quote>script</quote> property of the specified unicode character,
+ and returns it. The <classname>unicode_script_t</classname>
+ enumeration encodes possible unicode script values.
+ <literal>unicode_script_unknown</literal> gets returned for a
+ unicode character with an unknown script property.
+ </para>
+ </refsect1>
+
+ <refsect1>
+ <title>SEE ALSO</title>
+
+ <para>
+ <ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">TR-24</ulink>,
+ <link linkend="courier-unicode">
+ <citerefentry>
+ <refentrytitle>courier-unicode</refentrytitle>
+ <manvolnum>7</manvolnum></citerefentry></link>.
</para>
</refsect1>
</refentry>
<refentry id="unicode_word_break">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_word_break</refentrytitle>
<manvolnum>3</manvolnum>
@@ -1682,7 +1867,7 @@ See COPYING for distribution information.
<refsect1>
<title>SEE ALSO</title>
<para>
- <ulink url="http://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>,
+ <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>,
<link linkend="courier-unicode">
<citerefentry>
<refentrytitle>courier-unicode</refentrytitle>
@@ -1704,7 +1889,7 @@ See COPYING for distribution information.
</refentry>
<refentry id="unicode_uc">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode_uc</refentrytitle>
<manvolnum>3</manvolnum>
@@ -1816,8 +2001,109 @@ See COPYING for distribution information.
<section id="manpagescpp">
<title>C++ manual pages</title>
+ <refentry id="unicode__bidi">
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
+
+ <refmeta>
+ <refentrytitle>unicode::bidi_calc</refentrytitle>
+ <manvolnum>3</manvolnum>
+ </refmeta>
+
+ <refnamediv>
+ <refname>unicode::bidi_calc</refname>
+ <refname>unicode::bidi_reorder</refname>
+ <refpurpose>unicode bidirectional algorithm</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+ <funcsynopsis>
+ <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
+ <funcprototype>
+ <funcdef>std::vector&lt;unicode_bidi_level_t&gt; <function>unicode::bidi_calc</function></funcdef>
+ <paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>std::vector&lt;unicode_bidi_level_t&gt; <function>unicode::bidi_calc</function></funcdef>
+ <paramdef>const std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>int <function>unicode::bidi_reorder</function></funcdef>
+ <paramdef>std::u32string &amp;<parameter>string</parameter></paramdef>
+ <paramdef>std::vector&lt;unicode_bidi_level_t&gt; &amp;<parameter>embedding_level</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t, size_t)&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>void <function>unicode::bidi_reorder</function></funcdef>
+ <paramdef>std::vector&lt;unicode_bidi_level_t&gt; &amp;<parameter>embedding_level</parameter></paramdef>
+ <paramdef>const std::function&lt;void (size_t, size_t)&gt; &amp;<parameter>reorder_callback</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+ </refsynopsisdiv>
+
+ <refsect1>
+ <title>DESCRIPTION</title>
+
+ <para>
+ These functions implement the C++ interface for the
+ <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">Unicode Bi-directional algorithm</ulink>.
+ See the description of the underlying
+ <link linkend="unicode_bidi">
+ <citerefentry><refentrytitle>unicode_bidi</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link> C library
+ API for more information.
+ </para>
+
+ <para>
+ <function>unicode::bidi_calc</function> computes and returns a vector
+ of bidirectional embedding level values for the given Unicode string.
+ An overload takes an additional parameter that overrides the
+ paragraph embedding level, a <literal>UNICODE_BIDI_LR</literal> or
+ a <literal>UNICODE_BIDI_RL</literal> value.
+ </para>
+ <para>
+ <function>unicode::bidi_reorder</function> reverses the characters
+ in the Unicode string, according to their embedding levels (and
+ reverses the corresponding embedding level values too).
+ As with the C API, an optional parameter is a callable object
+ that gets invoked to report each range of characters that gets
+ reversed (specified as the starting position and a number of
+ characters).
+ </para>
+ <para>
+ An overloaded <function>unicode::bidi_reorder</function> without
+ the string parameter goes through the motions, according to the
+ embedding level vector parameter, without actually reversing
+ the values in the vector, but still invoking the callable object
+ normally.
+ </para>
+ <para>
+ This is comparable to the C API. Also comparable with the C API:
+ the convention that even embedding levels specify left to right
+ text and odd embedding values specify right to left text.
+ An embedding value of <literal>UNICODE_BIDI_SKIP</literal>
+ indicates an embedding or an override marker that has no
+ specified embedding value. These markers may be removed from the
+ Unicode string (together with the
+ <literal>UNICODE_BIDI_SKIP</literal>
+ values from the embedding values vector) either before or after
+ they get reordered.
+ </para>
+ </refsect1>
+ </refentry>
+
+
<refentry id="unicode__iconvert__convert">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::iconvert::convert</refentrytitle>
@@ -1951,7 +2237,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
<link linkend="unicode_convert">
<citerefentry><refentrytitle>unicode_convert</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html">
<citerefentry><refentrytitle>iconv</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></ulink>.
@@ -1960,7 +2246,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</refentry>
<refentry id="unicode__iconvert__convert_tocase">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::iconvert::convert_tocase</refentrytitle>
@@ -2041,7 +2327,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
<link linkend="unicode_convert">
<citerefentry><refentrytitle>unicode_convert</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html">
<citerefentry><refentrytitle>iconv</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></ulink>.
@@ -2050,7 +2336,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</refentry>
<refentry id="unicode__iconvert__fromu">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::iconvert::fromu</refentrytitle>
@@ -2138,7 +2424,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
<link linkend="unicode_convert">
<citerefentry><refentrytitle>unicode_convert</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html">
<citerefentry><refentrytitle>iconv</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></ulink>.
@@ -2147,7 +2433,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</refentry>
<refentry id="unicode__iconvert__tou">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::iconvert::tou</refentrytitle>
@@ -2237,7 +2523,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
<link linkend="unicode_convert">
<citerefentry><refentrytitle>unicode_convert</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
- <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html">
+ <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html">
<citerefentry><refentrytitle>iconv</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></ulink>.
@@ -2246,7 +2532,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>
</refentry>
<refentry id="unicode__linebreak">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::linebreak</refentrytitle>
@@ -2447,7 +2733,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator&lt;std::vector&lt;int&gt
</refentry>
<refentry id="unicode__tolower">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::tolower</refentrytitle>
@@ -2542,19 +2828,8 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator&lt;std::vector&lt;int&gt
</refsect1>
</refentry>
-
-
-
-
-
-
-
-
-
-
-
<refentry id="unicode__wordbreak">
- <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info>
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
<refmeta>
<refentrytitle>unicode::wordbreak</refentrytitle>
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index 67f3bda..b8c88f4 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -2,7 +2,7 @@
#define courier_unicode_h
/*
-** Copyright 2000-2018 Double Precision, Inc.
+** Copyright 2000-2020 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
@@ -12,6 +12,7 @@
#include <string>
#include <vector>
#include <list>
+#include <functional>
extern "C" {
#endif
@@ -40,7 +41,7 @@ typedef uint32_t char32_t;
#endif
#endif
-#define COURIER_UNICODE_VERSION 210
+#define COURIER_UNICODE_VERSION 220
/*
** The system default character set, from the locale.
@@ -605,6 +606,13 @@ extern void unicode_bidi_calc(const char32_t *p, size_t n,
const unicode_bidi_level_t *
initial_embedding_level);
+extern void unicode_bidi_reorder(char32_t *p,
+ unicode_bidi_level_t *levels,
+ size_t n,
+ void (*reorder_callback)(size_t, size_t,
+ void *),
+ void *arg);
+
/*
** A buffer that holds unicode characters, and dynamically grows as needed.
*/
@@ -2025,6 +2033,24 @@ std::u32string tolower(const std::u32string &u);
std::u32string toupper(const std::u32string &u);
+//! Calculate bidirectional embedding levels
+std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s);
+
+//! Calculate bidirectional embedding levels
+std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s,
+ unicode_bidi_level_t level);
+
+//! Reorder bidirectional text
+int bidi_reorder(std::u32string &string,
+ std::vector<unicode_bidi_level_t> &levels,
+ const std::function<void (size_t, size_t)> &reorder_callback=
+ [](size_t, size_t){});
+
+//! Reorder bidirectional text
+void bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
+ const std::function<void (size_t, size_t)> &reorder_callback=
+ [](size_t, size_t){});
+
#if 0
{
#endif
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c
index 38dcb44..9e7fcf4 100644
--- a/unicode/unicode_bidi.c
+++ b/unicode/unicode_bidi.c
@@ -1,5 +1,5 @@
/*
-** Copyright 2011-2020 Double Precision, Inc.
+** Copyright 2020 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
@@ -148,14 +148,56 @@ struct level_run {
size_t end; /* one past */
};
+/* A growing list of level runs */
+
+struct level_runs {
+ struct level_run *runs; /* All level runs in the sequence */
+ size_t n_level_runs; /* How many of them */
+ size_t cap_level_runs; /* Capacity of the level runs */
+};
+
+static void level_runs_init(struct level_runs *p)
+{
+ p->runs=0;
+ p->n_level_runs=0;
+ p->cap_level_runs=0;
+}
+
+static void level_runs_deinit(struct level_runs *p)
+{
+ if (p->runs)
+ free(p->runs);
+}
+
+static struct level_run *level_runs_add(struct level_runs *p)
+{
+ if (p->n_level_runs == p->cap_level_runs)
+ {
+ p->cap_level_runs *= 2;
+
+ if (p->cap_level_runs == 0)
+ p->cap_level_runs=1;
+
+ p->runs=(struct level_run *)
+ (p->runs ?
+ realloc(p->runs,
+ sizeof(struct level_run) *
+ p->cap_level_runs)
+ :malloc(sizeof(struct level_run) *
+ p->cap_level_runs));
+ if (!p->runs)
+ abort();
+ }
+
+ return p->runs + (p->n_level_runs++);
+}
+
/* An isolating run sequence */
struct isolating_run_sequence_s {
struct isolating_run_sequence_s *prev, *next; /* Linked list */
- struct level_run *level_runs; /* All level runs in the sequence */
- size_t n_level_runs; /* How many of them */
- size_t cap_level_runs; /* Capacity of the level runs */
+ struct level_runs runs;
unicode_bidi_level_t embedding_level; /* This seq's embedding level */
enum_bidi_class_t sos, eos;
};
@@ -185,11 +227,11 @@ static irs_iterator irs_begin(struct isolating_run_sequence_s *seq)
/* Edge case, empty isolating run sequence */
- while (iter.level_run_i < seq->n_level_runs)
+ while (iter.level_run_i < seq->runs.n_level_runs)
{
- iter.i=seq->level_runs[iter.level_run_i].start;
+ iter.i=seq->runs.runs[iter.level_run_i].start;
- if (iter.i < seq->level_runs[iter.level_run_i].end)
+ if (iter.i < seq->runs.runs[iter.level_run_i].end)
break;
++iter.level_run_i;
@@ -202,7 +244,7 @@ static irs_iterator irs_end(struct isolating_run_sequence_s *seq)
irs_iterator iter;
iter.seq=seq;
- iter.level_run_i=seq->n_level_runs;
+ iter.level_run_i=seq->runs.n_level_runs;
return iter;
}
@@ -214,7 +256,7 @@ static int irs_compare(const irs_iterator *a,
if (a->level_run_i > b->level_run_i)
return 1;
- if (a->level_run_i == a->seq->n_level_runs)
+ if (a->level_run_i == a->seq->runs.n_level_runs)
return 0;
if (a->i < b->i)
@@ -227,7 +269,7 @@ static int irs_compare(const irs_iterator *a,
static void irs_incr(irs_iterator *iter)
{
- if (iter->seq->n_level_runs == iter->level_run_i)
+ if (iter->seq->runs.n_level_runs == iter->level_run_i)
{
fprintf(stderr, "%s%s\n",
"Internal error: attempting to increment ",
@@ -235,10 +277,10 @@ static void irs_incr(irs_iterator *iter)
abort();
}
- if (++iter->i >= iter->seq->level_runs[iter->level_run_i].end)
+ if (++iter->i >= iter->seq->runs.runs[iter->level_run_i].end)
{
- if (++iter->level_run_i < iter->seq->n_level_runs)
- iter->i=iter->seq->level_runs[iter->level_run_i].start;
+ if (++iter->level_run_i < iter->seq->runs.n_level_runs)
+ iter->i=iter->seq->runs.runs[iter->level_run_i].start;
}
}
@@ -246,8 +288,8 @@ static void irs_decr(irs_iterator *iter)
{
while (1)
{
- if (iter->seq->n_level_runs > iter->level_run_i &&
- iter->i > iter->seq->level_runs[iter->level_run_i].start)
+ if (iter->seq->runs.n_level_runs > iter->level_run_i &&
+ iter->i > iter->seq->runs.runs[iter->level_run_i].start)
{
--iter->i;
break;
@@ -261,7 +303,7 @@ static void irs_decr(irs_iterator *iter)
abort();
}
- iter->i=iter->seq->level_runs[--iter->level_run_i].end;
+ iter->i=iter->seq->runs.runs[--iter->level_run_i].end;
}
}
@@ -328,13 +370,12 @@ isolating_run_sequences_init(struct isolating_run_sequences_s *p,
if (!seq) abort();
- if ((seq->level_runs=(struct level_run *)
- malloc(sizeof(struct level_run))) == 0) abort();
+ level_runs_init(&seq->runs);
- seq->level_runs->start=i;
- seq->level_runs->end=i;
+ struct level_run *run=level_runs_add(&seq->runs);
- seq->n_level_runs=seq->cap_level_runs=1;
+ run->start=i;
+ run->end=i;
seq->embedding_level=embedding_level;
if (!p->head)
@@ -355,7 +396,7 @@ static void isolating_run_sequences_record(struct isolating_run_sequence_s *p,
size_t i)
{
struct level_run *current_level_run=
- &p->level_runs[p->n_level_runs-1];
+ &p->runs.runs[p->runs.n_level_runs-1];
if (current_level_run->start == current_level_run->end)
{
@@ -375,19 +416,7 @@ static void isolating_run_sequences_record(struct isolating_run_sequence_s *p,
** run sequence.
*/
- if (p->n_level_runs == p->cap_level_runs)
- {
- p->cap_level_runs *= 2;
-
- p->level_runs=(struct level_run *)
- realloc(p->level_runs,
- sizeof(struct level_run) *
- p->cap_level_runs);
- if (!p->level_runs)
- abort();
- }
-
- current_level_run = p->level_runs + (p->n_level_runs++);
+ current_level_run=level_runs_add(&p->runs);
current_level_run->start=i;
current_level_run->end=i+1;
@@ -430,7 +459,7 @@ static void isolating_run_sequences_deinit(struct isolating_run_sequences_s *p)
seq=seq->next;
- free(p->level_runs);
+ level_runs_deinit(&p->runs);
free(p);
}
@@ -706,12 +735,12 @@ void dump_sequence_info(directional_status_stack_t stack,
(seq->sos == UNICODE_BIDI_CLASS_L ? 'L':'R'),
(seq->eos == UNICODE_BIDI_CLASS_L ? 'L':'R'));
- for (size_t i=0; i<seq->n_level_runs; ++i)
+ for (size_t i=0; i<seq->runs.n_level_runs; ++i)
{
fprintf(DEBUGDUMP, "%s[%lu-%lu]",
i == 0 ? " ":", ",
- (unsigned long)seq->level_runs[i].start,
- (unsigned long)seq->level_runs[i].end-1);
+ (unsigned long)seq->runs.runs[i].start,
+ (unsigned long)seq->runs.runs[i].end-1);
}
fprintf(DEBUGDUMP, "\n");
}
@@ -1706,3 +1735,127 @@ static void unicode_bidi_n(directional_status_stack_t stack,
dump_sequence("Contents after I", stack, seq);
#endif
}
+/* Growable array of level_runs, with one entry per embedding level. */
+struct level_run_layers {
+ struct level_runs *lruns; /* lruns[j]: runs at embedding level j, or higher */
+ size_t n_lruns; /* How many entries of lruns are initialized */
+ size_t cap_lruns; /* How many entries lruns has room for */
+};
+/* Start out empty: no array allocated, zero entries, zero capacity. */
+static void level_run_layers_init(struct level_run_layers *p)
+{
+ p->lruns=0;
+ p->n_lruns=0;
+ p->cap_lruns=0;
+}
+/* Deinitialize every entry that was added, then free the array itself. */
+static void level_run_layers_deinit(struct level_run_layers *p)
+{
+ if (p->lruns) /* Still null if nothing was ever added */
+ {
+ for (size_t i=0; i<p->n_lruns; ++i)
+ level_runs_deinit(&p->lruns[i]);
+ free(p->lruns);
+ }
+}
+/* Append one empty entry, growing the array if it is full; abort()s if out of memory. */
+static void level_run_layers_add(struct level_run_layers *p)
+{
+ if (p->n_lruns == p->cap_lruns)
+ {
+ p->cap_lruns *= 2;
+ /* Doubling an initial capacity of zero still gives zero: start at 1 */
+ if (p->cap_lruns == 0)
+ p->cap_lruns=1;
+ /* realloc() an existing array, malloc() the very first one */
+ p->lruns=(struct level_runs *)
+ (p->lruns ?
+ realloc(p->lruns,
+ sizeof(struct level_runs) *
+ p->cap_lruns)
+ :malloc(sizeof(struct level_runs) *
+ p->cap_lruns));
+ if (!p->lruns)
+ abort();
+ }
+ /* Claim the next free slot and initialize it with level_runs_init() */
+ level_runs_init(p->lruns + (p->n_lruns++));
+}
+/* Reverse, in place, each run of characters at every embedding level >= 1. */
+void unicode_bidi_reorder(char32_t *p, /* Characters to reorder; may be null */
+ unicode_bidi_level_t *levels, /* Level of each character; swapped together with p */
+ size_t n, /* Number of entries in levels (and in p, if given) */
+ void (*reorder_callback)(size_t, size_t, void *), /* Optional; gets (start, length, arg) of each reversed run */
+ void *arg) /* Passed through to reorder_callback */
+{
+ /* L2 */
+
+ struct level_run_layers layers;
+ unicode_bidi_level_t previous_level=0;
+
+ level_run_layers_init(&layers);
+ /* Pass 1: for each level, collect runs of characters at that level or higher */
+ for (size_t i=0; i<n; ++i)
+ {
+ if (levels[i] != UNICODE_BIDI_SKIP)
+ previous_level=levels[i];
+ /* A skipped entry reuses the nearest preceding non-skipped level (0 if none) */
+ while (layers.n_lruns <= previous_level)
+ level_run_layers_add(&layers);
+
+ /* We intentionally don't put anything in level 0 */
+ for (size_t j=1; j<=previous_level; ++j)
+ {
+ struct level_runs *runs=layers.lruns+j;
+
+ if (runs->n_level_runs &&
+ runs->runs[runs->n_level_runs-1].end == i)
+ { /* Adjacent to this layer's last run: extend it */
+ ++runs->runs[runs->n_level_runs-1].end;
+ }
+ else
+ { /* Otherwise start a new, one-character run */
+ struct level_run *run=
+ level_runs_add(runs);
+
+ run->start=i;
+ run->end=i+1;
+ }
+ }
+ }
+ /* Pass 2: reverse the collected runs, highest level first */
+ for (size_t i=layers.n_lruns; i; )
+ {
+ struct level_runs *runs=layers.lruns+ --i;
+
+ for (size_t j=0; j<runs->n_level_runs; ++j)
+ {
+ size_t start=runs->runs[j].start;
+ size_t end=runs->runs[j].end;
+ size_t right=end;
+ size_t left=start;
+ /* Walk inwards from both ends of the run, swapping as we go */
+ while (right > left)
+ {
+ --right;
+ /* With a null p nothing is swapped, not even the levels */
+ if (p)
+ {
+ char32_t c=p[left];
+ unicode_bidi_level_t l=levels[left];
+
+ p[left]=p[right];
+ levels[left]=levels[right];
+ p[right]=c;
+ levels[right]=l;
+ }
+ ++left;
+ }
+ /* Single-character runs are not reported to the callback */
+ if (end-start > 1 && reorder_callback)
+ (*reorder_callback)(start, end-start, arg);
+ }
+ }
+
+ level_run_layers_deinit(&layers);
+}
diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C
index 51bed3c..adb7869 100644
--- a/unicode/unicodecpp.C
+++ b/unicode/unicodecpp.C
@@ -557,3 +557,79 @@ std::u32string unicode::toupper(const std::u32string &u)
return copy;
}
+// Overload without a paragraph embedding level: forwards UNICODE_BIDI_SKIP in its place.
+std::vector<unicode_bidi_level_t>
+unicode::bidi_calc(const std::u32string &s)
+{
+ return unicode::bidi_calc(s, UNICODE_BIDI_SKIP);
+}
+// Returns one level per character of s, filled in by unicode_bidi_calc(); empty if s is empty.
+std::vector<unicode_bidi_level_t>
+unicode::bidi_calc(const std::u32string &s,
+ unicode_bidi_level_t paragraph_embedding_level)
+{
+ const unicode_bidi_level_t *initial_embedding_level=0;
+ // Only UNICODE_BIDI_LR or UNICODE_BIDI_RL gets passed on; otherwise the pointer stays null
+ if (paragraph_embedding_level == UNICODE_BIDI_LR ||
+ paragraph_embedding_level == UNICODE_BIDI_RL)
+ {
+ initial_embedding_level=&paragraph_embedding_level;
+ }
+
+ std::vector<unicode_bidi_level_t> buf;
+
+ buf.resize(s.size());
+ // Nothing to calculate for an empty string (and &buf[0] needs a non-empty vector)
+ if (s.size())
+ {
+ unicode_bidi_calc(s.c_str(), s.size(), &buf[0],
+ initial_embedding_level);
+ }
+ return buf;
+}
+// Trampoline handed to unicode_bidi_reorder(): arg points to the caller's std::function.
+extern "C" {
+ static void reorder_callback(size_t i, size_t cnt,
+ void *arg)
+ {
+ auto p=reinterpret_cast<const std::function<void (size_t,
+ size_t)> *>
+ (arg);
+ // Forward the two size_t arguments unchanged
+ (*p)(i, cnt);
+ }
+}
+// Reorders string and levels in place via unicode_bidi_reorder(); -1 on a size mismatch, else 0.
+int unicode::bidi_reorder(std::u32string &string,
+ std::vector<unicode_bidi_level_t> &levels,
+ const std::function<void (size_t, size_t)> &lambda)
+{
+ size_t s=string.size();
+ // There must be exactly one level for each character
+ if (s != levels.size())
+ return -1;
+ // Empty input: nothing to reorder, and &string[0]/&levels[0] are not needed
+ if (!s)
+ return 0;
+ // The lambda's address travels through the void * argument to reorder_callback
+ unicode_bidi_reorder(&string[0], &levels[0], s,
+ reorder_callback,
+ const_cast<void *>
+ (reinterpret_cast<const void *>(&lambda)));
+
+ return 0;
+}
+// Levels-only variant: passes a null character buffer, so the lambda just gets told of each run.
+void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
+ const std::function<void (size_t, size_t)> &lambda)
+{
+ size_t s=levels.size();
+ // Nothing to do for an empty vector
+ if (!s)
+ return;
+ // NOTE(review): with a null buffer unicode_bidi_reorder() leaves levels unswapped too - confirm intended
+ unicode_bidi_reorder(0, &levels[0], s, reorder_callback,
+ const_cast<void *>
+ (reinterpret_cast<const void *>(&lambda)));
+
+}