diff options
| author | Sam Varshavchik | 2020-07-09 21:36:46 -0400 |
|---|---|---|
| committer | Sam Varshavchik | 2020-07-12 15:56:45 -0400 |
| commit | 7a9293cd28b293b793793368237d8856cfb0eff4 (patch) | |
| tree | 3c19854a7869103405c78a97e40503db64fac7b6 | |
| parent | 2219f725acd0dc36fa00080c846a8982273a6f61 (diff) | |
| download | courier-libs-7a9293cd28b293b793793368237d8856cfb0eff4.tar.bz2 | |
Documentation, C++ bindings, reorder.
| -rw-r--r-- | unicode/Makefile.am | 86 | ||||
| -rw-r--r-- | unicode/README | 4 | ||||
| -rw-r--r-- | unicode/biditest.C | 116 | ||||
| -rw-r--r-- | unicode/book.xml | 961 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 30 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 231 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 76 |
7 files changed, 1111 insertions, 393 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 397987c..081965e 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -85,7 +85,87 @@ include_HEADERS=courier-unicode.h \ courier-unicode-categories-tab.h \ courier-unicode-script-tab.h -man_MANS=$(srcdir)/man/courier-unicode.7 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert.3 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]convert_tocase.3 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]fromu.3 $(srcdir)/man/unicode[\:][\:]iconvert[\:][\:]tou.3 $(srcdir)/man/unicode[\:][\:]iso_8859_1.3 $(srcdir)/man/unicode[\:][\:]linebreak_callback_base.3 $(srcdir)/man/unicode[\:][\:]linebreak_callback_save_buf.3 $(srcdir)/man/unicode[\:][\:]linebreak_iter.3 $(srcdir)/man/unicode[\:][\:]linebreakc_callback_base.3 $(srcdir)/man/unicode[\:][\:]linebreakc_iter.3 $(srcdir)/man/unicode[\:][\:]tolower.3 $(srcdir)/man/unicode[\:][\:]toupper.3 $(srcdir)/man/unicode[\:][\:]ucs_2.3 $(srcdir)/man/unicode[\:][\:]ucs_4.3 $(srcdir)/man/unicode[\:][\:]utf_8.3 $(srcdir)/man/unicode[\:][\:]wordbreak_callback_base.3 $(srcdir)/man/unicode_category_lookup.3 $(srcdir)/man/unicode_convert.3 $(srcdir)/man/unicode_convert_deinit.3 $(srcdir)/man/unicode_convert_fromu_init.3 $(srcdir)/man/unicode_convert_fromu_tobuf.3 $(srcdir)/man/unicode_convert_fromutf8.3 $(srcdir)/man/unicode_convert_init.3 $(srcdir)/man/unicode_convert_tobuf.3 $(srcdir)/man/unicode_convert_tocase.3 $(srcdir)/man/unicode_convert_tocbuf_fromutf8_init.3 $(srcdir)/man/unicode_convert_tocbuf_init.3 $(srcdir)/man/unicode_convert_tocbuf_toutf8_init.3 $(srcdir)/man/unicode_convert_tou_init.3 $(srcdir)/man/unicode_convert_tou_tobuf.3 $(srcdir)/man/unicode_convert_toutf8.3 $(srcdir)/man/unicode_convert_uc.3 $(srcdir)/man/unicode_default_chset.3 $(srcdir)/man/unicode_grapheme_break.3 $(srcdir)/man/unicode_html40ent_lookup.3 $(srcdir)/man/unicode_isalnum.3 $(srcdir)/man/unicode_isalpha.3 $(srcdir)/man/unicode_isblank.3 $(srcdir)/man/unicode_isdigit.3 
$(srcdir)/man/unicode_isgraph.3 $(srcdir)/man/unicode_islower.3 $(srcdir)/man/unicode_ispunct.3 $(srcdir)/man/unicode_isspace.3 $(srcdir)/man/unicode_isupper.3 $(srcdir)/man/unicode_lb_end.3 $(srcdir)/man/unicode_lb_init.3 $(srcdir)/man/unicode_lb_next.3 $(srcdir)/man/unicode_lb_next_cnt.3 $(srcdir)/man/unicode_lb_set_opts.3 $(srcdir)/man/unicode_lbc_end.3 $(srcdir)/man/unicode_lbc_init.3 $(srcdir)/man/unicode_lbc_next.3 $(srcdir)/man/unicode_lbc_next_cnt.3 $(srcdir)/man/unicode_lbc_set_opts.3 $(srcdir)/man/unicode_lc.3 $(srcdir)/man/unicode_locale_chset.3 $(srcdir)/man/unicode_script.3 $(srcdir)/man/unicode_tc.3 $(srcdir)/man/unicode_u_ucs2_native.3 $(srcdir)/man/unicode_u_ucs4_native.3 $(srcdir)/man/unicode_uc.3 $(srcdir)/man/unicode_wb_end.3 $(srcdir)/man/unicode_wb_init.3 $(srcdir)/man/unicode_wb_next.3 $(srcdir)/man/unicode_wb_next_cnt.3 $(srcdir)/man/unicode_wbscan_end.3 $(srcdir)/man/unicode_wbscan_init.3 $(srcdir)/man/unicode_wbscan_next.3 +man_MANS= \ + $(srcdir)/man/courier-unicode.7 \ + $(srcdir)/man/unicode\:\:bidi_calc.3 \ + $(srcdir)/man/unicode\:\:bidi_reorder.3 \ + $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \ + $(srcdir)/man/unicode\:\:iconvert\:\:convert_tocase.3 \ + $(srcdir)/man/unicode\:\:iconvert\:\:fromu.3 \ + $(srcdir)/man/unicode\:\:iconvert\:\:tou.3 \ + $(srcdir)/man/unicode\:\:iso_8859_1.3 \ + $(srcdir)/man/unicode\:\:linebreak_callback_base.3 \ + $(srcdir)/man/unicode\:\:linebreak_callback_save_buf.3 \ + $(srcdir)/man/unicode\:\:linebreak_iter.3 \ + $(srcdir)/man/unicode\:\:linebreakc_callback_base.3 \ + $(srcdir)/man/unicode\:\:linebreakc_iter.3 \ + $(srcdir)/man/unicode\:\:tolower.3 \ + $(srcdir)/man/unicode\:\:toupper.3 \ + $(srcdir)/man/unicode\:\:ucs_2.3 \ + $(srcdir)/man/unicode\:\:ucs_4.3 \ + $(srcdir)/man/unicode\:\:utf_8.3 \ + $(srcdir)/man/unicode\:\:wordbreak_callback_base.3 \ + $(srcdir)/man/unicode_bidi.3 \ + $(srcdir)/man/unicode_bidi_bracket_type.3 \ + $(srcdir)/man/unicode_bidi_calc.3 \ + 
$(srcdir)/man/unicode_bidi_mirror.3 \ + $(srcdir)/man/unicode_bidi_reorder.3 \ + $(srcdir)/man/unicode_category_lookup.3 \ + $(srcdir)/man/unicode_convert.3 \ + $(srcdir)/man/unicode_convert_deinit.3 \ + $(srcdir)/man/unicode_convert_fromu_init.3 \ + $(srcdir)/man/unicode_convert_fromu_tobuf.3 \ + $(srcdir)/man/unicode_convert_fromutf8.3 \ + $(srcdir)/man/unicode_convert_init.3 \ + $(srcdir)/man/unicode_convert_tobuf.3 \ + $(srcdir)/man/unicode_convert_tocase.3 \ + $(srcdir)/man/unicode_convert_tocbuf_fromutf8_init.3 \ + $(srcdir)/man/unicode_convert_tocbuf_init.3 \ + $(srcdir)/man/unicode_convert_tocbuf_toutf8_init.3 \ + $(srcdir)/man/unicode_convert_tou_init.3 \ + $(srcdir)/man/unicode_convert_tou_tobuf.3 \ + $(srcdir)/man/unicode_convert_toutf8.3 \ + $(srcdir)/man/unicode_convert_uc.3 \ + $(srcdir)/man/unicode_default_chset.3 \ + $(srcdir)/man/unicode_grapheme_break.3 \ + $(srcdir)/man/unicode_grapheme_break_deinit.3 \ + $(srcdir)/man/unicode_grapheme_break_init.3 \ + $(srcdir)/man/unicode_grapheme_break_next.3 \ + $(srcdir)/man/unicode_html40ent_lookup.3 \ + $(srcdir)/man/unicode_isalnum.3 \ + $(srcdir)/man/unicode_isalpha.3 \ + $(srcdir)/man/unicode_isblank.3 \ + $(srcdir)/man/unicode_isdigit.3 \ + $(srcdir)/man/unicode_isgraph.3 \ + $(srcdir)/man/unicode_islower.3 \ + $(srcdir)/man/unicode_ispunct.3 \ + $(srcdir)/man/unicode_isspace.3 \ + $(srcdir)/man/unicode_isupper.3 \ + $(srcdir)/man/unicode_lb_end.3 \ + $(srcdir)/man/unicode_lb_init.3 \ + $(srcdir)/man/unicode_lb_next.3 \ + $(srcdir)/man/unicode_lb_next_cnt.3 \ + $(srcdir)/man/unicode_lb_set_opts.3 \ + $(srcdir)/man/unicode_lbc_end.3 \ + $(srcdir)/man/unicode_lbc_init.3 \ + $(srcdir)/man/unicode_lbc_next.3 \ + $(srcdir)/man/unicode_lbc_next_cnt.3 \ + $(srcdir)/man/unicode_lbc_set_opts.3 \ + $(srcdir)/man/unicode_lc.3 \ + $(srcdir)/man/unicode_line_break.3 \ + $(srcdir)/man/unicode_locale_chset.3 \ + $(srcdir)/man/unicode_script.3 \ + $(srcdir)/man/unicode_tc.3 \ + $(srcdir)/man/unicode_u_ucs2_native.3 \ 
+ $(srcdir)/man/unicode_u_ucs4_native.3 \ + $(srcdir)/man/unicode_uc.3 \ + $(srcdir)/man/unicode_wb_end.3 \ + $(srcdir)/man/unicode_wb_init.3 \ + $(srcdir)/man/unicode_wb_next.3 \ + $(srcdir)/man/unicode_wb_next_cnt.3 \ + $(srcdir)/man/unicode_wbscan_end.3 \ + $(srcdir)/man/unicode_wbscan_init.3 \ + $(srcdir)/man/unicode_wbscan_next.3 libcourier_unicode_la_SOURCES=\ courier-unicode-categories-tab.h \ @@ -329,7 +409,7 @@ docs.stamp: rm -f man/*.[123456789] mv man.tmp/* man rm -rf html.tmp man.tmp - perl -p -e 's/:/[\\:]/g if s@^man_MANS=.*@"man_MANS=" . join(" ", map { "\$$(srcdir)/$$_" } glob("man/*.[123456789]"))@e' Makefile.am >Makefile.am.new + perl -e '$$f=join("",<STDIN>); $$p=join("", map { " \\\n \$$(srcdir)/$$_" } glob("man/*.[123456789]")); $$p=~s/:/\\:/g; $$f =~ s/\nman_MANS=([^\n]|\n[^\n])*/\nman_MANS=$$p/s; print $$f' <Makefile.am >Makefile.am.new cmp Makefile.am Makefile.am.new || mv -f Makefile.am.new Makefile.am; rm -f Makefile.am.new touch docs.stamp @@ -405,4 +485,4 @@ distrelease: $(MAKE) dist www: - rsync -a html/. $$HOME/www/www.courier-mta.org/unicode + rsync -a html/. $$HOME/www/hostrocket/courier-mta.org/unicode diff --git a/unicode/README b/unicode/README index 2aeb1f0..926e004 100644 --- a/unicode/README +++ b/unicode/README @@ -25,6 +25,8 @@ Courier Unicode Library * Implementation of line breaking rules. + * Implementation of the bi-directional algorithm. + * Several ancillary functions, like looking up the unicode character that corresponds to some HTML 4.0 entity (such as “&”, for example), and determining the normal width or a double-width status of @@ -40,7 +42,7 @@ Courier Unicode Library Current status The current release of the Courier Unicode library is based on the Unicode - 8.0.0 standard. + 13.0.0 standard. 
-------------------------------------------------------------------------- diff --git a/unicode/biditest.C b/unicode/biditest.C index c58da0d..61841a1 100644 --- a/unicode/biditest.C +++ b/unicode/biditest.C @@ -5,7 +5,9 @@ #include <sstream> #include <string> #include <algorithm> +#include <utility> #include <iomanip> +#include <numeric> std::vector<std::string> testcase; @@ -43,6 +45,8 @@ int main(int argc, char **argv) std::vector<unicode_bidi_level_t> expected_levels; + std::vector<size_t> expected_reorder; + while (1) { buf.clear(); @@ -99,6 +103,28 @@ int main(int argc, char **argv) continue; } + + + if (buf.substr(0, 9) == "@Reorder:") + { + expected_reorder.clear(); + + std::istringstream i(buf); + + std::string word; + + i >> word; + + size_t n; + + while (i >> n) + { + expected_reorder.push_back(n); + } + continue; + } + + if (buf.substr(0, 1) == "@") continue; @@ -138,10 +164,9 @@ int main(int argc, char **argv) std::vector<unicode_bidi_level_t> actual_levels; - std::vector<char32_t> dummy_input; + std::u32string dummy_input; dummy_input.resize(testcase.size()); - actual_levels.resize(testcase.size()); static const unicode_bidi_level_t level_0=0; static const unicode_bidi_level_t level_1=1; @@ -153,9 +178,9 @@ int main(int argc, char **argv) { if (n & 1) { - unicode_bidi_calc(&dummy_input[0], - testcase.size(), - &actual_levels[0], level); + actual_levels=level ? 
+ unicode::bidi_calc(dummy_input,*level) + : unicode::bidi_calc(dummy_input); int matched=0; @@ -220,6 +245,87 @@ int main(int argc, char **argv) std::cerr << std::endl; exit(1); } + + std::vector<size_t> actual_reorder; + + actual_reorder.resize(testcase.size()); + + std::iota(actual_reorder.begin(), + actual_reorder.end(), 0); + + unicode::bidi_reorder + (dummy_input, + actual_levels, + [&] + (size_t s, size_t cnt) + { + auto *b=&actual_reorder[s]; + auto *e=b+cnt; + + while (b < e) + { + --e; + std::swap(*b, *e); + ++b; + } + }); + + auto b=actual_reorder.begin(), p=b, + e=actual_reorder.end(); + + auto q=actual_levels.begin(); + + while (b != e) + { + if (*q != UNICODE_BIDI_SKIP) + { + *p=*b; + ++p; + } + ++b; + ++q; + } + actual_reorder.erase(p, e); + + if (actual_reorder != expected_reorder) + { + fclose(DEBUGDUMP); + DEBUGDUMP=stderr; + std::cout << std::endl + << std::flush; + unicode_bidi_calc(&dummy_input[0], + testcase.size(), + &actual_levels[0], + level); + + std::cerr << "Regression, line " + << linenum; + + if (!level) + { + std::cerr << ", auto"; + } + else + { + std::cerr << + (*level ? ", RTL" + : ", LTR"); + } + std::cerr << ": expected reorder"; + + for (auto o:expected_reorder) + { + std::cerr << " " << o; + } + std::cerr << std::endl + << "Moved: "; + for (auto o:actual_reorder) + { + std::cerr << " " << o; + } + std::cerr << std::endl; + exit(1); + } } n >>= 1; diff --git a/unicode/book.xml b/unicode/book.xml index 41b8037..ee4b5e5 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -1,7 +1,8 @@ <?xml version="1.0" encoding="utf-8"?> <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" - "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [ + "https://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [ +<!ENTITY tr9ver "42"> <!ENTITY tr14ver "45"> <!ENTITY tr24ver "31"> <!ENTITY tr29ver "37"> @@ -19,7 +20,7 @@ See COPYING for distribution information. 
<para> This library implements several algorithms related to the - <ulink url="http://www.unicode.org/standard/standard.html">Unicode + <ulink url="https://www.unicode.org/standard/standard.html">Unicode Standard</ulink>: </para> @@ -33,25 +34,32 @@ See COPYING for distribution information. <listitem> <para> Implementation of - <ulink url="http://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme + <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme and work breaking</ulink> rules. </para> </listitem> <listitem> <para> Implementation of - <ulink url="http://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line + <ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line breaking</ulink> rules. </para> </listitem> <listitem> <para> + Implementation of the + <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">bi-directional + algorithm</ulink>. + </para> + </listitem> + <listitem> + <para> Several ancillary functions, like looking up the unicode character that corresponds to some HTML 4.0 entity (such as <quote>&amp;</quote>, for example), and determining the normal width or a double-width status of a unicode character. Also, an adaptation of the - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"> <citerefentry><refentrytitle>iconv</refentrytitle> <manvolnum>3</manvolnum></citerefentry></ulink> API for this unicode library. @@ -60,14 +68,14 @@ See COPYING for distribution information. <listitem> <para> Look up the - <ulink url="http://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode + <ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode script property</ulink>. </para> </listitem> <listitem> <para> Look up the - <ulink url="http://unicode.org/notes/tn36/">category</ulink> + <ulink url="https://unicode.org/notes/tn36/">category</ulink> property. 
</para> </listitem> @@ -82,7 +90,7 @@ See COPYING for distribution information. <para> The current release of the Courier Unicode library is based on the - Unicode 8.0.0 standard. + Unicode 13.0.0 standard. </para> </section> @@ -91,7 +99,7 @@ See COPYING for distribution information. <para> Download the current version of the library from - <ulink url="/download.html#unicode">http://www.courier-mta.org/download.html#unicode</ulink>. + <ulink url="/download.html#unicode">https://www.courier-mta.org/download.html#unicode</ulink>. After unpacking the tarball, run the configure script, which takes the usual options, followed by <command>make</command>, then <command>make install</command>. @@ -154,7 +162,7 @@ See COPYING for distribution information. <manvolnum>7</manvolnum></citerefentry></link>. Refer to the included manual pages, and - <ulink url="http://www.courier-mta.org/unicode/manpages.html"> the HTML + <ulink url="https://www.courier-mta.org/unicode/manpages.html"> the HTML version of the man pages</ulink> for more information. </para> </section> @@ -166,7 +174,7 @@ See COPYING for distribution information. <title>C manual pages</title> <refentry id="courier-unicode"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>courier-unicode</refentrytitle> <manvolnum>7</manvolnum> @@ -187,12 +195,12 @@ See COPYING for distribution information. <para> This library implements several algorithms related to the - <ulink url="http://www.unicode.org/standard/standard.html">Unicode + <ulink url="https://www.unicode.org/standard/standard.html">Unicode Standard</ulink>. 
This library uses - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> - <citerefentry><refentrytitle>iconv</refentrytitle> - <manvolnum>3</manvolnum></citerefentry></ulink> to convert + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html" + ><citerefentry><refentrytitle>iconv</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></ulink> to convert text in a given character set to unicode. Any character set displayed by <command>iconv --list</command> can be specified for the corresponding character set parameter. Additionally, @@ -229,6 +237,9 @@ See COPYING for distribution information. <link linkend="unicode_html40ent_lookup"> <citerefentry><refentrytitle>unicode_html40ent_lookup</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, + <link linkend="unicode_bidi"> + <citerefentry><refentrytitle>unicode_bidi</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link>, <link linkend="unicode_category_lookup"> <citerefentry><refentrytitle>unicode_category_lookup</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, @@ -247,6 +258,9 @@ See COPYING for distribution information. <link linkend="unicode_uc"> <citerefentry><refentrytitle>unicode_uc</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, + <link linkend="unicode__bidi"> + <citerefentry><refentrytitle>unicode::bidi</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link>, <link linkend="unicode__iconvert__convert"> <citerefentry><refentrytitle>unicode::iconvert::convert</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, @@ -272,8 +286,409 @@ See COPYING for distribution information. 
 </refsect1> </refentry> + <refentry id="unicode_bidi"> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> + + <refmeta> + <refentrytitle>unicode_bidi</refentrytitle> + <manvolnum>3</manvolnum> + </refmeta> + + <refnamediv> + <refname>unicode_bidi</refname> + <refname>unicode_bidi_calc</refname> + <refname>unicode_bidi_reorder</refname> + <refname>unicode_bidi_mirror</refname> + <refname>unicode_bidi_bracket_type</refname> + + <refpurpose>unicode bidirectional algorithm</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> + <funcsynopsisinfo>unicode_bidi_level_t lr=UNICODE_BIDI_LR</funcsynopsisinfo> + <funcprototype> + <funcdef>void <function>unicode_bidi_calc</function></funcdef> + <paramdef>const char32_t *<parameter>p</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>const unicode_bidi_level_t *<parameter>initial_embedding_level</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>void <function>unicode_bidi_reorder</function></funcdef> + <paramdef>char32_t *<parameter>string</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>void (*<parameter>reorder_callback</parameter>)(size_t, size_t, void *)</paramdef> + <paramdef>void *<parameter>arg</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>char32_t <function>unicode_bidi_mirror</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>char32_t <function>unicode_bidi_bracket_type</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + <paramdef>unicode_bracket_type_t *<parameter>ret</parameter></paramdef> 
+ </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + <refsect1> + <title>DESCRIPTION</title> + + <para> + <function>unicode_bidi_calc</function>() and + <function>unicode_bidi_reorder</function>() implement + the + <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-directional algorithm</ulink>. + </para> + <para> + The first two parameters to + <function>unicode_bidi_calc</function>() are a unicode string + and the number of characters in the Unicode string. + <parameter>levels</parameter> points to a buffer of + <classname>unicode_bidi_level_t</classname> values which the + caller is responsible for allocating and deallocating, and has + the same number of values as the number of characters in the + Unicode string. + </para> + <para> + <function>unicode_bidi_calc</function>() calculates the + embedding level of each character and fills in the + <parameter>levels</parameter> buffer (executes all steps of the + bidirectional algorithm up to step L1). + A <literal>NULL</literal> <parameter>initial_embedding_level</parameter> + value calculates the default paragraph embedding value. + A pointer to a <literal>UNICODE_BIDI_LR</literal> or + <literal>UNICODE_BIDI_RL</literal> value explicitly sets a + left-to-right or right-to-left paragraph embedding value. + </para> + + <para> + <function>unicode_bidi_calc</function>() calculates each + character's embedding value; an even value for left-to-right text + or an odd value for right-to-left text. A + <classname>UNICODE_BIDI_SKIP</classname> embedding level value + specifies a character whose embedding value is unspecified. + This is used for embedding and override markers which can be + removed from the string (together with their corresponding + embedding values). This can be + done before or after <function>unicode_bidi_reorder</function>. 
+ </para> + + <refsect2> + <title>Reordering text</title> + + <para> + <function>unicode_bidi_reorder</function> takes the actual + unicode string together with the embedding values from + <function>unicode_bidi_calc</function>, then reverses the + bidirectional string, as specified by step L2 of the bidirectional + algorithm. + </para> + + <para> + A non-<literal>NULL</literal> + <parameter>reorder_callback</parameter> gets invoked to report + each reversed character range. The callback's first parameter + is the index of the first reversed character, the second parameter + is the number of reversed characters. + The third parameter is the <parameter>arg</parameter> passthrough + parameter. + </para> + + <para> + <parameter>reorder_callback</parameter> gets invoked after + reversing each consecutive range of values in the + <parameter>string</parameter> and <parameter>levels</parameter> + buffers. For example: <quote>reorder_callback(5, 2, arg)</quote> + reports that character indexes #5 and #6 got reversed in the + string. + </para> + + <para> + Specifying a NULL <parameter>string</parameter> leaves the + <parameter>levels</parameter> buffer unchanged, but still + invokes the <parameter>reorder_callback</parameter> as if + the character string, and their values, were reversed. + </para> + </refsect2> + <refsect2> + <title>Miscellaneous utility functions</title> + + <para> + <function>unicode_bidi_mirror</function> + returns the glyph that's a mirror image of the parameter + (i.e. an open parenthesis for a close parenthesis, and vice + versa); or the same value if there is no mirror image. + </para> + + <para> + <function>unicode_bidi_bracket_type</function> + looks up each bracket character and returns its opposite, or + the same value if the character is not a bracket that has an + opposing bracket character. 
+ A non-NULL <parameter>ret</parameter> gets initialized to + either <literal>UNICODE_BIDI_o</literal>, + <literal>UNICODE_BIDI_c</literal> or + <literal>UNICODE_BIDI_n</literal>. + </para> + </refsect2> + </refsect1> + <refsect1> + <title>SEE ALSO</title> + <para> + <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">TR-9</ulink>, + <link linkend="unicode__bidi"> + <citerefentry><refentrytitle>unicode::bidi</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link>, + <link linkend="courier-unicode"> + <citerefentry> + <refentrytitle>courier-unicode</refentrytitle> + <manvolnum>7</manvolnum></citerefentry></link>, + </para> + </refsect1> + </refentry> + + <refentry id="unicode_category_lookup"> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> + + <refmeta> + <refentrytitle>unicode_category_lookup</refentrytitle> + <manvolnum>3</manvolnum> + </refmeta> + + <refnamediv> + <refname>unicode_category_lookup</refname> + <refname>unicode_isalnum</refname> + <refname>unicode_isalpha</refname> + <refname>unicode_isblank</refname> + <refname>unicode_isdigit</refname> + <refname>unicode_isgraph</refname> + <refname>unicode_islower</refname> + <refname>unicode_ispunct</refname> + <refname>unicode_isspace</refname> + <refname>unicode_isupper</refname> + + <refpurpose>unicode character categorization</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> + <funcprototype> + <funcdef>uint32_t <function>unicode_category_lookup</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_isalnum</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int 
<function>unicode_isalpha</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_isblank</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_isdigit</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_isgraph</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_islower</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_ispunct</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_isspace</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode_isupper</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + <refsect1> + <title>DESCRIPTION</title> + + <para> + <function>unicode_category_lookup</function>() looks up the + <ulink url="https://unicode.org/notes/tn36/">unicode character's + categorization</ulink>. + <function>unicode_category_lookup</function>() returns a 32 bit + value. + The value's + <symbol>UNICODE_CATEGORY_1</symbol> bits specify the first level + of the unicode character's category, with + <symbol>UNICODE_CATEGORY_2</symbol>, + <symbol>UNICODE_CATEGORY_3</symbol>, and + <symbol>UNICODE_CATEGORY_4</symbol> bits specifying the 2nd, + 3rd, and 4th level, if given. 
A value of 0 for each corresponding + bit set indicates that no category is specified for this level, + for this character; otherwise the possible values are defined + in <filename><courier-unicode.h></filename>. + </para> + + <para> + The remaining functions implement comparable equivalents of + their non-unicode versions in the standard C library, as follows: + </para> + + <variablelist> + <varlistentry> + <term><function>unicode_isalnum</function>()</term> + <listitem> + <para> + Returns non-0 for all + <function>unicode_isalpha</function>() or + <function>unicode_isdigit</function>(). + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_isalpha</function>()</term> + <listitem> + <para> + Returns non-0 for all + <symbol>UNICODE_CATEGORY_1_LETTER</symbol>. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_isblank</function>()</term> + <listitem> + <para> + Return non-0 for + <symbol>TAB</symbol>, and all + <symbol>UNICODE_CATEGORY_2_SPACE</symbol>. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_isdigit</function>()</term> + <listitem> + <para> + Returns non-0 for all + <symbol>UNICODE_CATEGORY_1_NUMBER</symbol> + | <symbol>UNICODE_CATEGORY_2_DIGIT</symbol>, + only (no third categories). + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_isgraph</function>()</term> + <listitem> + <para> + Returns non-0 for all codepoints above + <symbol>SPACE</symbol> which are not + <function>unicode_isspace</function>(). + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_islower</function>()</term> + <listitem> + <para> + Returns non-0 for all + <function>unicode_isalpha</function>() for which the + character is + equal to + <link linkend="unicode_uc"> + <citerefentry><refentrytitle>unicode_lc</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link> + of itself. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_ispunct</function>()</term> + <listitem> + <para> + Returns non-0 for all + <symbol>UNICODE_CATEGORY_1_PUNCTUATION</symbol>. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_isspace</function>()</term> + <listitem> + <para> + Returns non-0 for unicode_isblank() or + for unicode characters + with linebreaking properties of + <symbol>BK</symbol>, + <symbol>CR</symbol>, + <symbol>LF</symbol>, + <symbol>NL</symbol>, + and + <symbol>SP</symbol>. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>unicode_isupper</function>()</term> + <listitem> + <para> + Returns non-0 for all + <function>unicode_isalpha</function>() for which the + character is + equal to + <link linkend="unicode_uc"> + <citerefentry><refentrytitle>unicode_uc</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link> + of itself. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + <refsect1> + <title>SEE ALSO</title> + <para> + <link linkend="courier-unicode"> + <citerefentry> + <refentrytitle>courier-unicode</refentrytitle> + <manvolnum>7</manvolnum></citerefentry></link>, + <link linkend="unicode_uc"> + <citerefentry><refentrytitle>unicode_convert_tocase</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link>. + </para> + </refsect1> + </refentry> + <refentry id="unicode_convert"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_convert</refentrytitle> @@ -444,7 +859,7 @@ See COPYING for distribution information. 
<function>unicode_convert_init</function>(), <function>unicode_convert</function>(), and <function>unicode_convert_deinit</function>() are an adaption of th - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"> <citerefentry><refentrytitle>iconv</refentrytitle> <manvolnum>3</manvolnum></citerefentry></ulink> API that uses the same calling convention as the other algorithms in this unicode library, @@ -668,7 +1083,7 @@ See COPYING for distribution information. </refentry> <refentry id="unicode_default_chset"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_default_chset</refentrytitle> @@ -721,7 +1136,7 @@ See COPYING for distribution information. </refentry> <refentry id="unicode_html40ent_lookup"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_html40ent_lookup</refentrytitle> @@ -780,251 +1195,18 @@ See COPYING for distribution information. 
</refsect1> </refentry> - <refentry id="unicode_category_lookup"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> - - <refmeta> - <refentrytitle>unicode_category_lookup</refentrytitle> - <manvolnum>3</manvolnum> - </refmeta> - - <refnamediv> - <refname>unicode_category_lookup</refname> - <refname>unicode_isalnum</refname> - <refname>unicode_isalpha</refname> - <refname>unicode_isblank</refname> - <refname>unicode_isdigit</refname> - <refname>unicode_isgraph</refname> - <refname>unicode_islower</refname> - <refname>unicode_ispunct</refname> - <refname>unicode_isspace</refname> - <refname>unicode_isupper</refname> - - <refpurpose>unicode character categorization</refpurpose> - </refnamediv> - - <refsynopsisdiv> - <funcsynopsis> - <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> - <funcprototype> - <funcdef>uint32_t <function>unicode_category_lookup</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isalnum</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isalpha</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isblank</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isdigit</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isgraph</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_islower</function></funcdef> - <paramdef>char32_t 
<parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_ispunct</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isspace</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - - <funcprototype> - <funcdef>int <function>unicode_isupper</function></funcdef> - <paramdef>char32_t <parameter>c</parameter></paramdef> - </funcprototype> - </funcsynopsis> - </refsynopsisdiv> - <refsect1> - <title>DESCRIPTION</title> - - <para> - <function>unicode_category_lookup</function>() looks up the - <ulink url="http://unicode.org/notes/tn36/">unicode character's - categorization</ulink>. - <function>unicode_category_lookup</function>() returns a 32 bit - value. - The value's - <symbol>UNICODE_CATEGORY_1</symbol> bits specify the first level - of the unicode character's category, with - <symbol>UNICODE_CATEGORY_2</symbol>, - <symbol>UNICODE_CATEGORY_3</symbol>, and - <symbol>UNICODE_CATEGORY_4</symbol> bits specifying the 2nd, - 3rd, and 4th level, if given. A value of 0 for each corresponding - bit set indicates that no category is specified for this level, - for this character; otherwise the possible values are defined - in <filename><courier-unicode.h></filename>. - </para> - - <para> - The remaining functions implement comparable equivalents of - their non-unicode versions in the standard C library, as follows: - </para> - - <variablelist> - <varlistentry> - <term><function>unicode_isalnum</function>()</term> - <listitem> - <para> - Returns non-0 for all - <function>unicode_isalpha</function>() or - <function>unicode_isdigit</function>(). - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_isalpha</function>()</term> - <listitem> - <para> - Returns non-0 for all - <symbol>UNICODE_CATEGORY_1_LETTER</symbol>. 
- </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_isblank</function>()</term> - <listitem> - <para> - Return non-0 for - <symbol>TAB</symbol>, and all - <symbol>UNICODE_CATEGORY_2_SPACE</symbol>. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_isdigit</function>()</term> - <listitem> - <para> - Returns non-0 for all - <symbol>UNICODE_CATEGORY_1_NUMBER</symbol> - | <symbol>UNICODE_CATEGORY_2_DIGIT</symbol>, - only (no third categories). - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_isgraph</function>()</term> - <listitem> - <para> - Returns non-0 for all codepoints above - <symbol>SPACE</symbol> which are not - <function>unicode_isspace</function>(). - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_islower</function>()</term> - <listitem> - <para> - Returns non-0 for all - <function>unicode_isalpha</function>() for which the - character is - equal to - <link linkend="unicode_uc"> - <citerefentry><refentrytitle>unicode_lc</refentrytitle> - <manvolnum>3</manvolnum></citerefentry></link> - of itself. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_ispunct</function>()</term> - <listitem> - <para> - Returns non-0 for all - <symbol>UNICODE_CATEGORY_1_PUNCTUATION</symbol>. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_isspace</function>()</term> - <listitem> - <para> - Returns non-0 for unicode_isblank() or - for unicode characters - with linebreaking properties of - <symbol>BK</symbol>, - <symbol>CR</symbol>, - <symbol>LF</symbol>, - <symbol>NL</symbol>, - and - <symbol>SP</symbol>. 
- </para> - </listitem> - </varlistentry> - - <varlistentry> - <term><function>unicode_isupper</function>()</term> - <listitem> - <para> - Returns non-0 for all - <function>unicode_isalpha</function>() for which the - character is - equal to - <link linkend="unicode_uc"> - <citerefentry><refentrytitle>unicode_uc</refentrytitle> - <manvolnum>3</manvolnum></citerefentry></link> - of itself. - </para> - </listitem> - </varlistentry> - </variablelist> - </refsect1> - <refsect1> - <title>SEE ALSO</title> - <para> - <link linkend="courier-unicode"> - <citerefentry> - <refentrytitle>courier-unicode</refentrytitle> - <manvolnum>7</manvolnum></citerefentry></link>, - <link linkend="unicode_uc"> - <citerefentry><refentrytitle>unicode_convert_tocase</refentrytitle> - <manvolnum>3</manvolnum></citerefentry></link>. - </para> - </refsect1> - </refentry> - <refentry id="unicode_grapheme_break"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_grapheme_break</refentrytitle> - <refentrytitle>unicode_grapheme_break_init</refentrytitle> - <refentrytitle>unicode_grapheme_break_next</refentrytitle> - <refentrytitle>unicode_grapheme_break_deinit</refentrytitle> <manvolnum>3</manvolnum> </refmeta> <refnamediv> <refname>unicode_grapheme_break</refname> + <refname>unicode_grapheme_break_init</refname> + <refname>unicode_grapheme_break_next</refname> + <refname>unicode_grapheme_break_deinit</refname> <refpurpose>unicode grapheme cluster boundary rules</refpurpose> </refnamediv> @@ -1059,22 +1241,23 @@ See COPYING for distribution information. <title>DESCRIPTION</title> <para> + These functions implement the unicode grapheme cluster breaking + algorithm. 
Invoke + <function>unicode_grapheme_break_init</function>() to initialize + the grapheme cluster breaking algorithm. <function>unicode_grapheme_break_init</function>() returns an - opaque handle for an object that computes grapheme breaks. - Each call to <function>unicode_grapheme_break_next</function>() - passes one character of a unicode string, and returns a non-0 - value if there's a grapheme break before this character, in the + opaque handle. Each subsequent call to + <function>unicode_grapheme_break_next</function>() passes this + handle, and the next character. + <function>unicode_grapheme_break_next</function>() returns a non-0 + value if there's a grapheme break before the character, in a sequence of Unicode characters. <function>unicode_grapheme_break_deinit</function>() releases - all reosurces used by the grapheme breaking handle. + all resources used by the grapheme breaking handle, and the + <classname>unicode_grapheme_break_info_t</classname> handle + is no longer valid after this call. </para> <para> - Call - <function>unicode_grapheme_break_init</function>(), then call - <function>unicode_grapheme_break_next</function>() for each - character, - then call - <function>unicode_grapheme_break_deinit</function>(). The first call to <function>unicode_grapheme_break_next</function>() always returns non-0, as per the GB1 rule. </para> @@ -1085,10 +1268,11 @@ See COPYING for distribution information. <parameter>a</parameter> and <parameter>b</parameter>. This is is equivalent to calling - <function>> unicode_grapheme_break_init</function>(), + <function>unicode_grapheme_break_init</function>(), followed by two calls to <function> unicode_grapheme_break_next</function>(), and finally - <function>unicode_grapheme_break_deinit</function>(), and returns + <function>unicode_grapheme_break_deinit</function>(), then + returning the result of the second call to <function>unicode_grapheme_break_next</function>().
</para> @@ -1098,7 +1282,7 @@ See COPYING for distribution information. <title>SEE ALSO</title> <para> - <ulink url="http://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>, + <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>, <link linkend="courier-unicode"> <citerefentry> <refentrytitle>courier-unicode</refentrytitle> @@ -1116,60 +1300,15 @@ See COPYING for distribution information. </refsect1> </refentry> - <refentry id="unicode_script"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> - <refmeta> - <refentrytitle>unicode_script</refentrytitle> - <manvolnum>3</manvolnum> - </refmeta> - - <refnamediv> - <refname>unicode_script</refname> - <refpurpose>unicode script property</refpurpose> - </refnamediv> - - <refsynopsisdiv> - <funcsynopsis> - <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> - <funcprototype> - <funcdef>unicode_script_t <function>unicode_script</function></funcdef> - <paramdef>char32_t <parameter>ch</parameter></paramdef> - </funcprototype> - </funcsynopsis> - </refsynopsisdiv> - <refsect1> - <title>DESCRIPTION</title> - <para> - <function>unicode_script</function>() looks up the - <quote>script</quote> property of the specified unicode character, - and returns it. The <classname>unicode_script_t</classname> - enumeration encodes possible unicode script values. - <literal>unicode_script_unknown</literal> gets returned for a - unicode character with an unknown script property. - </para> - </refsect1> - - <refsect1> - <title>SEE ALSO</title> - - <para> - <ulink url="http://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">TR-24</ulink>, - <link linkend="courier-unicode"> - <citerefentry> - <refentrytitle>courier-unicode</refentrytitle> - <manvolnum>7</manvolnum></citerefentry></link>. 
- </para> - </refsect1> - </refentry> - <refentry id="unicode_line_break"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_line_break</refentrytitle> <manvolnum>3</manvolnum> </refmeta> <refnamediv> + <refname>unicode_line_break</refname> <refname>unicode_lb_init</refname> <refname>unicode_lb_set_opts</refname> <refname>unicode_lb_next</refname> @@ -1483,13 +1622,59 @@ See COPYING for distribution information. <link linkend="unicode__linebreak"> <citerefentry><refentrytitle>unicode::linebreak</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, - <ulink url="http://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">TR-14</ulink> + <ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">TR-14</ulink> + </para> + </refsect1> + </refentry> + + <refentry id="unicode_script"> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> + <refmeta> + <refentrytitle>unicode_script</refentrytitle> + <manvolnum>3</manvolnum> + </refmeta> + + <refnamediv> + <refname>unicode_script</refname> + <refpurpose>unicode script property</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> + <funcprototype> + <funcdef>unicode_script_t <function>unicode_script</function></funcdef> + <paramdef>char32_t <parameter>ch</parameter></paramdef> + </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + <refsect1> + <title>DESCRIPTION</title> + <para> + <function>unicode_script</function>() looks up the + <quote>script</quote> property of the 
specified unicode character, + and returns it. The <classname>unicode_script_t</classname> + enumeration encodes possible unicode script values. + <literal>unicode_script_unknown</literal> gets returned for a + unicode character with an unknown script property. + </para> + </refsect1> + + <refsect1> + <title>SEE ALSO</title> + + <para> + <ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">TR-24</ulink>, + <link linkend="courier-unicode"> + <citerefentry> + <refentrytitle>courier-unicode</refentrytitle> + <manvolnum>7</manvolnum></citerefentry></link>. </para> </refsect1> </refentry> <refentry id="unicode_word_break"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_word_break</refentrytitle> <manvolnum>3</manvolnum> @@ -1682,7 +1867,7 @@ See COPYING for distribution information. <refsect1> <title>SEE ALSO</title> <para> - <ulink url="http://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>, + <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>, <link linkend="courier-unicode"> <citerefentry> <refentrytitle>courier-unicode</refentrytitle> @@ -1704,7 +1889,7 @@ See COPYING for distribution information. 
</refentry> <refentry id="unicode_uc"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode_uc</refentrytitle> <manvolnum>3</manvolnum> @@ -1816,8 +2001,109 @@ See COPYING for distribution information. <section id="manpagescpp"> <title>C++ manual pages</title> + <refentry id="unicode__bidi"> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> + + <refmeta> + <refentrytitle>unicode::bidi::calc</refentrytitle> + <manvolnum>3</manvolnum> + </refmeta> + + <refnamediv> + <refname>unicode::bidi_calc</refname> + <refname>unicode::bidi_reorder</refname> + <refpurpose>unicode bidirectional algorithm</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> + <funcprototype> + <funcdef>std::vector<unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> + <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <funcsynopsis> + <funcprototype> + <funcdef>std::vector<unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> + <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <funcsynopsis> + <funcprototype> + <funcdef>int <function>unicode::bidi_reorder</function></funcdef> + <paramdef>std::u32string &<parameter>string</parameter></paramdef> + <paramdef>std::vector<unicode_bidi_level_t> &<parameter>
embedding_level</parameter></paramdef> + <paramdef>const std::function<void (size_t, size_t)> &<parameter>reorder_callback</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <funcsynopsis> + <funcprototype> + <funcdef>int <function>unicode::bidi_reorder</function></funcdef> + <paramdef>std::vector<unicode_bidi_level_t> &<parameter>embedding_level</parameter></paramdef> + <paramdef>const std::function<void (size_t, size_t)> &<parameter>reorder_callback</parameter></paramdef> + </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + + <refsect1> + <title>DESCRIPTION</title> + + <para> + These functions implement the C++ interface for the + <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-directional algorithm</ulink>. + See the description of the underlying + <link linkend="unicode_bidi"> + <citerefentry><refentrytitle>unicode_bidi</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link> C library + API for more information. + </para> + + <para> + <function>unicode::bidi_calc</function> computes and returns a vector + of bidirectional embedding level values for the given Unicode string. + An overload takes an additional parameter that overrides the + paragraph embedding level, a <literal>UNICODE_BIDI_LR</literal> or + a <literal>UNICODE_BIDI_RL</literal> value. + </para> + <para> + <function>unicode::bidi_reorder</function> reverses the characters + in the Unicode string, according to their embedding levels (and + reverses the corresponding embedding level values too). + As with the C API, an optional parameter is a callable object + that gets invoked to report each range of characters that gets + reversed (specified as the starting position and a number of + characters).
+ </para> + <para> + An overloaded <function>unicode::bidi_reorder</function> without + the string parameter goes through the motions, according to the + embedding level vector parameter, without actually reversing + the values in the vector, while still invoking the callable object + normally. + </para> + <para> + This is comparable to the C API. Also comparable with the C API: + the convention that even embedding levels specify left to right + text and odd embedding levels specify right to left text. + An embedding value of <literal>UNICODE_BIDI_SKIP</literal> + indicates an embedding or an override marker that has no + specified embedding value. These markers may be removed from the + Unicode string (together with the + <literal>UNICODE_BIDI_SKIP</literal> + values from the embedding values vector) either before or after + they get reordered. + </para> + </refsect1> + </refentry> + + + <refentry id="unicode__iconvert__convert"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::iconvert::convert</refentrytitle> @@ -1951,7 +2237,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> <link linkend="unicode_convert"> <citerefentry><refentrytitle>unicode_convert</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"> <citerefentry><refentrytitle>iconv</refentrytitle> <manvolnum>3</manvolnum></citerefentry></ulink>.
@@ -1960,7 +2246,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </refentry> <refentry id="unicode__iconvert__convert_tocase"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::iconvert::convert_tocase</refentrytitle> @@ -2041,7 +2327,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> <link linkend="unicode_convert"> <citerefentry><refentrytitle>unicode_convert</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"> <citerefentry><refentrytitle>iconv</refentrytitle> <manvolnum>3</manvolnum></citerefentry></ulink>. 
@@ -2050,7 +2336,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </refentry> <refentry id="unicode__iconvert__fromu"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::iconvert::fromu</refentrytitle> @@ -2138,7 +2424,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> <link linkend="unicode_convert"> <citerefentry><refentrytitle>unicode_convert</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"> <citerefentry><refentrytitle>iconv</refentrytitle> <manvolnum>3</manvolnum></citerefentry></ulink>. 
@@ -2147,7 +2433,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </refentry> <refentry id="unicode__iconvert__tou"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::iconvert::tou</refentrytitle> @@ -2237,7 +2523,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> <link linkend="unicode_convert"> <citerefentry><refentrytitle>unicode_convert</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, - <ulink url="http://manpages.courier-mta.org/htmlman3/iconv.3.html"> + <ulink url="https://manpages.courier-mta.org/htmlman3/iconv.3.html"> <citerefentry><refentrytitle>iconv</refentrytitle> <manvolnum>3</manvolnum></citerefentry></ulink>. 
@@ -2246,7 +2532,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </refentry> <refentry id="unicode__linebreak"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::linebreak</refentrytitle> @@ -2447,7 +2733,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int> </refentry> <refentry id="unicode__tolower"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::tolower</refentrytitle> @@ -2542,19 +2828,8 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int> </refsect1> </refentry> - - - - - - - - - - - <refentry id="unicode__wordbreak"> - <info><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></info> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> <refmeta> <refentrytitle>unicode::wordbreak</refentrytitle> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index 67f3bda..b8c88f4 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -2,7 +2,7 @@ #define courier_unicode_h /* -** Copyright 2000-2018 Double Precision, Inc. +** Copyright 2000-2020 Double Precision, Inc. 
** See COPYING for distribution information. ** */ @@ -12,6 +12,7 @@ #include <string> #include <vector> #include <list> +#include <functional> extern "C" { #endif @@ -40,7 +41,7 @@ typedef uint32_t char32_t; #endif #endif -#define COURIER_UNICODE_VERSION 210 +#define COURIER_UNICODE_VERSION 220 /* ** The system default character set, from the locale. @@ -605,6 +606,13 @@ extern void unicode_bidi_calc(const char32_t *p, size_t n, const unicode_bidi_level_t * initial_embedding_level); +extern void unicode_bidi_reorder(char32_t *p, + unicode_bidi_level_t *levels, + size_t n, + void (*reorder_callback)(size_t, size_t, + void *), + void *arg); + /* ** A buffer that holds unicode characters, and dynamically grows as needed. */ @@ -2025,6 +2033,24 @@ std::u32string tolower(const std::u32string &u); std::u32string toupper(const std::u32string &u); +//! Calculate bidirectional embedding levels +std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s); + +//! Calculate bidirectional embedding levels +std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s, + unicode_bidi_level_t level); + +//! Reorder bidirectional text +int bidi_reorder(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t, size_t)> &reorder_callback= + [](size_t, size_t){}); + +//! Reorder bidirectional text +void bidi_reorder(std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t, size_t)> &reorder_callback= + [](size_t, size_t){}); + #if 0 { #endif diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 38dcb44..9e7fcf4 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -1,5 +1,5 @@ /* -** Copyright 2011-2020 Double Precision, Inc. +** Copyright 2020 Double Precision, Inc. ** See COPYING for distribution information. 
** */ @@ -148,14 +148,56 @@ struct level_run { size_t end; /* one past */ }; +/* A growing list of level runs */ + +struct level_runs { + struct level_run *runs; /* All level runs in the sequence */ + size_t n_level_runs; /* How many of them */ + size_t cap_level_runs; /* Capacity of the level runs */ +}; + +static void level_runs_init(struct level_runs *p) +{ + p->runs=0; + p->n_level_runs=0; + p->cap_level_runs=0; +} + +static void level_runs_deinit(struct level_runs *p) +{ + if (p->runs) + free(p->runs); +} + +static struct level_run *level_runs_add(struct level_runs *p) +{ + if (p->n_level_runs == p->cap_level_runs) + { + p->cap_level_runs *= 2; + + if (p->cap_level_runs == 0) + p->cap_level_runs=1; + + p->runs=(struct level_run *) + (p->runs ? + realloc(p->runs, + sizeof(struct level_run) * + p->cap_level_runs) + :malloc(sizeof(struct level_run) * + p->cap_level_runs)); + if (!p->runs) + abort(); + } + + return p->runs + (p->n_level_runs++); +} + /* An isolating run sequence */ struct isolating_run_sequence_s { struct isolating_run_sequence_s *prev, *next; /* Linked list */ - struct level_run *level_runs; /* All level runs in the sequence */ - size_t n_level_runs; /* How many of them */ - size_t cap_level_runs; /* Capacity of the level runs */ + struct level_runs runs; unicode_bidi_level_t embedding_level; /* This seq's embedding level */ enum_bidi_class_t sos, eos; }; @@ -185,11 +227,11 @@ static irs_iterator irs_begin(struct isolating_run_sequence_s *seq) /* Edge case, empty isolating run sequence */ - while (iter.level_run_i < seq->n_level_runs) + while (iter.level_run_i < seq->runs.n_level_runs) { - iter.i=seq->level_runs[iter.level_run_i].start; + iter.i=seq->runs.runs[iter.level_run_i].start; - if (iter.i < seq->level_runs[iter.level_run_i].end) + if (iter.i < seq->runs.runs[iter.level_run_i].end) break; ++iter.level_run_i; @@ -202,7 +244,7 @@ static irs_iterator irs_end(struct isolating_run_sequence_s *seq) irs_iterator iter; iter.seq=seq; - 
iter.level_run_i=seq->n_level_runs; + iter.level_run_i=seq->runs.n_level_runs; return iter; } @@ -214,7 +256,7 @@ static int irs_compare(const irs_iterator *a, if (a->level_run_i > b->level_run_i) return 1; - if (a->level_run_i == a->seq->n_level_runs) + if (a->level_run_i == a->seq->runs.n_level_runs) return 0; if (a->i < b->i) @@ -227,7 +269,7 @@ static int irs_compare(const irs_iterator *a, static void irs_incr(irs_iterator *iter) { - if (iter->seq->n_level_runs == iter->level_run_i) + if (iter->seq->runs.n_level_runs == iter->level_run_i) { fprintf(stderr, "%s%s\n", "Internal error: attempting to increment ", @@ -235,10 +277,10 @@ static void irs_incr(irs_iterator *iter) abort(); } - if (++iter->i >= iter->seq->level_runs[iter->level_run_i].end) + if (++iter->i >= iter->seq->runs.runs[iter->level_run_i].end) { - if (++iter->level_run_i < iter->seq->n_level_runs) - iter->i=iter->seq->level_runs[iter->level_run_i].start; + if (++iter->level_run_i < iter->seq->runs.n_level_runs) + iter->i=iter->seq->runs.runs[iter->level_run_i].start; } } @@ -246,8 +288,8 @@ static void irs_decr(irs_iterator *iter) { while (1) { - if (iter->seq->n_level_runs > iter->level_run_i && - iter->i > iter->seq->level_runs[iter->level_run_i].start) + if (iter->seq->runs.n_level_runs > iter->level_run_i && + iter->i > iter->seq->runs.runs[iter->level_run_i].start) { --iter->i; break; @@ -261,7 +303,7 @@ static void irs_decr(irs_iterator *iter) abort(); } - iter->i=iter->seq->level_runs[--iter->level_run_i].end; + iter->i=iter->seq->runs.runs[--iter->level_run_i].end; } } @@ -328,13 +370,12 @@ isolating_run_sequences_init(struct isolating_run_sequences_s *p, if (!seq) abort(); - if ((seq->level_runs=(struct level_run *) - malloc(sizeof(struct level_run))) == 0) abort(); + level_runs_init(&seq->runs); - seq->level_runs->start=i; - seq->level_runs->end=i; + struct level_run *run=level_runs_add(&seq->runs); - seq->n_level_runs=seq->cap_level_runs=1; + run->start=i; + run->end=i; 
seq->embedding_level=embedding_level; if (!p->head) @@ -355,7 +396,7 @@ static void isolating_run_sequences_record(struct isolating_run_sequence_s *p, size_t i) { struct level_run *current_level_run= - &p->level_runs[p->n_level_runs-1]; + &p->runs.runs[p->runs.n_level_runs-1]; if (current_level_run->start == current_level_run->end) { @@ -375,19 +416,7 @@ static void isolating_run_sequences_record(struct isolating_run_sequence_s *p, ** run sequence. */ - if (p->n_level_runs == p->cap_level_runs) - { - p->cap_level_runs *= 2; - - p->level_runs=(struct level_run *) - realloc(p->level_runs, - sizeof(struct level_run) * - p->cap_level_runs); - if (!p->level_runs) - abort(); - } - - current_level_run = p->level_runs + (p->n_level_runs++); + current_level_run=level_runs_add(&p->runs); current_level_run->start=i; current_level_run->end=i+1; @@ -430,7 +459,7 @@ static void isolating_run_sequences_deinit(struct isolating_run_sequences_s *p) seq=seq->next; - free(p->level_runs); + level_runs_deinit(&p->runs); free(p); } @@ -706,12 +735,12 @@ void dump_sequence_info(directional_status_stack_t stack, (seq->sos == UNICODE_BIDI_CLASS_L ? 'L':'R'), (seq->eos == UNICODE_BIDI_CLASS_L ? 'L':'R')); - for (size_t i=0; i<seq->n_level_runs; ++i) + for (size_t i=0; i<seq->runs.n_level_runs; ++i) { fprintf(DEBUGDUMP, "%s[%lu-%lu]", i == 0 ? 
" ":", ", - (unsigned long)seq->level_runs[i].start, - (unsigned long)seq->level_runs[i].end-1); + (unsigned long)seq->runs.runs[i].start, + (unsigned long)seq->runs.runs[i].end-1); } fprintf(DEBUGDUMP, "\n"); } @@ -1706,3 +1735,127 @@ static void unicode_bidi_n(directional_status_stack_t stack, dump_sequence("Contents after I", stack, seq); #endif } + +struct level_run_layers { + struct level_runs *lruns; /* At this embedding level, or higher */ + size_t n_lruns; /* How many of them */ + size_t cap_lruns; /* Capacity of the level runs */ +}; + +static void level_run_layers_init(struct level_run_layers *p) +{ + p->lruns=0; + p->n_lruns=0; + p->cap_lruns=0; +} + +static void level_run_layers_deinit(struct level_run_layers *p) +{ + if (p->lruns) + { + for (size_t i=0; i<p->n_lruns; ++i) + level_runs_deinit(&p->lruns[i]); + free(p->lruns); + } +} + +static void level_run_layers_add(struct level_run_layers *p) +{ + if (p->n_lruns == p->cap_lruns) + { + p->cap_lruns *= 2; + + if (p->cap_lruns == 0) + p->cap_lruns=1; + + p->lruns=(struct level_runs *) + (p->lruns ? 
+ realloc(p->lruns, + sizeof(struct level_runs) * + p->cap_lruns) + :malloc(sizeof(struct level_runs) * + p->cap_lruns)); + if (!p->lruns) + abort(); + } + + level_runs_init(p->lruns + (p->n_lruns++)); +} + +void unicode_bidi_reorder(char32_t *p, + unicode_bidi_level_t *levels, + size_t n, + void (*reorder_callback)(size_t, size_t, void *), + void *arg) +{ + /* L2 */ + + struct level_run_layers layers; + unicode_bidi_level_t previous_level=0; + + level_run_layers_init(&layers); + + for (size_t i=0; i<n; ++i) + { + if (levels[i] != UNICODE_BIDI_SKIP) + previous_level=levels[i]; + + while (layers.n_lruns <= previous_level) + level_run_layers_add(&layers); + + /* We intentionally don't put anything in level 0 */ + for (size_t j=1; j<=previous_level; ++j) + { + struct level_runs *runs=layers.lruns+j; + + if (runs->n_level_runs && + runs->runs[runs->n_level_runs-1].end == i) + { + ++runs->runs[runs->n_level_runs-1].end; + } + else + { + struct level_run *run= + level_runs_add(runs); + + run->start=i; + run->end=i+1; + } + } + } + + for (size_t i=layers.n_lruns; i; ) + { + struct level_runs *runs=layers.lruns+ --i; + + for (size_t j=0; j<runs->n_level_runs; ++j) + { + size_t start=runs->runs[j].start; + size_t end=runs->runs[j].end; + size_t right=end; + size_t left=start; + + while (right > left) + { + --right; + + if (p) + { + char32_t c=p[left]; + unicode_bidi_level_t l=levels[left]; + + p[left]=p[right]; + levels[left]=levels[right]; + p[right]=c; + levels[right]=l; + } + ++left; + } + + if (end-start > 1 && reorder_callback) + (*reorder_callback)(start, end-start, arg); + } + } + + level_run_layers_deinit(&layers); +} diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index 51bed3c..adb7869 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -557,3 +557,79 @@ std::u32string unicode::toupper(const std::u32string &u) return copy; } + +std::vector<unicode_bidi_level_t> +unicode::bidi_calc(const std::u32string &s) +{ + return unicode::bidi_calc(s, 
UNICODE_BIDI_SKIP); +} + +std::vector<unicode_bidi_level_t> +unicode::bidi_calc(const std::u32string &s, + unicode_bidi_level_t paragraph_embedding_level) +{ + const unicode_bidi_level_t *initial_embedding_level=0; + + if (paragraph_embedding_level == UNICODE_BIDI_LR || + paragraph_embedding_level == UNICODE_BIDI_RL) + { + initial_embedding_level=¶graph_embedding_level; + } + + std::vector<unicode_bidi_level_t> buf; + + buf.resize(s.size()); + + if (s.size()) + { + unicode_bidi_calc(s.c_str(), s.size(), &buf[0], + initial_embedding_level); + } + return buf; +} + +extern "C" { + static void reorder_callback(size_t i, size_t cnt, + void *arg) + { + auto p=reinterpret_cast<const std::function<void (size_t, + size_t)> *> + (arg); + + (*p)(i, cnt); + } +} + +int unicode::bidi_reorder(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t, size_t)> &lambda) +{ + size_t s=string.size(); + + if (s != levels.size()) + return -1; + + if (!s) + return 0; + + unicode_bidi_reorder(&string[0], &levels[0], s, + reorder_callback, + const_cast<void *> + (reinterpret_cast<const void *>(&lambda))); + + return 0; +} + +void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t, size_t)> &lambda) +{ + size_t s=levels.size(); + + if (!s) + return; + + unicode_bidi_reorder(0, &levels[0], s, reorder_callback, + const_cast<void *> + (reinterpret_cast<const void *>(&lambda))); + +} |
