diff options
| author | Sam Varshavchik | 2020-07-12 09:44:24 -0400 |
|---|---|---|
| committer | Sam Varshavchik | 2020-08-02 14:56:50 -0400 |
| commit | d2915c9cadf6fbc5ae29ffc387cce987b88dbbe0 (patch) | |
| tree | f76c8edf36fb84c6e082f2a4ae9798b10aeda70e | |
| parent | 51471a4d8b177adfcd40c145a809193a4ab9bd8d (diff) | |
| download | courier-libs-d2915c9cadf6fbc5ae29ffc387cce987b88dbbe0.tar.bz2 | |
Add additional bi-directional related algorithm.
Cleanup, remove markers, via unicode_bidi_cleanup() and
unicode_bidi_extra_cleanup().
Re-embed directional markers, via unicode_bidi_logical_order(),
unicode_bidi_embed() and unicode_bidi_embed_paragraph_level().
| -rw-r--r-- | unicode/Makefile.am | 11 | ||||
| -rw-r--r-- | unicode/README | 4 | ||||
| -rw-r--r-- | unicode/biditest.C | 16 | ||||
| -rw-r--r-- | unicode/biditest2.C | 289 | ||||
| -rw-r--r-- | unicode/book.xml | 796 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 203 | ||||
| -rw-r--r-- | unicode/docbook/book.css | 2 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 919 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 197 |
9 files changed, 2108 insertions, 329 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 8ac6fb1..f864e2d 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -90,6 +90,11 @@ include_HEADERS=courier-unicode.h \ man_MANS= \ $(srcdir)/man/courier-unicode.7 \ $(srcdir)/man/unicode\:\:bidi_calc.3 \ + $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ + $(srcdir)/man/unicode\:\:bidi_embed.3 \ + $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ + $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \ + $(srcdir)/man/unicode\:\:bidi_logical_order.3 \ $(srcdir)/man/unicode\:\:bidi_reorder.3 \ $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \ $(srcdir)/man/unicode\:\:iconvert\:\:convert_tocase.3 \ @@ -110,8 +115,14 @@ man_MANS= \ $(srcdir)/man/unicode_bidi.3 \ $(srcdir)/man/unicode_bidi_bracket_type.3 \ $(srcdir)/man/unicode_bidi_calc.3 \ + $(srcdir)/man/unicode_bidi_cleanup.3 \ + $(srcdir)/man/unicode_bidi_embed.3 \ + $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ + $(srcdir)/man/unicode_bidi_extra_cleanup.3 \ + $(srcdir)/man/unicode_bidi_logical_order.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ + $(srcdir)/man/unicode_bidi_type.3 \ $(srcdir)/man/unicode_canonical.3 \ $(srcdir)/man/unicode_category_lookup.3 \ $(srcdir)/man/unicode_convert.3 \ diff --git a/unicode/README b/unicode/README index 926e004..9994cc9 100644 --- a/unicode/README +++ b/unicode/README @@ -16,12 +16,12 @@ Courier Unicode Library COPYING This library implements several algorithms related to the Unicode - Standard: + Standard, notably: * Look up uppercase, lowercase, and titlecase equivalents of a unicode character. - * Implementation of grapheme and work breaking rules. + * Implementation of grapheme and word breaking rules. * Implementation of line breaking rules. 
diff --git a/unicode/biditest.C b/unicode/biditest.C index 2d2a6e5..1aa2c63 100644 --- a/unicode/biditest.C +++ b/unicode/biditest.C @@ -8,6 +8,7 @@ #include <utility> #include <iomanip> #include <numeric> +#include <unistd.h> std::vector<std::string> testcase; @@ -53,11 +54,11 @@ int main(int argc, char **argv) { buf.clear(); - if (std::getline(fp, buf).eof() && buf.empty()) - break; + bool iseof=std::getline(fp, buf).eof() && buf.empty(); - if (++linenum >= nextlogline) + if (iseof || ++linenum >= nextlogline) { + alarm(300); std::cout << logmsg; std::ostringstream o; @@ -72,7 +73,8 @@ int main(int argc, char **argv) nextlogline += 20000; } - + if (iseof) + break; buf.erase(std::find(buf.begin(), buf.end(), '#'), buf.end()); if (buf.substr(0, 8) == "@Levels:") @@ -334,11 +336,7 @@ int main(int argc, char **argv) n >>= 1; } } - - std::cout << logmsg; - - std::fill(logmsg.begin(), logmsg.end(), ' '); - std::cout << logmsg << std::endl; + std::cout << std::endl; return 0; } diff --git a/unicode/biditest2.C b/unicode/biditest2.C index f497bcf..cfa0e50 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -1,42 +1,110 @@ #include "unicode_config.h" #include "courier-unicode.h" #include <iostream> +#include <iterator> #include <sstream> #include <fstream> #include <cstdint> #include <iomanip> +#include <algorithm> +#include <unistd.h> FILE *DEBUGDUMP; -int main(int argc, char **argv) +#define BIDI_DEBUG + +extern "C" { +#if 0 +} +#endif + +#include "unicode_bidi.c" + +} + +void latin_test() { - std::ifstream fp("BidiCharacterTest.txt"); + for (char32_t c=32; c<256; c++) + { + std::u32string s; - if (!fp.is_open()) + s += c; + + std::vector<unicode_bidi_level_t> levels={UNICODE_BIDI_LR}; + + auto new_string=unicode::bidi_embed(s, levels, + UNICODE_BIDI_LR); + + if (new_string != s) + { + std::cerr << "Character " << (int)c + << " does not work." 
<< std::endl; + exit(1); + } + } + + std::u32string s; + std::vector<unicode_bidi_level_t> levels; + + for (char32_t c=32; c<256; c++) { - std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl; + s += c; + levels.push_back(UNICODE_BIDI_LR); + } + + auto new_string=unicode::bidi_embed(s, levels, + UNICODE_BIDI_LR); + + if (new_string != s) + { + std::cerr << "iso-8859-1 string does not work." + << std::endl; exit(1); } +} - DEBUGDUMP=fopen("/dev/null", "w"); - if (!DEBUGDUMP) +void character_test() +{ + std::ifstream fp("BidiCharacterTest.txt"); + + if (!fp.is_open()) { - perror("/dev/null"); + std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl; exit(1); } std::string buf; size_t linenum=0; + size_t nextlogline=0; + std::string logmsg; while (1) { buf.clear(); - if (std::getline(fp, buf).eof() && buf.empty()) - break; - ++linenum; + bool iseof=std::getline(fp, buf).eof() && buf.empty(); + + if (iseof || ++linenum >= nextlogline) + { + alarm(300); + std::cout << logmsg; + + std::ostringstream o; + o << std::setw(6) << linenum << " lines processed... 
"; + + logmsg=o.str(); + + std::cout << logmsg << std::flush; + + std::fill(logmsg.begin(), logmsg.end(), '\b'); + + nextlogline += 20000; + } + + if (iseof) + break; auto p=buf.find('#'); if (p != buf.npos) @@ -187,17 +255,202 @@ int main(int argc, char **argv) std::cerr << std::endl; exit(1); } - } - return 0; -} -#define BIDI_DEBUG + std::vector<size_t> actual_render_order; + + size_t n=0; + + std::generate_n(std::back_inserter(actual_render_order), + s.size(), + [&] { return n++; }); + + unicode::bidi_reorder + (s, levels, + [&] + (size_t index, + size_t n) + { + auto b=actual_render_order.begin(); + std::reverse(b+index, b+index+n); + }); + + n=0; + unicode::bidi_cleanup + (s, levels, + [&] + (size_t i) + { + actual_render_order.erase + (actual_render_order.begin()+i-n); + ++n; + }); + + if (render_order != actual_render_order) + { + std::cerr << "Regression, line " + << linenum + << ": render order" + << std::endl + << " Expected:"; + for (auto n:render_order) + { + std::cerr << " " << n; + } + std::cerr << std::endl + << " Actual:"; -extern "C" { -#if 0 + for (auto n:actual_render_order) + { + std::cerr << " " << n; + } + std::cerr << std::endl; + exit(1); + } + + unicode::bidi_extra_cleanup(s, levels); + + auto dump_ls= + [&] + (const std::u32string &s, + const std::vector<unicode_bidi_level_t> &l) + { + for (size_t i=0; i<s.size(); ++i) + { + std::cerr << " " << std::hex + << std::setw(4) + << std::setfill('0') + << s[i] << "/" + << std::dec + << (int)l[i]; + } + }; + + for (int pass=0; pass<4; pass++) + { + int paragraph=pass & 1; + int use_default=pass & 2; + + for (size_t i=0; i<s.size(); ++i) + { + /* L1 */ + switch (unicode_bidi_type(s[i])) { + case UNICODE_BIDI_TYPE_S: + case UNICODE_BIDI_TYPE_B: + levels.at(i)=paragraph; + } + } + + auto logical_string=s; + auto logical_levels=levels; + + unicode::bidi_logical_order(logical_string, + logical_levels, + paragraph); + + auto new_string=unicode::bidi_embed(logical_string, + logical_levels, + paragraph); 
+ + auto save_string=new_string; + + if (use_default) + { + auto marker=unicode::bidi_embed_paragraph_level + (new_string, paragraph); + + if (marker) + new_string.insert(0, 1, marker); + + ret=unicode::bidi_calc(new_string); + } + else + { + ret=unicode::bidi_calc(new_string, paragraph); + } + + unicode::bidi_reorder(new_string, std::get<0>(ret)); + unicode::bidi_extra_cleanup(new_string, + std::get<0>(ret)); + + /* New string is now back in logical order */ + + if (new_string == s && std::get<0>(ret) == levels) + continue; + + fclose(DEBUGDUMP); + DEBUGDUMP=stderr; + + std::cerr << "Regression, line " + << linenum + << ": embedding markers" + << std::endl + << " Paragraph embedding level: " + << paragraph; + + if (use_default) + std::cerr << " (defaulted)"; + + std::cerr << std::endl + << "String (1):"; + + dump_ls(s, levels); + + std::cerr << std::endl << "String (2):"; + + dump_ls(new_string, std::get<0>(ret)); + std::cerr << std::endl; + + std::cerr << "Embedding:"; + dump_ls(logical_string, logical_levels); + std::cerr << std::endl; + + unicode::bidi_embed(logical_string, + logical_levels, + paragraph); + + std::cerr << std::endl + << "Embedded string:"; + + for (auto c:save_string) + { + std::cerr << " "; + + switch (c) { + case LRM: std::cerr << "LRM"; break; + case RLM: std::cerr << "RLM"; break; + case RLI: std::cerr << "RLI"; break; + case LRI: std::cerr << "LRI"; break; + case RLO: std::cerr << "RLO"; break; + case LRO: std::cerr << "LRO"; break; + case PDF: std::cerr << "PDF"; break; + case PDI: std::cerr << "PDI"; break; + default: + std::cerr << std::hex << std::setw(4) + << std::setfill('0') + << c; + break; + } + } + std::cerr << std::dec << std::endl << std::flush; + + ret=unicode::bidi_calc(save_string, paragraph); + unicode::bidi_reorder(save_string, std::get<0>(ret)); + exit(1); + } + } + std::cout << std::endl; } -#endif -#include "unicode_bidi.c" +int main(int argc, char **argv) +{ + DEBUGDUMP=fopen("/dev/null", "w"); + if (!DEBUGDUMP) + { + 
perror("/dev/null"); + exit(1); + } + latin_test(); + character_test(); + return 0; } diff --git a/unicode/book.xml b/unicode/book.xml index ad0009a..c8948ba 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -12,7 +12,7 @@ <!-- -Copyright 2014-2017 Double Precision, Inc. +Copyright 2014-2020 Double Precision, Inc. See COPYING for distribution information. --> @@ -23,7 +23,7 @@ See COPYING for distribution information. <para> This library implements several algorithms related to the <ulink url="https://www.unicode.org/standard/standard.html">Unicode - Standard</ulink>: + Standard</ulink>, notably: </para> <itemizedlist> @@ -36,22 +36,21 @@ See COPYING for distribution information. <listitem> <para> Implementation of - <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme - and work breaking</ulink> rules. + <link linkend="unicode_grapheme_break">grapheme + and word breaking</link> rules. </para> </listitem> <listitem> <para> Implementation of - <ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line - breaking</ulink> rules. + <link linkend="unicode_line_break">line breaking</link> rules. </para> </listitem> <listitem> <para> Implementation of the - <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">bi-directional - algorithm</ulink>. + <link linkend="unicode_bidi">bi-directional + algorithm</link>. </para> </listitem> <listitem> @@ -69,15 +68,13 @@ See COPYING for distribution information. </listitem> <listitem> <para> - Look up the - <ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode - script property</ulink>. + Look up the <link linkend="unicode_script">Unicode + script property</link>. </para> </listitem> <listitem> <para> - Look up the - <ulink url="https://unicode.org/notes/tn36/">category</ulink> + Look up the <link linkend="unicode_category_lookup">category</link> property. </para> </listitem> @@ -192,7 +189,7 @@ See COPYING for distribution information. 
<programlisting> #include <courier-unicode.h></programlisting> </refsynopsisdiv> - <refsect1> + <refsect1 id="courier_unicode_descr"> <title>DESCRIPTION</title> <para> @@ -226,7 +223,7 @@ See COPYING for distribution information. with this library. </para> </refsect1> - <refsect1> + <refsect1 id="courier_unicode_seealso"> <title>SEE ALSO</title> <para> @@ -306,16 +303,22 @@ See COPYING for distribution information. <refname>unicode_bidi</refname> <refname>unicode_bidi_calc</refname> <refname>unicode_bidi_reorder</refname> + <refname>unicode_bidi_cleanup</refname> + <refname>unicode_bidi_extra_cleanup</refname> + <refname>unicode_bidi_logical_order</refname> + <refname>unicode_bidi_embed</refname> + <refname>unicode_bidi_embed_paragraph_level</refname> + + <refname>unicode_bidi_type</refname> <refname>unicode_bidi_mirror</refname> <refname>unicode_bidi_bracket_type</refname> - <refpurpose>unicode bidirectional algorithm</refpurpose> + <refpurpose>unicode bi-directional algorithm</refpurpose> </refnamediv> <refsynopsisdiv> <funcsynopsis> - <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> - <funcsynopsisinfo>unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo> + <funcsynopsisinfo>#include <courier-unicode.h> unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo> <funcprototype> <funcdef>void <function>unicode_bidi_calc</function></funcdef> <paramdef>const char32_t *<parameter>p</parameter></paramdef> @@ -334,6 +337,51 @@ See COPYING for distribution information. 
</funcprototype> <funcprototype> + <funcdef>size_t <function>unicode_bidi_cleanup</function></funcdef> + <paramdef>char32_t *<parameter>string</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> + <paramdef>void *<parameter>arg</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>size_t <function>unicode_bidi_extra_cleanup</function></funcdef> + <paramdef>char32_t *<parameter>string</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> + <paramdef>void *<parameter>arg</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>size_t <function>unicode_bidi_logical_order</function></funcdef> + <paramdef>char32_t *<parameter>string</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + <paramdef>void (*<parameter>reorder_callback</parameter>)(size_t index, size_t n, void *arg)</paramdef> + <paramdef>void *<parameter>arg</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>size_t <function>unicode_bidi_embed</function></funcdef> + <paramdef>const char32_t *<parameter>string</parameter></paramdef> + <paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + <paramdef>void (*<parameter>emit</parameter>)(const char32_t *string, size_t n, void *arg)</paramdef> + <paramdef>void 
*<parameter>arg</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>char32_t <function>unicode_bidi_embed_paragraph_level</function></funcdef> + <paramdef>const char32_t *<parameter>string</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + </funcprototype> + + <funcprototype> <funcdef>char32_t <function>bidi_mirror</function></funcdef> <paramdef>char32_t <parameter>c</parameter></paramdef> </funcprototype> @@ -350,63 +398,160 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_bidi_descr"> <title>DESCRIPTION</title> <para> - <function>unicode_bidi_calc</function>() and - <function>unicode_bidi_reorder</function>() implement - the + These functions are related to the <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-Directional algorithm</ulink>. - </para> - <para> - The first two parameters to - <function>unicode_bidi_calc</function>() are a unicode string - and the number of characters in the Unicode string. - <parameter>levels</parameter> points to a buffer of - <classname>unicode_bidi_level_t</classname> values. - The caller is responsible for allocating and deallocating this - buffer, of - size <parameter>n</parameter>, - the same number of values as the number of characters in the - Unicode string. - </para> - <para> - <function>unicode_bidi_calc</function>() calculates the - embedding level of each character and fills in the - <parameter>levels</parameter> buffer (executes all steps of the - bidirectional algorithm up to, and including, step L1). - A <literal>NULL</literal> <parameter>initial_embedding</parameter> - value calculates the default paragraph embedding value. 
- A pointer to a <literal>UNICODE_BIDI_LR</literal> or - <literal>UNICODE_BIDI_RL</literal> value explicitly sets a - left-to-right or right-to-left paragraph embedding value. + They implement the algorithm up to and including step L2, + and provide additional functionality of returning miscellaneous + bi-directional-related metadata of Unicode characters. There's + also a basic algorithm that <quote>reverses</quote> the + bi-directional algorithm + and produces a Unicode string with bi-directional markers that + results in the same bi-directional string after reapplying the + algorithm. </para> - <para> - <function>unicode_bidi_calc</function>() calculates each - character's directional embedding value: an even value for - left-to-right text or an odd value for right-to-left text. - Unicode characters with an unspecified directional embedding - value are specified by the - <classname>UNICODE_BIDI_SKIP</classname> embedding level value. - This indicates embedding and override markers, which can be - removed from the string (together with this embedding value) - from the string and the embedding value itself). This can be - done before or after <function>unicode_bidi_reorder</function>(). - </para> + <refsect2 id="unicode_bidi_calc_reorder"> + <title>Calculating bi-directional rendering order</title> - <refsect2> - <title>Reordering text</title> + <para> + The following process computes the rendering order of + characters according to the Unicode Bi-Directional algorithm: + </para> + + <orderedlist> + <listitem> + <para> + Allocate an array of + <structname>unicode_bidi_level_t</structname> that's the + same size as the Unicode string. + </para> + </listitem> + <listitem> + <para> + Use <function>unicode_bidi_calc</function>() to compute + the Unicode string's characters' bi-directional embedding + level (executes the Bi-Directional algorithm up to and + including step L1). This populates the + <structname>unicode_bidi_level_t</structname> buffer. 
+ </para> + </listitem> + <listitem> + <para> + Use <function>unicode_bidi_reorder</function>() to reverse + any characters in the string, according to the + algorithm (step L2), with an optional + callback that reports which ranges of characters get + reversed. + </para> + </listitem> + <listitem> + <para> + Use <function>unicode_bidi_cleanup</function>() or + <function>unicode_bidi_extra_cleanup</function>(), + to remove the characters from the string which are used + by the bi-directional algorithm, and are not needed for + rendering the text. + </para> + </listitem> + </orderedlist> + + <para> + The parameters to + <function>unicode_bidi_calc</function>() are: + </para> + + <itemizedlist> + <listitem> + <para> + A pointer to the Unicode string. + </para> + </listitem> + <listitem> + <para> + Number of characters in the Unicode string. + </para> + </listitem> + <listitem> + <para> + A pointer to an array of + <structname>unicode_bidi_level_t</structname> values. + The caller is + responsible for allocating and deallocating this array, + which has the same size as the Unicode string. + </para> + </listitem> + <listitem> + <para> + An optional pointer to a + <literal>UNICODE_BIDI_LR</literal> or + <literal>UNICODE_BIDI_RL</literal> value. This sets + the default paragraph direction level. + A null pointer computes the default paragraph direction + level based on the string, as specified by the "P" rules + of the bi-directional algorithm. + </para> + </listitem> + </itemizedlist> + + <para> + <function>unicode_bidi_calc</function>() fills in the + <structname>unicode_bidi_level_t</structname> array with the + values corresponding to the embedding level of the + corresponding character, + according to the Unicode Bi-Directional Algorithm (even values for + left-to-right ordering, and odd values for right-to-left + ordering). + A value of UNICODE_BIDI_SKIP designates directional markers + (from step X9). 
+ </para> <para> - <function>unicode_bidi_reorder</function> takes the actual + <function>unicode_bidi_calc</function>() returns the resolved + paragraph direction level, which + always matches the passed in level, if specified, else it + reports the + derived one. + </para> + + <para> + <function>unicode_bidi_reorder</function>() takes the actual unicode string together with the embedding values from <function>unicode_bidi_calc</function>, then reverses the - bidirectional string, as specified by step L2 of the bidirectional + bi-directional string, as specified by step L2 of the bi-directional algorithm. + The parameters to + <function>unicode_bidi_reorder</function>() are: </para> + <itemizedlist> + <listitem> + <para> + A pointer to the Unicode string. + </para> + </listitem> + <listitem> + <para> + A pointer to an array of + <structname>unicode_bidi_level_t</structname> values. + </para> + </listitem> + <listitem> + <para> + Number of characters in the Unicode string and the + <structname>unicode_bidi_level_t</structname> array. + </para> + </listitem> + <listitem> + <para> + An optional <varname>reorder_callback</varname> function + pointer. + </para> + </listitem> + </itemizedlist> <para> A non-<literal>NULL</literal> <parameter>reorder_callback</parameter> gets invoked to report @@ -434,13 +579,280 @@ See COPYING for distribution information. invokes the <parameter>reorder_callback</parameter> as if the character string, and their embedding values, were reversed. </para> + + <para> + The resulting string and embedding levels are in + <quote>rendering order</quote>, but still contain bi-directional + embedding, override, boundary-neutral, isolate, and marker + characters. + <function>unicode_bidi_cleanup</function>() and + <function>unicode_bidi_extra_cleanup</function>() remove these + characters and directional markers from the unicode string. 
+ <function>unicode_bidi_cleanup</function> removes only the + embedding, override, and boundary-neutral characters (as + specified by step X9 of the bi-directional algorithm). + <function>unicode_bidi_extra_cleanup</function>() + additionally removes the isolation markers, implicit markers; + and all characters + classified as paragraph separators get replaced by a newline. + </para> + <para> + A non-null pointer to the directional embedding level buffer, + of the same size as the string, also removes the corresponding + values from the buffer, and the remaining values in the + embedding level buffer get reset to + levels <literal>UNICODE_BIDI_LR</literal> and + <literal> UNICODE_BIDI_RL</literal>, only. + </para> + <para> + The parameters to <function>unicode_bidi_cleanup</function>() and + <function>unicode_bidi_extra_cleanup</function>() are: + </para> + + <itemizedlist> + <listitem> + <para> + The pointer to the unicode string. + </para> + </listitem> + <listitem> + <para> + The pointer to the directional embedding buffer. + </para> + </listitem> + <listitem> + <para> + The size of the unicode string and the directional embedding + buffer. + </para> + </listitem> + <listitem> + <para> + A pointer to a function that gets repeatedly invoked with the + index of the character that gets removed from the Unicode + string. + </para> + </listitem> + <listitem> + <para> + An opaque pointer that gets forwarded to the callback. + </para> + </listitem> + </itemizedlist> + <para> + The function pointer (if not <literal>NULL</literal>) + gets invoked to report the index of each + removed character. The reported index is the index from the + original string, and the callback gets invoked in strict order, + from the first to + the last removed character (if any). 
+ </para> + <para> + Multiple calls to <function>unicode_bidi_cleanup</function>() or + <function>unicode_bidi_extra_cleanup</function>() do no harm; + except that <function>unicode_bidi_extra_cleanup</function>() + always removes all the additional characters that + <function>unicode_bidi_cleanup</function>() does not remove. + </para> + <para> + The character string and the embedding level values resulting + from <function>unicode_bidi_extra_cleanup</function>() are in + <quote>canonical rendering order</quote>. + </para> </refsect2> - <refsect2> + + <refsect2 id="unicode_bidi_embed"> + <title>Embedding bi-directional markers in Unicode text strings</title> + <para> + <function>unicode_bidi_logical_order</function>() and + <function>unicode_bidi_embed</function>() add various + bi-directional markers to a Unicode string in canonical rendering + order. The resulting string is not guaranteed to be + identical to the + original Unicode bi-directional string. The algorithm is fairly + basic, + but the resulting bi-directional string produces the same + canonical rendering order after applying + <function>unicode_bidi_calc()</function>, + <function>unicode_bidi_reorder()</function> and + <function>unicode_bidi_extra_cleanup()</function>, + with the same paragraph_embedding level. + </para> + + <para> + <function>unicode_bidi_logical_order</function>() gets called + first, followed by + <function>unicode_bidi_embed</function>(). + Finally, <function>unicode_bidi_embed_paragraph_level</function>() + optionally determines whether the resulting string's default + paragraph embedding level matches the one used for the actual + embedding direction, and if not returns a directional marker + to be prepended to the Unicode character string, as a hint. 
+ </para> + <para> + <function>unicode_bidi_logical_order</function>() factors in the + characters' embedding values, and the provided paragraph + embedding value + (<literal>UNICODE_BIDI_LR</literal> or + <literal>UNICODE_BIDI_RL</literal>), and rearranges the characters + and the embedding levels in left-to-right order, while + simultaneously + invoking the supplied reorder_callback indicating each range of + characters whose relative order gets reversed. The + <function>reorder_callback</function>() receives, as + parameters: + </para> + <itemizedlist> + <listitem> + <para> + The starting index of the first reversed character, in the + string. + </para> + </listitem> + <listitem> + <para> + Number of reversed characters. + </para> + </listitem> + <listitem> + <para> + Forwarded <parameter>arg</parameter> pointer value. + </para> + </listitem> + </itemizedlist> + <para> + This specifies a consecutive range of characters (and + directional embedding values) + that get reversed (first character in the range becomes the + last character, + and the last character becomes the first character). + </para> + + <para> + After + <function>unicode_bidi_logical_order</function>(), + <function>unicode_bidi_embed</function>() progressively invokes + the passed-in callback with + the contents of a bi-directional unicode string. + The parameters to <function>unicode_bidi_embed</function>() are: + </para> + <itemizedlist> + <listitem> + <para> + The Unicode string, and … + </para> + </listitem> + <listitem> + <para> + … the directional embedding buffer, in canonical + rendering order. + </para> + </listitem> + <listitem> + <para> + The size of the string and the embedding level buffer. + </para> + </listitem> + <listitem> + <para> + The paragraph embedding level, either + <literal>UNICODE_BIDI_LR</literal> or + <literal>UNICODE_BIDI_RL</literal>. + </para> + </listitem> + <listitem> + <para> + The pointer to the callback function. 
+ </para> + </listitem> + <listitem> + <para> + An opaque pointer argument that gets forwarded to the + callback function. + </para> + </listitem> + </itemizedlist> + <para> + The callback receives pointers to + various parts of the original string that gets passed to + <function>unicode_bidi_embed</function>(), intermixed with + bi-directional markers, + overrides, and isolates. The callback's parameters are: + </para> + + <itemizedlist> + <listitem> + <para> + The pointer to a Unicode string. + </para> + <note> + <para> + It is not a given that the callback receives + progressively increasing pointers into the original + string that gets passed to + <function>unicode_bidi_embed</function>(). + Some calls will be for individual bi-directional + markers, and + <function>unicode_bidi_embed</function>() also + performs some additional internal reordering, on the fly, + after <function>unicode_bidi_logical_order</function>()'s + big hammer. + </para> + </note> + </listitem> + <listitem> + <para> + Number of characters in the Unicode string. + </para> + </listitem> + <listitem> + <para> + Forwarded <parameter>arg</parameter> pointer value. + </para> + </listitem> + </itemizedlist> + + <para> + The assembled unicode string should produce the same + canonical rendering order, for the same paragraph embedding + level. + <function>unicode_bidi_embed_paragraph_level</function>() + checks if the specified Unicode string computes the given + default paragraph embedding level and returns 0 if it matches. + Otherwise it returns a directional marker that should be + <emphasis>prepended</emphasis> to the Unicode string to allow + <function>unicode_bidi_calc</function>'s optional paragraph + embedding level pointer's value to be <literal>NULL</literal>, + but derive the same default embedding level. + The parameters to + <function>unicode_bidi_embed_paragraph_level</function>() are: + </para> + <itemizedlist> + <listitem> + <para> + The Unicode string. 
+ </para> + </listitem> + <listitem> + <para> + The size of the string. + </para> + </listitem> + <listitem> + <para> + The paragraph embedding level, either + <literal>UNICODE_BIDI_LR</literal> or + <literal>UNICODE_BIDI_RL</literal>. + </para> + </listitem> + </itemizedlist> + </refsect2> + <refsect2 id="unicode_bidi_misc"> <title>Miscellaneous utility functions</title> <para> <function>unicode_bidi_type</function> - looks up each character's bidirectional character type. + looks up each character's bi-directional character type. </para> <para> <function>unicode_bidi_mirror</function> @@ -464,7 +876,7 @@ See COPYING for distribution information. </para> </refsect2> </refsect1> - <refsect1> + <refsect1 id="courier_unicode_bidi_seealso"> <title>SEE ALSO</title> <para> <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">TR-9</ulink>, @@ -502,7 +914,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_canonical_descr"> <title>DESCRIPTION</title> <para> @@ -552,7 +964,7 @@ See COPYING for distribution information. equivalence. </para> </refsect1> - <refsect1> + <refsect1 id="unicode_canonical_seealso"> <title>SEE ALSO</title> <para> <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>, @@ -641,7 +1053,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_category_descr"> <title>DESCRIPTION</title> <para> @@ -783,7 +1195,7 @@ See COPYING for distribution information. </varlistentry> </variablelist> </refsect1> - <refsect1> + <refsect1 id="unicode_category_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -950,7 +1362,7 @@ See COPYING for distribution information. </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_convert_descr"> <title>DESCRIPTION</title> <para> @@ -1040,7 +1452,7 @@ See COPYING for distribution information. 
</para> - <refsect2> + <refsect2 id="unicode_convert_collect"> <title>Collecting converted text into a buffer</title> <para> @@ -1097,7 +1509,7 @@ See COPYING for distribution information. </para> </refsect2> - <refsect2> + <refsect2 id="unicode_convert_chset_unicode"> <title>Converting between character sets and unicode</title> <para> @@ -1126,7 +1538,7 @@ See COPYING for distribution information. </para> </refsect2> - <refsect2> + <refsect2 id="unicode_convert_oneshot"> <title>One-shot conversions</title> <para> @@ -1175,7 +1587,7 @@ See COPYING for distribution information. </para> </refsect2> </refsect1> - <refsect1> + <refsect1 id="unicode_convert_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -1220,7 +1632,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_default_chset_descr"> <title>DESCRIPTION</title> <para> <function>unicode_default_chset</function>() returns the name of the @@ -1231,7 +1643,7 @@ See COPYING for distribution information. current application locale's character set. </para> </refsect1> - <refsect1> + <refsect1 id="unicode_default_chset_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -1316,7 +1728,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_emoji_descr"> <title>DESCRIPTION</title> <para> <function>unicode_emoji_lookup</function>() returns the @@ -1334,7 +1746,7 @@ See COPYING for distribution information. character has the corresponding property. </para> </refsect1> - <refsect1> + <refsect1 id="unicode_emoji_seealso"> <title>SEE ALSO</title> <para> <ulink url="https://www.unicode.org/reports/tr51/tr51-&tr51ver;.html">TR-51</ulink>, @@ -1368,7 +1780,7 @@ See COPYING for distribution information. 
</funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_html40_descr"> <title>DESCRIPTION</title> <para> <function>unicode_html40ent_lookup</function>() returns the @@ -1392,7 +1804,7 @@ See COPYING for distribution information. a single unicode character. </para> </refsect1> - <refsect1> + <refsect1 id="unicode_html40_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -1448,7 +1860,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_grapheme_descr"> <title>DESCRIPTION</title> <para> @@ -1489,7 +1901,7 @@ See COPYING for distribution information. </para> </refsect1> - <refsect1> + <refsect1 id="unicode_grapheme_seealso"> <title>SEE ALSO</title> <para> @@ -1600,7 +2012,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_lb_descr"> <title>DESCRIPTION</title> <para> These functions implement the unicode line breaking algorithm. @@ -1730,7 +2142,7 @@ See COPYING for distribution information. line breaking handle is no longer valid. </para> - <refsect2> + <refsect2 id="unicode_lb_altcallback"> <title>Alternative callback function</title> <para> @@ -1745,7 +2157,7 @@ See COPYING for distribution information. </para> </refsect2> - <refsect2> + <refsect2 id="unicode_lb_altcallback_opt"> <title>Options</title> <para> @@ -1822,7 +2234,7 @@ See COPYING for distribution information. </refsect2> </refsect1> - <refsect1> + <refsect1 id="unicode_lb_seealso"> <title>SEE ALSO</title> <para> @@ -1859,7 +2271,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_script_descr"> <title>DESCRIPTION</title> <para> <function>unicode_script</function>() looks up the @@ -1871,7 +2283,7 @@ See COPYING for distribution information. 
</para> </refsect1> - <refsect1> + <refsect1 id="unicode_script_seealso"> <title>SEE ALSO</title> <para> @@ -1949,7 +2361,7 @@ See COPYING for distribution information. </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_wb_descr"> <title>DESCRIPTION</title> <para> These functions implement the unicode word breaking algorithm. @@ -2046,7 +2458,7 @@ See COPYING for distribution information. line breaking handle is no longer valid. </para> - <refsect2> + <refsect2 id="unicode_wb_scan"> <title>Word scan</title> <para> @@ -2075,7 +2487,7 @@ See COPYING for distribution information. </refsect2> </refsect1> - <refsect1> + <refsect1 id="unicode_wb_seealso"> <title>SEE ALSO</title> <para> <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>, @@ -2144,7 +2556,7 @@ See COPYING for distribution information. </funcprototype> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_uc_descr"> <title>DESCRIPTION</title> <para> <function>unicode_uc</function>(), @@ -2174,7 +2586,7 @@ See COPYING for distribution information. </para> </refsect1> - <refsect1> + <refsect1 id="unicode_uc_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -2223,94 +2635,162 @@ See COPYING for distribution information. 
<refnamediv> <refname>unicode::bidi_calc</refname> <refname>unicode::bidi_reorder</refname> - <refpurpose>unicode bidirectional algorithm</refpurpose> + <refname>unicode::bidi_cleanup</refname> + <refname>unicode::bidi_extra_cleanup</refname> + <refname>unicode::bidi_logical_order</refname> + <refname>unicode::bidi_embed</refname> + <refname>unicode::bidi_embed_paragraph_level</refname> + <refpurpose>unicode bi-directional algorithm</refpurpose> </refnamediv> <refsynopsisdiv> <funcsynopsis> <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> <funcprototype> - <funcdef>std::vector<unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> + <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> <paramdef>const std::u32string &<parameter>string</parameter></paramdef> </funcprototype> - </funcsynopsis> - <funcsynopsis> <funcprototype> - <funcdef>std::vector<unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> + <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> <paramdef>const std::u32string &<parameter>string</parameter></paramdef> <paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef> </funcprototype> - </funcsynopsis> - <funcsynopsis> <funcprototype> <funcdef>int <function>unicode::bidi_reorder</function></funcdef> <paramdef>std::u32string &<parameter>string</parameter></paramdef> <paramdef>std::vector<unicode_bidi_level_t> &<parameter>embedding_level</parameter></paramdef> - <paramdef>const std::function<void (size_t, size_t)> &<parameter>reorder_callback</parameter></paramdef> + <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>reorder_callback</parameter></paramdef> </funcprototype> - </funcsynopsis> - <funcsynopsis> <funcprototype> - <funcdef>int <function>unicode::bidi_reorder</function></funcdef> + <funcdef>void 
<function>unicode::bidi_reorder</function></funcdef> <paramdef>std::vector<unicode_bidi_level_t> &<parameter>embedding_level</parameter></paramdef> - <paramdef>const std::function<void (size_t, size_t)> &<parameter>reorder_callback</parameter></paramdef> + <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>reorder_callback</parameter></paramdef> </funcprototype> - </funcsynopsis> + + <funcprototype> + <funcdef>void <function>unicode::bidi_cleanup</function></funcdef> + <paramdef>std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode::bidi_cleanup</function></funcdef> + <paramdef>std::u32string &<parameter>string</parameter></paramdef> + <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>void <function>unicode::bidi_extra_cleanup</function></funcdef> + <paramdef>std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode::bidi_extra_cleanup</function></funcdef> + <paramdef>std::u32string &<parameter>string</parameter></paramdef> + <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode::bidi_logical_order</function></funcdef> + <paramdef>std::u32string &<parameter>string</parameter></paramdef> + <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + 
<paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>reorder_callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>void <function>unicode::bidi_logical_order</function></funcdef> + <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>reorder_callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>int <function>unicode::bidi_embed</function></funcdef> + <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + <paramdef>const std::function<void (const char32_t *, size_t) noexcept> &<parameter>callback</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>std::u32string <function>unicode::bidi_embed</function></funcdef> + <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>char32_t <function>unicode::bidi_embed_paragraph_level</function></funcdef> + <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> + </funcprototype> + </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_bidi_descr"> <title>DESCRIPTION</title> <para> These functions implement the C++ interface for the - <ulink
url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-directional algorithm</ulink>. + <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-Directional algorithm</ulink>. See the description of the underlying <link linkend="unicode_bidi"> <citerefentry><refentrytitle>unicode_bidi</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link> C library - API for more information. + API for more information. C++ specific notes: </para> - <para> - <function>unicode::bidi_calc</function> computes and return a vector - of bidirection embedding level values for the given Unicode string. - An overload takes an additional parameter that override the - paragraph embedding level, a <literal>UNICODE_BIDI_LR</literal> or - an <literal>UNICODE_BIDI_RL</literal> value. - </para> - <para> - <function>unicode::bidi_reorder</function> reverses the characters - in the Unicode script, according to their embedding levels (and - reverses the corresponding embedding level values too). - As is with the C API, an optional parameter is a callable object - that gets invoked to report each range of characters that gets - reversed (specified as the starting position and a number of - characters). - </para> - <para> - An overloaded <function>unicode::bidi_reorder</function> without - the string parameter goes through the motions, according to the - embedded level vector parameter, but without actually reversing - the values in the vector, but still invoking the callable object - normally. - </para> - <para> - This is comparable to the C API. Also comparable with the C API: - the convention that even embedding levels specify left to right - text and odd embedding values specify right to left text. - An embedding value of <literal>UNICODE_BIDI_SKIP</literal> - indicates an embedding or an override marker that has no - specified embeded value. 
These markers may be removed from the - Unicode string (together with the - <literal>UNICODE_BIDI_SKIP</literal> - values from the embedding values vector) either before or after - they get reordered. - </para> + <itemizedlist> + <listitem> + <para> + <function>unicode::bidi_calc</function> returns the + directional embedding value buffer and the paragraph + embedding level. + </para> + </listitem> + <listitem> + <para> + Several C functions provide a <quote>dry-run</quote> mode + by passing a <literal>NULL</literal> pointer. The C++ API + provides separate overloads, with and without the nullable + parameter. + </para> + </listitem> + <listitem> + <para> + Several C functions accept a nullable function pointer, with + the <literal>NULL</literal> function pointer specifying no + callback. The C++ functions have a + <classname>std::function</classname> parameter with a + default do-nothing closure. + </para> + </listitem> + + <listitem> + <para> + Several C functions accept two parameters, a Unicode character + pointer and the embedding level buffer, and a single parameter + that specifies the size of both. + The equivalent C++ function takes two discrete parameters, + a <classname>std::u32string</classname> and a + <classname>std::vector</classname> and returns an + <classname>int</classname>; a negative value if their sizes + differ, and 0 if their sizes match, and the requested function + completes. The <function>unicode::bidi_embed</function> overload + that returns a <classname>std::u32string</classname> returns + an empty string in case of a mismatch. 
+ </para> + </listitem> + </itemizedlist> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_bidi_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -2389,7 +2869,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_convert_descr"> <title>DESCRIPTION</title> <para> @@ -2447,7 +2927,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_convert_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -2505,7 +2985,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_convert_tocase_descr"> <title>DESCRIPTION</title> <para> @@ -2537,7 +3017,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_convert_tocase_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -2602,7 +3082,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_convert_fromu_descr"> <title>DESCRIPTION</title> <para> @@ -2634,7 +3114,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_convert_fromu_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -2698,7 +3178,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_convert_tou_descr"> <title>DESCRIPTION</title> <para> @@ -2733,7 +3213,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo> </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_convert_tou_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -2846,7 +3326,7 @@ std::vector<std::pair<int, char32_t>> 
linebreaks; std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int>>(linebreaks));</programlisting> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_lb_descr"> <title>DESCRIPTION</title> <para> @@ -2941,7 +3421,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int> </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_lb_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -3012,7 +3492,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int> </funcsynopsis> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_tolower_descr"> <title>DESCRIPTION</title> <para> @@ -3040,7 +3520,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int> </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_tolower_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> @@ -3104,7 +3584,7 @@ size_t nchars=scan.finish(); </programlisting> </refsynopsisdiv> - <refsect1> + <refsect1 id="unicode_cpp_wb_descr"> <title>DESCRIPTION</title> <para> @@ -3168,7 +3648,7 @@ size_t nchars=scan.finish(); </para> </refsect1> - <refsect1> + <refsect1 id="unicode_cpp_wb_seealso"> <title>SEE ALSO</title> <para> <link linkend="courier-unicode"> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index c8161ea..f6b4b8c 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -536,65 +536,6 @@ int unicode_wbscan_next(unicode_wbscan_info_t i, char32_t ch); size_t unicode_wbscan_end(unicode_wbscan_info_t i); -/* -** Unicode Bidirectional bracket and mirroring lookup -** -** http://www.unicode.org/reports/tr9/tr9-42.html -** -** unicode_bidi_mirror() returns the Bidi_Mirroring_Glyph property. -** -** If there is no mirroring glyph for the given character, returns the -** same character. -** -** unicode_bidi_bracket_type() looks up the Bidi_Paired_Bracket and -** Bidi_Paired_Bracket_Type properties. 
-** -** unicode_bidi_bracket_type() returns the Bidi_Paired_Bracket property -** value. If the ret parameter is not a null pointer, the pointed-to -** value is set to Bidi_Paired_Bracket_Type value, one of the UNICODE_BIDI -** values. -** -** unicode_bidi_bracket_type() returns the same character and -** UNICODE_BIDI_n if the given character does not have these properties. -** -** unicode_bidi_type() looks up the bidirectional character type of the -** given Unicode character. -** -** unicode_bidi_calc() implements the Unicode Bidirectional Algorithm up to -** step L1. -** -** Parameters: -** -** - A pointer to char32_t, the Unicode string. -** -** - Number of characters in the char32_t string -** -** - A pointer to an array of unicode_bidi_level_t values. The caller is -** responsible for allocating and deallocating this array, which has the -** same size as the Unicode string (the second parameter). -** -** - An optional pointer to a unicode_bidi_level_t value, or a null pointer. -** A pointer to UNICODE_BIDI_LR or UNICODE_BIDI_RL sets the default paragraph -** direction level. A null pointer calculates the default paragraph direction -** level based on the string, as specified by the "P" rules in the algorithm. -** -** unicode_bidi_calc() fills in the unicode_bidi_level_t array with the -** values corresponding to the embedding level of the corresponding character, -** as specified in the Unicode Bidirection Algorithm (even for left-to-right, -** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates -** directional markers (from step X9). These characters should be removed -** before using unicode_bidi_reorder(). -** -** unicode_bidi_calc() returns the resolved paragraph direction level, which -** always matches the passed in level, if specified, else it reports the -** derived one. -** -** unicode_bidi_reorder() reorders the characters according to the resolved -** embedding levels. 
A non-null reorder_callback gets invoked repeatedly, -** indicating the starting index and the number of characters reversed, so -** that any related metadata can be updated accordingly. -*/ - typedef char unicode_bidi_bracket_type_t; #define UNICODE_BIDI_n 'n' @@ -654,6 +595,40 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); +extern size_t unicode_bidi_cleanup(char32_t *string, + unicode_bidi_level_t *levels, + size_t n, + void (*removed_callback)(size_t, void *), + void *); + +extern size_t unicode_bidi_extra_cleanup(char32_t *string, + unicode_bidi_level_t *levels, + size_t n, + void (*removed_callback)(size_t, + void *), + void *); + +extern void unicode_bidi_logical_order(char32_t *string, + unicode_bidi_level_t *levels, + size_t n, + unicode_bidi_level_t paragraph_embedding, + void (*reorder_callback)(size_t, size_t, + void *), + void *arg); + +extern void unicode_bidi_embed(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + unicode_bidi_level_t paragraph_embedding, + void (*emit)(const char32_t *string, + size_t n, + void *arg), + void *arg); + +extern char32_t unicode_bidi_embed_paragraph_level(const char32_t *str, + size_t n, + unicode_bidi_level_t); + /* ** unicode_canonical() returns the canonical mapping of the given Unicode ** character. The returned structure specifies: @@ -2117,24 +2092,124 @@ std::u32string tolower(const std::u32string &u); std::u32string toupper(const std::u32string &u); //! Calculate bidirectional embedding levels + +//! Returns the bidirectional embedding levels, and the paragraph +//! embedding level. + std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> bidi_calc(const std::u32string &s); //! Calculate bidirectional embedding levels + +//! Overload calculates the embedding levels using a predetermined +//! paragraph embedding level. +//! +//! Returns the bidirectional embedding levels, and the same paragraph +//! embedding level. 
+ std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> bidi_calc(const std::u32string &s, unicode_bidi_level_t level); //! Reorder bidirectional text + +//! Reorders the string and levels in place. +//! +//! Non-0 return value indicates the string and levels' sizes do not match. + int bidi_reorder(std::u32string &string, std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t, size_t)> &reorder_callback= - [](size_t, size_t){}); + const std::function<void (size_t, size_t) noexcept> + &reorder_callback=[](size_t, size_t) noexcept{}); -//! Reorder bidirectional text +//! Dry-run reorder bidirectional text void bidi_reorder(std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t, size_t)> &reorder_callback= - [](size_t, size_t){}); + const std::function<void (size_t, size_t) noexcept> + &reorder_callback=[](size_t, size_t) noexcept{}); + +//! Remove directional markers + +//! Removes them from the string, in place. Optional lambda gets notified +//! of the index (in the original string) of each removed marker. + +void bidi_cleanup(std::u32string &string, + const std::function<void (size_t) noexcept> &removed_callback= + [](size_t) noexcept {}); + +//! Also remove them from the embedding direction level buffer. + +//! Returns non-0 in case of non-matching level buffer size. + +int bidi_cleanup(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t) noexcept> &removed_callback= + [](size_t) noexcept {}); + + +//! Remove directional markers and isolation markers. + +//! Removes them from the string, in place. Optional lambda gets notified +//! of the index (in the original string) of each removed marker. + +void bidi_extra_cleanup(std::u32string &string, + const std::function<void (size_t) noexcept> + &removed_callback= + [](size_t) noexcept {}); + +//! Also remove them from the embedding direction level buffer. + +//!
Returns non-0 in case of non-matching level buffer size. + +int bidi_extra_cleanup(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t) noexcept> + &removed_callback= + [](size_t) noexcept {}); + +//! Convert Unicode string from canonical rendering order to logical order. +int bidi_logical_order(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding, + const std::function<void (size_t, size_t) noexcept> + &lambda=[](size_t,size_t){}); + +//! Convert Unicode string from canonical rendering order to logical order. +void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding, + const std::function<void (size_t, size_t) noexcept> + &lambda); + +//! Embed directional and isolation markers + +//! Non-0 return value indicates the string and levels' sizes do not match. +//! +//! The lambda gets called repeatedly, to specify the contents of the +//! string with embedded direction markers. + +int bidi_embed(const std::u32string &string, + const std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding, + const std::function<void (const char32_t *string, + size_t n) noexcept> &lambda); + +//! Embed directional and isolation markers + +//! \overload +//! +//! Provides a lambda that collects the new string, and returns it. An +//! empty string gets returned if the string and levels' sizes do not match. + +std::u32string bidi_embed(const std::u32string &string, + const std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding); + +//! Check if a directional marker needs to be inserted + +//! In order for the unicode string to have the specified default +//! paragraph embedding level. 
+ +extern char32_t bidi_embed_paragraph_level(const std::u32string &string, + unicode_bidi_level_t level); #if 0 { diff --git a/unicode/docbook/book.css b/unicode/docbook/book.css index d1420cd..a133e82 100644 --- a/unicode/docbook/book.css +++ b/unicode/docbook/book.css @@ -44,7 +44,7 @@ code.computeroutput div.literallayout { font-weight: bold; } -.command, .acronym, .symbol { +.command, .acronym, .symbol, .structname { font-family: "liberation mono", "courier new", monospace; background-color: #eeeeee; } diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 055ee89..a35e9b5 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -112,6 +112,17 @@ typedef enum { (c) == UNICODE_BIDI_TYPE_LRO || \ (c) == UNICODE_BIDI_TYPE_RLO) +#define is_explicit_indicator_except_b(c) \ + ( is_isolate_initiator(c) || \ + is_embedding_initiator(c) || \ + (c) == UNICODE_BIDI_TYPE_BN || \ + (c) == UNICODE_BIDI_TYPE_PDF || \ + (c) == UNICODE_BIDI_TYPE_PDI) + +#define is_explicit_indicator(c) \ + ( is_explicit_indicator_except_b(c) || \ + (c) == UNICODE_BIDI_TYPE_B) + /* BD13 implementation */ /* A level run, specified as indexes */ @@ -529,6 +540,8 @@ static void directional_status_stack_push (struct directional_status_stack_entry *) malloc(sizeof(struct directional_status_stack_entry)); + if (!p) + abort(); #ifdef BIDI_DEBUG fprintf(DEBUGDUMP, "BIDI: Push level %d, override: %s, isolate: %s\n", (int)embedding_level, @@ -548,16 +561,21 @@ static void directional_status_stack_push } static unicode_bidi_level_t -compute_paragraph_embedding_level(const enum_bidi_type_t *p, - size_t i, size_t j) +compute_paragraph_embedding_level(size_t i, size_t j, + enum_bidi_type_t (*get)(size_t i, + void *arg), + void *arg) + { unicode_bidi_level_t in_isolation=0; for (; i<j; ++i) { - if (is_isolate_initiator(p[i])) + enum_bidi_type_t t=get(i, arg); + + if (is_isolate_initiator(t)) ++in_isolation; - else if (p[i] == UNICODE_BIDI_TYPE_PDI) + else if (t == UNICODE_BIDI_TYPE_PDI) 
{ if (in_isolation) --in_isolation; @@ -565,16 +583,43 @@ compute_paragraph_embedding_level(const enum_bidi_type_t *p, if (in_isolation == 0) { - if (p[i] == UNICODE_BIDI_TYPE_AL || - p[i] == UNICODE_BIDI_TYPE_R) + if (t == UNICODE_BIDI_TYPE_AL || + t == UNICODE_BIDI_TYPE_R) { - return 1; + return UNICODE_BIDI_RL; } - if (p[i] == UNICODE_BIDI_TYPE_L) + if (t == UNICODE_BIDI_TYPE_L) break; } } - return 0; + return UNICODE_BIDI_LR; +} + +struct compute_paragraph_embedding_level_type_info { + const enum_bidi_type_t *p; +}; + +static enum_bidi_type_t +get_enum_bidi_type_for_paragraph_embedding_level(size_t i, + void *arg) +{ + struct compute_paragraph_embedding_level_type_info *p= + (struct compute_paragraph_embedding_level_type_info *)arg; + + return p->p[i]; +} + +static unicode_bidi_level_t +compute_paragraph_embedding_level_from_types(const enum_bidi_type_t *p, + size_t i, size_t j) +{ + struct compute_paragraph_embedding_level_type_info info; + info.p=p; + + return compute_paragraph_embedding_level + (i, j, + get_enum_bidi_type_for_paragraph_embedding_level, + &info); } static directional_status_stack_t @@ -591,7 +636,7 @@ directional_status_stack_init(const char32_t *chars, stack->paragraph_embedding_level= initial_embedding_level ? 
*initial_embedding_level & 1 - : compute_paragraph_embedding_level(classes, 0, n); + : compute_paragraph_embedding_level_from_types(classes, 0, n); stack->chars=chars; stack->classes=classes; @@ -676,6 +721,8 @@ unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, enum_bidi_type_t *buf= (enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); + if (!buf) + abort(); for (size_t i=0; i<n; ++i) { buf[i]=unicode_bidi_type(p[i]); @@ -732,7 +779,7 @@ unicode_bidi_b(const char32_t *p, } \ } while(0) -static void unicode_bidi_w(directional_status_stack_t stack, +static void unicode_bidi_w(enum_bidi_type_t *classes, struct isolating_run_sequence_s *seq); static void unicode_bidi_n(directional_status_stack_t stack, struct isolating_run_sequence_s *seq); @@ -900,7 +947,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) } } - cur_class=compute_paragraph_embedding_level + cur_class=compute_paragraph_embedding_level_from_types (stack->classes, i+1, j) == 1 ? UNICODE_BIDI_TYPE_RLI : UNICODE_BIDI_TYPE_LRI; @@ -955,24 +1002,11 @@ static void unicode_bidi_cl(directional_status_stack_t stack) break; } - switch (stack->orig_classes[i]) { - case UNICODE_BIDI_TYPE_BN: - case UNICODE_BIDI_TYPE_B: - case UNICODE_BIDI_TYPE_RLE: - case UNICODE_BIDI_TYPE_LRE: - case UNICODE_BIDI_TYPE_RLO: - case UNICODE_BIDI_TYPE_LRO: - case UNICODE_BIDI_TYPE_PDF: - case UNICODE_BIDI_TYPE_RLI: - case UNICODE_BIDI_TYPE_LRI: - case UNICODE_BIDI_TYPE_FSI: - case UNICODE_BIDI_TYPE_PDI: - break; - default: + if (!is_explicit_indicator(stack->orig_classes[i])) + { /* X6 */ stack->levels[i]=stack->head->embedding_level; RESET_CLASS(stack->classes[i],stack); - break; } if (stack->classes[i] == UNICODE_BIDI_TYPE_PDI) @@ -1210,7 +1244,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) dump_sequence("Contents before W", stack, p); #endif - unicode_bidi_w(stack, p); + unicode_bidi_w(stack->classes, p); #ifdef BIDI_DEBUG dump_sequence("Contents after W", stack, p); 
@@ -1258,7 +1292,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) } } -static void unicode_bidi_w(directional_status_stack_t stack, +static void unicode_bidi_w(enum_bidi_type_t *classes, struct isolating_run_sequence_s *seq) { irs_iterator iter=irs_begin(seq), end=irs_end(seq); @@ -1268,10 +1302,10 @@ static void unicode_bidi_w(directional_status_stack_t stack, while (irs_compare(&iter, &end)) { - if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_NSM) + if (classes[iter.i] == UNICODE_BIDI_TYPE_NSM) { /* W1 */ - stack->classes[iter.i] = + classes[iter.i] = is_isolate_initiator(previous_type) || previous_type == UNICODE_BIDI_TYPE_PDI ? UNICODE_BIDI_TYPE_ON @@ -1281,14 +1315,14 @@ static void unicode_bidi_w(directional_status_stack_t stack, /* W2 */ - if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_EN && + if (classes[iter.i] == UNICODE_BIDI_TYPE_EN && strong_type == UNICODE_BIDI_TYPE_AL) { - stack->classes[iter.i] = UNICODE_BIDI_TYPE_AN; + classes[iter.i] = UNICODE_BIDI_TYPE_AN; } /* W2 */ - previous_type=stack->classes[iter.i]; + previous_type=classes[iter.i]; switch (previous_type) { case UNICODE_BIDI_TYPE_R: @@ -1312,12 +1346,12 @@ static void unicode_bidi_w(directional_status_stack_t stack, while (not_eol) { /* W3 */ - if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_AL) - stack->classes[iter.i] = UNICODE_BIDI_TYPE_R; + if (classes[iter.i] == UNICODE_BIDI_TYPE_AL) + classes[iter.i] = UNICODE_BIDI_TYPE_R; /* W4 */ - enum_bidi_type_t this_type=stack->classes[iter.i]; + enum_bidi_type_t this_type=classes[iter.i]; irs_incr(&iter); not_eol=irs_compare(&iter, &end); @@ -1332,13 +1366,13 @@ static void unicode_bidi_w(directional_status_stack_t stack, previous_type == UNICODE_BIDI_TYPE_AN) ) ) && - stack->classes[iter.i] == previous_type) + classes[iter.i] == previous_type) { irs_iterator prev=iter; irs_decr(&prev); - stack->classes[prev.i]=previous_type; + classes[prev.i]=previous_type; } if (not_eol) @@ -1353,9 +1387,9 @@ static void 
unicode_bidi_w(directional_status_stack_t stack, while (irs_compare(&iter, &end)) { - if (stack->classes[iter.i] != UNICODE_BIDI_TYPE_ET) + if (classes[iter.i] != UNICODE_BIDI_TYPE_ET) { - previous_type=stack->classes[iter.i]; + previous_type=classes[iter.i]; irs_incr(&iter); continue; } @@ -1363,7 +1397,7 @@ static void unicode_bidi_w(directional_status_stack_t stack, /* ET after EN */ if (previous_type == UNICODE_BIDI_TYPE_EN) { - stack->classes[iter.i] = UNICODE_BIDI_TYPE_EN; + classes[iter.i] = UNICODE_BIDI_TYPE_EN; irs_incr(&iter); continue; } @@ -1374,7 +1408,7 @@ static void unicode_bidi_w(directional_status_stack_t stack, while (irs_incr(&iter), irs_compare(&iter, &end)) { - previous_type=stack->classes[iter.i]; + previous_type=classes[iter.i]; if (previous_type == UNICODE_BIDI_TYPE_ET) continue; @@ -1383,7 +1417,7 @@ static void unicode_bidi_w(directional_status_stack_t stack, { while (irs_compare(&start, &iter)) { - stack->classes[start.i]= + classes[start.i]= UNICODE_BIDI_TYPE_EN; irs_incr(&start); } @@ -1397,12 +1431,12 @@ static void unicode_bidi_w(directional_status_stack_t stack, for (iter=irs_begin(seq); irs_compare(&iter, &end); irs_incr(&iter)) { - switch (stack->classes[iter.i]) { + switch (classes[iter.i]) { case UNICODE_BIDI_TYPE_ET: case UNICODE_BIDI_TYPE_ES: case UNICODE_BIDI_TYPE_CS: /* W6 */ - stack->classes[iter.i]=UNICODE_BIDI_TYPE_ON; + classes[iter.i]=UNICODE_BIDI_TYPE_ON; break; default: break; @@ -1416,14 +1450,14 @@ static void unicode_bidi_w(directional_status_stack_t stack, while (irs_compare(&iter, &end)) { - switch (stack->classes[iter.i]) { + switch (classes[iter.i]) { case UNICODE_BIDI_TYPE_L: case UNICODE_BIDI_TYPE_R: - previous_type=stack->classes[iter.i]; + previous_type=classes[iter.i]; break; case UNICODE_BIDI_TYPE_EN: if (previous_type == UNICODE_BIDI_TYPE_L) - stack->classes[iter.i]=previous_type; + classes[iter.i]=previous_type; break; default: break; @@ -1573,13 +1607,13 @@ static void 
unicode_bidi_n(directional_status_stack_t stack, ADJUST_EOCLASS(eoclass); -#define E_CLASS (seq->embedding_level & 1 ? \ - UNICODE_BIDI_TYPE_R:UNICODE_BIDI_TYPE_L) +#define E_CLASS(level) ((level) & 1 ? \ + UNICODE_BIDI_TYPE_R:UNICODE_BIDI_TYPE_L) -#define O_CLASS (seq->embedding_level & 1 ? \ - UNICODE_BIDI_TYPE_L:UNICODE_BIDI_TYPE_R) +#define O_CLASS(level) ((level) & 1 ? \ + UNICODE_BIDI_TYPE_L:UNICODE_BIDI_TYPE_R) - if (eoclass == E_CLASS) + if (eoclass == E_CLASS(seq->embedding_level)) { #ifdef BIDI_DEBUG if (stackp) @@ -1599,7 +1633,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, for (size_t i=0; i<stackp; ++i) stack_iters[i]->has_e=1; } - else if (eoclass == O_CLASS) + else if (eoclass == O_CLASS(seq->embedding_level)) { #ifdef BIDI_DEBUG if (stackp) @@ -1636,8 +1670,8 @@ static void unicode_bidi_n(directional_status_stack_t stack, "Brackets: %d and %d: e=%s, o=%s", (int)p->start.i, (int)p->end.i, - bidi_classname(E_CLASS), - bidi_classname(O_CLASS)); + bidi_classname(E_CLASS(seq->embedding_level)), + bidi_classname(O_CLASS(seq->embedding_level))); fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n", p->has_e, @@ -1879,6 +1913,37 @@ static void level_run_layers_add(struct level_run_layers *p) level_runs_init(p->lruns + (p->n_lruns++)); } +static void reverse_str(char32_t *p, + unicode_bidi_level_t *levels, + size_t start, + size_t end, + void (*reorder_callback)(size_t, size_t, void *), + void *arg) +{ + size_t right=end; + size_t left=start; + + while (right > left) + { + --right; + + if (p) + { + char32_t c=p[left]; + unicode_bidi_level_t l=levels[left]; + + p[left]=p[right]; + levels[left]=levels[right]; + p[right]=c; + levels[right]=l; + } + ++left; + } + + if (end-start > 1 && reorder_callback) + (*reorder_callback)(start, end-start, arg); +} + void unicode_bidi_reorder(char32_t *p, unicode_bidi_level_t *levels, size_t n, @@ -1887,6 +1952,15 @@ void unicode_bidi_reorder(char32_t *p, { /* L2 */ +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, "Before 
L2:"); + for (size_t i=0; i<n; ++i) + fprintf(DEBUGDUMP, " %04x/%d", + (unsigned)p[i], + (int)levels[i]); + fprintf(DEBUGDUMP, "\n"); +#endif + struct level_run_layers layers; unicode_bidi_level_t previous_level=0; @@ -1920,39 +1994,738 @@ void unicode_bidi_reorder(char32_t *p, } } } - +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, "L2:\n"); +#endif for (size_t i=layers.n_lruns; i; ) { struct level_runs *runs=layers.lruns+ --i; +#ifdef BIDI_DEBUG + if (runs->n_level_runs) + fprintf(DEBUGDUMP, "Reverse %d:", + (int)i); +#endif + for (size_t j=0; j<runs->n_level_runs; ++j) { size_t start=runs->runs[j].start; size_t end=runs->runs[j].end; - size_t right=end; - size_t left=start; +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, " %d-%d", + (int)start, (int)end-1); +#endif - while (right > left) + reverse_str(p, levels, start, end, + reorder_callback, arg); + } + +#ifdef BIDI_DEBUG + if (runs->n_level_runs) + fprintf(DEBUGDUMP, "\n"); +#endif + } + + level_run_layers_deinit(&layers); +} + +#define LRM 0x200E +#define RLM 0x200F +#define ALM 0x061C + +size_t unicode_bidi_cleanup(char32_t *string, + unicode_bidi_level_t *levels, + size_t n, + void (*removed_callback)(size_t, void *), + void *arg) +{ + size_t i=0; + for (size_t j=0; j<n; ++j) + { + enum_bidi_type_t cl=unicode_bidi_type(string[j]); + + if (IS_X9(cl)) + { + if (removed_callback) + (*removed_callback)(j, arg); + continue; + } + if (levels) + levels[i]=levels[j] & 1; + ++i; + } + return i; +} + +size_t unicode_bidi_extra_cleanup(char32_t *string, + unicode_bidi_level_t *levels, + size_t n, + void (*removed_callback)(size_t, void *), + void *arg) +{ + size_t i=0; + for (size_t j=0; j<n; ++j) + { + enum_bidi_type_t cl=unicode_bidi_type(string[j]); + + if (is_explicit_indicator_except_b(cl) || + (string[j] == LRM || + string[j] == RLM || + string[j] == ALM)) + { + if (removed_callback) + (*removed_callback)(j, arg); + continue; + } + string[i]=cl == UNICODE_BIDI_TYPE_B ? 
'\n' : string[j]; + if (levels) + levels[i]=levels[j] & 1; + ++i; + } + return i; +} + +void unicode_bidi_logical_order(char32_t *string, + unicode_bidi_level_t *levels, + size_t n, + unicode_bidi_level_t paragraph_embedding, + void (*reorder_callback)(size_t, size_t, + void *), + void *arg) +{ + size_t i=0; + + // On this pass: + // + // When paragraph_embedding is 0, we reverse odd embedding levels. + // When paragraph_embedding is 1, we reverse even embedding levels. + +#define LOGICAL_FLIP(n) ( ((n) ^ paragraph_embedding) & 1) + + while (i<n) + { + if ( !LOGICAL_FLIP(levels[i])) + { + ++i; + continue; + } + + size_t j=i; + + while (i<n) + { + if (!LOGICAL_FLIP(levels[i])) + break; + ++i; + } + + reverse_str(string, levels, j, i, + reorder_callback, arg); + } + + if (paragraph_embedding & 1) + reverse_str(string, levels, 0, n, reorder_callback, arg); +} + +/* +** Track consecutive sequences of characters with the same embedding level. +** +** Linked list create in compute_bidi_embed_levelruns(). 
+*/ + +struct bidi_embed_levelrun { + struct bidi_embed_levelrun *next; + size_t start; + size_t end; + unicode_bidi_level_t level; +}; + +static struct bidi_embed_levelrun ** +record_bidi_embed_levelrun(struct bidi_embed_levelrun **tailp, + size_t start, + size_t end, + unicode_bidi_level_t level) +{ + struct bidi_embed_levelrun *p; + + p=(struct bidi_embed_levelrun *)calloc(1, sizeof(*p)); + if (!p) + abort(); + + p->start=start; + p->end=end; + p->level=level; + + if (*tailp) + { + (*tailp)->next=p; + return &(*tailp)->next; + } + else + { + *tailp=p; + return tailp; + } +} + +static void compute_bidi_embed_levelruns(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + struct bidi_embed_levelrun **tailp) +{ + size_t i=0; + + while (i<n) + { + size_t j=i; + + while (++i < n) + { + if ((levels[i] & 1) != (levels[j] & 1)) + break; + } + tailp=record_bidi_embed_levelrun(tailp, j, i, + levels[j] & 1); + } +} + +#define RLI 0x2067 +#define LRI 0x2066 +#define RLO 0x202e +#define LRO 0x202d +#define PDF 0x202c +#define PDI 0x2069 + +/* +** Whether a directional marker and a PDI is required to be generated after +** some subset of characters. 
+*/ + +struct need_marker_info { + int need_marker; + int need_pdi; +}; + +static void need_marker_info_init(struct need_marker_info *info) +{ + info->need_marker=0; + info->need_pdi=0; +} + +static void need_marker_info_merge(struct need_marker_info *info, + const struct need_marker_info *other_info) +{ + if (other_info->need_marker) + info->need_marker=1; + if (other_info->need_pdi) + info->need_pdi=1; +} + +static void emit_bidi_embed_levelrun(const char32_t *string, + enum_bidi_type_t *classes, + struct bidi_embed_levelrun *run, + unicode_bidi_level_t paragraph_level, + unicode_bidi_level_t previous_level, + unicode_bidi_level_t next_level, + struct need_marker_info *need_marker, + void (*emit)(const char32_t *string, + size_t n, + void *arg), + void *arg); + +/* L1 */ + +static int is_l1_on_or_after(const enum_bidi_type_t *classes, + size_t n, + size_t i, + int atend) +{ + /* + ** Determine if rule L1 will apply starting at the given position. + */ + while (i<n) + { + enum_bidi_type_t t=classes[i]; + + if (t == UNICODE_BIDI_TYPE_WS) + { + ++i; + continue; + } + + if (t == UNICODE_BIDI_TYPE_S || + t == UNICODE_BIDI_TYPE_B) + return 1; + return 0; + } + return atend; +} + +static void emit_marker(struct bidi_embed_levelrun *p, + struct need_marker_info *info, + void (*emit)(const char32_t *string, + size_t n, + void *arg), + void *arg) +{ + char32_t marker= (p->level & 1) ? 
RLM:LRM; + + if (info->need_marker) + (*emit)(&marker, 1, arg); + + if (info->need_pdi) + { + marker=PDI; + (*emit)(&marker, 1, arg); + } +} + +void unicode_bidi_embed(const char32_t *string, + const unicode_bidi_level_t *levels, + size_t n, + unicode_bidi_level_t paragraph_level, + void (*emit)(const char32_t *string, + size_t n, + void *arg), + void *arg) +{ + struct bidi_embed_levelrun *runs=0; + enum_bidi_type_t *classes= + (enum_bidi_type_t *)calloc(n, sizeof(enum_bidi_type_t)); + + if (!classes) + abort(); + + for (size_t i=0; i<n; ++i) + classes[i]=unicode_bidi_type(string[i]); + + compute_bidi_embed_levelruns(string, levels, + n, + &runs); + + /* + ** Go through the sequences of consecutive characters with the + ** same embedding level. Keep track of the preceding and the + ** next embedding level, which is usually the opposite from the + ** current sequence's embedding level. Except that the first and + ** the last sequence of characters, in the string, are bound to + ** the paragraph_level, which may be the same. + */ + + unicode_bidi_level_t previous_level=paragraph_level; + + while (runs) + { + struct bidi_embed_levelrun *p=runs; + + runs=runs->next; + + unicode_bidi_level_t next_level=paragraph_level; + + if (runs) + next_level=runs->level; + +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, " Range %d-%d, level %d\n", + (int)p->start, (int)(p->end-1), p->level); +#endif + + if (((p->level ^ paragraph_level) & 1) == 0) + { + /* + ** Sequence in the same direction as the paragraph + ** embedding level. + ** + ** We'll definitely need a directional marker if + ** rule L1 applies after this sequence. 
+ */ + + struct need_marker_info need_marker; + + need_marker_info_init(&need_marker); + + if (classes[p->end-1] == UNICODE_BIDI_TYPE_WS) + { + need_marker.need_marker= + is_l1_on_or_after(classes, n, + p->end, + 0); +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, " need marker=%d\n", + need_marker.need_marker); +#endif + + } + + emit_bidi_embed_levelrun(string, classes, + p, paragraph_level, + previous_level, + next_level, + &need_marker, + emit, arg); + + emit_marker(p, &need_marker, emit, arg); + } + else + { + struct need_marker_info need_marker; + size_t orig_end=p->end; + + /* + ** Sequence in the opposite direction. Because S and + ** B reset to the paragraph level, no matter what, + ** if we want things to render like that we will need + ** to emit sequences on each side of S/B in reverse + ** order. We start at the end of this sequence, then + ** search towards the beginning, emit that sequence, + ** emit the S and B, then go to the next sequence. + */ + + need_marker_info_init(&need_marker); + +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, " need marker=%d\n", + need_marker); +#endif + + while (p->start < p->end) { - --right; + size_t j=p->end; - if (p) + int end_with_ws= + classes[j-1] == UNICODE_BIDI_TYPE_WS; + while (j > p->start) { - char32_t c=p[left]; - unicode_bidi_level_t l=levels[left]; + --j; - p[left]=p[right]; - levels[left]=levels[right]; - p[right]=c; - levels[right]=l; + enum_bidi_type_t t=classes[j]; + + if (t == UNICODE_BIDI_TYPE_S || + t == UNICODE_BIDI_TYPE_B) + { + ++j; + break; + } + } + + if (j == p->end) /* Must be lone break */ + { +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + " break: %d\n", + (int)j); +#endif + --p->end; + + previous_level=paragraph_level; + + (*emit)(string+p->end, 1, arg); + continue; + } + + struct need_marker_info need_marker_partial; + + need_marker_info_init(&need_marker_partial); + + /* + ** Rule L1, there's going to be an S or a B + ** after we emit this sequence. 
+ */ + + if (j != p->start) + need_marker_partial.need_marker=1; + + /* + ** To emit this sequence, we monkey-patch + ** the run level to indicate the sub- + ** sequence to emit. + */ + size_t i=p->start; + + p->start=j; + + emit_bidi_embed_levelrun + (string, classes, p, paragraph_level, + previous_level, + + j == i + /* No more, this is next */ + ? next_level + /* We'll emit a paragraph brk */ + : paragraph_level, + &need_marker_partial, + emit, arg); + + /* Continue monkey-patching. */ + + p->end=p->start; + p->start=i; + + if (p->start == p->end) + /* Do it below */ + { + if (end_with_ws) + need_marker.need_marker= + is_l1_on_or_after + (classes, n, + orig_end, + 0); + need_marker_info_merge + (&need_marker, + &need_marker_partial); + } + else + { + emit_marker(p, &need_marker_partial, + emit, arg); } - ++left; } + emit_marker(p, &need_marker, emit, arg); + } + free(p); + } + free(classes); +} + +#define ADJUST_LR(t,e) do { \ + switch (t) { \ + case UNICODE_BIDI_TYPE_AL: \ + (t)=UNICODE_BIDI_TYPE_R; \ + break; \ + case UNICODE_BIDI_TYPE_ET: \ + case UNICODE_BIDI_TYPE_ES: \ + case UNICODE_BIDI_TYPE_AN: \ + case UNICODE_BIDI_TYPE_EN: \ + (t)=UNICODE_BIDI_TYPE_L; \ + break; \ + default: \ + break; \ + } \ + } while (0) + +#define ADJUST_LRSTRONG(t) do { \ + switch (t) { \ + case UNICODE_BIDI_TYPE_AL: \ + (t)=UNICODE_BIDI_TYPE_R; \ + default: \ + break; \ + } \ + } while (0) + +static void emit_bidi_embed_levelrun(const char32_t *string, + enum_bidi_type_t *classes, + struct bidi_embed_levelrun *run, + unicode_bidi_level_t paragraph_level, + unicode_bidi_level_t previous_level, + unicode_bidi_level_t next_level, + struct need_marker_info *need_marker, + void (*emit)(const char32_t *string, + size_t n, + void *arg), + void *arg) +{ + /* + ** Our first order of business will be to apply rules W to this + ** sequence, to resolve weak types. + ** + ** It's easy to simulate what unicode_bidi_w() expects. 
+ */ + + struct level_run lrun; + struct isolating_run_sequence_s seq; + enum_bidi_type_t e_type=E_CLASS(run->level); + enum_bidi_type_t o_type=O_CLASS(run->level); + + if (run->start == run->end) + return; + + memset(&seq, 0, sizeof(seq)); + + seq.embedding_level=run->level; + seq.sos=seq.eos=e_type; + seq.runs.runs=&lrun; + seq.runs.n_level_runs=1; + seq.runs.cap_level_runs=1; + lrun.start=run->start; + lrun.end=run->end; + unicode_bidi_w(classes, &seq); + + /* + ** Peek at the first character's class. + ** + ** If the previous sequence's embedding level was the same, it + ** guarantees the peristence of the embedding direction. We can + ** accept classes that default to our embedding level. + ** + ** Otherwise we recognize only strong classes. + */ + enum_bidi_type_t t=classes[run->start]; + + if (previous_level == run->level) + { + ADJUST_LR(t, E_CLASS(previous_level)); + } + else + { + ADJUST_LRSTRONG(t); + } + + /* + ** Sequence in the opposite direction always get isolated. + */ + char32_t override_start=run->level ? RLI:LRI; + + if (run->level != paragraph_level) + (*emit)(&override_start, 1, arg); + + /* + ** Make sure the character sequence has strong context. + */ + if (t == o_type) + { + struct need_marker_info need_marker; + + need_marker_info_init(&need_marker); + + need_marker.need_marker=1; + + emit_marker(run, &need_marker, emit, arg); + } + + override_start=run->level ? RLO:LRO; + char32_t override_end=PDF; + + size_t start=run->start; + size_t end=run->end; + + while (start < end) + { + size_t i=start; + size_t word_start=i; + +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + " examining, starting at: %d\n", (int)i); +#endif + + /* + ** Look for the next character with the opposite class. + ** While doing that, keep an eye out on any WS or ONs, + ** which will tell us where the most recent "word"s starts, + ** before this character. 
+ */ + while (i < end) + { + enum_bidi_type_t t=classes[i]; + + ADJUST_LR(t, e_type); + + if (t == o_type) + break; + + switch (t) { + case UNICODE_BIDI_TYPE_WS: + case UNICODE_BIDI_TYPE_ON: + word_start=i+1; + break; + default: + break; + } + + ++i; + } + + if (i < end) + { +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + " override needed: %d," + " start of word at %d, ", + (int)i, (int)word_start); +#endif + /* + ** Found something to override. First, emit everything + ** up to the start of this "word". + ** + ** Then emit the RLO or LRO, then look for the end + ** of the "word", and drop the PDF there. + */ + if (word_start > start) + (*emit)(string+start, + word_start-start, arg); + + (*emit)(&override_start, 1, arg); + while (++i < end) + { + enum_bidi_type_t t=classes[i]; - if (end-start > 1 && reorder_callback) - (*reorder_callback)(start, end-start, arg); + switch (t) { + case UNICODE_BIDI_TYPE_WS: + case UNICODE_BIDI_TYPE_ON: + break; + default: + continue; + } + break; + } +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, "end of word at %d\n", + (int)i); +#endif + (*emit)(string+word_start, i-word_start, arg); + (*emit)(&override_end, 1, arg); + start=i; + continue; } + (*emit)(string+start, i-start, arg); + start=i; } - level_run_layers_deinit(&layers); + /* + ** Make sure that if a different embedding level follows we will + ** emit a marker, to ensure strong context. 
+ */ + t=classes[run->end-1]; + + if (next_level != run->level) + { + ADJUST_LRSTRONG(t); + + if (e_type != t) + need_marker->need_marker=1; + } + + if (run->level != paragraph_level) + need_marker->need_pdi=1; +} + +struct compute_paragraph_embedding_level_char_info { + const char32_t *str; +}; + +static enum_bidi_type_t +get_enum_bidi_type_for_embedding_paragraph_level(size_t i, + void *arg) +{ + struct compute_paragraph_embedding_level_char_info *p= + (struct compute_paragraph_embedding_level_char_info *)arg; + + return unicode_bidi_type(p->str[i]); +} + +char32_t unicode_bidi_embed_paragraph_level(const char32_t *str, + size_t n, + unicode_bidi_level_t paragraph_level + ) +{ + struct compute_paragraph_embedding_level_char_info info; + info.str=str; + + if ((compute_paragraph_embedding_level + (0, n, + get_enum_bidi_type_for_embedding_paragraph_level, + &info) ^ paragraph_level) == 0) + return 0; + + return (paragraph_level & 1) ? RLM:LRM; } diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index ca139cc..04d2893 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -1,5 +1,5 @@ /* -** Copyright 2011-2014 Double Precision, Inc. +** Copyright 2011-2020 Double Precision, Inc. ** See COPYING for distribution information. 
** */ @@ -596,7 +596,8 @@ extern "C" { void *arg) { auto p=reinterpret_cast<const std::function<void (size_t, - size_t)> *> + size_t) + noexcept> *> (arg); (*p)(i, cnt); @@ -605,7 +606,8 @@ extern "C" { int unicode::bidi_reorder(std::u32string &string, std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t, size_t)> &lambda) + const std::function<void (size_t, size_t) + noexcept> &lambda) { size_t s=string.size(); @@ -624,7 +626,8 @@ int unicode::bidi_reorder(std::u32string &string, } void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels, - const std::function<void (size_t, size_t)> &lambda) + const std::function<void (size_t, size_t) + noexcept> &lambda) { size_t s=levels.size(); @@ -636,3 +639,189 @@ void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels, (reinterpret_cast<const void *>(&lambda))); } + +extern "C" { + static void removed_callback(size_t i, + void *arg) + { + auto p=reinterpret_cast<const std::function<void (size_t) + noexcept> *> + (arg); + + (*p)(i); + } +} + +void unicode::bidi_cleanup(std::u32string &string, + const std::function<void (size_t) noexcept> &lambda) +{ + if (string.empty()) + return; + + size_t n=unicode_bidi_cleanup(&string[0], + 0, + string.size(), + removed_callback, + const_cast<void *> + (reinterpret_cast<const void *> + (&lambda))); + + string.resize(n); +} + +int unicode::bidi_cleanup(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t) noexcept> &lambda) +{ + if (levels.size() != string.size()) + return -1; + + size_t n=unicode_bidi_cleanup(&string[0], + &levels[0], + string.size(), + removed_callback, + const_cast<void *> + (reinterpret_cast<const void *> + (&lambda))); + + string.resize(n); + levels.resize(n); + return 0; +} + + +void unicode::bidi_extra_cleanup(std::u32string &string, + const std::function<void (size_t) noexcept> + &lambda) +{ + if (string.empty()) + return; + + size_t 
n=unicode_bidi_extra_cleanup(&string[0], + 0, + string.size(), + removed_callback, + const_cast<void *> + (reinterpret_cast<const void *> + (&lambda))); + + string.resize(n); +} + +int unicode::bidi_extra_cleanup(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + const std::function<void (size_t) noexcept> + &lambda) +{ + if (levels.size() != string.size()) + return -1; + + size_t n=unicode_bidi_extra_cleanup(&string[0], + &levels[0], + string.size(), + removed_callback, + const_cast<void *> + (reinterpret_cast<const void *> + (&lambda))); + + string.resize(n); + levels.resize(n); + return 0; +} + +int unicode::bidi_logical_order(std::u32string &string, + std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding, + const std::function<void (size_t, size_t) + noexcept> &lambda) +{ + if (string.size() != levels.size()) + return -1; + + if (string.empty()) + return 0; + + unicode_bidi_logical_order(&string[0], &levels[0], string.size(), + paragraph_embedding, + &reorder_callback, + const_cast<void *> + (reinterpret_cast<const void *>(&lambda))); + return 0; +} + +void unicode::bidi_logical_order(std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding, + const std::function<void (size_t, size_t) + noexcept> &lambda) +{ + if (levels.size() == 0) + return; + + unicode_bidi_logical_order(NULL, &levels[0], levels.size(), + paragraph_embedding, + &reorder_callback, + const_cast<void *> + (reinterpret_cast<const void *>(&lambda))); +} + +extern "C" { + static void embed_callback(const char32_t *string, + size_t n, + void *arg) + { + auto p=reinterpret_cast<const std::function<void + (const char32_t *, + size_t n) + noexcept> *>(arg); + (*p)(string, n); + } +} + +int unicode::bidi_embed(const std::u32string &string, + const std::vector<unicode_bidi_level_t> &levels, + unicode_bidi_level_t paragraph_embedding, + const std::function<void (const char32_t *string, + size_t n) noexcept> + &lambda) +{ + 
if (string.size() != levels.size()) + return -1; + + if (string.empty()) + return 0; + + unicode_bidi_embed(&string[0], &levels[0], string.size(), + paragraph_embedding, + embed_callback, + const_cast<void *> + (reinterpret_cast<const void *> + (&lambda))); + return 0; +} + +std::u32string unicode::bidi_embed(const std::u32string &string, + const std::vector<unicode_bidi_level_t + > &levels, + unicode_bidi_level_t paragraph_embedding) +{ + std::u32string new_string; + + (void)bidi_embed(string, levels, paragraph_embedding, + [&] + (const char32_t *string, + size_t n) + { + new_string.insert(new_string.end(), + string, string+n); + }); + + return new_string; +} + +char32_t unicode::bidi_embed_paragraph_level(const std::u32string &string, + unicode_bidi_level_t level) +{ + return unicode_bidi_embed_paragraph_level(string.c_str(), + string.size(), + level); +} |
