diff options
| -rw-r--r-- | unicode/Makefile.am | 11 | ||||
| -rw-r--r-- | unicode/README | 4 | ||||
| -rw-r--r-- | unicode/biditest.C | 16 | ||||
| -rw-r--r-- | unicode/biditest2.C | 289 | ||||
| -rw-r--r-- | unicode/book.xml | 796 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 203 | ||||
| -rw-r--r-- | unicode/docbook/book.css | 2 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 919 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 197 | 
9 files changed, 2108 insertions, 329 deletions
| diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 8ac6fb1..f864e2d 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -90,6 +90,11 @@ include_HEADERS=courier-unicode.h \  man_MANS= \          $(srcdir)/man/courier-unicode.7 \          $(srcdir)/man/unicode\:\:bidi_calc.3 \ +        $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ +        $(srcdir)/man/unicode\:\:bidi_embed.3 \ +        $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ +        $(srcdir)/man/unicode\:\:bidi_extra_cleanup.3 \ +        $(srcdir)/man/unicode\:\:bidi_logical_order.3 \          $(srcdir)/man/unicode\:\:bidi_reorder.3 \          $(srcdir)/man/unicode\:\:iconvert\:\:convert.3 \          $(srcdir)/man/unicode\:\:iconvert\:\:convert_tocase.3 \ @@ -110,8 +115,14 @@ man_MANS= \          $(srcdir)/man/unicode_bidi.3 \          $(srcdir)/man/unicode_bidi_bracket_type.3 \          $(srcdir)/man/unicode_bidi_calc.3 \ +        $(srcdir)/man/unicode_bidi_cleanup.3 \ +        $(srcdir)/man/unicode_bidi_embed.3 \ +        $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ +        $(srcdir)/man/unicode_bidi_extra_cleanup.3 \ +        $(srcdir)/man/unicode_bidi_logical_order.3 \          $(srcdir)/man/unicode_bidi_mirror.3 \          $(srcdir)/man/unicode_bidi_reorder.3 \ +        $(srcdir)/man/unicode_bidi_type.3 \          $(srcdir)/man/unicode_canonical.3 \          $(srcdir)/man/unicode_category_lookup.3 \          $(srcdir)/man/unicode_convert.3 \ diff --git a/unicode/README b/unicode/README index 926e004..9994cc9 100644 --- a/unicode/README +++ b/unicode/README @@ -16,12 +16,12 @@ Courier Unicode Library     COPYING     This library implements several algorithms related to the Unicode -   Standard: +   Standard, notably:       * Look up uppercase, lowercase, and titlecase equivalents of a unicode         character. -     * Implementation of grapheme and work breaking rules. +     * Implementation of grapheme and word breaking rules.       
* Implementation of line breaking rules. diff --git a/unicode/biditest.C b/unicode/biditest.C index 2d2a6e5..1aa2c63 100644 --- a/unicode/biditest.C +++ b/unicode/biditest.C @@ -8,6 +8,7 @@  #include	<utility>  #include	<iomanip>  #include	<numeric> +#include	<unistd.h>  std::vector<std::string> testcase; @@ -53,11 +54,11 @@ int main(int argc, char **argv)  	{  		buf.clear(); -		if (std::getline(fp, buf).eof() && buf.empty()) -			break; +		bool iseof=std::getline(fp, buf).eof() && buf.empty(); -		if (++linenum >= nextlogline) +		if (iseof || ++linenum >= nextlogline)  		{ +			alarm(300);  			std::cout << logmsg;  			std::ostringstream o; @@ -72,7 +73,8 @@ int main(int argc, char **argv)  			nextlogline += 20000;  		} - +		if (iseof) +			break;  		buf.erase(std::find(buf.begin(), buf.end(), '#'), buf.end());  		if (buf.substr(0, 8) == "@Levels:") @@ -334,11 +336,7 @@ int main(int argc, char **argv)  			n >>= 1;  		}  	} - -	std::cout << logmsg; - -	std::fill(logmsg.begin(), logmsg.end(), ' '); -	std::cout << logmsg << std::endl; +	std::cout << std::endl;  	return 0;  } diff --git a/unicode/biditest2.C b/unicode/biditest2.C index f497bcf..cfa0e50 100644 --- a/unicode/biditest2.C +++ b/unicode/biditest2.C @@ -1,42 +1,110 @@  #include	"unicode_config.h"  #include	"courier-unicode.h"  #include	<iostream> +#include	<iterator>  #include	<sstream>  #include	<fstream>  #include	<cstdint>  #include	<iomanip> +#include	<algorithm> +#include	<unistd.h>  FILE *DEBUGDUMP; -int main(int argc, char **argv) +#define BIDI_DEBUG + +extern "C" { +#if 0 +} +#endif + +#include "unicode_bidi.c" + +} + +void latin_test()  { -	std::ifstream fp("BidiCharacterTest.txt"); +	for (char32_t c=32; c<256; c++) +	{ +		std::u32string s; -	if (!fp.is_open()) +		s += c; + +		std::vector<unicode_bidi_level_t> levels={UNICODE_BIDI_LR}; + +		auto new_string=unicode::bidi_embed(s, levels, +						    UNICODE_BIDI_LR); + +		if (new_string != s) +		{ +			std::cerr << "Character " << (int)c +				  << " does 
not work." << std::endl; +			exit(1); +		} +	} + +	std::u32string s; +	std::vector<unicode_bidi_level_t> levels; + +	for (char32_t c=32; c<256; c++)  	{ -		std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl; +		s += c; +		levels.push_back(UNICODE_BIDI_LR); +	} + +	auto new_string=unicode::bidi_embed(s, levels, +					    UNICODE_BIDI_LR); + +	if (new_string != s) +	{ +		std::cerr << "iso-8859-1 string does not work." +			  << std::endl;  		exit(1);  	} +} -	DEBUGDUMP=fopen("/dev/null", "w"); -	if (!DEBUGDUMP) +void character_test() +{ +	std::ifstream fp("BidiCharacterTest.txt"); + +	if (!fp.is_open())  	{ -		perror("/dev/null"); +		std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl;  		exit(1);  	}  	std::string buf;  	size_t linenum=0; +	size_t nextlogline=0; +	std::string logmsg;  	while (1)  	{  		buf.clear(); -		if (std::getline(fp, buf).eof() && buf.empty()) -			break; -		++linenum; +		bool iseof=std::getline(fp, buf).eof() && buf.empty(); + +		if (iseof || ++linenum >= nextlogline) +		{ +			alarm(300); +			std::cout << logmsg; + +			std::ostringstream o; +			o << std::setw(6) << linenum << " lines processed... 
"; + +			logmsg=o.str(); + +			std::cout << logmsg << std::flush; + +			std::fill(logmsg.begin(), logmsg.end(), '\b'); + +			nextlogline += 20000; +		} + +		if (iseof) +			break;  		auto p=buf.find('#');  		if (p != buf.npos) @@ -187,17 +255,202 @@ int main(int argc, char **argv)  			std::cerr << std::endl;  			exit(1);  		} -	} -	return 0; -} -#define BIDI_DEBUG +		std::vector<size_t> actual_render_order; + +		size_t n=0; + +		std::generate_n(std::back_inserter(actual_render_order), +				s.size(), +				[&] { return n++; }); + +		unicode::bidi_reorder +			(s, levels, +			 [&] +			 (size_t index, +			  size_t n) +			 { +				 auto b=actual_render_order.begin(); +				 std::reverse(b+index, b+index+n); +			 }); + +		n=0; +		unicode::bidi_cleanup +			(s, levels, +			 [&] +			 (size_t i) +			 { +				 actual_render_order.erase +					 (actual_render_order.begin()+i-n); +				 ++n; +			 }); + +		if (render_order != actual_render_order) +		{ +			std::cerr << "Regression, line " +				  << linenum +				  << ": render order" +				  << std::endl +				  << "   Expected:"; +			for (auto n:render_order) +			{ +				std::cerr << " " << n; +			} +			std::cerr << std::endl +				  << "     Actual:"; -extern "C" { -#if 0 +			for (auto n:actual_render_order) +			{ +				std::cerr << " " << n; +			} +			std::cerr << std::endl; +			exit(1); +		} + +		unicode::bidi_extra_cleanup(s, levels); + +		auto dump_ls= +			[&] +			(const std::u32string &s, +			 const std::vector<unicode_bidi_level_t> &l) +			{ +				for (size_t i=0; i<s.size(); ++i) +				{ +					std::cerr << " " << std::hex +						  << std::setw(4) +						  << std::setfill('0') +						  << s[i] << "/" +						  << std::dec +						  << (int)l[i]; +				} +			}; + +		for (int pass=0; pass<4; pass++) +		{ +			int paragraph=pass & 1; +			int use_default=pass & 2; + +			for (size_t i=0; i<s.size(); ++i) +			{ +				/* L1 */ +				switch (unicode_bidi_type(s[i])) { +				case UNICODE_BIDI_TYPE_S: +				case UNICODE_BIDI_TYPE_B: +					
levels.at(i)=paragraph; +				} +			} + +			auto logical_string=s; +			auto logical_levels=levels; + +			unicode::bidi_logical_order(logical_string, +						    logical_levels, +						    paragraph); + +			auto new_string=unicode::bidi_embed(logical_string, +							    logical_levels, +							    paragraph); + +			auto save_string=new_string; + +			if (use_default) +			{ +				auto marker=unicode::bidi_embed_paragraph_level +					(new_string, paragraph); + +				if (marker) +					new_string.insert(0, 1, marker); + +				ret=unicode::bidi_calc(new_string); +			} +			else +			{ +				ret=unicode::bidi_calc(new_string, paragraph); +			} + +			unicode::bidi_reorder(new_string, std::get<0>(ret)); +			unicode::bidi_extra_cleanup(new_string, +						    std::get<0>(ret)); + +			/* New string is now back in logical order */ + +			if (new_string == s && std::get<0>(ret) == levels) +				continue; + +			fclose(DEBUGDUMP); +			DEBUGDUMP=stderr; + +			std::cerr << "Regression, line " +				  << linenum +				  << ": embedding markers" +				  << std::endl +				  << "   Paragraph embedding level: " +				  << paragraph; + +			if (use_default) +				std::cerr << " (defaulted)"; + +			std::cerr << std::endl +				  << "String (1):"; + +			dump_ls(s, levels); + +			std::cerr << std::endl << "String (2):"; + +			dump_ls(new_string, std::get<0>(ret)); +			std::cerr << std::endl; + +			std::cerr << "Embedding:"; +			dump_ls(logical_string, logical_levels); +			std::cerr << std::endl; + +			unicode::bidi_embed(logical_string, +					    logical_levels, +					    paragraph); + +			std::cerr << std::endl +				  << "Embedded string:"; + +			for (auto c:save_string) +			{ +				std::cerr << " "; + +				switch (c) { +				case LRM: std::cerr << "LRM"; break; +				case RLM: std::cerr << "RLM"; break; +				case RLI: std::cerr << "RLI"; break; +				case LRI: std::cerr << "LRI"; break; +				case RLO: std::cerr << "RLO"; break; +				case LRO: std::cerr << "LRO"; break; +				case PDF: std::cerr << "PDF"; break; 
+				case PDI: std::cerr << "PDI"; break; +				default: +					std::cerr << std::hex << std::setw(4) +						  << std::setfill('0') +						  << c; +					break; +				} +			} +			std::cerr << std::dec << std::endl << std::flush; + +			ret=unicode::bidi_calc(save_string, paragraph); +			unicode::bidi_reorder(save_string, std::get<0>(ret)); +			exit(1); +		} +	} +	std::cout << std::endl;  } -#endif -#include "unicode_bidi.c" +int main(int argc, char **argv) +{ +	DEBUGDUMP=fopen("/dev/null", "w"); +	if (!DEBUGDUMP) +	{ +		perror("/dev/null"); +		exit(1); +	} +	latin_test(); +	character_test(); +	return 0;  } diff --git a/unicode/book.xml b/unicode/book.xml index ad0009a..c8948ba 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -12,7 +12,7 @@  <!-- -Copyright 2014-2017 Double Precision, Inc. +Copyright 2014-2020 Double Precision, Inc.  See COPYING for distribution information.  --> @@ -23,7 +23,7 @@ See COPYING for distribution information.    <para>      This library implements several algorithms related to the      <ulink url="https://www.unicode.org/standard/standard.html">Unicode -    Standard</ulink>: +    Standard</ulink>, notably:    </para>    <itemizedlist> @@ -36,22 +36,21 @@ See COPYING for distribution information.      <listitem>        <para>  	Implementation of -	<ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">grapheme -	and work breaking</ulink> rules. +	<link linkend="unicode_grapheme_break">grapheme +	and word breaking</link> rules.        </para>      </listitem>      <listitem>        <para>  	Implementation of -	<ulink url="https://www.unicode.org/reports/tr14/tr14-&tr14ver;.html">line -	breaking</ulink> rules. +	<link linkend="unicode_line_break">line	breaking</link> rules.        </para>      </listitem>      <listitem>        <para>  	Implementation of the -	<ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">bi-directional -	algorithm</ulink>. 
+	<link linkend="unicode_bidi">bi-directional +	algorithm</link>.        </para>      </listitem>      <listitem> @@ -69,15 +68,13 @@ See COPYING for distribution information.      </listitem>      <listitem>        <para> -	Look up the -	<ulink url="https://www.unicode.org/reports/tr24/tr24-&tr24ver;.html">Unicode -	script property</ulink>. +	Look up the <link linkend="unicode_script">Unicode +	script property</link>.        </para>      </listitem>      <listitem>        <para> -	Look up the -	<ulink url="https://unicode.org/notes/tn36/">category</ulink> +	Look up the <link linkend="unicode_category_lookup">category</link>  	property.        </para>      </listitem> @@ -192,7 +189,7 @@ See COPYING for distribution information.  	  <programlisting>  #include <courier-unicode.h></programlisting>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="courier_unicode_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -226,7 +223,7 @@ See COPYING for distribution information.  	    with this library.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="courier_unicode_seealso">  	  <title>SEE ALSO</title>  	  <para> @@ -306,16 +303,22 @@ See COPYING for distribution information.  	  
<refname>unicode_bidi</refname>  	  <refname>unicode_bidi_calc</refname>  	  <refname>unicode_bidi_reorder</refname> +	  <refname>unicode_bidi_cleanup</refname> +	  <refname>unicode_bidi_extra_cleanup</refname> +	  <refname>unicode_bidi_logical_order</refname> +	  <refname>unicode_bidi_embed</refname> +	  <refname>unicode_bidi_embed_paragraph_level</refname> + +	  <refname>unicode_bidi_type</refname>  	  <refname>unicode_bidi_mirror</refname>  	  <refname>unicode_bidi_bracket_type</refname> -	  <refpurpose>unicode bidirectional algorithm</refpurpose> +	  <refpurpose>unicode bi-directional algorithm</refpurpose>  	</refnamediv>  	<refsynopsisdiv>  	  <funcsynopsis> -	    <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> -	    <funcsynopsisinfo>unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo> +	    <funcsynopsisinfo>#include <courier-unicode.h>

unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo>  	    <funcprototype>  	      <funcdef>void <function>unicode_bidi_calc</function></funcdef>                <paramdef>const char32_t *<parameter>p</parameter></paramdef> @@ -334,6 +337,51 @@ See COPYING for distribution information.  	    </funcprototype>  	    <funcprototype> +	      <funcdef>size_t <function>unicode_bidi_cleanup</function></funcdef> +              <paramdef>char32_t *<parameter>string</parameter></paramdef> +              <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> +              <paramdef>size_t <parameter>n</parameter></paramdef> +              <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> +	      <paramdef>void *<parameter>arg</parameter></paramdef> +	    </funcprototype> + +	    <funcprototype> +	      <funcdef>size_t <function>unicode_bidi_extra_cleanup</function></funcdef> +              <paramdef>char32_t *<parameter>string</parameter></paramdef> +              <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> +              <paramdef>size_t <parameter>n</parameter></paramdef> +              <paramdef>void (*<parameter>removed_callback</parameter>)(size_t, size_t, void *)</paramdef> +	      <paramdef>void *<parameter>arg</parameter></paramdef> +	    </funcprototype> + +	    <funcprototype> +	      <funcdef>size_t <function>unicode_bidi_logical_order</function></funcdef> +              <paramdef>char32_t *<parameter>string</parameter></paramdef> +              <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> +              <paramdef>size_t <parameter>n</parameter></paramdef> +              <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +              <paramdef>void (*<parameter>reorder_callback</parameter>)(size_t index, size_t n, void *arg)</paramdef> +	      <paramdef>void *<parameter>arg</parameter></paramdef> +	    
</funcprototype> + +	    <funcprototype> +	      <funcdef>size_t <function>unicode_bidi_embed</function></funcdef> +              <paramdef>const char32_t *<parameter>string</parameter></paramdef> +              <paramdef>const unicode_bidi_level_t *<parameter>levels</parameter></paramdef> +              <paramdef>size_t <parameter>n</parameter></paramdef> +              <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +              <paramdef>void (*<parameter>emit</parameter>)(const char32_t *string, size_t n, void *arg)</paramdef> +	      <paramdef>void *<parameter>arg</parameter></paramdef> +	    </funcprototype> + +	    <funcprototype> +	      <funcdef>char32_t <function>unicode_bidi_embed_paragraph_level</function></funcdef> +              <paramdef>const char32_t *<parameter>string</parameter></paramdef> +              <paramdef>size_t <parameter>n</parameter></paramdef> +              <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +	    </funcprototype> + +	    <funcprototype>  	      <funcdef>char32_t <function>bidi_mirror</function></funcdef>                <paramdef>char32_t <parameter>c</parameter></paramdef>  	    </funcprototype> @@ -350,63 +398,160 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_bidi_descr">  	  <title>DESCRIPTION</title>  	  <para> -	    <function>unicode_bidi_calc</function>() and -	    <function>unicode_bidi_reorder</function>() implement -	    the +	    These functions are related to the  	    <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-Directional algorithm</ulink>. -	  </para> -	  <para> -	    The first two parameters to -	    <function>unicode_bidi_calc</function>() are a unicode string -	    and the number of characters in the Unicode string. 
-	    <parameter>levels</parameter> points to a buffer of -	    <classname>unicode_bidi_level_t</classname> values. -	    The caller is responsible for allocating and deallocating this -	    buffer, of -	    size <parameter>n</parameter>, -	    the same number of values as the number of characters in the -	    Unicode string. -	  </para> -	  <para> -	    <function>unicode_bidi_calc</function>() calculates the -	    embedding level of each character and fills in the -	    <parameter>levels</parameter> buffer (executes all steps of the -	    bidirectional algorithm up to, and including, step L1). -	    A <literal>NULL</literal> <parameter>initial_embedding</parameter> -	    value calculates the default paragraph embedding value. -	    A pointer to a <literal>UNICODE_BIDI_LR</literal> or -	    <literal>UNICODE_BIDI_RL</literal> value explicitly sets a -	    left-to-right or right-to-left paragraph embedding value. +	    They implement the algorithm up to and including step L2, +	    and provide additional functionality of returning miscellaneous +	    bi-directional-related metadata of Unicode characters. There's +	    also a basic algorithm that <quote>reverses</quote> the +	    bi-directional algorithm +	    and produces a Unicode string with bi-directional markers that +	    results in the same bi-directional string after reapplying the +	    algorithm.  	  </para> -	  <para> -	    <function>unicode_bidi_calc</function>() calculates each -	    character's directional embedding value: an even value for -	    left-to-right text or an odd value for right-to-left text. -	    Unicode characters with an unspecified directional embedding -	    value are specified by the -	    <classname>UNICODE_BIDI_SKIP</classname> embedding level value. -	    This indicates embedding and override markers, which can be -	    removed from the string (together with this embedding value) -	    from the string and the embedding value itself). 
This can be -	    done before or after <function>unicode_bidi_reorder</function>(). -	  </para> +	  <refsect2 id="unicode_bidi_calc_reorder"> +	    <title>Calculating bi-directional rendering order</title> -	  <refsect2> -	    <title>Reordering text</title> +	    <para> +	      The following process computes the rendering order of +	      characters according to the Unicode Bi-Directional algorithm: +	    </para> + +	    <orderedlist> +	      <listitem> +		<para> +		  Allocate an array of +		  <structname>unicode_bidi_level_t</structname> that's the +		  same size as the Unicode string. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  Use <function>unicode_bidi_calc</function>() to compute +		  the Unicode string's characters' bi-directional embedding +		  level (executes the Bi-Directional algorithm up to and +		  including step L1). This populates the +		  <structname>unicode_bidi_level_t</structname> buffer. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  Use <function>unicode_bidi_reorder</function>() to reverse +		  any characters in the string, according to the +		  algorithm (step L2), with an optional +		  callback that reports which ranges of characters get +		  reversed. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  Use <function>unicode_bidi_cleanup</function>() or +		  <function>unicode_bidi_extra_cleanup</function>(), +		  to remove the characters from the string which are used +		  by the bi-directional algorithm, and are not needed for +		  rendering the text. +		</para> +	      </listitem> +	    </orderedlist> + +	    <para> +	      The parameters to +	      <function>unicode_bidi_calc</function>() are: +	    </para> + +	    <itemizedlist> +	      <listitem> +		<para> +		  A pointer to the Unicode string. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  Number of characters in the Unicode string. 
+		</para> +	      </listitem> +	      <listitem> +		<para> +		  A pointer to an array of +		  <structname>unicode_bidi_level_t</structname> values. +		  The caller is +		  responsible for allocating and deallocating this array, +		  which has the same size as the Unicode string. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  An optional pointer to a +		  <literal>UNICODE_BIDI_LR</literal> or +		  <literal>UNICODE_BIDI_RL</literal> value. This sets +		  the default paragraph direction level. +		  A null pointer computes the default paragraph direction +		  level based on the string, as specified by the "P" rules +		  of the bi-directional algorithm. +		</para> +	      </listitem> +	    </itemizedlist> + +	    <para> +	      <function>unicode_bidi_calc</function>() fills in the +	      <structname>unicode_bidi_level_t</structname> array with the +	      values corresponding to the embedding level of the +	      corresponding character, +	      according to the Unicode Bi-Directional Algorithm (even values for +	      left-to-right ordering, and odd values for right-to-left +	      ordering). +	      A value of UNICODE_BIDI_SKIP designates directional markers +	      (from step X9). +	    </para>  	    <para> -	      <function>unicode_bidi_reorder</function> takes the actual +	      <function>unicode_bidi_calc</function>() returns the resolved +	      paragraph direction level, which +	      always matches the passed in level, if specified, else it +	      reports the +	      derived one. +	    </para> + +	    <para> +	      <function>unicode_bidi_reorder</function>() takes the actual  	      unicode string together with the embedding values from  	      <function>unicode_bidi_calc</function>, then reverses the -	      bidirectional string, as specified by step L2 of the bidirectional +	      bi-directional string, as specified by step L2 of the bi-directional  	      algorithm. 
+	      The parameters to +	      <function>unicode_bidi_reorder</function>() are:  	    </para> +	    <itemizedlist> +	      <listitem> +		<para> +		  A pointer to the Unicode string. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  A pointer to an array of +		  <structname>unicode_bidi_level_t</structname> values. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  Number of characters in the Unicode string and the +		  <structname>unicode_bidi_level_t</structname> array. +		</para> +	      </listitem> +	      <listitem> +		<para> +		  An optional <varname>reorder_callback</varname> function +		  pointer. +		</para> +	      </listitem> +	    </itemizedlist>  	    <para>  	      A non-<literal>NULL</literal>  	      <parameter>reorder_callback</parameter> gets invoked to report @@ -434,13 +579,280 @@ See COPYING for distribution information.  	      invokes the <parameter>reorder_callback</parameter> as if  	      the character string, and their embedding values, were reversed.  	    </para> + +	    <para> +	      The resulting string and embedding levels are in +	      <quote>rendering order</quote>, but still contain bi-directional +	      embedding, override, boundary-neutral, isolate, and marker +	      characters. +	      <function>unicode_bidi_cleanup</function>() and +	      <function>unicode_bidi_extra_cleanup</function>() remove these +	      characters and directional markers from the unicode string. +	      <function>unicode_bidi_cleanup</function>() removes only the +	      embedding, override, and boundary-neutral characters (as +	      specified by step X9 of the bi-directional algorithm). +	      <function>unicode_bidi_extra_cleanup</function>() +	      additionally removes the isolation markers, implicit markers; +	      and all characters +	      classified as paragraph separators get replaced by a newline. 
+            </para> +	    <para> +	      A non-null pointer to the directional embedding level buffer, +	      of the same size as the string, also removes the corresponding +	      values from the buffer, and the remaining values in the +	      embedding level buffer get reset to +	      levels <literal>UNICODE_BIDI_LR</literal> and +	      <literal> UNICODE_BIDI_RL</literal>, only. +            </para> +	    <para> +	      The parameters to <function>unicode_bidi_cleanup</function>() and +	      <function>unicode_bidi_extra_cleanup</function>() are: +            </para> + +	    <itemizedlist> +	      <listitem> +		<para> +		  The pointer to the unicode string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The pointer to the directional embedding buffer. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The size of the unicode string and the directional embedding +		  buffer. +                </para> +              </listitem> +	      <listitem> +		<para> +		  A pointer to a function that gets repeatedly invoked with the +		  index of the character that gets removed from the Unicode +		  string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  An opaque pointer that gets forwarded to the callback. +                </para> +              </listitem> +            </itemizedlist> +	    <para> +	      The function pointer (if not <literal>NULL</literal>) +	      gets invoked to report the index of each +	      removed character. The reported index is the index from the +	      original string, and the callback gets invoked in strict order, +	      from the first to +	      the last removed character (if any). 
+	    </para> +	    <para> +	      Multiple calls to <function>unicode_bidi_cleanup</function>() or +	      <function>unicode_bidi_extra_cleanup</function>() do no harm; +	      except that <function>unicode_bidi_extra_cleanup</function>() +	      always removes all the additional characters that +	      <function>unicode_bidi_cleanup</function>() does not remove. +            </para> +	    <para> +	      The character string and the embedding level values resulting +	      from <function>unicode_bidi_extra_cleanup</function>() are in +	      <quote>canonical rendering order</quote>. +            </para>  	  </refsect2> -	  <refsect2> + +	  <refsect2 id="unicode_bidi_embed"> +	    <title>Embedding bi-directional markers in Unicode text strings</title> +            <para> +	      <function>unicode_bidi_logical_order</function>() and +	      <function>unicode_bidi_embed</function>() add various +	      bi-directional markers to a Unicode string in canonical rendering +	      order. The resulting string is not guaranteed to be +	      identical to the +	      original Unicode bi-directional string. The algorithm is fairly +	      basic, +	      but the resulting bi-directional string produces the same +	      canonical rendering order after applying +	      <function>unicode_bidi_calc()</function>, +	      <function>unicode_bidi_reorder()</function> and +	      <function>unicode_bidi_extra_cleanup()</function>, +	      with the same paragraph_embedding level. +            </para> + +	    <para> +	      <function>unicode_bidi_logical_order</function>() gets called +	      first, followed by +	      <function>unicode_bidi_embed</function>(). 
+	      Finally, <function>unicode_bidi_embed_paragraph_level</function>() +	      optionally determines whether the resulting string's default +	      paragraph embedding level matches the one used for the actual +	      embedding direction, and if not returns a directional marker +	      to be prepended to the Unicode character string, as a hint. +            </para> +	    <para> +	      <function>unicode_bidi_logical_order</function>() factors in the +	      characters' embedding values, and the provided paragraph +	      embedding value +	      (<literal>UNICODE_BIDI_LR</literal> or +	      <literal>UNICODE_BIDI_RL</literal>), and rearranges the characters +	      and the embedding levels in left-to-right order, while +	      simultaneously +	      invoking the supplied reorder_callback indicating each range of +	      characters whose relative order gets reversed. The +	      <function>reorder_callback</function>() receives, as +	      parameters: +            </para> +	    <itemizedlist> +	      <listitem> +		<para> +		  The starting index of the first reversed character, in the +		  string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  Number of reversed characters. +                </para> +              </listitem> +	      <listitem> +		<para> +		  Forwarded <parameter>arg</parameter> pointer value. +                </para> +              </listitem> +            </itemizedlist> +	    <para> +	      This specifies a consecutive range of characters (and +	      directional  embedding values) +	      that get reversed (first character in the range becomes the +	      last character, +	      and the last character becomes the first character). +            </para> + +	    <para> +	      After +	      <function>unicode_bidi_logical_order</function>(), +	      <function>unicode_bidi_embed</function>() progressively invokes +	      the passed-in callback with +	      the contents of a bi-directional unicode string. 
+	      The parameters to <function>unicode_bidi_embed</function>() are: +            </para> +            <itemizedlist> +	      <listitem> +		<para> +		  The Unicode string, and … +                </para> +              </listitem> +	      <listitem> +		<para> +		  … the directional embedding buffer, in canonical +		  rendering order. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The size of the string and the embedding level buffer. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The paragraph embedding level, either +		  <literal>UNICODE_BIDI_LR</literal> or +		  <literal>UNICODE_BIDI_RL</literal>. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The pointer to the callback function. +                </para> +              </listitem> +	      <listitem> +		<para> +		  An opaque pointer argument that gets forwarded to the +		  callback function. +                </para> +              </listitem> +            </itemizedlist> +	    <para> +	      The callback receives pointers to +	      various parts of the original string that gets passed to +	      <function>unicode_bidi_embed</function>(), intermixed with +	      bi-directional markers, +	      overrides, and isolates. The callback's parameters are: +            </para> + +            <itemizedlist> +	      <listitem> +		<para> +		  The pointer to a Unicode string. +                </para> +		<note> +		  <para> +		    It is not a given that the callback receives pointers +		    to progressively increasing pointers of the original +		    string that gets passed to +		    <function>unicode_bidi_embed</function>(). +		    Some calls will be for individual bi-directional +		    markers, and +		    <function>unicode_bidi_embed</function>() also +		    performs some additional internal reordering, on the fly, +		    after <function>unicode_bidi_logical_order</function>()'s +		    big hammer. 
+                  </para> +                </note> +              </listitem> +	      <listitem> +		<para> +		  Number of characters in the Unicode string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  Forwarded <parameter>arg</parameter> pointer value. +                </para> +              </listitem> +            </itemizedlist> + +	    <para> +	      The assembled unicode string should produce the same +	      canonical rendering order, for the same paragraph embedding +	      level. +	      <function>unicode_bidi_embed_paragraph_level</function>() +	      checks if the specified Unicode string computes the given +	      default paragraph embedding level and returns 0 if it matches. +	      Otherwise it returns a directional marker that should be +	      <emphasis>prepended</emphasis> to the Unicode string to allow +	      <function>unicode_bidi_calc</function>'s optional paragraph +	      embedding level pointer's value to be <literal>NULL</literal>, +	      but derive the same default embedding level. +	      The parameters to +	      <function>unicode_bidi_embed_paragraph_level</function>() are: +            </para> +            <itemizedlist> +	      <listitem> +		<para> +		  The Unicode string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The size of the string. +                </para> +              </listitem> +	      <listitem> +		<para> +		  The paragraph embedding level, either +		  <literal>UNICODE_BIDI_LR</literal> or +		  <literal>UNICODE_BIDI_RL</literal>. +                </para> +              </listitem> +	    </itemizedlist> +          </refsect2> +	  <refsect2 id="unicode_bidi_misc">  	    <title>Miscellaneous utility functions</title>  	    <para>  	      <function>unicode_bidi_type</function> -	      looks up each character's bidirectional character type. +	      looks up each character's bi-directional character type.  	    
</para>  	    <para>  	      <function>unicode_bidi_mirror</function> @@ -464,7 +876,7 @@ See COPYING for distribution information.  	    </para>  	  </refsect2>  	</refsect1> -	<refsect1> +	<refsect1 id="courier_unicode_bidi_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html">TR-9</ulink>, @@ -502,7 +914,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_canonical_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -552,7 +964,7 @@ See COPYING for distribution information.  	    equivalence.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_canonical_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>, @@ -641,7 +1053,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_category_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -783,7 +1195,7 @@ See COPYING for distribution information.  	    </varlistentry>  	  </variablelist>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_category_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -950,7 +1362,7 @@ See COPYING for distribution information.  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_convert_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -1040,7 +1452,7 @@ See COPYING for distribution information.  	  </para> -	  <refsect2> +	  <refsect2 id="unicode_convert_collect">  	    <title>Collecting converted text into a buffer</title>  	    <para> @@ -1097,7 +1509,7 @@ See COPYING for distribution information.  	    
</para>  	  </refsect2> -	  <refsect2> +	  <refsect2 id="unicode_convert_chset_unicode">  	    <title>Converting between character sets and unicode</title>  	    <para> @@ -1126,7 +1538,7 @@ See COPYING for distribution information.  	    </para>  	  </refsect2> -	  <refsect2> +	  <refsect2 id="unicode_convert_oneshot">  	    <title>One-shot conversions</title>  	    <para> @@ -1175,7 +1587,7 @@ See COPYING for distribution information.  	    </para>  	  </refsect2>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_convert_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -1220,7 +1632,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_default_chset_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    <function>unicode_default_chset</function>() returns the name of the @@ -1231,7 +1643,7 @@ See COPYING for distribution information.  	    current application locale's character set.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_default_chset_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -1316,7 +1728,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_emoji_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    <function>unicode_emoji_lookup</function>() returns the @@ -1334,7 +1746,7 @@ See COPYING for distribution information.  	    character has the corresponding property.  	  </para>          </refsect1> -	<refsect1> +	<refsect1 id="unicode_emoji_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <ulink url="https://www.unicode.org/reports/tr51/tr51-&tr51ver;.html">TR-51</ulink>, @@ -1368,7 +1780,7 @@ See COPYING for distribution information.  	    
</funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_html40_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    <function>unicode_html40ent_lookup</function>() returns the @@ -1392,7 +1804,7 @@ See COPYING for distribution information.  	    a single unicode character.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_html40_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -1448,7 +1860,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_grapheme_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -1489,7 +1901,7 @@ See COPYING for distribution information.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_grapheme_seealso">  	  <title>SEE ALSO</title>  	  <para> @@ -1600,7 +2012,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_lb_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    These functions implement the unicode line breaking algorithm. @@ -1730,7 +2142,7 @@ See COPYING for distribution information.  	    line breaking handle is no longer valid.  	  </para> -	  <refsect2> +	  <refsect2 id="unicode_lb_altcallback">  	    <title>Alternative callback function</title>  	    <para> @@ -1745,7 +2157,7 @@ See COPYING for distribution information.  	    </para>  	  </refsect2> -	  <refsect2> +	  <refsect2 id="unicode_lb_altcallback_opt">  	    <title>Options</title>  	    <para> @@ -1822,7 +2234,7 @@ See COPYING for distribution information.  	  </refsect2>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_lb_seealso">  	  <title>SEE ALSO</title>  	  <para> @@ -1859,7 +2271,7 @@ See COPYING for distribution information.  	    
</funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_script_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    <function>unicode_script</function>() looks up the @@ -1871,7 +2283,7 @@ See COPYING for distribution information.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_script_seealso">  	  <title>SEE ALSO</title>  	  <para> @@ -1949,7 +2361,7 @@ See COPYING for distribution information.  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_wb_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    These functions implement the unicode word breaking algorithm. @@ -2046,7 +2458,7 @@ See COPYING for distribution information.  	    line breaking handle is no longer valid.  	  </para> -	  <refsect2> +	  <refsect2 id="unicode_wb_scan">  	    <title>Word scan</title>  	    <para> @@ -2075,7 +2487,7 @@ See COPYING for distribution information.  	  </refsect2>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_wb_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <ulink url="https://www.unicode.org/reports/tr29/tr29-&tr29ver;.html">TR-29</ulink>, @@ -2144,7 +2556,7 @@ See COPYING for distribution information.  	    </funcprototype>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_uc_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    <function>unicode_uc</function>(), @@ -2174,7 +2586,7 @@ See COPYING for distribution information.  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_uc_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -2223,94 +2635,162 @@ See COPYING for distribution information.  	
<refnamediv>  	  <refname>unicode::bidi_calc</refname>  	  <refname>unicode::bidi_reorder</refname> -	  <refpurpose>unicode bidirectional algorithm</refpurpose> +	  <refname>unicode::bidi_cleanup</refname> +	  <refname>unicode::bidi_extra_cleanup</refname> +	  <refname>unicode::bidi_logical_order</refname> +	  <refname>unicode::bidi_embed</refname> +	  <refname>unicode::bidi_embed_paragraph_level</refname> +	  <refpurpose>unicode bi-directional algorithm</refpurpose>  	</refnamediv>  	<refsynopsisdiv>  	  <funcsynopsis>  	    <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo>  	    <funcprototype> -              <funcdef>std::vector<unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> +              <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef>  	      <paramdef>const std::u32string &<parameter>string</parameter></paramdef>  	    </funcprototype> -	  </funcsynopsis> -	  <funcsynopsis>  	    <funcprototype> -              <funcdef>std::vector<unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> +              <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef>  	      <paramdef>const std::u32string &<parameter>string</parameter></paramdef>  	      <paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef>  	    </funcprototype> -	  </funcsynopsis> -	  <funcsynopsis>  	    <funcprototype>                <funcdef>int <function>unicode::bidi_reorder</function></funcdef>  	      <paramdef>std::u32string &<parameter>string</parameter></paramdef>  	      <paramdef>std::vector<unicode_bidi_level_t> &<parameter>embedding_level</parameter></paramdef> -	      <paramdef>const std::function<void (size_t, size_t)> &<parameter>reorder_callback</parameter></paramdef> +	      <paramdef>const std::function<void (size_t, size_t) noexcept> 
&<parameter>reorder_callback</parameter></paramdef>  	    </funcprototype> -	  </funcsynopsis> -	  <funcsynopsis>  	    <funcprototype> -              <funcdef>int <function>unicode::bidi_reorder</function></funcdef> +              <funcdef>void <function>unicode::bidi_reorder</function></funcdef>  	      <paramdef>std::vector<unicode_bidi_level_t> &<parameter>embedding_level</parameter></paramdef> -	      <paramdef>const std::function<void (size_t, size_t)> &<parameter>reorder_callback</parameter></paramdef> +	      <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>reorder_callback</parameter></paramdef>  	    </funcprototype> -	  </funcsynopsis> + +	    <funcprototype> +              <funcdef>void <function>unicode::bidi_cleanup</function></funcdef> +	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>int <function>unicode::bidi_cleanup</function></funcdef> +	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> +	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>void <function>unicode::bidi_extra_cleanup</function></funcdef> +	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>int <function>unicode::bidi_extra_cleanup</function></funcdef> +	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>std::vector <unicode_bidi_level_t> 
&<parameter>levels</parameter></paramdef> +	      <paramdef>const std::function<void (size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>int <function>unicode::bidi_logical_order</function></funcdef> +	      <paramdef>std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> +	      <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +	      <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>void <function>unicode::bidi_logical_order</function></funcdef> +	      <paramdef>std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> +	      <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +	      <paramdef>const std::function<void (size_t, size_t) noexcept> &<parameter>removed_callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>int <function>unicode::bidi_embed</function></funcdef> +	      <paramdef>const std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> +	      <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +	      <paramdef>const std::function<void (size_t, const char32_t *, size_t) noexcept> &<parameter>callback</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +              <funcdef>std::u32string <function>unicode::bidi_embed</function></funcdef> +	      <paramdef>const std::u32string &<parameter>string</parameter></paramdef> +	      <paramdef>const std::vector <unicode_bidi_level_t> &<parameter>levels</parameter></paramdef> 
+	      <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +            </funcprototype> + +	    <funcprototype> +	      <funcdef>char32_t <function>unicode::bidi_embed_paragraph_level</function></funcdef> +              <paramdef>const std::u32string &<parameter>string</parameter></paramdef> +              <paramdef>unicode_bidi_level_t <parameter>paragraph_embedding</parameter></paramdef> +	    </funcprototype> +          </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_bidi_descr">  	  <title>DESCRIPTION</title>  	  <para>  	    These functions implement the C++ interface for the -	    <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-directional algorithm</ulink>. +	    <ulink url="https://www.unicode.org/reports/tr9/tr9-&tr9ver;.html"> Unicode Bi-Directional algorithm</ulink>.  	    See the description of the underlying  	    <link linkend="unicode_bidi">  	      <citerefentry><refentrytitle>unicode_bidi</refentrytitle>  	      <manvolnum>3</manvolnum></citerefentry></link> C library -	      API for more information. +	      API for more information. C++ specific notes:  	  </para> -	  <para> -            <function>unicode::bidi_calc</function> computes and return a vector -	    of bidirection embedding level values for the given Unicode string. -	    An overload takes an additional parameter that override the -	    paragraph embedding level, a <literal>UNICODE_BIDI_LR</literal> or -            an <literal>UNICODE_BIDI_RL</literal> value. -          </para> -	  <para> -            <function>unicode::bidi_reorder</function> reverses the characters -	    in the Unicode script, according to their embedding levels (and -	    reverses the corresponding embedding level values too). 
-	    As is with the C API, an optional parameter is a callable object -	    that gets invoked to report each range of characters that gets -	    reversed (specified as the starting position and a number of -	    characters). -          </para> -	  <para> -	    An overloaded <function>unicode::bidi_reorder</function> without -	    the string parameter goes through the motions, according to the -	    embedded level vector parameter, but without actually reversing -	    the values in the vector, but still invoking the callable object -	    normally. -          </para> -	  <para> -	    This is comparable to the C API. Also comparable with the C API: -	    the convention that even embedding levels specify left to right -	    text and odd embedding values specify right to left text. -	    An embedding value of <literal>UNICODE_BIDI_SKIP</literal> -	    indicates an embedding or an override marker that has no -	    specified embeded value. These markers may be removed from the -	    Unicode string (together with the -	    <literal>UNICODE_BIDI_SKIP</literal> -	    values from the embedding values vector) either before or after -	    they get reordered. -	  </para> +	  <itemizedlist> +	    <listitem> +	      <para> +                <function>unicode::bidi_calc</function> returns the +		directional embedding value buffer and the paragraph +		embedding level. +              </para> +            </listitem> +	    <listitem> +	      <para> +		Several C functions provide a <quote>dry-run</quote> mode +		by passing a <literal>NULL</literal> pointer. The C++ API +		provides separate overloads, with and without the nullable +		parameter. +              </para> +            </listitem> +	    <listitem> +	      <para> +		Several C functions accept a nullable function pointer, with +		the <literal>NULL</literal> function pointer specifying no +		callback. The C++ functions have a +		<classname>std::function</classname> parameter with a +		default do-nothing closure. 
+              </para> +            </listitem> + +	    <listitem> +	      <para> +		Several C functions accept two parameters, a Unicode character +		pointer and the embedding level buffer, and a single parameter +		that specifies the size of both. +		The equivalent C++ function takes two discrete parameters, +		a <classname>std::u32string</classname> and a +		<classname>std::vector</classname> and returns an +		<classname>int</classname>; a negative value if their sizes +		differ, and 0 if their sizes match, and the requested function +		completes. The <function>unicode::bidi_embed</function> overload +		that returns a <classname>std::u32string</classname> returns +		an empty string in case of a mismatch. +              </para> +            </listitem> +          </itemizedlist>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_bidi_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -2389,7 +2869,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -2447,7 +2927,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -2505,7 +2985,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_tocase_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -2537,7 +3017,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_tocase_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -2602,7 +3082,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </funcsynopsis>  	
</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_fromu_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -2634,7 +3114,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_fromu_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -2698,7 +3178,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_tou_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -2733,7 +3213,7 @@ extern const char unicode::iso_8859_1[];</funcsynopsisinfo>  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_convert_tou_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -2846,7 +3326,7 @@ std::vector<std::pair<int, char32_t>> linebreaks;  std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int>>(linebreaks));</programlisting>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_lb_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -2941,7 +3421,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int>  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_lb_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -3012,7 +3492,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int>  	  </funcsynopsis>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 id="unicode_cpp_tolower_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -3040,7 +3520,7 @@ std::copy(beg_iter, end_iter, std::back_insert_iterator<std::vector<int>  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_tolower_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> @@ -3104,7 +3584,7 @@ size_t nchars=scan.finish();  </programlisting>  	</refsynopsisdiv> -	<refsect1> +	<refsect1 
id="unicode_cpp_wb_descr">  	  <title>DESCRIPTION</title>  	  <para> @@ -3168,7 +3648,7 @@ size_t nchars=scan.finish();  	  </para>  	</refsect1> -	<refsect1> +	<refsect1 id="unicode_cpp_wb_seealso">  	  <title>SEE ALSO</title>  	  <para>  	    <link linkend="courier-unicode"> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index c8161ea..f6b4b8c 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -536,65 +536,6 @@ int unicode_wbscan_next(unicode_wbscan_info_t i, char32_t ch);  size_t unicode_wbscan_end(unicode_wbscan_info_t i); -/* -** Unicode Bidirectional bracket and mirroring lookup -** -** http://www.unicode.org/reports/tr9/tr9-42.html -** -** unicode_bidi_mirror() returns the Bidi_Mirroring_Glyph property. -** -** If there is no mirroring glyph for the given character, returns the -** same character. -** -** unicode_bidi_bracket_type() looks up the Bidi_Paired_Bracket and -** Bidi_Paired_Bracket_Type properties. -** -** unicode_bidi_bracket_type() returns the Bidi_Paired_Bracket property -** value. If the ret parameter is not a null pointer, the pointed-to -** value is set to Bidi_Paired_Bracket_Type value, one of the UNICODE_BIDI -** values. -** -** unicode_bidi_bracket_type() returns the same character and -** UNICODE_BIDI_n if the given character does not have these properties. -** -** unicode_bidi_type() looks up the bidirectional character type of the -** given Unicode character. -** -** unicode_bidi_calc() implements the Unicode Bidirectional Algorithm up to -** step L1. -** -** Parameters: -** -** - A pointer to char32_t, the Unicode string. -** -** - Number of characters in the char32_t string -** -** - A pointer to an array of unicode_bidi_level_t values. The caller is -** responsible for allocating and deallocating this array, which has the -** same size as the Unicode string (the second parameter). -** -** - An optional pointer to a unicode_bidi_level_t value, or a null pointer. 
-** A pointer to UNICODE_BIDI_LR or UNICODE_BIDI_RL sets the default paragraph -** direction level. A null pointer calculates the default paragraph direction -** level based on the string, as specified by the "P" rules in the algorithm. -** -** unicode_bidi_calc() fills in the unicode_bidi_level_t array with the -** values corresponding to the embedding level of the corresponding character, -** as specified in the Unicode Bidirection Algorithm (even for left-to-right, -** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates -** directional markers (from step X9). These characters should be removed -** before using unicode_bidi_reorder(). -** -** unicode_bidi_calc() returns the resolved paragraph direction level, which -** always matches the passed in level, if specified, else it reports the -** derived one. -** -** unicode_bidi_reorder() reorders the characters according to the resolved -** embedding levels. A non-null reorder_callback gets invoked repeatedly, -** indicating the starting index and the number of characters reversed, so -** that any related metadata can be updated accordingly. 
-*/ -  typedef char unicode_bidi_bracket_type_t;  #define UNICODE_BIDI_n  'n' @@ -654,6 +595,40 @@ typedef enum {  extern enum_bidi_type_t unicode_bidi_type(char32_t c); +extern size_t unicode_bidi_cleanup(char32_t *string, +				   unicode_bidi_level_t *levels, +				   size_t n, +				   void (*removed_callback)(size_t, void *), +				   void *); + +extern size_t unicode_bidi_extra_cleanup(char32_t *string, +					 unicode_bidi_level_t *levels, +					 size_t n, +					 void (*removed_callback)(size_t, +								  void *), +					 void *); + +extern void unicode_bidi_logical_order(char32_t *string, +				       unicode_bidi_level_t *levels, +				       size_t n, +				       unicode_bidi_level_t paragraph_embedding, +				       void (*reorder_callback)(size_t, size_t, +								void *), +				       void *arg); + +extern void unicode_bidi_embed(const char32_t *string, +			       const unicode_bidi_level_t *levels, +			       size_t n, +			       unicode_bidi_level_t paragraph_embedding, +			       void (*emit)(const char32_t *string, +					    size_t n, +					    void *arg), +			       void *arg); + +extern char32_t unicode_bidi_embed_paragraph_level(const char32_t *str, +						   size_t n, +						   unicode_bidi_level_t); +  /*  ** unicode_canonical() returns the canonical mapping of the given Unicode  ** character. The returned structure specifies: @@ -2117,24 +2092,124 @@ std::u32string tolower(const std::u32string &u);  std::u32string toupper(const std::u32string &u);  //! Calculate bidirectional embedding levels + +//! Returns the bidirectional embedding levels, and the paragraph +//! embedding level. +  std::tuple<std::vector<unicode_bidi_level_t>,  	   unicode_bidi_level_t> bidi_calc(const std::u32string &s);  //! Calculate bidirectional embedding levels + +//! Overload calculates the embedding levels using a predetermined +//! paragraph embedding level. +//! +//! Returns the bidirectional embedding levels, and the same paragraph +//! embedding level. 
+  std::tuple<std::vector<unicode_bidi_level_t>,  	   unicode_bidi_level_t> bidi_calc(const std::u32string &s,  					   unicode_bidi_level_t level);  //! Reorder bidirectional text + +//! Reorders the string and levels in place. +//! +//! Non-0 return value indicates the string and levels' sizes do not match. +  int bidi_reorder(std::u32string &string,  		 std::vector<unicode_bidi_level_t> &levels, -		 const std::function<void (size_t, size_t)> &reorder_callback= -		 [](size_t, size_t){}); +		 const std::function<void (size_t, size_t) noexcept> +		 &reorder_callback=[](size_t, size_t) noexcept{}); -//! Reorder bidirectional text +//! Dry-run reorder bidirectional text  void bidi_reorder(std::vector<unicode_bidi_level_t> &levels, -		  const std::function<void (size_t, size_t)> &reorder_callback= -		  [](size_t, size_t){}); +		  const std::function<void (size_t, size_t) noexcept> +		  &reorder_callback=[](size_t, size_t) noexcept{}); + +//! Remove directional markers + +//! Removes them from the string, in place. Optional lambda gets notified +//! of the index (in the original string) of each removed marker. + +void bidi_cleanup(std::u32string &string, +		  const std::function<void (size_t) noexcept> &removed_callback= +		  [](size_t) noexcept {}); + +//! Also remove them from the embedding direction level buffer. + +//! Returns non-0 in case of non-matching level buffer size. + +int bidi_cleanup(std::u32string &string, +		 std::vector<unicode_bidi_level_t> &levels, +		 const std::function<void (size_t) noexcept> &removed_callback= +		  [](size_t) noexcept {}); + + +//! Remove directional markers and isolation markers. + +//! Removes them from the string, in place. Optional lambda gets notified +//! of the index (in the original string) of each removed marker. + +void bidi_extra_cleanup(std::u32string &string, +			const std::function<void (size_t) noexcept> +			&removed_callback= +			[](size_t) noexcept {}); + +//! 
Also remove them from the embedding direction level buffer. + +//! Returns non-0 in case of non-matching level buffer size. + +int bidi_extra_cleanup(std::u32string &string, +		       std::vector<unicode_bidi_level_t> &levels, +		       const std::function<void (size_t) noexcept> +		       &removed_callback= +		       [](size_t) noexcept {}); + +//! Convert Unicode string from canonical rendering order to logical order. +int bidi_logical_order(std::u32string &string, +		       std::vector<unicode_bidi_level_t> &levels, +		       unicode_bidi_level_t paragraph_embedding, +		       const std::function<void (size_t, size_t) noexcept> +		       &lambda=[](size_t,size_t){}); + +//! Convert Unicode string from canonical rendering order to logical order. +void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels, +			unicode_bidi_level_t paragraph_embedding, +			const std::function<void (size_t, size_t) noexcept> +			&lambda); + +//! Embed directional and isolation markers + +//! Non-0 return value indicates the string and levels' sizes do not match. +//! +//! The lambda gets called repeatedly, to specify the contents of the +//! string with embedded direction markers. + +int bidi_embed(const std::u32string &string, +	       const std::vector<unicode_bidi_level_t> &levels, +	       unicode_bidi_level_t paragraph_embedding, +	       const std::function<void (const char32_t *string, +					 size_t n) noexcept> &lambda); + +//! Embed directional and isolation markers + +//! \overload +//! +//! Provides a lambda that collects the new string, and returns it. An +//! empty string gets returned if the string and levels' sizes do not match. + +std::u32string bidi_embed(const std::u32string &string, +			  const std::vector<unicode_bidi_level_t> &levels, +			  unicode_bidi_level_t paragraph_embedding); + +//! Check if a directional marker needs to be inserted + +//! In order for the unicode string to have the specified default +//! paragraph embedding level. 
+ +extern char32_t bidi_embed_paragraph_level(const std::u32string &string, +					   unicode_bidi_level_t level);  #if 0  { diff --git a/unicode/docbook/book.css b/unicode/docbook/book.css index d1420cd..a133e82 100644 --- a/unicode/docbook/book.css +++ b/unicode/docbook/book.css @@ -44,7 +44,7 @@ code.computeroutput div.literallayout {      font-weight: bold;  } -.command, .acronym, .symbol { +.command, .acronym, .symbol, .structname {      font-family: "liberation mono", "courier new", monospace;      background-color: #eeeeee;  } diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index 055ee89..a35e9b5 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -112,6 +112,17 @@ typedef enum {  	 (c) == UNICODE_BIDI_TYPE_LRO ||		\  	 (c) == UNICODE_BIDI_TYPE_RLO) +#define is_explicit_indicator_except_b(c)	\ +	( is_isolate_initiator(c) ||		\ +	  is_embedding_initiator(c) ||		\ +	  (c) == UNICODE_BIDI_TYPE_BN ||        \ +	  (c) == UNICODE_BIDI_TYPE_PDF ||       \ +	  (c) == UNICODE_BIDI_TYPE_PDI) + +#define is_explicit_indicator(c)               \ +	( is_explicit_indicator_except_b(c) || \ +	  (c) == UNICODE_BIDI_TYPE_B) +  /* BD13 implementation */  /* A level run, specified as indexes */ @@ -529,6 +540,8 @@ static void directional_status_stack_push  		(struct directional_status_stack_entry *)  		malloc(sizeof(struct directional_status_stack_entry)); +	if (!p) +		abort();  #ifdef BIDI_DEBUG  	fprintf(DEBUGDUMP, "BIDI: Push level %d, override: %s, isolate: %s\n",  		(int)embedding_level, @@ -548,16 +561,21 @@ static void directional_status_stack_push  }  static unicode_bidi_level_t -compute_paragraph_embedding_level(const enum_bidi_type_t *p, -				  size_t i, size_t j) +compute_paragraph_embedding_level(size_t i, size_t j, +				  enum_bidi_type_t (*get)(size_t i, +							  void *arg), +				  void *arg) +  {  	unicode_bidi_level_t in_isolation=0;  	for (; i<j; ++i)  	{ -		if (is_isolate_initiator(p[i])) +		enum_bidi_type_t t=get(i, arg); + +		if 
(is_isolate_initiator(t))  			++in_isolation; -		else if (p[i] == UNICODE_BIDI_TYPE_PDI) +		else if (t == UNICODE_BIDI_TYPE_PDI)  		{  			if (in_isolation)  				--in_isolation; @@ -565,16 +583,43 @@ compute_paragraph_embedding_level(const enum_bidi_type_t *p,  		if (in_isolation == 0)  		{ -			if (p[i] == UNICODE_BIDI_TYPE_AL || -			    p[i] == UNICODE_BIDI_TYPE_R) +			if (t == UNICODE_BIDI_TYPE_AL || +			    t == UNICODE_BIDI_TYPE_R)  			{ -				return 1; +				return UNICODE_BIDI_RL;  			} -			if (p[i] == UNICODE_BIDI_TYPE_L) +			if (t == UNICODE_BIDI_TYPE_L)  				break;  		}  	} -	return 0; +	return UNICODE_BIDI_LR; +} + +struct compute_paragraph_embedding_level_type_info { +	const enum_bidi_type_t *p; +}; + +static enum_bidi_type_t +get_enum_bidi_type_for_paragraph_embedding_level(size_t i, +						 void *arg) +{ +	struct compute_paragraph_embedding_level_type_info *p= +		(struct compute_paragraph_embedding_level_type_info *)arg; + +	return p->p[i]; +} + +static unicode_bidi_level_t +compute_paragraph_embedding_level_from_types(const enum_bidi_type_t *p, +					     size_t i, size_t j) +{ +	struct compute_paragraph_embedding_level_type_info info; +	info.p=p; + +	return compute_paragraph_embedding_level +		(i, j, +		 get_enum_bidi_type_for_paragraph_embedding_level, +		 &info);  }  static directional_status_stack_t @@ -591,7 +636,7 @@ directional_status_stack_init(const char32_t *chars,  	stack->paragraph_embedding_level=  		initial_embedding_level  		? 
*initial_embedding_level & 1 -		: compute_paragraph_embedding_level(classes, 0, n); +		: compute_paragraph_embedding_level_from_types(classes, 0, n);  	stack->chars=chars;  	stack->classes=classes; @@ -676,6 +721,8 @@ unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,  	enum_bidi_type_t *buf=  		(enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); +	if (!buf) +		abort();  	for (size_t i=0; i<n; ++i)  	{  		buf[i]=unicode_bidi_type(p[i]); @@ -732,7 +779,7 @@ unicode_bidi_b(const char32_t *p,  		}							\  	} while(0) -static void unicode_bidi_w(directional_status_stack_t stack, +static void unicode_bidi_w(enum_bidi_type_t *classes,  			   struct isolating_run_sequence_s *seq);  static void unicode_bidi_n(directional_status_stack_t stack,  			   struct isolating_run_sequence_s *seq); @@ -900,7 +947,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack)  				}  			} -			cur_class=compute_paragraph_embedding_level +			cur_class=compute_paragraph_embedding_level_from_types  				(stack->classes, i+1, j) == 1  				? 
UNICODE_BIDI_TYPE_RLI  				: UNICODE_BIDI_TYPE_LRI; @@ -955,24 +1002,11 @@ static void unicode_bidi_cl(directional_status_stack_t stack)  			break;  		} -		switch (stack->orig_classes[i]) { -		case UNICODE_BIDI_TYPE_BN: -		case UNICODE_BIDI_TYPE_B: -		case UNICODE_BIDI_TYPE_RLE: -		case UNICODE_BIDI_TYPE_LRE: -		case UNICODE_BIDI_TYPE_RLO: -		case UNICODE_BIDI_TYPE_LRO: -		case UNICODE_BIDI_TYPE_PDF: -		case UNICODE_BIDI_TYPE_RLI: -		case UNICODE_BIDI_TYPE_LRI: -		case UNICODE_BIDI_TYPE_FSI: -		case UNICODE_BIDI_TYPE_PDI: -			break; -		default: +		if (!is_explicit_indicator(stack->orig_classes[i])) +		{  			/* X6 */  			stack->levels[i]=stack->head->embedding_level;  			RESET_CLASS(stack->classes[i],stack); -			break;  		}  		if (stack->classes[i] == UNICODE_BIDI_TYPE_PDI) @@ -1210,7 +1244,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack)  		dump_sequence("Contents before W", stack, p);  #endif -		unicode_bidi_w(stack, p); +		unicode_bidi_w(stack->classes, p);  #ifdef BIDI_DEBUG  		dump_sequence("Contents after W", stack, p); @@ -1258,7 +1292,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack)  	}  } -static void unicode_bidi_w(directional_status_stack_t stack, +static void unicode_bidi_w(enum_bidi_type_t *classes,  			   struct isolating_run_sequence_s *seq)  {  	irs_iterator iter=irs_begin(seq), end=irs_end(seq); @@ -1268,10 +1302,10 @@ static void unicode_bidi_w(directional_status_stack_t stack,  	while (irs_compare(&iter, &end))  	{ -		if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_NSM) +		if (classes[iter.i] == UNICODE_BIDI_TYPE_NSM)  		{  			/* W1 */ -			stack->classes[iter.i] = +			classes[iter.i] =  				is_isolate_initiator(previous_type) ||  				previous_type == UNICODE_BIDI_TYPE_PDI  				? 
UNICODE_BIDI_TYPE_ON @@ -1281,14 +1315,14 @@ static void unicode_bidi_w(directional_status_stack_t stack,  		/* W2 */ -		if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_EN && +		if (classes[iter.i] == UNICODE_BIDI_TYPE_EN &&  		    strong_type == UNICODE_BIDI_TYPE_AL)  		{ -			stack->classes[iter.i] = UNICODE_BIDI_TYPE_AN; +			classes[iter.i] = UNICODE_BIDI_TYPE_AN;  		}  		/* W2 */ -		previous_type=stack->classes[iter.i]; +		previous_type=classes[iter.i];  		switch (previous_type) {  		case UNICODE_BIDI_TYPE_R: @@ -1312,12 +1346,12 @@ static void unicode_bidi_w(directional_status_stack_t stack,  	while (not_eol)  	{  		/* W3 */ -		if (stack->classes[iter.i] == UNICODE_BIDI_TYPE_AL) -			stack->classes[iter.i] = UNICODE_BIDI_TYPE_R; +		if (classes[iter.i] == UNICODE_BIDI_TYPE_AL) +			classes[iter.i] = UNICODE_BIDI_TYPE_R;  		/* W4 */ -		enum_bidi_type_t this_type=stack->classes[iter.i]; +		enum_bidi_type_t this_type=classes[iter.i];  		irs_incr(&iter);  		not_eol=irs_compare(&iter, &end); @@ -1332,13 +1366,13 @@ static void unicode_bidi_w(directional_status_stack_t stack,  		       previous_type == UNICODE_BIDI_TYPE_AN)  		      )  		     ) && -		    stack->classes[iter.i] == previous_type) +		    classes[iter.i] == previous_type)  		{  			irs_iterator prev=iter;  			irs_decr(&prev); -			stack->classes[prev.i]=previous_type; +			classes[prev.i]=previous_type;  		}  		if (not_eol) @@ -1353,9 +1387,9 @@ static void unicode_bidi_w(directional_status_stack_t stack,  	while (irs_compare(&iter, &end))  	{ -		if (stack->classes[iter.i] != UNICODE_BIDI_TYPE_ET) +		if (classes[iter.i] != UNICODE_BIDI_TYPE_ET)  		{ -			previous_type=stack->classes[iter.i]; +			previous_type=classes[iter.i];  			irs_incr(&iter);  			continue;  		} @@ -1363,7 +1397,7 @@ static void unicode_bidi_w(directional_status_stack_t stack,  		/* ET after EN */  		if (previous_type == UNICODE_BIDI_TYPE_EN)  		{ -			stack->classes[iter.i] = UNICODE_BIDI_TYPE_EN; +			classes[iter.i] = UNICODE_BIDI_TYPE_EN;  
			irs_incr(&iter);  			continue;  		} @@ -1374,7 +1408,7 @@ static void unicode_bidi_w(directional_status_stack_t stack,  		while (irs_incr(&iter), irs_compare(&iter, &end))  		{ -			previous_type=stack->classes[iter.i]; +			previous_type=classes[iter.i];  			if (previous_type == UNICODE_BIDI_TYPE_ET)  				continue; @@ -1383,7 +1417,7 @@ static void unicode_bidi_w(directional_status_stack_t stack,  			{  				while (irs_compare(&start, &iter))  				{ -					stack->classes[start.i]= +					classes[start.i]=  						UNICODE_BIDI_TYPE_EN;  					irs_incr(&start);  				} @@ -1397,12 +1431,12 @@ static void unicode_bidi_w(directional_status_stack_t stack,  	for (iter=irs_begin(seq);  	     irs_compare(&iter, &end); irs_incr(&iter))  	{ -		switch (stack->classes[iter.i]) { +		switch (classes[iter.i]) {  		case UNICODE_BIDI_TYPE_ET:  		case UNICODE_BIDI_TYPE_ES:  		case UNICODE_BIDI_TYPE_CS:  			/* W6 */ -			stack->classes[iter.i]=UNICODE_BIDI_TYPE_ON; +			classes[iter.i]=UNICODE_BIDI_TYPE_ON;  			break;  		default:  			break; @@ -1416,14 +1450,14 @@ static void unicode_bidi_w(directional_status_stack_t stack,  	while (irs_compare(&iter, &end))  	{ -		switch (stack->classes[iter.i]) { +		switch (classes[iter.i]) {  		case UNICODE_BIDI_TYPE_L:  		case UNICODE_BIDI_TYPE_R: -			previous_type=stack->classes[iter.i]; +			previous_type=classes[iter.i];  			break;  		case UNICODE_BIDI_TYPE_EN:  			if (previous_type == UNICODE_BIDI_TYPE_L) -				stack->classes[iter.i]=previous_type; +				classes[iter.i]=previous_type;  			break;  		default:  			break; @@ -1573,13 +1607,13 @@ static void unicode_bidi_n(directional_status_stack_t stack,  		ADJUST_EOCLASS(eoclass); -#define E_CLASS (seq->embedding_level & 1 ?			\ -		 UNICODE_BIDI_TYPE_R:UNICODE_BIDI_TYPE_L) +#define E_CLASS(level) ((level) & 1 ?					\ +			UNICODE_BIDI_TYPE_R:UNICODE_BIDI_TYPE_L) -#define O_CLASS (seq->embedding_level & 1 ?			\ -		 UNICODE_BIDI_TYPE_L:UNICODE_BIDI_TYPE_R) +#define O_CLASS(level) ((level) & 1 ?					
\ +			UNICODE_BIDI_TYPE_L:UNICODE_BIDI_TYPE_R) -		if (eoclass == E_CLASS) +		if (eoclass == E_CLASS(seq->embedding_level))  		{  #ifdef BIDI_DEBUG  			if (stackp) @@ -1599,7 +1633,7 @@ static void unicode_bidi_n(directional_status_stack_t stack,  			for (size_t i=0; i<stackp; ++i)  				stack_iters[i]->has_e=1;  		} -		else if (eoclass == O_CLASS) +		else if (eoclass == O_CLASS(seq->embedding_level))  		{  #ifdef BIDI_DEBUG  			if (stackp) @@ -1636,8 +1670,8 @@ static void unicode_bidi_n(directional_status_stack_t stack,  				"Brackets: %d and %d: e=%s, o=%s",  				(int)p->start.i,  				(int)p->end.i, -				bidi_classname(E_CLASS), -				bidi_classname(O_CLASS)); +				bidi_classname(E_CLASS(seq->embedding_level)), +				bidi_classname(O_CLASS(seq->embedding_level)));  			fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n",  				p->has_e, @@ -1879,6 +1913,37 @@ static void level_run_layers_add(struct level_run_layers *p)  	level_runs_init(p->lruns + (p->n_lruns++));  } +static void reverse_str(char32_t *p, +			unicode_bidi_level_t *levels, +			size_t start, +			size_t end, +			void (*reorder_callback)(size_t, size_t, void *), +			void *arg) +{ +	size_t right=end; +	size_t left=start; + +	while (right > left) +	{ +		--right; + +		if (p) +		{ +			char32_t c=p[left]; +			unicode_bidi_level_t l=levels[left]; + +			p[left]=p[right]; +			levels[left]=levels[right]; +			p[right]=c; +			levels[right]=l; +		} +		++left; +	} + +	if (end-start > 1 && reorder_callback) +		(*reorder_callback)(start, end-start, arg); +} +  void unicode_bidi_reorder(char32_t *p,  			  unicode_bidi_level_t *levels,  			  size_t n, @@ -1887,6 +1952,15 @@ void unicode_bidi_reorder(char32_t *p,  {  	/* L2 */ +#ifdef BIDI_DEBUG +	fprintf(DEBUGDUMP, "Before L2:"); +	for (size_t i=0; i<n; ++i) +		fprintf(DEBUGDUMP, " %04x/%d", +			(unsigned)p[i], +			(int)levels[i]); +	fprintf(DEBUGDUMP, "\n"); +#endif +  	struct level_run_layers layers;  	unicode_bidi_level_t previous_level=0; @@ -1920,39 +1994,738 @@ void 
unicode_bidi_reorder(char32_t *p,  			}  		}  	} - +#ifdef BIDI_DEBUG +	fprintf(DEBUGDUMP, "L2:\n"); +#endif  	for (size_t i=layers.n_lruns; i; )  	{  		struct level_runs *runs=layers.lruns+ --i; +#ifdef BIDI_DEBUG +		if (runs->n_level_runs) +			fprintf(DEBUGDUMP, "Reverse %d:", +				(int)i); +#endif +  		for (size_t j=0; j<runs->n_level_runs; ++j)  		{  			size_t start=runs->runs[j].start;  			size_t end=runs->runs[j].end; -			size_t right=end; -			size_t left=start; +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, " %d-%d", +				(int)start, (int)end-1); +#endif -			while (right > left) +			reverse_str(p, levels, start, end, +				    reorder_callback, arg); +		} + +#ifdef BIDI_DEBUG +		if (runs->n_level_runs) +			fprintf(DEBUGDUMP, "\n"); +#endif +	} + +	level_run_layers_deinit(&layers); +} + +#define LRM	0x200E +#define RLM	0x200F +#define ALM	0x061C + +size_t unicode_bidi_cleanup(char32_t *string, +			    unicode_bidi_level_t *levels, +			    size_t n, +			    void (*removed_callback)(size_t, void *), +			    void *arg) +{ +	size_t i=0; +	for (size_t j=0; j<n; ++j) +	{ +		enum_bidi_type_t cl=unicode_bidi_type(string[j]); + +		if (IS_X9(cl)) +		{ +			if (removed_callback) +				(*removed_callback)(j, arg); +			continue; +		} +		if (levels) +			levels[i]=levels[j] & 1; +		++i; +	} +	return i; +} + +size_t unicode_bidi_extra_cleanup(char32_t *string, +				  unicode_bidi_level_t *levels, +				  size_t n, +				  void (*removed_callback)(size_t, void *), +				  void *arg) +{ +	size_t i=0; +	for (size_t j=0; j<n; ++j) +	{ +		enum_bidi_type_t cl=unicode_bidi_type(string[j]); + +		if (is_explicit_indicator_except_b(cl) || +		    (string[j] == LRM || +		     string[j] == RLM || +		     string[j] == ALM)) +		{ +			if (removed_callback) +				(*removed_callback)(j, arg); +			continue; +		} +		string[i]=cl == UNICODE_BIDI_TYPE_B ? 
'\n' : string[j]; +		if (levels) +			levels[i]=levels[j] & 1; +		++i; +	} +	return i; +} + +void unicode_bidi_logical_order(char32_t *string, +				unicode_bidi_level_t *levels, +				size_t n, +				unicode_bidi_level_t paragraph_embedding, +				void (*reorder_callback)(size_t, size_t, +							 void *), +				void *arg) +{ +	size_t i=0; + +	// On this pass: +	// +	// When paragraph_embedding is 0, we reverse odd embedding levels. +	// When paragraph_embedding is 1, we reverse even embedding levels. + +#define LOGICAL_FLIP(n) ( ((n) ^ paragraph_embedding) & 1) + +	while (i<n) +	{ +		if ( !LOGICAL_FLIP(levels[i])) +		{ +			++i; +			continue; +		} + +		size_t j=i; + +		while (i<n) +		{ +			if (!LOGICAL_FLIP(levels[i])) +				break; +			++i; +		} + +		reverse_str(string, levels, j, i, +			    reorder_callback, arg); +	} + +	if (paragraph_embedding & 1) +		reverse_str(string, levels, 0, n, reorder_callback, arg); +} + +/* +** Track consecutive sequences of characters with the same embedding level. +** +** Linked list create in compute_bidi_embed_levelruns(). 
+*/ + +struct bidi_embed_levelrun { +	struct bidi_embed_levelrun *next; +	size_t start; +	size_t end; +	unicode_bidi_level_t level; +}; + +static struct bidi_embed_levelrun ** +record_bidi_embed_levelrun(struct bidi_embed_levelrun **tailp, +			   size_t start, +			   size_t end, +			   unicode_bidi_level_t level) +{ +	struct bidi_embed_levelrun *p; + +	p=(struct bidi_embed_levelrun *)calloc(1, sizeof(*p)); +	if (!p) +		abort(); + +	p->start=start; +	p->end=end; +	p->level=level; + +	if (*tailp) +	{ +		(*tailp)->next=p; +		return &(*tailp)->next; +	} +	else +	{ +		*tailp=p; +		return tailp; +	} +} + +static void compute_bidi_embed_levelruns(const char32_t *string, +					 const unicode_bidi_level_t *levels, +					 size_t n, +					 struct bidi_embed_levelrun **tailp) +{ +	size_t i=0; + +	while (i<n) +	{ +		size_t j=i; + +		while (++i < n) +		{ +			if ((levels[i] & 1) != (levels[j] & 1)) +				break; +		} +		tailp=record_bidi_embed_levelrun(tailp, j, i, +						 levels[j] & 1); +	} +} + +#define RLI 0x2067 +#define LRI 0x2066 +#define RLO 0x202e +#define LRO 0x202d +#define PDF 0x202c +#define PDI 0x2069 + +/* +** Whether a directional marker and a PDI is required to be generated after +** some subset of characters. 
+*/ + +struct need_marker_info { +	int need_marker; +	int need_pdi; +}; + +static void need_marker_info_init(struct need_marker_info *info) +{ +	info->need_marker=0; +	info->need_pdi=0; +} + +static void need_marker_info_merge(struct need_marker_info *info, +				   const struct need_marker_info *other_info) +{ +	if (other_info->need_marker) +		info->need_marker=1; +	if (other_info->need_pdi) +		info->need_pdi=1; +} + +static void emit_bidi_embed_levelrun(const char32_t *string, +				     enum_bidi_type_t *classes, +				     struct bidi_embed_levelrun *run, +				     unicode_bidi_level_t paragraph_level, +				     unicode_bidi_level_t previous_level, +				     unicode_bidi_level_t next_level, +				     struct need_marker_info *need_marker, +				     void (*emit)(const char32_t *string, +						  size_t n, +						  void *arg), +				     void *arg); + +/* L1 */ + +static int is_l1_on_or_after(const enum_bidi_type_t *classes, +			     size_t n, +			     size_t i, +			     int atend) +{ +	/* +	** Determine if rule L1 will apply starting at the given position. +	*/ +	while (i<n) +	{ +		enum_bidi_type_t t=classes[i]; + +		if (t == UNICODE_BIDI_TYPE_WS) +		{ +			++i; +			continue; +		} + +		if (t == UNICODE_BIDI_TYPE_S || +		    t == UNICODE_BIDI_TYPE_B) +			return 1; +		return 0; +	} +	return atend; +} + +static void emit_marker(struct bidi_embed_levelrun *p, +			struct need_marker_info *info, +			void (*emit)(const char32_t *string, +				     size_t n, +				     void *arg), +			void *arg) +{ +	char32_t marker= (p->level & 1) ? 
RLM:LRM; + +	if (info->need_marker) +		(*emit)(&marker, 1, arg); + +	if (info->need_pdi) +	{ +		marker=PDI; +		(*emit)(&marker, 1, arg); +	} +} + +void unicode_bidi_embed(const char32_t *string, +			const unicode_bidi_level_t *levels, +			size_t n, +			unicode_bidi_level_t paragraph_level, +			void (*emit)(const char32_t *string, +				     size_t n, +				     void *arg), +			void *arg) +{ +	struct bidi_embed_levelrun *runs=0; +	enum_bidi_type_t *classes= +		(enum_bidi_type_t *)calloc(n, sizeof(enum_bidi_type_t)); + +	if (!classes) +		abort(); + +	for (size_t i=0; i<n; ++i) +		classes[i]=unicode_bidi_type(string[i]); + +	compute_bidi_embed_levelruns(string, levels, +				     n, +				     &runs); + +	/* +	** Go through the sequences of consecutive characters with the +	** same embedding level. Keep track of the preceding and the +	** next embedding level, which is usually the opposite from the +	** current sequence's embedding level. Except that the first and +	** the last sequence of characters, in the string, are bound to +	** the paragraph_level, which may be the same. +	*/ + +	unicode_bidi_level_t previous_level=paragraph_level; + +	while (runs) +	{ +		struct bidi_embed_levelrun *p=runs; + +		runs=runs->next; + +		unicode_bidi_level_t next_level=paragraph_level; + +		if (runs) +			next_level=runs->level; + +#ifdef BIDI_DEBUG +		fprintf(DEBUGDUMP, "  Range %d-%d, level %d\n", +			(int)p->start, (int)(p->end-1), p->level); +#endif + +		if (((p->level ^ paragraph_level) & 1) == 0) +		{ +			/* +			** Sequence in the same direction as the paragraph +			** embedding level. +			** +			** We'll definitely need a directional marker if +			** rule L1 applies after this sequence. 
+			*/ + +			struct need_marker_info need_marker; + +			need_marker_info_init(&need_marker); + +			if (classes[p->end-1] == UNICODE_BIDI_TYPE_WS) +			{ +				need_marker.need_marker= +					is_l1_on_or_after(classes, n, +							  p->end, +							  0); +#ifdef BIDI_DEBUG +				fprintf(DEBUGDUMP, "    need marker=%d\n", +					need_marker.need_marker); +#endif + +			} + +			emit_bidi_embed_levelrun(string, classes, +						 p, paragraph_level, +						 previous_level, +						 next_level, +						 &need_marker, +						 emit, arg); + +			emit_marker(p, &need_marker, emit, arg); +		} +		else +		{ +			struct need_marker_info need_marker; +			size_t orig_end=p->end; + +			/* +			** Sequence in the opposite direction. Because S and +			** B reset to the paragraph level, no matter what, +			** if we want things to render like that we will need +			** to emit sequences on each side of S/B in reverse +			** order. We start at the end of this sequence, then +			** search towards the beginning, emit that sequence, +			** emit the S and B, then go to the next sequence. 
+			*/ + +			need_marker_info_init(&need_marker); + +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, "    need marker=%d\n", +				need_marker); +#endif + +			while (p->start < p->end)  			{ -				--right; +				size_t j=p->end; -				if (p) +				int end_with_ws= +					classes[j-1] == UNICODE_BIDI_TYPE_WS; +				while (j > p->start)  				{ -					char32_t c=p[left]; -					unicode_bidi_level_t l=levels[left]; +					--j; -					p[left]=p[right]; -					levels[left]=levels[right]; -					p[right]=c; -					levels[right]=l; +					enum_bidi_type_t t=classes[j]; + +					if (t == UNICODE_BIDI_TYPE_S || +					    t == UNICODE_BIDI_TYPE_B) +					{ +						++j; +						break; +					} +				} + +				if (j == p->end) /* Must be lone break */ +				{ +#ifdef BIDI_DEBUG +					fprintf(DEBUGDUMP, +						"    break: %d\n", +						(int)j); +#endif +					--p->end; + +					previous_level=paragraph_level; + +					(*emit)(string+p->end, 1, arg); +					continue; +				} + +				struct need_marker_info need_marker_partial; + +				need_marker_info_init(&need_marker_partial); + +				/* +				** Rule L1, there's going to be an S or a B +				** after we emit this sequence. +				*/ + +				if (j != p->start) +					need_marker_partial.need_marker=1; + +				/* +				** To emit this sequence, we monkey-patch +				** the run level to indicate the sub- +				** sequence to emit. +				*/ +				size_t i=p->start; + +				p->start=j; + +				emit_bidi_embed_levelrun +					(string, classes, p, paragraph_level, +					 previous_level, + +					 j == i +					 /* No more, this is next */ +					 ? next_level +					 /* We'll emit a paragraph brk */ +					 : paragraph_level, +					 &need_marker_partial, +					 emit, arg); + +				/* Continue monkey-patching. 
*/ + +				p->end=p->start; +				p->start=i; + +				if (p->start == p->end) +					/* Do it below */ +				{ +					if (end_with_ws) +						need_marker.need_marker= +							is_l1_on_or_after +							(classes, n, +							 orig_end, +							 0); +					need_marker_info_merge +						(&need_marker, +						 &need_marker_partial); +				} +				else +				{ +					emit_marker(p, &need_marker_partial, +						    emit, arg);  				} -				++left;  			} +			emit_marker(p, &need_marker, emit, arg); +		} +		free(p); +	} +	free(classes); +} + +#define ADJUST_LR(t,e) do {					\ +		switch (t) {					\ +		case UNICODE_BIDI_TYPE_AL:			\ +			(t)=UNICODE_BIDI_TYPE_R;		\ +			break;					\ +		case UNICODE_BIDI_TYPE_ET:			\ +		case UNICODE_BIDI_TYPE_ES:			\ +		case UNICODE_BIDI_TYPE_AN:			\ +		case UNICODE_BIDI_TYPE_EN:			\ +			(t)=UNICODE_BIDI_TYPE_L;		\ +			break;					\ +		default:					\ +			break;					\ +		}						\ +	} while (0) + +#define ADJUST_LRSTRONG(t) do {					\ +		switch (t) {					\ +		case UNICODE_BIDI_TYPE_AL:			\ +			(t)=UNICODE_BIDI_TYPE_R;		\ +		default:					\ +			break;					\ +		}						\ +	} while (0) + +static void emit_bidi_embed_levelrun(const char32_t *string, +				     enum_bidi_type_t *classes, +				     struct bidi_embed_levelrun *run, +				     unicode_bidi_level_t paragraph_level, +				     unicode_bidi_level_t previous_level, +				     unicode_bidi_level_t next_level, +				     struct need_marker_info *need_marker, +				     void (*emit)(const char32_t *string, +						  size_t n, +						  void *arg), +				     void *arg) +{ +	/* +	** Our first order of business will be to apply rules W to this +	** sequence, to resolve weak types. +	** +	** It's easy to simulate what unicode_bidi_w() expects. 
+	*/ + +	struct level_run lrun; +	struct isolating_run_sequence_s seq; +	enum_bidi_type_t e_type=E_CLASS(run->level); +	enum_bidi_type_t o_type=O_CLASS(run->level); + +	if (run->start == run->end) +		return; + +	memset(&seq, 0, sizeof(seq)); + +	seq.embedding_level=run->level; +	seq.sos=seq.eos=e_type; +	seq.runs.runs=&lrun; +	seq.runs.n_level_runs=1; +	seq.runs.cap_level_runs=1; +	lrun.start=run->start; +	lrun.end=run->end; +	unicode_bidi_w(classes, &seq); + +	/* +	** Peek at the first character's class. +	** +	** If the previous sequence's embedding level was the same, it +	** guarantees the peristence of the embedding direction. We can +	** accept classes that default to our embedding level. +	** +	** Otherwise we recognize only strong classes. +	*/ +	enum_bidi_type_t t=classes[run->start]; + +	if (previous_level == run->level) +	{ +		ADJUST_LR(t, E_CLASS(previous_level)); +	} +	else +	{ +		ADJUST_LRSTRONG(t); +	} + +	/* +	** Sequence in the opposite direction always get isolated. +	*/ +	char32_t override_start=run->level ? RLI:LRI; + +	if (run->level != paragraph_level) +		(*emit)(&override_start, 1, arg); + +	/* +	** Make sure the character sequence has strong context. +	*/ +	if (t == o_type) +	{ +		struct need_marker_info need_marker; + +		need_marker_info_init(&need_marker); + +		need_marker.need_marker=1; + +		emit_marker(run, &need_marker, emit, arg); +	} + +	override_start=run->level ? RLO:LRO; +	char32_t override_end=PDF; + +	size_t start=run->start; +	size_t end=run->end; + +	while (start < end) +	{ +		size_t i=start; +		size_t word_start=i; + +#ifdef BIDI_DEBUG +		fprintf(DEBUGDUMP, +			"    examining, starting at: %d\n", (int)i); +#endif + +		/* +		** Look for the next character with the opposite class. +		** While doing that, keep an eye out on any WS or ONs, +		** which will tell us where the most recent "word"s starts, +		** before this character. 
+		*/ +		while (i < end) +		{ +			enum_bidi_type_t t=classes[i]; + +			ADJUST_LR(t, e_type); + +			if (t == o_type) +				break; + +			switch (t) { +			case UNICODE_BIDI_TYPE_WS: +			case UNICODE_BIDI_TYPE_ON: +				word_start=i+1; +				break; +			default: +				break; +			} + +			++i; +		} + +		if (i < end) +		{ +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, +				"    override needed: %d," +				" start of word at %d, ", +				(int)i, (int)word_start); +#endif +			/* +			** Found something to override. First, emit everything +			** up to the start of this "word". +			** +			** Then emit the RLO or LRO, then look for the end +			** of the "word", and drop the PDF there. +			*/ +			if (word_start > start) +				(*emit)(string+start, +					word_start-start, arg); + +			(*emit)(&override_start, 1, arg); +			while (++i < end) +			{ +				enum_bidi_type_t t=classes[i]; -			if (end-start > 1 && reorder_callback) -				(*reorder_callback)(start, end-start, arg); +				switch (t) { +				case UNICODE_BIDI_TYPE_WS: +				case UNICODE_BIDI_TYPE_ON: +					break; +				default: +					continue; +				} +				break; +			} +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, "end of word at %d\n", +				(int)i); +#endif +			(*emit)(string+word_start, i-word_start, arg); +			(*emit)(&override_end, 1, arg); +			start=i; +			continue;  		} +		(*emit)(string+start, i-start, arg); +		start=i;  	} -	level_run_layers_deinit(&layers); +	/* +	** Make sure that if a different embedding level follows we will +	** emit a marker, to ensure strong context. 
+	*/ +	t=classes[run->end-1]; + +	if (next_level != run->level) +	{ +		ADJUST_LRSTRONG(t); + +		if (e_type != t) +			need_marker->need_marker=1; +	} + +	if (run->level != paragraph_level) +		need_marker->need_pdi=1; +} + +struct compute_paragraph_embedding_level_char_info { +	const char32_t *str; +}; + +static enum_bidi_type_t +get_enum_bidi_type_for_embedding_paragraph_level(size_t i, +						 void *arg) +{ +	struct compute_paragraph_embedding_level_char_info *p= +		(struct compute_paragraph_embedding_level_char_info *)arg; + +	return unicode_bidi_type(p->str[i]); +} + +char32_t unicode_bidi_embed_paragraph_level(const char32_t *str, +					    size_t n, +					    unicode_bidi_level_t paragraph_level +					    ) +{ +	struct compute_paragraph_embedding_level_char_info info; +	info.str=str; + +	if ((compute_paragraph_embedding_level +	     (0, n, +	      get_enum_bidi_type_for_embedding_paragraph_level, +	      &info) ^ paragraph_level) == 0) +		return 0; + +	return (paragraph_level & 1) ? RLM:LRM;  } diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index ca139cc..04d2893 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -1,5 +1,5 @@  /* -** Copyright 2011-2014 Double Precision, Inc. +** Copyright 2011-2020 Double Precision, Inc.  ** See COPYING for distribution information.  
**  */ @@ -596,7 +596,8 @@ extern "C" {  				     void *arg)  	{  		auto p=reinterpret_cast<const std::function<void (size_t, -								  size_t)> *> +								  size_t) +							    noexcept> *>  			(arg);  		(*p)(i, cnt); @@ -605,7 +606,8 @@ extern "C" {  int unicode::bidi_reorder(std::u32string &string,  			  std::vector<unicode_bidi_level_t> &levels, -			  const std::function<void (size_t, size_t)> &lambda) +			  const std::function<void (size_t, size_t) +			  noexcept> &lambda)  {  	size_t s=string.size(); @@ -624,7 +626,8 @@ int unicode::bidi_reorder(std::u32string &string,  }  void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels, -			   const std::function<void (size_t, size_t)> &lambda) +			   const std::function<void (size_t, size_t) +			   noexcept> &lambda)  {  	size_t s=levels.size(); @@ -636,3 +639,189 @@ void unicode::bidi_reorder(std::vector<unicode_bidi_level_t> &levels,  			     (reinterpret_cast<const void *>(&lambda)));  } + +extern "C" { +	static void removed_callback(size_t i, +				     void *arg) +	{ +		auto p=reinterpret_cast<const std::function<void (size_t) +							    noexcept> *> +			(arg); + +		(*p)(i); +	} +} + +void unicode::bidi_cleanup(std::u32string &string, +			   const std::function<void (size_t) noexcept> &lambda) +{ +	if (string.empty()) +		return; + +	size_t n=unicode_bidi_cleanup(&string[0], +				      0, +				      string.size(), +				      removed_callback, +				      const_cast<void *> +				      (reinterpret_cast<const void *> +				       (&lambda))); + +	string.resize(n); +} + +int unicode::bidi_cleanup(std::u32string &string, +			  std::vector<unicode_bidi_level_t> &levels, +			  const std::function<void (size_t) noexcept> &lambda) +{ +	if (levels.size() != string.size()) +		return -1; + +	size_t n=unicode_bidi_cleanup(&string[0], +				      &levels[0], +				      string.size(), +				      removed_callback, +				      const_cast<void *> +				      (reinterpret_cast<const void *> +				       (&lambda))); + +	
string.resize(n); +	levels.resize(n); +	return 0; +} + + +void unicode::bidi_extra_cleanup(std::u32string &string, +				 const std::function<void (size_t) noexcept> +				 &lambda) +{ +	if (string.empty()) +		return; + +	size_t n=unicode_bidi_extra_cleanup(&string[0], +					    0, +					    string.size(), +					    removed_callback, +					    const_cast<void *> +					    (reinterpret_cast<const void *> +					     (&lambda))); + +	string.resize(n); +} + +int unicode::bidi_extra_cleanup(std::u32string &string, +				std::vector<unicode_bidi_level_t> &levels, +				const std::function<void (size_t) noexcept> +				&lambda) +{ +	if (levels.size() != string.size()) +		return -1; + +	size_t n=unicode_bidi_extra_cleanup(&string[0], +					    &levels[0], +					    string.size(), +					    removed_callback, +					    const_cast<void *> +					    (reinterpret_cast<const void *> +					     (&lambda))); + +	string.resize(n); +	levels.resize(n); +	return 0; +} + +int unicode::bidi_logical_order(std::u32string &string, +				std::vector<unicode_bidi_level_t> &levels, +				unicode_bidi_level_t paragraph_embedding, +				const std::function<void (size_t, size_t) +				noexcept> &lambda) +{ +	if (string.size() != levels.size()) +		return -1; + +	if (string.empty()) +		return 0; + +	unicode_bidi_logical_order(&string[0], &levels[0], string.size(), +				   paragraph_embedding, +				   &reorder_callback, +				   const_cast<void *> +				   (reinterpret_cast<const void *>(&lambda))); +	return 0; +} + +void unicode::bidi_logical_order(std::vector<unicode_bidi_level_t> &levels, +				 unicode_bidi_level_t paragraph_embedding, +				 const std::function<void (size_t, size_t) +				 noexcept> &lambda) +{ +	if (levels.size() == 0) +		return; + +	unicode_bidi_logical_order(NULL, &levels[0], levels.size(), +				   paragraph_embedding, +				   &reorder_callback, +				   const_cast<void *> +				   (reinterpret_cast<const void *>(&lambda))); +} + +extern "C" { +	static void embed_callback(const char32_t 
*string, +				   size_t n, +				   void *arg) +	{ +		auto p=reinterpret_cast<const std::function<void +							    (const char32_t *, +							     size_t n) +							    noexcept> *>(arg); +		(*p)(string, n); +	} +} + +int unicode::bidi_embed(const std::u32string &string, +			const std::vector<unicode_bidi_level_t> &levels, +			unicode_bidi_level_t paragraph_embedding, +			const std::function<void (const char32_t *string, +						  size_t n) noexcept> +			&lambda) +{ +	if (string.size() != levels.size()) +		return -1; + +	if (string.empty()) +		return 0; + +	unicode_bidi_embed(&string[0], &levels[0], string.size(), +			   paragraph_embedding, +			   embed_callback, +			   const_cast<void *> +			   (reinterpret_cast<const void *> +			    (&lambda))); +	return 0; +} + +std::u32string unicode::bidi_embed(const std::u32string &string, +				   const std::vector<unicode_bidi_level_t +				   > &levels, +				   unicode_bidi_level_t paragraph_embedding) +{ +	std::u32string new_string; + +	(void)bidi_embed(string, levels, paragraph_embedding, +			 [&] +			 (const char32_t *string, +			  size_t n) +			 { +				 new_string.insert(new_string.end(), +						   string, string+n); +			 }); + +	return new_string; +} + +char32_t unicode::bidi_embed_paragraph_level(const std::u32string &string, +					     unicode_bidi_level_t level) +{ +	return unicode_bidi_embed_paragraph_level(string.c_str(), +						  string.size(), +						  level); +} | 
