Implement unicode_canonical.

Fixes biditest2.
author: Sam Varshavchik 2020-07-10 08:25:53 -0400
committer: Sam Varshavchik 2020-07-12 15:56:45 -0400
commit: f94fc14a9f3019f110c71d084f4bc59261434519 (patch)
tree: a2c8bfc5b325f9bb0516b14700effc97084185dc /unicode
parent: 1ef92db9dbbefff98b93c8c66e4693a31b4f31a5 (diff)
download: courier-libs-f94fc14a9f3019f110c71d084f4bc59261434519.tar.bz2
10 files changed, 712 insertions, 87 deletions
diff --git a/unicode/.gitignore b/unicode/.gitignore
index 1bdc8ce..8905e05 100644
--- a/unicode/.gitignore
+++ b/unicode/.gitignore
@@ -19,6 +19,8 @@
 /WordBreakTest.txt
 /emoji-data.txt
 /biditest
+/biditest2
+/canonicalmappings.h
 /config.cache
 /config.guess
 /config.sub
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index 8b1d3cf..83034c5 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -11,6 +11,7 @@ noinst_SCRIPTS=update.sh		\
 	mkbidi.pl			\
 	mkbidiclass.pl			\
 	mkbidiclassnames.pl		\
+	mkcanonical.pl			\
 	mkeastasianwidth.pl		\
 	mkemojidata.pl			\
 	mkgraphemebreak.pl		\
@@ -20,7 +21,7 @@ noinst_SCRIPTS=update.sh		\
 	mkwordbreak.pl
 
 noinst_PROGRAMS=unicodetest graphemetest linebreaktest wordbreaktest \
-	enttest scripttest biditest
+	enttest scripttest biditest biditest2
 
 aclocaldir=$(datadir)/aclocal
 aclocal_DATA=m4/courier-unicode.m4
@@ -111,6 +112,7 @@ man_MANS= \
         $(srcdir)/man/unicode_bidi_calc.3 \
         $(srcdir)/man/unicode_bidi_mirror.3 \
         $(srcdir)/man/unicode_bidi_reorder.3 \
+        $(srcdir)/man/unicode_canonical.3 \
         $(srcdir)/man/unicode_category_lookup.3 \
         $(srcdir)/man/unicode_convert.3 \
         $(srcdir)/man/unicode_convert_deinit.3 \
@@ -187,6 +189,7 @@ libcourier_unicode_la_SOURCES=\
 			unicode_graphemebreak.c graphemebreaktab.h \
 			unicode_linebreak.c linebreaktab.h \
 			unicode_htmlent.c unicode_htmlent.h \
+			unicode_canonical.c canonicalmappings.h \
 			linebreaktab_internal.h \
 			unicode_wordbreak.c wordbreaktab.h scriptstab.h \
 			unicode_emoji.c emojitab.h \
@@ -214,6 +217,7 @@ BUILT_SOURCES=unicode_ultcasetab.c \
 	bidi_class.h \
 	bidi_classnames.h \
 	bidi_mirroring.h \
+	canonicalmappings.h \
 	categoriestab.h \
 	eastasianwidth.h \
 	emojitab.h \
@@ -277,6 +281,9 @@ bidi_classnames.h: unicode_bidi.c mkbidiclassnames.pl
 	@PERL@ $(srcdir)/mkbidiclassnames.pl <$(srcdir)/courier-unicode.h.in >bidi_classnames.h.tmp
 	mv bidi_classnames.h.tmp bidi_classnames.h
 
+canonicalmappings.h: UnicodeData.txt mkcanonical.pl
+	@PERL@ -I$(srcdir) $(srcdir)/mkcanonical.pl >canonicalmappings.h.tmp
+	mv canonicalmappings.h.tmp canonicalmappings.h
 endif
 
 unicodetest_SOURCES=unicodetest.c
@@ -314,7 +321,10 @@ biditest_DEPENDENCIES=libcourier-unicode.la
 biditest_LDADD=libcourier-unicode.la
 biditest_LDFLAGS=-static
 
-
+biditest2_SOURCES=biditest2.C
+biditest2_DEPENDENCIES=libcourier-unicode.la
+biditest2_LDADD=libcourier-unicode.la
+biditest2_LDFLAGS=-static
 
 check-am: unicodetest
 	./unicodetest
@@ -385,6 +395,7 @@ check-am: unicodetest
 	test "`./biditest 8261`" = "8262 8262 o"
 	test "`./biditest 8262`" = "8261 8261 c"
 	./biditest
+	./biditest2
 
 if HAVE_DOCS
 
diff --git a/unicode/biditest.C b/unicode/biditest.C
index 6343866..2d2a6e5 100644
--- a/unicode/biditest.C
+++ b/unicode/biditest.C
@@ -35,8 +35,10 @@ int main(int argc, char **argv)
 	std::ifstream fp("BidiTest.txt");
 
 	if (!fp.is_open())
+	{
+		std::cerr << "Cannot open BidiTest.txt" << std::endl;
 		exit(1);
-
+	}
 	size_t linenum=0;
 	size_t nextlogline=0;
 	std::string logmsg;
@@ -178,9 +180,10 @@ int main(int argc, char **argv)
 		{
 			if (n & 1)
 			{
-				actual_levels=level ?
+				auto ret=level ?
 					unicode::bidi_calc(dummy_input,*level)
 					: unicode::bidi_calc(dummy_input);
+				actual_levels=std::get<0>(ret);
 
 				int matched=0;
 
@@ -350,31 +353,6 @@ extern "C" {
 
 #include "unicode_bidi.c"
 
-static const struct {
-	char			classname[8];
-	enum_bidi_type_t	classenum;
-} bidiclassnames[]={
-
-#include "bidi_classnames.h"
-
-};
-
-const char *bidi_classname(enum_bidi_type_t classenum)
-{
-	for (const auto &cn:bidiclassnames)
-	{
-		if (cn.classenum == classenum)
-			return cn.classname;
-	}
-
-	return "???";
-}
-
-static const char *lookup_classname(const std::string &s)
-{
-	abort();
-}
-
 enum_bidi_type_t fudge_unicode_bidi(size_t i)
 {
 	if (i >= testcase.size())
diff --git a/unicode/biditest2.C b/unicode/biditest2.C
new file mode 100644
index 0000000..f497bcf
--- /dev/null
+++ b/unicode/biditest2.C
@@ -0,0 +1,203 @@
+#include	"unicode_config.h"
+#include	"courier-unicode.h"
+#include	<iostream>
+#include	<sstream>
+#include	<fstream>
+#include	<cstdint>
+#include	<iomanip>
+
+FILE *DEBUGDUMP; // Sink for the debug trace that unicode_bidi.c writes when BIDI_DEBUG is defined
+// Checks unicode::bidi_calc() against each test case in BidiCharacterTest.txt; any failure exits non-zero.
+int main(int argc, char **argv)
+{
+	std::ifstream fp("BidiCharacterTest.txt");
+
+	if (!fp.is_open())
+	{
+		std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl;
+		exit(1);
+	}
+	// The debug trace is discarded unless an embedding level check fails
+	DEBUGDUMP=fopen("/dev/null", "w");
+	if (!DEBUGDUMP)
+	{
+		perror("/dev/null");
+		exit(1);
+	}
+
+	std::string buf;
+
+	size_t linenum=0;
+	// Each line is one test case made of ';'-separated fields
+	while (1)
+	{
+		buf.clear();
+		// Stop at end of file, but still handle a last line without a newline
+		if (std::getline(fp, buf).eof() && buf.empty())
+			break;
+		++linenum;
+		// Drop any trailing '#' comment
+		auto p=buf.find('#');
+
+		if (p != buf.npos)
+			buf=buf.substr(0, p);
+		// First field: the code points; a line without ';' has no test case
+		p=buf.find(';');
+
+		if (p == buf.npos)
+			continue;
+
+		std::istringstream chars{buf.substr(0, p)};
+		// Second field: the requested paragraph direction
+		auto q=buf.find(';', ++p);
+
+		if (q == buf.npos)
+		{
+			std::cerr << "Cannot parse line " << linenum
+				  << std::endl;
+			exit(2);
+		}
+
+		int direction;
+
+		if (!(std::istringstream{buf.substr(p, q-p)} >> direction))
+		{
+			std::cerr << "Cannot parse line " << linenum
+				  << std::endl;
+			exit(3);
+		}
+		// Third field: the expected paragraph embedding level
+		p=++q;
+		q=buf.find(';', p);
+
+		if (q == buf.npos)
+		{
+			std::cerr << "Cannot parse line " << linenum
+				  << std::endl;
+			exit(4);
+		}
+
+		int paragraph_embedding_level;
+
+		if (!(std::istringstream{buf.substr(p, q-p)} >>
+		      paragraph_embedding_level))
+		{
+			std::cerr << "Cannot parse line " << linenum
+				  << std::endl;
+			exit(5);
+		}
+		p=++q; // Fourth field: the expected embedding levels
+		q=buf.find(';', p);
+
+		if (q == buf.npos)
+		{
+			std::cerr << "Cannot parse line " << linenum
+				  << std::endl;
+			exit(6);
+		}
+		// A level that does not parse as a number is taken as UNICODE_BIDI_SKIP
+		std::vector<unicode_bidi_level_t> levels;
+
+		{
+			std::istringstream level_s{buf.substr(p, q-p)};
+
+			std::string s;
+
+			while (level_s >> s)
+			{
+				size_t l;
+
+				if (!(std::istringstream{s} >> l))
+				{
+					l=UNICODE_BIDI_SKIP;
+				}
+				levels.push_back(l);
+			}
+		}
+		// Rest of the line: rendering order; parsed here but not checked below
+		std::vector<size_t> render_order;
+
+		{
+			size_t n;
+
+			std::istringstream order_i{buf.substr(++q)};
+
+			while (order_i >> n)
+				render_order.push_back(n);
+		}
+		std::u32string s;
+		uintmax_t c;
+		// The code points are read as hexadecimal numbers
+		while (chars >> std::hex >> c)
+			s.push_back(c);
+		// Pass the direction only if it is LR or RL, otherwise bidi_calc() derives it
+		auto ret=direction == UNICODE_BIDI_LR ||
+			direction == UNICODE_BIDI_RL
+			? unicode::bidi_calc(s, direction)
+			: unicode::bidi_calc(s);
+
+		if (std::get<1>(ret) != paragraph_embedding_level)
+		{
+			std::cerr << "Regression, line "
+				  << linenum
+				  << ": expected "
+				  << paragraph_embedding_level
+				  << " paragraph embedding level, got "
+				  << (int)std::get<1>(ret)
+				  << std::endl;
+			exit(1);
+		}
+
+		if (std::get<0>(ret) != levels)
+		{
+			fclose(DEBUGDUMP);
+			DEBUGDUMP=stderr;
+			// Repeat the failed calculation with the debug trace going to stderr
+			(void)(direction == UNICODE_BIDI_LR ||
+			       direction == UNICODE_BIDI_RL
+			       ? unicode::bidi_calc(s, direction)
+			       : unicode::bidi_calc(s));
+
+			std::cerr << "Regression, line "
+				  << linenum
+				  << ": embedding levels"
+				  << std::endl
+				  << "   Expected:";
+
+			for (int l:levels)
+			{
+				std::cerr << " ";
+				if (l == UNICODE_BIDI_SKIP)
+					std::cerr << "x";
+				else
+					std::cerr << l;
+			}
+
+			std::cerr << std::endl
+				  << "     Actual:";
+
+			for (int l:std::get<0>(ret))
+			{
+				std::cerr << " ";
+				if (l == UNICODE_BIDI_SKIP)
+					std::cerr << "x";
+				else
+					std::cerr << l;
+			}
+			std::cerr << std::endl;
+			exit(1);
+		}
+	}
+	return 0;
+}
+
+#define BIDI_DEBUG
+
+extern "C" {
+#if 0
+}
+#endif
+
+#include "unicode_bidi.c"
+
+}
diff --git a/unicode/book.xml b/unicode/book.xml
index 9c1486c..ad0009a 100644
--- a/unicode/book.xml
+++ b/unicode/book.xml
@@ -4,6 +4,7 @@
 
 <!ENTITY tr9ver  "42">
 <!ENTITY tr14ver "45">
+<!ENTITY tr15ver "50">
 <!ENTITY tr24ver "31">
 <!ENTITY tr29ver "37">
 <!ENTITY tr51ver "18">
@@ -232,6 +233,9 @@ See COPYING for distribution information.
 	    <link linkend="unicode_bidi">
 	      <citerefentry><refentrytitle>unicode_bidi</refentrytitle>
 	      <manvolnum>3</manvolnum></citerefentry></link>,
+	    <link linkend="unicode_canonical">
+	      <citerefentry><refentrytitle>unicode_canonical</refentrytitle>
+	      <manvolnum>3</manvolnum></citerefentry></link>,
 	    <link linkend="unicode_category_lookup">
 	      <citerefentry><refentrytitle>unicode_category_lookup</refentrytitle>
 	      <manvolnum>3</manvolnum></citerefentry></link>,
@@ -475,6 +479,91 @@ See COPYING for distribution information.
 	</refsect1>
       </refentry>
 
+      <refentry id="unicode_canonical">
+	<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
+
+	<refmeta>
+	  <refentrytitle>unicode_canonical</refentrytitle>
+	  <manvolnum>3</manvolnum>
+	</refmeta>
+
+	<refnamediv>
+	  <refname>unicode_canonical</refname>
+
+	  <refpurpose>unicode canonical character mapping</refpurpose>
+	</refnamediv>
+
+	<refsynopsisdiv>
+	  <funcsynopsis>
+	    <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
+	    <funcprototype>
+	      <funcdef>unicode_canonical_t <function>unicode_canonical</function></funcdef>
+              <paramdef>char32_t <parameter>c</parameter></paramdef>
+	    </funcprototype>
+	  </funcsynopsis>
+	</refsynopsisdiv>
+	<refsect1>
+	  <title>DESCRIPTION</title>
+
+	  <para>
+	    <function>unicode_canonical</function>() looks up the
+	    character's
+	    <ulink
+	      url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">canonical
+	    and compatibility mapping</ulink>.
+
+	    <function>unicode_canonical</function>() returns a structure
+	    with the following fields:
+	  </para>
+
+	  <variablelist>
+	    <varlistentry>
+	      <term><structfield>canonical_chars</structfield></term>
+	      <listitem>
+		<para>
+		  A pointer to the canonical or equivalent representation
+		  of the character.
+	        </para>
+	      </listitem>
+	    </varlistentry>
+	    <varlistentry>
+	      <term><structfield>n_canonical_chars</structfield></term>
+	      <listitem>
+		<para>
+		  Number of characters in the
+		  <structfield>canonical_chars</structfield>.
+	        </para>
+	      </listitem>
+	    </varlistentry>
+	    <varlistentry>
+	      <term><structfield>format</structfield></term>
+	      <listitem>
+		<para>
+		  The character's canonical formatting flag, if any.
+	        </para>
+	      </listitem>
+	    </varlistentry>
+	  </variablelist>
+
+	  <para>
+	    A NULL <structfield>canonical_chars</structfield> (with a 0
+	    <structfield>n_canonical_chars</structfield>) indicates
+	    that the character has no canonical or compatibility
+	    equivalence.
+	  </para>
+	</refsect1>
+	<refsect1>
+	  <title>SEE ALSO</title>
+	  <para>
+	    <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>,
+	    <link linkend="courier-unicode">
+	      <citerefentry>
+		<refentrytitle>courier-unicode</refentrytitle>
+		<manvolnum>7</manvolnum></citerefentry></link>.
+	    </para>
+	</refsect1>
+      </refentry>
+
       <refentry id="unicode_category_lookup">
 	<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
 
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index 55a7152..c8161ea 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -584,6 +584,15 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i);
 ** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates
 ** directional markers (from step X9). These characters should be removed
 ** before using unicode_bidi_reorder().
+**
+** unicode_bidi_calc() returns the resolved paragraph direction level, which
+** always matches the passed in level, if specified, else it reports the
+** derived one.
+**
+** unicode_bidi_reorder() reorders the characters according to the resolved
+** embedding levels. A non-null reorder_callback gets invoked repeatedly,
+** indicating the starting index and the number of characters reversed, so
+** that any related metadata can be updated accordingly.
 */
 
 typedef char unicode_bidi_bracket_type_t;
@@ -604,10 +613,10 @@ typedef unsigned char unicode_bidi_level_t;
 #define UNICODE_BIDI_RL		((unicode_bidi_level_t)1)
 #define UNICODE_BIDI_SKIP	((unicode_bidi_level_t)254)
 
-extern void unicode_bidi_calc(const char32_t *p, size_t n,
-			      unicode_bidi_level_t *bufp,
-			      const unicode_bidi_level_t *
-			      initial_embedding_level);
+extern unicode_bidi_level_t unicode_bidi_calc(const char32_t *p, size_t n,
+					      unicode_bidi_level_t *bufp,
+					      const unicode_bidi_level_t *
+					      initial_embedding_level);
 
 extern void unicode_bidi_reorder(char32_t *p,
 				 unicode_bidi_level_t *levels,
@@ -646,6 +655,48 @@ typedef enum {
 extern enum_bidi_type_t unicode_bidi_type(char32_t c);
 
 /*
+** unicode_canonical() returns the canonical mapping of the given Unicode
+** character. The returned structure specifies:
+**
+** - A pointer to the canonical decomposition of the given character.
+** - Number of characters in the canonical decomposition.
+** - An optional formatting tag.
+**
+** A null pointer, and a 0 character count gets returned for characters
+** without a canonical decomposition.
+**
+*/
+
+typedef enum {
+	      UNICODE_CANONICAL_FMT_NONE=0, /* Mapping has no formatting tag */
+	      /* One value per <tag> prefix in UnicodeData.txt's decomposition field */
+	      UNICODE_CANONICAL_FMT_CIRCLE,
+	      UNICODE_CANONICAL_FMT_COMPAT,
+	      UNICODE_CANONICAL_FMT_FINAL,
+	      UNICODE_CANONICAL_FMT_FONT,
+	      UNICODE_CANONICAL_FMT_FRACTION,
+	      UNICODE_CANONICAL_FMT_INITIAL,
+	      UNICODE_CANONICAL_FMT_ISOLATED,
+	      UNICODE_CANONICAL_FMT_MEDIAL,
+	      UNICODE_CANONICAL_FMT_NARROW,
+	      UNICODE_CANONICAL_FMT_NOBREAK,
+	      UNICODE_CANONICAL_FMT_SMALL,
+	      UNICODE_CANONICAL_FMT_SQUARE,
+	      UNICODE_CANONICAL_FMT_SUB,
+	      UNICODE_CANONICAL_FMT_SUPER,
+	      UNICODE_CANONICAL_FMT_VERTICAL,
+	      UNICODE_CANONICAL_FMT_WIDE,
+} unicode_canonical_fmt_t;
+
+typedef struct {
+	const char32_t *canonical_chars; /* The mapping's characters, NULL if there is no mapping */
+	size_t n_canonical_chars; /* Number of characters in canonical_chars */
+	unicode_canonical_fmt_t format; /* Formatting tag, UNICODE_CANONICAL_FMT_NONE if none */
+} unicode_canonical_t;
+
+extern unicode_canonical_t unicode_canonical(char32_t);
+
+/*
 ** A buffer that holds unicode characters, and dynamically grows as needed.
 */
 
@@ -2066,11 +2117,13 @@ std::u32string tolower(const std::u32string &u);
 std::u32string toupper(const std::u32string &u);
 
 //! Calculate bidirectional embedding levels
-std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s);
+std::tuple<std::vector<unicode_bidi_level_t>,
+	   unicode_bidi_level_t> bidi_calc(const std::u32string &s);
 
 //! Calculate bidirectional embedding levels
-std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s,
-					    unicode_bidi_level_t level);
+std::tuple<std::vector<unicode_bidi_level_t>,
+	   unicode_bidi_level_t> bidi_calc(const std::u32string &s,
+					   unicode_bidi_level_t level);
 
 //! Reorder bidirectional text
 int bidi_reorder(std::u32string &string,
diff --git a/unicode/mkcanonical.pl b/unicode/mkcanonical.pl
new file mode 100644
index 0000000..43d7e44
--- /dev/null
+++ b/unicode/mkcanonical.pl
@@ -0,0 +1,110 @@
+#! /usr/bin/perl
+#
+# Creates a lookup table for canonical mappings in UnicodeData.txt
+
+use strict;
+use warnings;
+
+open(F, "<UnicodeData.txt") || die "Cannot open UnicodeData.txt: $!\n";
+
+my @mappings; # Flat list of every mapping's code points (hex strings)
+my @data;     # One [numeric code point, C initializer text] per character
+
+while (defined($_=<F>))
+{
+    my @w=split(/;/, $_, -1);
+
+    next unless $w[5]; # Field 5 is the decomposition mapping; skip if empty
+
+    my $code=$w[0];
+    my @mapping=split(/\s/, $w[5]);
+
+    my $formatting_tag = "UNICODE_CANONICAL_FMT_NONE";
+
+    if ($mapping[0] =~ /^</)
+    {
+	# A leading "<tag>" becomes the UNICODE_CANONICAL_FMT_TAG enum name
+	$formatting_tag = shift @mapping;
+	$formatting_tag =~ s/<//g;
+	$formatting_tag =~ s/>//g;
+	$formatting_tag = "UNICODE_CANONICAL_FMT_" . uc($formatting_tag);
+    };
+
+    # Limits match struct canon_map_table in unicode_canonical.c:
+    # n_chars is an unsigned char, offset is an unsigned short.
+    die "Too long\n" if (scalar @mapping) > 0xFF;
+    die "Too many mappings\n" if (scalar @mappings) > 0xFFFF;
+
+    my $dec_code=hex($code);
+
+    push @data, [$dec_code, "\t{0x$code, (unsigned char)$formatting_tag, "
+	. (scalar @mapping) . ", "
+	. scalar(@mappings) . "}" ];
+    push @mappings, @mapping;
+}
+
+my $hash_size = int( (scalar @data) * 3 / 4); # Starting size: 3/4 of the entry count
+
+my %buckets; # Bucket number => list of the @data entries that fall into it
+
+my $keep_going = 1;
+# Grow the table until no bucket holds more than three entries
+while ($keep_going)
+{
+    %buckets = ();
+
+    $keep_going = 0;
+
+    foreach my $m (@data)
+    {
+	my $bucket = $m->[0] % $hash_size; # Bucket is the code point modulo the table size
+
+	push @{$buckets{$bucket}}, $m;
+
+	if ((scalar @{$buckets{$bucket}}) > 3)
+	{
+	    $keep_going = 1; # Bucket too full: start over with a larger table
+	    ++$hash_size;
+	    last;
+	}
+    }
+}
+
+print "#define HASH_SIZE $hash_size\n";
+
+@data = (); # Rebuilt below, in bucket order
+
+my $pfix = "";
+# canon_map_hash[bucket]: index in canon_map_lookup where the bucket's entries start
+print "static const unsigned short canon_map_hash[]={\n";
+
+foreach my $bucket (0.. ($hash_size)-1)
+{
+    print "$pfix\t" . (scalar @data);
+    $pfix = ",\n";
+
+    push @data, @{ $buckets{$bucket} // [] };
+}
+
+# The entries themselves, grouped by bucket
+print "};\n\nstatic const struct canon_map_table canon_map_lookup[]={\n";
+
+$pfix = "";
+
+foreach my $m (@data)
+{
+    print "$pfix" . $m->[1];
+    $pfix = ",\n";
+}
+# The mapped code points; each entry's offset is an index into this array
+print "\n};\n\nstatic const char32_t canon_map_values[]={\n";
+
+$pfix="";
+
+foreach my $v (@mappings)
+{
+    print "$pfix\t0x$v";
+
+    $pfix=",\n";
+}
+print "};\n";
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c
index da15966..055ee89 100644
--- a/unicode/unicode_bidi.c
+++ b/unicode/unicode_bidi.c
@@ -467,6 +467,28 @@ typedef struct {
 } *directional_status_stack_t;
 
 #ifdef BIDI_DEBUG
+
+static const struct {
+	char			classname[8]; /* Name of the bidi class */
+	enum_bidi_type_t	classenum; /* The enum value that the name stands for */
+} bidiclassnames[]={
+/* The table's contents come from the generated bidi_classnames.h */
+#include "bidi_classnames.h"
+
+};
+
+const char *bidi_classname(enum_bidi_type_t classenum) /* Returns the class's name, for debug output */
+{
+	for (const auto &cn:bidiclassnames) /* NOTE(review): range-for/auto is C++; presumably BIDI_DEBUG builds are C++ only -- confirm */
+	{
+		if (cn.classenum == classenum)
+			return cn.classname;
+	}
+	/* classenum is not in the table */
+	return "???";
+}
+
+
 void dump_classes(const char *prefix, directional_status_stack_t stack)
 {
 	fprintf(DEBUGDUMP, "%s: ", prefix);
@@ -621,11 +643,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack)
 	free(stack);
 }
 
-static void unicode_bidi_b(const char32_t *p,
-			   size_t n,
-			   enum_bidi_type_t *buf,
-			   unicode_bidi_level_t *bufp,
-			   const unicode_bidi_level_t *initial_embedding_level);
+static unicode_bidi_level_t
+unicode_bidi_b(const char32_t *p,
+	       size_t n,
+	       enum_bidi_type_t *buf,
+	       unicode_bidi_level_t *bufp,
+	       const unicode_bidi_level_t *initial_embedding_level);
 
 enum_bidi_type_t unicode_bidi_type(char32_t c)
 {
@@ -639,8 +662,9 @@ enum_bidi_type_t unicode_bidi_type(char32_t c)
 				   UNICODE_BIDI_TYPE_L);
 }
 
-void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
-		       const unicode_bidi_level_t *initial_embedding_level)
+unicode_bidi_level_t
+unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
+		  const unicode_bidi_level_t *initial_embedding_level)
 {
 	/*
 	** Look up the bidi class for each char32_t.
@@ -661,27 +685,33 @@ void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
 		bufp[i]=UNICODE_BIDI_SKIP;
 	}
 
-	unicode_bidi_b(p, n,
-		       buf,
-		       bufp,
-		       initial_embedding_level);
+	unicode_bidi_level_t level=unicode_bidi_b(p, n,
+						  buf,
+						  bufp,
+						  initial_embedding_level);
 
 	free(buf);
+
+	return level;
 }
 
 static void unicode_bidi_cl(directional_status_stack_t stack);
 
-static void unicode_bidi_b(const char32_t *p,
-			   size_t n,
-			   enum_bidi_type_t *buf,
-			   unicode_bidi_level_t *bufp,
-			   const unicode_bidi_level_t *initial_embedding_level)
+static unicode_bidi_level_t
+unicode_bidi_b(const char32_t *p,
+	       size_t n,
+	       enum_bidi_type_t *buf,
+	       unicode_bidi_level_t *bufp,
+	       const unicode_bidi_level_t *initial_embedding_level)
 {
 	directional_status_stack_t stack;
 
 	stack=directional_status_stack_init(p, buf, n, bufp,
 					    initial_embedding_level);
 
+	unicode_bidi_level_t paragraph_embedding_level=
+		stack->paragraph_embedding_level;
+
 #ifdef BIDI_DEBUG
 	fprintf(DEBUGDUMP, "BIDI: START: Paragraph embedding level: %d\n",
 		(int)stack->paragraph_embedding_level);
@@ -690,6 +720,8 @@ static void unicode_bidi_b(const char32_t *p,
 	unicode_bidi_cl(stack);
 
 	directional_status_stack_deinit(stack);
+
+	return paragraph_embedding_level;
 }
 
 #define RESET_CLASS(p,stack) do {				\
@@ -1173,6 +1205,8 @@ static void unicode_bidi_cl(directional_status_stack_t stack)
 	{
 #ifdef BIDI_DEBUG
 		dump_sequence_info(stack, p);
+		fprintf(DEBUGDUMP, "Sequence embedding level: %d\n",
+			(int)p->embedding_level);
 		dump_sequence("Contents before W", stack, p);
 #endif
 
@@ -1408,6 +1442,16 @@ struct bidi_n_stack {
 	short matched;
 };
 
+#define IS_NI(class)	/* True for the UAX #9 "NI" classes: neutrals and isolate formatting characters */ \
+	((class) == UNICODE_BIDI_TYPE_B ||			\
+	 (class) == UNICODE_BIDI_TYPE_S ||			\
+	 (class) == UNICODE_BIDI_TYPE_WS ||			\
+	 (class) == UNICODE_BIDI_TYPE_ON ||			\
+	 (class) == UNICODE_BIDI_TYPE_FSI ||			\
+	 (class) == UNICODE_BIDI_TYPE_LRI ||			\
+	 (class) == UNICODE_BIDI_TYPE_RLI ||			\
+	 (class) == UNICODE_BIDI_TYPE_PDI)
+
 static void unicode_bidi_n(directional_status_stack_t stack,
 			   struct isolating_run_sequence_s *seq)
 {
@@ -1430,45 +1474,86 @@ static void unicode_bidi_n(directional_status_stack_t stack,
 
 	for (; irs_compare(&iter, &end); irs_incr(&iter))
 	{
-		unicode_bidi_bracket_type_t bracket_type;
-		char32_t open_bracket=
-			unicode_bidi_bracket_type(stack->chars[iter.i],
-						  &bracket_type);
+		unicode_bidi_bracket_type_t bracket_type=UNICODE_BIDI_n;
+
+		char32_t open_or_close_bracket=0;
+
+		if (IS_NI(stack->classes[iter.i]))
+		{
+			open_or_close_bracket=
+				unicode_bidi_bracket_type(stack->chars[iter.i],
+							  &bracket_type);
+		}
 
 		if (bracket_type == UNICODE_BIDI_o)
 		{
 			if (stackp >= NSTACKSIZE)
+			{
+#ifdef BIDI_DEBUG
+				fprintf(DEBUGDUMP,
+					"BD16 stack exceeded on index %d\n",
+					(int)iter.i);
+#endif
 				break; /* BD16 failure */
-
+			}
 			if (!((*bracket_stack_tail)=(struct bidi_n_stack *)
 			      calloc(1, sizeof(struct bidi_n_stack))))
 				abort();
 
 			stack_iters[stackp]=*bracket_stack_tail;
-
-			(*bracket_stack_tail)->start=iter;
+			stack_iters[stackp]->start=iter;
 
 			stack_chars[stackp]=stack->chars[iter.i];
 
+			unicode_canonical_t canon=
+				unicode_canonical(stack_chars[stackp]);
+
+			if (canon.n_canonical_chars == 1 &&
+			    !canon.format)
+			{
+				stack_chars[stackp]=
+					canon.canonical_chars[0];
+			}
+
 			bracket_stack_tail= &(*bracket_stack_tail)->next;
 			++stackp;
-			continue;
+#ifdef BIDI_DEBUG
+			fprintf(DEBUGDUMP, "Found opening bracket at index %d\n",
+				(int)iter.i);
+#endif
 		}
 
-		if (bracket_type == UNICODE_BIDI_c) /* Should be "n" */
+		if (bracket_type == UNICODE_BIDI_c)
 		{
+			unicode_canonical_t canon=
+				unicode_canonical(open_or_close_bracket);
+
+			if (canon.n_canonical_chars == 1 &&
+			    !canon.format)
+			{
+				open_or_close_bracket=
+					canon.canonical_chars[0];
+			}
+#ifdef BIDI_DEBUG
+			fprintf(DEBUGDUMP, "Found closing bracket at index %d\n",
+				(int)iter.i);
+#endif
 			for (size_t i=stackp; i > 0; )
 			{
 				--i;
-				if (stack_chars[i] != open_bracket)
+				if (stack_chars[i] != open_or_close_bracket)
 					continue;
+#ifdef BIDI_DEBUG
+				fprintf(DEBUGDUMP,
+					"Matched to open bracket at index %d\n",
+					(int)stack_iters[i]->start.i);
+#endif
 
 				stack_iters[i]->end = iter;
 				stack_iters[i]->matched=1;
 				stackp=i;
 				break;
 			}
-			continue;
 		}
 
 		/*
@@ -1496,11 +1581,41 @@ static void unicode_bidi_n(directional_status_stack_t stack,
 
 		if (eoclass == E_CLASS)
 		{
+#ifdef BIDI_DEBUG
+			if (stackp)
+			{
+				fprintf(DEBUGDUMP,
+					"Found e for brackets at:");
+
+				for (size_t i=0; i<stackp; ++i)
+				{
+					fprintf(DEBUGDUMP,
+						" %d",
+						(int)stack_iters[i]->start.i);
+				}
+				fprintf(DEBUGDUMP, "\n");
+			}
+#endif
 			for (size_t i=0; i<stackp; ++i)
 				stack_iters[i]->has_e=1;
 		}
 		else if (eoclass == O_CLASS)
 		{
+#ifdef BIDI_DEBUG
+			if (stackp)
+			{
+				fprintf(DEBUGDUMP,
+					"Found o for brackets at:");
+
+				for (size_t i=0; i<stackp; ++i)
+				{
+					fprintf(DEBUGDUMP,
+						" %d",
+						(int)stack_iters[i]->start.i);
+				}
+				fprintf(DEBUGDUMP, "\n");
+			}
+#endif
 			for (size_t i=0; i<stackp; ++i)
 				stack_iters[i]->has_o=1;
 		}
@@ -1516,6 +1631,18 @@ static void unicode_bidi_n(directional_status_stack_t stack,
 		{
 			int set=0;
 
+#ifdef BIDI_DEBUG
+			fprintf(DEBUGDUMP,
+				"Brackets: %d and %d: e=%s, o=%s",
+				(int)p->start.i,
+				(int)p->end.i,
+				bidi_classname(E_CLASS),
+				bidi_classname(O_CLASS));
+
+			fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n",
+				p->has_e,
+				p->has_o);
+#endif
 			if (p->has_e)
 			{
 				stack->classes[p->start.i]=
@@ -1548,16 +1675,18 @@ static void unicode_bidi_n(directional_status_stack_t stack,
 					}
 
 					strong_type=eoclass;
+#ifdef BIDI_DEBUG
+					fprintf(DEBUGDUMP,
+						"Brackets: O context: %s\n",
+						bidi_classname(strong_type));
+#endif
 					break;
 				}
 
-				if (strong_type == O_CLASS)
-				{
-					stack->classes[p->start.i]=
-						stack->classes[p->end.i]=
-						strong_type;
-					set=1;
-				}
+				stack->classes[p->start.i]=
+					stack->classes[p->end.i]=
+					strong_type;
+				set=1;
 			}
 
 			if (set)
@@ -1581,16 +1710,6 @@ static void unicode_bidi_n(directional_status_stack_t stack,
 
 	/* N1 */
 
-#define IS_NI(class)						\
-	((class) == UNICODE_BIDI_TYPE_B ||			\
-	 (class) == UNICODE_BIDI_TYPE_S ||			\
-	 (class) == UNICODE_BIDI_TYPE_WS ||			\
-	 (class) == UNICODE_BIDI_TYPE_ON ||			\
-	 (class) == UNICODE_BIDI_TYPE_FSI ||			\
-	 (class) == UNICODE_BIDI_TYPE_LRI ||			\
-	 (class) == UNICODE_BIDI_TYPE_RLI ||			\
-	 (class) == UNICODE_BIDI_TYPE_PDI)
-
 	enum_bidi_type_t prev_type=seq->sos;
 
 	for (iter=beg; irs_compare(&iter, &end); )
diff --git a/unicode/unicode_canonical.c b/unicode/unicode_canonical.c
new file mode 100644
index 0000000..3f6773f
--- /dev/null
+++ b/unicode/unicode_canonical.c
@@ -0,0 +1,57 @@
+/*
+** Copyright 2020 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include	"unicode_config.h"
+#include	"courier-unicode.h"
+#include <string.h>
+
+struct canon_map_table {
+	char32_t lookup_char; /* The character that this entry describes */
+	unsigned char fmt_flag_v; /* A unicode_canonical_fmt_t value, kept in an unsigned char */
+	unsigned char n_chars; /* Number of characters in the mapping */
+	unsigned short offset; /* Index of the mapping's first character in canon_map_values */
+};
+
+#include "canonicalmappings.h"
+
+unicode_canonical_t unicode_canonical(char32_t c)
+{
+	const size_t n_entries=
+		sizeof(canon_map_lookup)/sizeof(canon_map_lookup[0]);
+	const char32_t bucket=c % HASH_SIZE;
+	unicode_canonical_t ret;
+	size_t i;
+
+	/* Compile-time sanity check: canon_map_hash has HASH_SIZE entries */
+	(void)sizeof(char[ sizeof(canon_map_hash)/
+			   sizeof(canon_map_hash[0]) == HASH_SIZE
+			   ? 1:-1]);
+
+	/*
+	** canon_map_hash gives the index of the first canon_map_lookup
+	** entry to check. Scan forward for as long as the entries still
+	** hash to the same bucket as c.
+	*/
+	for (i=canon_map_hash[bucket];
+	     i < n_entries &&
+		     canon_map_lookup[i].lookup_char % HASH_SIZE == bucket;
+	     ++i)
+	{
+		if (canon_map_lookup[i].lookup_char != c)
+			continue;
+
+		ret.canonical_chars=
+			canon_map_values+canon_map_lookup[i].offset;
+		ret.n_canonical_chars=canon_map_lookup[i].n_chars;
+		ret.format=(unicode_canonical_fmt_t)
+			canon_map_lookup[i].fmt_flag_v;
+		return ret;
+	}
+
+	/* Not found: null pointer, zero count, UNICODE_CANONICAL_FMT_NONE */
+	memset(&ret, 0, sizeof(ret));
+	return ret;
+}
diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C
index adb7869..ca139cc 100644
--- a/unicode/unicodecpp.C
+++ b/unicode/unicodecpp.C
@@ -558,13 +558,13 @@ std::u32string unicode::toupper(const std::u32string &u)
 	return copy;
 }
 
-std::vector<unicode_bidi_level_t>
+std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>
 unicode::bidi_calc(const std::u32string &s)
 {
 	return unicode::bidi_calc(s, UNICODE_BIDI_SKIP);
 }
 
-std::vector<unicode_bidi_level_t>
+std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>
 unicode::bidi_calc(const std::u32string &s,
 		   unicode_bidi_level_t paragraph_embedding_level)
 {
@@ -576,16 +576,19 @@ unicode::bidi_calc(const std::u32string &s,
 		initial_embedding_level=&paragraph_embedding_level;
 	}
 
-	std::vector<unicode_bidi_level_t> buf;
+	std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>
+		ret;
 
-	buf.resize(s.size());
+	std::get<0>(ret).resize(s.size());
+	std::get<1>(ret)=UNICODE_BIDI_LR;
 
 	if (s.size())
 	{
-		unicode_bidi_calc(s.c_str(), s.size(), &buf[0],
-				  initial_embedding_level);
+		std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(),
+						   &std::get<0>(ret)[0],
+						   initial_embedding_level);
 	}
-	return buf;
+	return ret;
 }
 
 extern "C" {
author	Sam Varshavchik	2020-07-10 08:25:53 -0400
committer	Sam Varshavchik	2020-07-12 15:56:45 -0400
commit	f94fc14a9f3019f110c71d084f4bc59261434519 (patch)
tree	a2c8bfc5b325f9bb0516b14700effc97084185dc /unicode
parent	1ef92db9dbbefff98b93c8c66e4693a31b4f31a5 (diff)
download	courier-libs-f94fc14a9f3019f110c71d084f4bc59261434519.tar.bz2