diff options
| author | Sam Varshavchik | 2020-07-10 08:25:53 -0400 | 
|---|---|---|
| committer | Sam Varshavchik | 2020-07-12 15:56:45 -0400 | 
| commit | f94fc14a9f3019f110c71d084f4bc59261434519 (patch) | |
| tree | a2c8bfc5b325f9bb0516b14700effc97084185dc | |
| parent | 1ef92db9dbbefff98b93c8c66e4693a31b4f31a5 (diff) | |
| download | courier-libs-f94fc14a9f3019f110c71d084f4bc59261434519.tar.bz2 | |
Implement unicode_canonical.
Fixes biditest2.
| -rw-r--r-- | unicode/.gitignore | 2 | ||||
| -rw-r--r-- | unicode/Makefile.am | 15 | ||||
| -rw-r--r-- | unicode/biditest.C | 32 | ||||
| -rw-r--r-- | unicode/biditest2.C | 203 | ||||
| -rw-r--r-- | unicode/book.xml | 89 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 67 | ||||
| -rw-r--r-- | unicode/mkcanonical.pl | 110 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 207 | ||||
| -rw-r--r-- | unicode/unicode_canonical.c | 57 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 17 | 
10 files changed, 712 insertions, 87 deletions
| diff --git a/unicode/.gitignore b/unicode/.gitignore index 1bdc8ce..8905e05 100644 --- a/unicode/.gitignore +++ b/unicode/.gitignore @@ -19,6 +19,8 @@  /WordBreakTest.txt  /emoji-data.txt  /biditest +/biditest2 +/canonicalmappings.h  /config.cache  /config.guess  /config.sub diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 8b1d3cf..83034c5 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -11,6 +11,7 @@ noinst_SCRIPTS=update.sh		\  	mkbidi.pl			\  	mkbidiclass.pl			\  	mkbidiclassnames.pl		\ +	mkcanonical.pl			\  	mkeastasianwidth.pl		\  	mkemojidata.pl			\  	mkgraphemebreak.pl		\ @@ -20,7 +21,7 @@ noinst_SCRIPTS=update.sh		\  	mkwordbreak.pl  noinst_PROGRAMS=unicodetest graphemetest linebreaktest wordbreaktest \ -	enttest scripttest biditest +	enttest scripttest biditest biditest2  aclocaldir=$(datadir)/aclocal  aclocal_DATA=m4/courier-unicode.m4 @@ -111,6 +112,7 @@ man_MANS= \          $(srcdir)/man/unicode_bidi_calc.3 \          $(srcdir)/man/unicode_bidi_mirror.3 \          $(srcdir)/man/unicode_bidi_reorder.3 \ +        $(srcdir)/man/unicode_canonical.3 \          $(srcdir)/man/unicode_category_lookup.3 \          $(srcdir)/man/unicode_convert.3 \          $(srcdir)/man/unicode_convert_deinit.3 \ @@ -187,6 +189,7 @@ libcourier_unicode_la_SOURCES=\  			unicode_graphemebreak.c graphemebreaktab.h \  			unicode_linebreak.c linebreaktab.h \  			unicode_htmlent.c unicode_htmlent.h \ +			unicode_canonical.c canonicalmappings.h \  			linebreaktab_internal.h \  			unicode_wordbreak.c wordbreaktab.h scriptstab.h \  			unicode_emoji.c emojitab.h \ @@ -214,6 +217,7 @@ BUILT_SOURCES=unicode_ultcasetab.c \  	bidi_class.h \  	bidi_classnames.h \  	bidi_mirroring.h \ +	canonicalmappings.h \  	categoriestab.h \  	eastasianwidth.h \  	emojitab.h \ @@ -277,6 +281,9 @@ bidi_classnames.h: unicode_bidi.c mkbidiclassnames.pl  	@PERL@ $(srcdir)/mkbidiclassnames.pl <$(srcdir)/courier-unicode.h.in >bidi_classnames.h.tmp  	mv bidi_classnames.h.tmp bidi_classnames.h +canonicalmappings.h: UnicodeData.txt mkcanonical.pl +	@PERL@ -I$(srcdir) $(srcdir)/mkcanonical.pl >canonicalmappings.h.tmp +	mv canonicalmappings.h.tmp canonicalmappings.h  endif  unicodetest_SOURCES=unicodetest.c @@ -314,7 +321,10 @@ biditest_DEPENDENCIES=libcourier-unicode.la  biditest_LDADD=libcourier-unicode.la  biditest_LDFLAGS=-static - +biditest2_SOURCES=biditest2.C +biditest2_DEPENDENCIES=libcourier-unicode.la +biditest2_LDADD=libcourier-unicode.la +biditest2_LDFLAGS=-static  check-am: unicodetest  	./unicodetest @@ -385,6 +395,7 @@ check-am: unicodetest  	test "`./biditest 8261`" = "8262 8262 o"  	test "`./biditest 8262`" = "8261 8261 c"  	./biditest +	./biditest2  if HAVE_DOCS diff --git a/unicode/biditest.C b/unicode/biditest.C index 6343866..2d2a6e5 100644 --- a/unicode/biditest.C +++ b/unicode/biditest.C @@ -35,8 +35,10 @@ int main(int argc, char **argv)  	std::ifstream fp("BidiTest.txt");  	if (!fp.is_open()) +	{ +		std::cerr << "Cannot open BidiTest.txt" << std::endl;  		exit(1); - +	}  	size_t linenum=0;  	size_t nextlogline=0;  	std::string logmsg; @@ -178,9 +180,10 @@ int main(int argc, char **argv)  		{  			if (n & 1)  			{ -				actual_levels=level ? +				auto ret=level ?  					unicode::bidi_calc(dummy_input,*level)  					: unicode::bidi_calc(dummy_input); +				actual_levels=std::get<0>(ret);  				int matched=0; @@ -350,31 +353,6 @@ extern "C" {  #include "unicode_bidi.c" -static const struct { -	char			classname[8]; -	enum_bidi_type_t	classenum; -} bidiclassnames[]={ - -#include "bidi_classnames.h" - -}; - -const char *bidi_classname(enum_bidi_type_t classenum) -{ -	for (const auto &cn:bidiclassnames) -	{ -		if (cn.classenum == classenum) -			return cn.classname; -	} - -	return "???"; -} - -static const char *lookup_classname(const std::string &s) -{ -	abort(); -} -  enum_bidi_type_t fudge_unicode_bidi(size_t i)  {  	if (i >= testcase.size()) diff --git a/unicode/biditest2.C b/unicode/biditest2.C new file mode 100644 index 0000000..f497bcf --- /dev/null +++ b/unicode/biditest2.C @@ -0,0 +1,203 @@ +#include	"unicode_config.h" +#include	"courier-unicode.h" +#include	<iostream> +#include	<sstream> +#include	<fstream> +#include	<cstdint> +#include	<iomanip> + +FILE *DEBUGDUMP; + +int main(int argc, char **argv) +{ +	std::ifstream fp("BidiCharacterTest.txt"); + +	if (!fp.is_open()) +	{ +		std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl; +		exit(1); +	} + +	DEBUGDUMP=fopen("/dev/null", "w"); +	if (!DEBUGDUMP) +	{ +		perror("/dev/null"); +		exit(1); +	} + +	std::string buf; + +	size_t linenum=0; + +	while (1) +	{ +		buf.clear(); + +		if (std::getline(fp, buf).eof() && buf.empty()) +			break; +		++linenum; + +		auto p=buf.find('#'); + +		if (p != buf.npos) +			buf=buf.substr(0, p); + +		p=buf.find(';'); + +		if (p == buf.npos) +			continue; + +		std::istringstream chars{buf.substr(0, p)}; + +		auto q=buf.find(';', ++p); + +		if (q == buf.npos) +		{ +			std::cerr << "Cannot parse line " << linenum +				  << std::endl; +			exit(2); +		} + +		int direction; + +		if (!(std::istringstream{buf.substr(p, q-p)} >> direction)) +		{ +			std::cerr << "Cannot parse line " << linenum +				  << std::endl; +			exit(3); +		} + +		p=++q; +		q=buf.find(';', p); + +		if (q == buf.npos) +		{ +			std::cerr << "Cannot parse line " << linenum +				  << std::endl; +			exit(4); +		} + +		int paragraph_embedding_level; + +		if (!(std::istringstream{buf.substr(p, q-p)} >> +		      paragraph_embedding_level)) +		{ +			std::cerr << "Cannot parse line " << linenum +				  << std::endl; +			exit(5); +		} +		p=++q; +		q=buf.find(';', p); + +		if (q == buf.npos) +		{ +			std::cerr << "Cannot parse line " << linenum +				  << std::endl; +			exit(6); +		} + +		std::vector<unicode_bidi_level_t> levels; + +		{ +			std::istringstream level_s{buf.substr(p, q-p)}; + +			std::string s; + +			while (level_s >> s) +			{ +				size_t l; + +				if (!(std::istringstream{s} >> l)) +				{ +					l=UNICODE_BIDI_SKIP; +				} +				levels.push_back(l); +			} +		} + +		std::vector<size_t> render_order; + +		{ +			size_t n; + +			std::istringstream order_i{buf.substr(++q)}; + +			while (order_i >> n) +				render_order.push_back(n); +		} +		std::u32string s; +		uintmax_t c; + +		while (chars >> std::hex >> c) +			s.push_back(c); + +		auto ret=direction == UNICODE_BIDI_LR || +			direction == UNICODE_BIDI_RL +			? unicode::bidi_calc(s, direction) +			: unicode::bidi_calc(s); + +		if (std::get<1>(ret) != paragraph_embedding_level) +		{ +			std::cerr << "Regression, line " +				  << linenum +				  << ": expected " +				  << paragraph_embedding_level +				  << " paragraph embedding level, got " +				  << (int)std::get<1>(ret) +				  << std::endl; +			exit(1); +		} + +		if (std::get<0>(ret) != levels) +		{ +			fclose(DEBUGDUMP); +			DEBUGDUMP=stderr; + +			(void)(direction == UNICODE_BIDI_LR || +			       direction == UNICODE_BIDI_RL +			       ? unicode::bidi_calc(s, direction) +			       : unicode::bidi_calc(s)); + +			std::cerr << "Regression, line " +				  << linenum +				  << ": embedding levels" +				  << std::endl +				  << "   Expected:"; + +			for (int l:levels) +			{ +				std::cerr << " "; +				if (l == UNICODE_BIDI_SKIP) +					std::cerr << "x"; +				else +					std::cerr << l; +			} + +			std::cerr << std::endl +				  << "     Actual:"; + +			for (int l:std::get<0>(ret)) +			{ +				std::cerr << " "; +				if (l == UNICODE_BIDI_SKIP) +					std::cerr << "x"; +				else +					std::cerr << l; +			} +			std::cerr << std::endl; +			exit(1); +		} +	} +	return 0; +} + +#define BIDI_DEBUG + +extern "C" { +#if 0 +} +#endif + +#include "unicode_bidi.c" + +} diff --git a/unicode/book.xml b/unicode/book.xml index 9c1486c..ad0009a 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -4,6 +4,7 @@  <!ENTITY tr9ver  "42">  <!ENTITY tr14ver "45"> +<!ENTITY tr15ver "50">  <!ENTITY tr24ver "31">  <!ENTITY tr29ver "37">  <!ENTITY tr51ver "18"> @@ -232,6 +233,9 @@ See COPYING for distribution information.  	    <link linkend="unicode_bidi">  	      <citerefentry><refentrytitle>unicode_bidi</refentrytitle>  	      <manvolnum>3</manvolnum></citerefentry></link>, +	    <link linkend="unicode_canonical"> +	      <citerefentry><refentrytitle>unicode_canonical</refentrytitle> +	      <manvolnum>3</manvolnum></citerefentry></link>,  	    <link linkend="unicode_category_lookup">  	      <citerefentry><refentrytitle>unicode_category_lookup</refentrytitle>  	      <manvolnum>3</manvolnum></citerefentry></link>, @@ -475,6 +479,91 @@ See COPYING for distribution information.  	</refsect1>        </refentry> +      <refentry id="unicode_canonical"> +	<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> + +	<refmeta> +	  <refentrytitle>unicode_canonical</refentrytitle> +	  <manvolnum>3</manvolnum> +	</refmeta> + +	<refnamediv> +	  <refname>unicode_canonical</refname> + +	  <refpurpose>unicode canonical character mapping</refpurpose> +	</refnamediv> + +	<refsynopsisdiv> +	  <funcsynopsis> +	    <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> +	    <funcprototype> +	      <funcdef>unicode_canonical_t <function>unicode_canonical</function></funcdef> +              <paramdef>char32_t <parameter>c</parameter></paramdef> +	    </funcprototype> +	  </funcsynopsis> +	</refsynopsisdiv> +	<refsect1> +	  <title>DESCRIPTION</title> + +	  <para> +	    <function>unicode_canonical</function>() looks up the +	    character's +	    <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html +">canonical +	    and compatibility mapping</ulink>. + +	    <function>unicode_canonical</function>() returns a structure +	    with the following fields: +	  </para> + +	  <variablelist> +	    <varlistentry> +	      <term><structfield>canonical_chars</structfield></term> +	      <listitem> +		<para> +		  A pointer to the canonical or equivalent representation +		  of the character. +	        </para> +	      </listitem> +	    </varlistentry> +	    <varlistentry> +	      <term><structfield>n_canonical_chars</structfield></term> +	      <listitem> +		<para> +		  Number of characters in the +		  <structfield>canonical_chars</structfield>. +	        </para> +	      </listitem> +	    </varlistentry> +	    <varlistentry> +	      <term><structfield>format</structfield></term> +	      <listitem> +		<para> +		  The character's canonical formatting flag, if any. +	        </para> +	      </listitem> +	    </varlistentry> +	  </variablelist> + +	  <para> +	    A NULL <structfield>canonical_chars</structfield> (with a 0 +	    <structfield>n_canonical_chars</structfield>) indicates +	    that the character without a canonical or compatibility +	    equivalence. +	  </para> +	</refsect1> +	<refsect1> +	  <title>SEE ALSO</title> +	  <para> +	    <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>, +	    <link linkend="courier-unicode"> +	      <citerefentry> +		<refentrytitle>courier-unicode</refentrytitle> +		<manvolnum>7</manvolnum></citerefentry></link>. +	    </para> +	</refsect1> +      </refentry> +        <refentry id="unicode_category_lookup">  	<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index 55a7152..c8161ea 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -584,6 +584,15 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i);  ** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates  ** directional markers (from step X9). These characters should be removed  ** before using unicode_bidi_reorder(). +** +** unicode_bidi_calc() returns the resolved paragraph direction level, which +** always matches the passed in level, if specified, else it reports the +** derived one. +** +** unicode_bidi_reorder() reorders the characters according to the resolved +** embedding levels. A non-null reorder_callback gets invoked repeatedly, +** indicating the starting index and the number of characters reversed, so +** that any related metadata can be updated accordingly.  */  typedef char unicode_bidi_bracket_type_t; @@ -604,10 +613,10 @@ typedef unsigned char unicode_bidi_level_t;  #define UNICODE_BIDI_RL		((unicode_bidi_level_t)1)  #define UNICODE_BIDI_SKIP	((unicode_bidi_level_t)254) -extern void unicode_bidi_calc(const char32_t *p, size_t n, -			      unicode_bidi_level_t *bufp, -			      const unicode_bidi_level_t * -			      initial_embedding_level); +extern unicode_bidi_level_t unicode_bidi_calc(const char32_t *p, size_t n, +					      unicode_bidi_level_t *bufp, +					      const unicode_bidi_level_t * +					      initial_embedding_level);  extern void unicode_bidi_reorder(char32_t *p,  				 unicode_bidi_level_t *levels, @@ -646,6 +655,48 @@ typedef enum {  extern enum_bidi_type_t unicode_bidi_type(char32_t c);  /* +** unicode_canonical() returns the canonical mapping of the given Unicode +** character. The returned structure specifies: +** +** - A pointer to the canonical decomposition of the given character. +** - Number of characters in the canonical decomposition. +** - An optional formatting tag. +** +** A null pointer, and a 0 character count gets returned for characters +** without a canonical decomposition. +** +*/ + +typedef enum { +	      UNICODE_CANONICAL_FMT_NONE=0, + +	      UNICODE_CANONICAL_FMT_CIRCLE, +	      UNICODE_CANONICAL_FMT_COMPAT, +	      UNICODE_CANONICAL_FMT_FINAL, +	      UNICODE_CANONICAL_FMT_FONT, +	      UNICODE_CANONICAL_FMT_FRACTION, +	      UNICODE_CANONICAL_FMT_INITIAL, +	      UNICODE_CANONICAL_FMT_ISOLATED, +	      UNICODE_CANONICAL_FMT_MEDIAL, +	      UNICODE_CANONICAL_FMT_NARROW, +	      UNICODE_CANONICAL_FMT_NOBREAK, +	      UNICODE_CANONICAL_FMT_SMALL, +	      UNICODE_CANONICAL_FMT_SQUARE, +	      UNICODE_CANONICAL_FMT_SUB, +	      UNICODE_CANONICAL_FMT_SUPER, +	      UNICODE_CANONICAL_FMT_VERTICAL, +	      UNICODE_CANONICAL_FMT_WIDE, +} unicode_canonical_fmt_t; + +typedef struct { +	const char32_t *canonical_chars; +	size_t n_canonical_chars; +	unicode_canonical_fmt_t format; +} unicode_canonical_t; + +extern unicode_canonical_t unicode_canonical(char32_t); + +/*  ** A buffer that holds unicode characters, and dynamically grows as needed.  */ @@ -2066,11 +2117,13 @@ std::u32string tolower(const std::u32string &u);  std::u32string toupper(const std::u32string &u);  //! Calculate bidirectional embedding levels -std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s); +std::tuple<std::vector<unicode_bidi_level_t>, +	   unicode_bidi_level_t> bidi_calc(const std::u32string &s);  //! Calculate bidirectional embedding levels -std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s, -					    unicode_bidi_level_t level); +std::tuple<std::vector<unicode_bidi_level_t>, +	   unicode_bidi_level_t> bidi_calc(const std::u32string &s, +					   unicode_bidi_level_t level);  //! Reorder bidirectional text  int bidi_reorder(std::u32string &string, diff --git a/unicode/mkcanonical.pl b/unicode/mkcanonical.pl new file mode 100644 index 0000000..43d7e44 --- /dev/null +++ b/unicode/mkcanonical.pl @@ -0,0 +1,110 @@ +#! /usr/bin/perl +# +# Creates a lookup table for canonical mappings in UnicodeData.txt + +use strict; +use warnings; + +open(F, "<UnicodeData.txt") || die; + +my @mappings; +my @data; + +while (defined($_=<F>)) +{ +    my @w=split(/;/, $_, -1); + +    next unless $w[5]; + +    my $code=$w[0]; + +    my @mapping=split(/\s/, $w[5]); + +    my $formatting_tag = "UNICODE_CANONICAL_FMT_NONE"; + +    if ($mapping[0] =~ /^</) +    { +	$formatting_tag = shift @mapping; + +	$formatting_tag =~ s/<//g; +	$formatting_tag =~ s/>//g; +	$formatting_tag = "UNICODE_CANONICAL_FMT_" . uc($formatting_tag); +    }; + +    die "Too long\n" if (scalar @mapping) > 0xFFFF; + +    my $dec_code; + +    eval "\$dec_code=0x$code\n"; + +    push @data, [$dec_code, "\t{0x$code, (unsigned char)$formatting_tag, " +	. (scalar @mapping) . ", " +	. scalar(@mappings) . "}" ]; +    push @mappings, @mapping; +} + +my $hash_size = int( (scalar @data) * 3 / 4); + +my %buckets; + +my $keep_going = 1; + +while ($keep_going) +{ +    %buckets = (); + +    $keep_going = 0; + +    foreach my $m (@data) +    { +	my $bucket = $m->[0] % $hash_size; + +	push @{$buckets{$bucket}}, $m; + +	if ((scalar @{$buckets{$bucket}}) > 3) +	{ +	    $keep_going = 1; +	    ++$hash_size; +	    last; +	} +    } +} + +print "#define HASH_SIZE $hash_size\n"; + +@data = (); + +my $pfix = ""; + +print "static const unsigned short canon_map_hash[]={\n"; + +foreach my $bucket (0.. ($hash_size)-1) +{ +    print "$pfix\t" . (scalar @data); +    $pfix = ",\n"; + +    push @data, @{ $buckets{$bucket} // [] }; +} + + +print "};\n\nstatic const struct canon_map_table canon_map_lookup[]={\n"; + +$pfix = ""; + +foreach my $m (@data) +{ +    print "$pfix" . $m->[1]; +    $pfix = ",\n"; +} + +print "\n};\n\nstatic const char32_t canon_map_values[]={\n"; + +$pfix=""; + +foreach my $v (@mappings) +{ +    print "$pfix\t0x$v"; + +    $pfix=",\n"; +} +print "};\n"; diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index da15966..055ee89 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -467,6 +467,28 @@ typedef struct {  } *directional_status_stack_t;  #ifdef BIDI_DEBUG + +static const struct { +	char			classname[8]; +	enum_bidi_type_t	classenum; +} bidiclassnames[]={ + +#include "bidi_classnames.h" + +}; + +const char *bidi_classname(enum_bidi_type_t classenum) +{ +	for (const auto &cn:bidiclassnames) +	{ +		if (cn.classenum == classenum) +			return cn.classname; +	} + +	return "???"; +} + +  void dump_classes(const char *prefix, directional_status_stack_t stack)  {  	fprintf(DEBUGDUMP, "%s: ", prefix); @@ -621,11 +643,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack)  	free(stack);  } -static void unicode_bidi_b(const char32_t *p, -			   size_t n, -			   enum_bidi_type_t *buf, -			   unicode_bidi_level_t *bufp, -			   const unicode_bidi_level_t *initial_embedding_level); +static unicode_bidi_level_t +unicode_bidi_b(const char32_t *p, +	       size_t n, +	       enum_bidi_type_t *buf, +	       unicode_bidi_level_t *bufp, +	       const unicode_bidi_level_t *initial_embedding_level);  enum_bidi_type_t unicode_bidi_type(char32_t c)  { @@ -639,8 +662,9 @@ enum_bidi_type_t unicode_bidi_type(char32_t c)  				   UNICODE_BIDI_TYPE_L);  } -void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, -		       const unicode_bidi_level_t *initial_embedding_level) +unicode_bidi_level_t +unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, +		  const unicode_bidi_level_t *initial_embedding_level)  {  	/*  	** Look up the bidi class for each char32_t. @@ -661,27 +685,33 @@ void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,  		bufp[i]=UNICODE_BIDI_SKIP;  	} -	unicode_bidi_b(p, n, -		       buf, -		       bufp, -		       initial_embedding_level); +	unicode_bidi_level_t level=unicode_bidi_b(p, n, +						  buf, +						  bufp, +						  initial_embedding_level);  	free(buf); + +	return level;  }  static void unicode_bidi_cl(directional_status_stack_t stack); -static void unicode_bidi_b(const char32_t *p, -			   size_t n, -			   enum_bidi_type_t *buf, -			   unicode_bidi_level_t *bufp, -			   const unicode_bidi_level_t *initial_embedding_level) +static unicode_bidi_level_t +unicode_bidi_b(const char32_t *p, +	       size_t n, +	       enum_bidi_type_t *buf, +	       unicode_bidi_level_t *bufp, +	       const unicode_bidi_level_t *initial_embedding_level)  {  	directional_status_stack_t stack;  	stack=directional_status_stack_init(p, buf, n, bufp,  					    initial_embedding_level); +	unicode_bidi_level_t paragraph_embedding_level= +		stack->paragraph_embedding_level; +  #ifdef BIDI_DEBUG  	fprintf(DEBUGDUMP, "BIDI: START: Paragraph embedding level: %d\n",  		(int)stack->paragraph_embedding_level); @@ -690,6 +720,8 @@ static void unicode_bidi_b(const char32_t *p,  	unicode_bidi_cl(stack);  	directional_status_stack_deinit(stack); + +	return paragraph_embedding_level;  }  #define RESET_CLASS(p,stack) do {				\ @@ -1173,6 +1205,8 @@ static void unicode_bidi_cl(directional_status_stack_t stack)  	{  #ifdef BIDI_DEBUG  		dump_sequence_info(stack, p); +		fprintf(DEBUGDUMP, "Sequence embedding level: %d\n", +			(int)p->embedding_level);  		dump_sequence("Contents before W", stack, p);  #endif @@ -1408,6 +1442,16 @@ struct bidi_n_stack {  	short matched;  }; +#define IS_NI(class)						\ +	((class) == UNICODE_BIDI_TYPE_B ||			\ +	 (class) == UNICODE_BIDI_TYPE_S ||			\ +	 (class) == UNICODE_BIDI_TYPE_WS ||			\ +	 (class) == UNICODE_BIDI_TYPE_ON ||			\ +	 (class) == UNICODE_BIDI_TYPE_FSI ||			\ +	 (class) == UNICODE_BIDI_TYPE_LRI ||			\ +	 (class) == UNICODE_BIDI_TYPE_RLI ||			\ +	 (class) == UNICODE_BIDI_TYPE_PDI) +  static void unicode_bidi_n(directional_status_stack_t stack,  			   struct isolating_run_sequence_s *seq)  { @@ -1430,45 +1474,86 @@ static void unicode_bidi_n(directional_status_stack_t stack,  	for (; irs_compare(&iter, &end); irs_incr(&iter))  	{ -		unicode_bidi_bracket_type_t bracket_type; -		char32_t open_bracket= -			unicode_bidi_bracket_type(stack->chars[iter.i], -						  &bracket_type); +		unicode_bidi_bracket_type_t bracket_type=UNICODE_BIDI_n; + +		char32_t open_or_close_bracket=0; + +		if (IS_NI(stack->classes[iter.i])) +		{ +			open_or_close_bracket= +				unicode_bidi_bracket_type(stack->chars[iter.i], +							  &bracket_type); +		}  		if (bracket_type == UNICODE_BIDI_o)  		{  			if (stackp >= NSTACKSIZE) +			{ +#ifdef BIDI_DEBUG +				fprintf(DEBUGDUMP, +					"BD16 stack exceeded on index %d\n", +					(int)iter.i); +#endif  				break; /* BD16 failure */ - +			}  			if (!((*bracket_stack_tail)=(struct bidi_n_stack *)  			      calloc(1, sizeof(struct bidi_n_stack))))  				abort();  			stack_iters[stackp]=*bracket_stack_tail; - -			(*bracket_stack_tail)->start=iter; +			stack_iters[stackp]->start=iter;  			stack_chars[stackp]=stack->chars[iter.i]; +			unicode_canonical_t canon= +				unicode_canonical(stack_chars[stackp]); + +			if (canon.n_canonical_chars == 1 && +			    !canon.format) +			{ +				stack_chars[stackp]= +					canon.canonical_chars[0]; +			} +  			bracket_stack_tail= &(*bracket_stack_tail)->next;  			++stackp; -			continue; +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, "Found opening bracket at index %d\n", +				(int)iter.i); +#endif  		} -		if (bracket_type == UNICODE_BIDI_c) /* Should be "n" */ +		if (bracket_type == UNICODE_BIDI_c)  		{ +			unicode_canonical_t canon= +				unicode_canonical(open_or_close_bracket); + +			if (canon.n_canonical_chars == 1 && +			    !canon.format) +			{ +				open_or_close_bracket= +					canon.canonical_chars[0]; +			} +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, "Found closing bracket at index %d\n", +				(int)iter.i); +#endif  			for (size_t i=stackp; i > 0; )  			{  				--i; -				if (stack_chars[i] != open_bracket) +				if (stack_chars[i] != open_or_close_bracket)  					continue; +#ifdef BIDI_DEBUG +				fprintf(DEBUGDUMP, +					"Matched to open bracket at index %d\n", +					(int)stack_iters[i]->start.i); +#endif  				stack_iters[i]->end = iter;  				stack_iters[i]->matched=1;  				stackp=i;  				break;  			} -			continue;  		}  		/* @@ -1496,11 +1581,41 @@ static void unicode_bidi_n(directional_status_stack_t stack,  		if (eoclass == E_CLASS)  		{ +#ifdef BIDI_DEBUG +			if (stackp) +			{ +				fprintf(DEBUGDUMP, +					"Found e for brackets at:"); + +				for (size_t i=0; i<stackp; ++i) +				{ +					fprintf(DEBUGDUMP, +						" %d", +						(int)stack_iters[i]->start.i); +				} +				fprintf(DEBUGDUMP, "\n"); +			} +#endif  			for (size_t i=0; i<stackp; ++i)  				stack_iters[i]->has_e=1;  		}  		else if (eoclass == O_CLASS)  		{ +#ifdef BIDI_DEBUG +			if (stackp) +			{ +				fprintf(DEBUGDUMP, +					"Found o for brackets at:"); + +				for (size_t i=0; i<stackp; ++i) +				{ +					fprintf(DEBUGDUMP, +						" %d", +						(int)stack_iters[i]->start.i); +				} +				fprintf(DEBUGDUMP, "\n"); +			} +#endif  			for (size_t i=0; i<stackp; ++i)  				stack_iters[i]->has_o=1;  		} @@ -1516,6 +1631,18 @@ static void unicode_bidi_n(directional_status_stack_t stack,  		{  			int set=0; +#ifdef BIDI_DEBUG +			fprintf(DEBUGDUMP, +				"Brackets: %d and %d: e=%s, o=%s", +				(int)p->start.i, +				(int)p->end.i, +				bidi_classname(E_CLASS), +				bidi_classname(O_CLASS)); + +			fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n", +				p->has_e, +				p->has_o); +#endif  			if (p->has_e)  			{  				stack->classes[p->start.i]= @@ -1548,16 +1675,18 @@ static void unicode_bidi_n(directional_status_stack_t stack,  					}  					strong_type=eoclass; +#ifdef BIDI_DEBUG +					fprintf(DEBUGDUMP, +						"Brackets: O context: %s\n", +						bidi_classname(strong_type)); +#endif  					break;  				} -				if (strong_type == O_CLASS) -				{ -					stack->classes[p->start.i]= -						stack->classes[p->end.i]= -						strong_type; -					set=1; -				} +				stack->classes[p->start.i]= +					stack->classes[p->end.i]= +					strong_type; +				set=1;  			}  			if (set) @@ -1581,16 +1710,6 @@ static void unicode_bidi_n(directional_status_stack_t stack,  	/* N1 */ -#define IS_NI(class)						\ -	((class) == UNICODE_BIDI_TYPE_B ||			\ -	 (class) == UNICODE_BIDI_TYPE_S ||			\ -	 (class) == UNICODE_BIDI_TYPE_WS ||			\ -	 (class) == UNICODE_BIDI_TYPE_ON ||			\ -	 (class) == UNICODE_BIDI_TYPE_FSI ||			\ -	 (class) == UNICODE_BIDI_TYPE_LRI ||			\ -	 (class) == UNICODE_BIDI_TYPE_RLI ||			\ -	 (class) == UNICODE_BIDI_TYPE_PDI) -  	enum_bidi_type_t prev_type=seq->sos;  	for (iter=beg; irs_compare(&iter, &end); ) diff --git a/unicode/unicode_canonical.c b/unicode/unicode_canonical.c new file mode 100644 index 0000000..3f6773f --- /dev/null +++ b/unicode/unicode_canonical.c @@ -0,0 +1,57 @@ +/* +** Copyright 2020 Double Precision, Inc. +** See COPYING for distribution information. +** +*/ + +#include	"unicode_config.h" +#include	"courier-unicode.h" +#include <string.h> + +struct canon_map_table { +	char32_t lookup_char; +	unsigned char fmt_flag_v; +	unsigned char n_chars; +	unsigned short offset; +}; + +#include "canonicalmappings.h" + +unicode_canonical_t unicode_canonical(char32_t c) +{ +	size_t i=canon_map_hash[c % HASH_SIZE] +		+ +		/* Compile-time sanity check */ +		sizeof(char[ sizeof(canon_map_hash)/ +			     sizeof(canon_map_hash[0]) == HASH_SIZE +			     ? 1:-1])*0; + +	while (i < sizeof(canon_map_lookup)/sizeof(canon_map_lookup[0])) +	{ +		if (canon_map_lookup[i].lookup_char == c) +		{ +			unicode_canonical_t ret; + +			ret.canonical_chars= +				canon_map_values+canon_map_lookup[i].offset; +			ret.n_canonical_chars= +				canon_map_lookup[i].n_chars; +			ret.format= +				(unicode_canonical_fmt_t) +				canon_map_lookup[i].fmt_flag_v; + +			return ret; +		} + +		if ((canon_map_lookup[i].lookup_char % HASH_SIZE) != +		    (c % HASH_SIZE)) +			break; +		++i; +	} + +	unicode_canonical_t ret; + +	memset(&ret, 0, sizeof(ret)); + +	return ret; +} diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index adb7869..ca139cc 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -558,13 +558,13 @@ std::u32string unicode::toupper(const std::u32string &u)  	return copy;  } -std::vector<unicode_bidi_level_t> +std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>  unicode::bidi_calc(const std::u32string &s)  {  	return unicode::bidi_calc(s, UNICODE_BIDI_SKIP);  } -std::vector<unicode_bidi_level_t> +std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>  unicode::bidi_calc(const std::u32string &s,  		   unicode_bidi_level_t paragraph_embedding_level)  { @@ -576,16 +576,19 @@ unicode::bidi_calc(const std::u32string &s,  		initial_embedding_level=¶graph_embedding_level;  	} -	std::vector<unicode_bidi_level_t> buf; +	std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> +		ret; -	buf.resize(s.size()); +	std::get<0>(ret).resize(s.size()); +	std::get<1>(ret)=UNICODE_BIDI_LR;  	if (s.size())  	{ -		unicode_bidi_calc(s.c_str(), s.size(), &buf[0], -				  initial_embedding_level); +		std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(), +						   &std::get<0>(ret)[0], +						   initial_embedding_level);  	} -	return buf; +	return ret;  }  extern "C" { | 
