diff options
| author | Sam Varshavchik | 2020-07-10 08:25:53 -0400 |
|---|---|---|
| committer | Sam Varshavchik | 2020-07-12 15:56:45 -0400 |
| commit | f94fc14a9f3019f110c71d084f4bc59261434519 (patch) | |
| tree | a2c8bfc5b325f9bb0516b14700effc97084185dc /unicode | |
| parent | 1ef92db9dbbefff98b93c8c66e4693a31b4f31a5 (diff) | |
| download | courier-libs-f94fc14a9f3019f110c71d084f4bc59261434519.tar.bz2 | |
Implement unicode_canonical.
Fixes biditest2.
Diffstat (limited to 'unicode')
| -rw-r--r-- | unicode/.gitignore | 2 | ||||
| -rw-r--r-- | unicode/Makefile.am | 15 | ||||
| -rw-r--r-- | unicode/biditest.C | 32 | ||||
| -rw-r--r-- | unicode/biditest2.C | 203 | ||||
| -rw-r--r-- | unicode/book.xml | 89 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 67 | ||||
| -rw-r--r-- | unicode/mkcanonical.pl | 110 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 207 | ||||
| -rw-r--r-- | unicode/unicode_canonical.c | 57 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 17 |
10 files changed, 712 insertions, 87 deletions
diff --git a/unicode/.gitignore b/unicode/.gitignore index 1bdc8ce..8905e05 100644 --- a/unicode/.gitignore +++ b/unicode/.gitignore @@ -19,6 +19,8 @@ /WordBreakTest.txt /emoji-data.txt /biditest +/biditest2 +/canonicalmappings.h /config.cache /config.guess /config.sub diff --git a/unicode/Makefile.am b/unicode/Makefile.am index 8b1d3cf..83034c5 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -11,6 +11,7 @@ noinst_SCRIPTS=update.sh \ mkbidi.pl \ mkbidiclass.pl \ mkbidiclassnames.pl \ + mkcanonical.pl \ mkeastasianwidth.pl \ mkemojidata.pl \ mkgraphemebreak.pl \ @@ -20,7 +21,7 @@ noinst_SCRIPTS=update.sh \ mkwordbreak.pl noinst_PROGRAMS=unicodetest graphemetest linebreaktest wordbreaktest \ - enttest scripttest biditest + enttest scripttest biditest biditest2 aclocaldir=$(datadir)/aclocal aclocal_DATA=m4/courier-unicode.m4 @@ -111,6 +112,7 @@ man_MANS= \ $(srcdir)/man/unicode_bidi_calc.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ + $(srcdir)/man/unicode_canonical.3 \ $(srcdir)/man/unicode_category_lookup.3 \ $(srcdir)/man/unicode_convert.3 \ $(srcdir)/man/unicode_convert_deinit.3 \ @@ -187,6 +189,7 @@ libcourier_unicode_la_SOURCES=\ unicode_graphemebreak.c graphemebreaktab.h \ unicode_linebreak.c linebreaktab.h \ unicode_htmlent.c unicode_htmlent.h \ + unicode_canonical.c canonicalmappings.h \ linebreaktab_internal.h \ unicode_wordbreak.c wordbreaktab.h scriptstab.h \ unicode_emoji.c emojitab.h \ @@ -214,6 +217,7 @@ BUILT_SOURCES=unicode_ultcasetab.c \ bidi_class.h \ bidi_classnames.h \ bidi_mirroring.h \ + canonicalmappings.h \ categoriestab.h \ eastasianwidth.h \ emojitab.h \ @@ -277,6 +281,9 @@ bidi_classnames.h: unicode_bidi.c mkbidiclassnames.pl @PERL@ $(srcdir)/mkbidiclassnames.pl <$(srcdir)/courier-unicode.h.in >bidi_classnames.h.tmp mv bidi_classnames.h.tmp bidi_classnames.h +canonicalmappings.h: UnicodeData.txt mkcanonical.pl + @PERL@ -I$(srcdir) $(srcdir)/mkcanonical.pl >canonicalmappings.h.tmp + mv 
canonicalmappings.h.tmp canonicalmappings.h endif unicodetest_SOURCES=unicodetest.c @@ -314,7 +321,10 @@ biditest_DEPENDENCIES=libcourier-unicode.la biditest_LDADD=libcourier-unicode.la biditest_LDFLAGS=-static - +biditest2_SOURCES=biditest2.C +biditest2_DEPENDENCIES=libcourier-unicode.la +biditest2_LDADD=libcourier-unicode.la +biditest2_LDFLAGS=-static check-am: unicodetest ./unicodetest @@ -385,6 +395,7 @@ check-am: unicodetest test "`./biditest 8261`" = "8262 8262 o" test "`./biditest 8262`" = "8261 8261 c" ./biditest + ./biditest2 if HAVE_DOCS diff --git a/unicode/biditest.C b/unicode/biditest.C index 6343866..2d2a6e5 100644 --- a/unicode/biditest.C +++ b/unicode/biditest.C @@ -35,8 +35,10 @@ int main(int argc, char **argv) std::ifstream fp("BidiTest.txt"); if (!fp.is_open()) + { + std::cerr << "Cannot open BidiTest.txt" << std::endl; exit(1); - + } size_t linenum=0; size_t nextlogline=0; std::string logmsg; @@ -178,9 +180,10 @@ int main(int argc, char **argv) { if (n & 1) { - actual_levels=level ? + auto ret=level ? 
unicode::bidi_calc(dummy_input,*level) : unicode::bidi_calc(dummy_input); + actual_levels=std::get<0>(ret); int matched=0; @@ -350,31 +353,6 @@ extern "C" { #include "unicode_bidi.c" -static const struct { - char classname[8]; - enum_bidi_type_t classenum; -} bidiclassnames[]={ - -#include "bidi_classnames.h" - -}; - -const char *bidi_classname(enum_bidi_type_t classenum) -{ - for (const auto &cn:bidiclassnames) - { - if (cn.classenum == classenum) - return cn.classname; - } - - return "???"; -} - -static const char *lookup_classname(const std::string &s) -{ - abort(); -} - enum_bidi_type_t fudge_unicode_bidi(size_t i) { if (i >= testcase.size()) diff --git a/unicode/biditest2.C b/unicode/biditest2.C new file mode 100644 index 0000000..f497bcf --- /dev/null +++ b/unicode/biditest2.C @@ -0,0 +1,203 @@ +#include "unicode_config.h" +#include "courier-unicode.h" +#include <iostream> +#include <sstream> +#include <fstream> +#include <cstdint> +#include <iomanip> + +FILE *DEBUGDUMP; + +int main(int argc, char **argv) +{ + std::ifstream fp("BidiCharacterTest.txt"); + + if (!fp.is_open()) + { + std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl; + exit(1); + } + + DEBUGDUMP=fopen("/dev/null", "w"); + if (!DEBUGDUMP) + { + perror("/dev/null"); + exit(1); + } + + std::string buf; + + size_t linenum=0; + + while (1) + { + buf.clear(); + + if (std::getline(fp, buf).eof() && buf.empty()) + break; + ++linenum; + + auto p=buf.find('#'); + + if (p != buf.npos) + buf=buf.substr(0, p); + + p=buf.find(';'); + + if (p == buf.npos) + continue; + + std::istringstream chars{buf.substr(0, p)}; + + auto q=buf.find(';', ++p); + + if (q == buf.npos) + { + std::cerr << "Cannot parse line " << linenum + << std::endl; + exit(2); + } + + int direction; + + if (!(std::istringstream{buf.substr(p, q-p)} >> direction)) + { + std::cerr << "Cannot parse line " << linenum + << std::endl; + exit(3); + } + + p=++q; + q=buf.find(';', p); + + if (q == buf.npos) + { + std::cerr << "Cannot parse 
line " << linenum + << std::endl; + exit(4); + } + + int paragraph_embedding_level; + + if (!(std::istringstream{buf.substr(p, q-p)} >> + paragraph_embedding_level)) + { + std::cerr << "Cannot parse line " << linenum + << std::endl; + exit(5); + } + p=++q; + q=buf.find(';', p); + + if (q == buf.npos) + { + std::cerr << "Cannot parse line " << linenum + << std::endl; + exit(6); + } + + std::vector<unicode_bidi_level_t> levels; + + { + std::istringstream level_s{buf.substr(p, q-p)}; + + std::string s; + + while (level_s >> s) + { + size_t l; + + if (!(std::istringstream{s} >> l)) + { + l=UNICODE_BIDI_SKIP; + } + levels.push_back(l); + } + } + + std::vector<size_t> render_order; + + { + size_t n; + + std::istringstream order_i{buf.substr(++q)}; + + while (order_i >> n) + render_order.push_back(n); + } + std::u32string s; + uintmax_t c; + + while (chars >> std::hex >> c) + s.push_back(c); + + auto ret=direction == UNICODE_BIDI_LR || + direction == UNICODE_BIDI_RL + ? unicode::bidi_calc(s, direction) + : unicode::bidi_calc(s); + + if (std::get<1>(ret) != paragraph_embedding_level) + { + std::cerr << "Regression, line " + << linenum + << ": expected " + << paragraph_embedding_level + << " paragraph embedding level, got " + << (int)std::get<1>(ret) + << std::endl; + exit(1); + } + + if (std::get<0>(ret) != levels) + { + fclose(DEBUGDUMP); + DEBUGDUMP=stderr; + + (void)(direction == UNICODE_BIDI_LR || + direction == UNICODE_BIDI_RL + ? 
unicode::bidi_calc(s, direction) + : unicode::bidi_calc(s)); + + std::cerr << "Regression, line " + << linenum + << ": embedding levels" + << std::endl + << " Expected:"; + + for (int l:levels) + { + std::cerr << " "; + if (l == UNICODE_BIDI_SKIP) + std::cerr << "x"; + else + std::cerr << l; + } + + std::cerr << std::endl + << " Actual:"; + + for (int l:std::get<0>(ret)) + { + std::cerr << " "; + if (l == UNICODE_BIDI_SKIP) + std::cerr << "x"; + else + std::cerr << l; + } + std::cerr << std::endl; + exit(1); + } + } + return 0; +} + +#define BIDI_DEBUG + +extern "C" { +#if 0 +} +#endif + +#include "unicode_bidi.c" + +} diff --git a/unicode/book.xml b/unicode/book.xml index 9c1486c..ad0009a 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -4,6 +4,7 @@ <!ENTITY tr9ver "42"> <!ENTITY tr14ver "45"> +<!ENTITY tr15ver "50"> <!ENTITY tr24ver "31"> <!ENTITY tr29ver "37"> <!ENTITY tr51ver "18"> @@ -232,6 +233,9 @@ See COPYING for distribution information. <link linkend="unicode_bidi"> <citerefentry><refentrytitle>unicode_bidi</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, + <link linkend="unicode_canonical"> + <citerefentry><refentrytitle>unicode_canonical</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link>, <link linkend="unicode_category_lookup"> <citerefentry><refentrytitle>unicode_category_lookup</refentrytitle> <manvolnum>3</manvolnum></citerefentry></link>, @@ -475,6 +479,91 @@ See COPYING for distribution information. 
</refsect1> </refentry> + <refentry id="unicode_canonical"> + <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> + + <refmeta> + <refentrytitle>unicode_canonical</refentrytitle> + <manvolnum>3</manvolnum> + </refmeta> + + <refnamediv> + <refname>unicode_canonical</refname> + + <refpurpose>unicode canonical character mapping</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> + <funcprototype> + <funcdef>unicode_canonical_t <function>unicode_canonical</function></funcdef> + <paramdef>char32_t <parameter>c</parameter></paramdef> + </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + <refsect1> + <title>DESCRIPTION</title> + + <para> + <function>unicode_canonical</function>() looks up the + character's + <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html +">canonical + and compatibility mapping</ulink>. + + <function>unicode_canonical</function>() returns a structure + with the following fields: + </para> + + <variablelist> + <varlistentry> + <term><structfield>canonical_chars</structfield></term> + <listitem> + <para> + A pointer to the canonical or equivalent representation + of the character. + </para> + </listitem> + </varlistentry> + <varlistentry> + <term><structfield>n_canonical_chars</structfield></term> + <listitem> + <para> + Number of characters in the + <structfield>canonical_chars</structfield>. + </para> + </listitem> + </varlistentry> + <varlistentry> + <term><structfield>format</structfield></term> + <listitem> + <para> + The character's canonical formatting flag, if any. 
+ </para> + </listitem> + </varlistentry> + </variablelist> + + <para> + A NULL <structfield>canonical_chars</structfield> (with a 0 + <structfield>n_canonical_chars</structfield>) indicates + that the character has no canonical or compatibility + equivalence. + </para> + </refsect1> + <refsect1> + <title>SEE ALSO</title> + <para> + <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>, + <link linkend="courier-unicode"> + <citerefentry> + <refentrytitle>courier-unicode</refentrytitle> + <manvolnum>7</manvolnum></citerefentry></link>. + </para> + </refsect1> + </refentry> + <refentry id="unicode_category_lookup"> <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index 55a7152..c8161ea 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -584,6 +584,15 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i); ** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates ** directional markers (from step X9). These characters should be removed ** before using unicode_bidi_reorder(). +** +** unicode_bidi_calc() returns the resolved paragraph direction level, which +** always matches the passed in level, if specified, else it reports the +** derived one. +** +** unicode_bidi_reorder() reorders the characters according to the resolved +** embedding levels. A non-null reorder_callback gets invoked repeatedly, +** indicating the starting index and the number of characters reversed, so +** that any related metadata can be updated accordingly. 
*/ typedef char unicode_bidi_bracket_type_t; @@ -604,10 +613,10 @@ typedef unsigned char unicode_bidi_level_t; #define UNICODE_BIDI_RL ((unicode_bidi_level_t)1) #define UNICODE_BIDI_SKIP ((unicode_bidi_level_t)254) -extern void unicode_bidi_calc(const char32_t *p, size_t n, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t * - initial_embedding_level); +extern unicode_bidi_level_t unicode_bidi_calc(const char32_t *p, size_t n, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t * + initial_embedding_level); extern void unicode_bidi_reorder(char32_t *p, unicode_bidi_level_t *levels, @@ -646,6 +655,48 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); /* +** unicode_canonical() returns the canonical mapping of the given Unicode +** character. The returned structure specifies: +** +** - A pointer to the canonical decomposition of the given character. +** - Number of characters in the canonical decomposition. +** - An optional formatting tag. +** +** A null pointer, and a 0 character count gets returned for characters +** without a canonical decomposition. +** +*/ + +typedef enum { + UNICODE_CANONICAL_FMT_NONE=0, + + UNICODE_CANONICAL_FMT_CIRCLE, + UNICODE_CANONICAL_FMT_COMPAT, + UNICODE_CANONICAL_FMT_FINAL, + UNICODE_CANONICAL_FMT_FONT, + UNICODE_CANONICAL_FMT_FRACTION, + UNICODE_CANONICAL_FMT_INITIAL, + UNICODE_CANONICAL_FMT_ISOLATED, + UNICODE_CANONICAL_FMT_MEDIAL, + UNICODE_CANONICAL_FMT_NARROW, + UNICODE_CANONICAL_FMT_NOBREAK, + UNICODE_CANONICAL_FMT_SMALL, + UNICODE_CANONICAL_FMT_SQUARE, + UNICODE_CANONICAL_FMT_SUB, + UNICODE_CANONICAL_FMT_SUPER, + UNICODE_CANONICAL_FMT_VERTICAL, + UNICODE_CANONICAL_FMT_WIDE, +} unicode_canonical_fmt_t; + +typedef struct { + const char32_t *canonical_chars; + size_t n_canonical_chars; + unicode_canonical_fmt_t format; +} unicode_canonical_t; + +extern unicode_canonical_t unicode_canonical(char32_t); + +/* ** A buffer that holds unicode characters, and dynamically grows as needed. 
*/ @@ -2066,11 +2117,13 @@ std::u32string tolower(const std::u32string &u); std::u32string toupper(const std::u32string &u); //! Calculate bidirectional embedding levels -std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s); +std::tuple<std::vector<unicode_bidi_level_t>, + unicode_bidi_level_t> bidi_calc(const std::u32string &s); //! Calculate bidirectional embedding levels -std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s, - unicode_bidi_level_t level); +std::tuple<std::vector<unicode_bidi_level_t>, + unicode_bidi_level_t> bidi_calc(const std::u32string &s, + unicode_bidi_level_t level); //! Reorder bidirectional text int bidi_reorder(std::u32string &string, diff --git a/unicode/mkcanonical.pl b/unicode/mkcanonical.pl new file mode 100644 index 0000000..43d7e44 --- /dev/null +++ b/unicode/mkcanonical.pl @@ -0,0 +1,110 @@ +#! /usr/bin/perl +# +# Creates a lookup table for canonical mappings in UnicodeData.txt + +use strict; +use warnings; + +open(F, "<UnicodeData.txt") || die; + +my @mappings; +my @data; + +while (defined($_=<F>)) +{ + my @w=split(/;/, $_, -1); + + next unless $w[5]; + + my $code=$w[0]; + + my @mapping=split(/\s/, $w[5]); + + my $formatting_tag = "UNICODE_CANONICAL_FMT_NONE"; + + if ($mapping[0] =~ /^</) + { + $formatting_tag = shift @mapping; + + $formatting_tag =~ s/<//g; + $formatting_tag =~ s/>//g; + $formatting_tag = "UNICODE_CANONICAL_FMT_" . uc($formatting_tag); + }; + + die "Too long\n" if (scalar @mapping) > 0xFFFF; + + my $dec_code; + + eval "\$dec_code=0x$code\n"; + + push @data, [$dec_code, "\t{0x$code, (unsigned char)$formatting_tag, " + . (scalar @mapping) . ", " + . scalar(@mappings) . 
"}" ]; + push @mappings, @mapping; +} + +my $hash_size = int( (scalar @data) * 3 / 4); + +my %buckets; + +my $keep_going = 1; + +while ($keep_going) +{ + %buckets = (); + + $keep_going = 0; + + foreach my $m (@data) + { + my $bucket = $m->[0] % $hash_size; + + push @{$buckets{$bucket}}, $m; + + if ((scalar @{$buckets{$bucket}}) > 3) + { + $keep_going = 1; + ++$hash_size; + last; + } + } +} + +print "#define HASH_SIZE $hash_size\n"; + +@data = (); + +my $pfix = ""; + +print "static const unsigned short canon_map_hash[]={\n"; + +foreach my $bucket (0.. ($hash_size)-1) +{ + print "$pfix\t" . (scalar @data); + $pfix = ",\n"; + + push @data, @{ $buckets{$bucket} // [] }; +} + + +print "};\n\nstatic const struct canon_map_table canon_map_lookup[]={\n"; + +$pfix = ""; + +foreach my $m (@data) +{ + print "$pfix" . $m->[1]; + $pfix = ",\n"; +} + +print "\n};\n\nstatic const char32_t canon_map_values[]={\n"; + +$pfix=""; + +foreach my $v (@mappings) +{ + print "$pfix\t0x$v"; + + $pfix=",\n"; +} +print "};\n"; diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index da15966..055ee89 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -467,6 +467,28 @@ typedef struct { } *directional_status_stack_t; #ifdef BIDI_DEBUG + +static const struct { + char classname[8]; + enum_bidi_type_t classenum; +} bidiclassnames[]={ + +#include "bidi_classnames.h" + +}; + +const char *bidi_classname(enum_bidi_type_t classenum) +{ + for (const auto &cn:bidiclassnames) + { + if (cn.classenum == classenum) + return cn.classname; + } + + return "???"; +} + + void dump_classes(const char *prefix, directional_status_stack_t stack) { fprintf(DEBUGDUMP, "%s: ", prefix); @@ -621,11 +643,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack) free(stack); } -static void unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level); +static unicode_bidi_level_t 
+unicode_bidi_b(const char32_t *p, + size_t n, + enum_bidi_type_t *buf, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level); enum_bidi_type_t unicode_bidi_type(char32_t c) { @@ -639,8 +662,9 @@ enum_bidi_type_t unicode_bidi_type(char32_t c) UNICODE_BIDI_TYPE_L); } -void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level) +unicode_bidi_level_t +unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level) { /* ** Look up the bidi class for each char32_t. @@ -661,27 +685,33 @@ void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, bufp[i]=UNICODE_BIDI_SKIP; } - unicode_bidi_b(p, n, - buf, - bufp, - initial_embedding_level); + unicode_bidi_level_t level=unicode_bidi_b(p, n, + buf, + bufp, + initial_embedding_level); free(buf); + + return level; } static void unicode_bidi_cl(directional_status_stack_t stack); -static void unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level) +static unicode_bidi_level_t +unicode_bidi_b(const char32_t *p, + size_t n, + enum_bidi_type_t *buf, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level) { directional_status_stack_t stack; stack=directional_status_stack_init(p, buf, n, bufp, initial_embedding_level); + unicode_bidi_level_t paragraph_embedding_level= + stack->paragraph_embedding_level; + #ifdef BIDI_DEBUG fprintf(DEBUGDUMP, "BIDI: START: Paragraph embedding level: %d\n", (int)stack->paragraph_embedding_level); @@ -690,6 +720,8 @@ static void unicode_bidi_b(const char32_t *p, unicode_bidi_cl(stack); directional_status_stack_deinit(stack); + + return paragraph_embedding_level; } #define RESET_CLASS(p,stack) do { \ @@ -1173,6 +1205,8 @@ static void unicode_bidi_cl(directional_status_stack_t stack) { 
#ifdef BIDI_DEBUG dump_sequence_info(stack, p); + fprintf(DEBUGDUMP, "Sequence embedding level: %d\n", + (int)p->embedding_level); dump_sequence("Contents before W", stack, p); #endif @@ -1408,6 +1442,16 @@ struct bidi_n_stack { short matched; }; +#define IS_NI(class) \ + ((class) == UNICODE_BIDI_TYPE_B || \ + (class) == UNICODE_BIDI_TYPE_S || \ + (class) == UNICODE_BIDI_TYPE_WS || \ + (class) == UNICODE_BIDI_TYPE_ON || \ + (class) == UNICODE_BIDI_TYPE_FSI || \ + (class) == UNICODE_BIDI_TYPE_LRI || \ + (class) == UNICODE_BIDI_TYPE_RLI || \ + (class) == UNICODE_BIDI_TYPE_PDI) + static void unicode_bidi_n(directional_status_stack_t stack, struct isolating_run_sequence_s *seq) { @@ -1430,45 +1474,86 @@ static void unicode_bidi_n(directional_status_stack_t stack, for (; irs_compare(&iter, &end); irs_incr(&iter)) { - unicode_bidi_bracket_type_t bracket_type; - char32_t open_bracket= - unicode_bidi_bracket_type(stack->chars[iter.i], - &bracket_type); + unicode_bidi_bracket_type_t bracket_type=UNICODE_BIDI_n; + + char32_t open_or_close_bracket=0; + + if (IS_NI(stack->classes[iter.i])) + { + open_or_close_bracket= + unicode_bidi_bracket_type(stack->chars[iter.i], + &bracket_type); + } if (bracket_type == UNICODE_BIDI_o) { if (stackp >= NSTACKSIZE) + { +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + "BD16 stack exceeded on index %d\n", + (int)iter.i); +#endif break; /* BD16 failure */ - + } if (!((*bracket_stack_tail)=(struct bidi_n_stack *) calloc(1, sizeof(struct bidi_n_stack)))) abort(); stack_iters[stackp]=*bracket_stack_tail; - - (*bracket_stack_tail)->start=iter; + stack_iters[stackp]->start=iter; stack_chars[stackp]=stack->chars[iter.i]; + unicode_canonical_t canon= + unicode_canonical(stack_chars[stackp]); + + if (canon.n_canonical_chars == 1 && + !canon.format) + { + stack_chars[stackp]= + canon.canonical_chars[0]; + } + bracket_stack_tail= &(*bracket_stack_tail)->next; ++stackp; - continue; +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, "Found opening bracket at index %d\n", + 
(int)iter.i); +#endif } - if (bracket_type == UNICODE_BIDI_c) /* Should be "n" */ + if (bracket_type == UNICODE_BIDI_c) { + unicode_canonical_t canon= + unicode_canonical(open_or_close_bracket); + + if (canon.n_canonical_chars == 1 && + !canon.format) + { + open_or_close_bracket= + canon.canonical_chars[0]; + } +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, "Found closing bracket at index %d\n", + (int)iter.i); +#endif for (size_t i=stackp; i > 0; ) { --i; - if (stack_chars[i] != open_bracket) + if (stack_chars[i] != open_or_close_bracket) continue; +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + "Matched to open bracket at index %d\n", + (int)stack_iters[i]->start.i); +#endif stack_iters[i]->end = iter; stack_iters[i]->matched=1; stackp=i; break; } - continue; } /* @@ -1496,11 +1581,41 @@ static void unicode_bidi_n(directional_status_stack_t stack, if (eoclass == E_CLASS) { +#ifdef BIDI_DEBUG + if (stackp) + { + fprintf(DEBUGDUMP, + "Found e for brackets at:"); + + for (size_t i=0; i<stackp; ++i) + { + fprintf(DEBUGDUMP, + " %d", + (int)stack_iters[i]->start.i); + } + fprintf(DEBUGDUMP, "\n"); + } +#endif for (size_t i=0; i<stackp; ++i) stack_iters[i]->has_e=1; } else if (eoclass == O_CLASS) { +#ifdef BIDI_DEBUG + if (stackp) + { + fprintf(DEBUGDUMP, + "Found o for brackets at:"); + + for (size_t i=0; i<stackp; ++i) + { + fprintf(DEBUGDUMP, + " %d", + (int)stack_iters[i]->start.i); + } + fprintf(DEBUGDUMP, "\n"); + } +#endif for (size_t i=0; i<stackp; ++i) stack_iters[i]->has_o=1; } @@ -1516,6 +1631,18 @@ static void unicode_bidi_n(directional_status_stack_t stack, { int set=0; +#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + "Brackets: %d and %d: e=%s, o=%s", + (int)p->start.i, + (int)p->end.i, + bidi_classname(E_CLASS), + bidi_classname(O_CLASS)); + + fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n", + p->has_e, + p->has_o); +#endif if (p->has_e) { stack->classes[p->start.i]= @@ -1548,16 +1675,18 @@ static void unicode_bidi_n(directional_status_stack_t stack, } strong_type=eoclass; 
+#ifdef BIDI_DEBUG + fprintf(DEBUGDUMP, + "Brackets: O context: %s\n", + bidi_classname(strong_type)); +#endif break; } - if (strong_type == O_CLASS) - { - stack->classes[p->start.i]= - stack->classes[p->end.i]= - strong_type; - set=1; - } + stack->classes[p->start.i]= + stack->classes[p->end.i]= + strong_type; + set=1; } if (set) @@ -1581,16 +1710,6 @@ static void unicode_bidi_n(directional_status_stack_t stack, /* N1 */ -#define IS_NI(class) \ - ((class) == UNICODE_BIDI_TYPE_B || \ - (class) == UNICODE_BIDI_TYPE_S || \ - (class) == UNICODE_BIDI_TYPE_WS || \ - (class) == UNICODE_BIDI_TYPE_ON || \ - (class) == UNICODE_BIDI_TYPE_FSI || \ - (class) == UNICODE_BIDI_TYPE_LRI || \ - (class) == UNICODE_BIDI_TYPE_RLI || \ - (class) == UNICODE_BIDI_TYPE_PDI) - enum_bidi_type_t prev_type=seq->sos; for (iter=beg; irs_compare(&iter, &end); ) diff --git a/unicode/unicode_canonical.c b/unicode/unicode_canonical.c new file mode 100644 index 0000000..3f6773f --- /dev/null +++ b/unicode/unicode_canonical.c @@ -0,0 +1,57 @@ +/* +** Copyright 2020 Double Precision, Inc. +** See COPYING for distribution information. +** +*/ + +#include "unicode_config.h" +#include "courier-unicode.h" +#include <string.h> + +struct canon_map_table { + char32_t lookup_char; + unsigned char fmt_flag_v; + unsigned char n_chars; + unsigned short offset; +}; + +#include "canonicalmappings.h" + +unicode_canonical_t unicode_canonical(char32_t c) +{ + size_t i=canon_map_hash[c % HASH_SIZE] + + + /* Compile-time sanity check */ + sizeof(char[ sizeof(canon_map_hash)/ + sizeof(canon_map_hash[0]) == HASH_SIZE + ? 
1:-1])*0; + + while (i < sizeof(canon_map_lookup)/sizeof(canon_map_lookup[0])) + { + if (canon_map_lookup[i].lookup_char == c) + { + unicode_canonical_t ret; + + ret.canonical_chars= + canon_map_values+canon_map_lookup[i].offset; + ret.n_canonical_chars= + canon_map_lookup[i].n_chars; + ret.format= + (unicode_canonical_fmt_t) + canon_map_lookup[i].fmt_flag_v; + + return ret; + } + + if ((canon_map_lookup[i].lookup_char % HASH_SIZE) != + (c % HASH_SIZE)) + break; + ++i; + } + + unicode_canonical_t ret; + + memset(&ret, 0, sizeof(ret)); + + return ret; +} diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index adb7869..ca139cc 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -558,13 +558,13 @@ std::u32string unicode::toupper(const std::u32string &u) return copy; } -std::vector<unicode_bidi_level_t> +std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> unicode::bidi_calc(const std::u32string &s) { return unicode::bidi_calc(s, UNICODE_BIDI_SKIP); } -std::vector<unicode_bidi_level_t> +std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> unicode::bidi_calc(const std::u32string &s, unicode_bidi_level_t paragraph_embedding_level) { @@ -576,16 +576,19 @@ unicode::bidi_calc(const std::u32string &s, initial_embedding_level=&paragraph_embedding_level; } - std::vector<unicode_bidi_level_t> buf; + std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> + ret; - buf.resize(s.size()); + std::get<0>(ret).resize(s.size()); + std::get<1>(ret)=UNICODE_BIDI_LR; if (s.size()) { - unicode_bidi_calc(s.c_str(), s.size(), &buf[0], - initial_embedding_level); + std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(), + &std::get<0>(ret)[0], + initial_embedding_level); } - return buf; + return ret; } extern "C" { |
