summaryrefslogtreecommitdiffstats
path: root/unicode
diff options
context:
space:
mode:
authorSam Varshavchik2020-07-10 08:25:53 -0400
committerSam Varshavchik2020-07-12 15:56:45 -0400
commitf94fc14a9f3019f110c71d084f4bc59261434519 (patch)
treea2c8bfc5b325f9bb0516b14700effc97084185dc /unicode
parent1ef92db9dbbefff98b93c8c66e4693a31b4f31a5 (diff)
downloadcourier-libs-f94fc14a9f3019f110c71d084f4bc59261434519.tar.bz2
Implement unicode_canonical.
Fixes biditest2.
Diffstat (limited to 'unicode')
-rw-r--r--unicode/.gitignore2
-rw-r--r--unicode/Makefile.am15
-rw-r--r--unicode/biditest.C32
-rw-r--r--unicode/biditest2.C203
-rw-r--r--unicode/book.xml89
-rw-r--r--unicode/courier-unicode.h.in67
-rw-r--r--unicode/mkcanonical.pl110
-rw-r--r--unicode/unicode_bidi.c207
-rw-r--r--unicode/unicode_canonical.c57
-rw-r--r--unicode/unicodecpp.C17
10 files changed, 712 insertions, 87 deletions
diff --git a/unicode/.gitignore b/unicode/.gitignore
index 1bdc8ce..8905e05 100644
--- a/unicode/.gitignore
+++ b/unicode/.gitignore
@@ -19,6 +19,8 @@
/WordBreakTest.txt
/emoji-data.txt
/biditest
+/biditest2
+/canonicalmappings.h
/config.cache
/config.guess
/config.sub
diff --git a/unicode/Makefile.am b/unicode/Makefile.am
index 8b1d3cf..83034c5 100644
--- a/unicode/Makefile.am
+++ b/unicode/Makefile.am
@@ -11,6 +11,7 @@ noinst_SCRIPTS=update.sh \
mkbidi.pl \
mkbidiclass.pl \
mkbidiclassnames.pl \
+ mkcanonical.pl \
mkeastasianwidth.pl \
mkemojidata.pl \
mkgraphemebreak.pl \
@@ -20,7 +21,7 @@ noinst_SCRIPTS=update.sh \
mkwordbreak.pl
noinst_PROGRAMS=unicodetest graphemetest linebreaktest wordbreaktest \
- enttest scripttest biditest
+ enttest scripttest biditest biditest2
aclocaldir=$(datadir)/aclocal
aclocal_DATA=m4/courier-unicode.m4
@@ -111,6 +112,7 @@ man_MANS= \
$(srcdir)/man/unicode_bidi_calc.3 \
$(srcdir)/man/unicode_bidi_mirror.3 \
$(srcdir)/man/unicode_bidi_reorder.3 \
+ $(srcdir)/man/unicode_canonical.3 \
$(srcdir)/man/unicode_category_lookup.3 \
$(srcdir)/man/unicode_convert.3 \
$(srcdir)/man/unicode_convert_deinit.3 \
@@ -187,6 +189,7 @@ libcourier_unicode_la_SOURCES=\
unicode_graphemebreak.c graphemebreaktab.h \
unicode_linebreak.c linebreaktab.h \
unicode_htmlent.c unicode_htmlent.h \
+ unicode_canonical.c canonicalmappings.h \
linebreaktab_internal.h \
unicode_wordbreak.c wordbreaktab.h scriptstab.h \
unicode_emoji.c emojitab.h \
@@ -214,6 +217,7 @@ BUILT_SOURCES=unicode_ultcasetab.c \
bidi_class.h \
bidi_classnames.h \
bidi_mirroring.h \
+ canonicalmappings.h \
categoriestab.h \
eastasianwidth.h \
emojitab.h \
@@ -277,6 +281,9 @@ bidi_classnames.h: unicode_bidi.c mkbidiclassnames.pl
@PERL@ $(srcdir)/mkbidiclassnames.pl <$(srcdir)/courier-unicode.h.in >bidi_classnames.h.tmp
mv bidi_classnames.h.tmp bidi_classnames.h
+canonicalmappings.h: UnicodeData.txt mkcanonical.pl
+ @PERL@ -I$(srcdir) $(srcdir)/mkcanonical.pl >canonicalmappings.h.tmp
+ mv canonicalmappings.h.tmp canonicalmappings.h
endif
unicodetest_SOURCES=unicodetest.c
@@ -314,7 +321,10 @@ biditest_DEPENDENCIES=libcourier-unicode.la
biditest_LDADD=libcourier-unicode.la
biditest_LDFLAGS=-static
-
+biditest2_SOURCES=biditest2.C
+biditest2_DEPENDENCIES=libcourier-unicode.la
+biditest2_LDADD=libcourier-unicode.la
+biditest2_LDFLAGS=-static
check-am: unicodetest
./unicodetest
@@ -385,6 +395,7 @@ check-am: unicodetest
test "`./biditest 8261`" = "8262 8262 o"
test "`./biditest 8262`" = "8261 8261 c"
./biditest
+ ./biditest2
if HAVE_DOCS
diff --git a/unicode/biditest.C b/unicode/biditest.C
index 6343866..2d2a6e5 100644
--- a/unicode/biditest.C
+++ b/unicode/biditest.C
@@ -35,8 +35,10 @@ int main(int argc, char **argv)
std::ifstream fp("BidiTest.txt");
if (!fp.is_open())
+ {
+ std::cerr << "Cannot open BidiTest.txt" << std::endl;
exit(1);
-
+ }
size_t linenum=0;
size_t nextlogline=0;
std::string logmsg;
@@ -178,9 +180,10 @@ int main(int argc, char **argv)
{
if (n & 1)
{
- actual_levels=level ?
+ auto ret=level ?
unicode::bidi_calc(dummy_input,*level)
: unicode::bidi_calc(dummy_input);
+ actual_levels=std::get<0>(ret);
int matched=0;
@@ -350,31 +353,6 @@ extern "C" {
#include "unicode_bidi.c"
-static const struct {
- char classname[8];
- enum_bidi_type_t classenum;
-} bidiclassnames[]={
-
-#include "bidi_classnames.h"
-
-};
-
-const char *bidi_classname(enum_bidi_type_t classenum)
-{
- for (const auto &cn:bidiclassnames)
- {
- if (cn.classenum == classenum)
- return cn.classname;
- }
-
- return "???";
-}
-
-static const char *lookup_classname(const std::string &s)
-{
- abort();
-}
-
enum_bidi_type_t fudge_unicode_bidi(size_t i)
{
if (i >= testcase.size())
diff --git a/unicode/biditest2.C b/unicode/biditest2.C
new file mode 100644
index 0000000..f497bcf
--- /dev/null
+++ b/unicode/biditest2.C
@@ -0,0 +1,203 @@
+#include "unicode_config.h"
+#include "courier-unicode.h"
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <cstdint>
+#include <iomanip>
+
+FILE *DEBUGDUMP;
+
+// Regression test for unicode::bidi_calc().
+//
+// Reads BidiCharacterTest.txt from the current directory. Everything from a
+// '#' to the end of a line is ignored, and so are lines without a ';'. The
+// remaining lines are split into ';'-separated fields:
+//
+//   1. the input characters, as hexadecimal code points
+//   2. the direction: UNICODE_BIDI_LR or UNICODE_BIDI_RL is passed to
+//      bidi_calc() as is, any other value lets bidi_calc() pick the level
+//   3. the expected paragraph embedding level
+//   4. the expected embedding level of each character; an entry that is not
+//      a number stands for UNICODE_BIDI_SKIP
+//   5. a trailing field that this test does not check
+//
+// Exits with 0 if every line passes, with 1 on the first mismatch (or if a
+// file cannot be opened), and with 2 through 6 if a line cannot be parsed.
+int main(int argc, char **argv)
+{
+	std::ifstream fp("BidiCharacterTest.txt");
+
+	if (!fp.is_open())
+	{
+		std::cerr << "Cannot open BidiCharacterTest.txt" << std::endl;
+		exit(1);
+	}
+
+	// The BIDI_DEBUG trace is discarded unless a test case fails.
+	DEBUGDUMP=fopen("/dev/null", "w");
+	if (!DEBUGDUMP)
+	{
+		perror("/dev/null");
+		exit(1);
+	}
+
+	std::string buf;
+
+	size_t linenum=0;
+
+	// Reports the current line as malformed and terminates.
+	auto parse_failure=[&](int status)
+	{
+		std::cerr << "Cannot parse line " << linenum
+			  << std::endl;
+		exit(status);
+	};
+
+	// Prints a label followed by a list of levels, "x" for skipped ones.
+	auto dump_levels=[](const char *label,
+			    const std::vector<unicode_bidi_level_t> &v)
+	{
+		std::cerr << label;
+
+		for (int l:v)
+		{
+			std::cerr << " ";
+			if (l == UNICODE_BIDI_SKIP)
+				std::cerr << "x";
+			else
+				std::cerr << l;
+		}
+	};
+
+	// Testing the stream itself ends the loop at end of file and also
+	// after a read error; testing only eof() would loop forever on the
+	// latter.
+	while (std::getline(fp, buf))
+	{
+		++linenum;
+
+		auto p=buf.find('#');
+
+		if (p != buf.npos)
+			buf.erase(p);
+
+		p=buf.find(';');
+
+		if (p == buf.npos)
+			continue;
+
+		std::istringstream chars{buf.substr(0, p)};
+
+		++p;
+
+		// Returns the field that starts at p and moves p past the
+		// ';' that ends it. A missing ';' is a parse failure.
+		auto next_field=[&](int status)
+		{
+			auto q=buf.find(';', p);
+
+			if (q == buf.npos)
+				parse_failure(status);
+
+			auto field=buf.substr(p, q-p);
+
+			p=q+1;
+			return field;
+		};
+
+		int direction;
+
+		if (!(std::istringstream{next_field(2)} >> direction))
+			parse_failure(3);
+
+		int paragraph_embedding_level;
+
+		if (!(std::istringstream{next_field(4)} >>
+		      paragraph_embedding_level))
+			parse_failure(5);
+
+		std::vector<unicode_bidi_level_t> levels;
+
+		{
+			std::istringstream level_s{next_field(6)};
+
+			std::string word;
+
+			while (level_s >> word)
+			{
+				size_t l;
+
+				if (!(std::istringstream{word} >> l))
+				{
+					l=UNICODE_BIDI_SKIP;
+				}
+				levels.push_back(l);
+			}
+		}
+
+		std::u32string s;
+		uintmax_t c;
+
+		while (chars >> std::hex >> c)
+			s.push_back(c);
+
+		// Runs the calculation for this test case. Called a second
+		// time, with the debug trace enabled, if the levels differ.
+		auto calc=[&]()
+		{
+			return direction == UNICODE_BIDI_LR ||
+				direction == UNICODE_BIDI_RL
+				? unicode::bidi_calc(s, direction)
+				: unicode::bidi_calc(s);
+		};
+
+		auto ret=calc();
+
+		if (std::get<1>(ret) != paragraph_embedding_level)
+		{
+			std::cerr << "Regression, line "
+				  << linenum
+				  << ": expected "
+				  << paragraph_embedding_level
+				  << " paragraph embedding level, got "
+				  << (int)std::get<1>(ret)
+				  << std::endl;
+			exit(1);
+		}
+
+		if (std::get<0>(ret) != levels)
+		{
+			// Repeat the calculation with the debug trace going
+			// to stderr, ahead of the report itself.
+			fclose(DEBUGDUMP);
+			DEBUGDUMP=stderr;
+
+			(void)calc();
+
+			std::cerr << "Regression, line "
+				  << linenum
+				  << ": embedding levels"
+				  << std::endl;
+
+			dump_levels(" Expected:", levels);
+			std::cerr << std::endl;
+
+			dump_levels(" Actual:", std::get<0>(ret));
+			std::cerr << std::endl;
+			exit(1);
+		}
+	}
+	return 0;
+}
+
+#define BIDI_DEBUG
+
+extern "C" {
+#if 0
+}
+#endif
+
+#include "unicode_bidi.c"
+
+}
diff --git a/unicode/book.xml b/unicode/book.xml
index 9c1486c..ad0009a 100644
--- a/unicode/book.xml
+++ b/unicode/book.xml
@@ -4,6 +4,7 @@
<!ENTITY tr9ver "42">
<!ENTITY tr14ver "45">
+<!ENTITY tr15ver "50">
<!ENTITY tr24ver "31">
<!ENTITY tr29ver "37">
<!ENTITY tr51ver "18">
@@ -232,6 +233,9 @@ See COPYING for distribution information.
<link linkend="unicode_bidi">
<citerefentry><refentrytitle>unicode_bidi</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
+ <link linkend="unicode_canonical">
+ <citerefentry><refentrytitle>unicode_canonical</refentrytitle>
+ <manvolnum>3</manvolnum></citerefentry></link>,
<link linkend="unicode_category_lookup">
<citerefentry><refentrytitle>unicode_category_lookup</refentrytitle>
<manvolnum>3</manvolnum></citerefentry></link>,
@@ -475,6 +479,91 @@ See COPYING for distribution information.
</refsect1>
</refentry>
+ <refentry id="unicode_canonical">
+ <refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
+
+ <refmeta>
+ <refentrytitle>unicode_canonical</refentrytitle>
+ <manvolnum>3</manvolnum>
+ </refmeta>
+
+ <refnamediv>
+ <refname>unicode_canonical</refname>
+
+ <refpurpose>unicode canonical character mapping</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+ <funcsynopsis>
+ <funcsynopsisinfo>#include &lt;courier-unicode.h&gt;</funcsynopsisinfo>
+ <funcprototype>
+ <funcdef>unicode_canonical_t <function>unicode_canonical</function></funcdef>
+ <paramdef>char32_t <parameter>c</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+ </refsynopsisdiv>
+ <refsect1>
+ <title>DESCRIPTION</title>
+
+ <para>
+ <function>unicode_canonical</function>() looks up the
+ character's
+ <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">canonical
+ and compatibility mapping</ulink>.
+
+ <function>unicode_canonical</function>() returns a structure
+ with the following fields:
+ </para>
+
+ <variablelist>
+ <varlistentry>
+ <term><structfield>canonical_chars</structfield></term>
+ <listitem>
+ <para>
+ A pointer to the canonical or equivalent representation
+ of the character.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><structfield>n_canonical_chars</structfield></term>
+ <listitem>
+ <para>
+ Number of characters in the
+ <structfield>canonical_chars</structfield>.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><structfield>format</structfield></term>
+ <listitem>
+ <para>
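+ The formatting tag of the character's mapping, one of the
+ <literal>UNICODE_CANONICAL_FMT_</literal> values;
+ <literal>UNICODE_CANONICAL_FMT_NONE</literal> if the mapping
+ has no formatting tag.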
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+
+ <para>
+ A NULL <structfield>canonical_chars</structfield> (with a 0
+ <structfield>n_canonical_chars</structfield>) indicates
+ that the character has no canonical or compatibility
+ mapping.
+ </para>
+ </refsect1>
+ <refsect1>
+ <title>SEE ALSO</title>
+ <para>
+ <ulink url="https://www.unicode.org/reports/tr15/tr15-&tr15ver;.html">TR-15</ulink>,
+ <link linkend="courier-unicode">
+ <citerefentry>
+ <refentrytitle>courier-unicode</refentrytitle>
+ <manvolnum>7</manvolnum></citerefentry></link>.
+ </para>
+ </refsect1>
+ </refentry>
+
<refentry id="unicode_category_lookup">
<refentryinfo><author><firstname>Sam</firstname><surname>Varshavchik</surname><contrib>Author</contrib></author><productname>Courier Unicode Library</productname></refentryinfo>
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in
index 55a7152..c8161ea 100644
--- a/unicode/courier-unicode.h.in
+++ b/unicode/courier-unicode.h.in
@@ -584,6 +584,15 @@ size_t unicode_wbscan_end(unicode_wbscan_info_t i);
** and odd for right-to-left). A value of UNICODE_BIDI_SKIP designates
** directional markers (from step X9). These characters should be removed
** before using unicode_bidi_reorder().
+**
+** unicode_bidi_calc() returns the resolved paragraph direction level, which
+** always matches the passed in level, if specified, else it reports the
+** derived one.
+**
+** unicode_bidi_reorder() reorders the characters according to the resolved
+** embedding levels. A non-null reorder_callback gets invoked repeatedly,
+** indicating the starting index and the number of characters reversed, so
+** that any related metadata can be updated accordingly.
*/
typedef char unicode_bidi_bracket_type_t;
@@ -604,10 +613,10 @@ typedef unsigned char unicode_bidi_level_t;
#define UNICODE_BIDI_RL ((unicode_bidi_level_t)1)
#define UNICODE_BIDI_SKIP ((unicode_bidi_level_t)254)
-extern void unicode_bidi_calc(const char32_t *p, size_t n,
- unicode_bidi_level_t *bufp,
- const unicode_bidi_level_t *
- initial_embedding_level);
+extern unicode_bidi_level_t unicode_bidi_calc(const char32_t *p, size_t n,
+ unicode_bidi_level_t *bufp,
+ const unicode_bidi_level_t *
+ initial_embedding_level);
extern void unicode_bidi_reorder(char32_t *p,
unicode_bidi_level_t *levels,
@@ -646,6 +655,48 @@ typedef enum {
extern enum_bidi_type_t unicode_bidi_type(char32_t c);
/*
+** unicode_canonical() returns the canonical mapping of the given Unicode
+** character. The returned structure specifies:
+**
+** - A pointer to the canonical decomposition of the given character.
+** - Number of characters in the canonical decomposition.
+** - An optional formatting tag.
+**
+** A null pointer, and a 0 character count gets returned for characters
+** without a canonical decomposition.
+**
+*/
+
+typedef enum { /* Formatting tag of a mapping; mkcanonical.pl builds each name from the <tag> that starts the mapping in UnicodeData.txt */
+ UNICODE_CANONICAL_FMT_NONE=0, /* The mapping does not start with a <tag> */
+
+ UNICODE_CANONICAL_FMT_CIRCLE,
+ UNICODE_CANONICAL_FMT_COMPAT,
+ UNICODE_CANONICAL_FMT_FINAL,
+ UNICODE_CANONICAL_FMT_FONT,
+ UNICODE_CANONICAL_FMT_FRACTION,
+ UNICODE_CANONICAL_FMT_INITIAL,
+ UNICODE_CANONICAL_FMT_ISOLATED,
+ UNICODE_CANONICAL_FMT_MEDIAL,
+ UNICODE_CANONICAL_FMT_NARROW,
+ UNICODE_CANONICAL_FMT_NOBREAK,
+ UNICODE_CANONICAL_FMT_SMALL,
+ UNICODE_CANONICAL_FMT_SQUARE,
+ UNICODE_CANONICAL_FMT_SUB,
+ UNICODE_CANONICAL_FMT_SUPER,
+ UNICODE_CANONICAL_FMT_VERTICAL,
+ UNICODE_CANONICAL_FMT_WIDE,
+} unicode_canonical_fmt_t;
+
+typedef struct { /* Value returned by unicode_canonical() */
+ const char32_t *canonical_chars; /* The mapped characters; a null pointer if the character has no mapping */
+ size_t n_canonical_chars; /* Number of characters at canonical_chars; 0 if the character has no mapping */
+ unicode_canonical_fmt_t format; /* The mapping's formatting tag; UNICODE_CANONICAL_FMT_NONE if it has none */
+} unicode_canonical_t;
+
+extern unicode_canonical_t unicode_canonical(char32_t);
+
+/*
** A buffer that holds unicode characters, and dynamically grows as needed.
*/
@@ -2066,11 +2117,13 @@ std::u32string tolower(const std::u32string &u);
std::u32string toupper(const std::u32string &u);
//! Calculate bidirectional embedding levels
-std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s);
+std::tuple<std::vector<unicode_bidi_level_t>,
+ unicode_bidi_level_t> bidi_calc(const std::u32string &s);
//! Calculate bidirectional embedding levels
-std::vector<unicode_bidi_level_t> bidi_calc(const std::u32string &s,
- unicode_bidi_level_t level);
+std::tuple<std::vector<unicode_bidi_level_t>,
+ unicode_bidi_level_t> bidi_calc(const std::u32string &s,
+ unicode_bidi_level_t level);
//! Reorder bidirectional text
int bidi_reorder(std::u32string &string,
diff --git a/unicode/mkcanonical.pl b/unicode/mkcanonical.pl
new file mode 100644
index 0000000..43d7e44
--- /dev/null
+++ b/unicode/mkcanonical.pl
@@ -0,0 +1,110 @@
+#! /usr/bin/perl
+#
+# Creates a lookup table for canonical mappings in UnicodeData.txt
+#
+# Reads UnicodeData.txt from the current directory and writes C source to
+# standard output, defining:
+#
+#   HASH_SIZE        - the number of hash buckets; a character's bucket is
+#                      its code point modulo HASH_SIZE
+#   canon_map_hash   - for each bucket, the index in canon_map_lookup of the
+#                      bucket's first entry
+#   canon_map_lookup - one struct canon_map_table initializer for each
+#                      character with a mapping, grouped by bucket
+#   canon_map_values - the mapped code points of all characters, back to back
+#
+# struct canon_map_table itself is declared in unicode_canonical.c.
+
+use strict;
+use warnings;
+
+open(my $fh, "<", "UnicodeData.txt") || die "UnicodeData.txt: $!\n";
+
+my @mappings;	# Every mapped code point, as the hex string from the file
+my @data;	# A [numeric code point, C initializer] pair per character
+
+while (defined(my $line=<$fh>))
+{
+	my @w=split(/;/, $line, -1);
+
+	# Field 5 is the mapping; it is empty for most characters.
+	next unless $w[5];
+
+	my $code=$w[0];
+
+	my @mapping=split(/\s/, $w[5]);
+
+	my $formatting_tag = "UNICODE_CANONICAL_FMT_NONE";
+
+	# A leading <tag> becomes the UNICODE_CANONICAL_FMT_TAG constant.
+	if ($mapping[0] =~ /^</)
+	{
+		$formatting_tag = shift @mapping;
+
+		$formatting_tag =~ tr/<>//d;
+		$formatting_tag = "UNICODE_CANONICAL_FMT_" . uc($formatting_tag);
+	}
+
+	# struct canon_map_table stores the length in an unsigned char and
+	# the offset in an unsigned short. Fail here rather than generate a
+	# table whose initializers get silently truncated.
+	die "Too long\n" if (scalar @mapping) > 0xFF;
+	die "Too many mapped characters\n" if (scalar @mappings) > 0xFFFF;
+
+	push @data, [hex($code), "\t{0x$code, (unsigned char)$formatting_tag, "
+		     . (scalar @mapping) . ", "
+		     . scalar(@mappings) . "}" ];
+	push @mappings, @mapping;
+}
+
+close($fh);
+
+die "No mappings found in UnicodeData.txt\n" unless @data;
+
+# canon_map_hash holds indexes into canon_map_lookup as unsigned shorts.
+die "Too many mappings\n" if (scalar @data) > 0xFFFF;
+
+# Start with three quarters as many buckets as there are characters, then
+# keep adding one bucket until no bucket holds more than three characters.
+
+my $hash_size = int( (scalar @data) * 3 / 4);
+
+$hash_size = 1 if $hash_size < 1;
+
+my %buckets;
+
+SEARCH:
+while (1)
+{
+	%buckets = ();
+
+	foreach my $m (@data)
+	{
+		my $bucket = $m->[0] % $hash_size;
+
+		push @{$buckets{$bucket}}, $m;
+
+		if ((scalar @{$buckets{$bucket}}) > 3)
+		{
+			++$hash_size;
+			next SEARCH;
+		}
+	}
+	last;
+}
+
+print "#define HASH_SIZE $hash_size\n";
+
+# Lay the entries out bucket by bucket, noting where each bucket starts.
+# An empty bucket records the index where the next bucket's entries begin.
+
+my @ordered;
+my @bucket_start;
+
+foreach my $bucket (0.. ($hash_size)-1)
+{
+	push @bucket_start, scalar @ordered;
+	push @ordered, @{ $buckets{$bucket} // [] };
+}
+
+print "static const unsigned short canon_map_hash[]={\n";
+print join(",\n", map { "\t$_" } @bucket_start);
+
+print "};\n\nstatic const struct canon_map_table canon_map_lookup[]={\n";
+print join(",\n", map { $_->[1] } @ordered);
+
+print "\n};\n\nstatic const char32_t canon_map_values[]={\n";
+print join(",\n", map { "\t0x$_" } @mappings);
+print "};\n";
diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c
index da15966..055ee89 100644
--- a/unicode/unicode_bidi.c
+++ b/unicode/unicode_bidi.c
@@ -467,6 +467,28 @@ typedef struct {
} *directional_status_stack_t;
#ifdef BIDI_DEBUG
+
+static const struct { /* BIDI_DEBUG only: pairs each bidi class with a name to print */
+ char classname[8]; /* The name that bidi_classname() returns */
+ enum_bidi_type_t classenum; /* The class that the name stands for */
+} bidiclassnames[]={
+
+#include "bidi_classnames.h" /* Generated by mkbidiclassnames.pl from courier-unicode.h.in */
+
+};
+
+/*
+** Returns the printable name of a bidi class, for the debugging output.
+** A class that is not listed in bidiclassnames is reported as "???".
+*/
+const char *bidi_classname(enum_bidi_type_t classenum)
+{
+	const size_t n_names=sizeof(bidiclassnames)/sizeof(bidiclassnames[0]);
+	size_t i;
+
+	for (i=0; i<n_names; ++i)
+	{
+		if (bidiclassnames[i].classenum != classenum)
+			continue;
+
+		return bidiclassnames[i].classname;
+	}
+
+	return "???";
+}
+
+
void dump_classes(const char *prefix, directional_status_stack_t stack)
{
fprintf(DEBUGDUMP, "%s: ", prefix);
@@ -621,11 +643,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack)
free(stack);
}
-static void unicode_bidi_b(const char32_t *p,
- size_t n,
- enum_bidi_type_t *buf,
- unicode_bidi_level_t *bufp,
- const unicode_bidi_level_t *initial_embedding_level);
+static unicode_bidi_level_t
+unicode_bidi_b(const char32_t *p,
+ size_t n,
+ enum_bidi_type_t *buf,
+ unicode_bidi_level_t *bufp,
+ const unicode_bidi_level_t *initial_embedding_level);
enum_bidi_type_t unicode_bidi_type(char32_t c)
{
@@ -639,8 +662,9 @@ enum_bidi_type_t unicode_bidi_type(char32_t c)
UNICODE_BIDI_TYPE_L);
}
-void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
- const unicode_bidi_level_t *initial_embedding_level)
+unicode_bidi_level_t
+unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
+ const unicode_bidi_level_t *initial_embedding_level)
{
/*
** Look up the bidi class for each char32_t.
@@ -661,27 +685,33 @@ void unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp,
bufp[i]=UNICODE_BIDI_SKIP;
}
- unicode_bidi_b(p, n,
- buf,
- bufp,
- initial_embedding_level);
+ unicode_bidi_level_t level=unicode_bidi_b(p, n,
+ buf,
+ bufp,
+ initial_embedding_level);
free(buf);
+
+ return level;
}
static void unicode_bidi_cl(directional_status_stack_t stack);
-static void unicode_bidi_b(const char32_t *p,
- size_t n,
- enum_bidi_type_t *buf,
- unicode_bidi_level_t *bufp,
- const unicode_bidi_level_t *initial_embedding_level)
+static unicode_bidi_level_t
+unicode_bidi_b(const char32_t *p,
+ size_t n,
+ enum_bidi_type_t *buf,
+ unicode_bidi_level_t *bufp,
+ const unicode_bidi_level_t *initial_embedding_level)
{
directional_status_stack_t stack;
stack=directional_status_stack_init(p, buf, n, bufp,
initial_embedding_level);
+ unicode_bidi_level_t paragraph_embedding_level=
+ stack->paragraph_embedding_level;
+
#ifdef BIDI_DEBUG
fprintf(DEBUGDUMP, "BIDI: START: Paragraph embedding level: %d\n",
(int)stack->paragraph_embedding_level);
@@ -690,6 +720,8 @@ static void unicode_bidi_b(const char32_t *p,
unicode_bidi_cl(stack);
directional_status_stack_deinit(stack);
+
+ return paragraph_embedding_level;
}
#define RESET_CLASS(p,stack) do { \
@@ -1173,6 +1205,8 @@ static void unicode_bidi_cl(directional_status_stack_t stack)
{
#ifdef BIDI_DEBUG
dump_sequence_info(stack, p);
+ fprintf(DEBUGDUMP, "Sequence embedding level: %d\n",
+ (int)p->embedding_level);
dump_sequence("Contents before W", stack, p);
#endif
@@ -1408,6 +1442,16 @@ struct bidi_n_stack {
short matched;
};
+#define IS_NI(class) /* Nonzero if the bidi class is one of B, S, WS, ON, FSI, LRI, RLI or PDI */ \
+ ((class) == UNICODE_BIDI_TYPE_B || \
+ (class) == UNICODE_BIDI_TYPE_S || \
+ (class) == UNICODE_BIDI_TYPE_WS || \
+ (class) == UNICODE_BIDI_TYPE_ON || \
+ (class) == UNICODE_BIDI_TYPE_FSI || \
+ (class) == UNICODE_BIDI_TYPE_LRI || \
+ (class) == UNICODE_BIDI_TYPE_RLI || \
+ (class) == UNICODE_BIDI_TYPE_PDI)
+
static void unicode_bidi_n(directional_status_stack_t stack,
struct isolating_run_sequence_s *seq)
{
@@ -1430,45 +1474,86 @@ static void unicode_bidi_n(directional_status_stack_t stack,
for (; irs_compare(&iter, &end); irs_incr(&iter))
{
- unicode_bidi_bracket_type_t bracket_type;
- char32_t open_bracket=
- unicode_bidi_bracket_type(stack->chars[iter.i],
- &bracket_type);
+ unicode_bidi_bracket_type_t bracket_type=UNICODE_BIDI_n;
+
+ char32_t open_or_close_bracket=0;
+
+ if (IS_NI(stack->classes[iter.i]))
+ {
+ open_or_close_bracket=
+ unicode_bidi_bracket_type(stack->chars[iter.i],
+ &bracket_type);
+ }
if (bracket_type == UNICODE_BIDI_o)
{
if (stackp >= NSTACKSIZE)
+ {
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP,
+ "BD16 stack exceeded on index %d\n",
+ (int)iter.i);
+#endif
break; /* BD16 failure */
-
+ }
if (!((*bracket_stack_tail)=(struct bidi_n_stack *)
calloc(1, sizeof(struct bidi_n_stack))))
abort();
stack_iters[stackp]=*bracket_stack_tail;
-
- (*bracket_stack_tail)->start=iter;
+ stack_iters[stackp]->start=iter;
stack_chars[stackp]=stack->chars[iter.i];
+ unicode_canonical_t canon=
+ unicode_canonical(stack_chars[stackp]);
+
+ if (canon.n_canonical_chars == 1 &&
+ !canon.format)
+ {
+ stack_chars[stackp]=
+ canon.canonical_chars[0];
+ }
+
bracket_stack_tail= &(*bracket_stack_tail)->next;
++stackp;
- continue;
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP, "Found opening bracket at index %d\n",
+ (int)iter.i);
+#endif
}
- if (bracket_type == UNICODE_BIDI_c) /* Should be "n" */
+ if (bracket_type == UNICODE_BIDI_c)
{
+ unicode_canonical_t canon=
+ unicode_canonical(open_or_close_bracket);
+
+ if (canon.n_canonical_chars == 1 &&
+ !canon.format)
+ {
+ open_or_close_bracket=
+ canon.canonical_chars[0];
+ }
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP, "Found closing bracket at index %d\n",
+ (int)iter.i);
+#endif
for (size_t i=stackp; i > 0; )
{
--i;
- if (stack_chars[i] != open_bracket)
+ if (stack_chars[i] != open_or_close_bracket)
continue;
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP,
+ "Matched to open bracket at index %d\n",
+ (int)stack_iters[i]->start.i);
+#endif
stack_iters[i]->end = iter;
stack_iters[i]->matched=1;
stackp=i;
break;
}
- continue;
}
/*
@@ -1496,11 +1581,41 @@ static void unicode_bidi_n(directional_status_stack_t stack,
if (eoclass == E_CLASS)
{
+#ifdef BIDI_DEBUG
+ if (stackp)
+ {
+ fprintf(DEBUGDUMP,
+ "Found e for brackets at:");
+
+ for (size_t i=0; i<stackp; ++i)
+ {
+ fprintf(DEBUGDUMP,
+ " %d",
+ (int)stack_iters[i]->start.i);
+ }
+ fprintf(DEBUGDUMP, "\n");
+ }
+#endif
for (size_t i=0; i<stackp; ++i)
stack_iters[i]->has_e=1;
}
else if (eoclass == O_CLASS)
{
+#ifdef BIDI_DEBUG
+ if (stackp)
+ {
+ fprintf(DEBUGDUMP,
+ "Found o for brackets at:");
+
+ for (size_t i=0; i<stackp; ++i)
+ {
+ fprintf(DEBUGDUMP,
+ " %d",
+ (int)stack_iters[i]->start.i);
+ }
+ fprintf(DEBUGDUMP, "\n");
+ }
+#endif
for (size_t i=0; i<stackp; ++i)
stack_iters[i]->has_o=1;
}
@@ -1516,6 +1631,18 @@ static void unicode_bidi_n(directional_status_stack_t stack,
{
int set=0;
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP,
+ "Brackets: %d and %d: e=%s, o=%s",
+ (int)p->start.i,
+ (int)p->end.i,
+ bidi_classname(E_CLASS),
+ bidi_classname(O_CLASS));
+
+ fprintf(DEBUGDUMP, ", has e=%d, has o=%d\n",
+ p->has_e,
+ p->has_o);
+#endif
if (p->has_e)
{
stack->classes[p->start.i]=
@@ -1548,16 +1675,18 @@ static void unicode_bidi_n(directional_status_stack_t stack,
}
strong_type=eoclass;
+#ifdef BIDI_DEBUG
+ fprintf(DEBUGDUMP,
+ "Brackets: O context: %s\n",
+ bidi_classname(strong_type));
+#endif
break;
}
- if (strong_type == O_CLASS)
- {
- stack->classes[p->start.i]=
- stack->classes[p->end.i]=
- strong_type;
- set=1;
- }
+ stack->classes[p->start.i]=
+ stack->classes[p->end.i]=
+ strong_type;
+ set=1;
}
if (set)
@@ -1581,16 +1710,6 @@ static void unicode_bidi_n(directional_status_stack_t stack,
/* N1 */
-#define IS_NI(class) \
- ((class) == UNICODE_BIDI_TYPE_B || \
- (class) == UNICODE_BIDI_TYPE_S || \
- (class) == UNICODE_BIDI_TYPE_WS || \
- (class) == UNICODE_BIDI_TYPE_ON || \
- (class) == UNICODE_BIDI_TYPE_FSI || \
- (class) == UNICODE_BIDI_TYPE_LRI || \
- (class) == UNICODE_BIDI_TYPE_RLI || \
- (class) == UNICODE_BIDI_TYPE_PDI)
-
enum_bidi_type_t prev_type=seq->sos;
for (iter=beg; irs_compare(&iter, &end); )
diff --git a/unicode/unicode_canonical.c b/unicode/unicode_canonical.c
new file mode 100644
index 0000000..3f6773f
--- /dev/null
+++ b/unicode/unicode_canonical.c
@@ -0,0 +1,57 @@
+/*
+** Copyright 2020 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include "unicode_config.h"
+#include "courier-unicode.h"
+#include <string.h>
+
+struct canon_map_table { /* Element type of canon_map_lookup[], which canonicalmappings.h defines */
+ char32_t lookup_char; /* The character that this entry describes */
+ unsigned char fmt_flag_v; /* A unicode_canonical_fmt_t value, stored in one byte */
+ unsigned char n_chars; /* Number of characters in the mapping */
+ unsigned short offset; /* Index in canon_map_values[] of the mapping's first character */
+};
+
+#include "canonicalmappings.h"
+
+/*
+** Looks up a character in the tables from canonicalmappings.h.
+**
+** canon_map_hash[] gives, for the character's bucket (its value modulo
+** HASH_SIZE), the index of the bucket's first canon_map_lookup[] entry.
+** The search walks forward from there and stops at the first entry that
+** belongs to a different bucket, or at the end of the table.
+**
+** Returns the matching entry's mapping, or a structure that is all zeroes
+** if the character has no entry.
+*/
+unicode_canonical_t unicode_canonical(char32_t c)
+{
+	const size_t bucket=c % HASH_SIZE;
+	const size_t n_entries=
+		sizeof(canon_map_lookup)/sizeof(canon_map_lookup[0]);
+	unicode_canonical_t ret;
+	size_t i;
+
+	/*
+	** Compile-time sanity check: the array type has a negative size,
+	** which does not compile, unless canon_map_hash has exactly
+	** HASH_SIZE elements.
+	*/
+	(void)sizeof(char[ sizeof(canon_map_hash)/
+			   sizeof(canon_map_hash[0]) == HASH_SIZE
+			   ? 1:-1]);
+
+	memset(&ret, 0, sizeof(ret));
+
+	for (i=canon_map_hash[bucket]; i < n_entries; ++i)
+	{
+		const struct canon_map_table *entry=canon_map_lookup+i;
+
+		/* Walked past this bucket's entries: no mapping. */
+		if ((entry->lookup_char % HASH_SIZE) != bucket)
+			break;
+
+		if (entry->lookup_char != c)
+			continue;
+
+		ret.canonical_chars=canon_map_values+entry->offset;
+		ret.n_canonical_chars=entry->n_chars;
+		ret.format=(unicode_canonical_fmt_t)entry->fmt_flag_v;
+		break;
+	}
+
+	return ret;
+}
diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C
index adb7869..ca139cc 100644
--- a/unicode/unicodecpp.C
+++ b/unicode/unicodecpp.C
@@ -558,13 +558,13 @@ std::u32string unicode::toupper(const std::u32string &u)
return copy;
}
-std::vector<unicode_bidi_level_t>
+std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>
unicode::bidi_calc(const std::u32string &s)
{
return unicode::bidi_calc(s, UNICODE_BIDI_SKIP);
}
-std::vector<unicode_bidi_level_t>
+std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>
unicode::bidi_calc(const std::u32string &s,
unicode_bidi_level_t paragraph_embedding_level)
{
@@ -576,16 +576,19 @@ unicode::bidi_calc(const std::u32string &s,
initial_embedding_level=&paragraph_embedding_level;
}
- std::vector<unicode_bidi_level_t> buf;
+ std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t>
+ ret;
- buf.resize(s.size());
+ std::get<0>(ret).resize(s.size());
+ std::get<1>(ret)=UNICODE_BIDI_LR;
if (s.size())
{
- unicode_bidi_calc(s.c_str(), s.size(), &buf[0],
- initial_embedding_level);
+ std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(),
+ &std::get<0>(ret)[0],
+ initial_embedding_level);
}
- return buf;
+ return ret;
}
extern "C" {