diff options
| -rw-r--r-- | unicode/Makefile.am | 6 | ||||
| -rw-r--r-- | unicode/book.xml | 233 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 56 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 282 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 39 |
5 files changed, 460 insertions, 156 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am index dbc71aa..135617a 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -91,6 +91,7 @@ man_MANS= \ $(srcdir)/man/courier-unicode.7 \ $(srcdir)/man/unicode\:\:bidi.3 \ $(srcdir)/man/unicode\:\:bidi_calc.3 \ + $(srcdir)/man/unicode\:\:bidi_calc_types.3 \ $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_embed.3 \ $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ @@ -115,12 +116,15 @@ man_MANS= \ $(srcdir)/man/unicode_bidi.3 \ $(srcdir)/man/unicode_bidi_bracket_type.3 \ $(srcdir)/man/unicode_bidi_calc.3 \ + $(srcdir)/man/unicode_bidi_calc_levels.3 \ + $(srcdir)/man/unicode_bidi_calc_types.3 \ $(srcdir)/man/unicode_bidi_cleanup.3 \ $(srcdir)/man/unicode_bidi_embed.3 \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ $(srcdir)/man/unicode_bidi_logical_order.3 \ $(srcdir)/man/unicode_bidi_mirror.3 \ $(srcdir)/man/unicode_bidi_reorder.3 \ + $(srcdir)/man/unicode_bidi_setbnl.3 \ $(srcdir)/man/unicode_bidi_type.3 \ $(srcdir)/man/unicode_canonical.3 \ $(srcdir)/man/unicode_category_lookup.3 \ @@ -432,7 +436,7 @@ docs.stamp: rm -rf man.tmp mkdir man.tmp d=`cd $(srcdir); pwd`; cd man.tmp; xsltproc --nonet --xinclude \ - http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl\ + http://cdn.docbook.org/release/xsl-nons/current//manpages/docbook.xsl\ $$d/book.xml mkdir -p man rm -f man/*.[123456789] diff --git a/unicode/book.xml b/unicode/book.xml index b0342ea..c0e0485 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -301,6 +301,8 @@ See COPYING for distribution information. <refnamediv> <refname>unicode_bidi</refname> + <refname>unicode_bidi_calc_levels</refname> + <refname>unicode_bidi_calc_types</refname> <refname>unicode_bidi_calc</refname> <refname>unicode_bidi_reorder</refname> <refname>unicode_bidi_cleanup</refname> @@ -309,6 +311,7 @@ See COPYING for distribution information. 
<refname>unicode_bidi_embed_paragraph_level</refname> <refname>unicode_bidi_type</refname> + <refname>unicode_bidi_setbnl</refname> <refname>unicode_bidi_mirror</refname> <refname>unicode_bidi_bracket_type</refname> @@ -318,6 +321,23 @@ See COPYING for distribution information. <refsynopsisdiv> <funcsynopsis> <funcsynopsisinfo>#include <courier-unicode.h> unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo> + + <funcprototype> + <funcdef>void <function>unicode_bidi_calc_types</function></funcdef> + <paramdef>const char32_t *<parameter>p</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_type_t *<parameter>types</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>void <function>unicode_bidi_calc_levels</function></funcdef> + <paramdef>const char32_t *<parameter>p</parameter></paramdef> + <paramdef>const unicode_bidi_type_t *<parameter>types</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>const unicode_bidi_level_t *<parameter>initial_embedding_level</parameter></paramdef> + </funcprototype> + <funcprototype> <funcdef>void <function>unicode_bidi_calc</function></funcdef> <paramdef>const char32_t *<parameter>p</parameter></paramdef> @@ -387,6 +407,14 @@ See COPYING for distribution information. <funcdef>enum_bidi_type_t <function>unicode_bidi_type</function></funcdef> <paramdef>char32_t <parameter>c</parameter></paramdef> </funcprototype> + + <funcprototype> + <funcdef>void <function>unicode_bidi_setbnl</function></funcdef> + <paramdef>char32_t *<parameter>p</parameter></paramdef> + <paramdef>const unicode_bidi_type_t *<parameter>types</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + </funcprototype> + </funcsynopsis> </refsynopsisdiv> <refsect1 id="unicode_bidi_descr"> @@ -417,19 +445,49 @@ See COPYING for distribution information. 
<listitem> <para> Allocate an array of + <structname>unicode_bidi_type_t</structname> that's the + same size as the Unicode string. + </para> + </listitem> + <listitem> + <para> + Allocate an array of <structname>unicode_bidi_level_t</structname> that's the same size as the Unicode string. </para> </listitem> + + <listitem> + <para> + Use <function>unicode_bidi_calc_types</function>() to compute + the Unicode string's characters' bi-directional types, + and populate the + <structname>unicode_bidi_type_t</structname> buffer. + </para> + </listitem> + <listitem> <para> - Use <function>unicode_bidi_calc</function>() to compute + Use <function>unicode_bidi_calc_levels</function>() to compute the Unicode string's characters' bi-directional embedding level (executes the Bi-Directional algorithm up to and including step L1). This populates the <structname>unicode_bidi_level_t</structname> buffer. </para> </listitem> + + <listitem> + <para> + Alternatively: allocate only the + <structname>unicode_bidi_level_t</structname> array + and use <function>unicode_bidi_calc</function>(), which + <function>malloc</function>()s the + <structname>unicode_bidi_type_t</structname> buffer, + calls <function>unicode_bidi_calc_levels</function>(), + and then <function>free</function>()s the buffer. + </para> + </listitem> + <listitem> <para> Use <function>unicode_bidi_reorder</function>() to reverse @@ -451,7 +509,7 @@ See COPYING for distribution information. <para> The parameters to - <function>unicode_bidi_calc</function>() are: + <function>unicode_bidi_calc_types</function>() are: </para> <itemizedlist> @@ -468,6 +526,42 @@ See COPYING for distribution information. <listitem> <para> A pointer to an array of + <structname>unicode_bidi_type_t</structname> values. + The caller is + responsible for allocating and deallocating this array, + which has the same size as the Unicode string. 
+ </para> + </listitem> + </itemizedlist> + + <para> + The parameters to + <function>unicode_bidi_calc_levels</function>() are: + </para> + + <itemizedlist> + <listitem> + <para> + A pointer to the Unicode string. + </para> + </listitem> + + <listitem> + <para> + A pointer to the buffer that was passed to + <function>unicode_bidi_calc_types</function>(). + </para> + </listitem> + + <listitem> + <para> + Number of characters in the Unicode string and the + <structname>unicode_bidi_type_t</structname> buffer. + </para> + </listitem> + <listitem> + <para> + A pointer to an array of <structname>unicode_bidi_level_t</structname> values. The caller is responsible for allocating and deallocating this array, @@ -488,7 +582,18 @@ See COPYING for distribution information. </itemizedlist> <para> - <function>unicode_bidi_calc</function>() fills in the + The parameters to <function>unicode_bidi_calc</function>() are + the same except for the + <structname>unicode_bidi_type_t</structname> pointer. + <function>unicode_bidi_calc</function>() allocates this + buffer by itself and calls + <function>unicode_bidi_calc_types</function>, and + destroys the buffer before returning. + </para> + + <para> + <function>unicode_bidi_calc</function>() + and <function>unicode_bidi_calc_levels</function>() fill in the <structname>unicode_bidi_level_t</structname> array with the values corresponding to the embedding level of the corresponding character, @@ -500,7 +605,9 @@ See COPYING for distribution information. </para> <para> - <function>unicode_bidi_calc</function>() returns the resolved + <function>unicode_bidi_calc</function>() + and <function>unicode_bidi_calc_levels</function>() + return the resolved paragraph direction level, which always matches the passed in level, if specified, else it reports the @@ -510,7 +617,8 @@ See COPYING for distribution information. 
<para> <function>unicode_bidi_reorder</function>() takes the actual unicode string together with the embedding values from - <function>unicode_bidi_calc</function>, then reverses the + <function>unicode_bidi_calc</function> or + <function>unicode_bidi_calc_levels</function>(), then reverses the bi-directional string, as specified by step L2 of the bi-directional algorithm. The parameters to @@ -698,7 +806,8 @@ See COPYING for distribution information. basic, but the resulting bi-directional string produces the same canonical rendering order after applying - <function>unicode_bidi_calc()</function>, + <function>unicode_bidi_calc()</function> or + <function>unicode_bidi_calc_levels</function>(), <function>unicode_reorder()</function> and <function>unicode_bidi_cleanup()</function> (with the canonical option), @@ -847,7 +956,9 @@ See COPYING for distribution information. default paragraph embedding level and returns 0 if it matches. Otherwise it returns a directional marker that should be <emphasis>prepended</emphasis> to the Unicode string to allow - <function>unicode_bidi_calc</function>'s optional paragraph + <function>unicode_bidi_calc</function>'s + (or <function>unicode_bidi_calc_levels</function>()) + optional paragraph embedding level pointer's value to be <literal>NULL</literal>, but derive the same default embedding level. The parameters to @@ -880,6 +991,18 @@ See COPYING for distribution information. <function>unicode_bidi_type</function> looks up each character's bi-directional character type. </para> + + <para> + <function>unicode_bidi_setbnl</function> + takes a pointer to a unicode string, a pointer to an + array of <classname>enum_bidi_type_t</classname> values and + the number of characters in the string and the array. 
+ <function>unicode_bidi_setbnl</function> replaces all + paragraph separators in the unicode string with a newline + character (same as the <literal>UNICODE_BIDI_CLEANUP_BNL</literal> + option to <function>unicode_bidi_cleanup</function>). + </para> + <para> <function>unicode_bidi_mirror</function> returns the glyph that's a mirror image of the parameter @@ -2661,6 +2784,7 @@ See COPYING for distribution information. <refnamediv> <refname>unicode::bidi</refname> <refname>unicode::bidi_calc</refname> + <refname>unicode::bidi_calc_types</refname> <refname>unicode::bidi_reorder</refname> <refname>unicode::bidi_cleanup</refname> <refname>unicode::bidi_logical_order</refname> @@ -2670,16 +2794,40 @@ See COPYING for distribution information. </refnamediv> <refsynopsisdiv> + <synopsis>#include <courier-unicode.h></synopsis> + <classsynopsis class="class" language="C++"> + <ooclass> + <classname>struct unicode::bidi_calc_types</classname> + </ooclass> + <constructorsynopsis> + <methodname>bidi_calc_types</methodname> + <methodparam><modifier>const std::u32string &</modifier><parameter>string</parameter> + </methodparam> + </constructorsynopsis> + <fieldsynopsis> + <modifier>std::vector<unicode_bidi_type_t></modifier> + <varname>types</varname> + </fieldsynopsis> + + <methodsynopsis> + <void /> + <methodname>setbnl</methodname> + <methodparam> + <modifier>std::u32string &</modifier> + <parameter>string</parameter> + </methodparam> + </methodsynopsis> + </classsynopsis> + <funcsynopsis> - <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> <funcprototype> <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> - <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const unicode::bidi_calc_types &<parameter>ustring</parameter></paramdef> </funcprototype> <funcprototype> <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> 
<function>unicode::bidi_calc</function></funcdef> - <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const unicode::bidi_calc_types &<parameter>ustring</parameter></paramdef> <paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef> </funcprototype> @@ -2766,9 +2914,70 @@ See COPYING for distribution information. <listitem> <para> <function>unicode::bidi_calc</function> returns the - directional embedding value buffer and the paragraph - embedding level. + directional embedding value buffer and the calculated paragraph + embedding level. Its <parameter>ustring</parameter> + is implicitly converted from a + <classname>std::u32string</classname>: </para> + <blockquote> + <informalexample> + <programlisting><![CDATA[ +std::u32string text; + +auto [levels, level]=unicode::bidi_calc(text); + +]]></programlisting> + </informalexample> + </blockquote> + + <para> + Alternatively a <classname>unicode::bidi_calc_types</classname> + object gets constructed from the same + <classname>std::u32string</classname> and then passed + directly to <function>unicode::bidi_calc</function>: + </para> + <blockquote> + <informalexample> + <programlisting><![CDATA[ +std::u32string text; + +unicode::bidi_calc_types types{text}; + +types.setbnl(text); // Optional + +// types.types is a std::vector of enum_bidi_type_t values + +auto [levels, level]=unicode::bidi_calc(types); + +]]></programlisting> + </informalexample> + </blockquote> + <para> + This provides the means to access the intermediate + <classname>enum_bidi_type_t</classname> values that + get calculated from the Unicode text string. + </para> + + <note> + <para> + In all cases the <classname>std::u32string</classname> + cannot be a temporary object, and it must remain in scope + until <function>unicode::bidi_calc</function>() returns. 
+ </para> + </note> + + <para> + The optional <methodname>setbnl</methodname>() method uses + <link linkend="unicode_bidi"> + <citerefentry> + <refentrytitle>unicode_bidi_setbnl</refentrytitle> + <manvolnum>3</manvolnum></citerefentry></link> + to replace paragraph separators with newline characters, + in the unicode string. It requires the same unicode string + that was passed to the constructor as a parameter (because + the constructor takes a constant reference, but this + method modifies the string). + </para> </listitem> <listitem> <para> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index 3de76d3..a3a59f4 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -626,6 +626,20 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); +extern void unicode_bidi_calc_types(const char32_t *p, size_t n, + enum_bidi_type_t *buf); + +extern void unicode_bidi_setbnl(char32_t *p, + const enum_bidi_type_t *types, + size_t n); + +extern unicode_bidi_level_t unicode_bidi_calc_levels(const char32_t *p, + const enum_bidi_type_t + *types, + size_t n, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t + *initial_embedding_level); /* Bitmask options to unicode_bidi_cleanup */ /* @@ -2153,13 +2167,51 @@ std::u32string tolower(const std::u32string &u); std::u32string toupper(const std::u32string &u); +//! Calculate bidirectional character types + +//! Passed as a parameter to bidi_calc(), supplying the string and the +//! calculated bidirectional types. + +struct bidi_calc_types { + const std::u32string &s; + + //! Calculated bidirectional types. + + std::vector<enum_bidi_type_t> types; + + //! A reference to an existing std::u32string + + //! bidi_calc_types can be constructed only from a reference to + //! an existing std::u32string. + bidi_calc_types(const std::u32string &); + +#if __cplusplus >= 201103L + //! Deleted constructor + + //! bidi_calc_types cannot be constructed from a temporary + //! 
std::u32string. + bidi_calc_types(std::u32string &&)=delete; +#endif + //! Replace all paragraph breaks by newlines. + + void setbnl(std::u32string &); + + //! Destructor + + ~bidi_calc_types(); +}; + //! Calculate bidirectional embedding levels //! Returns the bidirectional embedding levels, and the paragraph //! embedding level. +//! +//! The first parameter can be implicitly converted from an existing +//! std::u32string object. Alternatively a bidi_calc_types helper +//! can be constructed explicitly, and then passed in directly. std::tuple<std::vector<unicode_bidi_level_t>, - unicode_bidi_level_t> bidi_calc(const std::u32string &s); + unicode_bidi_level_t> bidi_calc(const bidi_calc_types &s); //! Calculate bidirectional embedding levels @@ -2170,7 +2222,7 @@ std::tuple<std::vector<unicode_bidi_level_t>, //! embedding level. std::tuple<std::vector<unicode_bidi_level_t>, - unicode_bidi_level_t> bidi_calc(const std::u32string &s, + unicode_bidi_level_t> bidi_calc(const bidi_calc_types &s, unicode_bidi_level_t level); //! 
Reorder bidirectional text diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index cfae12f..92fe8a7 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -466,8 +466,8 @@ typedef struct { unicode_bidi_level_t paragraph_embedding_level; const char32_t *chars; - enum_bidi_type_t *classes; - enum_bidi_type_t *orig_classes; + enum_bidi_type_t *types; + const enum_bidi_type_t *orig_types; unicode_bidi_level_t *levels; size_t size; int overflow_isolate_count; @@ -500,29 +500,29 @@ const char *bidi_classname(enum_bidi_type_t classenum) } -void dump_classes(const char *prefix, directional_status_stack_t stack) +void dump_types(const char *prefix, directional_status_stack_t stack) { fprintf(DEBUGDUMP, "%s: ", prefix); for (size_t i=0; i<stack->size; ++i) { fprintf(DEBUGDUMP, " %s(%d)", - bidi_classname(stack->classes[i]), + bidi_classname(stack->types[i]), (int)stack->levels[i]); } fprintf(DEBUGDUMP, "\n"); } -void dump_orig_classes(const char *prefix, directional_status_stack_t stack) +void dump_orig_types(const char *prefix, directional_status_stack_t stack) { fprintf(DEBUGDUMP, "%s: ", prefix); for (size_t i=0; i<stack->size; ++i) { fprintf(DEBUGDUMP, " %s(%s%s%d)", - bidi_classname(stack->classes[i]), - (stack->classes[i] != stack->orig_classes[i] ? - bidi_classname(stack->orig_classes[i]):""), - (stack->classes[i] != stack->orig_classes[i] ? "/":""), + bidi_classname(stack->types[i]), + (stack->types[i] != stack->orig_types[i] ? + bidi_classname(stack->orig_types[i]):""), + (stack->types[i] != stack->orig_types[i] ? 
"/":""), (int)stack->levels[i]); } fprintf(DEBUGDUMP, "\n"); @@ -624,7 +624,7 @@ compute_paragraph_embedding_level_from_types(const enum_bidi_type_t *p, static directional_status_stack_t directional_status_stack_init(const char32_t *chars, - enum_bidi_type_t *classes, size_t n, + const enum_bidi_type_t *types, size_t n, unicode_bidi_level_t *levels, const unicode_bidi_level_t *initial_embedding_level) @@ -636,23 +636,23 @@ directional_status_stack_init(const char32_t *chars, stack->paragraph_embedding_level= initial_embedding_level ? *initial_embedding_level & 1 - : compute_paragraph_embedding_level_from_types(classes, 0, n); + : compute_paragraph_embedding_level_from_types(types, 0, n); stack->chars=chars; - stack->classes=classes; + stack->orig_types=types; if (n) { - classes=(enum_bidi_type_t *) + stack->types=(enum_bidi_type_t *) malloc(sizeof(enum_bidi_type_t)*n); - if (!classes) + if (!stack->types) abort(); - memcpy(classes, stack->classes, sizeof(enum_bidi_type_t)*n); + memcpy(stack->types, stack->orig_types, + sizeof(enum_bidi_type_t)*n); } else { - classes=0; + stack->types=0; } - stack->orig_classes=classes; stack->levels=levels; stack->size=n; @@ -682,19 +682,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack) { while (stack->head) directional_status_stack_pop(stack); - if (stack->orig_classes) - free(stack->orig_classes); + if (stack->types) + free(stack->types); isolating_run_sequences_deinit(&stack->isolating_run_sequences); free(stack); } -static unicode_bidi_level_t -unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level); - enum_bidi_type_t unicode_bidi_type(char32_t c) { return (enum_bidi_type_t) @@ -707,35 +700,51 @@ enum_bidi_type_t unicode_bidi_type(char32_t c) UNICODE_BIDI_TYPE_L); } -unicode_bidi_level_t -unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, - const unicode_bidi_level_t 
*initial_embedding_level) + +void unicode_bidi_calc_types(const char32_t *p, size_t n, + enum_bidi_type_t *buf) { /* ** Look up the bidi class for each char32_t. - ** - ** When we encounter a paragraph break we call unicode_bidi_b() to - ** process it. */ - - enum_bidi_type_t *buf= - (enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); - - if (!buf) - abort(); for (size_t i=0; i<n; ++i) { buf[i]=unicode_bidi_type(p[i]); #ifdef UNICODE_BIDI_TEST UNICODE_BIDI_TEST(i); #endif - bufp[i]=UNICODE_BIDI_SKIP; } +} + +void unicode_bidi_setbnl(char32_t *p, + const enum_bidi_type_t *types, + size_t n) +{ + for (size_t i=0; i<n; i++) + if (types[i] == UNICODE_BIDI_TYPE_B) + { + p[i]='\n'; + } +} + +unicode_bidi_level_t +unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level) +{ + enum_bidi_type_t *buf= + (enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); - unicode_bidi_level_t level=unicode_bidi_b(p, n, - buf, - bufp, - initial_embedding_level); + if (!buf) + abort(); + + unicode_bidi_calc_types(p, n, buf); + + unicode_bidi_level_t level= + unicode_bidi_calc_levels(p, + buf, + n, + bufp, + initial_embedding_level); free(buf); @@ -744,16 +753,21 @@ unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, static void unicode_bidi_cl(directional_status_stack_t stack); -static unicode_bidi_level_t -unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level) +unicode_bidi_level_t +unicode_bidi_calc_levels(const char32_t *p, + const enum_bidi_type_t *types, + size_t n, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level) { directional_status_stack_t stack; - stack=directional_status_stack_init(p, buf, n, bufp, + for (size_t i=0; i<n; ++i) + { + bufp[i]=UNICODE_BIDI_SKIP; + } + + stack=directional_status_stack_init(p, types, n, bufp, 
initial_embedding_level); unicode_bidi_level_t paragraph_embedding_level= @@ -779,7 +793,7 @@ unicode_bidi_b(const char32_t *p, } \ } while(0) -static void unicode_bidi_w(enum_bidi_type_t *classes, +static void unicode_bidi_w(enum_bidi_type_t *types, struct isolating_run_sequence_s *seq); static void unicode_bidi_n(directional_status_stack_t stack, struct isolating_run_sequence_s *seq); @@ -811,7 +825,7 @@ void dump_sequence(const char *what, directional_status_stack_t stack, while (irs_compare(&beg, &end)) { fprintf(DEBUGDUMP, " %s(%d)", - bidi_classname(stack->classes[beg.i]), + bidi_classname(stack->types[beg.i]), (int)stack->levels[beg.i]); irs_incr(&beg); } @@ -822,7 +836,7 @@ void dump_sequence(const char *what, directional_status_stack_t stack, static void unicode_bidi_cl(directional_status_stack_t stack) { #ifdef BIDI_DEBUG - dump_classes("Before X1", stack); + dump_types("Before X1", stack); #endif for (size_t i=0; i<stack->size; i++) @@ -839,7 +853,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) embedding_level |= 1, \ ++embedding_level) - switch (stack->classes[i]) { + switch (stack->types[i]) { case UNICODE_BIDI_TYPE_RLE: /* X2 */ NEXT_ODD_EMBEDDING_LEVEL; @@ -927,7 +941,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) break; } - enum_bidi_type_t cur_class=stack->classes[i]; + enum_bidi_type_t cur_class=stack->types[i]; if (cur_class == UNICODE_BIDI_TYPE_FSI) { /* X5c */ @@ -938,9 +952,9 @@ static void unicode_bidi_cl(directional_status_stack_t stack) while (++j < stack->size) { - if (is_isolate_initiator(stack->classes[j])) + if (is_isolate_initiator(stack->types[j])) ++in_isolation; - else if (stack->classes[j] == UNICODE_BIDI_TYPE_PDI) + else if (stack->types[j] == UNICODE_BIDI_TYPE_PDI) { if (--in_isolation == 0) break; @@ -948,7 +962,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) } cur_class=compute_paragraph_embedding_level_from_types - (stack->classes, i+1, j) == 1 + (stack->types, i+1, j) 
== 1 ? UNICODE_BIDI_TYPE_RLI : UNICODE_BIDI_TYPE_LRI; } @@ -957,7 +971,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) case UNICODE_BIDI_TYPE_RLI: /* X5a */ stack->levels[i]=stack->head->embedding_level; - RESET_CLASS(stack->classes[i],stack); + RESET_CLASS(stack->types[i],stack); NEXT_ODD_EMBEDDING_LEVEL; @@ -979,7 +993,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) case UNICODE_BIDI_TYPE_LRI: /* X5b */ stack->levels[i]=stack->head->embedding_level; - RESET_CLASS(stack->classes[i],stack); + RESET_CLASS(stack->types[i],stack); NEXT_EVEN_EMBEDDING_LEVEL; @@ -1002,14 +1016,14 @@ static void unicode_bidi_cl(directional_status_stack_t stack) break; } - if (!is_explicit_indicator(stack->orig_classes[i])) + if (!is_explicit_indicator(stack->orig_types[i])) { /* X6 */ stack->levels[i]=stack->head->embedding_level; - RESET_CLASS(stack->classes[i],stack); + RESET_CLASS(stack->types[i],stack); } - if (stack->classes[i] == UNICODE_BIDI_TYPE_PDI) + if (stack->types[i] == UNICODE_BIDI_TYPE_PDI) { /* X6a */ if (stack->overflow_isolate_count > 0) @@ -1052,10 +1066,10 @@ static void unicode_bidi_cl(directional_status_stack_t stack) } } stack->levels[i]=stack->head->embedding_level; - RESET_CLASS(stack->classes[i],stack); + RESET_CLASS(stack->types[i],stack); } - if (stack->classes[i] == UNICODE_BIDI_TYPE_PDF) + if (stack->types[i] == UNICODE_BIDI_TYPE_PDF) { /* X7 */ @@ -1077,7 +1091,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) } } - if (stack->classes[i] == UNICODE_BIDI_TYPE_B) + if (stack->types[i] == UNICODE_BIDI_TYPE_B) { /* X8 */ @@ -1105,7 +1119,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) for (size_t i=0; i<stack->size; ++i) { - if (IS_X9(stack->classes[i])) + if (IS_X9(stack->types[i])) { if (stack->levels[i] != UNICODE_BIDI_SKIP) { @@ -1166,7 +1180,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) /* X10 */ #ifdef BIDI_DEBUG - dump_classes("Before X10", stack); + 
dump_types("Before X10", stack); #endif for (struct isolating_run_sequence_s *p= @@ -1201,7 +1215,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) if (first_i > 0) before=stack->levels[first_i-1]; - if (!is_isolate_initiator(stack->classes[end_iter.i])) + if (!is_isolate_initiator(stack->types[end_iter.i])) { while (end_i < stack->size && stack->levels[end_i] == UNICODE_BIDI_SKIP) @@ -1244,7 +1258,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) dump_sequence("Contents before W", stack, p); #endif - unicode_bidi_w(stack->classes, p); + unicode_bidi_w(stack->types, p); #ifdef BIDI_DEBUG dump_sequence("Contents after W", stack, p); @@ -1252,7 +1266,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) unicode_bidi_n(stack, p); } #ifdef BIDI_DEBUG - dump_orig_classes("Before L1", stack); + dump_orig_types("Before L1", stack); #endif /* @@ -1267,10 +1281,10 @@ static void unicode_bidi_cl(directional_status_stack_t stack) { --i; - if (IS_X9(stack->orig_classes[i])) + if (IS_X9(stack->orig_types[i])) continue; - switch (stack->orig_classes[i]) { + switch (stack->orig_types[i]) { case UNICODE_BIDI_TYPE_WS: case UNICODE_BIDI_TYPE_FSI: case UNICODE_BIDI_TYPE_LRI: @@ -1292,7 +1306,7 @@ static void unicode_bidi_cl(directional_status_stack_t stack) } } -static void unicode_bidi_w(enum_bidi_type_t *classes, +static void unicode_bidi_w(enum_bidi_type_t *types, struct isolating_run_sequence_s *seq) { irs_iterator iter=irs_begin(seq), end=irs_end(seq); @@ -1302,10 +1316,10 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, while (irs_compare(&iter, &end)) { - if (classes[iter.i] == UNICODE_BIDI_TYPE_NSM) + if (types[iter.i] == UNICODE_BIDI_TYPE_NSM) { /* W1 */ - classes[iter.i] = + types[iter.i] = is_isolate_initiator(previous_type) || previous_type == UNICODE_BIDI_TYPE_PDI ? 
UNICODE_BIDI_TYPE_ON @@ -1315,14 +1329,14 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, /* W2 */ - if (classes[iter.i] == UNICODE_BIDI_TYPE_EN && + if (types[iter.i] == UNICODE_BIDI_TYPE_EN && strong_type == UNICODE_BIDI_TYPE_AL) { - classes[iter.i] = UNICODE_BIDI_TYPE_AN; + types[iter.i] = UNICODE_BIDI_TYPE_AN; } /* W2 */ - previous_type=classes[iter.i]; + previous_type=types[iter.i]; switch (previous_type) { case UNICODE_BIDI_TYPE_R: @@ -1346,12 +1360,12 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, while (not_eol) { /* W3 */ - if (classes[iter.i] == UNICODE_BIDI_TYPE_AL) - classes[iter.i] = UNICODE_BIDI_TYPE_R; + if (types[iter.i] == UNICODE_BIDI_TYPE_AL) + types[iter.i] = UNICODE_BIDI_TYPE_R; /* W4 */ - enum_bidi_type_t this_type=classes[iter.i]; + enum_bidi_type_t this_type=types[iter.i]; irs_incr(&iter); not_eol=irs_compare(&iter, &end); @@ -1366,13 +1380,13 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, previous_type == UNICODE_BIDI_TYPE_AN) ) ) && - classes[iter.i] == previous_type) + types[iter.i] == previous_type) { irs_iterator prev=iter; irs_decr(&prev); - classes[prev.i]=previous_type; + types[prev.i]=previous_type; } if (not_eol) @@ -1387,9 +1401,9 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, while (irs_compare(&iter, &end)) { - if (classes[iter.i] != UNICODE_BIDI_TYPE_ET) + if (types[iter.i] != UNICODE_BIDI_TYPE_ET) { - previous_type=classes[iter.i]; + previous_type=types[iter.i]; irs_incr(&iter); continue; } @@ -1397,7 +1411,7 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, /* ET after EN */ if (previous_type == UNICODE_BIDI_TYPE_EN) { - classes[iter.i] = UNICODE_BIDI_TYPE_EN; + types[iter.i] = UNICODE_BIDI_TYPE_EN; irs_incr(&iter); continue; } @@ -1408,7 +1422,7 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, while (irs_incr(&iter), irs_compare(&iter, &end)) { - previous_type=classes[iter.i]; + previous_type=types[iter.i]; if (previous_type == UNICODE_BIDI_TYPE_ET) continue; @@ -1417,7 
+1431,7 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, { while (irs_compare(&start, &iter)) { - classes[start.i]= + types[start.i]= UNICODE_BIDI_TYPE_EN; irs_incr(&start); } @@ -1431,12 +1445,12 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, for (iter=irs_begin(seq); irs_compare(&iter, &end); irs_incr(&iter)) { - switch (classes[iter.i]) { + switch (types[iter.i]) { case UNICODE_BIDI_TYPE_ET: case UNICODE_BIDI_TYPE_ES: case UNICODE_BIDI_TYPE_CS: /* W6 */ - classes[iter.i]=UNICODE_BIDI_TYPE_ON; + types[iter.i]=UNICODE_BIDI_TYPE_ON; break; default: break; @@ -1450,14 +1464,14 @@ static void unicode_bidi_w(enum_bidi_type_t *classes, while (irs_compare(&iter, &end)) { - switch (classes[iter.i]) { + switch (types[iter.i]) { case UNICODE_BIDI_TYPE_L: case UNICODE_BIDI_TYPE_R: - previous_type=classes[iter.i]; + previous_type=types[iter.i]; break; case UNICODE_BIDI_TYPE_EN: if (previous_type == UNICODE_BIDI_TYPE_L) - classes[iter.i]=previous_type; + types[iter.i]=previous_type; break; default: break; @@ -1512,7 +1526,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, char32_t open_or_close_bracket=0; - if (IS_NI(stack->classes[iter.i])) + if (IS_NI(stack->types[iter.i])) { open_or_close_bracket= unicode_bidi_bracket_type(stack->chars[iter.i], @@ -1596,7 +1610,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, ** we record these facts there. */ - enum_bidi_type_t eoclass=stack->classes[iter.i]; + enum_bidi_type_t eoclass=stack->types[iter.i]; #define ADJUST_EOCLASS(eoclass) do { \ \ @@ -1679,8 +1693,8 @@ static void unicode_bidi_n(directional_status_stack_t stack, #endif if (p->has_e) { - stack->classes[p->start.i]= - stack->classes[p->end.i]= + stack->types[p->start.i]= + stack->types[p->end.i]= seq->embedding_level & 1 ? 
UNICODE_BIDI_TYPE_R : UNICODE_BIDI_TYPE_L; @@ -1696,7 +1710,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, irs_decr(&iter); enum_bidi_type_t eoclass= - stack->classes[iter.i]; + stack->types[iter.i]; ADJUST_EOCLASS(eoclass); @@ -1717,8 +1731,8 @@ static void unicode_bidi_n(directional_status_stack_t stack, break; } - stack->classes[p->start.i]= - stack->classes[p->end.i]= + stack->types[p->start.i]= + stack->types[p->end.i]= strong_type; set=1; } @@ -1726,16 +1740,16 @@ static void unicode_bidi_n(directional_status_stack_t stack, if (set) { enum_bidi_type_t strong_type= - stack->classes[p->end.i]; + stack->types[p->end.i]; while (irs_incr(&p->end), irs_compare(&p->end, &end)) { - if (stack->orig_classes[p->end.i] != + if (stack->orig_types[p->end.i] != UNICODE_BIDI_TYPE_NSM) break; - stack->classes[p->end.i]=strong_type; + stack->types[p->end.i]=strong_type; } } } @@ -1752,7 +1766,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, ** N1 */ - enum_bidi_type_t this_type=stack->classes[iter.i]; + enum_bidi_type_t this_type=stack->types[iter.i]; ADJUST_EOCLASS(this_type); @@ -1777,13 +1791,13 @@ static void unicode_bidi_n(directional_status_stack_t stack, while (irs_compare(&iter, &end)) { - if (IS_NI(stack->classes[iter.i])) + if (IS_NI(stack->types[iter.i])) { irs_incr(&iter); continue; } - enum_bidi_type_t other_type=stack->classes[iter.i]; + enum_bidi_type_t other_type=stack->types[iter.i]; ADJUST_EOCLASS(other_type); @@ -1808,7 +1822,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, if (next_type == prev_type) { - stack->classes[start.i]=next_type; /* N1 */ + stack->types[start.i]=next_type; /* N1 */ } irs_incr(&start); @@ -1817,9 +1831,9 @@ static void unicode_bidi_n(directional_status_stack_t stack, for (iter=beg; irs_compare(&iter, &end); ) { - if (IS_NI(stack->classes[iter.i])) + if (IS_NI(stack->types[iter.i])) { - stack->classes[iter.i]= + stack->types[iter.i]= stack->levels[iter.i] & 1 ? 
UNICODE_BIDI_TYPE_R : UNICODE_BIDI_TYPE_L; /* N2 */ @@ -1838,7 +1852,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, { if ((stack->levels[iter.i] & 1) == 0) { - switch (stack->classes[iter.i]) { + switch (stack->types[iter.i]) { case UNICODE_BIDI_TYPE_R: ++stack->levels[iter.i]; break; @@ -1851,7 +1865,7 @@ static void unicode_bidi_n(directional_status_stack_t stack, } else { - switch (stack->classes[iter.i]) { + switch (stack->types[iter.i]) { case UNICODE_BIDI_TYPE_L: case UNICODE_BIDI_TYPE_AN: case UNICODE_BIDI_TYPE_EN: @@ -2193,7 +2207,7 @@ static void need_marker_info_merge(struct need_marker_info *info, } static void emit_bidi_embed_levelrun(const char32_t *string, - enum_bidi_type_t *classes, + enum_bidi_type_t *types, struct bidi_embed_levelrun *run, unicode_bidi_level_t paragraph_level, unicode_bidi_level_t previous_level, @@ -2206,7 +2220,7 @@ static void emit_bidi_embed_levelrun(const char32_t *string, /* L1 */ -static int is_l1_on_or_after(const enum_bidi_type_t *classes, +static int is_l1_on_or_after(const enum_bidi_type_t *types, size_t n, size_t i, int atend) @@ -2216,7 +2230,7 @@ static int is_l1_on_or_after(const enum_bidi_type_t *classes, */ while (i<n) { - enum_bidi_type_t t=classes[i]; + enum_bidi_type_t t=types[i]; if (t == UNICODE_BIDI_TYPE_WS) { @@ -2261,14 +2275,14 @@ void unicode_bidi_embed(const char32_t *string, void *arg) { struct bidi_embed_levelrun *runs=0; - enum_bidi_type_t *classes= + enum_bidi_type_t *types= (enum_bidi_type_t *)calloc(n, sizeof(enum_bidi_type_t)); - if (!classes) + if (!types) abort(); for (size_t i=0; i<n; ++i) - classes[i]=unicode_bidi_type(string[i]); + types[i]=unicode_bidi_type(string[i]); compute_bidi_embed_levelruns(string, levels, n, @@ -2315,10 +2329,10 @@ void unicode_bidi_embed(const char32_t *string, need_marker_info_init(&need_marker); - if (classes[p->end-1] == UNICODE_BIDI_TYPE_WS) + if (types[p->end-1] == UNICODE_BIDI_TYPE_WS) { need_marker.need_marker= - is_l1_on_or_after(classes, 
n, + is_l1_on_or_after(types, n, p->end, 0); #ifdef BIDI_DEBUG @@ -2328,7 +2342,7 @@ void unicode_bidi_embed(const char32_t *string, } - emit_bidi_embed_levelrun(string, classes, + emit_bidi_embed_levelrun(string, types, p, paragraph_level, previous_level, next_level, @@ -2364,12 +2378,12 @@ void unicode_bidi_embed(const char32_t *string, size_t j=p->end; int end_with_ws= - classes[j-1] == UNICODE_BIDI_TYPE_WS; + types[j-1] == UNICODE_BIDI_TYPE_WS; while (j > p->start) { --j; - enum_bidi_type_t t=classes[j]; + enum_bidi_type_t t=types[j]; if (t == UNICODE_BIDI_TYPE_S || t == UNICODE_BIDI_TYPE_B) @@ -2416,7 +2430,7 @@ void unicode_bidi_embed(const char32_t *string, p->start=j; emit_bidi_embed_levelrun - (string, classes, p, paragraph_level, + (string, types, p, paragraph_level, previous_level, j == i @@ -2438,7 +2452,7 @@ void unicode_bidi_embed(const char32_t *string, if (end_with_ws) need_marker.need_marker= is_l1_on_or_after - (classes, n, + (types, n, orig_end, 0); need_marker_info_merge @@ -2455,7 +2469,7 @@ void unicode_bidi_embed(const char32_t *string, } free(p); } - free(classes); + free(types); } #define ADJUST_LR(t,e) do { \ @@ -2484,7 +2498,7 @@ void unicode_bidi_embed(const char32_t *string, } while (0) static void emit_bidi_embed_levelrun(const char32_t *string, - enum_bidi_type_t *classes, + enum_bidi_type_t *types, struct bidi_embed_levelrun *run, unicode_bidi_level_t paragraph_level, unicode_bidi_level_t previous_level, @@ -2519,18 +2533,18 @@ static void emit_bidi_embed_levelrun(const char32_t *string, seq.runs.cap_level_runs=1; lrun.start=run->start; lrun.end=run->end; - unicode_bidi_w(classes, &seq); + unicode_bidi_w(types, &seq); /* ** Peek at the first character's class. ** ** If the previous sequence's embedding level was the same, it ** guarantees the peristence of the embedding direction. We can - ** accept classes that default to our embedding level. + ** accept types that default to our embedding level. 
** - ** Otherwise we recognize only strong classes. + ** Otherwise we recognize only strong types. */ - enum_bidi_type_t t=classes[run->start]; + enum_bidi_type_t t=types[run->start]; if (previous_level == run->level) { @@ -2587,7 +2601,7 @@ static void emit_bidi_embed_levelrun(const char32_t *string, */ while (i < end) { - enum_bidi_type_t t=classes[i]; + enum_bidi_type_t t=types[i]; ADJUST_LR(t, e_type); @@ -2628,7 +2642,7 @@ static void emit_bidi_embed_levelrun(const char32_t *string, (*emit)(&override_start, 1, arg); while (++i < end) { - enum_bidi_type_t t=classes[i]; + enum_bidi_type_t t=types[i]; switch (t) { case UNICODE_BIDI_TYPE_WS: @@ -2656,7 +2670,7 @@ static void emit_bidi_embed_levelrun(const char32_t *string, ** Make sure that if a different embedding level follows we will ** emit a marker, to ensure strong context. */ - t=classes[run->end-1]; + t=types[run->end-1]; if (next_level != run->level) { diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index a0d5ac4..48cc3c6 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -562,16 +562,38 @@ std::u32string unicode::toupper(const std::u32string &u) return copy; } + +unicode::bidi_calc_types::bidi_calc_types(const std::u32string &s) + : s{s} +{ + types.resize(s.size()); + if (!s.empty()) + unicode_bidi_calc_types(s.c_str(), s.size(), &types[0]); +} + +unicode::bidi_calc_types::~bidi_calc_types()=default; + +void unicode::bidi_calc_types::setbnl(std::u32string &s) +{ + if (s.empty() || s.size() != types.size()) + return; + + unicode_bidi_setbnl(&s[0], &types[0], s.size()); +} + std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> -unicode::bidi_calc(const std::u32string &s) +unicode::bidi_calc(const bidi_calc_types &s) { return unicode::bidi_calc(s, UNICODE_BIDI_SKIP); } std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> -unicode::bidi_calc(const std::u32string &s, +unicode::bidi_calc(const bidi_calc_types &st, unicode_bidi_level_t 
paragraph_embedding_level) { + if (st.s.size() != st.types.size()) + return { {}, UNICODE_BIDI_LR }; + const unicode_bidi_level_t *initial_embedding_level=0; if (paragraph_embedding_level == UNICODE_BIDI_LR || @@ -583,14 +605,17 @@ unicode::bidi_calc(const std::u32string &s, std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> ret; - std::get<0>(ret).resize(s.size()); + std::get<0>(ret).resize(st.s.size()); std::get<1>(ret)=UNICODE_BIDI_LR; - if (s.size()) + if (st.s.size()) { - std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(), - &std::get<0>(ret)[0], - initial_embedding_level); + std::get<1>(ret)= + unicode_bidi_calc_levels(st.s.c_str(), + &st.types[0], + st.s.size(), + &std::get<0>(ret)[0], + initial_embedding_level); } return ret; } |
