diff options
| author | Sam Varshavchik | 2020-11-30 23:36:40 -0500 |
|---|---|---|
| committer | Sam Varshavchik | 2020-11-30 23:44:59 -0500 |
| commit | 521f14cdadc891575b4599bd8ceb92a1dd41615a (patch) | |
| tree | a7a3178ee43de2babbb1de513d7f6810c02d08a6 /unicode | |
| parent | 844f6a9ef755c1c5826c9583b364af08b54a4dcc (diff) | |
| download | courier-libs-521f14cdadc891575b4599bd8ceb92a1dd41615a.tar.bz2 | |
Break up bidi_calc into bidi_calc_types and bidi_calc_levels.
Diffstat (limited to 'unicode')
| -rw-r--r-- | unicode/Makefile.am | 5 | ||||
| -rw-r--r-- | unicode/book.xml | 188 | ||||
| -rw-r--r-- | unicode/courier-unicode.h.in | 46 | ||||
| -rw-r--r-- | unicode/unicode_bidi.c | 85 | ||||
| -rw-r--r-- | unicode/unicodecpp.C | 31 |
5 files changed, 292 insertions, 63 deletions
diff --git a/unicode/Makefile.am b/unicode/Makefile.am index dbc71aa..25b0719 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -91,6 +91,7 @@ man_MANS= \ $(srcdir)/man/courier-unicode.7 \ $(srcdir)/man/unicode\:\:bidi.3 \ $(srcdir)/man/unicode\:\:bidi_calc.3 \ + $(srcdir)/man/unicode\:\:bidi_calc_types.3 \ $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_embed.3 \ $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ @@ -115,6 +116,8 @@ man_MANS= \ $(srcdir)/man/unicode_bidi.3 \ $(srcdir)/man/unicode_bidi_bracket_type.3 \ $(srcdir)/man/unicode_bidi_calc.3 \ + $(srcdir)/man/unicode_bidi_calc_levels.3 \ + $(srcdir)/man/unicode_bidi_calc_types.3 \ $(srcdir)/man/unicode_bidi_cleanup.3 \ $(srcdir)/man/unicode_bidi_embed.3 \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ @@ -432,7 +435,7 @@ docs.stamp: rm -rf man.tmp mkdir man.tmp d=`cd $(srcdir); pwd`; cd man.tmp; xsltproc --nonet --xinclude \ - http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl\ + http://cdn.docbook.org/release/xsl-nons/current//manpages/docbook.xsl\ $$d/book.xml mkdir -p man rm -f man/*.[123456789] diff --git a/unicode/book.xml b/unicode/book.xml index b0342ea..ad96d82 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -301,6 +301,8 @@ See COPYING for distribution information. <refnamediv> <refname>unicode_bidi</refname> + <refname>unicode_bidi_calc_levels</refname> + <refname>unicode_bidi_calc_types</refname> <refname>unicode_bidi_calc</refname> <refname>unicode_bidi_reorder</refname> <refname>unicode_bidi_cleanup</refname> @@ -318,6 +320,23 @@ See COPYING for distribution information. 
<refsynopsisdiv> <funcsynopsis> <funcsynopsisinfo>#include <courier-unicode.h> unicode_bidi_level_t lr=UNICODE_BIDI_LR;</funcsynopsisinfo> + + <funcprototype> + <funcdef>void <function>unicode_bidi_calc_types</function></funcdef> + <paramdef>const char32_t *<parameter>p</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_type_t *<parameter>types</parameter></paramdef> + </funcprototype> + + <funcprototype> + <funcdef>unicode_bidi_level_t <function>unicode_bidi_calc_levels</function></funcdef> + <paramdef>const char32_t *<parameter>p</parameter></paramdef> + <paramdef>const unicode_bidi_type_t *<parameter>types</parameter></paramdef> + <paramdef>size_t <parameter>n</parameter></paramdef> + <paramdef>unicode_bidi_level_t *<parameter>levels</parameter></paramdef> + <paramdef>const unicode_bidi_level_t *<parameter>initial_embedding_level</parameter></paramdef> + </funcprototype> + <funcprototype> <funcdef>void <function>unicode_bidi_calc</function></funcdef> <paramdef>const char32_t *<parameter>p</parameter></paramdef> @@ -417,19 +436,49 @@ See COPYING for distribution information. <listitem> <para> Allocate an array of + <structname>unicode_bidi_type_t</structname> that's the + same size as the Unicode string. + </para> + </listitem> + <listitem> + <para> + Allocate an array of <structname>unicode_bidi_level_t</structname> that's the same size as the Unicode string. </para> </listitem> + <listitem> <para> - Use <function>unicode_bidi_calc</function>() to compute + Use <function>unicode_bidi_calc_types</function>() to compute + the Unicode string's characters' bi-directional types, + and populate the + <structname>unicode_bidi_type_t</structname> buffer. + </para> + </listitem> + + <listitem> + <para> + Use <function>unicode_bidi_calc_levels</function>() to compute the Unicode string's characters' bi-directional embedding level (executes the Bi-Directional algorithm up to and including step L1). 
This populates the <structname>unicode_bidi_level_t</structname> buffer. </para> </listitem> + + <listitem> + <para> + Alternatively: allocate only the + <structname>unicode_bidi_level_t</structname> array + and use <function>unicode_bidi_calc</function>(), which + <function>malloc</function>()s the + <structname>unicode_bidi_type_t</structname> buffer, + calls <function>unicode_bidi_calc_levels</function>(), + and then <function>free</function>()s the buffer. + </para> + </listitem> + <listitem> <para> Use <function>unicode_bidi_reorder</function>() to reverse @@ -451,7 +500,7 @@ See COPYING for distribution information. <para> The parameters to - <function>unicode_bidi_calc</function>() are: + <function>unicode_bidi_calc_types</function>() are: </para> <itemizedlist> @@ -468,6 +517,42 @@ See COPYING for distribution information. <listitem> <para> A pointer to an array of + <structname>unicode_bidi_type_t</structname> values. + The caller is + responsible for allocating and deallocating this array, + which has the same size as the Unicode string. + </para> + </listitem> + </itemizedlist> + + <para> + The parameters to + <function>unicode_bidi_calc_levels</function>() are: + </para> + + <itemizedlist> + <listitem> + <para> + A pointer to the Unicode string. + </para> + </listitem> + + <listitem> + <para> + A pointer to the buffer that was passed to + <function>unicode_bidi_calc_types</function>(). + </para> + </listitem> + + <listitem> + <para> + Number of characters in the Unicode string and the + <structname>unicode_bidi_type_t</structname> buffer. + </para> + </listitem> + <listitem> + <para> + A pointer to an array of <structname>unicode_bidi_level_t</structname> values. The caller is responsible for allocating and deallocating this array, @@ -488,7 +573,18 @@ See COPYING for distribution information. 
</itemizedlist> <para> - <function>unicode_bidi_calc</function>() fills in the + The parameters to <function>unicode_bidi_calc</function>() are + the same except for the + <structname>unicode_bidi_type_t</structname> pointer. + <function>unicode_bidi_calc</function>() allocates this + buffer by itself and calls + <function>unicode_bidi_calc_types</function>, and + destroys the buffer before returning. + </para> + + <para> + <function>unicode_bidi_calc</function>() + and <function>unicode_bidi_calc_levels</function>() fill in the <structname>unicode_bidi_level_t</structname> array with the values corresponding to the embedding level of the corresponding character, @@ -500,7 +596,9 @@ See COPYING for distribution information. </para> <para> - <function>unicode_bidi_calc</function>() returns the resolved + <function>unicode_bidi_calc</function>() + and <function>unicode_bidi_calc_levels</function>() + return the resolved paragraph direction level, which always matches the passed in level, if specified, else it reports the @@ -510,7 +608,8 @@ See COPYING for distribution information. <para> <function>unicode_bidi_reorder</function>() takes the actual unicode string together with the embedding values from - <function>unicode_bidi_calc</function>, then reverses the + <function>unicode_bidi_calc</function> or + <function>unicode_bidi_calc_levels</function>(), then reverses the bi-directional string, as specified by step L2 of the bi-directional algorithm. The parameters to @@ -698,7 +797,8 @@ See COPYING for distribution information. basic, but the resulting bi-directional string produces the same canonical rendering order after applying - <function>unicode_bidi_calc()</function>, + <function>unicode_bidi_calc()</function> or + <function>unicode_bidi_calc_levels</function>(), <function>unicode_reorder()</function> and <function>unicode_bidi_cleanup()</function> (with the canonical option), @@ -847,7 +947,9 @@ See COPYING for distribution information. 
default paragraph embedding level and returns 0 if it matches. Otherwise it returns a directional marker that should be <emphasis>prepended</emphasis> to the Unicode string to allow - <function>unicode_bidi_calc</function>'s optional paragraph + <function>unicode_bidi_calc</function>'s + (or <function>unicode_bidi_calc_levels</function>()) + optional paragraph embedding level pointer's value to be <literal>NULL</literal>, but derive the same default embedding level. The parameters to @@ -2661,6 +2763,7 @@ See COPYING for distribution information. <refnamediv> <refname>unicode::bidi</refname> <refname>unicode::bidi_calc</refname> + <refname>unicode::bidi_calc_types</refname> <refname>unicode::bidi_reorder</refname> <refname>unicode::bidi_cleanup</refname> <refname>unicode::bidi_logical_order</refname> @@ -2670,16 +2773,31 @@ See COPYING for distribution information. </refnamediv> <refsynopsisdiv> + <synopsis>#include <courier-unicode.h></synopsis> + <classsynopsis class="class" language="C++"> + <ooclass> + <classname>struct unicode::bidi_calc_types</classname> + </ooclass> + <constructorsynopsis> + <methodname>bidi_calc_types</methodname> + <methodparam><modifier>const std::u32string &</modifier><parameter>string</parameter> + </methodparam> + </constructorsynopsis> + <fieldsynopsis> + <modifier>std::vector<unicode_bidi_type_t></modifier> + <varname>types</varname> + </fieldsynopsis> + </classsynopsis> + <funcsynopsis> - <funcsynopsisinfo>#include <courier-unicode.h></funcsynopsisinfo> <funcprototype> <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> - <paramdef>const std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const unicode::bidi_calc_types &<parameter>ustring</parameter></paramdef> </funcprototype> <funcprototype> <funcdef>std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> <function>unicode::bidi_calc</function></funcdef> - <paramdef>const 
std::u32string &<parameter>string</parameter></paramdef> + <paramdef>const unicode::bidi_calc_types &<parameter>ustring</parameter></paramdef> <paramdef>unicode_bidi_level_t <parameter>embedding_level</parameter></paramdef> </funcprototype> @@ -2766,9 +2884,55 @@ See COPYING for distribution information. <listitem> <para> <function>unicode::bidi_calc</function> returns the - directional embedding value buffer and the paragraph - embedding level. + directional embedding value buffer and the calculated paragraph + embedding level. Its <parameter>ustring</parameter> + is implicitly converted from a + <classname>std::u32string</classname>: </para> + <blockquote> + <informalexample> + <programlisting><![CDATA[ +std::u32string text; + +auto [levels, level]=unicode::bidi_calc(text); + +]]></programlisting> + </informalexample> + </blockquote> + + <para> + Alternatively a <classname>unicode::bidi_calc_types</classname> + object gets constructed from the same + <classname>std::u32string</classname> and then passed + directly to <function>unicode::bidi_calc</function>: + </para> + <blockquote> + <informalexample> + <programlisting><![CDATA[ +std::u32string text; + +unicode::bidi_calc_types types{text}; + +// types.types is a std::vector of enum_bidi_type_t values + +auto [levels, level]=unicode::bidi_calc(types); + +]]></programlisting> + </informalexample> + </blockquote> + <para> + This provides the means to access the intermediate + <classname>enum_bidi_type_t</classname> values that + get calculated from the Unicode text string. + </para> + + <note> + <para> + In all cases the <classname>std::u32string</classname> + cannot be a temporary object, and it must remain in scope + until <function>unicode::bidi_calc</function>() returns. 
+ </para> + </note> </listitem> <listitem> <para> diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index 3de76d3..f8ab117 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -626,6 +626,16 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); +extern void unicode_bidi_calc_types(const char32_t *p, size_t n, + enum_bidi_type_t *buf); + +extern unicode_bidi_level_t unicode_bidi_calc_levels(const char32_t *p, + const enum_bidi_type_t + *types, + size_t n, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t + *initial_embedding_level); /* Bitmask options to unicode_bidi_cleanup */ /* @@ -2153,13 +2163,45 @@ std::u32string tolower(const std::u32string &u); std::u32string toupper(const std::u32string &u); +//! Calculate bidirectional character types + +//! Passed as a parameter to bidi_calc(), supplying the string and the +//! calculated bidirectional types. + +struct bidi_calc_types { + const std::u32string &s; + + //! Calculated bidirectional types. + + std::vector<enum_bidi_type_t> types; + + //! A reference to an existing std::u32string + + //! bidi_calc_types can be constructed only from a reference to + //! an existing std::u32string. + bidi_calc_types(const std::u32string &); + + //! Deleted constructor + + //! bidi_calc_types cannot be constructed from a temporary + //! std::u32string. + bidi_calc_types(std::u32string &&)=delete; + + //! Destructor + ~bidi_calc_types(); +}; + //! Calculate bidirectional embedding levels //! Returns the bidirectional embedding levels, and the paragraph //! embedding level. +//! +//! The first parameter can be implicitly converted from an existing +//! std::u32string object. Alternatively a bidi_calc_types helper +//! can be constructed explicitly, and then passed in directly. std::tuple<std::vector<unicode_bidi_level_t>, - unicode_bidi_level_t> bidi_calc(const std::u32string &s); + unicode_bidi_level_t> bidi_calc(const bidi_calc_types &s); //! 
Calculate bidirectional embedding levels @@ -2170,7 +2212,7 @@ std::tuple<std::vector<unicode_bidi_level_t>, //! embedding level. std::tuple<std::vector<unicode_bidi_level_t>, - unicode_bidi_level_t> bidi_calc(const std::u32string &s, + unicode_bidi_level_t> bidi_calc(const bidi_calc_types &s, unicode_bidi_level_t level); //! Reorder bidirectional text diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index cfae12f..cbb11dc 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -467,7 +467,7 @@ typedef struct { unicode_bidi_level_t paragraph_embedding_level; const char32_t *chars; enum_bidi_type_t *classes; - enum_bidi_type_t *orig_classes; + const enum_bidi_type_t *orig_classes; unicode_bidi_level_t *levels; size_t size; int overflow_isolate_count; @@ -624,7 +624,7 @@ compute_paragraph_embedding_level_from_types(const enum_bidi_type_t *p, static directional_status_stack_t directional_status_stack_init(const char32_t *chars, - enum_bidi_type_t *classes, size_t n, + const enum_bidi_type_t *classes, size_t n, unicode_bidi_level_t *levels, const unicode_bidi_level_t *initial_embedding_level) @@ -638,21 +638,21 @@ directional_status_stack_init(const char32_t *chars, ? 
*initial_embedding_level & 1 : compute_paragraph_embedding_level_from_types(classes, 0, n); stack->chars=chars; - stack->classes=classes; + stack->orig_classes=classes; if (n) { - classes=(enum_bidi_type_t *) + stack->classes=(enum_bidi_type_t *) malloc(sizeof(enum_bidi_type_t)*n); - if (!classes) + if (!stack->classes) abort(); - memcpy(classes, stack->classes, sizeof(enum_bidi_type_t)*n); + memcpy(stack->classes, stack->orig_classes, + sizeof(enum_bidi_type_t)*n); } else { - classes=0; + stack->classes=0; } - stack->orig_classes=classes; stack->levels=levels; stack->size=n; @@ -682,19 +682,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack) { while (stack->head) directional_status_stack_pop(stack); - if (stack->orig_classes) - free(stack->orig_classes); + if (stack->classes) + free(stack->classes); isolating_run_sequences_deinit(&stack->isolating_run_sequences); free(stack); } -static unicode_bidi_level_t -unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level); - enum_bidi_type_t unicode_bidi_type(char32_t c) { return (enum_bidi_type_t) @@ -707,35 +700,40 @@ enum_bidi_type_t unicode_bidi_type(char32_t c) UNICODE_BIDI_TYPE_L); } -unicode_bidi_level_t -unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level) + +void unicode_bidi_calc_types(const char32_t *p, size_t n, + enum_bidi_type_t *buf) { /* ** Look up the bidi class for each char32_t. - ** - ** When we encounter a paragraph break we call unicode_bidi_b() to - ** process it. 
*/ - - enum_bidi_type_t *buf= - (enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); - - if (!buf) - abort(); for (size_t i=0; i<n; ++i) { buf[i]=unicode_bidi_type(p[i]); #ifdef UNICODE_BIDI_TEST UNICODE_BIDI_TEST(i); #endif - bufp[i]=UNICODE_BIDI_SKIP; } +} - unicode_bidi_level_t level=unicode_bidi_b(p, n, - buf, - bufp, - initial_embedding_level); +unicode_bidi_level_t +unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level) +{ + enum_bidi_type_t *buf= + (enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); + + if (!buf) + abort(); + + unicode_bidi_calc_types(p, n, buf); + + unicode_bidi_level_t level= + unicode_bidi_calc_levels(p, + buf, + n, + bufp, + initial_embedding_level); free(buf); @@ -744,16 +742,21 @@ unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, static void unicode_bidi_cl(directional_status_stack_t stack); -static unicode_bidi_level_t -unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level) +unicode_bidi_level_t +unicode_bidi_calc_levels(const char32_t *p, + const enum_bidi_type_t *classes, + size_t n, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t *initial_embedding_level) { directional_status_stack_t stack; - stack=directional_status_stack_init(p, buf, n, bufp, + for (size_t i=0; i<n; ++i) + { + bufp[i]=UNICODE_BIDI_SKIP; + } + + stack=directional_status_stack_init(p, classes, n, bufp, initial_embedding_level); unicode_bidi_level_t paragraph_embedding_level= diff --git a/unicode/unicodecpp.C b/unicode/unicodecpp.C index a0d5ac4..4b864b3 100644 --- a/unicode/unicodecpp.C +++ b/unicode/unicodecpp.C @@ -562,16 +562,30 @@ std::u32string unicode::toupper(const std::u32string &u) return copy; } + +unicode::bidi_calc_types::bidi_calc_types(const std::u32string &s) + : s{s} +{ + types.resize(s.size()); + if (!s.empty()) + 
unicode_bidi_calc_types(s.c_str(), s.size(), &types[0]); +} + +unicode::bidi_calc_types::~bidi_calc_types()=default; + std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> -unicode::bidi_calc(const std::u32string &s) +unicode::bidi_calc(const bidi_calc_types &s) { return unicode::bidi_calc(s, UNICODE_BIDI_SKIP); } std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> -unicode::bidi_calc(const std::u32string &s, +unicode::bidi_calc(const bidi_calc_types &st, unicode_bidi_level_t paragraph_embedding_level) { + if (st.s.size() != st.types.size()) + return { {}, UNICODE_BIDI_LR }; + const unicode_bidi_level_t *initial_embedding_level=0; if (paragraph_embedding_level == UNICODE_BIDI_LR || @@ -583,14 +597,17 @@ unicode::bidi_calc(const std::u32string &s, std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> ret; - std::get<0>(ret).resize(s.size()); + std::get<0>(ret).resize(st.s.size()); std::get<1>(ret)=UNICODE_BIDI_LR; - if (s.size()) + if (st.s.size()) { - std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(), - &std::get<0>(ret)[0], - initial_embedding_level); + std::get<1>(ret)= + unicode_bidi_calc_levels(st.s.c_str(), + &st.types[0], + st.s.size(), + &std::get<0>(ret)[0], + initial_embedding_level); } return ret; } |
