From 521f14cdadc891575b4599bd8ceb92a1dd41615a Mon Sep 17 00:00:00 2001 From: Sam Varshavchik Date: Mon, 30 Nov 2020 23:36:40 -0500 Subject: Break up bidi_calc into bidi_calc_types and bidi_calc_levels. --- unicode/Makefile.am | 5 +- unicode/book.xml | 188 ++++++++++++++++++++++++++++++++++++++++--- unicode/courier-unicode.h.in | 46 ++++++++++- unicode/unicode_bidi.c | 85 +++++++++---------- unicode/unicodecpp.C | 31 +++++-- 5 files changed, 292 insertions(+), 63 deletions(-) (limited to 'unicode') diff --git a/unicode/Makefile.am b/unicode/Makefile.am index dbc71aa..25b0719 100644 --- a/unicode/Makefile.am +++ b/unicode/Makefile.am @@ -91,6 +91,7 @@ man_MANS= \ $(srcdir)/man/courier-unicode.7 \ $(srcdir)/man/unicode\:\:bidi.3 \ $(srcdir)/man/unicode\:\:bidi_calc.3 \ + $(srcdir)/man/unicode\:\:bidi_calc_types.3 \ $(srcdir)/man/unicode\:\:bidi_cleanup.3 \ $(srcdir)/man/unicode\:\:bidi_embed.3 \ $(srcdir)/man/unicode\:\:bidi_embed_paragraph_level.3 \ @@ -115,6 +116,8 @@ man_MANS= \ $(srcdir)/man/unicode_bidi.3 \ $(srcdir)/man/unicode_bidi_bracket_type.3 \ $(srcdir)/man/unicode_bidi_calc.3 \ + $(srcdir)/man/unicode_bidi_calc_levels.3 \ + $(srcdir)/man/unicode_bidi_calc_types.3 \ $(srcdir)/man/unicode_bidi_cleanup.3 \ $(srcdir)/man/unicode_bidi_embed.3 \ $(srcdir)/man/unicode_bidi_embed_paragraph_level.3 \ @@ -432,7 +435,7 @@ docs.stamp: rm -rf man.tmp mkdir man.tmp d=`cd $(srcdir); pwd`; cd man.tmp; xsltproc --nonet --xinclude \ - http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl\ + http://cdn.docbook.org/release/xsl-nons/current//manpages/docbook.xsl\ $$d/book.xml mkdir -p man rm -f man/*.[123456789] diff --git a/unicode/book.xml b/unicode/book.xml index b0342ea..ad96d82 100644 --- a/unicode/book.xml +++ b/unicode/book.xml @@ -301,6 +301,8 @@ See COPYING for distribution information. 
unicode_bidi + unicode_bidi_calc_levels + unicode_bidi_calc_types unicode_bidi_calc unicode_bidi_reorder unicode_bidi_cleanup @@ -318,6 +320,23 @@ See COPYING for distribution information. #include <courier-unicode.h> unicode_bidi_level_t lr=UNICODE_BIDI_LR; + + + void unicode_bidi_calc_types + const char32_t *p + size_t n + unicode_bidi_type_t *types + + + + void unicode_bidi_calc_levels + const char32_t *p + const unicode_bidi_type_t *types + size_t n + unicode_bidi_level_t *levels + const unicode_bidi_level_t *initial_embedding_level + + void unicode_bidi_calc const char32_t *p @@ -414,6 +433,13 @@ See COPYING for distribution information. + + + Allocate an array of + unicode_bidi_type_t that's the + same size as the Unicode string. + + Allocate an array of @@ -421,15 +447,38 @@ See COPYING for distribution information. same size as the Unicode string. + - Use unicode_bidi_calc() to compute + Use unicode_bidi_calc_types() to compute + the Unicode string's characters' bi-directional types, + and populate the + unicode_bidi_type_t buffer. + + + + + + Use unicode_bidi_calc_levels() to compute the Unicode string's characters' bi-directional embedding level (executes the Bi-Directional algorithm up to and including step L1). This populates the unicode_bidi_level_t buffer. + + + + Alternatively: allocate only the + unicode_bidi_level_t array + and use unicode_bidi_calc(), which + malloc()s the + unicode_bidi_type_t buffer, + calls unicode_bidi_calc_levels(), + and then free()s the buffer. + + + Use unicode_bidi_reorder() to reverse @@ -451,7 +500,7 @@ See COPYING for distribution information. The parameters to - unicode_bidi_calc() are: + unicode_bidi_calc_types() are: @@ -465,6 +514,42 @@ See COPYING for distribution information. Number of characters in the Unicode string. + + + A pointer to an array of + unicode_bidi_type_t values. + The caller is + responsible for allocating and deallocating this array, + which has the same size as the Unicode string. 
+ + + + + + The parameters to + unicode_bidi_calc_levels() are: + + + + + + A pointer to the Unicode string. + + + + + + A pointer to the buffer that was passed to + unicode_bidi_calc_types(). + + + + + + Number of characters in the Unicode string and the + unicode_bidi_type_t buffer. + + A pointer to an array of @@ -488,7 +573,18 @@ See COPYING for distribution information. - unicode_bidi_calc() fills in the + The parameters to unicode_bidi_calc() are + the same except for the + unicode_bidi_type_t pointer. + unicode_bidi_calc() allocates this + buffer by itself and calls + unicode_bidi_calc_types, and + destroys the buffer before returning. + + + + unicode_bidi_calc() + and unicode_bidi_calc_levels() fill in the unicode_bidi_level_t array with the values corresponding to the embedding level of the corresponding character, @@ -500,7 +596,9 @@ See COPYING for distribution information. - unicode_bidi_calc() returns the resolved + unicode_bidi_calc() + and unicode_bidi_calc_levels() + return the resolved paragraph direction level, which always matches the passed in level, if specified, else it reports the @@ -510,7 +608,8 @@ See COPYING for distribution information. unicode_bidi_reorder() takes the actual unicode string together with the embedding values from - unicode_bidi_calc, then reverses the + unicode_bidi_calc or + unicode_bidi_calc_levels(), then reverses the bi-directional string, as specified by step L2 of the bi-directional algorithm. The parameters to @@ -698,7 +797,8 @@ See COPYING for distribution information. basic, but the resulting bi-directional string produces the same canonical rendering order after applying - unicode_bidi_calc(), + unicode_bidi_calc() or + unicode_bidi_calc_levels(), unicode_reorder() and unicode_bidi_cleanup() (with the canonical option), @@ -847,7 +947,9 @@ See COPYING for distribution information. default paragraph embedding level and returns 0 if it matches. 
Otherwise it returns a directional marker that should be prepended to the Unicode string to allow - unicode_bidi_calc's optional paragraph + unicode_bidi_calc's + (or unicode_bidi_calc_levels()) + optional paragraph embedding level pointer's value to be NULL, but derive the same default embedding level. The parameters to @@ -2661,6 +2763,7 @@ See COPYING for distribution information. unicode::bidi unicode::bidi_calc + unicode::bidi_calc_types unicode::bidi_reorder unicode::bidi_cleanup unicode::bidi_logical_order @@ -2670,16 +2773,31 @@ See COPYING for distribution information. + #include <courier-unicode.h> + + + struct unicode::bidi_calc_types + + + bidi_calc_types + const std::u32string &string + + + + std::vector<unicode_bidi_type_t> + types + + + - #include <courier-unicode.h> std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> unicode::bidi_calc - const std::u32string &string + const unicode::bidi_calc_types &ustring std::tuple<std::vector<unicode_bidi_level_t>, unicode_bidi_level_t> unicode::bidi_calc - const std::u32string &string + const unicode::bidi_calc_types &ustring unicode_bidi_level_t embedding_level @@ -2766,9 +2884,55 @@ See COPYING for distribution information. unicode::bidi_calc returns the - directional embedding value buffer and the paragraph - embedding level. + directional embedding value buffer and the calculated paragraph + embedding level. Its ustring + is implicitly converted from a + std::u32string: +
+ + + +
+ + + Alternatively a unicode::bidi_calc_types + object gets constructed from the same + std::u32string and then passed + directly to unicode::bidi_calc: + +
+ + + +
+ + This provides the means to access the intermediate + enum_bidi_type_t values that + get calculated from the Unicode text string. + + + + + In all cases the std::u32string + cannot be a temporary object, and it must remain in scope + until unicode::bidi_calc() returns. + +
diff --git a/unicode/courier-unicode.h.in b/unicode/courier-unicode.h.in index 3de76d3..f8ab117 100644 --- a/unicode/courier-unicode.h.in +++ b/unicode/courier-unicode.h.in @@ -626,6 +626,16 @@ typedef enum { extern enum_bidi_type_t unicode_bidi_type(char32_t c); +extern void unicode_bidi_calc_types(const char32_t *p, size_t n, + enum_bidi_type_t *buf); + +extern unicode_bidi_level_t unicode_bidi_calc_levels(const char32_t *p, + const enum_bidi_type_t + *types, + size_t n, + unicode_bidi_level_t *bufp, + const unicode_bidi_level_t + *initial_embedding_level); /* Bitmask options to unicode_bidi_cleanup */ /* @@ -2153,13 +2163,45 @@ std::u32string tolower(const std::u32string &u); std::u32string toupper(const std::u32string &u); +//! Calculate bidirectional character types + +//! Passed as a parameter to bidi_calc(), supplying the string and the +//! calculated bidirectional types. + +struct bidi_calc_types { + const std::u32string &s; + + //! Calculated bidirectional types. + + std::vector types; + + //! A reference to an existing std::u32string + + //! bidi_calc_types can be constructed only from a reference to + //! an existing std::u32string. + bidi_calc_types(const std::u32string &); + + //! Deleted constructor + + //! bidi_calc_types cannot be constructed from a temporary + //! std::u32string. + bidi_calc_types(std::u32string &&)=delete; + + //! Destructor + ~bidi_calc_types(); +}; + //! Calculate bidirectional embedding levels //! Returns the bidirectional embedding levels, and the paragraph //! embedding level. +//! +//! The first parameter can be implicitly converted from an existing +//! std::u32string object. Alternatively a bidi_calc_types helper +//! can be constructed explicitly, and then passed in directly. std::tuple, - unicode_bidi_level_t> bidi_calc(const std::u32string &s); + unicode_bidi_level_t> bidi_calc(const bidi_calc_types &s); //! Calculate bidirectional embedding levels @@ -2170,7 +2212,7 @@ std::tuple, //! embedding level. 
std::tuple, - unicode_bidi_level_t> bidi_calc(const std::u32string &s, + unicode_bidi_level_t> bidi_calc(const bidi_calc_types &s, unicode_bidi_level_t level); //! Reorder bidirectional text diff --git a/unicode/unicode_bidi.c b/unicode/unicode_bidi.c index cfae12f..cbb11dc 100644 --- a/unicode/unicode_bidi.c +++ b/unicode/unicode_bidi.c @@ -467,7 +467,7 @@ typedef struct { unicode_bidi_level_t paragraph_embedding_level; const char32_t *chars; enum_bidi_type_t *classes; - enum_bidi_type_t *orig_classes; + const enum_bidi_type_t *orig_classes; unicode_bidi_level_t *levels; size_t size; int overflow_isolate_count; @@ -624,7 +624,7 @@ compute_paragraph_embedding_level_from_types(const enum_bidi_type_t *p, static directional_status_stack_t directional_status_stack_init(const char32_t *chars, - enum_bidi_type_t *classes, size_t n, + const enum_bidi_type_t *classes, size_t n, unicode_bidi_level_t *levels, const unicode_bidi_level_t *initial_embedding_level) @@ -638,21 +638,21 @@ directional_status_stack_init(const char32_t *chars, ? 
*initial_embedding_level & 1 : compute_paragraph_embedding_level_from_types(classes, 0, n); stack->chars=chars; - stack->classes=classes; + stack->orig_classes=classes; if (n) { - classes=(enum_bidi_type_t *) + stack->classes=(enum_bidi_type_t *) malloc(sizeof(enum_bidi_type_t)*n); - if (!classes) + if (!stack->classes) abort(); - memcpy(classes, stack->classes, sizeof(enum_bidi_type_t)*n); + memcpy(stack->classes, stack->orig_classes, + sizeof(enum_bidi_type_t)*n); } else { - classes=0; + stack->classes=0; } - stack->orig_classes=classes; stack->levels=levels; stack->size=n; @@ -682,19 +682,12 @@ static void directional_status_stack_deinit(directional_status_stack_t stack) { while (stack->head) directional_status_stack_pop(stack); - if (stack->orig_classes) - free(stack->orig_classes); + if (stack->classes) + free(stack->classes); isolating_run_sequences_deinit(&stack->isolating_run_sequences); free(stack); } -static unicode_bidi_level_t -unicode_bidi_b(const char32_t *p, - size_t n, - enum_bidi_type_t *buf, - unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level); - enum_bidi_type_t unicode_bidi_type(char32_t c) { return (enum_bidi_type_t) @@ -707,35 +700,40 @@ enum_bidi_type_t unicode_bidi_type(char32_t c) UNICODE_BIDI_TYPE_L); } -unicode_bidi_level_t -unicode_bidi_calc(const char32_t *p, size_t n, unicode_bidi_level_t *bufp, - const unicode_bidi_level_t *initial_embedding_level) + +void unicode_bidi_calc_types(const char32_t *p, size_t n, + enum_bidi_type_t *buf) { /* ** Look up the bidi class for each char32_t. - ** - ** When we encounter a paragraph break we call unicode_bidi_b() to - ** process it. 
*/ - - enum_bidi_type_t *buf= - (enum_bidi_type_t *)malloc(n * sizeof(enum_bidi_type_t)); - - if (!buf) - abort(); for (size_t i=0; i, unicode_bidi_level_t> -unicode::bidi_calc(const std::u32string &s) +unicode::bidi_calc(const bidi_calc_types &s) { return unicode::bidi_calc(s, UNICODE_BIDI_SKIP); } std::tuple, unicode_bidi_level_t> -unicode::bidi_calc(const std::u32string &s, +unicode::bidi_calc(const bidi_calc_types &st, unicode_bidi_level_t paragraph_embedding_level) { + if (st.s.size() != st.types.size()) + return { {}, UNICODE_BIDI_LR }; + const unicode_bidi_level_t *initial_embedding_level=0; if (paragraph_embedding_level == UNICODE_BIDI_LR || @@ -583,14 +597,17 @@ unicode::bidi_calc(const std::u32string &s, std::tuple, unicode_bidi_level_t> ret; - std::get<0>(ret).resize(s.size()); + std::get<0>(ret).resize(st.s.size()); std::get<1>(ret)=UNICODE_BIDI_LR; - if (s.size()) + if (st.s.size()) { - std::get<1>(ret)=unicode_bidi_calc(s.c_str(), s.size(), - &std::get<0>(ret)[0], - initial_embedding_level); + std::get<1>(ret)= + unicode_bidi_calc_levels(st.s.c_str(), + &st.types[0], + st.s.size(), + &std::get<0>(ret)[0], + initial_embedding_level); } return ret; } -- cgit v1.2.3