diff options
Diffstat (limited to 'unicode/unicode_graphemebreak.c')
| -rw-r--r-- | unicode/unicode_graphemebreak.c | 76 |
1 files changed, 62 insertions, 14 deletions
diff --git a/unicode/unicode_graphemebreak.c b/unicode/unicode_graphemebreak.c index 926bfb1..db6b92b 100644 --- a/unicode/unicode_graphemebreak.c +++ b/unicode/unicode_graphemebreak.c @@ -1,5 +1,5 @@ /* -** Copyright 2011 Double Precision, Inc. +** Copyright 2011-2020 Double Precision, Inc. ** See COPYING for distribution information. ** */ @@ -8,6 +8,7 @@ #include "courier-unicode.h" #include <unistd.h> #include <stdint.h> +#include <string.h> #include <stdlib.h> #define UNICODE_GRAPHEMEBREAK_ANY 0x00 @@ -24,22 +25,63 @@ #define UNICODE_GRAPHEMEBREAK_LVT 0x0B #define UNICODE_GRAPHEMEBREAK_Regional_Indicator 0x0C +#define UNICODE_GRAPHEMEBREAK_ZWJ 0x0D + +#define UNICODE_GRAPHEMEBREAK_SOT 0xFF + #include "graphemebreaktab.h" +struct unicode_grapheme_break_info_s { + uint8_t prev_class; + unsigned prev_count; +}; + +unicode_grapheme_break_info_t unicode_grapheme_break_init() +{ + unicode_grapheme_break_info_t t=(unicode_grapheme_break_info_t) + calloc(1, sizeof(struct unicode_grapheme_break_info_s)); + + if (!t) + abort(); + + t->prev_class=UNICODE_GRAPHEMEBREAK_SOT; + + return t; +} + +void unicode_grapheme_break_deinit(unicode_grapheme_break_info_t t) +{ + free(t); +} + int unicode_grapheme_break(char32_t a, char32_t b) { - uint8_t ac=unicode_tab_lookup(a, unicode_indextab, - sizeof(unicode_indextab)/sizeof(unicode_indextab[0]), - unicode_rangetab, - unicode_classtab, - UNICODE_GRAPHEMEBREAK_ANY), - bc=unicode_tab_lookup(b, unicode_indextab, + struct unicode_grapheme_break_info_s s; + + memset((char *)&s, 0, sizeof(s)); + + (void)unicode_grapheme_break_next(&s, a); + + return unicode_grapheme_break_next(&s, b); +} + +int unicode_grapheme_break_next(unicode_grapheme_break_info_t t, char32_t b) +{ + uint8_t ac=t->prev_class; + uint8_t bc=unicode_tab_lookup(b, unicode_indextab, sizeof(unicode_indextab)/sizeof(unicode_indextab[0]), unicode_rangetab, unicode_classtab, UNICODE_GRAPHEMEBREAK_ANY); - /* GB1 and GB2 are implied */ + if (ac != bc) + t->prev_count=0; + ++t->prev_count; + + t->prev_class=bc; + + if (ac == UNICODE_GRAPHEMEBREAK_SOT) + return 1; /* GB1, GB2 is implied */ if (ac == UNICODE_GRAPHEMEBREAK_CR && bc == UNICODE_GRAPHEMEBREAK_LF) return 0; /* GB3 */ @@ -83,11 +125,8 @@ int unicode_grapheme_break(char32_t a, char32_t b) bc == UNICODE_GRAPHEMEBREAK_T) return 0; /* GB8 */ - if (ac == UNICODE_GRAPHEMEBREAK_Regional_Indicator && - bc == UNICODE_GRAPHEMEBREAK_Regional_Indicator) - return 0; /* GB8a */ - - if (bc == UNICODE_GRAPHEMEBREAK_Extend) + if (bc == UNICODE_GRAPHEMEBREAK_Extend || + bc == UNICODE_GRAPHEMEBREAK_ZWJ) return 0; /* GB9 */ if (bc == UNICODE_GRAPHEMEBREAK_SpacingMark) @@ -96,5 +135,14 @@ int unicode_grapheme_break(char32_t a, char32_t b) if (ac == UNICODE_GRAPHEMEBREAK_Prepend) return 0; /* GB9b */ - return 1; /* GB10 */ + if (ac == UNICODE_GRAPHEMEBREAK_Extend || + ac == UNICODE_GRAPHEMEBREAK_ZWJ) + return 0; /* GB11? */ + + if (ac == UNICODE_GRAPHEMEBREAK_Regional_Indicator && + bc == UNICODE_GRAPHEMEBREAK_Regional_Indicator && + (t->prev_count % 2) == 0) + return 0; /* GB12, GB13 */ + + return 1; /* GB999 */ } |
