diff options
Diffstat (limited to 'unicode/unicode_wordbreak.c')
| -rw-r--r-- | unicode/unicode_wordbreak.c | 529 |
1 files changed, 322 insertions, 207 deletions
diff --git a/unicode/unicode_wordbreak.c b/unicode/unicode_wordbreak.c index 991f0ba..669a382 100644 --- a/unicode/unicode_wordbreak.c +++ b/unicode/unicode_wordbreak.c @@ -16,35 +16,76 @@ #include "wordbreaktab_internal.h" #include "wordbreaktab.h" +/* +** We need to keep track of the original character, in addition +** to the wordbreaking class, to check WB3. +*/ + +typedef struct { + uint8_t cl; + char32_t ch; +} wb_info_t; + +/* +** Internal object. +*/ struct unicode_wb_info { int (*cb_func)(int, void *); void *cb_arg; - uint8_t prevclass; - uint8_t wb7_first_char; + /* Previous character seen. */ + wb_info_t prevclass; + + /* + ** For some rules we peek an extra character, and so need to + ** stash away the 2nd previous character seen, when we're looking at + ** it. + */ + wb_info_t prev2class; + + /* + ** How many (Extend | Format | ZWJ) were processed, so far, + ** for WB4's sake. + */ size_t wb4_cnt; - size_t wb4_extra_cnt; + /* + ** Most recently processed WB4 character. + */ + wb_info_t wb4_last; + + /* + ** Each character received by unicode_wb_next is forwarded to + ** this handler. + */ + int (*next_handler)(unicode_wb_info_t, wb_info_t); - int (*next_handler)(unicode_wb_info_t, uint8_t); + /* + ** unicode_wb_end() calls this. If we were in a middle of a + ** multi-char rule, this wraps things up. + */ int (*end_handler)(unicode_wb_info_t); }; -static int sot(unicode_wb_info_t i, uint8_t cl); -static int wb4(unicode_wb_info_t i); -static int wb1and2_done(unicode_wb_info_t i, uint8_t cl); +/* Forward declarations */ + +static int sot(unicode_wb_info_t i, wb_info_t cl); +static int wb1and2_done(unicode_wb_info_t i, wb_info_t cl); -static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl); +static int seen_wb67_handler(unicode_wb_info_t i, wb_info_t cl); static int seen_wb67_end_handler(unicode_wb_info_t i); -static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); +static int wb67_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl); -static int seen_wb7bc_handler(unicode_wb_info_t i, uint8_t cl); +static int seen_wb7bc_handler(unicode_wb_info_t i, wb_info_t cl); static int seen_wb7bc_end_handler(unicode_wb_info_t i); -static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); +static int wb7bc_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl); -static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl); +static int seen_wb1112_handler(unicode_wb_info_t i, wb_info_t cl); static int seen_wb1112_end_handler(unicode_wb_info_t i); -static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); +static int wb1112_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl); + +static int seen_wb1516_handler(unicode_wb_info_t i, wb_info_t cl); +static int wb1516_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl); unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *), void *cb_arg) @@ -57,17 +98,16 @@ unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *), i->next_handler=sot; i->cb_func=cb_func; i->cb_arg=cb_arg; + i->wb4_cnt=0; return i; } int unicode_wb_end(unicode_wb_info_t i) { - int rc; + int rc=0; if (i->end_handler) rc=(*i->end_handler)(i); - else - rc=wb4(i); free(i); return rc; @@ -91,16 +131,42 @@ int unicode_wb_next_cnt(unicode_wb_info_t i, int unicode_wb_next(unicode_wb_info_t i, char32_t ch) { - return (*i->next_handler) - (i, unicode_tab_lookup(ch, - unicode_indextab, - sizeof(unicode_indextab) - / sizeof(unicode_indextab[0]), - unicode_rangetab, - unicode_classtab, - UNICODE_WB_OTHER)); + wb_info_t info; + + info.ch=ch; + info.cl=unicode_tab_lookup(ch, + unicode_indextab, + sizeof(unicode_indextab) + / sizeof(unicode_indextab[0]), + unicode_rangetab, + unicode_classtab, + UNICODE_WB_OTHER); + + return (*i->next_handler)(i, info); } +#if 0 + +static int result(unicode_wb_info_t i, int flag) +{ + return (*i->cb_func)(flag, i->cb_arg); +} +#else +#define result(i,flag) ( (*(i)->cb_func)( (flag), (i)->cb_arg)) +#endif + +/* +** Check for WB3C +*/ + +#define WB3C_APPLIES(prevclass,uclass) \ + ((prevclass).cl == UNICODE_WB_ZWJ && \ + unicode_emoji_extended_pictographic((uclass).ch)) + +/* +** Finished WB4 processing. Emit the equivalent number of non-break +** indications. +*/ static int wb4(unicode_wb_info_t i) { int rc=0; @@ -110,24 +176,14 @@ static int wb4(unicode_wb_info_t i) --i->wb4_cnt; if (rc == 0) - rc=(*i->cb_func)(0, i->cb_arg); + rc=result(i, 0); } return rc; } -static int result(unicode_wb_info_t i, int flag) -{ - int rc=wb4(i); - - if (rc == 0) - rc=(*i->cb_func)(flag, i->cb_arg); - - return rc; -} - #define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end) -static int sot(unicode_wb_info_t i, uint8_t cl) +static int sot(unicode_wb_info_t i, wb_info_t cl) { i->prevclass=cl; SET_HANDLER(wb1and2_done, NULL); @@ -135,51 +191,147 @@ static int sot(unicode_wb_info_t i, uint8_t cl) return result(i, 1); /* WB1 */ } -static int wb1and2_done(unicode_wb_info_t i, uint8_t cl) +static int wb4_handled(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl); + +static int wb1and2_done(unicode_wb_info_t i, wb_info_t cl) { - uint8_t prevclass=i->prevclass; + wb_info_t prevclass=i->prevclass; i->prevclass=cl; - if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF) + if (prevclass.cl == UNICODE_WB_CR && cl.cl == UNICODE_WB_LF) return result(i, 0); /* WB3 */ - switch (prevclass) { + switch (prevclass.cl) { case UNICODE_WB_CR: case UNICODE_WB_LF: case UNICODE_WB_Newline: return result(i, 1); /* WB3a */ } - switch (cl) { + switch (cl.cl) { case UNICODE_WB_CR: case UNICODE_WB_LF: case UNICODE_WB_Newline: return result(i, 1); /* WB3b */ } - if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) + if (WB3C_APPLIES(prevclass, cl)) + return result(i, 0); /* WB3c */ + + if (prevclass.cl == UNICODE_WB_WSegSpace && + cl.cl == UNICODE_WB_WSegSpace) + return result(i, 0); /* WB3d */ + + return wb4_handled(i, prevclass, cl); +} + +/* +** Macros, as defined in the TR +*/ +#define AHLetter(c) (c.cl == UNICODE_WB_ALetter || \ + c.cl == UNICODE_WB_Hebrew_Letter) +#define MidNumLetQ(c) (c.cl == UNICODE_WB_MidNumLet || \ + c.cl == UNICODE_WB_Single_Quote) + +/* +** Whether the character is applicable to the WB4 rule. +*/ + +#define WB4(C) ((C).cl == UNICODE_WB_Extend || (C).cl == UNICODE_WB_Format ||\ + (C).cl == UNICODE_WB_ZWJ) + +/* +** Check if the current character invokes the WB4 rule, if so return s0, +** doing nothing, here, after performing some record keeping. +*/ + +#define WB4_APPLY(i,cl) \ + do { \ + if (WB4(cl)) \ + { \ + ++(i)->wb4_cnt; \ + (i)->wb4_last=(cl); \ + return 0; \ + } \ + } while (0) + +/* +** After processing WB4, check if the last WB4-processed character +** will invoke WB3C for the next character. +** +** This is invoked after WB4_APPLY. The return value must be stored in an +** int. +** +** This must be followed by WB4_END. Then, after WB4_END, if this returned +** non 0, WB3C applies, returning a non-break indication. +*/ + +#define WB3C_APPLIES_AFTER_WB4(i,cl) \ + ( (i)->wb4_cnt > 0 && \ + WB3C_APPLIES( (i)->wb4_last, (cl))) + +/* +** Wrapper for invoking wb4() after detecting that it no longer applies. This +** gets invoked: +** +** - After WB4_APPLY +** +** - After WB3C_APPLIES_AFTER_WB4 +*/ + +#define WB4_END(i) \ + do { \ + \ + int rc=wb4(i); \ + \ + if (rc) \ + return rc; \ + } while (0) + + +static int resume_wb4(unicode_wb_info_t i, wb_info_t cl) +{ + if (!WB4(cl)) + { + SET_HANDLER(wb1and2_done, NULL); + + if (WB3C_APPLIES(i->wb4_last, cl)) + { + i->prevclass=cl; + return result(i, 0); + } + + wb_info_t prevclass=i->prevclass; + + i->prevclass=cl; + + return wb4_handled(i, prevclass, cl); + } + i->wb4_last=cl; + return result(i, 0); +} + + +static int wb4_handled(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl) +{ + if (WB4(cl)) { i->prevclass=prevclass; - ++i->wb4_cnt; - return 0; /* WB4 */ + i->wb4_last=cl; + SET_HANDLER(resume_wb4, 0); + return result(i, 0); /* WB4 */ } - if ((prevclass == UNICODE_WB_ALetter || - prevclass == UNICODE_WB_Hebrew_Letter) && - (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter)) + if (AHLetter(prevclass) && AHLetter(cl)) { return result(i, 0); /* WB5 */ } - if ((prevclass == UNICODE_WB_ALetter || - prevclass == UNICODE_WB_Hebrew_Letter) - && - (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet || - cl == UNICODE_WB_Single_Quote)) + if (AHLetter(prevclass) && + (MidNumLetQ(cl) || cl.cl == UNICODE_WB_MidLetter)) { - i->wb4_extra_cnt=0; - i->wb7_first_char=prevclass; + i->prev2class=prevclass; SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler); return 0; } @@ -188,94 +340,69 @@ static int wb1and2_done(unicode_wb_info_t i, uint8_t cl) } /* -** (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_quote) ? -** -** prevclass cl -** -** Seen (ALetter | Hebrew_Letter)(MidLetter | MidNumLet), with the second -** character's status not returned yet. +** AHLetter (MidLetter | MidNumLetQ) seen, is this followed by AHLetter? */ -static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl) +static int seen_wb67_handler(unicode_wb_info_t i, wb_info_t cl) { int rc; - uint8_t prevclass; - size_t extra_cnt; - - if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) - { - ++i->wb4_extra_cnt; - return 0; - } - extra_cnt=i->wb4_extra_cnt; - - /* - ** Reset the handler to the default, then check WB6 - */ + WB4_APPLY(i, cl); SET_HANDLER(wb1and2_done, NULL); - if (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter) + if (AHLetter(cl)) { - rc=result(i, 0); /* WB6 */ - i->wb4_cnt=extra_cnt; + i->prevclass=cl; + + rc=result(i, 0); /* WB6 */ + WB4_END(i); if (rc == 0) rc=result(i, 0); /* WB7 */ - i->prevclass=cl; - return rc; } - prevclass=i->prevclass; /* This was the second character */ + int wb3c_applies=WB3C_APPLIES_AFTER_WB4(i, cl); - /* - ** Process the second character, starting with WB7 - */ - - rc=wb67_done(i, i->wb7_first_char, prevclass); + rc=seen_wb67_end_handler(i); - i->prevclass=prevclass; - i->wb4_cnt=extra_cnt; + if (wb3c_applies) + return result(i, 0); if (rc == 0) rc=(*i->next_handler)(i, cl); - /* Process the current char now */ return rc; } /* -** Seen (ALetter | Hebrew_Letter)(MidLetter | MidNumLet), with the second -** character's status not returned yet, and now sot. +** AHLetter (MidLetter | MidNumLetQ) seen, with the second +** character's status not returned yet, and now either sot, or something +** else. */ static int seen_wb67_end_handler(unicode_wb_info_t i) { - int rc; - size_t extra_cnt=i->wb4_extra_cnt; + int rc=wb67_done(i, i->prev2class, i->prevclass); - /* - ** Process the second character, starting with WB7. - */ - - rc=wb67_done(i, i->wb7_first_char, i->prevclass); - i->wb4_cnt=extra_cnt; - if (rc == 0) - rc=wb4(i); - return rc; + if (rc) + return rc; + WB4_END(i); + return 0; } -static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) +static int wb67_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl) { - if (prevclass == UNICODE_WB_Hebrew_Letter && cl == UNICODE_WB_Single_Quote) + if (prevclass.cl == UNICODE_WB_Hebrew_Letter && + cl.cl == UNICODE_WB_Single_Quote) return result(i, 0); /* WB7a */ - if (prevclass == UNICODE_WB_Hebrew_Letter && cl == UNICODE_WB_Double_Quote) + if (prevclass.cl == UNICODE_WB_Hebrew_Letter && + cl.cl == UNICODE_WB_Double_Quote) { - i->wb4_extra_cnt=0; + i->prev2class=prevclass; SET_HANDLER(seen_wb7bc_handler, seen_wb7bc_end_handler); return 0; } @@ -292,97 +419,72 @@ static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) ** not returned yet. */ -static int seen_wb7bc_handler(unicode_wb_info_t i, uint8_t cl) +static int seen_wb7bc_handler(unicode_wb_info_t i, wb_info_t cl) { int rc; - uint8_t prevclass; - size_t extra_cnt; - if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) - { - ++i->wb4_extra_cnt; - return 0; - } - - extra_cnt=i->wb4_extra_cnt; - - /* - ** Reset the handler to the default, then check WB7a and WB7b - */ + WB4_APPLY(i, cl); SET_HANDLER(wb1and2_done, NULL); - if (cl == UNICODE_WB_Hebrew_Letter) + if (cl.cl == UNICODE_WB_Hebrew_Letter) { - rc=result(i, 0); /* WB7b */ - i->wb4_cnt=extra_cnt; + i->prevclass=cl; + + rc=result(i, 0); /* WB7b */ + + WB4_END(i); if (rc == 0) - rc=result(i, 0); /* WB7bc */ + rc=result(i, 0); /* WB7c */ - i->prevclass=cl; - return rc; } - prevclass=i->prevclass; /* This was the second character */ + int wb3c_applies=WB3C_APPLIES_AFTER_WB4(i, cl); - /* - ** Process the second character, starting with WB8 - */ - - rc=wb7bc_done(i, UNICODE_WB_Hebrew_Letter, prevclass); - - i->prevclass=prevclass; - i->wb4_cnt=extra_cnt; + rc=seen_wb7bc_end_handler(i); + if (wb3c_applies) + return result(i, 0); if (rc == 0) rc=(*i->next_handler)(i, cl); - /* Process the current char now */ return rc; } /* ** Seen Hebrew_Letter Double_Quote, with the second -** character's status not returned yet, and now sot. +** character's status not returned yet, and now sot or something else. */ static int seen_wb7bc_end_handler(unicode_wb_info_t i) { - int rc; - size_t extra_cnt=i->wb4_extra_cnt; + int rc=wb7bc_done(i, i->prev2class, i->prevclass); - /* - ** Process the second character, starting with WB8. - */ + if (rc) + return rc; - rc=wb7bc_done(i, UNICODE_WB_Hebrew_Letter, i->prevclass); - i->wb4_cnt=extra_cnt; - if (rc == 0) - rc=wb4(i); - return rc; + WB4_END(i); + + return 0; } -static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) +static int wb7bc_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl) { - if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric) + if (prevclass.cl == UNICODE_WB_Numeric && cl.cl == UNICODE_WB_Numeric) return result(i, 0); /* WB8 */ - if ((prevclass == UNICODE_WB_ALetter || - prevclass == UNICODE_WB_Hebrew_Letter) && cl == UNICODE_WB_Numeric) + if (AHLetter(prevclass) && cl.cl == UNICODE_WB_Numeric) return result(i, 0); /* WB9 */ - if (prevclass == UNICODE_WB_Numeric && - (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter)) + if (prevclass.cl == UNICODE_WB_Numeric && AHLetter(cl)) return result(i, 0); /* WB10 */ - - if (prevclass == UNICODE_WB_Numeric && - (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet || - cl == UNICODE_WB_Single_Quote)) + if (prevclass.cl == UNICODE_WB_Numeric && + (cl.cl == UNICODE_WB_MidNum || MidNumLetQ(cl))) { - i->wb4_extra_cnt=0; + i->prev2class=prevclass; SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler); return 0; } @@ -399,53 +501,37 @@ static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) ** not returned yet. */ -static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl) +static int seen_wb1112_handler(unicode_wb_info_t i, wb_info_t cl) { int rc; - uint8_t prevclass; - size_t extra_cnt; - if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) - { - ++i->wb4_extra_cnt; - return 0; - } - - extra_cnt=i->wb4_extra_cnt; - - /* - ** Reset the handler to the default, then check WB6 - */ + WB4_APPLY(i, cl); SET_HANDLER(wb1and2_done, NULL); - if (cl == UNICODE_WB_Numeric) + if (cl.cl == UNICODE_WB_Numeric) { - rc=result(i, 0); /* WB11 */ - i->wb4_cnt=extra_cnt; + i->prevclass=cl; + + rc=result(i, 0); /* WB12 */ + + WB4_END(i); if (rc == 0) - rc=result(i, 0); /* WB12 */ + rc=result(i, 0); /* WB11 */ - i->prevclass=cl; - return rc; } - prevclass=i->prevclass; /* This was the second character */ - - /* - ** Process the second character, starting with WB7 - */ + int wb3c_applies=WB3C_APPLIES_AFTER_WB4(i, cl); - rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass); + rc=seen_wb1112_end_handler(i); - i->prevclass=prevclass; - i->wb4_cnt=extra_cnt; + if (wb3c_applies) + return result(i, 0); if (rc == 0) rc=(*i->next_handler)(i, cl); - /* Process the current char now */ return rc; } @@ -457,38 +543,34 @@ static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl) static int seen_wb1112_end_handler(unicode_wb_info_t i) { - int rc; - size_t extra_cnt=i->wb4_extra_cnt; + int rc=wb1112_done(i, i->prev2class, i->prevclass); - /* - ** Process the second character, starting with WB11. - */ + if (rc) + return rc; - rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass); - i->wb4_cnt=extra_cnt; - if (rc == 0) - rc=wb4(i); - return rc; + WB4_END(i); + + return 0; } -static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) +static int wb1112_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl) { - if (prevclass == UNICODE_WB_Katakana && - cl == UNICODE_WB_Katakana) + if (prevclass.cl == UNICODE_WB_Katakana && + cl.cl == UNICODE_WB_Katakana) return result(i, 0); /* WB13 */ - switch (prevclass) { + switch (prevclass.cl) { case UNICODE_WB_ALetter: case UNICODE_WB_Hebrew_Letter: case UNICODE_WB_Numeric: case UNICODE_WB_Katakana: case UNICODE_WB_ExtendNumLet: - if (cl == UNICODE_WB_ExtendNumLet) + if (cl.cl == UNICODE_WB_ExtendNumLet) return result(i, 0); /* WB13a */ } - if (prevclass == UNICODE_WB_ExtendNumLet) - switch (cl) { + if (prevclass.cl == UNICODE_WB_ExtendNumLet) + switch (cl.cl) { case UNICODE_WB_ALetter: case UNICODE_WB_Hebrew_Letter: case UNICODE_WB_Numeric: @@ -496,10 +578,44 @@ static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) return result(i, 0); /* WB13b */ } - if (prevclass == UNICODE_WB_Regional_Indicator && - cl == UNICODE_WB_Regional_Indicator) + if (prevclass.cl == UNICODE_WB_Regional_Indicator && + cl.cl == UNICODE_WB_Regional_Indicator) + { + SET_HANDLER(seen_wb1516_handler, 0); + return result(i, 0); + } + + return wb1516_done(i, prevclass, cl); +} + +static int seen_wb1516_handler(unicode_wb_info_t i, wb_info_t cl) +{ + WB4_APPLY(i, cl); + + SET_HANDLER(wb1and2_done, NULL); + + int wb3c_applies=WB3C_APPLIES_AFTER_WB4(i, cl); + + WB4_END(i); + + if (wb3c_applies) return result(i, 0); - return result(i, 1); /* WB14 */ + + if (cl.cl == UNICODE_WB_Regional_Indicator) + { + wb_info_t prevclass=i->prevclass; + + i->prevclass=cl; + + return wb1516_done(i, prevclass, cl); + } + + return (*i->next_handler)(i, cl); +} + +static int wb1516_done(unicode_wb_info_t i, wb_info_t prevclass, wb_info_t cl) +{ + return result(i, 1); /* WB999 */ } /* --------------------------------------------------------------------- */ @@ -559,4 +675,3 @@ static int unicode_wbscan_callback(int flag, void *arg) ++i->cnt; return 0; } - |
