/*
** Copyright 2011 Double Precision, Inc.
** See COPYING for distribution information.
**
*/

#include "unicode_config.h"
#include "unicode.h"

/*
** NOTE(review): the operands of these five system includes were missing in
** the copy of the file this was restored from. calloc()/free() need
** <stdlib.h> and uint8_t needs <stdint.h>; the remaining names should be
** confirmed against the upstream file.
*/
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>

#include "wordbreaktab_internal.h"
#include "wordbreaktab.h"

/*
** State of one word-break scan.
**
** Characters are fed in one at a time as word-break classes. For every
** character the callback is eventually invoked with a flag: in this file the
** flag is 1 where a rule allows a break before the character (WB1, WB3a,
** WB3b, WB14) and 0 where the rule forbids it. Some callbacks are deferred
** until a later character, or the end of the text, settles the decision.
*/
struct unicode_wb_info {
	/* Callback that receives each flag, and its opaque argument. */
	int (*cb_func)(int, void *);
	void *cb_arg;

	/* Class of the most recent character that was not skipped by WB4. */
	uint8_t prevclass;

	/*
	** Number of Extend/Format characters absorbed by WB4 whose callback
	** has not been made yet. wb4() reports each of them with flag 0.
	*/
	size_t wb4_cnt;

	/*
	** Number of Extend/Format characters seen while one of the lookahead
	** handlers (seen_wb67_handler, seen_wb1112_handler) is holding back
	** the previous character. Moved into wb4_cnt once that character has
	** been reported.
	*/
	size_t wb4_extra_cnt;

	/* Handler for the class of the next character. */
	int (*next_handler)(unicode_wb_info_t, uint8_t);

	/*
	** Handler invoked by unicode_wb_end(). When NULL, unicode_wb_end()
	** only flushes the pending WB4 characters.
	*/
	int (*end_handler)(unicode_wb_info_t);
};

static int sot(unicode_wb_info_t i, uint8_t cl);
static int wb4(unicode_wb_info_t i);

static int wb1and2_done(unicode_wb_info_t i, uint8_t cl);

static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
static int seen_wb67_end_handler(unicode_wb_info_t i);
static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);

static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
static int seen_wb1112_end_handler(unicode_wb_info_t i);
static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);

/*
** Allocate a new scan handle.
**
** cb_func/cb_arg: callback, and the argument passed back to it unchanged.
**
** Returns the new handle, or NULL if the allocation failed. The handle is
** released by unicode_wb_end().
*/
unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
				  void *cb_arg)
{
	unicode_wb_info_t i=calloc(1, sizeof(*i));

	if (!i)
		return NULL;

	/* The first character is handled by the start-of-text rule. */
	i->next_handler=sot;
	i->cb_func=cb_func;
	i->cb_arg=cb_arg;
	return i;
}

/*
** Finish the scan: make all callbacks that are still outstanding, then free
** the handle. The handle must not be used afterwards.
**
** Returns 0, or the first non-zero value returned by the callback.
*/
int unicode_wb_end(unicode_wb_info_t i)
{
	int rc;

	if (i->end_handler)
		rc=(*i->end_handler)(i);
	else
		rc=wb4(i);

	free(i);
	return rc;
}

/*
** Feed cnt characters, starting at chars, to unicode_wb_next().
**
** Stops at, and returns, the first non-zero value from unicode_wb_next().
** Returns 0 when every character was accepted.
*/
int unicode_wb_next_cnt(unicode_wb_info_t i,
			const unicode_char *chars,
			size_t cnt)
{
	int rc;

	while (cnt)
	{
		rc=unicode_wb_next(i, *chars++);
		--cnt;
		if (rc)
			return rc;
	}
	return 0;
}

/*
** Feed one character to the scan.
**
** The character is mapped to a word-break class by unicode_tab_lookup()
** (UNICODE_WB_OTHER is passed as its last argument, presumably the class
** for characters not listed in the tables -- confirm against
** unicode_tab_lookup), and the class is given to the current handler.
**
** Returns the handler's return value: 0, or a non-zero value that came from
** the callback.
*/
int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
{
	return (*i->next_handler)
		(i, unicode_tab_lookup(ch,
				       unicode_indextab,
				       sizeof(unicode_indextab)
				       / sizeof(unicode_indextab[0]),
				       unicode_rangetab,
				       unicode_classtab,
				       UNICODE_WB_OTHER));
}

/*
** Report every pending WB4 (Extend/Format) character with flag 0.
**
** The counter is always drained to zero. Once the callback has returned a
** non-zero value it is not called again for the remaining characters.
**
** Returns 0, or the first non-zero value returned by the callback.
*/
static int wb4(unicode_wb_info_t i)
{
	int rc=0;

	while (i->wb4_cnt > 0)
	{
		--i->wb4_cnt;

		if (rc == 0)
			rc=(*i->cb_func)(0, i->cb_arg);
	}
	return rc;
}

/*
** Flush the pending WB4 characters, then report flag for one more character.
**
** Returns 0, or the first non-zero value returned by the callback; the
** final callback is skipped when the flush already failed.
*/
static int result(unicode_wb_info_t i, int flag)
{
	int rc=wb4(i);

	if (rc == 0)
		rc=(*i->cb_func)(flag, i->cb_arg);

	return rc;
}

/* Installs both handlers. Relies on a variable named "i" being in scope. */
#define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end)

/*
** Handler for the very first character: always reports a break (WB1), then
** switches to the regular handler.
*/
static int sot(unicode_wb_info_t i, uint8_t cl)
{
	i->prevclass=cl;
	SET_HANDLER(wb1and2_done, NULL);

	return result(i, 1); /* WB1 */
}

/*
** Regular handler, used for every character after the first unless a
** lookahead handler is installed. Applies WB3 through WB5 directly, starts
** the WB6/WB7 lookahead, and hands everything else to wb67_done().
*/
static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
{
	uint8_t prevclass=i->prevclass;

	i->prevclass=cl;

	if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF)
		return result(i, 0); /* WB3 */

	switch (prevclass) {
	case UNICODE_WB_CR:
	case UNICODE_WB_LF:
	case UNICODE_WB_Newline:
		return result(i, 1); /* WB3a */
	default:
		break;
	}

	switch (cl) {
	case UNICODE_WB_CR:
	case UNICODE_WB_LF:
	case UNICODE_WB_Newline:
		return result(i, 1); /* WB3b */
	default:
		break;
	}

	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
	{
		/*
		** The character is skipped: the previous class stays in
		** effect, and the character's callback is deferred.
		*/
		i->prevclass=prevclass;
		++i->wb4_cnt;
		return 0; /* WB4 */
	}

	if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter)
	{
		return result(i, 0); /* WB5 */
	}

	if (prevclass == UNICODE_WB_ALetter &&
	    (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet))
	{
		/*
		** WB6/WB7 depend on the character that follows. Hold this
		** character back; i->prevclass already records its class.
		*/
		i->wb4_extra_cnt=0;
		SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
		return 0;
	}

	return wb67_done(i, prevclass, cl);
}

/*
**               ALetter  (MidLetter | MidNumLet )   ?
**
**                              prevclass            cl
**
** Seen ALetter (MidLetter | MidNumLet), with the second character's status
** not returned yet. Extend/Format characters that follow the second
** character are counted in wb4_extra_cnt until a deciding character arrives.
*/
static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
{
	int rc;
	uint8_t prevclass;
	size_t extra_cnt;

	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
	{
		++i->wb4_extra_cnt;
		return 0;
	}

	extra_cnt=i->wb4_extra_cnt;

	/*
	** Reset the handler to the default, then check WB6
	*/

	SET_HANDLER(wb1and2_done, NULL);

	if (cl == UNICODE_WB_ALetter)
	{
		/* Report the held second character. */
		rc=result(i, 0); /* WB6 */

		/* The characters skipped after it come next. */
		i->wb4_cnt=extra_cnt;

		if (rc == 0)
			rc=result(i, 0); /* WB7 */

		i->prevclass=cl;
		return rc;
	}

	prevclass=i->prevclass; /* This was the second character */

	/*
	** WB6/WB7 do not apply. Process the second character with the rules
	** that follow them.
	*/

	rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);

	i->prevclass=prevclass;
	i->wb4_cnt=extra_cnt;

	if (rc == 0)
		rc=(*i->next_handler)(i, cl);
	/* Process the current char now */

	return rc;
}

/*
** Seen ALetter (MidLetter | MidNumLet), with the second character's status
** not returned yet, and now the end of the text has been reached.
*/
static int seen_wb67_end_handler(unicode_wb_info_t i)
{
	int rc;
	size_t extra_cnt=i->wb4_extra_cnt;

	/*
	** Process the second character with the rules that follow WB7.
	*/

	rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);

	/* Report the characters that were skipped after it. */
	i->wb4_cnt=extra_cnt;
	if (rc == 0)
		rc=wb4(i);
	return rc;
}

/*
** Apply WB8 through WB10, start the WB11/WB12 lookahead, and hand everything
** else to wb1112_done().
**
** prevclass/cl: classes of the previous and the current character.
*/
static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
{
	if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)
		return result(i, 0); /* WB8 */

	if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
		return result(i, 0); /* WB9 */

	if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter)
		return result(i, 0); /* WB10 */

	if (prevclass == UNICODE_WB_Numeric &&
	    (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet))
	{
		/* WB11/WB12 depend on the character that follows. */
		i->wb4_extra_cnt=0;
		SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler);
		return 0;
	}

	return wb1112_done(i, prevclass, cl);
}

/*
**               Numeric  (MidNum | MidNumLet )   ?
**
**                              prevclass         cl
**
** Seen Numeric (MidNum | MidNumLet), with the second character's status
** not returned yet. Extend/Format characters that follow the second
** character are counted in wb4_extra_cnt until a deciding character arrives.
*/
static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
{
	int rc;
	uint8_t prevclass;
	size_t extra_cnt;

	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
	{
		++i->wb4_extra_cnt;
		return 0;
	}

	extra_cnt=i->wb4_extra_cnt;

	/*
	** Reset the handler to the default, then check WB11
	*/

	SET_HANDLER(wb1and2_done, NULL);

	if (cl == UNICODE_WB_Numeric)
	{
		/* Report the held second character. */
		rc=result(i, 0); /* WB11 */

		/* The characters skipped after it come next. */
		i->wb4_cnt=extra_cnt;

		if (rc == 0)
			rc=result(i, 0); /* WB12 */

		i->prevclass=cl;
		return rc;
	}

	prevclass=i->prevclass; /* This was the second character */

	/*
	** WB11/WB12 do not apply. Process the second character with the
	** rules that follow them.
	*/

	rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);

	i->prevclass=prevclass;
	i->wb4_cnt=extra_cnt;

	if (rc == 0)
		rc=(*i->next_handler)(i, cl);
	/* Process the current char now */

	return rc;
}

/*
** Seen Numeric (MidNum | MidNumLet), with the second character's status
** not returned yet, and now the end of the text has been reached.
*/
static int seen_wb1112_end_handler(unicode_wb_info_t i)
{
	int rc;
	size_t extra_cnt=i->wb4_extra_cnt;

	/*
	** Process the second character with the rules that follow WB12.
	*/

	rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);

	/* Report the characters that were skipped after it. */
	i->wb4_cnt=extra_cnt;
	if (rc == 0)
		rc=wb4(i);
	return rc;
}

/*
** Apply WB13, WB13a and WB13b; anything that none of them covers is a break
** (WB14).
**
** prevclass/cl: classes of the previous and the current character.
*/
static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
{
	if (prevclass == UNICODE_WB_Katakana &&
	    cl == UNICODE_WB_Katakana)
		return result(i, 0); /* WB13 */

	switch (prevclass) {
	case UNICODE_WB_ALetter:
	case UNICODE_WB_Numeric:
	case UNICODE_WB_Katakana:
	case UNICODE_WB_ExtendNumLet:
		if (cl == UNICODE_WB_ExtendNumLet)
			return result(i, 0); /* WB13a */
		break;
	default:
		break;
	}

	if (prevclass == UNICODE_WB_ExtendNumLet)
		switch (cl) {
		case UNICODE_WB_ALetter:
		case UNICODE_WB_Numeric:
		case UNICODE_WB_Katakana:
			return result(i, 0); /* WB13b */
		default:
			break;
		}

	return result(i, 1); /* WB14 */
}

/* --------------------------------------------------------------------- */

/*
** State of a scan that looks for the first word boundary after the start of
** the text.
*/
struct unicode_wbscan_info {
	/* Underlying word-break scan; reports to unicode_wbscan_callback. */
	unicode_wb_info_t wb_handle;

	/* Set to 1 once a break was reported for a character past the first. */
	int found;

	/* Number of characters reported before that break. */
	size_t cnt;
};

static int unicode_wbscan_callback(int, void *);

/*
** Allocate a new boundary scan.
**
** Returns the new handle, or NULL if an allocation failed. The handle is
** released by unicode_wbscan_end().
*/
unicode_wbscan_info_t unicode_wbscan_init(void)
{
	unicode_wbscan_info_t i=calloc(1, sizeof(*i));

	if (!i)
		return NULL;

	if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL)
	{
		free(i);
		return NULL;
	}

	return i;
}

/*
** Feed one character to the boundary scan. Characters are ignored once the
** boundary has been found.
**
** Returns non-zero once the boundary has been found, 0 otherwise.
*/
int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
{
	/*
	** The return value carries nothing here: unicode_wbscan_callback
	** always returns 0.
	*/
	if (!i->found)
		(void)unicode_wb_next(i->wb_handle, ch);

	return i->found;
}

/*
** Finish the boundary scan and free the handle, including the underlying
** word-break scan. The handle must not be used afterwards.
**
** Returns the number of characters counted before the boundary; when no
** boundary was found, this is the number of characters reported in total.
*/
size_t unicode_wbscan_end(unicode_wbscan_info_t i)
{
	size_t n;

	/* Makes the callbacks still outstanding, which may update cnt. */
	(void)unicode_wb_end(i->wb_handle);

	n=i->cnt;
	free(i);
	return n;
}

/*
** Callback for the underlying word-break scan: counts characters until a
** break is reported for a character other than the first one.
**
** Always returns 0, so the underlying scan never stops early.
*/
static int unicode_wbscan_callback(int flag, void *arg)
{
	unicode_wbscan_info_t i=arg;

	/* The break before the very first character (cnt == 0) is skipped. */
	if (flag && i->cnt > 0)
		i->found=1;

	if (!i->found)
		++i->cnt;
	return 0;
}