diff options
Diffstat (limited to 'unicode/unicode_wordbreak.c')
| -rw-r--r-- | unicode/unicode_wordbreak.c | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/unicode/unicode_wordbreak.c b/unicode/unicode_wordbreak.c new file mode 100644 index 0000000..dee4b52 --- /dev/null +++ b/unicode/unicode_wordbreak.c @@ -0,0 +1,448 @@ +/* +** Copyright 2011 Double Precision, Inc. +** See COPYING for distribution information. +** +*/ + +#include "unicode_config.h" +#include "unicode.h" + +#include <unistd.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "wordbreaktab_internal.h" +#include "wordbreaktab.h" + +struct unicode_wb_info { + int (*cb_func)(int, void *); + void *cb_arg; + + uint8_t prevclass; + size_t wb4_cnt; + + size_t wb4_extra_cnt; + + int (*next_handler)(unicode_wb_info_t, uint8_t); + int (*end_handler)(unicode_wb_info_t); +}; + +static int sot(unicode_wb_info_t i, uint8_t cl); +static int wb4(unicode_wb_info_t i); +static int wb1and2_done(unicode_wb_info_t i, uint8_t cl); + +static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl); +static int seen_wb67_end_handler(unicode_wb_info_t i); +static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); + +static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl); +static int seen_wb1112_end_handler(unicode_wb_info_t i); +static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); + +unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *), + void *cb_arg) +{ + unicode_wb_info_t i=calloc(1, sizeof(struct unicode_wb_info)); + + if (!i) + return NULL; + + i->next_handler=sot; + i->cb_func=cb_func; + i->cb_arg=cb_arg; + return i; +} + +int unicode_wb_end(unicode_wb_info_t i) +{ + int rc; + + if (i->end_handler) + rc=(*i->end_handler)(i); + else + rc=wb4(i); + + free(i); + return rc; +} + +int unicode_wb_next_cnt(unicode_wb_info_t i, + const unicode_char *chars, + size_t cnt) +{ + int rc; + + while (cnt) + { + rc=unicode_wb_next(i, *chars++); + --cnt; + if (rc) + return rc; + } + return 0; +} + +int unicode_wb_next(unicode_wb_info_t i, unicode_char ch) +{ + return (*i->next_handler) + (i, unicode_tab_lookup(ch, + unicode_indextab, + sizeof(unicode_indextab) + / sizeof(unicode_indextab[0]), + unicode_rangetab, + unicode_classtab, + UNICODE_WB_OTHER)); +} + +static int wb4(unicode_wb_info_t i) +{ + int rc=0; + + while (i->wb4_cnt > 0) + { + --i->wb4_cnt; + + if (rc == 0) + rc=(*i->cb_func)(0, i->cb_arg); + } + return rc; +} + +static int result(unicode_wb_info_t i, int flag) +{ + int rc=wb4(i); + + if (rc == 0) + rc=(*i->cb_func)(flag, i->cb_arg); + + return rc; +} + +#define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end) + +static int sot(unicode_wb_info_t i, uint8_t cl) +{ + i->prevclass=cl; + SET_HANDLER(wb1and2_done, NULL); + + return result(i, 1); /* WB1 */ +} + +static int wb1and2_done(unicode_wb_info_t i, uint8_t cl) +{ + uint8_t prevclass=i->prevclass; + + i->prevclass=cl; + + if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF) + return result(i, 0); /* WB3 */ + + switch (prevclass) { + case UNICODE_WB_CR: + case UNICODE_WB_LF: + case UNICODE_WB_Newline: + return result(i, 1); /* WB3a */ + } + + switch (cl) { + case UNICODE_WB_CR: + case UNICODE_WB_LF: + case UNICODE_WB_Newline: + return result(i, 1); /* WB3b */ + } + + if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) + { + i->prevclass=prevclass; + ++i->wb4_cnt; + return 0; /* WB4 */ + } + + if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter) + { + return result(i, 0); /* WB5 */ + } + + if (prevclass == UNICODE_WB_ALetter && + (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet)) + { + i->wb4_extra_cnt=0; + SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler); + return 0; + } + + return wb67_done(i, prevclass, cl); +} + +/* +** ALetter (MidLetter | MidNumLet ) ? +** +** prevclass cl +** +** Seen ALetter (MidLetter | MidNumLet), with the second character's status +** not returned yet. +*/ + +static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl) +{ + int rc; + uint8_t prevclass; + size_t extra_cnt; + + if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) + { + ++i->wb4_extra_cnt; + return 0; + } + + extra_cnt=i->wb4_extra_cnt; + + /* + ** Reset the handler to the default, then check WB6 + */ + + SET_HANDLER(wb1and2_done, NULL); + + if (cl == UNICODE_WB_ALetter) + { + rc=result(i, 0); /* WB6 */ + i->wb4_cnt=extra_cnt; + + if (rc == 0) + rc=result(i, 0); /* WB7 */ + + i->prevclass=cl; + + return rc; + } + + prevclass=i->prevclass; /* This was the second character */ + + /* + ** Process the second character, starting with WB7 + */ + + rc=wb67_done(i, UNICODE_WB_ALetter, prevclass); + + i->prevclass=prevclass; + i->wb4_cnt=extra_cnt; + + if (rc == 0) + rc=(*i->next_handler)(i, cl); + /* Process the current char now */ + + return rc; +} + +/* +** Seen ALetter (MidLetter | MidNumLet), with the second character's status +** not returned yet, and now sot. +*/ + +static int seen_wb67_end_handler(unicode_wb_info_t i) +{ + int rc; + size_t extra_cnt=i->wb4_extra_cnt; + + /* + ** Process the second character, starting with WB7. + */ + + rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass); + i->wb4_cnt=extra_cnt; + if (rc == 0) + rc=wb4(i); + return rc; +} + + +static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) +{ + if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric) + return result(i, 0); /* WB8 */ + + if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric) + return result(i, 0); /* WB9 */ + + if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter) + return result(i, 0); /* WB10 */ + + + if (prevclass == UNICODE_WB_Numeric && + (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet)) + { + i->wb4_extra_cnt=0; + SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler); + return 0; + } + + return wb1112_done(i, prevclass, cl); +} + +/* +** Numeric (MidNum | MidNumLet ) ? +** +** prevclass cl +** +** Seen Numeric (MidNum | MidNumLet), with the second character's status +** not returned yet. +*/ + +static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl) +{ + int rc; + uint8_t prevclass; + size_t extra_cnt; + + if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) + { + ++i->wb4_extra_cnt; + return 0; + } + + extra_cnt=i->wb4_extra_cnt; + + /* + ** Reset the handler to the default, then check WB6 + */ + + SET_HANDLER(wb1and2_done, NULL); + + if (cl == UNICODE_WB_Numeric) + { + rc=result(i, 0); /* WB11 */ + i->wb4_cnt=extra_cnt; + + if (rc == 0) + rc=result(i, 0); /* WB12 */ + + i->prevclass=cl; + + return rc; + } + + prevclass=i->prevclass; /* This was the second character */ + + /* + ** Process the second character, starting with WB7 + */ + + rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass); + + i->prevclass=prevclass; + i->wb4_cnt=extra_cnt; + + if (rc == 0) + rc=(*i->next_handler)(i, cl); + /* Process the current char now */ + + return rc; +} + +/* +** Seen Numeric (MidNum | MidNumLet), with the second character's status +** not returned yet, and now sot. +*/ + +static int seen_wb1112_end_handler(unicode_wb_info_t i) +{ + int rc; + size_t extra_cnt=i->wb4_extra_cnt; + + /* + ** Process the second character, starting with WB11. + */ + + rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass); + i->wb4_cnt=extra_cnt; + if (rc == 0) + rc=wb4(i); + return rc; +} + +static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) +{ + if (prevclass == UNICODE_WB_Katakana && + cl == UNICODE_WB_Katakana) + return result(i, 0); /* WB13 */ + + switch (prevclass) { + case UNICODE_WB_ALetter: + case UNICODE_WB_Numeric: + case UNICODE_WB_Katakana: + case UNICODE_WB_ExtendNumLet: + if (cl == UNICODE_WB_ExtendNumLet) + return result(i, 0); /* WB13a */ + } + + if (prevclass == UNICODE_WB_ExtendNumLet) + switch (cl) { + case UNICODE_WB_ALetter: + case UNICODE_WB_Numeric: + case UNICODE_WB_Katakana: + return result(i, 0); /* WB13b */ + } + + return result(i, 1); /* WB14 */ +} + +/* --------------------------------------------------------------------- */ + +struct unicode_wbscan_info { + unicode_wb_info_t wb_handle; + + int found; + size_t cnt; +}; + +static int unicode_wbscan_callback(int, void *); + +unicode_wbscan_info_t unicode_wbscan_init() +{ + unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info)); + + if (!i) + return NULL; + + if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL) + { + free(i); + return NULL; + } + + return i; +} + +int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch) +{ + if (!i->found) + unicode_wb_next(i->wb_handle, ch); + + return i->found; +} + +size_t unicode_wbscan_end(unicode_wbscan_info_t i) +{ + size_t n; + + unicode_wb_end(i->wb_handle); + + n=i->cnt; + free(i); + return n; +} + +static int unicode_wbscan_callback(int flag, void *arg) +{ + unicode_wbscan_info_t i=(unicode_wbscan_info_t)arg; + + if (flag && i->cnt > 0) + i->found=1; + + if (!i->found) + ++i->cnt; + return 0; +} + |
