diff options
Diffstat (limited to 'unicode/unicode_linebreak.c')
| -rw-r--r-- | unicode/unicode_linebreak.c | 632 |
1 files changed, 632 insertions, 0 deletions
diff --git a/unicode/unicode_linebreak.c b/unicode/unicode_linebreak.c new file mode 100644 index 0000000..1105dec --- /dev/null +++ b/unicode/unicode_linebreak.c @@ -0,0 +1,632 @@ +/* +** Copyright 2011 Double Precision, Inc. +** See COPYING for distribution information. +** +*/ + +#include "unicode_config.h" +#include "unicode.h" + +#include <unistd.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "linebreaktab_internal.h" + +#include "linebreaktab.h" + +#define UNICODE_LB_SOT 0xFF + +struct unicode_lb_info { + int (*cb_func)(int, void *); + void *cb_arg; + + int opts; + + uint8_t savedclass; + size_t savedcmcnt; + + uint8_t prevclass; + uint8_t prevclass_nsp; + + int (*next_handler)(struct unicode_lb_info *, uint8_t); + int (*end_handler)(struct unicode_lb_info *); +}; + + +/* http://www.unicode.org/reports/tr14/#Algorithm */ + +static int next_def(unicode_lb_info_t, uint8_t); +static int end_def(unicode_lb_info_t); + +static int next_lb25_seenophy(unicode_lb_info_t, uint8_t); +static int end_lb25_seenophy(unicode_lb_info_t); + +static int next_lb25_seennu(unicode_lb_info_t, uint8_t); + +static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t); + +static void unicode_lb_reset(unicode_lb_info_t i) +{ + i->prevclass=i->prevclass_nsp=UNICODE_LB_SOT; + i->next_handler=next_def; + i->end_handler=end_def; +} + +unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *), + void *cb_arg) +{ + unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info)); + + i->cb_func=cb_func; + i->cb_arg=cb_arg; + + unicode_lb_reset(i); + return i; +} + +int unicode_lb_end(unicode_lb_info_t i) +{ + int rc=(*i->end_handler)(i); + + free(i); + return rc; +} + +void unicode_lb_set_opts(unicode_lb_info_t i, int opts) +{ + i->opts=opts; +} + +/* Default end handler has nothing to do */ + +static int end_def(unicode_lb_info_t i) +{ + /* LB3 N/A */ + return 0; +} +#define RESULT(x) (*i->cb_func)((x), i->cb_arg) + +int unicode_lb_next_cnt(unicode_lb_info_t i, + const unicode_char *chars, + size_t cnt) +{ + while (cnt) + { + int rc=unicode_lb_next(i, *chars); + + if (rc) + return rc; + + ++chars; + --cnt; + } + return 0; +} + +int unicode_lb_lookup(unicode_char ch) +{ + return unicode_tab_lookup(ch, + unicode_indextab, + sizeof(unicode_indextab) + / sizeof(unicode_indextab[0]), + unicode_rangetab, + unicode_classtab, + UNICODE_LB_AL /* XX, LB1 */); +} + +int unicode_lb_next(unicode_lb_info_t i, + unicode_char ch) +{ + return (*i->next_handler)(i, (i->opts & UNICODE_LB_OPT_DASHWJ) && + (ch == 0x2012 || ch == 0x2013) + ? UNICODE_LB_WJ:unicode_lb_lookup(ch)); +} + +static int next_def_nolb25(unicode_lb_info_t i, + uint8_t uclass, + int nolb25); + +/* +** Default logic for next unicode char. +*/ +static int next_def(unicode_lb_info_t i, + uint8_t uclass) +{ + return next_def_nolb25(i, uclass, 0); +} + +static int next_def_nolb25(unicode_lb_info_t i, + uint8_t uclass, + + /* Flag -- recursively invoked after discarding LB25 */ + int nolb25) +{ + + /* Retrieve the previous unicode character's linebreak class. */ + + uint8_t prevclass=i->prevclass; + uint8_t prevclass_nsp=i->prevclass_nsp; + + /* Save this unicode char's linebreak class, for the next goaround */ + i->prevclass=uclass; + + if (uclass != UNICODE_LB_SP) + i->prevclass_nsp=uclass; + + if (uclass == UNICODE_LB_NU) + i->next_handler=next_lb25_seennu; /* LB25 */ + + if (prevclass == UNICODE_LB_SOT) + { + if (uclass == UNICODE_LB_CM) /* LB9 */ + i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL; + + return RESULT(UNICODE_LB_NONE); /* LB2 */ + } + + if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF) + return RESULT(UNICODE_LB_NONE); /* LB5 */ + + switch (prevclass) { + case UNICODE_LB_BK: + case UNICODE_LB_CR: + case UNICODE_LB_LF: + case UNICODE_LB_NL: + + if (uclass == UNICODE_LB_CM) + { + i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL; + /* LB9 */ + } + + return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */ + + case UNICODE_LB_SP: + case UNICODE_LB_ZW: + if (uclass == UNICODE_LB_CM) + i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL; + /* LB10 */ + break; + default: + break; + } + + switch (uclass) { + + /* LB6: */ + case UNICODE_LB_BK: + case UNICODE_LB_CR: + case UNICODE_LB_LF: + case UNICODE_LB_NL: + + /* LB7: */ + case UNICODE_LB_SP: + case UNICODE_LB_ZW: + + return RESULT(UNICODE_LB_NONE); + default: + break; + } + + if (prevclass_nsp == UNICODE_LB_ZW) + return RESULT(UNICODE_LB_ALLOWED); /* LB8 */ + + if (uclass == UNICODE_LB_CM) + { + i->prevclass=prevclass; + i->prevclass_nsp=prevclass_nsp; + return RESULT(UNICODE_LB_NONE); /* LB9 */ + } + + if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ) + return RESULT(UNICODE_LB_NONE); /* LB11 */ + + if (prevclass == UNICODE_LB_GL) + return RESULT(UNICODE_LB_NONE); /* LB12 */ + + if (uclass == UNICODE_LB_GL && + prevclass != UNICODE_LB_SP && + prevclass != UNICODE_LB_BA && + prevclass != UNICODE_LB_HY) + return RESULT(UNICODE_LB_NONE); /* LB12a */ + + + switch (uclass) { + case UNICODE_LB_SY: + if (i->opts & UNICODE_LB_OPT_SYBREAK) + { + if (prevclass == UNICODE_LB_SP) + return RESULT(UNICODE_LB_ALLOWED); + } + + case UNICODE_LB_CL: + case UNICODE_LB_CP: + case UNICODE_LB_EX: + case UNICODE_LB_IS: + return RESULT(UNICODE_LB_NONE); /* LB13 */ + default: + break; + } + + if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY) + switch (uclass) { + case UNICODE_LB_EX: + case UNICODE_LB_AL: + case UNICODE_LB_ID: + return RESULT(UNICODE_LB_NONE); + } + + if (prevclass_nsp == UNICODE_LB_OP) + return RESULT(UNICODE_LB_NONE); /* LB14 */ + + if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP) + return RESULT(UNICODE_LB_NONE); /* LB15 */ + + if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP) + && uclass == UNICODE_LB_NS) + return RESULT(UNICODE_LB_NONE); /* LB16 */ + + if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2) + return RESULT(UNICODE_LB_NONE); /* LB17 */ + + if (prevclass == UNICODE_LB_SP) + return RESULT(UNICODE_LB_ALLOWED); /* LB18 */ + + if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU) + return RESULT(UNICODE_LB_NONE); /* LB19 */ + + if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB) + return RESULT(UNICODE_LB_ALLOWED); /* LB20 */ + + /* LB21: */ + + switch (uclass) { + case UNICODE_LB_BA: + case UNICODE_LB_HY: + case UNICODE_LB_NS: + return RESULT(UNICODE_LB_NONE); + default: + break; + } + + if (prevclass == UNICODE_LB_BB) + return RESULT(UNICODE_LB_NONE); + + if (uclass == UNICODE_LB_IN) + switch (prevclass) { + case UNICODE_LB_AL: + case UNICODE_LB_ID: + case UNICODE_LB_IN: + case UNICODE_LB_NU: + return RESULT(UNICODE_LB_NONE); /* LB22 */ + default: + break; + } + + + if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO) + return RESULT(UNICODE_LB_NONE); /* LB23 */ + if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU) + return RESULT(UNICODE_LB_NONE); /* LB23 */ + + if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL) + return RESULT(UNICODE_LB_NONE); /* LB23 */ + + + if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID) + return RESULT(UNICODE_LB_NONE); /* LB24 */ + if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL) + return RESULT(UNICODE_LB_NONE); /* LB24 */ + if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL) + return RESULT(UNICODE_LB_NONE); /* LB24 */ + + if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR) + switch (prevclass) { + case UNICODE_LB_PR: + case UNICODE_LB_AL: + case UNICODE_LB_ID: + return RESULT(UNICODE_LB_NONE); + } + + if (!nolb25 && + (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO)) + { + if (uclass == UNICODE_LB_NU) + return RESULT(UNICODE_LB_NONE); /* LB25 */ + + if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY) + { + i->prevclass=prevclass; + i->prevclass_nsp=prevclass_nsp; + + i->savedclass=uclass; + i->savedcmcnt=0; + i->next_handler=next_lb25_seenophy; + i->end_handler=end_lb25_seenophy; + return 0; + } + } + + if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) && + uclass == UNICODE_LB_NU) + return RESULT(UNICODE_LB_NONE); /* LB25 */ + + /*****/ + + if (prevclass == UNICODE_LB_JL) + switch (uclass) { + case UNICODE_LB_JL: + case UNICODE_LB_JV: + case UNICODE_LB_H2: + case UNICODE_LB_H3: + return RESULT(UNICODE_LB_NONE); /* LB26 */ + default: + break; + } + + if ((prevclass == UNICODE_LB_JV || + prevclass == UNICODE_LB_H2) && + (uclass == UNICODE_LB_JV || + uclass == UNICODE_LB_JT)) + return RESULT(UNICODE_LB_NONE); /* LB26 */ + + if ((prevclass == UNICODE_LB_JT || + prevclass == UNICODE_LB_H3) && + uclass == UNICODE_LB_JT) + return RESULT(UNICODE_LB_NONE); /* LB26 */ + + + switch (prevclass) { + case UNICODE_LB_JL: + case UNICODE_LB_JV: + case UNICODE_LB_JT: + case UNICODE_LB_H2: + case UNICODE_LB_H3: + if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO) + return RESULT(UNICODE_LB_NONE); /* LB27 */ + default: + break; + } + + switch (uclass) { + case UNICODE_LB_JL: + case UNICODE_LB_JV: + case UNICODE_LB_JT: + case UNICODE_LB_H2: + case UNICODE_LB_H3: + if (prevclass == UNICODE_LB_PR) + return RESULT(UNICODE_LB_NONE); /* LB27 */ + default: + break; + } + + if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL) + return RESULT(UNICODE_LB_NONE); /* LB28 */ + + if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL) + return RESULT(UNICODE_LB_NONE); /* LB29 */ + + if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) && + uclass == UNICODE_LB_OP) + return RESULT(UNICODE_LB_NONE); /* LB30 */ + + if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) && + prevclass == UNICODE_LB_CP) + return RESULT(UNICODE_LB_NONE); /* LB30 */ + + return RESULT(UNICODE_LB_ALLOWED); /* LB31 */ +} + +/* +** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second +** character, but NU did not follow. Backtrack. +*/ + +static int unwind_lb25_seenophy(unicode_lb_info_t i) +{ + int rc; + + /*uint8_t class=i->savedclass;*/ + int nolb25_flag=1; + + i->next_handler=next_def; + i->end_handler=end_def; + + do + { + rc=next_def_nolb25(i, i->savedclass, nolb25_flag); + + if (rc) + return rc; + + /*class=UNICODE_LB_CM;*/ + nolb25_flag=0; + } while (i->savedcmcnt--); + return 0; +} + +/* +** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second +** character. If there's now a NU, we found the modified LB25 regexp. +*/ + +static int next_lb25_seenophy(unicode_lb_info_t i, + uint8_t uclass) +{ + int rc; + + if (uclass == UNICODE_LB_CM) + { + ++i->savedcmcnt; /* Keep track of CMs, and try again */ + return 0; + } + + if (uclass != UNICODE_LB_NU) + { + rc=unwind_lb25_seenophy(i); + + if (rc) + return rc; + + return next_def_nolb25(i, uclass, 0); + } + + do + { + rc=RESULT(UNICODE_LB_NONE); /* (OP|HY) feedback */ + + if (rc) + return rc; + } while (i->savedcmcnt--); + + i->next_handler=next_lb25_seennu; + i->end_handler=end_def; + i->prevclass=i->prevclass_nsp=uclass; + return RESULT(UNICODE_LB_NONE); +} + +/* +** Seen (PR|PO)(OP|HY), and now The End. Unwind, and give up. +*/ + +static int end_lb25_seenophy(unicode_lb_info_t i) +{ + int rc=unwind_lb25_seenophy(i); + + if (rc == 0) + rc=end_def(i); + return rc; +} + +/* +** Seen an NU, modified LB25 regexp. +*/ +static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass) +{ + if (uclass == UNICODE_LB_NU || uclass == UNICODE_LB_SY || + uclass == UNICODE_LB_IS) + { + i->prevclass=i->prevclass_nsp=uclass; + return RESULT(UNICODE_LB_NONE); + } + + if (uclass == UNICODE_LB_CM) + return RESULT(UNICODE_LB_NONE); /* LB9 */ + + if (uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP) + { + i->prevclass=i->prevclass_nsp=uclass; + i->next_handler=next_lb25_seennuclcp; + i->end_handler=end_def; + return RESULT(UNICODE_LB_NONE); + } + + i->next_handler=next_def; + i->end_handler=end_def; + + if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO) + { + i->prevclass=i->prevclass_nsp=uclass; + return RESULT(UNICODE_LB_NONE); + } + + return next_def(i, uclass); /* Not a prefix, process normally */ +} + +/* +** Seen CL|CP, in the modified LB25 regexp. +*/ +static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass) +{ + if (uclass == UNICODE_LB_CM) + return RESULT(UNICODE_LB_NONE); /* LB9 */ + + i->next_handler=next_def; + i->end_handler=end_def; + + if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO) + { + i->prevclass=i->prevclass_nsp=uclass; + + return RESULT(UNICODE_LB_NONE); + } + + return next_def(i, uclass); +} + +/******************/ + +struct unicode_lbc_info { + unicode_lb_info_t handle; + + struct unicode_buf buf; + + size_t buf_ptr; + + int (*cb_func)(int, unicode_char, void *); + void *cb_arg; +}; + +static int unicode_lbc_callback(int value, void *ptr) +{ + unicode_lbc_info_t h=(unicode_lbc_info_t)ptr; + + if (h->buf_ptr >= unicode_buf_len(&h->buf)) + { + errno=EINVAL; + return -1; /* Shouldn't happen */ + } + + return (*h->cb_func)(value, unicode_buf_ptr(&h->buf)[h->buf_ptr++], + h->cb_arg); +} + +unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *), + void *cb_arg) +{ + unicode_lbc_info_t h= + (unicode_lbc_info_t)calloc(1, sizeof(struct unicode_lbc_info)); + + if (!h) + return NULL; + + h->cb_func=cb_func; + h->cb_arg=cb_arg; + + if ((h->handle=unicode_lb_init(unicode_lbc_callback, h)) == NULL) + { + free(h); + return NULL; + } + unicode_buf_init(&h->buf, (size_t)-1); + return h; +} + +void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts) +{ + unicode_lb_set_opts(i->handle, opts); +} + +int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch) +{ + if (i->buf_ptr >= unicode_buf_len(&i->buf)) + { + i->buf_ptr=0; + unicode_buf_clear(&i->buf); + } + + unicode_buf_append(&i->buf, &ch, 1); + return unicode_lb_next(i->handle, ch); +} + +int unicode_lbc_end(unicode_lbc_info_t i) +{ + int rc=unicode_lb_end(i->handle); + + unicode_buf_deinit(&i->buf); + free(i); + return rc; +} |
