diff options
Diffstat (limited to 'unicode/unicode_linebreak.c')
| -rw-r--r-- | unicode/unicode_linebreak.c | 632 | 
1 files changed, 632 insertions, 0 deletions
/*
** Copyright 2011 Double Precision, Inc.
** See COPYING for distribution information.
**
*/

#include	"unicode_config.h"
#include	"unicode.h"

#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include "linebreaktab_internal.h"

#include "linebreaktab.h"

/*
** Pseudo linebreak class stored in prevclass/prevclass_nsp before the first
** character has been seen ("start of text").
*/
#define UNICODE_LB_SOT	0xFF

/*
** State of one linebreaking pass.
**
** Characters are fed in one at a time. For each character the current
** next_handler decides which UNICODE_LB_* value to report, and reports it
** by invoking cb_func. A non-zero value returned by cb_func is passed back
** to the caller of the handler.
*/
struct unicode_lb_info {
	int (*cb_func)(int, void *);	/* Receives each linebreak value */
	void *cb_arg;			/* Passed through to cb_func */

	int opts;			/* UNICODE_LB_OPT_* bit flags */

	/*
	** Used only while next_lb25_seenophy is the active handler: the
	** class of the OP|HY character whose result has been withheld, and
	** the number of CM characters that were seen after it.
	*/
	uint8_t savedclass;
	size_t savedcmcnt;

	uint8_t prevclass;	/* Class recorded for the previous character */
	uint8_t prevclass_nsp;	/* Same, but not updated by SP characters */

	/* Handler for the next character, and for the end of the text */
	int (*next_handler)(struct unicode_lb_info *, uint8_t);
	int (*end_handler)(struct unicode_lb_info *);
};


/* http://www.unicode.org/reports/tr14/#Algorithm */

static int next_def(unicode_lb_info_t, uint8_t);
static int end_def(unicode_lb_info_t);

static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
static int end_lb25_seenophy(unicode_lb_info_t);

static int next_lb25_seennu(unicode_lb_info_t, uint8_t);

static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);

/*
** Put the state machine at start of text, with the default handlers
** installed. The callback and the options are left alone.
*/
static void unicode_lb_reset(unicode_lb_info_t i)
{
	i->prevclass=i->prevclass_nsp=UNICODE_LB_SOT;
	i->next_handler=next_def;
	i->end_handler=end_def;
}

/*
** Allocate a new linebreaking handle.
**
** cb_func is invoked with a linebreak value and cb_arg. The handle starts
** out zeroed (no options set) and positioned at start of text.
**
** Returns the new handle, or NULL if memory could not be allocated. The
** handle is released by unicode_lb_end().
*/
unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
				  void *cb_arg)
{
	unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));

	/* unicode_lbc_init() already expects NULL on failure */
	if (!i)
		return NULL;

	i->cb_func=cb_func;
	i->cb_arg=cb_arg;

	unicode_lb_reset(i);
	return i;
}

/*
** Finish a linebreaking pass: run the current end handler, which reports
** any results that were still being withheld, then free the handle.
**
** The handle is freed even when the end handler returns non-zero. Returns
** the end handler's return value.
*/
int unicode_lb_end(unicode_lb_info_t i)
{
	int rc=(*i->end_handler)(i);

	free(i);
	return rc;
}

/*
** Replace the handle's option flags with opts.
*/
void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
{
	i->opts=opts;
}

/* Default end handler has nothing to do */

static int end_def(unicode_lb_info_t i)
{
	(void)i; /* Unused, the signature is fixed by end_handler */

	/* LB3 N/A */
	return 0;
}

/*
** Report linebreak value x to the callback. Expands to an expression that
** uses a variable named "i", the handle, from the enclosing function.
*/
#define RESULT(x) (*i->cb_func)((x), i->cb_arg)

/*
** Feed cnt characters, starting at chars, to unicode_lb_next().
**
** Stops at, and returns, the first non-zero value from unicode_lb_next().
** Returns 0 if all characters were processed.
*/
int unicode_lb_next_cnt(unicode_lb_info_t i,
			const unicode_char *chars,
			size_t cnt)
{
	while (cnt)
	{
		int rc=unicode_lb_next(i, *chars);

		if (rc)
			return rc;

		++chars;
		--cnt;
	}
	return 0;
}

/*
** Look up the linebreak class of ch in the generated tables. UNICODE_LB_AL
** is passed to unicode_tab_lookup() as the fallback class (XX, LB1).
*/
int unicode_lb_lookup(unicode_char ch)
{
	return unicode_tab_lookup(ch,
				  unicode_indextab,
				  sizeof(unicode_indextab)
				  / sizeof(unicode_indextab[0]),
				  unicode_rangetab,
				  unicode_classtab,
				  UNICODE_LB_AL /* XX, LB1 */);
}

/*
** Process the next character.
**
** The character's linebreak class is handed to the current handler. When
** the UNICODE_LB_OPT_DASHWJ option is set, U+2012 and U+2013 are given the
** WJ class instead of the class from the lookup tables.
**
** Returns the handler's return value. This is 0 when no result was
** reported for this call, otherwise whatever the callback returned.
*/
int unicode_lb_next(unicode_lb_info_t i,
		    unicode_char ch)
{
	return (*i->next_handler)(i, (i->opts & UNICODE_LB_OPT_DASHWJ) &&
				  (ch == 0x2012 || ch == 0x2013)
				  ? UNICODE_LB_WJ:unicode_lb_lookup(ch));
}

static int next_def_nolb25(unicode_lb_info_t i,
			   uint8_t uclass,
			   int nolb25);

/*
** Default logic for next unicode char.
*/
static int next_def(unicode_lb_info_t i,
		    uint8_t uclass)
{
	return next_def_nolb25(i, uclass, 0);
}

/*
** The default rules, LB2 through LB31, applied in order to the pair
** (previous class, uclass). The first rule that matches reports its result
** through RESULT() and returns the callback's return value.
**
** One path does not report anything: (PR|PO) followed by (OP|HY), when
** nolb25 is not set. The character is saved, next_lb25_seenophy takes over,
** and 0 is returned; the result for that character is reported later.
*/
static int next_def_nolb25(unicode_lb_info_t i,
			   uint8_t uclass,

			   /* Flag -- recursively invoked after discarding LB25 */
			   int nolb25)
{

	/* Retrieve the previous unicode character's linebreak class. */

	uint8_t prevclass=i->prevclass;
	uint8_t prevclass_nsp=i->prevclass_nsp;

	/* Save this unicode char's linebreak class, for the next goaround */
	i->prevclass=uclass;

	if (uclass != UNICODE_LB_SP)
		i->prevclass_nsp=uclass;

	if (uclass == UNICODE_LB_NU)
		i->next_handler=next_lb25_seennu; /* LB25 */

	if (prevclass == UNICODE_LB_SOT)
	{
		if (uclass == UNICODE_LB_CM) /* LB9 */
			i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;

		return RESULT(UNICODE_LB_NONE); /* LB2 */
	}

	if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
		return RESULT(UNICODE_LB_NONE); /* LB5 */

	switch (prevclass) {
	case UNICODE_LB_BK:
	case UNICODE_LB_CR:
	case UNICODE_LB_LF:
	case UNICODE_LB_NL:

		if (uclass == UNICODE_LB_CM)
		{
			i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
			/* LB9 */
		}

		return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */

	case UNICODE_LB_SP:
	case UNICODE_LB_ZW:
		/* The rules below see this CM as AL */
		if (uclass == UNICODE_LB_CM)
			i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
		/* LB10 */
		break;
	default:
		break;
	}

	switch (uclass) {

		/* LB6: */
	case UNICODE_LB_BK:
	case UNICODE_LB_CR:
	case UNICODE_LB_LF:
	case UNICODE_LB_NL:

		/* LB7: */
	case UNICODE_LB_SP:
	case UNICODE_LB_ZW:

		return RESULT(UNICODE_LB_NONE);
	default:
		break;
	}

	if (prevclass_nsp == UNICODE_LB_ZW)
		return RESULT(UNICODE_LB_ALLOWED); /* LB8 */

	if (uclass == UNICODE_LB_CM)
	{
		/* The CM is absorbed: keep the classes recorded before it */
		i->prevclass=prevclass;
		i->prevclass_nsp=prevclass_nsp;
		return RESULT(UNICODE_LB_NONE); /* LB9 */
	}

	if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
		return RESULT(UNICODE_LB_NONE); /* LB11 */

	if (prevclass == UNICODE_LB_GL)
		return RESULT(UNICODE_LB_NONE); /* LB12 */

	if (uclass == UNICODE_LB_GL &&
	    prevclass != UNICODE_LB_SP &&
	    prevclass != UNICODE_LB_BA &&
	    prevclass != UNICODE_LB_HY)
		return RESULT(UNICODE_LB_NONE); /* LB12a */


	switch (uclass) {
	case UNICODE_LB_SY:
		if (i->opts & UNICODE_LB_OPT_SYBREAK)
		{
			if (prevclass == UNICODE_LB_SP)
				return RESULT(UNICODE_LB_ALLOWED);
		}
		/* FALLTHROUGH: otherwise SY is handled like CL, CP, EX, IS */

	case UNICODE_LB_CL:
	case UNICODE_LB_CP:
	case UNICODE_LB_EX:
	case UNICODE_LB_IS:
		return RESULT(UNICODE_LB_NONE); /* LB13 */
	default:
		break;
	}

	if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
		switch (uclass) {
		case UNICODE_LB_EX:
		case UNICODE_LB_AL:
		case UNICODE_LB_ID:
			return RESULT(UNICODE_LB_NONE);
		default:
			break;
		}

	if (prevclass_nsp == UNICODE_LB_OP)
		return RESULT(UNICODE_LB_NONE); /* LB14 */

	if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
		return RESULT(UNICODE_LB_NONE); /* LB15 */

	if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
	    && uclass == UNICODE_LB_NS)
		return RESULT(UNICODE_LB_NONE); /* LB16 */

	if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
		return RESULT(UNICODE_LB_NONE); /* LB17 */

	if (prevclass == UNICODE_LB_SP)
		return RESULT(UNICODE_LB_ALLOWED); /* LB18 */

	if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
		return RESULT(UNICODE_LB_NONE); /* LB19 */

	if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
		return RESULT(UNICODE_LB_ALLOWED); /* LB20 */

	/* LB21: */

	switch (uclass) {
	case UNICODE_LB_BA:
	case UNICODE_LB_HY:
	case UNICODE_LB_NS:
		return RESULT(UNICODE_LB_NONE);
	default:
		break;
	}

	if (prevclass == UNICODE_LB_BB)
		return RESULT(UNICODE_LB_NONE);

	if (uclass == UNICODE_LB_IN)
		switch (prevclass) {
		case UNICODE_LB_AL:
		case UNICODE_LB_ID:
		case UNICODE_LB_IN:
		case UNICODE_LB_NU:
			return RESULT(UNICODE_LB_NONE); /* LB22 */
		default:
			break;
		}


	if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
		return RESULT(UNICODE_LB_NONE); /* LB23 */
	if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
		return RESULT(UNICODE_LB_NONE); /* LB23 */

	if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
		return RESULT(UNICODE_LB_NONE); /* LB23 */


	if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
		return RESULT(UNICODE_LB_NONE); /* LB24 */
	if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
		return RESULT(UNICODE_LB_NONE); /* LB24 */
	if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
		return RESULT(UNICODE_LB_NONE); /* LB24 */

	if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
		switch (prevclass) {
		case UNICODE_LB_PR:
		case UNICODE_LB_AL:
		case UNICODE_LB_ID:
			return RESULT(UNICODE_LB_NONE);
		default:
			break;
		}

	if (!nolb25 &&
	    (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
	{
		if (uclass == UNICODE_LB_NU)
			return RESULT(UNICODE_LB_NONE); /* LB25 */

		if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
		{
			/*
			** Withhold the result for this character until it
			** is known whether NU follows. The previous classes
			** are put back, so that this character can be
			** processed again if NU does not follow.
			*/
			i->prevclass=prevclass;
			i->prevclass_nsp=prevclass_nsp;

			i->savedclass=uclass;
			i->savedcmcnt=0;
			i->next_handler=next_lb25_seenophy;
			i->end_handler=end_lb25_seenophy;
			return 0;
		}
	}

	if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
	    uclass == UNICODE_LB_NU)
		return RESULT(UNICODE_LB_NONE); /* LB25 */

	/*****/

	if (prevclass == UNICODE_LB_JL)
		switch (uclass) {
		case UNICODE_LB_JL:
		case UNICODE_LB_JV:
		case UNICODE_LB_H2:
		case UNICODE_LB_H3:
			return RESULT(UNICODE_LB_NONE); /* LB26 */
		default:
			break;
		}

	if ((prevclass == UNICODE_LB_JV ||
	     prevclass == UNICODE_LB_H2) &&
	    (uclass == UNICODE_LB_JV ||
	     uclass == UNICODE_LB_JT))
		return RESULT(UNICODE_LB_NONE); /* LB26 */

	if ((prevclass == UNICODE_LB_JT ||
	     prevclass == UNICODE_LB_H3) &&
	    uclass == UNICODE_LB_JT)
		return RESULT(UNICODE_LB_NONE); /* LB26 */


	switch (prevclass) {
	case UNICODE_LB_JL:
	case UNICODE_LB_JV:
	case UNICODE_LB_JT:
	case UNICODE_LB_H2:
	case UNICODE_LB_H3:
		if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
			return RESULT(UNICODE_LB_NONE); /* LB27 */
		/* FALLTHROUGH */
	default:
		break;
	}

	switch (uclass) {
	case UNICODE_LB_JL:
	case UNICODE_LB_JV:
	case UNICODE_LB_JT:
	case UNICODE_LB_H2:
	case UNICODE_LB_H3:
		if (prevclass == UNICODE_LB_PR)
			return RESULT(UNICODE_LB_NONE); /* LB27 */
		/* FALLTHROUGH */
	default:
		break;
	}

	if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
		return RESULT(UNICODE_LB_NONE); /* LB28 */

	if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
		return RESULT(UNICODE_LB_NONE); /* LB29 */

	if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
	    uclass == UNICODE_LB_OP)
		return RESULT(UNICODE_LB_NONE); /* LB30 */

	if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
	    prevclass == UNICODE_LB_CP)
		return RESULT(UNICODE_LB_NONE); /* LB30 */

	return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
}

/*
** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
** character, but NU did not follow. Backtrack.
**
** The default handlers are put back, then the saved class is run through
** the default rules savedcmcnt+1 times: once for the withheld character
** itself, with LB25 disabled so that it is not withheld again, and once
** more for each CM that was counted after it. Stops at, and returns, the
** first non-zero result.
*/

static int unwind_lb25_seenophy(unicode_lb_info_t i)
{
	int rc;

	int nolb25_flag=1;

	i->next_handler=next_def;
	i->end_handler=end_def;

	do
	{
		rc=next_def_nolb25(i, i->savedclass, nolb25_flag);

		if (rc)
			return rc;

		/* Only the first pass needs LB25 turned off */
		nolb25_flag=0;
	} while (i->savedcmcnt--);
	return 0;
}

/*
** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
** character. If there's now a NU, we found the modified LB25 regexp.
**
** CM characters are counted and their results withheld too. Anything other
** than CM or NU unwinds, then goes through the default rules.
*/

static int next_lb25_seenophy(unicode_lb_info_t i,
			      uint8_t uclass)
{
	int rc;

	if (uclass == UNICODE_LB_CM)
	{
		++i->savedcmcnt; /* Keep track of CMs, and try again */
		return 0;
	}

	if (uclass != UNICODE_LB_NU)
	{
		rc=unwind_lb25_seenophy(i);

		if (rc)
			return rc;

		return next_def_nolb25(i, uclass, 0);
	}

	/* No break before the withheld (OP|HY), nor before each of its CMs */
	do
	{
		rc=RESULT(UNICODE_LB_NONE); /* (OP|HY) feedback */

		if (rc)
			return rc;
	} while (i->savedcmcnt--);

	i->next_handler=next_lb25_seennu;
	i->end_handler=end_def;
	i->prevclass=i->prevclass_nsp=uclass;
	return RESULT(UNICODE_LB_NONE);
}

/*
** Seen (PR|PO)(OP|HY), and now The End. Unwind, and give up.
*/

static int end_lb25_seenophy(unicode_lb_info_t i)
{
	int rc=unwind_lb25_seenophy(i);

	if (rc == 0)
		rc=end_def(i);
	return rc;
}

/*
** Seen an NU, modified LB25 regexp.
**
** NU, SY and IS continue the number. CL and CP move on to
** next_lb25_seennuclcp. PR and PO end the number. None of these, nor a CM,
** get a break in front of them. Any other class goes back to the default
** rules.
*/
static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
{
	if (uclass == UNICODE_LB_NU || uclass == UNICODE_LB_SY ||
	    uclass == UNICODE_LB_IS)
	{
		i->prevclass=i->prevclass_nsp=uclass;
		return RESULT(UNICODE_LB_NONE);
	}

	/* The previous classes are intentionally left as they were */
	if (uclass == UNICODE_LB_CM)
		return RESULT(UNICODE_LB_NONE); /* LB9 */

	if (uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP)
	{
		i->prevclass=i->prevclass_nsp=uclass;
		i->next_handler=next_lb25_seennuclcp;
		i->end_handler=end_def;
		return RESULT(UNICODE_LB_NONE);
	}

	i->next_handler=next_def;
	i->end_handler=end_def;

	if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
	{
		i->prevclass=i->prevclass_nsp=uclass;
		return RESULT(UNICODE_LB_NONE);
	}

	return next_def(i, uclass); /* Not a prefix, process normally */
}

/*
** Seen CL|CP, in the modified LB25 regexp.
**
** Only a trailing PR or PO is still part of the number. A CM gets no break
** and leaves this handler in place. Anything else goes back to the default
** rules.
*/
static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
{
	if (uclass == UNICODE_LB_CM)
		return RESULT(UNICODE_LB_NONE); /* LB9 */

	i->next_handler=next_def;
	i->end_handler=end_def;

	if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
	{
		i->prevclass=i->prevclass_nsp=uclass;

		return RESULT(UNICODE_LB_NONE);
	}

	return next_def(i, uclass);
}

/******************/

/*
** Wrapper around a linebreaking handle whose callback also receives the
** character that each linebreak value belongs to.
**
** Because results can be withheld, characters are saved in buf as they
** are fed in. buf_ptr is the index of the next saved character that is
** still waiting for its linebreak value.
*/
struct unicode_lbc_info {
	unicode_lb_info_t handle;

	struct unicode_buf buf;

	size_t buf_ptr;

	int (*cb_func)(int, unicode_char, void *);
	void *cb_arg;
};

/*
** Callback installed in the wrapped handle: pair the linebreak value with
** the next saved character, and pass both to the caller's callback.
**
** Fails with EINVAL, returning -1, if there is no saved character left.
*/
static int unicode_lbc_callback(int value, void *ptr)
{
	unicode_lbc_info_t h=(unicode_lbc_info_t)ptr;

	if (h->buf_ptr >= unicode_buf_len(&h->buf))
	{
		errno=EINVAL;
		return -1; /* Shouldn't happen */
	}

	return (*h->cb_func)(value, unicode_buf_ptr(&h->buf)[h->buf_ptr++],
			     h->cb_arg);
}

/*
** Allocate a new handle of the character-reporting kind.
**
** cb_func is invoked with a linebreak value, the character that it applies
** to, and cb_arg.
**
** Returns the new handle, or NULL if memory could not be allocated. The
** handle is released by unicode_lbc_end().
*/
unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
				    void *cb_arg)
{
	unicode_lbc_info_t h=
		(unicode_lbc_info_t)calloc(1, sizeof(struct unicode_lbc_info));

	if (!h)
		return NULL;

	h->cb_func=cb_func;
	h->cb_arg=cb_arg;

	if ((h->handle=unicode_lb_init(unicode_lbc_callback, h)) == NULL)
	{
		free(h);
		return NULL;
	}

	/* NOTE(review): (size_t)-1 is presumably "no size limit" -- confirm */
	unicode_buf_init(&h->buf, (size_t)-1);
	return h;
}

/*
** Replace the option flags of the wrapped handle with opts.
*/
void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
{
	unicode_lb_set_opts(i->handle, opts);
}

/*
** Process the next character: save it, then feed it to the wrapped handle.
**
** Returns the value returned by unicode_lb_next().
*/
int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
{
	/* Every saved character has been reported, start the buffer over */
	if (i->buf_ptr >= unicode_buf_len(&i->buf))
	{
		i->buf_ptr=0;
		unicode_buf_clear(&i->buf);
	}

	/*
	** NOTE(review): the result of unicode_buf_append() is not checked.
	** If it can fail, the character is not saved, and its linebreak
	** value is paired with the wrong character or rejected by
	** unicode_lbc_callback() -- confirm against unicode.h.
	*/
	unicode_buf_append(&i->buf, &ch, 1);
	return unicode_lb_next(i->handle, ch);
}

/*
** Finish the pass: end the wrapped handle, which reports any results that
** were still being withheld, then release the buffer and the handle.
**
** Returns the value returned by unicode_lb_end().
*/
int unicode_lbc_end(unicode_lbc_info_t i)
{
	int rc=unicode_lb_end(i->handle);

	unicode_buf_deinit(&i->buf);
	free(i);
	return rc;
}
