diff options
Diffstat (limited to 'unicode/unicode_wordbreak.c')
| -rw-r--r-- | unicode/unicode_wordbreak.c | 148 | 
1 files changed, 131 insertions, 17 deletions
| diff --git a/unicode/unicode_wordbreak.c b/unicode/unicode_wordbreak.c index dee4b52..0bb2b70 100644 --- a/unicode/unicode_wordbreak.c +++ b/unicode/unicode_wordbreak.c @@ -1,5 +1,5 @@  /* -** Copyright 2011 Double Precision, Inc. +** Copyright 2011-2013 Double Precision, Inc.  ** See COPYING for distribution information.  **  */ @@ -21,6 +21,7 @@ struct unicode_wb_info {  	void *cb_arg;  	uint8_t prevclass; +	uint8_t wb7_first_char;  	size_t wb4_cnt;  	size_t wb4_extra_cnt; @@ -37,6 +38,10 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);  static int seen_wb67_end_handler(unicode_wb_info_t i);  static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); +static int seen_wb7bc_handler(unicode_wb_info_t i, uint8_t cl); +static int seen_wb7bc_end_handler(unicode_wb_info_t i); +static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); +  static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);  static int seen_wb1112_end_handler(unicode_wb_info_t i);  static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); @@ -160,15 +165,21 @@ static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)  		return 0; /* WB4 */  	} -	if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter) +	if ((prevclass == UNICODE_WB_ALetter || +	     prevclass == UNICODE_WB_Hebrew_Letter) && +	    (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter))  	{  		return result(i, 0); /* WB5 */  	} -	if (prevclass == UNICODE_WB_ALetter && -	    (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet)) +	if ((prevclass == UNICODE_WB_ALetter || +	     prevclass == UNICODE_WB_Hebrew_Letter) +	    && +	    (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet || +	     cl == UNICODE_WB_Single_Quote))  	{  		i->wb4_extra_cnt=0; +		i->wb7_first_char=prevclass;  		SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);  		return 0;  	} @@ -177,12 +188,12 @@ static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)  }  /* -**              ALetter     (MidLetter | MidNumLet )     ? +** (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_quote)  ?  ** -**                                  prevclass            cl +**                           prevclass                               cl  ** -** Seen ALetter (MidLetter | MidNumLet), with the second character's status -** not returned yet. +** Seen (ALetter | Hebrew_Letter)(MidLetter | MidNumLet), with the second +** character's status not returned yet.  */  static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl) @@ -205,7 +216,7 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)  	SET_HANDLER(wb1and2_done, NULL); -	if (cl == UNICODE_WB_ALetter) +	if (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter)  	{  		rc=result(i, 0); /* WB6 */  		i->wb4_cnt=extra_cnt; @@ -224,7 +235,7 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)  	** Process the second character, starting with WB7  	*/ -	rc=wb67_done(i, UNICODE_WB_ALetter, prevclass); +	rc=wb67_done(i, i->wb7_first_char, prevclass);  	i->prevclass=prevclass;  	i->wb4_cnt=extra_cnt; @@ -237,8 +248,8 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)  }  /* -** Seen ALetter (MidLetter | MidNumLet), with the second character's status -** not returned yet, and now sot. +** Seen (ALetter | Hebrew_Letter)(MidLetter | MidNumLet), with the second +** character's status not returned yet, and now sot.  */  static int seen_wb67_end_handler(unicode_wb_info_t i) @@ -250,28 +261,126 @@ static int seen_wb67_end_handler(unicode_wb_info_t i)  	** Process the second character, starting with WB7.  	*/ -	rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass); +	rc=wb67_done(i, i->wb7_first_char, i->prevclass);  	i->wb4_cnt=extra_cnt;  	if (rc == 0)  		rc=wb4(i);  	return rc;  } -  static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)  { +	if (prevclass == UNICODE_WB_Hebrew_Letter && cl == UNICODE_WB_Single_Quote) +		return result(i, 0); /* WB7a */ + +	if (prevclass == UNICODE_WB_Hebrew_Letter && cl == UNICODE_WB_Double_Quote) +	{ +		i->wb4_extra_cnt=0; +		SET_HANDLER(seen_wb7bc_handler, seen_wb7bc_end_handler); +		return 0; +	} + +	return wb7bc_done(i, prevclass, cl); +} + +/* +** Hebrew_Letter Double_Quote       ? +** +**               prevclass          cl +** +** Seen Hebrew_Letter Double_Quote, with the second character's status +** not returned yet. +*/ + +static int seen_wb7bc_handler(unicode_wb_info_t i, uint8_t cl) +{ +	int rc; +	uint8_t prevclass; +	size_t extra_cnt; + +	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format) +	{ +		++i->wb4_extra_cnt; +		return 0; +	} + +	extra_cnt=i->wb4_extra_cnt; + +	/* +	** Reset the handler to the default, then check WB7a and WB7b +	*/ + +	SET_HANDLER(wb1and2_done, NULL); + +	if (cl == UNICODE_WB_Hebrew_Letter) +	{ +		rc=result(i, 0); /* WB7b */ +		i->wb4_cnt=extra_cnt; + +		if (rc == 0) +			rc=result(i, 0); /* WB7bc */ + +		i->prevclass=cl; +			 +		return rc; +	} + +	prevclass=i->prevclass; /* This was the second character */ + +	/* +	** Process the second character, starting with WB8 +	*/ + +	rc=wb7bc_done(i, UNICODE_WB_Hebrew_Letter, prevclass); + +	i->prevclass=prevclass; +	i->wb4_cnt=extra_cnt; + +	if (rc == 0) +		rc=(*i->next_handler)(i, cl); +	/* Process the current char now */ + +	return rc; +} + +/* +** Seen Hebrew_Letter Double_Quote, with the second +** character's status not returned yet, and now sot. +*/ + +static int seen_wb7bc_end_handler(unicode_wb_info_t i) +{ +	int rc; +	size_t extra_cnt=i->wb4_extra_cnt; + +	/* +	** Process the second character, starting with WB8. +	*/ + +	rc=wb7bc_done(i, UNICODE_WB_Hebrew_Letter, i->prevclass); +	i->wb4_cnt=extra_cnt; +	if (rc == 0) +		rc=wb4(i); +	return rc; +} + +static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl) +{  	if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)  		return result(i, 0); /* WB8 */ -	if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric) +	if ((prevclass == UNICODE_WB_ALetter || +	     prevclass == UNICODE_WB_Hebrew_Letter) && cl == UNICODE_WB_Numeric)  		return result(i, 0); /* WB9 */ -	if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter) +	if (prevclass == UNICODE_WB_Numeric && +	    (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter))  		return result(i, 0); /* WB10 */  	if (prevclass == UNICODE_WB_Numeric && -	    (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet)) +	    (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet || +	     cl == UNICODE_WB_Single_Quote))  	{  		i->wb4_extra_cnt=0;  		SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler); @@ -370,6 +479,7 @@ static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)  	switch (prevclass) {  	case UNICODE_WB_ALetter: +	case UNICODE_WB_Hebrew_Letter:  	case UNICODE_WB_Numeric:  	case UNICODE_WB_Katakana:  	case UNICODE_WB_ExtendNumLet: @@ -380,11 +490,15 @@ static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)  	if (prevclass == UNICODE_WB_ExtendNumLet)  		switch (cl) {  		case UNICODE_WB_ALetter: +		case UNICODE_WB_Hebrew_Letter:  		case UNICODE_WB_Numeric:  		case UNICODE_WB_Katakana:  			return result(i, 0); /* WB13b */  		} +	if (prevclass == UNICODE_WB_Regional_Indicator && +	    cl == UNICODE_WB_Regional_Indicator) +		return result(i, 0);  	return result(i, 1); /* WB14 */  } | 
