summaryrefslogtreecommitdiffstats
path: root/unicode/unicode_wordbreak.c
diff options
context:
space:
mode:
Diffstat (limited to 'unicode/unicode_wordbreak.c')
-rw-r--r--unicode/unicode_wordbreak.c148
1 files changed, 131 insertions, 17 deletions
diff --git a/unicode/unicode_wordbreak.c b/unicode/unicode_wordbreak.c
index dee4b52..0bb2b70 100644
--- a/unicode/unicode_wordbreak.c
+++ b/unicode/unicode_wordbreak.c
@@ -1,5 +1,5 @@
/*
-** Copyright 2011 Double Precision, Inc.
+** Copyright 2011-2013 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
@@ -21,6 +21,7 @@ struct unicode_wb_info {
void *cb_arg;
uint8_t prevclass;
+ uint8_t wb7_first_char;
size_t wb4_cnt;
size_t wb4_extra_cnt;
@@ -37,6 +38,10 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
static int seen_wb67_end_handler(unicode_wb_info_t i);
static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
+static int seen_wb7bc_handler(unicode_wb_info_t i, uint8_t cl);
+static int seen_wb7bc_end_handler(unicode_wb_info_t i);
+static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
+
static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
static int seen_wb1112_end_handler(unicode_wb_info_t i);
static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
@@ -160,15 +165,21 @@ static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
return 0; /* WB4 */
}
- if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter)
+ if ((prevclass == UNICODE_WB_ALetter ||
+ prevclass == UNICODE_WB_Hebrew_Letter) &&
+ (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter))
{
return result(i, 0); /* WB5 */
}
- if (prevclass == UNICODE_WB_ALetter &&
- (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet))
+ if ((prevclass == UNICODE_WB_ALetter ||
+ prevclass == UNICODE_WB_Hebrew_Letter)
+ &&
+ (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet ||
+ cl == UNICODE_WB_Single_Quote))
{
i->wb4_extra_cnt=0;
+ i->wb7_first_char=prevclass;
SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
return 0;
}
@@ -177,12 +188,12 @@ static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
}
/*
-** ALetter (MidLetter | MidNumLet ) ?
+** (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_quote) ?
**
-** prevclass cl
+** prevclass cl
**
-** Seen ALetter (MidLetter | MidNumLet), with the second character's status
-** not returned yet.
+** Seen (ALetter | Hebrew_Letter)(MidLetter | MidNumLet), with the second
+** character's status not returned yet.
*/
static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
@@ -205,7 +216,7 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
SET_HANDLER(wb1and2_done, NULL);
- if (cl == UNICODE_WB_ALetter)
+ if (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter)
{
rc=result(i, 0); /* WB6 */
i->wb4_cnt=extra_cnt;
@@ -224,7 +235,7 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
** Process the second character, starting with WB7
*/
- rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
+ rc=wb67_done(i, i->wb7_first_char, prevclass);
i->prevclass=prevclass;
i->wb4_cnt=extra_cnt;
@@ -237,8 +248,8 @@ static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
}
/*
-** Seen ALetter (MidLetter | MidNumLet), with the second character's status
-** not returned yet, and now sot.
+** Seen (ALetter | Hebrew_Letter)(MidLetter | MidNumLet), with the second
+** character's status not returned yet, and now sot.
*/
static int seen_wb67_end_handler(unicode_wb_info_t i)
@@ -250,28 +261,126 @@ static int seen_wb67_end_handler(unicode_wb_info_t i)
** Process the second character, starting with WB7.
*/
- rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
+ rc=wb67_done(i, i->wb7_first_char, i->prevclass);
i->wb4_cnt=extra_cnt;
if (rc == 0)
rc=wb4(i);
return rc;
}
-
static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
{
+ if (prevclass == UNICODE_WB_Hebrew_Letter && cl == UNICODE_WB_Single_Quote)
+ return result(i, 0); /* WB7a */
+
+ if (prevclass == UNICODE_WB_Hebrew_Letter && cl == UNICODE_WB_Double_Quote)
+ {
+ i->wb4_extra_cnt=0;
+ SET_HANDLER(seen_wb7bc_handler, seen_wb7bc_end_handler);
+ return 0;
+ }
+
+ return wb7bc_done(i, prevclass, cl);
+}
+
+/*
+** Hebrew_Letter Double_Quote ?
+**
+** prevclass cl
+**
+** Seen Hebrew_Letter Double_Quote, with the second character's status
+** not returned yet.
+*/
+
+static int seen_wb7bc_handler(unicode_wb_info_t i, uint8_t cl)
+{
+ int rc;
+ uint8_t prevclass;
+ size_t extra_cnt;
+
+ if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+ {
+ ++i->wb4_extra_cnt;
+ return 0;
+ }
+
+ extra_cnt=i->wb4_extra_cnt;
+
+ /*
+ ** Reset the handler to the default, then check WB7a and WB7b
+ */
+
+ SET_HANDLER(wb1and2_done, NULL);
+
+ if (cl == UNICODE_WB_Hebrew_Letter)
+ {
+ rc=result(i, 0); /* WB7b */
+ i->wb4_cnt=extra_cnt;
+
+ if (rc == 0)
+ rc=result(i, 0); /* WB7bc */
+
+ i->prevclass=cl;
+
+ return rc;
+ }
+
+ prevclass=i->prevclass; /* This was the second character */
+
+ /*
+ ** Process the second character, starting with WB8
+ */
+
+ rc=wb7bc_done(i, UNICODE_WB_Hebrew_Letter, prevclass);
+
+ i->prevclass=prevclass;
+ i->wb4_cnt=extra_cnt;
+
+ if (rc == 0)
+ rc=(*i->next_handler)(i, cl);
+ /* Process the current char now */
+
+ return rc;
+}
+
+/*
+** Seen Hebrew_Letter Double_Quote, with the second
+** character's status not returned yet, and now sot.
+*/
+
+static int seen_wb7bc_end_handler(unicode_wb_info_t i)
+{
+ int rc;
+ size_t extra_cnt=i->wb4_extra_cnt;
+
+ /*
+ ** Process the second character, starting with WB8.
+ */
+
+ rc=wb7bc_done(i, UNICODE_WB_Hebrew_Letter, i->prevclass);
+ i->wb4_cnt=extra_cnt;
+ if (rc == 0)
+ rc=wb4(i);
+ return rc;
+}
+
+static int wb7bc_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
+{
if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)
return result(i, 0); /* WB8 */
- if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
+ if ((prevclass == UNICODE_WB_ALetter ||
+ prevclass == UNICODE_WB_Hebrew_Letter) && cl == UNICODE_WB_Numeric)
return result(i, 0); /* WB9 */
- if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter)
+ if (prevclass == UNICODE_WB_Numeric &&
+ (cl == UNICODE_WB_ALetter || cl == UNICODE_WB_Hebrew_Letter))
return result(i, 0); /* WB10 */
if (prevclass == UNICODE_WB_Numeric &&
- (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet))
+ (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet ||
+ cl == UNICODE_WB_Single_Quote))
{
i->wb4_extra_cnt=0;
SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler);
@@ -370,6 +479,7 @@ static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
switch (prevclass) {
case UNICODE_WB_ALetter:
+ case UNICODE_WB_Hebrew_Letter:
case UNICODE_WB_Numeric:
case UNICODE_WB_Katakana:
case UNICODE_WB_ExtendNumLet:
@@ -380,11 +490,15 @@ static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
if (prevclass == UNICODE_WB_ExtendNumLet)
switch (cl) {
case UNICODE_WB_ALetter:
+ case UNICODE_WB_Hebrew_Letter:
case UNICODE_WB_Numeric:
case UNICODE_WB_Katakana:
return result(i, 0); /* WB13b */
}
+ if (prevclass == UNICODE_WB_Regional_Indicator &&
+ cl == UNICODE_WB_Regional_Indicator)
+ return result(i, 0);
return result(i, 1); /* WB14 */
}