1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
/*
** Copyright 2011-2020 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
#include "unicode_config.h"
#include "courier-unicode.h"
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
/*
** Grapheme break class codes.
**
** unicode_grapheme_break_next() compares these against the class it obtains
** from unicode_tab_lookup() over unicode_classtab, and against the class
** remembered in the state's prev_class field.
**
** UNICODE_GRAPHEMEBREAK_ANY is the value handed to unicode_tab_lookup() as
** its final argument -- presumably the class reported for characters that
** the tables do not list (TODO confirm against unicode_tab_lookup()).
**
** UNICODE_GRAPHEMEBREAK_SOT is the prev_class that
** unicode_grapheme_break_init() stores in a new handle, marking that no
** character has been processed yet.
**
** NOTE(review): these are defined ahead of the graphemebreaktab.h include,
** presumably because that header refers to them -- verify before moving.
*/
#define UNICODE_GRAPHEMEBREAK_ANY 0x00
#define UNICODE_GRAPHEMEBREAK_CR 0x01
#define UNICODE_GRAPHEMEBREAK_LF 0x02
#define UNICODE_GRAPHEMEBREAK_Control 0x03
#define UNICODE_GRAPHEMEBREAK_Extend 0x04
#define UNICODE_GRAPHEMEBREAK_Prepend 0x05
#define UNICODE_GRAPHEMEBREAK_SpacingMark 0x06
#define UNICODE_GRAPHEMEBREAK_L 0x07
#define UNICODE_GRAPHEMEBREAK_V 0x08
#define UNICODE_GRAPHEMEBREAK_T 0x09
#define UNICODE_GRAPHEMEBREAK_LV 0x0A
#define UNICODE_GRAPHEMEBREAK_LVT 0x0B
#define UNICODE_GRAPHEMEBREAK_Regional_Indicator 0x0C
#define UNICODE_GRAPHEMEBREAK_ZWJ 0x0D
#define UNICODE_GRAPHEMEBREAK_SOT 0xFF
#include "graphemebreaktab.h"
/*
** State carried from one unicode_grapheme_break_next() call to the next.
*/
struct unicode_grapheme_break_info_s {
/*
** Break class of the most recently processed character.
** unicode_grapheme_break_init() sets it to UNICODE_GRAPHEMEBREAK_SOT.
*/
uint8_t prev_class;
/*
** Length of the current run of consecutive characters sharing prev_class,
** counting the most recent one. unicode_grapheme_break_next() tests its
** parity when pairing up Regional_Indicator characters.
*/
unsigned prev_count;
};
unicode_grapheme_break_info_t unicode_grapheme_break_init()
{
unicode_grapheme_break_info_t t=(unicode_grapheme_break_info_t)
calloc(1, sizeof(struct unicode_grapheme_break_info_s));
if (!t)
abort();
t->prev_class=UNICODE_GRAPHEMEBREAK_SOT;
return t;
}
/*
** Release a handle obtained from unicode_grapheme_break_init().
**
** t - the handle to free; it must not be used afterwards.
*/
void unicode_grapheme_break_deinit(unicode_grapheme_break_info_t t)
{
	free(t);
}
int unicode_grapheme_break(char32_t a, char32_t b)
{
struct unicode_grapheme_break_info_s s;
memset((char *)&s, 0, sizeof(s));
(void)unicode_grapheme_break_next(&s, a);
return unicode_grapheme_break_next(&s, b);
}
/*
** Feed the next character of a sequence through the grapheme break state.
**
** t - state handle; its prev_class and prev_count fields are updated on
**     every call, before any rule is evaluated.
** b - the character being appended to the sequence.
**
** Returns 1 when a break is reported between the previously supplied
** character and b, 0 when it is not. For a handle fresh from
** unicode_grapheme_break_init() the first call always returns 1.
**
** The GBn tags name the grapheme cluster boundary rule each test stands
** for. The rules are tried in order and the first one that matches decides
** the result.
*/
int unicode_grapheme_break_next(unicode_grapheme_break_info_t t, char32_t b)
{
	const uint8_t left = t->prev_class;
	const uint8_t right =
		unicode_tab_lookup(b,
				   unicode_starting_indextab,
				   unicode_starting_pagetab,
				   sizeof(unicode_starting_indextab)/
				   sizeof(unicode_starting_indextab[0]),
				   unicode_rangetab,
				   sizeof(unicode_rangetab)/
				   sizeof(unicode_rangetab[0]),
				   unicode_classtab,
				   UNICODE_GRAPHEMEBREAK_ANY);

	/*
	** Keep track of how many characters in a row shared one class,
	** counting b itself; a change of class starts a new run of length 1.
	*/
	t->prev_count = (left == right) ? t->prev_count + 1 : 1;
	t->prev_class = right;

	/* Rules decided by what precedes b: start of text and line controls. */
	switch (left) {
	case UNICODE_GRAPHEMEBREAK_SOT:
		return 1; /* GB1, GB2 is implied */
	case UNICODE_GRAPHEMEBREAK_CR:
		if (right == UNICODE_GRAPHEMEBREAK_LF)
			return 0; /* GB3 */
		return 1; /* GB4 */
	case UNICODE_GRAPHEMEBREAK_LF:
	case UNICODE_GRAPHEMEBREAK_Control:
		return 1; /* GB4 */
	default:
		break;
	}

	if (right == UNICODE_GRAPHEMEBREAK_CR ||
	    right == UNICODE_GRAPHEMEBREAK_LF ||
	    right == UNICODE_GRAPHEMEBREAK_Control)
		return 1; /* GB5 */

	/* Hangul syllable sequences. */
	switch (left) {
	case UNICODE_GRAPHEMEBREAK_L:
		if (right == UNICODE_GRAPHEMEBREAK_L ||
		    right == UNICODE_GRAPHEMEBREAK_V ||
		    right == UNICODE_GRAPHEMEBREAK_LV ||
		    right == UNICODE_GRAPHEMEBREAK_LVT)
			return 0; /* GB6 */
		break;
	case UNICODE_GRAPHEMEBREAK_LV:
	case UNICODE_GRAPHEMEBREAK_V:
		if (right == UNICODE_GRAPHEMEBREAK_V ||
		    right == UNICODE_GRAPHEMEBREAK_T)
			return 0; /* GB7 */
		break;
	case UNICODE_GRAPHEMEBREAK_LVT:
	case UNICODE_GRAPHEMEBREAK_T:
		if (right == UNICODE_GRAPHEMEBREAK_T)
			return 0; /* GB8 */
		break;
	default:
		break;
	}

	/* Classes that always attach to whatever comes before them. */
	switch (right) {
	case UNICODE_GRAPHEMEBREAK_Extend:
	case UNICODE_GRAPHEMEBREAK_ZWJ:
		return 0; /* GB9 */
	case UNICODE_GRAPHEMEBREAK_SpacingMark:
		return 0; /* GB9a */
	default:
		break;
	}

	/* Classes that always attach to whatever comes after them. */
	switch (left) {
	case UNICODE_GRAPHEMEBREAK_Prepend:
		return 0; /* GB9b */
	case UNICODE_GRAPHEMEBREAK_Extend:
	case UNICODE_GRAPHEMEBREAK_ZWJ:
		/*
		** NOTE(review): tagged "GB11?" in the original. As written, a
		** break is suppressed after every Extend or ZWJ, which looks
		** broader than UAX #29 GB11 (ExtPict Extend* ZWJ x ExtPict) --
		** confirm whether this is intended.
		*/
		return 0; /* GB11? */
	default:
		break;
	}

	/*
	** Regional indicators are joined in pairs: b stays attached only when
	** it is the second, fourth, ... member of the current run.
	*/
	if (left == UNICODE_GRAPHEMEBREAK_Regional_Indicator &&
	    right == UNICODE_GRAPHEMEBREAK_Regional_Indicator &&
	    (t->prev_count % 2) == 0)
		return 0; /* GB12, GB13 */

	return 1; /* GB999 */
}
|