summaryrefslogtreecommitdiffstats
path: root/unicode/unicode_graphemebreak.c
blob: db6b92b42ff8ab599b2ffcd4df7282e4b1bb11d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/*
** Copyright 2011-2020 Double Precision, Inc.
** See COPYING for distribution information.
**
*/

#include	"unicode_config.h"
#include	"courier-unicode.h"
#include	<unistd.h>
#include	<stdint.h>
#include	<string.h>
#include	<stdlib.h>

/*
** Grapheme cluster break classes.
**
** unicode_grapheme_break_next() compares these codes against the value
** that unicode_tab_lookup() returns for a character. ANY is the value
** handed to the lookup as its last argument; presumably that is the
** fallback for characters the tables do not list.
**
** NOTE(review): the numeric values presumably have to agree with the
** table in graphemebreaktab.h, which is included right after these
** definitions. Confirm before renumbering anything.
*/
#define UNICODE_GRAPHEMEBREAK_ANY		0x00
#define UNICODE_GRAPHEMEBREAK_CR		0x01
#define UNICODE_GRAPHEMEBREAK_LF		0x02
#define UNICODE_GRAPHEMEBREAK_Control		0x03
#define UNICODE_GRAPHEMEBREAK_Extend		0x04
#define UNICODE_GRAPHEMEBREAK_Prepend		0x05
#define UNICODE_GRAPHEMEBREAK_SpacingMark	0x06
#define UNICODE_GRAPHEMEBREAK_L			0x07
#define UNICODE_GRAPHEMEBREAK_V			0x08
#define UNICODE_GRAPHEMEBREAK_T			0x09
#define UNICODE_GRAPHEMEBREAK_LV		0x0A
#define UNICODE_GRAPHEMEBREAK_LVT		0x0B
#define UNICODE_GRAPHEMEBREAK_Regional_Indicator 0x0C

#define UNICODE_GRAPHEMEBREAK_ZWJ		0x0D

/*
** Not a character class: the "previous class" stored by
** unicode_grapheme_break_init(), meaning no character has been seen yet
** (start of text).
*/
#define UNICODE_GRAPHEMEBREAK_SOT		0xFF

#include "graphemebreaktab.h"

/*
** State carried from one unicode_grapheme_break_next() call to the next.
*/
struct unicode_grapheme_break_info_s {
	/*
	** Grapheme break class of the most recently processed character;
	** UNICODE_GRAPHEMEBREAK_SOT right after unicode_grapheme_break_init().
	*/
	uint8_t prev_class;
	/*
	** Number of consecutive characters, ending with the most recent one,
	** that share prev_class. Its parity is used to pair up regional
	** indicators.
	*/
	unsigned prev_count;
};

unicode_grapheme_break_info_t unicode_grapheme_break_init()
{
	unicode_grapheme_break_info_t t=(unicode_grapheme_break_info_t)
		calloc(1, sizeof(struct unicode_grapheme_break_info_s));

	if (!t)
		abort();

	t->prev_class=UNICODE_GRAPHEMEBREAK_SOT;

	return t;
}

/*
** Release a state handle obtained from unicode_grapheme_break_init().
**
** The handle is passed straight to free(), so it must not be used
** afterwards.
*/
void unicode_grapheme_break_deinit(unicode_grapheme_break_info_t t)
{
	free(t);
}

int unicode_grapheme_break(char32_t a, char32_t b)
{
	struct unicode_grapheme_break_info_s s;

	memset((char *)&s, 0, sizeof(s));

	(void)unicode_grapheme_break_next(&s, a);

	return unicode_grapheme_break_next(&s, b);
}

/*
** Feed the next character of a text to the grapheme break state machine.
**
** t - state handle. It is updated to remember b's grapheme break class
**     and the length of the current run of same-class characters.
** b - the next character of the text.
**
** Returns 1 if there is a grapheme cluster break between the previously
** fed character and b, 0 if there is not. When the stored previous class
** is UNICODE_GRAPHEMEBREAK_SOT (a freshly initialized handle) the result
** is always 1.
**
** The GBn labels below name the grapheme cluster boundary rules of
** UAX #29. The rules are tested in priority order; do not reorder them.
*/
int unicode_grapheme_break_next(unicode_grapheme_break_info_t t, char32_t b)
{
	uint8_t ac=t->prev_class;
	uint8_t bc=unicode_tab_lookup(b, unicode_indextab,
			 sizeof(unicode_indextab)/sizeof(unicode_indextab[0]),
			 unicode_rangetab,
			 unicode_classtab,
			 UNICODE_GRAPHEMEBREAK_ANY);

	/*
	** Count how many consecutive characters, b included, share b's
	** class. Only the parity is used, for GB12/GB13 below. The state is
	** updated before any rule is evaluated, so every return path leaves
	** it consistent.
	*/
	if (ac != bc)
		t->prev_count=0;
	++t->prev_count;

	t->prev_class=bc;

	if (ac == UNICODE_GRAPHEMEBREAK_SOT)
		return 1; /* GB1, GB2 is implied */

	if (ac == UNICODE_GRAPHEMEBREAK_CR && bc == UNICODE_GRAPHEMEBREAK_LF)
		return 0; /* GB3 */


	switch (ac) {
	case UNICODE_GRAPHEMEBREAK_CR:
	case UNICODE_GRAPHEMEBREAK_LF:
	case UNICODE_GRAPHEMEBREAK_Control:
		return 1; /* GB4 */
	default:
		break;
	}

	switch (bc) {
	case UNICODE_GRAPHEMEBREAK_CR:
	case UNICODE_GRAPHEMEBREAK_LF:
	case UNICODE_GRAPHEMEBREAK_Control:
		return 1; /* GB5 */
	default:
		break;
	}

	/* Hangul syllable sequences: GB6, GB7, GB8 */

	if (ac == UNICODE_GRAPHEMEBREAK_L)
		switch (bc) {
		case UNICODE_GRAPHEMEBREAK_L:
		case UNICODE_GRAPHEMEBREAK_V:
		case UNICODE_GRAPHEMEBREAK_LV:
		case UNICODE_GRAPHEMEBREAK_LVT:
			return 0; /* GB6 */
		default:
			break;
		}

	if ((ac == UNICODE_GRAPHEMEBREAK_LV ||
	     ac == UNICODE_GRAPHEMEBREAK_V) &&
	    (bc == UNICODE_GRAPHEMEBREAK_V ||
	     bc == UNICODE_GRAPHEMEBREAK_T))
		return 0; /* GB7 */

	if ((ac == UNICODE_GRAPHEMEBREAK_LVT ||
	     ac == UNICODE_GRAPHEMEBREAK_T) &&
	    bc == UNICODE_GRAPHEMEBREAK_T)
		return 0; /* GB8 */

	if (bc == UNICODE_GRAPHEMEBREAK_Extend ||
	    bc == UNICODE_GRAPHEMEBREAK_ZWJ)
		return 0; /* GB9 */

	if (bc == UNICODE_GRAPHEMEBREAK_SpacingMark)
		return 0; /* GB9a */

	if (ac == UNICODE_GRAPHEMEBREAK_Prepend)
		return 0; /* GB9b */

	/*
	** GB11 is "ExtPict Extend* ZWJ x ExtPict". None of the classes
	** defined in this file identifies Extended_Pictographic characters,
	** so the rule is approximated by never breaking right after a ZWJ.
	**
	** A preceding Extend must not suppress the break by itself: an
	** Extend or ZWJ that follows an Extend was already joined by GB9
	** above, and anything else that follows an Extend (for example a
	** letter after a combining accent) is a boundary per GB999.
	*/
	if (ac == UNICODE_GRAPHEMEBREAK_ZWJ)
		return 0; /* GB11, approximated */

	/*
	** Regional indicators pair up: an even run length means that b
	** completes a pair, an odd one means that b starts a new pair.
	*/
	if (ac == UNICODE_GRAPHEMEBREAK_Regional_Indicator &&
	    bc == UNICODE_GRAPHEMEBREAK_Regional_Indicator &&
	    (t->prev_count % 2) == 0)
		return 0; /* GB12, GB13 */

	return 1; /* GB999 */
}