summaryrefslogtreecommitdiffstats
path: root/unicode/unicode_normalization.c
diff options
context:
space:
mode:
Diffstat (limited to 'unicode/unicode_normalization.c')
-rw-r--r--unicode/unicode_normalization.c205
1 files changed, 130 insertions, 75 deletions
diff --git a/unicode/unicode_normalization.c b/unicode/unicode_normalization.c
index ea9e256..93e691f 100644
--- a/unicode/unicode_normalization.c
+++ b/unicode/unicode_normalization.c
@@ -51,7 +51,7 @@ static int unicode_nfkc_qc(char32_t ch)
** Lookup a character's canonical combining class.
*/
-static uint8_t unicode_ccc(char32_t ch)
+uint8_t unicode_ccc(char32_t ch)
{
return unicode_tab_lookup(ch,
ccc_starting_indextab,
@@ -113,8 +113,8 @@ unicode_canonical_t unicode_canonical(char32_t ch)
** decomposed.
*/
-static void search_for_decompose(struct unicode_decompose_info *info,
- void (*f)(struct unicode_decompose_info *,
+static void search_for_decompose(unicode_decomposition_t *info,
+ void (*f)(unicode_decomposition_t *,
size_t,
const struct decomposition_info *,
void *),
@@ -182,7 +182,7 @@ struct decompose_meta {
/* Pass 1: count the number of characters to decompose. */
-static void decompose_meta_count(struct unicode_decompose_info *info,
+static void decompose_meta_count(unicode_decomposition_t *info,
size_t i,
const struct decomposition_info *cinfo,
void *arg)
@@ -194,7 +194,7 @@ static void decompose_meta_count(struct unicode_decompose_info *info,
/* Pass 2: compile a list of characters to decompose. */
-static void decompose_meta_save(struct unicode_decompose_info *info,
+static void decompose_meta_save(unicode_decomposition_t *info,
size_t i,
const struct decomposition_info *cinfo,
void *arg)
@@ -208,7 +208,7 @@ static void decompose_meta_save(struct unicode_decompose_info *info,
++ptr->nchars;
}
-size_t unicode_decompose_reallocate_size(struct unicode_decompose_info *info,
+size_t unicode_decompose_reallocate_size(unicode_decomposition_t *info,
const size_t *sizes,
size_t n)
{
@@ -221,7 +221,7 @@ size_t unicode_decompose_reallocate_size(struct unicode_decompose_info *info,
return new_size;
}
-static int unicode_decompose_reallocate(struct unicode_decompose_info *info,
+static int unicode_decompose_reallocate(unicode_decomposition_t *info,
const size_t *offsets,
const size_t *sizes,
size_t n)
@@ -240,10 +240,10 @@ static int unicode_decompose_reallocate(struct unicode_decompose_info *info,
return 0;
}
-void unicode_decompose_info_init(struct unicode_decompose_info *info,
- char32_t *string,
- size_t string_size,
- void *arg)
+void unicode_decomposition_init(unicode_decomposition_t *info,
+ char32_t *string,
+ size_t string_size,
+ void *arg)
{
memset(info, 0, sizeof(*info));
@@ -259,11 +259,11 @@ void unicode_decompose_info_init(struct unicode_decompose_info *info,
info->arg=arg;
}
-void unicode_decompose_info_deinit(struct unicode_decompose_info *info)
+void unicode_decomposition_deinit(unicode_decomposition_t *info)
{
}
-int unicode_decompose(struct unicode_decompose_info *info)
+int unicode_decompose(unicode_decomposition_t *info)
{
int replaced;
int rc=0;
@@ -450,6 +450,14 @@ static char32_t lookup_composition(char32_t a, char32_t b)
return 0;
}
+/* Temporary linked list, until all compositions get built. */
+
+struct unicode_compose_info_list {
+ struct unicode_compose_info_list *next;
+ struct unicode_compose_info *info;
+};
+
+
/*
** Collect consecutive sequence of composable characters. We cache each
** character's composition level.
@@ -525,15 +533,15 @@ static int unicode_composition_init2(const char32_t *string,
size_t string_size,
int flags,
struct chars_and_levels *clptr,
- struct unicode_compositions ***tail_ptr);
+ struct unicode_compose_info_list ***tail_ptr);
int unicode_composition_init(const char32_t *string,
size_t string_size,
int flags,
- struct unicode_compositions **ret)
+ unicode_composition_t *info)
{
/*
- ** Initialize a singly-linked unicode_compositions_list.
+ ** Initialize a singly-linked unicode_compose_info_list_list.
**
** Initialize the tail pointer. We'll be adding onto the tail pointer
** as we find each composition.
@@ -541,11 +549,14 @@ int unicode_composition_init(const char32_t *string,
** Initialize the chars_and_levels buffer.
*/
- struct unicode_compositions *list=NULL;
- struct unicode_compositions **tail=&list;
+ struct unicode_compose_info_list *list=NULL;
+ struct unicode_compose_info_list **tail=&list;
struct chars_and_levels cl;
int c;
+ info->n_compositions=0;
+ info->compositions=0;
+
if (chars_and_levels_init(&cl))
return -1;
@@ -561,13 +572,51 @@ int unicode_composition_init(const char32_t *string,
&cl, &tail);
chars_and_levels_deinit(&cl);
+ if (c == 0)
+ {
+ struct unicode_compose_info_list *ptr;
+
+ info->n_compositions=0;
+
+ for (ptr=list; ptr; ptr=ptr->next)
+ ++info->n_compositions;
+
+ if ((info->compositions=(struct unicode_compose_info **)
+ malloc(sizeof(struct unicode_composition_info *)
+ * (info->n_compositions+1))) == NULL)
+ {
+ c= -1;
+ info->n_compositions=0;
+ }
+ }
+
+ if (c == 0)
+ {
+ struct unicode_compose_info_list *ptr;
+ size_t i=0;
+
+ while (list)
+ {
+ ptr=list->next;
+ info->compositions[i++]=list->info;
+ free(list);
+ list=ptr;
+ }
+ info->compositions[i]=NULL;
+ }
+
if (c)
{
- unicode_composition_deinit(list);
- list=NULL;
+ while (list)
+ {
+ struct unicode_compose_info_list *next=list->next;
+
+ free(list->info);
+ free(list);
+ list=next;
+ }
}
- *ret=list;
return c;
}
@@ -575,22 +624,22 @@ static int compose_chars_and_levels(const char32_t *starterptr,
size_t starter_index,
int flags,
struct chars_and_levels *clptr,
- struct unicode_compositions
+ struct unicode_compose_info_list
**last_compositionptr,
- struct unicode_compositions ***tail_ptr);
+ struct unicode_compose_info_list ***tail_ptr);
static int create_new_composition(size_t starter_index,
size_t n_combining_marks,
- struct unicode_compositions **ptr);
+ struct unicode_compose_info_list **ptr);
static int unicode_composition_init2(const char32_t *string,
size_t string_size,
int flags,
struct chars_and_levels *clptr,
- struct unicode_compositions ***tail_ptr)
+ struct unicode_compose_info_list ***tail_ptr)
{
size_t i;
- struct unicode_compositions *last_composition=NULL;
+ struct unicode_compose_info_list *last_composition=NULL;
/*
** Here we consecutively scan the string and look up each character's
@@ -629,13 +678,13 @@ static int unicode_composition_init2(const char32_t *string,
if (starterptr &&
/* Did we just compose this starter? */
last_composition &&
- last_composition->index == starter_index &&
+ last_composition->info->index == starter_index &&
/*
** Did we compose everything, didn't leave
** any combined marks behind?
*/
- last_composition->n_composition == 1)
+ last_composition->info->n_composition == 1)
{
/*
** So, check if we can combine with that
@@ -643,7 +692,7 @@ static int unicode_composition_init2(const char32_t *string,
** original starter, the new one is here.
*/
new_char=lookup_composition
- (last_composition->composition[0],
+ (last_composition->info->composition[0],
string[i]);
if (new_char != 0)
@@ -651,7 +700,7 @@ static int unicode_composition_init2(const char32_t *string,
/*
** Just update the composed char.
*/
- last_composition->composition[0]=
+ last_composition->info->composition[0]=
new_char;
/*
@@ -659,7 +708,7 @@ static int unicode_composition_init2(const char32_t *string,
** This nukes this starter, as if
** it was a part of the composition!
*/
- ++last_composition->n_composed;
+ ++last_composition->info->n_composed;
continue;
}
}
@@ -679,7 +728,7 @@ static int unicode_composition_init2(const char32_t *string,
** from two starters here.
*/
- struct unicode_compositions *new_composition;
+ struct unicode_compose_info_list *new_composition;
if (create_new_composition(starter_index,
1, &new_composition))
@@ -689,9 +738,9 @@ static int unicode_composition_init2(const char32_t *string,
**tail_ptr=new_composition;
*tail_ptr= &new_composition->next;
- new_composition->n_composed=2;
- new_composition->n_composition=1;
- new_composition->composition[0]=new_char;
+ new_composition->info->n_composed=2;
+ new_composition->info->n_composition=1;
+ new_composition->info->composition[0]=new_char;
continue;
}
/*
@@ -739,26 +788,30 @@ static int compare_levels(const void *a, const void *b)
static int create_new_composition(size_t starter_index,
size_t n_combining_marks,
- struct unicode_compositions **ptr)
+ struct unicode_compose_info_list **ptr)
{
- struct unicode_compositions *c=
- (struct unicode_compositions *)
- malloc(sizeof(struct unicode_compositions));
+ struct unicode_compose_info_list *c=
+ (struct unicode_compose_info_list *)
+ malloc(sizeof(struct unicode_compose_info_list));
if (!c)
return -1;
- c->index=starter_index;
- c->next=NULL;
+ c->info=malloc(sizeof(struct unicode_compose_info)+
+ sizeof(char32_t) * n_combining_marks);
- /* Worst case: nothing is composed */
-
- if ((c->composition=malloc(sizeof(char32_t) *
- n_combining_marks)) == NULL)
+ if (!c->info)
{
free(c);
return -1;
}
+
+ c->info->index=starter_index;
+ c->info->composition=(char32_t *)(c->info+1);
+ c->next=NULL;
+
+ /* Worst case: nothing is composed */
+
*ptr=c;
return 0;
}
@@ -767,11 +820,11 @@ static int compose_chars_and_levels(const char32_t *starterptr,
size_t starter_index,
int flags,
struct chars_and_levels *clptr,
- struct unicode_compositions
+ struct unicode_compose_info_list
**last_compositionptr,
- struct unicode_compositions ***tail_ptr)
+ struct unicode_compose_info_list ***tail_ptr)
{
- struct unicode_compositions *new_composition;
+ struct unicode_compose_info_list *new_composition;
char32_t starter=0;
size_t i;
int composed;
@@ -826,9 +879,9 @@ static int compose_chars_and_levels(const char32_t *starterptr,
{
size_t j;
- new_composition->n_composed=clptr->size+1;
+ new_composition->info->n_composed=clptr->size+1;
- new_composition->composition[0]=starter;
+ new_composition->info->composition[0]=starter;
i=1;
if (!(flags & UNICODE_COMPOSE_FLAG_REMOVEUNUSED))
@@ -841,12 +894,12 @@ static int compose_chars_and_levels(const char32_t *starterptr,
*/
if (clptr->ptr[j].level)
{
- new_composition->composition[i++]=
+ new_composition->info->composition[i++]=
clptr->ptr[j].ch;
}
}
}
- new_composition->n_composition=i;
+ new_composition->info->n_composition=i;
} else if (!starterptr && (flags & UNICODE_COMPOSE_FLAG_REMOVEUNUSED))
{
/*
@@ -855,8 +908,8 @@ static int compose_chars_and_levels(const char32_t *starterptr,
** new_composition.
*/
- new_composition->n_composed=clptr->size;
- new_composition->n_composition=0;
+ new_composition->info->n_composed=clptr->size;
+ new_composition->info->n_composition=0;
composed=1;
}
@@ -868,7 +921,7 @@ static int compose_chars_and_levels(const char32_t *starterptr,
}
else
{
- free(new_composition->composition);
+ free(new_composition->info);
free(new_composition);
new_composition=NULL;
}
@@ -877,37 +930,39 @@ static int compose_chars_and_levels(const char32_t *starterptr,
return 0;
}
-void unicode_composition_deinit(struct unicode_compositions *ptr)
+void unicode_composition_deinit(unicode_composition_t *info)
{
- while (ptr)
- {
- struct unicode_compositions *next=ptr->next;
+ size_t i;
- if (ptr->composition)
- free(ptr->composition);
- free(ptr);
- ptr=next;
- }
+ for (i=0; i<info->n_compositions; ++i)
+ free(info->compositions[i]);
+
+ if (info->compositions)
+ free(info->compositions);
+ info->compositions=0;
+ info->n_compositions=0;
}
size_t unicode_composition_apply(char32_t *string,
size_t string_size,
- struct unicode_compositions *compositions)
+ unicode_composition_t *info)
{
size_t j=0;
size_t i;
+ size_t c_index=0;
for (i=0; i<string_size; )
{
- if (compositions && compositions->index == i)
+ if (c_index < info->n_compositions &&
+ info->compositions[c_index]->index == i)
{
size_t k;
+ struct unicode_compose_info *compose=
+ info->compositions[c_index++];
- for (k=0; k<compositions->n_composition; ++k)
- string[j++]=compositions->composition[k];
- i += compositions->n_composed;
-
- compositions=compositions->next;
+ for (k=0; k<compose->n_composition; ++k)
+ string[j++]=compose->composition[k];
+ i += compose->n_composed;
}
else
{
@@ -925,14 +980,14 @@ int unicode_compose(char32_t *string,
int flags,
size_t *new_size)
{
- struct unicode_compositions *composes;
+ unicode_composition_t info;
- if (unicode_composition_init(string, string_size, flags, &composes))
+ if (unicode_composition_init(string, string_size, flags, &info))
return -1;
- *new_size=unicode_composition_apply(string, string_size, composes);
+ *new_size=unicode_composition_apply(string, string_size, &info);
- unicode_composition_deinit(composes);
+ unicode_composition_deinit(&info);
return 0;
}