summaryrefslogtreecommitdiffstats
path: root/unicode/unicode_normalization.c
diff options
context:
space:
mode:
authorSam Varshavchik2021-03-07 17:46:01 -0500
committerSam Varshavchik2021-03-10 22:52:34 -0500
commit18fc31347b80597f4100f96c86799fe130786781 (patch)
tree08b641332ec55232a34d3656d6435559a847fcce /unicode/unicode_normalization.c
parent92bcce9b28d5d123af67ff0201cd97508af21326 (diff)
downloadcourier-libs-18fc31347b80597f4100f96c86799fe130786781.tar.bz2
courier-unicode: canonical decomposition and composition implementation.
Move unicode_canonical() into a larger library of functions that handles full Unicode decomposition and composition.
Diffstat (limited to 'unicode/unicode_normalization.c')
-rw-r--r--unicode/unicode_normalization.c938
1 files changed, 938 insertions, 0 deletions
diff --git a/unicode/unicode_normalization.c b/unicode/unicode_normalization.c
new file mode 100644
index 0000000..ea9e256
--- /dev/null
+++ b/unicode/unicode_normalization.c
@@ -0,0 +1,938 @@
+/*
+** Copyright 2021 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include "unicode_config.h"
+#include "courier-unicode.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "normalization_defs.h"
+#include "normalization.h"
+
+/*
+** Lookup NFC quick check.
+*/
+static int unicode_nfc_qc(char32_t ch)
+{
+ return unicode_tab_lookup(ch,
+ nfc_qc_starting_indextab,
+ nfc_qc_starting_pagetab,
+ sizeof(nfc_qc_starting_indextab)/
+ sizeof(nfc_qc_starting_indextab[0]),
+ nfc_qc_rangetab,
+ sizeof(nfc_qc_rangetab)/
+ sizeof(nfc_qc_rangetab[0]),
+ nfc_qc_classtab,
+ UNICODE_NFC_QC_Y);
+}
+
+/*
+** Lookup nfkc quick check.
+*/
+static int unicode_nfkc_qc(char32_t ch)
+{
+ return unicode_tab_lookup(ch,
+ nfkc_qc_starting_indextab,
+ nfkc_qc_starting_pagetab,
+ sizeof(nfkc_qc_starting_indextab)/
+ sizeof(nfkc_qc_starting_indextab[0]),
+ nfkc_qc_rangetab,
+ sizeof(nfkc_qc_rangetab)/
+ sizeof(nfkc_qc_rangetab[0]),
+ nfkc_qc_classtab,
+ UNICODE_NFKC_QC_Y);
+}
+
+/*
+** Lookup a character's canonical combining class.
+*/
+
+static uint8_t unicode_ccc(char32_t ch)
+{
+ return unicode_tab_lookup(ch,
+ ccc_starting_indextab,
+ ccc_starting_pagetab,
+ sizeof(ccc_starting_indextab)/
+ sizeof(ccc_starting_indextab[0]),
+ ccc_rangetab,
+ sizeof(ccc_rangetab)/
+ sizeof(ccc_rangetab[0]),
+ ccc_classtab,
+ 0);
+}
+
+/*
+** Look up a character's decomposition
+*/
+
+static const struct decomposition_info *decomp_lookup_search(char32_t ch)
+{
+ size_t hash;
+ size_t i;
+
+ if (ch == 0)
+ return NULL;
+
+ hash=(size_t)(ch % (sizeof(decomp_lookup)
+ /sizeof(decomp_lookup[0])));
+
+
+ for (i=0; i<sizeof(decomp_lookup[hash])/sizeof(decomp_lookup[hash][0]);
+ ++i)
+ {
+ if (decomp_lookup[hash][i].ch == ch)
+ return &decomp_lookup[hash][i];
+ }
+ return NULL;
+}
+
+unicode_canonical_t unicode_canonical(char32_t ch)
+{
+ const struct decomposition_info *info=decomp_lookup_search(ch);
+ unicode_canonical_t ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (info)
+ {
+ ret.format=info->decomp_type;
+ ret.n_canonical_chars=info->decomp_size;
+ ret.canonical_chars=decompositions+info->decomp_index;
+ }
+ return ret;
+}
+
+/*
+** Scan through a string, finding each character to decompose.
+**
+** This invokes a provided callback, for each character that's getting
+** decomposed.
+*/
+
+static void search_for_decompose(struct unicode_decompose_info *info,
+ void (*f)(struct unicode_decompose_info *,
+ size_t,
+ const struct decomposition_info *,
+ void *),
+ void *arg)
+{
+ size_t i;
+
+ for (i=0; i<info->string_size; ++i)
+ {
+ const struct decomposition_info *comp_info;
+
+ /*
+ ** Skip over characters that have a quick check flag set.
+ */
+ if (info->decompose_flags & UNICODE_DECOMPOSE_FLAG_QC)
+ {
+ if (info->decompose_flags &
+ UNICODE_DECOMPOSE_FLAG_COMPAT
+ ? unicode_nfkc_qc(info->string[i])
+ == UNICODE_NFKC_QC_Y
+ : unicode_nfc_qc(info->string[i])
+ == UNICODE_NFC_QC_Y)
+ continue;
+ }
+
+ comp_info=decomp_lookup_search(info->string[i]);
+ if (comp_info &&
+ /* Should be the case: */
+ comp_info->decomp_size > 0 &&
+
+ /*
+ ** Pick only canonical decompositions, unless we're
+ ** doing a compatibility decomposition.
+ */
+ (comp_info->decomp_type == UNICODE_CANONICAL_FMT_NONE ||
+ info->decompose_flags &
+ UNICODE_DECOMPOSE_FLAG_COMPAT))
+ (*f)(info, i, comp_info, arg);
+ }
+}
+
+/*
+** Metadata we store about the decomposition.
+*/
+
+struct decompose_meta {
+
+ /* Number of characters, in total, that we are decomposing. */
+ size_t nchars;
+
+ /* And their indices in the original string. */
+ size_t *indexes;
+
+ /*
+ ** For each character how many we're adding, in addition to the
+ ** character being decomposed. This is 0 for a 1:1 decomposition.
+ **/
+ size_t *sizes;
+
+ /*
+ ** And the information about each individual decomposition.
+ */
+ const struct decomposition_info **infos;
+};
+
+/* Pass 1: count the number of characters to decompose. */
+
+static void decompose_meta_count(struct unicode_decompose_info *info,
+ size_t i,
+ const struct decomposition_info *cinfo,
+ void *arg)
+{
+ struct decompose_meta *ptr=(struct decompose_meta *)arg;
+
+ ++ptr->nchars;
+}
+
+/* Pass 2: compile a list of characters to decompose. */
+
+static void decompose_meta_save(struct unicode_decompose_info *info,
+ size_t i,
+ const struct decomposition_info *cinfo,
+ void *arg)
+{
+ struct decompose_meta *ptr=(struct decompose_meta *)arg;
+
+ ptr->indexes[ptr->nchars]=i;
+ ptr->sizes[ptr->nchars]=cinfo->decomp_size-1;
+ ptr->infos[ptr->nchars]=cinfo;
+
+ ++ptr->nchars;
+}
+
+size_t unicode_decompose_reallocate_size(struct unicode_decompose_info *info,
+ const size_t *sizes,
+ size_t n)
+{
+ size_t i;
+ size_t new_size=info->string_size;
+
+ for (i=0; i<n; ++i)
+ new_size += sizes[i];
+
+ return new_size;
+}
+
+static int unicode_decompose_reallocate(struct unicode_decompose_info *info,
+ const size_t *offsets,
+ const size_t *sizes,
+ size_t n)
+{
+ size_t new_size=unicode_decompose_reallocate_size(info, sizes, n);
+ char32_t *str=(char32_t *)realloc(info->string,
+ (new_size+1) * sizeof(char32_t));
+
+ if (str == NULL)
+ return -1;
+
+ info->string=str;
+ str[new_size]=0;
+ /* If the original was null-terminated, this one will be too,
+ ** we realloc-ed an extra char32_t */
+ return 0;
+}
+
+void unicode_decompose_info_init(struct unicode_decompose_info *info,
+ char32_t *string,
+ size_t string_size,
+ void *arg)
+{
+ memset(info, 0, sizeof(*info));
+
+ info->string=string;
+ if (string && string_size == (size_t)-1)
+ {
+ for (string_size=0; string[string_size]; ++string_size)
+ ;
+ }
+
+ info->reallocate=unicode_decompose_reallocate;
+ info->string_size=string_size;
+ info->arg=arg;
+}
+
+void unicode_decompose_info_deinit(struct unicode_decompose_info *info)
+{
+}
+
+int unicode_decompose(struct unicode_decompose_info *info)
+{
+ int replaced;
+ int rc=0;
+
+ do
+ {
+ struct decompose_meta meta;
+ char32_t *string_end, *string_put;
+ size_t next_insert_pos;
+ size_t i;
+ size_t old_pos;
+
+ replaced=0;
+
+ /*
+ ** Make two passes over the string, on the first pass count
+ ** the decompositions.
+ */
+
+ meta.nchars=0;
+ search_for_decompose(info, decompose_meta_count, &meta);
+
+ if (!meta.nchars)
+ continue; /* We're done */
+
+ /* We'll want to make another pass */
+ replaced=1;
+
+ /*
+ ** We have two size_t pointers to allocate dynamically,
+ ** allocate both of them together.
+ */
+ meta.indexes=(size_t *)malloc(meta.nchars * 2 * sizeof(size_t));
+ if (!meta.indexes)
+ {
+ rc= -1;
+ break;
+ }
+
+ meta.infos=(const struct decomposition_info **)
+ malloc(sizeof(const struct decomposition_info *)
+ *meta.nchars);
+
+ if (!meta.infos)
+ {
+ free(meta.indexes);
+ rc= -1;
+ break;
+ }
+
+ /*
+ ** And here's the second size_t * that we dynamically allocated
+ */
+ meta.sizes=meta.indexes+meta.nchars;
+
+ /* And now make a second pass to load the decompositions */
+ meta.nchars=0;
+
+ search_for_decompose(info, decompose_meta_save, &meta);
+
+ rc=(*info->reallocate)(info,
+ meta.indexes, meta.sizes,
+ meta.nchars);
+
+ if (rc)
+ {
+ /* Error, ensure the code below does nothing */
+ meta.nchars=0;
+ replaced=0;
+ }
+
+ /*
+ ** Insert decompositions. We make a pass from the end of
+ ** the string to the beginning of it. We already know what
+ ** all the decompositions are, on this pass.
+ **
+ ** We'll be copying the characters from the existing string
+ ** from the end of the string to its beginning, starting with
+ ** the new ending position.
+ **
+ ** Each time we reach a position where the decomposition
+ ** occurs, we inject it. In this manner we'll be processing
+ ** each decomposition starting with the last one in the
+ ** string, going back to the first one.
+ **
+ ** So, the next_insert_pos that we will do is the
+ ** last one:
+ */
+
+ next_insert_pos=meta.nchars;
+
+ /*
+ ** We start shifting over from the end of the string up.
+ ** Here's the current end of the string.
+ */
+ string_end=info->string+info->string_size;
+ old_pos=info->string_size;
+
+ /*
+ ** And compute where we're putting all the characters
+ ** by adding up all the sizes.
+ */
+
+ string_put=string_end;
+ for (i=0; i<meta.nchars; ++i)
+ {
+ /* While we're here we'll adjust string_size too... */
+ info->string_size += meta.sizes[i];
+ string_put += meta.sizes[i];
+ }
+
+ /* We only need to keep going until we finished shifting. */
+
+ while (next_insert_pos)
+ {
+ const struct decomposition_info *decompose=NULL;
+
+ if (--old_pos == meta.indexes[next_insert_pos-1])
+ {
+ /*
+ ** Reach the next (previous, really) insert
+ ** position. We only adjust the pointer here,
+ ** we'll be copying the decomposition below.
+ ** Here we're just quietly skipping ahead by
+ ** the number of new characters in the
+ ** decomposition.
+ */
+ --next_insert_pos;
+ string_put -= meta.sizes[next_insert_pos];
+ decompose=meta.infos[next_insert_pos];
+ }
+ *--string_put=*--string_end;
+
+ if (decompose)
+ {
+ /* The decomposition goes here */
+ memcpy(string_put,
+ decompositions + decompose->decomp_index,
+ decompose->decomp_size
+ * sizeof(char32_t));
+ }
+ }
+
+ free(meta.indexes);
+ free(meta.infos);
+
+ /*
+ ** Make another pass, perhaps.
+ */
+ } while (replaced);
+
+ return rc;
+}
+
+/*
+** Canonical compositions are all two characters > one character.
+*/
+
+static char32_t lookup_composition(char32_t a, char32_t b)
+{
+ size_t i, j;
+
+ i=((size_t)a * canonical_mult1
+ + (size_t)b * canonical_mult2)
+ % (sizeof(canonical_compositions_lookup) /
+ sizeof(canonical_compositions_lookup[0]));
+
+ j= i+1 < (sizeof(canonical_compositions_lookup) /
+ sizeof(canonical_compositions_lookup[0]))
+ ? canonical_compositions_lookup[i+1]
+ : sizeof(canonical_compositions)/
+ sizeof(canonical_compositions[0]);
+ i=canonical_compositions_lookup[i];
+
+ while (i<j)
+ {
+ if (canonical_compositions[i][0] == a &&
+ canonical_compositions[i][1] == b)
+ {
+ return canonical_compositions[i][2];
+ }
+ ++i;
+ }
+ return 0;
+}
+
+/*
+** Collect consecutive sequence of composable characters. We cache each
+** character's composition level.
+*/
+
+struct char_and_level {
+ char32_t ch; /* The character */
+ size_t index; /* Its position in the original string */
+ uint8_t level; /* Its combining level */
+};
+
+/*
+** A growing buffer of consecutive composition characters
+*/
+
+struct chars_and_levels {
+ struct char_and_level *ptr;
+ size_t size;
+ size_t reserved;
+};
+
+/*
+** Initialize this buffer.
+*/
+static int chars_and_levels_init(struct chars_and_levels *p)
+{
+ p->ptr=malloc( sizeof(struct char_and_level)*(p->reserved=1) );
+ if (!p->ptr)
+ return -1;
+ p->size=0;
+
+ return 0;
+}
+
+/*
+** Add char+level to the buffer, growing it if needed.
+*/
+
+static int add_char_and_level(struct chars_and_levels *p, char32_t ch,
+ size_t index,
+ uint8_t level)
+{
+ if (p->reserved <= p->size)
+ {
+ size_t n=p->reserved * 2;
+
+ struct char_and_level *ptr=
+ realloc(p->ptr, sizeof(struct char_and_level)*n);
+
+ if (!ptr)
+ return -1;
+ p->reserved=n;
+ p->ptr=ptr;
+ }
+ p->ptr[p->size].ch=ch;
+ p->ptr[p->size].index=index;
+ p->ptr[p->size].level=level;
+ ++p->size;
+ return 0;
+}
+
+/*
+** Deallocate the buffer.
+*/
+
+static void chars_and_levels_deinit(struct chars_and_levels *p)
+{
+ if (p->ptr)
+ free(p->ptr);
+}
+
+static int unicode_composition_init2(const char32_t *string,
+ size_t string_size,
+ int flags,
+ struct chars_and_levels *clptr,
+ struct unicode_compositions ***tail_ptr);
+
+int unicode_composition_init(const char32_t *string,
+ size_t string_size,
+ int flags,
+ struct unicode_compositions **ret)
+{
+ /*
+ ** Initialize a singly-linked unicode_compositions_list.
+ **
+ ** Initialize the tail pointer. We'll be adding onto the tail pointer
+ ** as we find each composition.
+ **
+ ** Initialize the chars_and_levels buffer.
+ */
+
+ struct unicode_compositions *list=NULL;
+ struct unicode_compositions **tail=&list;
+ struct chars_and_levels cl;
+ int c;
+
+ if (chars_and_levels_init(&cl))
+ return -1;
+
+ /*
+ ** Call unicode_composition_init2 to do all the work.
+ **
+ ** When it returns we can deinit the chars_and_levels buffer.
+ **
+ ** If it fails we can also deinitialize the linked list, and
+ ** return a NULL pointer.
+ */
+ c=unicode_composition_init2(string, string_size, flags,
+ &cl, &tail);
+ chars_and_levels_deinit(&cl);
+
+ if (c)
+ {
+ unicode_composition_deinit(list);
+ list=NULL;
+ }
+
+ *ret=list;
+ return c;
+}
+
+static int compose_chars_and_levels(const char32_t *starterptr,
+ size_t starter_index,
+ int flags,
+ struct chars_and_levels *clptr,
+ struct unicode_compositions
+ **last_compositionptr,
+ struct unicode_compositions ***tail_ptr);
+
+static int create_new_composition(size_t starter_index,
+ size_t n_combining_marks,
+ struct unicode_compositions **ptr);
+
+static int unicode_composition_init2(const char32_t *string,
+ size_t string_size,
+ int flags,
+ struct chars_and_levels *clptr,
+ struct unicode_compositions ***tail_ptr)
+{
+ size_t i;
+ struct unicode_compositions *last_composition=NULL;
+
+ /*
+ ** Here we consecutively scan the string and look up each character's
+ ** composition level.
+ **
+ ** Each time we get a composition level of 0 we update starterptr to
+ ** point to the starter character, and save its index here.
+ */
+
+ const char32_t *starterptr=NULL;
+ size_t starter_index=0;
+ for (i=0; i<string_size; ++i)
+ {
+ uint8_t ccc=unicode_ccc(string[i]);
+ char32_t new_char;
+
+ if (ccc == 0)
+ {
+ /*
+ ** Starter. If there were any preceding composing
+ ** characters, then compose them.
+ */
+ if (compose_chars_and_levels(starterptr,
+ starter_index,
+ flags,
+ clptr,
+ &last_composition,
+ tail_ptr))
+ return -1;
+
+ /*
+ ** It's possible for a starter to combine with its
+ ** preceding starter.
+ */
+
+ if (starterptr &&
+ /* Did we just compose this starter? */
+ last_composition &&
+ last_composition->index == starter_index &&
+
+ /*
+ ** Did we compose everything, didn't leave
+ ** any combined marks behind?
+ */
+ last_composition->n_composition == 1)
+ {
+ /*
+ ** So, check if we can combine with that
+ ** last starter. *starterptr is the
+ ** original starter, the new one is here.
+ */
+ new_char=lookup_composition
+ (last_composition->composition[0],
+ string[i]);
+
+ if (new_char != 0)
+ {
+ /*
+ ** Just update the composed char.
+ */
+ last_composition->composition[0]=
+ new_char;
+
+ /*
+ ** And incrementing n_composed.
+ ** This nukes this starter, as if
+ ** it was a part of the composition!
+ */
+ ++last_composition->n_composed;
+ continue;
+ }
+ }
+ else if (starterptr && starter_index+1 == i &&
+ /*
+ ** Ok, the last starter was not composed,
+ ** and it was the previous character. The
+ ** comparison against the starter_index is
+ ** just, really, a sanity check.
+ */
+ (new_char=
+ lookup_composition(*starterptr,
+ string[i])) != 0)
+ {
+ /*
+ ** We'll need to manually create a composition
+ ** from two starters here.
+ */
+
+ struct unicode_compositions *new_composition;
+
+ if (create_new_composition(starter_index,
+ 1, &new_composition))
+ return -1;
+
+ last_composition=new_composition;
+ **tail_ptr=new_composition;
+ *tail_ptr= &new_composition->next;
+
+ new_composition->n_composed=2;
+ new_composition->n_composition=1;
+ new_composition->composition[0]=new_char;
+ continue;
+ }
+ /*
+ ** And this is a new starter.
+ */
+ starterptr=&string[i];
+ starter_index=i;
+ }
+ else
+ {
+ /* Add composing characters */
+
+ if (add_char_and_level(clptr, string[i], i, ccc))
+ return -1;
+ }
+ }
+
+ /* We could've finish the string with some composition */
+
+ return compose_chars_and_levels(starterptr,
+ starter_index,
+ flags,
+ clptr,
+ &last_composition,
+ tail_ptr);
+}
+
+/*
+** sort combining characters by their canonical combining class
+*/
+
+static int compare_levels(const void *a, const void *b)
+{
+ const struct char_and_level *ca=(const struct char_and_level *)a;
+ const struct char_and_level *cb=(const struct char_and_level *)b;
+
+ return ca->level < cb->level ? -1 :
+ ca->level > cb->level ? 1
+
+ /* Same combining level, compare their indexes */
+ : ca->index < cb->index ? -1
+ : ca->index > cb->index ? 1
+ : 0;
+}
+
+static int create_new_composition(size_t starter_index,
+ size_t n_combining_marks,
+ struct unicode_compositions **ptr)
+{
+ struct unicode_compositions *c=
+ (struct unicode_compositions *)
+ malloc(sizeof(struct unicode_compositions));
+
+ if (!c)
+ return -1;
+
+ c->index=starter_index;
+ c->next=NULL;
+
+ /* Worst case: nothing is composed */
+
+ if ((c->composition=malloc(sizeof(char32_t) *
+ n_combining_marks)) == NULL)
+ {
+ free(c);
+ return -1;
+ }
+ *ptr=c;
+ return 0;
+}
+
+static int compose_chars_and_levels(const char32_t *starterptr,
+ size_t starter_index,
+ int flags,
+ struct chars_and_levels *clptr,
+ struct unicode_compositions
+ **last_compositionptr,
+ struct unicode_compositions ***tail_ptr)
+{
+ struct unicode_compositions *new_composition;
+ char32_t starter=0;
+ size_t i;
+ int composed;
+
+ if (clptr->size == 0)
+ /* Nothing to do, no composable chars since last starter */
+ return 0;
+
+ qsort(clptr->ptr, clptr->size, sizeof(struct char_and_level),
+ compare_levels);
+
+ if (create_new_composition(starter_index,
+ clptr->size,
+ &new_composition))
+ return -1;
+
+ composed=0;
+
+ if (starterptr)
+ {
+ starter=*starterptr;
+
+ for (i=0; i<clptr->size; ++i)
+ {
+ char32_t new_char=lookup_composition(starter,
+ clptr->ptr[i].ch);
+
+ if (new_char)
+ {
+ starter=new_char;
+
+ /*
+ ** These must have a non-0 level, so mark
+ ** these ones by setting their level to 0.
+ */
+ clptr->ptr[i].level=0;
+ composed=1;
+
+ if (flags & UNICODE_COMPOSE_FLAG_ONESHOT)
+ break;
+ }
+ }
+ }
+
+ /*
+ ** new_composition->composition is the same size as the
+ ** number of composable characters. If we composed at
+ ** least once, we know that we can fit the starter character
+ ** and whatever was not composed into that buffer.
+ */
+ if (composed)
+ {
+ size_t j;
+
+ new_composition->n_composed=clptr->size+1;
+
+ new_composition->composition[0]=starter;
+
+ i=1;
+ if (!(flags & UNICODE_COMPOSE_FLAG_REMOVEUNUSED))
+ {
+ for (j=0; j<clptr->size; ++j)
+ {
+ /*
+ ** The ones that were used in the composition
+ ** have their level reset to 0.
+ */
+ if (clptr->ptr[j].level)
+ {
+ new_composition->composition[i++]=
+ clptr->ptr[j].ch;
+ }
+ }
+ }
+ new_composition->n_composition=i;
+ } else if (!starterptr && (flags & UNICODE_COMPOSE_FLAG_REMOVEUNUSED))
+ {
+ /*
+ ** Edge case, this flag is set, composition characters at the
+ ** beginning of the string. We will create an empty
+ ** new_composition.
+ */
+
+ new_composition->n_composed=clptr->size;
+ new_composition->n_composition=0;
+ composed=1;
+ }
+
+ if (composed)
+ {
+ *last_compositionptr=new_composition;
+ **tail_ptr=new_composition;
+ *tail_ptr=&new_composition->next;
+ }
+ else
+ {
+ free(new_composition->composition);
+ free(new_composition);
+ new_composition=NULL;
+ }
+
+ clptr->size=0; /* Start a new list of composable characters */
+ return 0;
+}
+
+void unicode_composition_deinit(struct unicode_compositions *ptr)
+{
+ while (ptr)
+ {
+ struct unicode_compositions *next=ptr->next;
+
+ if (ptr->composition)
+ free(ptr->composition);
+ free(ptr);
+ ptr=next;
+ }
+}
+
+size_t unicode_composition_apply(char32_t *string,
+ size_t string_size,
+ struct unicode_compositions *compositions)
+{
+ size_t j=0;
+ size_t i;
+
+ for (i=0; i<string_size; )
+ {
+ if (compositions && compositions->index == i)
+ {
+ size_t k;
+
+ for (k=0; k<compositions->n_composition; ++k)
+ string[j++]=compositions->composition[k];
+ i += compositions->n_composed;
+
+ compositions=compositions->next;
+ }
+ else
+ {
+ string[j++]=string[i++];
+ }
+ }
+
+ if (j < string_size)
+ string[j]=0;
+ return j;
+}
+
+int unicode_compose(char32_t *string,
+ size_t string_size,
+ int flags,
+ size_t *new_size)
+{
+ struct unicode_compositions *composes;
+
+ if (unicode_composition_init(string, string_size, flags, &composes))
+ return -1;
+
+ *new_size=unicode_composition_apply(string, string_size, composes);
+
+ unicode_composition_deinit(composes);
+
+ return 0;
+}