diff options
| author | Sam Varshavchik | 2021-03-07 17:46:01 -0500 | 
|---|---|---|
| committer | Sam Varshavchik | 2021-03-10 22:52:34 -0500 | 
| commit | 18fc31347b80597f4100f96c86799fe130786781 (patch) | |
| tree | 08b641332ec55232a34d3656d6435559a847fcce /unicode/unicode_normalization.c | |
| parent | 92bcce9b28d5d123af67ff0201cd97508af21326 (diff) | |
| download | courier-libs-18fc31347b80597f4100f96c86799fe130786781.tar.bz2 | |
courier-unicode: canonical decomposition and composition implementation.
Move unicode_canonical() into a larger library of functions that handles
full Unicode decomposition and composition.
Diffstat (limited to 'unicode/unicode_normalization.c')
| -rw-r--r-- | unicode/unicode_normalization.c | 938 | 
1 files changed, 938 insertions, 0 deletions
| diff --git a/unicode/unicode_normalization.c b/unicode/unicode_normalization.c new file mode 100644 index 0000000..ea9e256 --- /dev/null +++ b/unicode/unicode_normalization.c @@ -0,0 +1,938 @@ +/* +** Copyright 2021 Double Precision, Inc. +** See COPYING for distribution information. +** +*/ + +#include	"unicode_config.h" +#include	"courier-unicode.h" +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include	"normalization_defs.h" +#include	"normalization.h" + +/* +** Lookup NFC quick check. +*/ +static int unicode_nfc_qc(char32_t ch) +{ +	return unicode_tab_lookup(ch, +				  nfc_qc_starting_indextab, +				  nfc_qc_starting_pagetab, +				  sizeof(nfc_qc_starting_indextab)/ +				  sizeof(nfc_qc_starting_indextab[0]), +				  nfc_qc_rangetab, +				  sizeof(nfc_qc_rangetab)/ +				  sizeof(nfc_qc_rangetab[0]), +				  nfc_qc_classtab, +				  UNICODE_NFC_QC_Y); +} + +/* +** Lookup nfkc quick check. +*/ +static int unicode_nfkc_qc(char32_t ch) +{ +	return unicode_tab_lookup(ch, +				  nfkc_qc_starting_indextab, +				  nfkc_qc_starting_pagetab, +				  sizeof(nfkc_qc_starting_indextab)/ +				  sizeof(nfkc_qc_starting_indextab[0]), +				  nfkc_qc_rangetab, +				  sizeof(nfkc_qc_rangetab)/ +				  sizeof(nfkc_qc_rangetab[0]), +				  nfkc_qc_classtab, +				  UNICODE_NFKC_QC_Y); +} + +/* +** Lookup a character's canonical combining class. +*/ + +static uint8_t unicode_ccc(char32_t ch) +{ +	return unicode_tab_lookup(ch, +				  ccc_starting_indextab, +				  ccc_starting_pagetab, +				  sizeof(ccc_starting_indextab)/ +				  sizeof(ccc_starting_indextab[0]), +				  ccc_rangetab, +				  sizeof(ccc_rangetab)/ +				  sizeof(ccc_rangetab[0]), +				  ccc_classtab, +				  0); +} + +/* +** Look up a character's decomposition +*/ + +static const struct decomposition_info *decomp_lookup_search(char32_t ch) +{ +	size_t hash; +	size_t i; + +	if (ch == 0) +		return NULL; + +	hash=(size_t)(ch % (sizeof(decomp_lookup) +			    /sizeof(decomp_lookup[0]))); + + +	for (i=0; i<sizeof(decomp_lookup[hash])/sizeof(decomp_lookup[hash][0]); +	     ++i) +	{ +		if (decomp_lookup[hash][i].ch == ch) +			return &decomp_lookup[hash][i]; +	} +	return NULL; +} + +unicode_canonical_t unicode_canonical(char32_t ch) +{ +	const struct decomposition_info *info=decomp_lookup_search(ch); +	unicode_canonical_t ret; + +	memset(&ret, 0, sizeof(ret)); + +	if (info) +	{ +		ret.format=info->decomp_type; +		ret.n_canonical_chars=info->decomp_size; +		ret.canonical_chars=decompositions+info->decomp_index; +	} +	return ret; +} + +/* +** Scan through a string, finding each character to decompose. +** +** This invokes a provided callback, for each character that's getting +** decomposed. +*/ + +static void search_for_decompose(struct unicode_decompose_info *info, +				 void (*f)(struct unicode_decompose_info *, +					   size_t, +					   const struct decomposition_info *, +					   void *), +				 void *arg) +{ +	size_t i; + +	for (i=0; i<info->string_size; ++i) +	{ +		const struct decomposition_info *comp_info; + +		/* +		** Skip over characters that have a quick check flag set. +		*/ +		if (info->decompose_flags & UNICODE_DECOMPOSE_FLAG_QC) +		{ +			if (info->decompose_flags & +			    UNICODE_DECOMPOSE_FLAG_COMPAT +			    ? unicode_nfkc_qc(info->string[i]) +			    == UNICODE_NFKC_QC_Y +			    : unicode_nfc_qc(info->string[i]) +			    == UNICODE_NFC_QC_Y) +				continue; +		} + +		comp_info=decomp_lookup_search(info->string[i]); +		if (comp_info && +		    /* Should be the case: */ +		    comp_info->decomp_size > 0 && + +		    /* +		    ** Pick only canonical decompositions, unless we're +		    ** doing a compatibility decomposition. +		    */ +		    (comp_info->decomp_type == UNICODE_CANONICAL_FMT_NONE || +		     info->decompose_flags & +		     UNICODE_DECOMPOSE_FLAG_COMPAT)) +			(*f)(info, i, comp_info, arg); +	} +} + +/* +** Metadata we store about the decomposition. +*/ + +struct decompose_meta { + +	/* Number of characters, in total, that we are decomposing. */ +	size_t nchars; + +	/* And their indices in the original string. */ +	size_t *indexes; + +	/* +	** For each character how many we're adding, in addition to the +	** character being decomposed. This is 0 for a 1:1 decomposition. +	**/ +	size_t *sizes; + +	/* +	** And the information about each individual decomposition. +	*/ +	const struct decomposition_info **infos; +}; + +/* Pass 1: count the number of characters to decompose. */ + +static void decompose_meta_count(struct unicode_decompose_info *info, +				 size_t i, +				 const struct decomposition_info *cinfo, +				 void *arg) +{ +	struct decompose_meta *ptr=(struct decompose_meta *)arg; + +	++ptr->nchars; +} + +/* Pass 2: compile a list of characters to decompose. */ + +static void decompose_meta_save(struct unicode_decompose_info *info, +				size_t i, +				const struct decomposition_info *cinfo, +				void *arg) +{ +	struct decompose_meta *ptr=(struct decompose_meta *)arg; + +	ptr->indexes[ptr->nchars]=i; +	ptr->sizes[ptr->nchars]=cinfo->decomp_size-1; +	ptr->infos[ptr->nchars]=cinfo; + +	++ptr->nchars; +} + +size_t unicode_decompose_reallocate_size(struct unicode_decompose_info *info, +					 const size_t *sizes, +					 size_t n) +{ +	size_t i; +	size_t new_size=info->string_size; + +	for (i=0; i<n; ++i) +		new_size += sizes[i]; + +	return new_size; +} + +static int unicode_decompose_reallocate(struct unicode_decompose_info *info, +					 const size_t *offsets, +					 const size_t *sizes, +					 size_t n) +{ +	size_t new_size=unicode_decompose_reallocate_size(info, sizes, n); +	char32_t *str=(char32_t *)realloc(info->string, +					  (new_size+1) * sizeof(char32_t)); + +	if (str == NULL) +		return -1; + +	info->string=str; +	str[new_size]=0; +	/* If the original was null-terminated, this one will be too, +	** we realloc-ed an extra char32_t */ +	return 0; +} + +void unicode_decompose_info_init(struct unicode_decompose_info *info, +				 char32_t *string, +				 size_t string_size, +				 void *arg) +{ +	memset(info, 0, sizeof(*info)); + +	info->string=string; +	if (string && string_size == (size_t)-1) +	{ +		for (string_size=0; string[string_size]; ++string_size) +			; +	} + +	info->reallocate=unicode_decompose_reallocate; +	info->string_size=string_size; +	info->arg=arg; +} + +void unicode_decompose_info_deinit(struct unicode_decompose_info *info) +{ +} + +int unicode_decompose(struct unicode_decompose_info *info) +{ +	int replaced; +	int rc=0; + +	do +	{ +		struct decompose_meta meta; +		char32_t *string_end, *string_put; +		size_t next_insert_pos; +		size_t i; +		size_t old_pos; + +		replaced=0; + +		/* +		** Make two passes over the string, on the first pass count +		** the decompositions. +		*/ + +		meta.nchars=0; +		search_for_decompose(info, decompose_meta_count, &meta); + +		if (!meta.nchars) +			continue; /* We're done */ + +		/* We'll want to make another pass */ +		replaced=1; + +		/* +		** We have two size_t pointers to allocate dynamically, +		** allocate both of them together. +		*/ +		meta.indexes=(size_t *)malloc(meta.nchars * 2 * sizeof(size_t)); +		if (!meta.indexes) +		{ +			rc= -1; +			break; +		} + +		meta.infos=(const struct decomposition_info **) +			malloc(sizeof(const struct decomposition_info *) +			       *meta.nchars); + +		if (!meta.infos) +		{ +			free(meta.indexes); +			rc= -1; +			break; +		} + +		/* +		** And here's the second size_t * that we dynamically allocated +		*/ +		meta.sizes=meta.indexes+meta.nchars; + +		/* And now make a second pass to load the decompositions */ +		meta.nchars=0; + +		search_for_decompose(info, decompose_meta_save, &meta); + +		rc=(*info->reallocate)(info, +				       meta.indexes, meta.sizes, +				       meta.nchars); + +		if (rc) +		{ +			/* Error, ensure the code below does nothing */ +			meta.nchars=0; +			replaced=0; +		} + +		/* +		** Insert decompositions. We make a pass from the end of +		** the string to the beginning of it. We already know what +		** all the decompositions are, on this pass. +		** +		** We'll be copying the characters from the existing string +		** from the end of the string to its beginning, starting with +		** the new ending position. +		** +		** Each time we reach a position where the decomposition +		** occurs, we inject it. In this manner we'll be processing +		** each decomposition starting with the last one in the +		** string, going back to the first one. +		** +		** So, the next_insert_pos that we will do is the +		** last one: +		*/ + +		next_insert_pos=meta.nchars; + +		/* +		** We start shifting over from the end of the string up. +		** Here's the current end of the string. +		*/ +		string_end=info->string+info->string_size; +		old_pos=info->string_size; + +		/* +		** And compute where we're putting all the characters +		** by adding up all the sizes. +		*/ + +		string_put=string_end; +		for (i=0; i<meta.nchars; ++i) +		{ +			/* While we're here we'll adjust string_size too... */ +			info->string_size += meta.sizes[i]; +			string_put += meta.sizes[i]; +		} + +		/* We only need to keep going until we finished shifting. */ + +		while (next_insert_pos) +		{ +			const struct decomposition_info *decompose=NULL; + +			if (--old_pos == meta.indexes[next_insert_pos-1]) +			{ +				/* +				** Reach the next (previous, really) insert +				** position. We only adjust the pointer here, +				** we'll be copying the decomposition below. +				** Here we're just quietly skipping ahead by +				** the number of new characters in the +				** decomposition. +				*/ +				--next_insert_pos; +				string_put -= meta.sizes[next_insert_pos]; +				decompose=meta.infos[next_insert_pos]; +			} +			*--string_put=*--string_end; + +			if (decompose) +			{ +				/* The decomposition goes here */ +				memcpy(string_put, +				       decompositions + decompose->decomp_index, +				       decompose->decomp_size +				       * sizeof(char32_t)); +			} +		} + +		free(meta.indexes); +		free(meta.infos); + +		/* +		** Make another pass, perhaps. +		*/ +	} while (replaced); + +	return rc; +} + +/* +** Canonical compositions are all two characters > one character. +*/ + +static char32_t lookup_composition(char32_t a, char32_t b) +{ +	size_t i, j; + +	i=((size_t)a * canonical_mult1 +	   + (size_t)b * canonical_mult2) +		% (sizeof(canonical_compositions_lookup) / +		   sizeof(canonical_compositions_lookup[0])); + +	j= i+1 < (sizeof(canonical_compositions_lookup) / +		  sizeof(canonical_compositions_lookup[0])) +		? canonical_compositions_lookup[i+1] +		: sizeof(canonical_compositions)/ +		sizeof(canonical_compositions[0]); +	i=canonical_compositions_lookup[i]; + +	while (i<j) +	{ +		if (canonical_compositions[i][0] == a && +		    canonical_compositions[i][1] == b) +		{ +			return canonical_compositions[i][2]; +		} +		++i; +	} +	return 0; +} + +/* +** Collect consecutive sequence of composable characters. We cache each +** character's composition level. +*/ + +struct char_and_level { +	char32_t ch;		/* The character */ +	size_t index;		/* Its position in the original string */ +	uint8_t level;		/* Its combining level */ +}; + +/* +** A growing buffer of consecutive composition characters +*/ + +struct chars_and_levels { +	struct char_and_level *ptr; +	size_t size; +	size_t reserved; +}; + +/* +** Initialize this buffer. +*/ +static int chars_and_levels_init(struct chars_and_levels *p) +{ +	p->ptr=malloc( sizeof(struct char_and_level)*(p->reserved=1) ); +	if (!p->ptr) +		return -1; +	p->size=0; + +	return 0; +} + +/* +** Add char+level to the buffer, growing it if needed. +*/ + +static int add_char_and_level(struct chars_and_levels *p, char32_t ch, +			      size_t index, +			      uint8_t level) +{ +	if (p->reserved <= p->size) +	{ +		size_t n=p->reserved * 2; + +		struct char_and_level *ptr= +			realloc(p->ptr, sizeof(struct char_and_level)*n); + +		if (!ptr) +			return -1; +		p->reserved=n; +		p->ptr=ptr; +	} +	p->ptr[p->size].ch=ch; +	p->ptr[p->size].index=index; +	p->ptr[p->size].level=level; +	++p->size; +	return 0; +} + +/* +** Deallocate the buffer. +*/ + +static void chars_and_levels_deinit(struct chars_and_levels *p) +{ +	if (p->ptr) +		free(p->ptr); +} + +static int unicode_composition_init2(const char32_t *string, +				     size_t string_size, +				     int flags, +				     struct chars_and_levels *clptr, +				     struct unicode_compositions ***tail_ptr); + +int unicode_composition_init(const char32_t *string, +			     size_t string_size, +			     int flags, +			     struct unicode_compositions **ret) +{ +	/* +	** Initialize a singly-linked unicode_compositions_list. +	** +	** Initialize the tail pointer. We'll be adding onto the tail pointer +	** as we find each composition. +	** +	** Initialize the chars_and_levels buffer. +	*/ + +	struct unicode_compositions *list=NULL; +	struct unicode_compositions **tail=&list; +	struct chars_and_levels cl; +	int c; + +	if (chars_and_levels_init(&cl)) +		return -1; + +	/* +	** Call unicode_composition_init2 to do all the work. +	** +	** When it returns we can deinit the chars_and_levels buffer. +	** +	** If it fails we can also deinitialize the linked list, and +	** return a NULL pointer. +	*/ +	c=unicode_composition_init2(string, string_size, flags, +				    &cl, &tail); +	chars_and_levels_deinit(&cl); + +	if (c) +	{ +		unicode_composition_deinit(list); +		list=NULL; +	} + +	*ret=list; +	return c; +} + +static int compose_chars_and_levels(const char32_t *starterptr, +				    size_t starter_index, +				    int flags, +				    struct chars_and_levels *clptr, +				    struct unicode_compositions +				    **last_compositionptr, +				    struct unicode_compositions ***tail_ptr); + +static int create_new_composition(size_t starter_index, +				  size_t n_combining_marks, +				  struct unicode_compositions **ptr); + +static int unicode_composition_init2(const char32_t *string, +				     size_t string_size, +				     int flags, +				     struct chars_and_levels *clptr, +				     struct unicode_compositions ***tail_ptr) +{ +	size_t i; +	struct unicode_compositions *last_composition=NULL; + +	/* +	** Here we consecutively scan the string and look up each character's +	** composition level. +	** +	** Each time we get a composition level of 0 we update starterptr to +	** point to the starter character, and save its index here. +	*/ + +	const char32_t *starterptr=NULL; +	size_t starter_index=0; +	for (i=0; i<string_size; ++i) +	{ +		uint8_t ccc=unicode_ccc(string[i]); +		char32_t new_char; + +		if (ccc == 0) +		{ +			/* +			** Starter. If there were any preceding composing +			** characters, then compose them. +			*/ +			if (compose_chars_and_levels(starterptr, +						     starter_index, +						     flags, +						     clptr, +						     &last_composition, +						     tail_ptr)) +				return -1; + +			/* +			** It's possible for a starter to combine with its +			** preceding starter. +			*/ + +			if (starterptr && +			    /* Did we just compose this starter? */ +			    last_composition && +			    last_composition->index == starter_index && + +			    /* +			    ** Did we compose everything, didn't leave +			    ** any combined marks behind? +			    */ +			    last_composition->n_composition == 1) +			{ +				/* +				** So, check if we can combine with that +				** last starter. *starterptr is the +				** original starter, the new one is here. +				*/ +				new_char=lookup_composition +					(last_composition->composition[0], +					 string[i]); + +				if (new_char != 0) +				{ +					/* +					** Just update the composed char. +					*/ +					last_composition->composition[0]= +						new_char; + +					/* +					** And incrementing n_composed. +					** This nukes this starter, as if +					** it was a part of the composition! +					*/ +					++last_composition->n_composed; +					continue; +				} +			} +			else if (starterptr && starter_index+1 == i && +				/* +				** Ok, the last starter was not composed, +				** and it was the previous character. The +				** comparison against the starter_index is +				** just, really, a sanity check. +				*/ +				 (new_char= +				  lookup_composition(*starterptr, +						     string[i])) != 0) +			{ +				/* +				** We'll need to manually create a composition +				** from two starters here. +				*/ + +				struct unicode_compositions *new_composition; + +				if (create_new_composition(starter_index, +							   1, &new_composition)) +					return -1; + +				last_composition=new_composition; +				**tail_ptr=new_composition; +				*tail_ptr= &new_composition->next; + +				new_composition->n_composed=2; +				new_composition->n_composition=1; +				new_composition->composition[0]=new_char; +				continue; +			} +			/* +			** And this is a new starter. +			*/ +			starterptr=&string[i]; +			starter_index=i; +		} +		else +		{ +			/* Add composing characters */ + +			if (add_char_and_level(clptr, string[i], i, ccc)) +				return -1; +		} +	} + +	/* We could've finish the string with some composition */ + +	return compose_chars_and_levels(starterptr, +					starter_index, +					flags, +					clptr, +					&last_composition, +					tail_ptr); +} + +/* +** sort combining characters by their canonical combining class +*/ + +static int compare_levels(const void *a, const void *b) +{ +	const struct char_and_level *ca=(const struct char_and_level *)a; +	const struct char_and_level *cb=(const struct char_and_level *)b; + +	return ca->level < cb->level ? -1 : +		ca->level > cb->level ? 1 + +		/* Same combining level, compare their indexes */ +		: ca->index < cb->index ? -1 +		: ca->index > cb->index ? 1 +		: 0; +} + +static int create_new_composition(size_t starter_index, +				  size_t n_combining_marks, +				  struct unicode_compositions **ptr) +{ +	struct unicode_compositions *c= +		(struct unicode_compositions *) +		malloc(sizeof(struct unicode_compositions)); + +	if (!c) +		return -1; + +	c->index=starter_index; +	c->next=NULL; + +	/* Worst case: nothing is composed */ + +	if ((c->composition=malloc(sizeof(char32_t) * +				   n_combining_marks)) == NULL) +	{ +		free(c); +		return -1; +	} +	*ptr=c; +	return 0; +} + +static int compose_chars_and_levels(const char32_t *starterptr, +				    size_t starter_index, +				    int flags, +				    struct chars_and_levels *clptr, +				    struct unicode_compositions +				    **last_compositionptr, +				    struct unicode_compositions ***tail_ptr) +{ +	struct unicode_compositions *new_composition; +	char32_t starter=0; +	size_t i; +	int composed; + +	if (clptr->size == 0) +		/* Nothing to do, no composable chars since last starter */ +		return 0; + +	qsort(clptr->ptr, clptr->size, sizeof(struct char_and_level), +	      compare_levels); + +	if (create_new_composition(starter_index, +				   clptr->size, +				   &new_composition)) +		return -1; + +	composed=0; + +	if (starterptr) +	{ +		starter=*starterptr; + +		for (i=0; i<clptr->size; ++i) +		{ +			char32_t new_char=lookup_composition(starter, +							     clptr->ptr[i].ch); + +			if (new_char) +			{ +				starter=new_char; + +				/* +				** These must have a non-0 level, so mark +				** these ones by setting their level to 0. +				*/ +				clptr->ptr[i].level=0; +				composed=1; + +				if (flags & UNICODE_COMPOSE_FLAG_ONESHOT) +					break; +			} +		} +	} + +	/* +	** new_composition->composition is the same size as the +	** number of composable characters. If we composed at +	** least once, we know that we can fit the starter character +	** and whatever was not composed into that buffer. +	*/ +	if (composed) +	{ +		size_t j; + +		new_composition->n_composed=clptr->size+1; + +		new_composition->composition[0]=starter; + +		i=1; +		if (!(flags & UNICODE_COMPOSE_FLAG_REMOVEUNUSED)) +		{ +			for (j=0; j<clptr->size; ++j) +			{ +				/* +				** The ones that were used in the composition +				** have their level reset to 0. +				*/ +				if (clptr->ptr[j].level) +				{ +					new_composition->composition[i++]= +						clptr->ptr[j].ch; +				} +			} +		} +		new_composition->n_composition=i; +	} else if (!starterptr && (flags & UNICODE_COMPOSE_FLAG_REMOVEUNUSED)) +	{ +		/* +		** Edge case, this flag is set, composition characters at the +		** beginning of the string. We will create an empty +		** new_composition. +		*/ + +		new_composition->n_composed=clptr->size; +		new_composition->n_composition=0; +		composed=1; +	} + +	if (composed) +	{ +		*last_compositionptr=new_composition; +		**tail_ptr=new_composition; +		*tail_ptr=&new_composition->next; +	} +	else +	{ +		free(new_composition->composition); +		free(new_composition); +		new_composition=NULL; +	} + +	clptr->size=0; /* Start a new list of composable characters */ +	return 0; +} + +void unicode_composition_deinit(struct unicode_compositions *ptr) +{ +	while (ptr) +	{ +		struct unicode_compositions *next=ptr->next; + +		if (ptr->composition) +			free(ptr->composition); +		free(ptr); +		ptr=next; +	} +} + +size_t unicode_composition_apply(char32_t *string, +				 size_t string_size, +				 struct unicode_compositions *compositions) +{ +	size_t j=0; +	size_t i; + +	for (i=0; i<string_size; ) +	{ +		if (compositions && compositions->index == i) +		{ +			size_t k; + +			for (k=0; k<compositions->n_composition; ++k) +				string[j++]=compositions->composition[k]; +			i += compositions->n_composed; + +			compositions=compositions->next; +		} +		else +		{ +			string[j++]=string[i++]; +		} +	} + +	if (j < string_size) +		string[j]=0; +	return j; +} + +int unicode_compose(char32_t *string, +		    size_t string_size, +		    int flags, +		    size_t *new_size) +{ +	struct unicode_compositions *composes; + +	if (unicode_composition_init(string, string_size, flags, &composes)) +		return -1; + +	*new_size=unicode_composition_apply(string, string_size, composes); + +	unicode_composition_deinit(composes); + +	return 0; +} | 
