diff options
Diffstat (limited to 'unicode/courier-unicode.h')
| -rw-r--r-- | unicode/courier-unicode.h | 224 | 
1 files changed, 83 insertions, 141 deletions
| diff --git a/unicode/courier-unicode.h b/unicode/courier-unicode.h index 42ef9a1..a84e230 100644 --- a/unicode/courier-unicode.h +++ b/unicode/courier-unicode.h @@ -67,12 +67,7 @@ unicode_char unicode_html40ent_lookup(const char *n);  extern int unicode_wcwidth(unicode_char c);  extern size_t unicode_wcwidth_str(const unicode_char *c); -/* -** The unicode-ish isspace() -*/ -extern int unicode_isspace(unicode_char ch); - -/* Internal unicode table lookup function */ +/* Internal unicode table lookup functions */  extern uint8_t unicode_tab_lookup(unicode_char ch,  				  const size_t *unicode_indextab, @@ -81,6 +76,85 @@ extern uint8_t unicode_tab_lookup(unicode_char ch,  				  const uint8_t *unicode_classtab,  				  uint8_t uclass); +extern uint32_t unicode_tab32_lookup(unicode_char ch, +				     const size_t *unicode_indextab, +				     size_t unicode_indextab_sizeof, +				     const uint8_t (*unicode_rangetab)[2], +				     const uint32_t *unicode_classtab, +				     uint32_t uclass); + +/* +** Look up unicode categorization, see http://unicode.org/notes/tn36/ +** +** Returns a 32 bit value with four unicode categories encoded in the +** bits defined by UNICODE_CATEGORY_1..4 +*/ + +#define UNICODE_CATEGORY_1   0xFF000000 +#define UNICODE_CATEGORY_2   0x00FF0000 +#define UNICODE_CATEGORY_3   0x0000FF00 +#define UNICODE_CATEGORY_4   0x000000FF + +#include <courier-unicode-categories-tab.h> + +uint32_t unicode_category_lookup(unicode_char); + +/* +** Return non-0 for TAB, and all UNICODE_CATEGORY_2_SPACE. +*/ + +extern int unicode_isblank(unicode_char ch); + +/* +** The unicode-ish isspace(). In addition to return non-0 for +** unicode_isblank(), this also returns non-0 for unicode characters +** with linebreaking properties of BK, CR, LF, NL, and SP. +*/ +extern int unicode_isspace(unicode_char ch); + +/* +** Return non-0 for all UNICODE_CATEGORY_1_LETTER +*/ + +extern int unicode_isalpha(unicode_char ch); + +/* +** Return non-0 for all UNICODE_CATEGORY_1_NUMBER | UNICODE_CATEGORY_2_DIGIT, +** only (no third categories). +*/ +extern int unicode_isdigit(unicode_char ch); + +/* +** Return non-0 for all unicode_isalpha() or unicode_isdigit(). +*/ + +extern int unicode_isalnum(unicode_char ch); + +/* +** Returns non-0 for all codepoints above SPACE which are not +** unicode_isspace(). +*/ + +extern int unicode_isgraph(unicode_char ch); + +/* +** Return non-0 for all UNICODE_CATEGORY_1_PUNCTUATION. +*/ + +extern int unicode_ispunct(unicode_char ch); + +/* +** Return non-0 for all unicode_isalpha() for which the character is +** equal to unicode_lc() of itself. +*/ +extern int unicode_islower(unicode_char ch); + +/* +** Return non-0 for all unicode_isalpha() for which the character is +** equal to unicode_uc() of itself. +*/ +extern int unicode_isupper(unicode_char ch); +  /*  ** Implementation of grapheme cluster boundary rules, as per  ** http://www.unicode.org/reports/tr29/tr29-27.html @@ -93,141 +167,9 @@ extern uint8_t unicode_tab_lookup(unicode_char ch,  int unicode_grapheme_break(unicode_char a, unicode_char b);  typedef enum { -	/* UNICODE_SCRIPT_T */ -	unicode_script_unknown, -	unicode_script_common, -	unicode_script_latin, -	unicode_script_greek, -	unicode_script_cyrillic, -	unicode_script_armenian, -	unicode_script_hebrew, -	unicode_script_arabic, -	unicode_script_syriac, -	unicode_script_thaana, -	unicode_script_devanagari, -	unicode_script_bengali, -	unicode_script_gurmukhi, -	unicode_script_gujarati, -	unicode_script_oriya, -	unicode_script_tamil, -	unicode_script_telugu, -	unicode_script_kannada, -	unicode_script_malayalam, -	unicode_script_sinhala, -	unicode_script_thai, -	unicode_script_lao, -	unicode_script_tibetan, -	unicode_script_myanmar, -	unicode_script_georgian, -	unicode_script_hangul, -	unicode_script_ethiopic, -	unicode_script_cherokee, -	unicode_script_canadian_aboriginal, -	unicode_script_ogham, -	unicode_script_runic, -	unicode_script_khmer, -	unicode_script_mongolian, -	unicode_script_hiragana, -	unicode_script_katakana, -	unicode_script_bopomofo, -	unicode_script_han, -	unicode_script_yi, -	unicode_script_old_italic, -	unicode_script_gothic, -	unicode_script_deseret, -	unicode_script_inherited, -	unicode_script_tagalog, -	unicode_script_hanunoo, -	unicode_script_buhid, -	unicode_script_tagbanwa, -	unicode_script_limbu, -	unicode_script_tai_le, -	unicode_script_linear_b, -	unicode_script_ugaritic, -	unicode_script_shavian, -	unicode_script_osmanya, -	unicode_script_cypriot, -	unicode_script_braille, -	unicode_script_buginese, -	unicode_script_coptic, -	unicode_script_new_tai_lue, -	unicode_script_glagolitic, -	unicode_script_tifinagh, -	unicode_script_syloti_nagri, -	unicode_script_old_persian, -	unicode_script_kharoshthi, -	unicode_script_balinese, -	unicode_script_cuneiform, -	unicode_script_phoenician, -	unicode_script_phags_pa, -	unicode_script_nko, -	unicode_script_sundanese, -	unicode_script_lepcha, -	unicode_script_ol_chiki, -	unicode_script_vai, -	unicode_script_saurashtra, -	unicode_script_kayah_li, -	unicode_script_rejang, -	unicode_script_lycian, -	unicode_script_carian, -	unicode_script_lydian, -	unicode_script_cham, -	unicode_script_tai_tham, -	unicode_script_tai_viet, -	unicode_script_avestan, -	unicode_script_egyptian_hieroglyphs, -	unicode_script_samaritan, -	unicode_script_lisu, -	unicode_script_bamum, -	unicode_script_javanese, -	unicode_script_meetei_mayek, -	unicode_script_imperial_aramaic, -	unicode_script_old_south_arabian, -	unicode_script_inscriptional_parthian, -	unicode_script_inscriptional_pahlavi, -	unicode_script_old_turkic, -	unicode_script_kaithi, -	unicode_script_batak, -	unicode_script_brahmi, -	unicode_script_mandaic, -	unicode_script_chakma, -	unicode_script_meroitic_cursive, -	unicode_script_meroitic_hieroglyphs, -	unicode_script_miao, -	unicode_script_sharada, -	unicode_script_sora_sompeng, -	unicode_script_takri, -	unicode_script_caucasian_albanian, -	unicode_script_bassa_vah, -	unicode_script_duployan, -	unicode_script_elbasan, -	unicode_script_grantha, -	unicode_script_pahawh_hmong, -	unicode_script_khojki, -	unicode_script_linear_a, -	unicode_script_mahajani, -	unicode_script_manichaean, -	unicode_script_mende_kikakui, -	unicode_script_modi, -	unicode_script_mro, -	unicode_script_old_north_arabian, -	unicode_script_nabataean, -	unicode_script_palmyrene, -	unicode_script_pau_cin_hau, -	unicode_script_old_permic, -	unicode_script_psalter_pahlavi, -	unicode_script_siddham, -	unicode_script_khudawadi, -	unicode_script_tirhuta, -	unicode_script_warang_citi, -	unicode_script_ahom, -	unicode_script_anatolian_hieroglyphs, -	unicode_script_hatran, -	unicode_script_multani, -	unicode_script_old_hungarian, -	unicode_script_signwriting - -	/* UNICODE_SCRIPT_T */ + +#include <courier-unicode-script-tab.h> +  } unicode_script_t;  /* | 
