summaryrefslogtreecommitdiffstats
path: root/unicode/courier-unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'unicode/courier-unicode.h')
-rw-r--r--unicode/courier-unicode.h224
1 files changed, 83 insertions, 141 deletions
diff --git a/unicode/courier-unicode.h b/unicode/courier-unicode.h
index 42ef9a1..a84e230 100644
--- a/unicode/courier-unicode.h
+++ b/unicode/courier-unicode.h
@@ -67,12 +67,7 @@ unicode_char unicode_html40ent_lookup(const char *n);
extern int unicode_wcwidth(unicode_char c);
extern size_t unicode_wcwidth_str(const unicode_char *c);
-/*
-** The unicode-ish isspace()
-*/
-extern int unicode_isspace(unicode_char ch);
-
-/* Internal unicode table lookup function */
+/* Internal unicode table lookup functions */
extern uint8_t unicode_tab_lookup(unicode_char ch,
const size_t *unicode_indextab,
@@ -81,6 +76,85 @@ extern uint8_t unicode_tab_lookup(unicode_char ch,
const uint8_t *unicode_classtab,
uint8_t uclass);
+extern uint32_t unicode_tab32_lookup(unicode_char ch,
+ const size_t *unicode_indextab,
+ size_t unicode_indextab_sizeof,
+ const uint8_t (*unicode_rangetab)[2],
+ const uint32_t *unicode_classtab,
+ uint32_t uclass);
+
+/*
+** Look up unicode categorization, see http://unicode.org/notes/tn36/
+**
+** Returns a 32 bit value with four unicode categories encoded in the
+** bits defined by UNICODE_CATEGORY_1..4
+*/
+
+#define UNICODE_CATEGORY_1 0xFF000000
+#define UNICODE_CATEGORY_2 0x00FF0000
+#define UNICODE_CATEGORY_3 0x0000FF00
+#define UNICODE_CATEGORY_4 0x000000FF
+
+#include <courier-unicode-categories-tab.h>
+
+uint32_t unicode_category_lookup(unicode_char);
+
+/*
+** Return non-0 for TAB and for all UNICODE_CATEGORY_2_SPACE characters.
+*/
+
+extern int unicode_isblank(unicode_char ch);
+
+/*
+** The unicode-ish isspace(). In addition to returning non-0 for
+** unicode_isblank(), this also returns non-0 for unicode characters
+** with linebreaking properties of BK, CR, LF, NL, and SP.
+*/
+extern int unicode_isspace(unicode_char ch);
+
+/*
+** Return non-0 for all UNICODE_CATEGORY_1_LETTER
+*/
+
+extern int unicode_isalpha(unicode_char ch);
+
+/*
+** Return non-0 only for UNICODE_CATEGORY_1_NUMBER | UNICODE_CATEGORY_2_DIGIT
+** characters that have no third category.
+*/
+extern int unicode_isdigit(unicode_char ch);
+
+/*
+** Return non-0 for all unicode_isalpha() or unicode_isdigit().
+*/
+
+extern int unicode_isalnum(unicode_char ch);
+
+/*
+** Return non-0 for all codepoints above SPACE which are not
+** unicode_isspace().
+*/
+
+extern int unicode_isgraph(unicode_char ch);
+
+/*
+** Return non-0 for all UNICODE_CATEGORY_1_PUNCTUATION.
+*/
+
+extern int unicode_ispunct(unicode_char ch);
+
+/*
+** Return non-0 for every unicode_isalpha() character that is equal to
+** its own unicode_lc().
+*/
+extern int unicode_islower(unicode_char ch);
+
+/*
+** Return non-0 for every unicode_isalpha() character that is equal to
+** its own unicode_uc().
+*/
+extern int unicode_isupper(unicode_char ch);
+
/*
** Implementation of grapheme cluster boundary rules, as per
** http://www.unicode.org/reports/tr29/tr29-27.html
@@ -93,141 +167,9 @@ extern uint8_t unicode_tab_lookup(unicode_char ch,
int unicode_grapheme_break(unicode_char a, unicode_char b);
typedef enum {
- /* UNICODE_SCRIPT_T */
- unicode_script_unknown,
- unicode_script_common,
- unicode_script_latin,
- unicode_script_greek,
- unicode_script_cyrillic,
- unicode_script_armenian,
- unicode_script_hebrew,
- unicode_script_arabic,
- unicode_script_syriac,
- unicode_script_thaana,
- unicode_script_devanagari,
- unicode_script_bengali,
- unicode_script_gurmukhi,
- unicode_script_gujarati,
- unicode_script_oriya,
- unicode_script_tamil,
- unicode_script_telugu,
- unicode_script_kannada,
- unicode_script_malayalam,
- unicode_script_sinhala,
- unicode_script_thai,
- unicode_script_lao,
- unicode_script_tibetan,
- unicode_script_myanmar,
- unicode_script_georgian,
- unicode_script_hangul,
- unicode_script_ethiopic,
- unicode_script_cherokee,
- unicode_script_canadian_aboriginal,
- unicode_script_ogham,
- unicode_script_runic,
- unicode_script_khmer,
- unicode_script_mongolian,
- unicode_script_hiragana,
- unicode_script_katakana,
- unicode_script_bopomofo,
- unicode_script_han,
- unicode_script_yi,
- unicode_script_old_italic,
- unicode_script_gothic,
- unicode_script_deseret,
- unicode_script_inherited,
- unicode_script_tagalog,
- unicode_script_hanunoo,
- unicode_script_buhid,
- unicode_script_tagbanwa,
- unicode_script_limbu,
- unicode_script_tai_le,
- unicode_script_linear_b,
- unicode_script_ugaritic,
- unicode_script_shavian,
- unicode_script_osmanya,
- unicode_script_cypriot,
- unicode_script_braille,
- unicode_script_buginese,
- unicode_script_coptic,
- unicode_script_new_tai_lue,
- unicode_script_glagolitic,
- unicode_script_tifinagh,
- unicode_script_syloti_nagri,
- unicode_script_old_persian,
- unicode_script_kharoshthi,
- unicode_script_balinese,
- unicode_script_cuneiform,
- unicode_script_phoenician,
- unicode_script_phags_pa,
- unicode_script_nko,
- unicode_script_sundanese,
- unicode_script_lepcha,
- unicode_script_ol_chiki,
- unicode_script_vai,
- unicode_script_saurashtra,
- unicode_script_kayah_li,
- unicode_script_rejang,
- unicode_script_lycian,
- unicode_script_carian,
- unicode_script_lydian,
- unicode_script_cham,
- unicode_script_tai_tham,
- unicode_script_tai_viet,
- unicode_script_avestan,
- unicode_script_egyptian_hieroglyphs,
- unicode_script_samaritan,
- unicode_script_lisu,
- unicode_script_bamum,
- unicode_script_javanese,
- unicode_script_meetei_mayek,
- unicode_script_imperial_aramaic,
- unicode_script_old_south_arabian,
- unicode_script_inscriptional_parthian,
- unicode_script_inscriptional_pahlavi,
- unicode_script_old_turkic,
- unicode_script_kaithi,
- unicode_script_batak,
- unicode_script_brahmi,
- unicode_script_mandaic,
- unicode_script_chakma,
- unicode_script_meroitic_cursive,
- unicode_script_meroitic_hieroglyphs,
- unicode_script_miao,
- unicode_script_sharada,
- unicode_script_sora_sompeng,
- unicode_script_takri,
- unicode_script_caucasian_albanian,
- unicode_script_bassa_vah,
- unicode_script_duployan,
- unicode_script_elbasan,
- unicode_script_grantha,
- unicode_script_pahawh_hmong,
- unicode_script_khojki,
- unicode_script_linear_a,
- unicode_script_mahajani,
- unicode_script_manichaean,
- unicode_script_mende_kikakui,
- unicode_script_modi,
- unicode_script_mro,
- unicode_script_old_north_arabian,
- unicode_script_nabataean,
- unicode_script_palmyrene,
- unicode_script_pau_cin_hau,
- unicode_script_old_permic,
- unicode_script_psalter_pahlavi,
- unicode_script_siddham,
- unicode_script_khudawadi,
- unicode_script_tirhuta,
- unicode_script_warang_citi,
- unicode_script_ahom,
- unicode_script_anatolian_hieroglyphs,
- unicode_script_hatran,
- unicode_script_multani,
- unicode_script_old_hungarian,
- unicode_script_signwriting
-
- /* UNICODE_SCRIPT_T */
+
+#include <courier-unicode-script-tab.h>
+
} unicode_script_t;
/*