diff options
Diffstat (limited to 'unicode/mkcommon.pm')
| -rw-r--r-- | unicode/mkcommon.pm | 80 |
1 files changed, 62 insertions, 18 deletions
diff --git a/unicode/mkcommon.pm b/unicode/mkcommon.pm index dc2fa99..091a219 100644 --- a/unicode/mkcommon.pm +++ b/unicode/mkcommon.pm @@ -8,6 +8,38 @@ require Exporter; our @ISA = qw(Exporter); +# Common code used to create Unicode lookup tables. +# +# Generates C code that declares a bunch of arrays. +# +# The 'rangetab' array's structure is [firstchar, lastchar], and the +# 'classtab' array has the same size containing "class", giving the +# associated "value" for unicode character range firstchar-lastchar. +# +# The ranges are sorted in numerical order, but rangetab stores the least +# singificant byte of the 32-bit Unicode character (firstchar and lastchar). +# The leading bytes of both firstchar and lastchar are the same. +# +# In this manner, the Unicode data gets divided into 256 character blocks. +# +# The "starting_indextab" array enumerates which 256 character blocks have +# any data in the "rangetab" array. 256 characters that don't wind up +# with any data get skipped entirely. +# +# The "starting_pagetab" array is the starting index in the "rangetab" +# array for the corresponding 256 character block. "starting_indextab" and +# "starting_pagetab" arrays have the same size. +# +# "starting_indextab" is sorted, a binary search finds the start of the +# 256 character block containing the character, via "starting_pagetab". +# +# The end of the 256 character block in "rangetab" is given by the +# starting index of the next 256 character block, or the end of the "rangetab" +# array. +# +# A binary search is done to locate the range containing the given character, +# and the associated value from "classtab" gets returned. + # Items to export into callers namespace by default. Note: do not export # names by default without a very good reason. Use EXPORT_OK instead. # Do not simply export all your public functions/methods/constants. @@ -136,8 +168,6 @@ sub output { $this->_doemit(); # Emit last linebreaking unicode char range class - $this->_doemit_endblock(); # End of the most recent $BLOCK_SIZE char range class - print "static const uint8_t unicode_rangetab[][2]={\n"; my $comma="\t"; @@ -164,32 +194,46 @@ sub output { print "};\n\n"; - print "static const size_t unicode_indextab[]={\n"; + my $prev_block=-1; - $comma="\t"; + my @starting_indextab; + my @starting_pagetab; - my $prev_block=-1; - foreach (@{$this->{'char_start'}}) + foreach my $sp (@{$this->{'char_start'}}) { - my $sp=$_; - my $cnt=1; + my $block=int($this->{'char_array'}->[$sp]->[0] / $BLOCK_SIZE); - if ($sp <= $#{$this->{'char_array'}}) + if ($block != $prev_block) { - my $block=int($this->{'char_array'}->[$sp]->[0] / $BLOCK_SIZE); - - $cnt = $block - $prev_block; + push @starting_indextab, $block; + push @starting_pagetab, $sp; $prev_block=$block; } + } - foreach (1..$cnt) - { - print "$comma$sp"; - $comma=",\n\t"; - } + print "static const size_t unicode_starting_indextab[]={\n"; + + $comma="\t"; + + foreach (@starting_indextab) + { + print "$comma$_"; + $comma=",\n\t"; } - print "};\n\n"; + print "\n};\n\nstatic const char32_t unicode_starting_pagetab[]={\n"; + + $comma="\t"; + + foreach (@starting_pagetab) + { + my $sp=$_; + + print "$comma$sp"; + $comma=",\n\t"; + } + + print "\n};\n\n"; } 1; |
