diff options
Diffstat (limited to 'unicode/mkcommon.pm')
| -rw-r--r-- | unicode/mkcommon.pm | 80 | 
1 files changed, 62 insertions, 18 deletions
| diff --git a/unicode/mkcommon.pm b/unicode/mkcommon.pm index dc2fa99..091a219 100644 --- a/unicode/mkcommon.pm +++ b/unicode/mkcommon.pm @@ -8,6 +8,38 @@ require Exporter;  our @ISA = qw(Exporter); +# Common code used to create Unicode lookup tables. +# +# Generates C code that declares a bunch of arrays. +# +# The 'rangetab' array's structure is [firstchar, lastchar], and the +# 'classtab' array has the same size containing "class", giving the +# associated "value" for unicode character range firstchar-lastchar. +# +# The ranges are sorted in numerical order, but rangetab stores the least +# significant byte of the 32-bit Unicode character (firstchar and lastchar). +# The leading bytes of both firstchar and lastchar are the same. +# +# In this manner, the Unicode data gets divided into 256 character blocks. +# +# The "starting_indextab" array enumerates which 256 character blocks have +# any data in the "rangetab" array. 256 character blocks that don't wind up +# with any data get skipped entirely. +# +# The "starting_pagetab" array is the starting index in the "rangetab" +# array for the corresponding 256 character block. "starting_indextab" and +# "starting_pagetab" arrays have the same size. +# +# "starting_indextab" is sorted, a binary search finds the start of the +# 256 character block containing the character, via "starting_pagetab". +# +# The end of the 256 character block in "rangetab" is given by the +# starting index of the next 256 character block, or the end of the "rangetab" +# array. +# +# A binary search is done to locate the range containing the given character, +# and the associated value from "classtab" gets returned. +  # Items to export into callers namespace by default. Note: do not export  # names by default without a very good reason. Use EXPORT_OK instead.  # Do not simply export all your public functions/methods/constants. 
@@ -136,8 +168,6 @@ sub output {      $this->_doemit();  # Emit last linebreaking unicode char range class -    $this->_doemit_endblock(); # End of the most recent $BLOCK_SIZE char range class -      print "static const uint8_t unicode_rangetab[][2]={\n";      my $comma="\t"; @@ -164,32 +194,46 @@ sub output {      print "};\n\n"; -    print "static const size_t unicode_indextab[]={\n"; +    my $prev_block=-1; -    $comma="\t"; +    my @starting_indextab; +    my @starting_pagetab; -    my $prev_block=-1; -    foreach (@{$this->{'char_start'}}) +    foreach my $sp (@{$this->{'char_start'}})      { -	my $sp=$_; -	my $cnt=1; +	my $block=int($this->{'char_array'}->[$sp]->[0] / $BLOCK_SIZE); -	if ($sp <= $#{$this->{'char_array'}}) +	if ($block != $prev_block)  	{ -	    my $block=int($this->{'char_array'}->[$sp]->[0] / $BLOCK_SIZE); - -	    $cnt = $block - $prev_block; +	    push @starting_indextab, $block; +	    push @starting_pagetab, $sp;  	    $prev_block=$block;  	} +    } -	foreach (1..$cnt) -	{ -	    print "$comma$sp"; -	    $comma=",\n\t"; -	} +    print "static const size_t unicode_starting_indextab[]={\n"; + +    $comma="\t"; + +    foreach (@starting_indextab) +    { +	print "$comma$_"; +	$comma=",\n\t";      } -    print "};\n\n"; +    print "\n};\n\nstatic const char32_t unicode_starting_pagetab[]={\n"; + +    $comma="\t"; + +    foreach (@starting_pagetab) +    { +	my $sp=$_; + +	print "$comma$sp"; +	$comma=",\n\t"; +    } + +    print "\n};\n\n";  }  1; | 
