package mkcommon; use 5.012002; use strict; use warnings; require Exporter; our @ISA = qw(Exporter); # Common code used to create Unicode lookup tables. # # Generates C code that declares a bunch of arrays. # # The 'rangetab' array's structure is [firstchar, lastchar], and the # 'classtab' array has the same size containing "class", giving the # associated "value" for unicode character range firstchar-lastchar. # # The ranges are sorted in numerical order, but rangetab stores the least # singificant byte of the 32-bit Unicode character (firstchar and lastchar). # The leading bytes of both firstchar and lastchar are the same. # # In this manner, the Unicode data gets divided into 256 character blocks. # # The "starting_indextab" array enumerates which 256 character blocks have # any data in the "rangetab" array. 256 characters that don't wind up # with any data get skipped entirely. # # The "starting_pagetab" array is the starting index in the "rangetab" # array for the corresponding 256 character block. "starting_indextab" and # "starting_pagetab" arrays have the same size. # # "starting_indextab" is sorted, a binary search finds the start of the # 256 character block containing the character, via "starting_pagetab". # # The end of the 256 character block in "rangetab" is given by the # starting index of the next 256 character block, or the end of the "rangetab" # array. # # A binary search is done to locate the range containing the given character, # and the associated value from "classtab" gets returned. # Items to export into callers namespace by default. Note: do not export # names by default without a very good reason. Use EXPORT_OK instead. # Do not simply export all your public functions/methods/constants. # This allows declaration use mkcommon ':all'; # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK # will save memory. our %EXPORT_TAGS = ( 'all' => [ qw( ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = qw( ); our $VERSION = '0.01'; my $BLOCK_SIZE=256; # Preloaded methods go here. sub new { my ($this, %params) = @_; my $class = ref($this) || $this; my $self = \%params; bless $self, $class; $self->{'char_array'}=[]; $self->{'char_class'}=[]; $self->{'char_start'}=[0]; $self->{'last_block'}=-1; $self->{'last'}=""; $self->{'last_f'}=0; $self->{'last_l'}=0; $self->{"classtype"} //= "uint8_t"; $self->{"prefix"} //= "unicode"; return $self; } sub _doemit_block { my $this=shift; my $f=shift; my $l=shift; push @{$this->{'char_array'}}, [$f, $l]; push @{$this->{'char_class'}}, $this->{'last'}; } sub _doemit_endblock { my $this=shift; push @{$this->{'char_start'}}, $#{$this->{'char_array'}}+1; } # _doemit invokes _doemit_block() for each unicode char range with a given # linebreaking class. However, once a unicode char range starts in a different # $BLOCK_SIZE character class, call _doemit_endblock() before calling _doemit_block(). # # If a single unicode char range crosses a $BLOCK_SIZE character class boundary, # split it at the boundary; call _doemit_endblock() to finish the current $BLOCK_SIZE # char boundary, call _doemit_endblock(), then call _doemit_block() for the # rest of the char range. sub _doemit { my $this=shift; $this->_doemit_endblock() if int($this->{'last_f'} / $BLOCK_SIZE) != $this->{'last_block'} && $this->{'last_block'} != -1; if (int($this->{'last_f'} / $BLOCK_SIZE) != int($this->{'last_l'} / $BLOCK_SIZE)) { while (int($this->{'last_f'} / $BLOCK_SIZE) != int($this->{'last_l'} / $BLOCK_SIZE)) { my $n=int($this->{'last_f'} / $BLOCK_SIZE) * $BLOCK_SIZE + ($BLOCK_SIZE-1); $this->_doemit_block($this->{'last_f'}, $n); $this->_doemit_endblock(); $this->{'last_f'}=$n+1; } } $this->_doemit_block($this->{'last_f'}, $this->{'last_l'}); $this->{'last_block'}=int($this->{'last_l'} / $BLOCK_SIZE); } # # Coalesce adjacent unicode char blocks that have the same linebreaking # property. Invoke _doemit() for the accumulate unicode char range once # a range with a different linebreaking class is seen. sub range { my $this=shift; my $f=shift; my $l=shift; my $t=shift // 'NONE'; if ($this->{'last_l'} + 1 == $f && $this->{'last'} eq $t) { $this->{'last_l'}=$l; return; } $this->_doemit() if $this->{'last'}; # New linebreaking class $this->{'last_f'}=$f; $this->{'last_l'}=$l; $this->{'last'}=$t; } sub output { my $this=shift; $this->_doemit(); # Emit last linebreaking unicode char range class my $prefix = $this->{"prefix"}; print "static const uint8_t ${prefix}_rangetab[][2]={\n"; my $comma="\t"; my $modulo=sprintf("0x%X", $BLOCK_SIZE-1); foreach ( @{$this->{'char_array'}} ) { print "${comma}{0x" . sprintf("%04x", $$_[0]) . " & $modulo, 0x" . sprintf("%04x", $$_[1]) . " & $modulo}"; $comma=",\n\t"; } print "};\n\n"; unless ($this->{noclass}) { print "static const " . $this->{classtype} . " ${prefix}_classtab[]={\n"; $comma="\t"; foreach ( @{$this->{'char_class'}} ) { print "${comma}$_"; $comma=",\n\t"; } print "};\n\n"; } my $prev_block=-1; my @starting_indextab; my @starting_pagetab; foreach my $sp (@{$this->{'char_start'}}) { my $block=int($this->{'char_array'}->[$sp]->[0] / $BLOCK_SIZE); if ($block != $prev_block) { push @starting_indextab, $block; push @starting_pagetab, $sp; $prev_block=$block; } } print "static const size_t ${prefix}_starting_indextab[]={\n"; $comma="\t"; foreach (@starting_indextab) { print "$comma$_"; $comma=",\n\t"; } print "\n};\n\nstatic const char32_t ${prefix}_starting_pagetab[]={\n"; $comma="\t"; foreach (@starting_pagetab) { my $sp=$_; print "$comma$sp"; $comma=",\n\t"; } print "\n};\n\n"; } 1;