diff options
Diffstat (limited to 'unicode/mkcanonical.pl')
| -rw-r--r-- | unicode/mkcanonical.pl | 110 |
1 files changed, 110 insertions, 0 deletions
#! /usr/bin/perl
#
# Creates a lookup table for canonical mappings in UnicodeData.txt
#
# Reads UnicodeData.txt from the current directory and prints C source to
# standard output:
#
#   HASH_SIZE           number of hash buckets
#   canon_map_hash[]    for each bucket, the index in canon_map_lookup[] of
#                       that bucket's first entry
#   canon_map_lookup[]  one initializer per character that has a mapping,
#                       grouped by bucket: {code point, formatting tag,
#                       number of mapped values, index of the first mapped
#                       value in canon_map_values[]}
#   canon_map_values[]  the mapped values of every character, concatenated
#
# A character is placed in bucket (code point % HASH_SIZE).  HASH_SIZE is
# grown until no bucket holds more than $max_bucket_len entries.

use strict;
use warnings;

my $input_file = "UnicodeData.txt";

# Largest number of entries allowed to share one hash bucket.
my $max_bucket_len = 3;

open(my $fh, "<", $input_file)
    or die "Cannot open $input_file: $!\n";

my @mappings;   # every mapped value, as hex strings, in input order
my @data;       # one [numeric code point, C initializer text] per character

while (defined(my $line = <$fh>))
{
    my @w = split(/;/, $line, -1);

    # Field 5 holds the mapping; characters without one are skipped.
    next unless $w[5];

    my $code = $w[0];

    # The code point is pasted into the generated C as 0x$code, so insist
    # that it really is a hexadecimal number.
    die "Malformed code point \"$code\" in $input_file\n"
        unless $code =~ /\A[0-9A-Fa-f]+\z/;

    my @mapping = split(/\s/, $w[5]);

    my $formatting_tag = "UNICODE_CANONICAL_FMT_NONE";

    # An optional leading <tag> becomes UNICODE_CANONICAL_FMT_TAG.
    if ($mapping[0] =~ /^</)
    {
        $formatting_tag = shift @mapping;
        $formatting_tag =~ tr/<>//d;
        $formatting_tag = "UNICODE_CANONICAL_FMT_" . uc($formatting_tag);
    }

    die "Too long\n" if (scalar @mapping) > 0xFFFF;

    # scalar(@mappings) is taken before this character's values are appended,
    # so it is the index of its first value in canon_map_values[].
    push @data, [hex($code), "\t{0x$code, (unsigned char)$formatting_tag, "
                 . (scalar @mapping) . ", "
                 . scalar(@mappings) . "}" ];
    push @mappings, @mapping;
}

close($fh);

# canon_map_hash[] is emitted as unsigned short and stores indexes into
# canon_map_lookup[], so the number of entries must fit in 16 bits.
die "Too many entries\n" if (scalar @data) > 0xFFFF;

# Initial guess for the number of buckets; never zero, because it is used
# as a modulus below.
my $hash_size = int((scalar @data) * 3 / 4);
$hash_size = 1 if $hash_size < 1;

my %buckets;

my $keep_going = 1;

# Distribute the entries; whenever a bucket overflows, enlarge the table by
# one and start over from scratch.
while ($keep_going)
{
    %buckets = ();

    $keep_going = 0;

    foreach my $m (@data)
    {
        my $bucket = $m->[0] % $hash_size;

        push @{$buckets{$bucket}}, $m;

        if ((scalar @{$buckets{$bucket}}) > $max_bucket_len)
        {
            $keep_going = 1;
            ++$hash_size;
            last;
        }
    }
}

# Lay the entries out bucket by bucket, recording where each bucket starts.
my @bucket_starts;
my @ordered;

foreach my $bucket (0 .. $hash_size - 1)
{
    push @bucket_starts, scalar @ordered;
    push @ordered, @{ $buckets{$bucket} // [] };
}

print "#define HASH_SIZE $hash_size\n";

print "static const unsigned short canon_map_hash[]={\n";
print join(",\n", map { "\t$_" } @bucket_starts);

print "};\n\nstatic const struct canon_map_table canon_map_lookup[]={\n";
print join(",\n", map { $_->[1] } @ordered);

print "\n};\n\nstatic const char32_t canon_map_values[]={\n";
print join(",\n", map { "\t0x$_" } @mappings);
print "};\n";
