Last active
August 29, 2015 14:01
-
-
Save nobu/3fa2cd4f0bd7362c35d5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# Usage:
#   $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
#   $ ruby CaseFolding.rb CaseFolding.txt > ../enc/unicode/casefold.h
# Formats a sequence of code points as a comma-separated list of C hex
# literals, each zero-padded to at least four digits.
#
# @param v [Array<Integer>] code points to format
# @return [String] e.g. "0x0069, 0x0307"; "" for an empty array
def hex_seq(v)
  v.map { |i| format("0x%04x", i) }.join(", ")
end
# Prints one C array initializer to standard output.
#
# Entries are emitted in ascending key order. Each value is written as
# "{length, {code points}}". A key holding more than one code point is
# written as a brace-enclosed list; any other key as a single hex literal.
#
# @param table [String] C declaration text placed before "[]"
# @param data [Hash] maps a key (Integer or Array<Integer>) to an
#   Array<Integer> of code points
# @return [nil]
def print_table(table, data)
  print("static const #{table}[] = {\n")
  data.sort.each do |k, v|
    # Normalize Integer keys and Array keys to one shape.
    codes = Array(k)
    sk = codes.length > 1 ? "{#{hex_seq(codes)}}" : hex_seq(codes)
    print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
  end
  print("};\n\n")
end
# Parses a CaseFolding.txt-style file into folding and unfolding tables.
#
# Only lines whose status field is C, F or T match the pattern; every other
# line (comments, blank lines, other statuses) is skipped.
#
# C and F lines populate +fold+ (source code point => array of target code
# points) and +unfold+ (target array => list of source code points, bucketed
# by target length 1, 2 or 3). T lines only mark their source code point as
# locale dependent: once the whole file is read, the C/F mapping recorded for
# each such code point is moved out of +fold+/+unfold+ into
# +fold_locale+/+unfold_locale+.
#
# @param filename [String] path of the case folding data file
# @return [Hash] with keys :fold, :fold_locale, :unfold, :unfold_locale
def load_case_folding_data(filename)
  pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/

  fold = {}
  unfold = [{}, {}, {}]
  turkic = []

  # File.foreach (not IO.foreach) so the argument is always treated as a path.
  File.foreach(filename) do |line|
    res = pattern.match(line)
    next unless res

    ch_from = res[1].to_i(16)

    if res[2] == 'T'
      # Turkic case folding: remember the source, ignore the T mapping itself.
      turkic << ch_from
      next
    end

    # Groups 3..5 hold one to three target code points; unused ones are nil.
    ch_to = res.captures[2..-1].take_while { |hex| hex }.map { |hex| hex.to_i(16) }

    # store folding data
    fold[ch_from] = ch_to

    # store unfolding data, bucketed by the number of target code points
    (unfold[ch_to.length - 1][ch_to] ||= []) << ch_from
  end

  # move locale dependent data to (un)fold_locale
  fold_locale = {}
  unfold_locale = [{}, {}]
  turkic.each do |ch_from|
    key = fold[ch_from]
    # Nothing to move when the code point has no C/F mapping left.
    next unless key

    i = key.length - 1
    # Single-code-point targets are keyed by the bare Integer.
    unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
    fold_locale[ch_from] = fold.delete(ch_from)
  end

  {fold: fold, fold_locale: fold_locale, unfold: unfold, unfold_locale: unfold_locale}
end
# Prints the complete generated C header to standard output: a banner, the
# fold tables, the unfold tables, and the table size macros.
#
# All four keywords are required; they have the shapes returned by
# load_case_folding_data.
#
# @param fold [Hash] locale-independent folding table
# @param fold_locale [Hash] locale-dependent folding table
# @param unfold [Array<Hash>] unfolding tables, indexed 0..2
# @param unfold_locale [Array<Hash>] locale-dependent unfolding tables,
#   indexed 0..1
# @return [nil]
def print_case_folding_data(fold:, fold_locale:, unfold:, unfold_locale:)
  # print the header
  print("/* DO NOT EDIT THIS FILE. */\n")
  print("/* Generated by tool/CaseFolding.rb */\n\n")

  # print folding data
  print_table("CaseFold_11_Type CaseFold", fold)
  print_table("CaseFold_11_Type CaseFold_Locale", fold_locale)

  # print unfolding data
  print_table("CaseUnfold_11_Type CaseUnfold_11", unfold[0])
  print_table("CaseUnfold_11_Type CaseUnfold_11_Locale", unfold_locale[0])
  print_table("CaseUnfold_12_Type CaseUnfold_12", unfold[1])
  print_table("CaseUnfold_12_Type CaseUnfold_12_Locale", unfold_locale[1])
  print_table("CaseUnfold_13_Type CaseUnfold_13", unfold[2])

  # table sizes: entry counts scaled by a per-table factor; the Float
  # product is handed to %d, which prints only its integer part
  fold_table_size = fold.size + fold_locale.size
  printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
  unfold1_table_size = unfold[0].size + unfold_locale[0].size
  printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
  unfold2_table_size = unfold[1].size + unfold_locale[1].size
  printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
  unfold3_table_size = unfold[2].size
  printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
end
# Script entry point: loads the case folding data from +filename+ and prints
# the generated tables to standard output.
#
# @param filename [String] path of the case folding data file
def main(filename = 'CaseFolding.txt')
  data = load_case_folding_data(filename)
  print_case_folding_data(**data)
end
# Run only when this file is executed directly, not when it is loaded by
# another script; command-line arguments are passed through to main.
main(*ARGV) if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment