Skip to content

Instantly share code, notes, and snippets.

@nobu
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nobu/3fa2cd4f0bd7362c35d5 to your computer and use it in GitHub Desktop.
Save nobu/3fa2cd4f0bd7362c35d5 to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby
# Usage:
# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
# $ ruby CaseFolding.rb CaseFolding.txt > ../enc/unicode/casefold.h
# Renders a sequence of code points as C hex literals.
#
# v:: enumerable of Integers.
#
# Returns a String in which every element is written as "0x" followed by at
# least four lowercase hex digits, the elements separated by ", ".
def hex_seq(v)
  literals = v.map { |code| "0x%04x" % code }
  literals.join(", ")
end
# Writes +data+ to standard output as a C array initializer.
#
# table:: text placed between "static const " and "[]", i.e. the element
#         type followed by the array name.
# data::  collection of key/value pairs, emitted in sorted order. Each value
#         is a list of code points; each key is either a single code point
#         or a list of code points.
def print_table(table, data)
  print("static const #{table}[] = {\n")
  data.sort.each do |key, codes|
    # Only an Array key with more than one element becomes a braced list;
    # every other key is formatted as one hex literal.
    key_text =
      if Array === key && key.length > 1
        "{#{hex_seq(key)}}"
      else
        "0x%04x" % key
      end
    print(" {#{key_text}, {#{codes.length}, {#{hex_seq(codes)}}}},\n")
  end
  print("};\n\n")
end
# Parses a Unicode CaseFolding.txt file into folding and unfolding tables.
#
# Only lines whose status field is C, F or T and that list one to three
# target code points are used; every other line is ignored.
#
# filename:: path of the file to read.
#
# Returns a Hash with these Symbol keys:
# fold::          {source code point => [folded code points]}
# fold_locale::   same shape, holding the C/F entries of every source code
#                 point that also has a T line
# unfold::        three Hashes, indexed by (number of folded code points - 1),
#                 each {[folded code points] => [source code points]}
# unfold_locale:: two Hashes holding the entries removed from unfold for the
#                 T sources; the first one is keyed by the bare code point
#                 instead of a one-element Array
#
# Raises a SystemCallError (e.g. Errno::ENOENT) when the file cannot be read.
def load_case_folding_data(filename)
  pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/
  fold = {}
  unfold = [{}, {}, {}]
  turkic = []
  # File.foreach, unlike IO.foreach, never interprets a name that starts
  # with "|" as a command to execute.
  File.foreach(filename) do |line|
    next unless (res = pattern.match(line))

    ch_from = res[1].to_i(16)
    if res[2] == 'T'
      # Turkic case folding: only remember the source code point here; its
      # C/F entry is moved to the locale tables after the whole file is read.
      turkic << ch_from
      next
    end
    # store folding data: captures 3..5 are the targets, missing ones are nil
    ch_to = res.captures.drop(2).take_while { |hex| hex }.map { |hex| hex.to_i(16) }
    fold[ch_from] = ch_to
    # store unfolding data, bucketed by the number of folded code points
    (unfold[ch_to.length - 1][ch_to] ||= []) << ch_from
  end
  # move locale dependent data to (un)fold_locale
  fold_locale = {}
  unfold_locale = [{}, {}]
  turkic.each do |ch_from|
    key = fold[ch_from]
    i = key.length - 1
    unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
    fold_locale[ch_from] = fold.delete(ch_from)
  end
  {fold: fold, fold_locale: fold_locale, unfold: unfold, unfold_locale: unfold_locale}
end
# Prints the generated C header to standard output: a banner comment, the
# fold and unfold tables, and the table size macros.
#
# All four keywords are required (a missing one raises ArgumentError):
# fold::          entries of the CaseFold table
# fold_locale::   entries of the CaseFold_Locale table
# unfold::        three collections, for CaseUnfold_11, _12 and _13
# unfold_locale:: two collections, for CaseUnfold_11_Locale and _12_Locale
def print_case_folding_data(fold:, fold_locale:, unfold:, unfold_locale:)
  # print the header
  print("/* DO NOT EDIT THIS FILE. */\n")
  print("/* Generated by tool/CaseFolding.rb */\n\n")
  # print folding data
  # CaseFold
  print_table("CaseFold_11_Type CaseFold", fold)
  # CaseFold_Locale
  print_table("CaseFold_11_Type CaseFold_Locale", fold_locale)
  # print unfolding data
  # CaseUnfold_11
  print_table("CaseUnfold_11_Type CaseUnfold_11", unfold[0])
  # CaseUnfold_11_Locale
  print_table("CaseUnfold_11_Type CaseUnfold_11_Locale", unfold_locale[0])
  # CaseUnfold_12
  print_table("CaseUnfold_12_Type CaseUnfold_12", unfold[1])
  # CaseUnfold_12_Locale
  print_table("CaseUnfold_12_Type CaseUnfold_12_Locale", unfold_locale[1])
  # CaseUnfold_13
  print_table("CaseUnfold_13_Type CaseUnfold_13", unfold[2])
  # table sizes: each macro is the combined entry count of a table and its
  # locale counterpart scaled by a fixed factor; %d drops the fraction.
  fold_table_size = fold.size + fold_locale.size
  printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
  unfold1_table_size = unfold[0].size + unfold_locale[0].size
  printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
  unfold2_table_size = unfold[1].size + unfold_locale[1].size
  printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
  unfold3_table_size = unfold[2].size
  printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
end
# Entry point of the generator: loads the case folding data from +filename+
# (default 'CaseFolding.txt') and prints the resulting header.
def main(filename = 'CaseFolding.txt')
  print_case_folding_data(**load_case_folding_data(filename))
end
# Run main with the command-line arguments only when this file is the
# program being executed, not when it is loaded by another script.
main(*ARGV) if $PROGRAM_NAME == __FILE__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment