-
-
Save timothyandrew/8f257bda963b5f6adc1b to your computer and use it in GitHub Desktop.
Reduce the size of NormalizationTest.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Produces a reduced copy of the Unicode NormalizationTest.txt conformance file.
#
# The source file is split into its four parts, Part 1 is cut down to at most
# SAMPLE_SIZE cases per Unicode block (blocks are read from Blocks.txt), Parts 2
# and 3 are cut down to at most SAMPLE_SIZE random cases each, and the result is
# written to final.txt together with an explanatory header.
#
# All three files are resolved relative to the current working directory.
class Norm_Splitter
  SOURCE_FILE = 'NormalizationTest.txt'.freeze
  BLOCKS_FILE = 'Blocks.txt'.freeze
  OUTPUT_FILE = 'final.txt'.freeze

  # Upper bound on the cases kept per block (Part 1) and per part (Parts 2, 3).
  SAMPLE_SIZE = 10

  # Explanatory header written at the top of the output file.
  HEADER = <<-'TEXT'.freeze
# ----------------------------------------------------------------------------------------------------
# PLEASE NOTE
# This is NOT the entirety of NormalizationTest.txt. Since tests took long to run on the entire thing,
# we've cut down the size of Part1 to about 10 cases per block (as defined in Blocks.txt).
# Parts 2 & 3 have been cut down by selecting 10 random cases from each. Part 0 has been left intact.
#
# If you want to test against the entire NormalizationTest.txt, you can download it and replace this
# file with it. It is available for download in the Downloads section of the gem, as well as at
# this URL: http://unicode.org/Public/UNIDATA/NormalizationTest.txt
# ----------------------------------------------------------------------------------------------------
# NormalizationTest-6.1.0.txt
# Date: 2011-11-27, 05:10:33 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2011 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Normalization Test Suite
# Format:
#
# Columns (c1, c2,...) are separated by semicolons
# They have the following meaning:
# source; NFC; NFD; NFKC; NFKD
# Comments are indicated with hash marks
# Each of the columns may have one or more code points.
#
# CONFORMANCE:
# 1. The following invariants must be true for all conformant implementations
#
# NFC
# c2 == toNFC(c1) == toNFC(c2) == toNFC(c3)
# c4 == toNFC(c4) == toNFC(c5)
#
# NFD
# c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
# c5 == toNFD(c4) == toNFD(c5)
#
# NFKC
# c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4) == toNFKC(c5)
#
# NFKD
# c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4) == toNFKD(c5)
#
# 2. For every code point X assigned in this version of Unicode that is not specifically
# listed in Part 1, the following invariants must be true for all conformant
# implementations:
#
# X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
#
  TEXT

  # Reads SOURCE_FILE and stores the data lines of each part in @part0..@part3.
  #
  # A part ends at the line containing the heading of the next part. Lines
  # starting with '#' or '@' are dropped. The last part ends at a line
  # containing "END OF FILE" or, if there is none, at the end of the file.
  #
  # Raises EOFError when the heading of Part 1, 2 or 3 is never found.
  def split_into_parts
    # Block form guarantees the handle is closed even if reading raises.
    File.open(SOURCE_FILE, 'r') do |norm_test|
      @part0 = read_section(norm_test, '@Part1 # Character by character test')
      @part1 = read_section(norm_test, '@Part2 # Canonical Order Test')
      @part2 = read_section(norm_test, '@Part3 # PRI #29 Test')
      @part3 = read_section(norm_test, 'END OF FILE', true)
    end
  end

  # Parses BLOCKS_FILE into an array of Integer ranges, one per block.
  #
  # Each non-empty, non-comment line is expected to start with a hexadecimal
  # "START..END" pair followed by ';'.
  def get_block_ranges
    entries = File.readlines(BLOCKS_FILE).map(&:chomp)
    entries = entries.reject { |entry| entry.empty? || entry.start_with?('#') }
    entries.map do |entry|
      bounds = entry.split(';').first.split('..').map(&:hex)
      Range.new(bounds.first, bounds.last)
    end
  end

  # Shrinks the parts held in @part1..@part3 (Part 0 is left unchanged).
  #
  # Part 1 becomes a flat list holding, for every block, up to SAMPLE_SIZE
  # randomly chosen lines whose first column falls inside that block. Blocks
  # without any matching line contribute nothing, so no blank lines end up in
  # the output. Parts 2 and 3 become random samples of up to SAMPLE_SIZE lines.
  def process_parts
    # Parse the first column once instead of once per block.
    keyed_lines = @part1.map { |line| [line.split(';').first.hex, line] }

    @part1 = get_block_ranges.flat_map do |range|
      in_block = keyed_lines.select { |code, _line| range.cover?(code) }
      in_block.map(&:last).sample(SAMPLE_SIZE).uniq
    end

    @part2 = @part2.sample(SAMPLE_SIZE)
    @part3 = @part3.sample(SAMPLE_SIZE)
  end

  # Writes HEADER followed by the four parts, each under its heading, to
  # OUTPUT_FILE. The file is closed when the block exits.
  def write_file
    File.open(OUTPUT_FILE, 'w') do |final|
      final.puts HEADER
      write_section(final, "#\n@Part0: # Specific Cases\n#", @part0)
      write_section(final, "#\n@Part1 # Character by character test\n# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.\n#", @part1)
      write_section(final, "#\n@Part2 # Canonical Order Test\n#", @part2)
      write_section(final, "#\n@Part3 # PRI #29 Test\n#", @part3)
    end
  end

  # Runs the whole pipeline: read, sample, write.
  # Sets the process-wide default external encoding to UTF-8 first.
  def split
    Encoding.default_external = Encoding::UTF_8
    split_into_parts
    process_parts
    write_file
  end

  private

  # Collects lines from io up to (not including) the first line that contains
  # terminator, skipping lines that start with '#' or '@'.
  #
  # When the terminator is never found, returns what was collected if eof_ok
  # is true and raises EOFError otherwise.
  def read_section(io, terminator, eof_ok = false)
    kept = []
    while (line = io.gets)
      return kept if line.include?(terminator)
      kept << line unless line.start_with?('#', '@')
    end
    return kept if eof_ok

    raise EOFError, "#{SOURCE_FILE}: end of file reached before #{terminator.inspect}"
  end

  # Writes a section heading followed by its data lines to io.
  def write_section(io, heading, lines)
    io.puts heading
    lines.each { |line| io.puts line }
  end
end
# Script entry point: build a splitter and run the read / sample / write pipeline.
splitter = Norm_Splitter.new
splitter.split
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment