Skip to content

Instantly share code, notes, and snippets.

@mikecarroll
Last active December 21, 2015 17:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.
Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.
# Normalizes a pipe-delimited Uzbek dictionary dump into one line per headword.
#
# Every non-empty input row is read as `headword[ pos.]|definition`. Consecutive
# rows that share the same headword (compared after the trailing part-of-speech
# token has been removed) are merged into a single output line:
#
#   <b>latin / cyrillic</b> <i>pos.</i> def, def; <i>pos2.</i> def
#
# Paths default to the original hard-coded locations and may be overridden with
# the UZBEK_INPUT and UZBEK_OUTPUT environment variables.
#
# Prints a progress count every 1000 rows and the totals when finished.
desc "Normalize Uzbek file."
task :uzbek => :environment do
  require 'csv' # loaded lazily so unrelated rake tasks do not pay for it

  input_path  = ENV['UZBEK_INPUT']  || "/Users/woodchip/UE.txt"
  output_path = ENV['UZBEK_OUTPUT'] || "/Users/woodchip/fixed_UE.txt"

  counter = 0       # non-empty rows processed
  entry_counter = 0 # merged headword lines written

  # Builds the "latin / cyrillic" headword text from an entry whose
  # part-of-speech suffix has already been stripped.
  format_headword = lambda do |entry|
    # What remains once the `\w` characters are deleted is used as the Cyrillic half.
    cyrillic = entry.gsub(/\w/u, '').strip
    # What remains once runs of U+0400..U+04FF characters and digits are deleted
    # is used as the Latin half.
    latin = entry.gsub(/[Ѐ-ӿ\d]+\b/i, '').strip
    # Drop the punctuation fragments matched below: a leading `!`, `?` or `/`,
    # a trailing `/` or `'`, dashes/quotes/commas/`!`/`?` adjacent to whitespace,
    # and parenthesised groups holding only characters between space and `/`.
    "#{latin} / #{cyrillic}".gsub(/(^\!|\s\-|\-\s|\s\'|^\/|\/$|\'\s|\s\’|\’\s|\s\,|\,\s|^\?|\'$|\s\!|\s\?)|\([\ -\/]*\)/, '').strip
  end

  # Block form guarantees the output file is closed even if a row raises.
  File.open(output_path, 'w') do |new_file|
    group_key = nil # entry text of the group being collected; nil until the first row
    headword = nil  # formatted headword belonging to group_key
    senses = {}     # "<i>pos</i>" => [definitions], kept in first-seen order

    # Writes one merged line for a finished group and counts it.
    write_group = lambda do |title, grouped|
      rendered = grouped.map { |tag, definitions| "#{tag} #{definitions.join(', ')}" }
      new_file << "<b>#{title}</b> #{rendered.join('; ')}\n"
      entry_counter += 1
    end

    # Options are passed as keywords; a positional Hash is not converted to
    # keyword arguments on Ruby 3+.
    CSV.foreach(input_path, col_sep: "|") do |row|
      next if row.empty? # blank input line

      raw_entry = row[0].to_s.strip
      # A definition that itself contained "|" was split into several columns;
      # glue every remaining column back together (nil columns are skipped).
      definition = row.drop(1).compact.join.strip

      pos = raw_entry[/\w*\.$/].to_s
      entry = raw_entry.gsub(/\w*\.$/, '').strip

      # A new headword closes the previous group *before* the current row is
      # added, so each definition is written under its own headword.
      if group_key != entry
        write_group.call(headword, senses) unless group_key.nil?
        group_key = entry
        headword = format_headword.call(entry)
        senses = {}
      end

      (senses["<i>#{pos}</i>"] ||= []) << definition

      counter += 1
      p counter if counter % 1000 == 0
    end

    # The last group has no following row to trigger its write.
    write_group.call(headword, senses) unless group_key.nil?
  end

  p "Done! ENTRIES COUNT: #{entry_counter}"
  p "Done! TOTAL COUNT: #{counter}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment