Skip to content

Instantly share code, notes, and snippets.

@veer66
Last active August 29, 2015 14:06
Show Gist options
  • Save veer66/8443e76a3524fa790970 to your computer and use it in GitHub Desktop.
Save veer66/8443e76a3524fa790970 to your computer and use it in GitHub Desktop.
Dictionary merger for Apertium
require "nokogiri"
require "pp"
include Nokogiri
# Compares two XML dictionary files and prints every entry of the first
# (source) file that has no identical counterpart in the second (target) file.
#
# Entries are the `e` elements found under `section`. Each entry is keyed by
# the flattened text of its first `l` element, or of its first `i` element
# when it has no `l`.
class Extra
  # Converts one child node of an `l`/`i` element to text.
  #
  # @param t [#element?, #name, #text] a child node
  # @return [String] a single space for a `b` element, the node text otherwise
  def child2txt(t)
    (t.element? && t.name == "b") ? " " : t.text
  end

  # Flattens an `l`/`i` element into the string used as the dictionary key.
  #
  # @param i [#children] the element whose children are concatenated
  # @return [String] the joined text of all children
  def il2txt(i)
    i.children.map { |t| child2txt(t) }.join
  end

  # Picks the element that identifies an entry.
  #
  # @param e [#css] an entry element
  # @return [Object, nil] the first `l` descendant, else the first `i`
  #   descendant, else nil when the entry has neither
  def get_il(e)
    e.css("l").first || e.css("i").first
  end

  # Parses a dictionary file and groups its entries by key text.
  #
  # @param fn [String] path of the XML file to read
  # @return [Hash{String => Array}] key text mapped to the entries sharing it,
  #   in document order; entries without an `l` or `i` element are skipped
  def get_dix(fn)
    dix = {}
    # Block form guarantees the handle is closed even if parsing raises.
    File.open(fn) do |f|
      doc = Nokogiri::XML(f)
      doc.css("section e").each do |e|
        il = get_il(e)
        next if il.nil?

        (dix[il2txt(il)] ||= []) << e
      end
    end
    dix
  end

  # Prints (to stdout) each source entry whose serialized form does not occur
  # among the target entries stored under the same key.
  #
  # @param src [Hash{String => Array<#to_s>}] entries of the source file
  # @param tgt [Hash{String => Array<#to_s>}] entries of the target file
  # @return [Hash] src
  def find_entries(src, tgt)
    src.each do |txt, e_list|
      # A key missing from tgt behaves like an empty list: everything prints.
      known = tgt.fetch(txt, []).each_with_object({}) do |e, seen|
        seen[e.to_s] = true
      end

      e_list.each do |e|
        xml = e.to_s
        puts xml unless known.key?(xml)
      end
    end
  end

  # Command-line entry point: expects exactly two paths in ARGV
  # (source, target). Prints a usage line to stderr and exits with status 1
  # otherwise.
  def run_cmd
    if ARGV.length != 2
      $stderr.puts "Usage: ruby #{$0} <source> <target>"
      exit 1
    end

    src = get_dix(ARGV[0])
    tgt = get_dix(ARGV[1])
    find_entries(src, tgt)
  end
end
# Script entry point: build the comparer and run it on the command-line arguments.
Extra.new.run_cmd
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment