Skip to content

Instantly share code, notes, and snippets.

@nebuta
Created November 8, 2011 07:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nebuta/1347240 to your computer and use it in GitHub Desktop.
Save nebuta/1347240 to your computer and use it in GitHub Desktop.
Build byte-pair frequency tables for Japanese text encodings (Shift_JIS, UTF-8, ISO-2022-JP, EUC-JP)
require 'rubygems'
require 'hpricot'
require 'iconv'
# Byte-pair counters, one table per encoding key. Each table has 65536
# slots, all starting at 0; analyze indexes them by
# first_byte * 256 + second_byte.
$vector = {}
[:utf8, :shiftjis, :iso, :eucjp].each do |encoding|
  $vector[encoding] = Array.new(65536, 0)
end
# Directory the script was started from; assigned in main and used by
# print_vector as the output directory.
$pwd = ""
# Rescales every vector in $vector, in place, so that its Euclidean length
# becomes 65536. All elements are converted to Floats.
#
# A vector that holds no counts (all zeros, or empty) has length 0 and
# cannot be scaled. It is left as zeros rather than divided by zero, which
# would fill it with NaN and make the "%d" formatting in print_vector raise.
def normalize
  norm = 65536
  $vector.each_value do |counts|
    length = Math.sqrt(counts.inject(0) { |sum, e| sum + e * e })
    if length.zero?
      # Nothing was counted for this key: keep zeros, just as Floats.
      counts.map! { |e| e.to_f }
    else
      counts.map! { |e| e.to_f * norm / length }
    end
  end
end
# Dumps every vector in $vector to a text file.
#
# For each key, the sum of the vector's elements is printed to stdout with
# p, then "vector_<key>.txt" is written as 256 lines of 256 tab-separated
# values, each formatted with "%d".
#
# Side effect: the working directory is changed to $pwd, so the files are
# created there.
def print_vector
  Dir.chdir($pwd)
  $vector.each do |name, values|
    p values.inject(0) { |total, e| total + e }
    File.open("vector_#{name}.txt", 'w') do |out|
      256.times do |row_index|
        # One output line per block of 256 consecutive slots.
        cells = values[row_index * 256, 256]
        out.puts(cells.map { |e| "%d" % e }.join("\t"))
      end
    end
  end
end
# Running count of files handed to read_aozora; it numbers the progress
# lines that read_aozora writes to stderr.
$filecount = 0
# Adds the adjacent-byte-pair counts of one string to $vector[enc].
#
# enc  - key into $vector selecting the table to update.
# text - indexable collection whose first element is the string to scan;
#        only text[0] is read (read_aozora passes the result of
#        Iconv.iconv here).
#
# Every pair of consecutive bytes (first, second) increments slot
# first * 256 + second. A string shorter than two bytes adds nothing.
def analyze(enc, text)
  text[0].unpack('C*').each_cons(2) do |first, second|
    $vector[enc][first * 256 + second] += 1
  end
end
# Reads one HTML file and adds its byte-pair counts to every table.
#
# file - path of the HTML file to read.
#
# Steps:
#   1. Bump $filecount and log "<count>: <basename>" to stderr.
#   2. Replace each <ruby>...</ruby> span with the contents of its <rb>
#      element, then extract the document's inner text with Hpricot.
#   3. For each target encoding, convert the text with Iconv (the source
#      encoding passed to Iconv is always "SHIFT_JIS") and hand the result
#      to analyze.
#
# A failure while converting or analyzing one encoding skips only that
# encoding for this file and processing continues (best effort). The
# failure is reported on stderr instead of being discarded silently, so
# missing counts can be traced back to their cause.
def read_aozora(file)
  $filecount += 1
  $stderr.puts "#{$filecount}: #{File.basename(file)}"
  html = IO.read(file).gsub(/<ruby>.*?<rb>(.+?)<\/rb>.*?<\/ruby>/, '\1')
  text = Hpricot(html).inner_text
  # [key into $vector, Iconv target encoding], in processing order.
  targets = [
    [:shiftjis, "SHIFT_JIS"],
    [:utf8,     "UTF-8"],
    [:iso,      "ISO-2022-JP"],
    [:eucjp,    "EUC-JP"]
  ]
  targets.each do |enc, target|
    begin
      analyze(enc, Iconv.iconv(target, "SHIFT_JIS", text))
    rescue StandardError => e
      $stderr.puts "  #{target} skipped: #{e.class}: #{e.message}"
    end
  end
end
# Drives the whole run: records the starting directory in $pwd, switches
# into ./database, passes every *.html file found there to read_aozora,
# then normalizes the accumulated vectors and writes them out with
# print_vector.
def main
  $pwd = Dir.pwd
  Dir.chdir("database")
  html_files = Dir.glob("*.html")
  html_files.each { |path| read_aozora(path) }
  normalize
  print_vector
end
# Run the whole pipeline when this file is executed.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment