Skip to content

Instantly share code, notes, and snippets.

@nebuta
Created November 8, 2011 08:55
Show Gist options
  • Save nebuta/1347317 to your computer and use it in GitHub Desktop.
Save nebuta/1347317 to your computer and use it in GitHub Desktop.
Test algorithm with 1d dictionary
require 'rubygems'
require 'hpricot'
# Byte values that isAscii? treats as ASCII: the printable range 0x20..0x7e
# followed by tab (0x09), line feed (0x0a), form feed (0x0c) and carriage
# return (0x0d).
$asciilist = (0x20..0x7e).to_a + [0x09, 0x0a, 0x0c, 0x0d]
# Tells whether +b+ is one of the byte values listed in the global
# $asciilist.
#
# @param b [Object] value to look up (callers pass a byte value)
# @return [Boolean] true when some entry of $asciilist equals +b+
def isAscii?(b)
  $asciilist.any? { |code| code == b }
end
# Parses the first line of +lines+ as a tab-separated row of numbers.
#
# Only lines[0] is read; any further lines are ignored. Each field is
# converted with String#to_f, so non-numeric fields become 0.0.
#
# @param lines [Array<String>] lines of a vector file (e.g. from IO.readlines)
# @return [Array<Float>] the values of the first line
def parse1d(lines)
  fields = lines[0].chomp.split("\t")
  fields.map(&:to_f)
end
# Guesses the encoding of a byte sequence using the per-encoding weight
# vectors stored in the global $vector1d.
#
# Bytes for which isAscii? is true are skipped. For every other byte, the
# score of each encoding is increased by $vector1d[encoding][byte]. Scanning
# stops as soon as +max_chars+ non-ASCII bytes have been scored, or when the
# input is exhausted. One line holding the four scores (formatted "%.1f",
# tab-separated, ordered by encoding name: eucjp, iso, shiftjis, utf8) is
# written to +out+.
#
# Unlike the previous loop, this never reads past the end of +bytes+, so
# inputs with fewer than +max_chars+ non-ASCII bytes (including empty input)
# return a result instead of raising TypeError on a nil byte.
#
# @param out [#puts] sink for the score line
# @param bytes [Array<Integer>] byte values (main passes text.unpack('C*'))
# @param max_chars [Integer] number of non-ASCII bytes to score before
#   stopping; values below 1 behave like 1
# @return [Array(String, Integer, Integer)] name of the highest-scoring
#   encoding, number of non-ASCII bytes scored, and index of the last byte
#   examined (-1 when +bytes+ is empty)
def test_judge_1d(out, bytes, max_chars = 100)
  score = { :utf8 => 0, :eucjp => 0, :iso => 0, :shiftjis => 0 }
  ch_count = 0
  last_index = -1

  bytes.each_with_index do |byte, index|
    last_index = index
    next if isAscii? byte

    score.each_key { |key| score[key] += $vector1d[key][byte] }
    ch_count += 1
    break if ch_count >= max_chars
  end

  # Keys are symbols; compare their string forms to get a stable column order.
  out.puts score.sort_by { |key, _| key.to_s }.map { |_, value| "%.1f" % value }.join("\t")

  # Highest score wins (last element of the ascending sort).
  best = score.to_a.sort { |a, b| a[1] <=> b[1] }.last
  return best[0].to_s, ch_count, last_index
end
# Compares a detected encoding name with the charset declared by a document.
#
# +answer+ is matched (case-insensitively, anywhere in the string) against
# the known charset spellings, in the order listed below; the first match
# decides which detector name is expected.
#
# @param res [String] detector result: "shiftjis", "eucjp", "iso" or "utf8"
# @param answer [String, nil] declared charset, e.g. "Shift_JIS" or "UTF-8"
# @return [String] "OK" when +res+ is the expected name, "NG" when it is
#   not, and "N/A" when +answer+ matches none of the known charsets
def judge(res, answer)
  expectations = [
    [/shift[_-]jis/i, "shiftjis"],
    [/euc[_-]jp/i,    "eucjp"],
    [/ISO-2022-JP/i,  "iso"],
    [/UTF-8/i,        "utf8"]
  ]

  # Regexp#=== is what case/when uses, so non-string answers simply fail to match.
  matched = expectations.find { |pattern, _| pattern === answer }
  return "N/A" unless matched

  res == matched[1] ? "OK" : "NG"
end
# Runs the 1-D dictionary evaluation over every web/web*.html file.
#
# Loads the four per-encoding weight vectors into the global $vector1d, then
# for each matching HTML file: parses it with Hpricot, drops <script>
# elements, takes the body text, and reads the declared charset from the
# <head> markup. When a charset is found, the body text is dumped to
# "<file>.txt", scored with test_judge_1d, and a result line (detected
# encoding, declared charset, judge verdict, scored byte count, last index)
# followed by a blank line is appended to result1d.txt. Files without a
# charset match print "skipped"; any StandardError raised while handling a
# file is printed and the run continues with the next file.
#
# Side effects: sets $vector1d, changes the process working directory to
# "web", writes result1d.txt (opened before the directory change) and one
# "<file>.txt" per processed file (written after it).
def main
  $vector1d = Hash.new
  [:utf8, :eucjp, :iso, :shiftjis].each do |encoding|
    $vector1d[encoding] = parse1d(IO.readlines("vector_#{encoding}_1d.txt"))
  end

  # Block form closes result1d.txt even if Dir.chdir or the glob raises.
  File.open('result1d.txt', 'w') do |out|
    Dir.chdir("web")
    Dir.glob("web*.html").each do |file|
      begin
        # Parse inside the block so the HTML file handle is not left open.
        doc = File.open(file) { |html| Hpricot(html) }
        (doc/'script').remove
        text = (doc/:body).inner_text

        # Note: the pattern needs a quote character after the charset value.
        (doc/:head).inner_html =~ /charset=['"]?(.+?)['"]/
        # Capture immediately so later calls cannot disturb the match data.
        answer = $1

        $stderr.puts File.basename(file)
        if answer
          File.open(file + ".txt", "w") { |dump| dump.print text }
          res, ch, index = test_judge_1d(out, text.unpack('C*'))
          out.puts [res, answer, judge(res, answer), ch, index].join("\t")
          out.puts
        else
          puts "skipped"
        end
      rescue => e
        puts "Error: " + File.basename(file)
        p e
      end
    end
  end
end
# Script entry point: the evaluation runs as soon as this file is executed.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment