Skip to content

Instantly share code, notes, and snippets.

@nebuta
Created November 8, 2011 08:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nebuta/1347302 to your computer and use it in GitHub Desktop.
Save nebuta/1347302 to your computer and use it in GitHub Desktop.
Test algorithm
require 'rubygems'
require 'hpricot'
# Byte values treated as plain ASCII: 0x20..0x7e followed by tab (0x09),
# line feed (0x0a), form feed (0x0c) and carriage return (0x0d).
$asciilist = Array(0x20..0x7e) + [0x09, 0x0a, 0x0c, 0x0d]
# Builds a flat 256x256 lookup table from tab-separated text rows.
#
# lines - Array of Strings, one per table row. Each row is chomped, split on
#         tabs, and every cell is converted with String#to_f.
#
# Row r is written starting at index r * 256. At most 256 cells are taken
# from a row, and only as many slots as the row actually supplies are
# overwritten, so a short or over-long row can no longer resize the table
# and shift every following row. Slots that never receive a value stay nil.
#
# Returns the table as an Array (65536 slots for up to 256 rows).
def parse(lines)
  arr = Array.new(65536)
  lines.each_with_index do |line, row|
    cells = line.chomp.split("\t").first(256).map { |e| e.to_f }
    # Replace exactly cells.length slots: a slice assignment whose length
    # differs from the replacement would grow or shrink the array.
    arr[row * 256, cells.length] = cells
  end
  arr
end
# Reports whether +b+ is one of the values listed in the global $asciilist.
#
# b - the value to look up (callers in this file pass Integer byte values).
#
# Returns true when $asciilist contains b, false otherwise.
def isAscii?(b)
  $asciilist.member?(b)
end
# Scores a byte sequence against the per-encoding pair tables in $vector and
# reports the best-scoring encoding.
#
# Every run of consecutive non-ASCII bytes (as decided by isAscii?) is walked
# as overlapping byte pairs (previous byte, current byte). For each pair the
# value at index previous * 256 + current of $vector[key] is added to that
# key's score, for the keys :utf8, :eucjp, :iso and :shiftjis. Scanning stops
# at the end of the input or once max_pairs pairs have been scored.
#
# out       - object responding to #puts; receives one line holding the four
#             scores formatted with "%.1f", tab-separated, ordered by key name.
# bytes     - Array of Integer byte values.
# max_pairs - maximum number of byte pairs to score (default 100, the limit
#             that used to be hard-coded).
#
# Returns a three-element Array: the name of the highest-scoring key as a
# String, the number of pairs scored, and the index at which scanning stopped.
def test_judge(out, bytes, max_pairs = 100)
  score = Hash.new
  score[:utf8] = 0
  score[:eucjp] = 0
  score[:iso] = 0
  score[:shiftjis] = 0
  len = bytes.length
  i = 0
  ch_count = 0
  while i < len && ch_count < max_pairs
    i += 1
    # A pair can only start on a non-ASCII byte.
    next if isAscii?(bytes[i - 1])
    while i < len && ch_count < max_pairs
      # An ASCII byte ends the current run; the outer loop skips past it.
      break if isAscii?(bytes[i])
      index = bytes[i - 1] * 256 + bytes[i]
      score.each_key { |key| score[key] += $vector[key][index] }
      ch_count += 1
      i += 1
    end
  end
  # Single exit path: ascending sort by score, so the winner is the last entry.
  best = score.to_a.sort { |a, b| a[1] <=> b[1] }.last
  out.puts score.to_a.sort { |a, b| a[0].to_s <=> b[0].to_s }.map { |a| "%.1f" % a[1] }.join("\t")
  [best[0].to_s, ch_count, i]
end
# Compares a detected encoding label with the charset a page declares.
#
# res    - label produced by the detector ("shiftjis", "eucjp", "iso", "utf8").
# answer - declared charset text; matched case-insensitively and unanchored
#          against shift_jis/shift-jis, euc_jp/euc-jp, ISO-2022-JP and UTF-8,
#          in that order.
#
# Returns "OK" when res equals the label for the declared charset, "NG" when
# it does not, and "N/A" when the declared charset matches none of the
# patterns.
def judge(res, answer)
  expected =
    case answer
    when /shift[_-]jis/i then "shiftjis"
    when /euc[_-]jp/i    then "eucjp"
    when /ISO-2022-JP/i  then "iso"
    when /UTF-8/i        then "utf8"
    end

  return "N/A" unless expected

  res == expected ? "OK" : "NG"
end
# Runs the detector over every web*.html file in the "web" directory and
# records the outcome in result.txt.
#
# Loads the score tables into the globals $vectorascii and $vector (keys
# :utf8, :eucjp, :iso, :shiftjis) from text files in the current directory,
# opens result.txt there, then changes into "web". For each matching file it
# strips <script> elements, takes the body text, and looks for a quoted
# charset=... declaration in the head. Files with a declaration get their
# body text written to "<file>.txt" and a result line (detected label,
# declared charset, judge verdict, pairs scored, stop index) appended to
# result.txt after the score line written by test_judge; other files print
# "skipped". Errors raised while handling one file are printed and do not
# stop the run.
def main
  $vector = Hash.new
  $vectorascii = parse(IO.readlines("vector_ascii.txt"))
  $vector[:utf8] = parse(IO.readlines("sub_utf8.txt"))
  $vector[:eucjp] = parse(IO.readlines("sub_eucjp.txt"))
  $vector[:iso] = parse(IO.readlines("sub_iso.txt"))
  $vector[:shiftjis] = parse(IO.readlines("sub_shiftjis.txt"))
  # Block form closes result.txt even when something below raises.
  File.open('result.txt', 'w') do |out|
    Dir.chdir("web")
    Dir.glob("web*.html").each do |file|
      begin
        # Parse inside the block so the HTML file handle is closed afterwards.
        doc = File.open(file) { |html| Hpricot(html) }
        (doc/'script').remove
        text = (doc/:body).inner_text
        # A local capture instead of $1, so a value left over from a
        # previous file can never be picked up.
        answer = (doc/:head).inner_html.to_s[/charset=['"]?(.+?)['"]/, 1]
        $stderr.puts File.basename(file)
        if answer
          File.open(file + ".txt", "w") { |o| o.print text }
          res, ch, index = test_judge(out, text.unpack('C*'))
          out.puts [res, answer, judge(res, answer), ch, index].join("\t")
          out.puts
        else
          puts "skipped"
        end
      rescue => e
        puts "Error: " + File.basename(file)
        p e
      end
    end
  end
end
# Script entry point: main is invoked as soon as this file is loaded.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment