# nebuta/encode_test.rb

## encode_test.rb
require 'rubygems'
require 'hpricot'

# Byte values treated as ASCII text by isAscii?: the range 0x20-0x7e
# followed by tab, line feed, form feed and carriage return.
$asciilist = (0x20..0x7e).to_a + [0x09, 0x0a, 0x0c, 0x0d]

# Builds a flat lookup table from tab-separated rows of numbers.
#
# Row r of +lines+ is stored at indices r*256 .. r*256+255, so the entry for
# a (row, column) pair lives at row*256 + column. Each cell is converted with
# String#to_f, so an empty cell becomes 0.0.
#
# Every row occupies exactly 256 slots: trailing empty cells are kept (as
# 0.0), cells after the 256th are dropped, and cells that are absent from the
# line are left nil. This keeps later rows at their expected offsets even
# when a line is shorter or longer than 256 cells.
#
# lines:: Array of Strings, one table row per element.
#
# Returns an Array with at least 65536 entries (Float or nil); it only grows
# beyond that when more than 256 lines are given.
def parse(lines)
	table = Array.new(65536)
	lines.each_with_index{|line, row|
		# A limit of -1 keeps trailing empty fields, which split drops by default.
		cells = line.chomp.split("\t", -1).first(256).map{|e| e.to_f}
		# Pad short rows so the slice assignment never resizes the table.
		cells.concat(Array.new(256 - cells.length))
		table[row * 256, 256] = cells
	}
	table
end

# Returns true when the byte value +b+ is listed in the global $asciilist
# table, false otherwise.
def isAscii?(b)
	$asciilist.member?(b)
end

# Writes the accumulated scores to +out+ as one tab-separated line (ordered
# by encoding name, one decimal place each) and returns the name of the
# highest-scoring encoding as a String. Ties are decided by the sort and are
# not otherwise specified.
def report_scores(out,score)
	sorted = score.to_a.sort{|a,b| a[1] <=> b[1]}.reverse
	out.puts score.to_a.sort{|a,b| a[0].to_s <=> b[0].to_s }.map{|a| "%.1f"%a[1]}.join("\t")
	sorted[0][0].to_s
end

# Scores a byte sequence against the per-encoding tables in the global
# $vector hash and reports which encoding scored highest.
#
# Bytes are scanned for runs of values that isAscii? rejects. For every pair
# of adjacent bytes inside such a run, the table entry at
# first_byte*256 + second_byte of each encoding is added to that encoding's
# score. Scanning stops at the end of +bytes+ or as soon as +limit+ pairs
# have been scored (the limit is checked after a pair is scored, so at least
# one pair is always counted when one exists).
#
# out::   object responding to +puts+; receives the score line.
# bytes:: Array of Integer byte values.
# limit:: maximum number of byte pairs to score (default 100).
#
# Returns [encoding_name, pairs_scored, index], where +index+ is the scan
# position at which processing stopped.
def test_judge(out,bytes,limit=100)
	len = bytes.length
	score = {:utf8=>0, :eucjp=>0, :iso=>0, :shiftjis=>0}
	i = 0
	ch_count = 0
	while i < len
		i += 1
		next if isAscii? bytes[i-1]
		# bytes[i-1] opens a non-ASCII run: score each adjacent pair in it.
		while i < len
			break if isAscii? bytes[i]
			pair = bytes[i-1]*256 + bytes[i]
			score.each_key{|key| score[key] += $vector[key][pair] }
			ch_count += 1
			i += 1
			return report_scores(out,score), ch_count, i if ch_count >= limit
		end
	end
	return report_scores(out,score), ch_count, i
end

# Compares a detected encoding name with the charset a page declared.
#
# res::    detected encoding name ("shiftjis", "eucjp", "iso" or "utf8").
# answer:: declared charset text; matched case-insensitively against the
#          known charset patterns, first match wins.
#
# Returns "OK" when +res+ is the name belonging to the matched pattern,
# "NG" when it is not, and "N/A" when +answer+ matches no known pattern.
def judge(res,answer)
	known = [
		[/shift[_-]jis/i, "shiftjis"],
		[/euc[_-]jp/i,    "eucjp"],
		[/ISO-2022-JP/i,  "iso"],
		[/UTF-8/i,        "utf8"]
	]
	hit = known.find{|pattern, _| pattern === answer }
	return "N/A" unless hit
	res == hit[1] ? "OK" : "NG"
end

# Loads the score tables, then classifies every web*.html file in the "web"
# directory and records the outcome in result.txt.
#
# For each page the <script> elements are removed, the body text is written
# to "<file>.txt", and the charset found in the page head is compared (via
# judge) with the encoding chosen by test_judge. Pages without a matching
# charset declaration print "skipped"; an error on a single page is printed
# and does not stop the run.
#
# Side effects: sets the globals $vector and $vectorascii, writes result.txt
# in the starting directory and one .txt file per processed page in "web".
def main
	$vector = Hash.new
	$vectorascii = parse(IO.readlines("vector_ascii.txt"))
	$vector[:utf8] = parse(IO.readlines("sub_utf8.txt"))
	$vector[:eucjp] = parse(IO.readlines("sub_eucjp.txt"))
	$vector[:iso] = parse(IO.readlines("sub_iso.txt"))
	$vector[:shiftjis] = parse(IO.readlines("sub_shiftjis.txt"))

	# The block forms close result.txt and restore the working directory even
	# when something inside raises.
	File.open('result.txt','w'){|out|
		Dir::chdir("web"){
			Dir::glob("web*.html").each{|file|
				begin
					# Parse inside the block so the page's file handle is closed
					# rather than left open for every page.
					doc = File.open(file){|f| Hpricot(f) }
					(doc/'script').remove
					text = (doc/:body).inner_text
					# Take the capture directly instead of reading $1 later, so a
					# match left over from an earlier page can never be reused.
					# to_s guards against inner_html yielding nil.
					answer = (doc/:head).inner_html.to_s[/charset=['"]?(.+?)['"]/, 1]
					$stderr.puts File.basename(file)
					if answer
						File.open(file+".txt","w"){|o|
							o.print text
						}
						res, ch, index = test_judge(out,text.unpack('C*'))
						out.puts([res,answer,judge(res,answer),ch,index].join("\t"))
						out.puts
					else
						puts "skipped"
					end
				rescue => e
					puts "Error: " + File.basename(file)
					p e
				end
			}
		}
	}
end

# Run the evaluation when this script is executed.
main
	require 'rubygems'
	require 'hpricot'

	# Byte values treated as ASCII text by isAscii?: the range 0x20-0x7e
	# followed by tab, line feed, form feed and carriage return.
	$asciilist = (0x20..0x7e).to_a | [0x09,0x0a,0x0c,0x0d]

	# Builds a flat lookup table from tab-separated rows of numbers.
	#
	# Row r of +lines+ is stored at indices r*256 .. r*256+255, so the entry
	# for a (row, column) pair lives at row*256 + column. Each cell is
	# converted with String#to_f, so an empty cell becomes 0.0.
	#
	# Every row occupies exactly 256 slots: trailing empty cells are kept (as
	# 0.0), cells after the 256th are dropped, and cells that are absent from
	# the line are left nil. This keeps later rows at their expected offsets
	# even when a line is shorter or longer than 256 cells.
	#
	# lines:: Array of Strings, one table row per element.
	#
	# Returns an Array with at least 65536 entries (Float or nil); it only
	# grows beyond that when more than 256 lines are given.
	def parse(lines)
		table = Array.new(65536)
		lines.each_with_index{|line, row|
			# A limit of -1 keeps trailing empty fields, which split drops by default.
			cells = line.chomp.split("\t", -1).first(256).map{|e| e.to_f}
			# Pad short rows so the slice assignment never resizes the table.
			cells.concat(Array.new(256 - cells.length))
			table[row * 256, 256] = cells
		}
		table
	end

	# Returns true when the byte value +b+ is listed in the global $asciilist
	# table, false otherwise.
	def isAscii?(b)
		$asciilist.member?(b)
	end

	# Writes the accumulated scores to +out+ as one tab-separated line (ordered
	# by encoding name, one decimal place each) and returns the name of the
	# highest-scoring encoding as a String. Ties are decided by the sort and
	# are not otherwise specified.
	def report_scores(out,score)
		sorted = score.to_a.sort{|a,b| a[1] <=> b[1]}.reverse
		out.puts score.to_a.sort{|a,b| a[0].to_s <=> b[0].to_s }.map{|a| "%.1f"%a[1]}.join("\t")
		sorted[0][0].to_s
	end

	# Scores a byte sequence against the per-encoding tables in the global
	# $vector hash and reports which encoding scored highest.
	#
	# Bytes are scanned for runs of values that isAscii? rejects. For every
	# pair of adjacent bytes inside such a run, the table entry at
	# first_byte*256 + second_byte of each encoding is added to that
	# encoding's score. Scanning stops at the end of +bytes+ or as soon as
	# +limit+ pairs have been scored (the limit is checked after a pair is
	# scored, so at least one pair is always counted when one exists).
	#
	# out::   object responding to +puts+; receives the score line.
	# bytes:: Array of Integer byte values.
	# limit:: maximum number of byte pairs to score (default 100).
	#
	# Returns [encoding_name, pairs_scored, index], where +index+ is the scan
	# position at which processing stopped.
	def test_judge(out,bytes,limit=100)
		len = bytes.length
		score = {:utf8=>0, :eucjp=>0, :iso=>0, :shiftjis=>0}
		i = 0
		ch_count = 0
		while i < len
			i += 1
			next if isAscii? bytes[i-1]
			# bytes[i-1] opens a non-ASCII run: score each adjacent pair in it.
			while i < len
				break if isAscii? bytes[i]
				pair = bytes[i-1]*256 + bytes[i]
				score.each_key{|key| score[key] += $vector[key][pair] }
				ch_count += 1
				i += 1
				return report_scores(out,score), ch_count, i if ch_count >= limit
			end
		end
		return report_scores(out,score), ch_count, i
	end

	# Compares a detected encoding name with the charset a page declared.
	#
	# res::    detected encoding name ("shiftjis", "eucjp", "iso" or "utf8").
	# answer:: declared charset text; matched case-insensitively against the
	#          known charset patterns, first match wins.
	#
	# Returns "OK" when +res+ is the name belonging to the matched pattern,
	# "NG" when it is not, and "N/A" when +answer+ matches no known pattern.
	def judge(res,answer)
		known = [
			[/shift[_-]jis/i, "shiftjis"],
			[/euc[_-]jp/i,    "eucjp"],
			[/ISO-2022-JP/i,  "iso"],
			[/UTF-8/i,        "utf8"]
		]
		hit = known.find{|pattern, _| pattern === answer }
		return "N/A" unless hit
		res == hit[1] ? "OK" : "NG"
	end

	# Loads the score tables, then classifies every web*.html file in the
	# "web" directory and records the outcome in result.txt.
	#
	# For each page the <script> elements are removed, the body text is
	# written to "<file>.txt", and the charset found in the page head is
	# compared (via judge) with the encoding chosen by test_judge. Pages
	# without a matching charset declaration print "skipped"; an error on a
	# single page is printed and does not stop the run.
	#
	# Side effects: sets the globals $vector and $vectorascii, writes
	# result.txt in the starting directory and one .txt file per processed
	# page in "web".
	def main
		$vector = Hash.new
		$vectorascii = parse(IO.readlines("vector_ascii.txt"))
		$vector[:utf8] = parse(IO.readlines("sub_utf8.txt"))
		$vector[:eucjp] = parse(IO.readlines("sub_eucjp.txt"))
		$vector[:iso] = parse(IO.readlines("sub_iso.txt"))
		$vector[:shiftjis] = parse(IO.readlines("sub_shiftjis.txt"))

		# The block forms close result.txt and restore the working directory
		# even when something inside raises.
		File.open('result.txt','w'){|out|
			Dir::chdir("web"){
				Dir::glob("web*.html").each{|file|
					begin
						# Parse inside the block so the page's file handle is closed
						# rather than left open for every page.
						doc = File.open(file){|f| Hpricot(f) }
						(doc/'script').remove
						text = (doc/:body).inner_text
						# Take the capture directly instead of reading $1 later, so a
						# match left over from an earlier page can never be reused.
						# to_s guards against inner_html yielding nil.
						answer = (doc/:head).inner_html.to_s[/charset=['"]?(.+?)['"]/, 1]
						$stderr.puts File.basename(file)
						if answer
							File.open(file+".txt","w"){|o|
								o.print text
							}
							res, ch, index = test_judge(out,text.unpack('C*'))
							out.puts([res,answer,judge(res,answer),ch,index].join("\t"))
							out.puts
						else
							puts "skipped"
						end
					rescue => e
						puts "Error: " + File.basename(file)
						p e
					end
				}
			}
		}
	end

	# Run the evaluation when this script is executed.
	main