nebuta/aozora.rb

## aozora.rb
require 'rubygems'
require 'hpricot'
require 'iconv'

# Byte-pair histograms, one per target encoding. Every histogram has
# 256 * 256 slots; analyze indexes them by first_byte * 256 + second_byte.
$vector = {}
[:utf8, :shiftjis, :iso, :eucjp].each do |encoding|
	$vector[encoding] = Array.new(65536, 0)
end

# Directory the script was started from; main overwrites this placeholder.
$pwd = ""

# Scales every histogram in $vector in place so that its Euclidean length
# becomes 65536. All entries end up as Floats.
#
# A histogram without any counts has length zero; dividing by that would
# turn every entry into NaN, and formatting NaN with "%d" (as print_vector
# does) raises FloatDomainError. Such a histogram is kept as all 0.0.
def normalize
	norm = 65536
	$vector.each_value do |counts|
		sqsum = counts.inject(0) { |acc, e| acc + e * e }
		factor = Math.sqrt(sqsum)
		if factor.zero?
			counts.map! { 0.0 }
		else
			counts.map! { |e| e.to_f * norm / factor }
		end
	end
end

# Changes back to the directory stored in $pwd. Then, for every histogram
# in $vector, prints the sum of its entries with p and writes the histogram
# to "vector_<key>.txt" as 256 lines of 256 tab-separated integers.
def print_vector
	Dir.chdir($pwd)
	$vector.each do |key, counts|
		p counts.inject(0) { |total, e| total + e }
		File.open("vector_" + key.to_s + '.txt', 'w') do |out|
			256.times do |row_index|
				# One output line per leading byte value.
				cells = counts[row_index * 256, 256].map { |e| "%d" % e }
				out.puts(cells.join("\t"))
			end
		end
	end
end

# Number of files handled so far; read_aozora increments it and prints it
# as part of its progress line.
$filecount = 0
# Counts every pair of adjacent bytes of text[0] in the histogram
# $vector[enc]. A pair (first, second) increments slot first * 256 + second.
#
# enc  - key into $vector
# text - array whose first element is the string to scan
def analyze(enc, text)
	bytes = text[0].unpack('C*')
	(0..(bytes.length - 2)).each do |pos|
		slot = bytes[pos] * 256 + bytes[pos + 1]
		$vector[enc][slot] += 1
	end
end

# Processes one HTML file: bumps $filecount, prints a progress line to
# stderr, replaces each <ruby>...</ruby> group by the contents of its <rb>
# element, extracts the plain text with Hpricot and feeds that text to
# analyze once per target encoding.
#
# The text is converted with Iconv from SHIFT_JIS, so the input file is
# presumably Shift_JIS encoded -- confirm against the data set.
#
# file - path of the HTML file to read
#
# A conversion that raises is skipped (best effort), but the skip is
# reported on stderr so that a histogram built from fewer files than
# the others does not go unnoticed.
def read_aozora(file)
	$filecount = $filecount + 1
	name = File.basename(file)
	$stderr.puts $filecount.to_s + ": " + name
	# File.read, unlike IO.read, never interprets a leading "|" in the
	# file name as a command to run.
	html = File.read(file).gsub(/<ruby>.*?<rb>(.+?)<\/rb>.*?<\/ruby>/,'\1')
	text = Hpricot(html).inner_text
	# [histogram key, iconv target name], in the original processing order.
	targets = [
		[:shiftjis, "SHIFT_JIS"],
		[:utf8, "UTF-8"],
		[:iso, "ISO-2022-JP"],
		[:eucjp, "EUC-JP"]
	]
	targets.each do |enc, charset|
		begin
			analyze(enc, Iconv.iconv(charset, "SHIFT_JIS", text))
		rescue StandardError => e
			$stderr.puts "  skipped " + enc.to_s + " for " + name + ": " + e.class.to_s
		end
	end
end

# Entry point: stores the start directory in $pwd, changes into
# "database", passes every *.html file found there to read_aozora, then
# normalizes the histograms and writes them out.
def main
	$pwd = Dir.pwd
	Dir.chdir("database")
	html_files = Dir.glob("*.html")
	html_files.each do |path|
		read_aozora(path)
	end
	normalize
	print_vector
end

# Run the whole analysis as soon as this file is executed.
main
	require 'rubygems'
	require 'hpricot'
	require 'iconv'

	# Byte-pair histograms, one per target encoding. Every histogram has
	# 256 * 256 slots; analyze indexes them by first_byte * 256 + second_byte.
	$vector = {}
	[:utf8, :shiftjis, :iso, :eucjp].each do |encoding|
		$vector[encoding] = Array.new(65536, 0)
	end

	# Directory the script was started from; main overwrites this placeholder.
	$pwd = ""

	# Scales every histogram in $vector in place so that its Euclidean length
	# becomes 65536. All entries end up as Floats.
	#
	# A histogram without any counts has length zero; dividing by that would
	# turn every entry into NaN, and formatting NaN with "%d" (as print_vector
	# does) raises FloatDomainError. Such a histogram is kept as all 0.0.
	def normalize
		norm = 65536
		$vector.each_value do |counts|
			sqsum = counts.inject(0) { |acc, e| acc + e * e }
			factor = Math.sqrt(sqsum)
			if factor.zero?
				counts.map! { 0.0 }
			else
				counts.map! { |e| e.to_f * norm / factor }
			end
		end
	end

	# Changes back to the directory stored in $pwd. Then, for every histogram
	# in $vector, prints the sum of its entries with p and writes the histogram
	# to "vector_<key>.txt" as 256 lines of 256 tab-separated integers.
	def print_vector
		Dir.chdir($pwd)
		$vector.each do |key, counts|
			p counts.inject(0) { |total, e| total + e }
			File.open("vector_" + key.to_s + '.txt', 'w') do |out|
				256.times do |row_index|
					# One output line per leading byte value.
					cells = counts[row_index * 256, 256].map { |e| "%d" % e }
					out.puts(cells.join("\t"))
				end
			end
		end
	end

	# Number of files handled so far; read_aozora increments it and prints it
	# as part of its progress line.
	$filecount = 0
	# Counts every pair of adjacent bytes of text[0] in the histogram
	# $vector[enc]. A pair (first, second) increments slot first * 256 + second.
	#
	# enc  - key into $vector
	# text - array whose first element is the string to scan
	def analyze(enc, text)
		bytes = text[0].unpack('C*')
		(0..(bytes.length - 2)).each do |pos|
			slot = bytes[pos] * 256 + bytes[pos + 1]
			$vector[enc][slot] += 1
		end
	end

	# Processes one HTML file: bumps $filecount, prints a progress line to
	# stderr, replaces each <ruby>...</ruby> group by the contents of its <rb>
	# element, extracts the plain text with Hpricot and feeds that text to
	# analyze once per target encoding.
	#
	# The text is converted with Iconv from SHIFT_JIS, so the input file is
	# presumably Shift_JIS encoded -- confirm against the data set.
	#
	# file - path of the HTML file to read
	#
	# A conversion that raises is skipped (best effort), but the skip is
	# reported on stderr so that a histogram built from fewer files than
	# the others does not go unnoticed.
	def read_aozora(file)
		$filecount = $filecount + 1
		name = File.basename(file)
		$stderr.puts $filecount.to_s + ": " + name
		# File.read, unlike IO.read, never interprets a leading "|" in the
		# file name as a command to run. The lazy ".*?" runs skip whatever
		# markup sits between <ruby>, <rb>...</rb> and </ruby>.
		html = File.read(file).gsub(/<ruby>.*?<rb>(.+?)<\/rb>.*?<\/ruby>/,'\1')
		text = Hpricot(html).inner_text
		# [histogram key, iconv target name], in the original processing order.
		targets = [
			[:shiftjis, "SHIFT_JIS"],
			[:utf8, "UTF-8"],
			[:iso, "ISO-2022-JP"],
			[:eucjp, "EUC-JP"]
		]
		targets.each do |enc, charset|
			begin
				analyze(enc, Iconv.iconv(charset, "SHIFT_JIS", text))
			rescue StandardError => e
				$stderr.puts "  skipped " + enc.to_s + " for " + name + ": " + e.class.to_s
			end
		end
	end

	# Entry point: stores the start directory in $pwd, changes into
	# "database", passes every *.html file found there to read_aozora, then
	# normalizes the histograms and writes them out.
	def main
		$pwd = Dir.pwd
		Dir.chdir("database")
		html_files = Dir.glob("*.html")
		html_files.each do |path|
			read_aozora(path)
		end
		normalize
		print_vector
	end

	# Run the whole analysis as soon as this file is executed.
	main