Created
February 5, 2019 12:38
-
-
Save umaz/ec1b7cd4008943823541888530cafa47 to your computer and use it in GitHub Desktop.
Rubyによる形態素解析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'natto'
require 'csv'

# Shared MeCab tagger (via the natto binding), reading the
# mecab-ipadic-neologd dictionary from the path below.
#
# Output layout requested from MeCab:
#   node_format - one "<feature 6>\t<feature 0>\n" line per node. The original
#                 note describes feature 6 as the base form; feature 0 is what
#                 this script later writes under the "品詞" (part of speech)
#                 CSV header. NOTE(review): the original note called feature 0
#                 "the morpheme" - confirm against the dictionary's feature list.
#   unk_format  - unknown words are emitted as "%M\t未知語\n" (%M presumably
#                 being the surface form - see MeCab's output-format docs).
#   eos_format  - empty, so no end-of-sentence marker line is emitted.
$nm = Natto::MeCab.new(
  dicdir: "/usr/lib/mecab/dic/mecab-ipadic-neologd",
  node_format: '%f[6]\t%f[0]\n',
  unk_format: '%M\t未知語\n',
  eos_format: ""
)
# Runs +txt+ through the global tagger ($nm) and returns the result as one
# flat array: [word, pos, word, pos, ...].
#
# Each non-empty output line is split on its FIRST tab only and always
# contributes exactly two elements. This keeps the word/pos pairing aligned
# even when the second field is empty or missing (a plain split would drop a
# trailing empty field and shift every later pair by one).
#
# @param txt [String] text to analyse
# @return [Array<String>] flat, even-length array of word/pos pairs
def mecab(txt)
  $nm.parse(txt).split("\n").reject(&:empty?).flat_map do |entry|
    word, pos = entry.split("\t", 2)
    [word, pos.to_s] # pos is nil when the line has no tab at all
  end
end
# Returns the CSV output path for +txt_path+: the same name with its trailing
# ".txt" replaced by ".csv". The pattern is escaped and anchored to the end,
# so names that merely contain "txt" (e.g. "atxt_notes.txt") are left intact.
def csv_path_for(txt_path)
  txt_path.sub(/\.txt\z/, "") + ".csv"
end

# Converts a flat [word, pos, word, pos, ...] array into CSV rows of
# [word, occurrence count, pos], ordered by count (highest first). Words with
# equal counts keep their order of first appearance. When a word appears with
# several pos values, the last one seen is reported. A dangling final element
# (odd-length input) is counted with a nil pos instead of raising.
def summarize_morphemes(tokens)
  counts = Hash.new(0)
  pos_of = {}
  tokens.each_slice(2) do |word, pos|
    counts[word] += 1
    pos_of[word] = pos
  end
  counts.sort_by.with_index { |(_word, n), i| [-n, i] }
        .map { |word, n| [word, n, pos_of[word]] }
end

# Process every .txt file in the current directory: analyse it line by line
# and write the word frequencies to a CSV file of the same name.
Dir.glob("*.txt").each do |file|
  tokens = File.foreach(file).flat_map { |line| mecab(line) }
  CSV.open(csv_path_for(file), "wb") do |csv|
    csv << ["文字列", "出現回数", "品詞"]
    summarize_morphemes(tokens).each { |row| csv << row }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment