Skip to content

Instantly share code, notes, and snippets.

@yokozawa
Created May 8, 2016 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yokozawa/f4ee6d1aa92c23bd9f1fa8f5b9295076 to your computer and use it in GitHub Desktop.
Save yokozawa/f4ee6d1aa92c23bd9f1fa8f5b9295076 to your computer and use it in GitHub Desktop.
require 'natto'
require 'csv'
# The input CSV path is required as the first command-line argument.
# Without it, print the usage line to standard error (plain text, not the
# quoted form `p` would produce) and stop with a non-zero exit status so
# callers can detect the misuse.
if ARGV.empty?
  warn 'usage: ruby mecab.rb [filename]'
  exit 1
end
# Tokenize the second column of every row of the input CSV with MeCab.
# words_by_line maps the zero-based row index to the list of token surfaces
# produced for that row.
words_by_line = {}
CSV.open(ARGV[0], 'r') do |rows|
  mecab = Natto::MeCab.new
  rows.each_with_index do |row, index|
    # A row with a missing or empty second column yields nil here; convert it
    # to an empty string instead of crashing on nil.gsub.
    # Then strip line breaks, form feeds and the punctuation marks 、 and 。.
    body = row[1].to_s.gsub(/(\r\n|\r|\n|\f|、|。)/, '')
    words = []
    mecab.parse(body) do |node|
      # Parentheses are required: `words << a ? b : c` parses as
      # `(words << a) ? b : c`, which appends `a` unconditionally and never
      # uses the "-" placeholder for a nil surface.
      words << (node.surface || '-')
    end
    words_by_line[index] = words
  end
end
# Count how often each word occurs on each line.
# word_counts_by_line maps the line index to an array of [word, count] pairs
# (both stored as strings), ordered by first appearance of the word.
word_counts_by_line = {}
words_by_line.each do |index, words|
  counts = Hash.new(0)
  words.each do |word|
    next if word.nil?

    # Censor BEFORE counting and without mutating the source strings.
    # Mutating a single occurrence in place and then counting by equality
    # undercounts censored words and emits duplicate "ピー" entries.
    censored = word.gsub(/(暴言1|暴言2)/, 'ピー')
    counts[censored] += 1
  end
  # Hash preserves insertion order, so pairs stay in first-appearance order.
  word_counts_by_line[index] = counts.map { |word, count| [word, count.to_s] }
end
# Write one CSV row per input line to output.csv, with that line's
# [word, count] pairs ordered from the highest count to the lowest.
# NOTE(review): every field appended to a row is itself a two-element array,
# not a plain string -- confirm this cell layout is what consumers expect.
CSV.open('output.csv', 'wb') do |out|
  word_counts_by_line.each_with_index do |_entry, row_number|
    ascending = word_counts_by_line[row_number].sort_by { |pair| pair[1].to_i }
    out << ascending.reverse
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment