@stephenmac7
Last active September 11, 2015 00:58
Lemma Frequency
# Gem Depends: ve, docopt
# System Depends: mecab, mecab-ipadic-utf-8
require 'csv'
require 've'
require 'docopt'

doc = <<DOCOPT
Lemma Frequency Report.

Usage:
  #{__FILE__} [options] FILE ...
  #{__FILE__} -h | --help
  #{__FILE__} --version

Options:
  -h --help      Show this screen.
  -m --morpheme  Target morphemes, instead of lexemes.
  --version      Show version.

DOCOPT
def main(opt)
  # Input from args, UTF-8 required
  contents = ''
  opt['FILE'].each do |f|
    f = '/dev/stdin' if f == '-'
    contents << File.read(f)
  end

  # Pre-processing. We need to give mecab bite-sized pieces, because pipes
  # can't handle big sizes and ve uses pipes.
  lines = remove_rubies(contents).split

  # Process the text and count lemmas; this might take a while
  freq = calculate_frequency(lines, opt['--morpheme'])

  # Show count
  show_count(freq)
end
def calculate_frequency(lines, morpheme)
  # Creates a hash with the frequency for all the lines
  lines.reduce(Hash.new(0)) do |freq, line|
    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
    get_frequency_hash(ve_line, morpheme, freq)
  end
end
def remove_rubies(text)
  # For Aozora Bunko text as input, rubies need to be removed.
  # Non-greedy match, so several rubies on one line don't swallow the text
  # between them.
  text.gsub(/《.*?》/, '')
end
# For morpheme operations, it would be much faster to use mecab directly
def get_frequency_hash(words, morpheme, freq = Hash.new(0))
  words.each do |word|
    unless word.lemma == '*' # if the lemma could not be found, don't count it
      if morpheme
        word.tokens.each do |token|
          index = [token[:lemma], token[:pos]]
          freq[index] += 1
        end
      else
        index = [word.lemma, word.part_of_speech.name]
        freq[index] += 1
      end
    end
  end
  freq
end
def filter_blacklisted(words)
  pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
  words.reject { |word| pos_blacklist.include?(word.part_of_speech) }
end
def show_count(counts)
  counts.sort_by { |_, count| count }.reverse.each do |ind, count|
    print [count, ind.first, ind.last].to_csv
  end
end
if __FILE__ == $0
  begin
    main Docopt::docopt(doc, version: '0.0.1')
  rescue Docopt::Exit => e
    puts e.message
  end
end

fasiha commented Jun 26, 2015

Awesome! I put 坊ちゃん through it. Well, I'm trying to; it's been doing something for a few minutes. CPU and memory are all nominal, and MeCab itself can chew through the entire file in about a quarter-second, so why do you think freq.rb takes so much longer? Spawning a process for each line? Slow Ve logic? Expensive histograms?
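One way to narrow that down might be a quick timing sketch along these lines (a sketch only, assuming the ve gem and a mecab binary on PATH; the sentence and iteration count are just placeholders):

    require 'benchmark'
    require 've'

    line = '吾輩は猫である。' # placeholder sentence

    Benchmark.bm(6) do |x|
      # ve's full parse, including its own call out to mecab
      x.report('ve')    { 100.times { Ve.in(:ja).words(line) } }
      # raw mecab, spawning a fresh process each time, to gauge spawn overhead
      x.report('mecab') { 100.times { IO.popen('mecab', 'r+') { |io| io.puts(line); io.close_write; io.read } } }
    end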

Besides speed, I'd love to be able to store not just the lemmas but the part of speech as well, at least until we establish how many distinct parts of speech the same lemma can have. I'd also like histograms from raw MeCab as well as from Ve: for some applications it's better to have morpheme-level granularity, even if many of the morphemes are, say, used in conjugations.

On a larger scale, if you wanted to run this on each file in a whole directory structure, in order to later generate composite histograms of subsets of files, what would be the ideal way to store the per-file data? Would it still be TSV, as it is now? Would there be any benefit in storing the results in a database, even something light like SQLite? I ask because I could generate a histogram report for each file in a directory tree, and then load subsets of them into node or Python for combining—if I wanted to do that many times, with different subsets of files, I wonder if a more heavyweight solution is appropriate. Well, one way to find out: let's go download a bunch of text files and measure some histograms!


fasiha commented Jun 26, 2015

I should have noted that Ve includes the MeCab results for each Ve lemma in a tokens member, so we do have easy access to prettified MeCab output.
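For example, something like this should print both the Ve lemma and the underlying MeCab fields for each word (a sketch, using only the accessors the script above already relies on: lemma, part_of_speech, and the token hashes' :lemma and :pos keys):

    require 've'

    Ve.in(:ja).words('何を言っているのですか。').each do |word|
      puts "#{word.lemma} (#{word.part_of_speech.name})"
      word.tokens.each do |token|
        # each token hash carries the raw MeCab output for that morpheme
        puts "  #{token[:lemma]} / #{token[:pos]}"
      end
    end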

stephenmac7 (Author) commented

So, I did some profiling, and ve is the bottleneck for speed. However, the issue you ran into was that the file was larger than what a pipe can handle (ve uses a pipe to feed text to mecab). For the moment, I've fixed the issue by feeding mecab line by line, which doesn't seem to add much overhead. I've also added an option to use the mecab lemmas and parts of speech.

Also, it no longer combines identical lemmas that have different parts of speech. This did in fact change some of the counts; for example, with your file, the の lemma lost a few hundred from its count.

The output is now CSV, both for easier processing and because, once part of speech was added, I realized it would be impractical to line up double-width Unicode columns on a terminal screen. Maybe we can add some sort of pretty printing later.

So, you should be able to run the program on that book in a few seconds (up to 10, I would say). Any longer, and something has gone wrong.

Also, as you may notice in the comment in the file, if we're only counting morphemes, it would be better to just directly access mecab and completely skip ve. If that's going to be a commonly used feature, I'll make sure that happens. However, I would like to ask the author of ve if he can figure out how to fix the piping issue first.
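For reference, a rough sketch of that direct route might look like the following. It assumes the mecab binary is on PATH and ipadic's default output, where each line is the surface form, a tab, and a comma-separated feature list whose seventh field is the lemma; writing the text to a temporary file also sidesteps the pipe-size problem entirely.

    require 'tempfile'

    # Count [lemma, pos] pairs straight from mecab, skipping ve.
    def mecab_frequency(text, freq = Hash.new(0))
      Tempfile.create('freq-mecab') do |tmp|
        tmp.write(text)
        tmp.flush
        IO.popen(['mecab', tmp.path]) do |io|
          io.each_line do |out|
            out = out.chomp
            next if out == 'EOS'
            _surface, feature = out.split("\t", 2)
            next unless feature
            fields = feature.split(',')
            pos    = fields[0]
            lemma  = fields[6]
            freq[[lemma, pos]] += 1 unless lemma.nil? || lemma == '*'
          end
        end
      end
      freq
    end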

To store the data, redirect the output and save it as CSV; just about any programming language has easy facilities for reading that untyped data. For now, this should be fine. At the moment, I don't see any significant benefit to storing the results in a database. If you have a good reason, I'd be glad to implement it.
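For example, merging several per-file reports into one composite histogram could be just a few lines (a sketch; the file names are hypothetical, and the rows are the count,lemma,pos columns this script prints):

    require 'csv'

    composite = Hash.new(0)
    %w[botchan.csv kokoro.csv].each do |path| # hypothetical report files
      CSV.foreach(path) do |count, lemma, pos|
        composite[[lemma, pos]] += count.to_i
      end
    end

    composite.sort_by { |_, count| -count }.each do |(lemma, pos), count|
      print [count, lemma, pos].to_csv
    end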

Concerning overall speed, if your data is going to be huge, we might need to get ve optimized (and fixed), or rewrite it in a more efficient language or a more efficient way. However, if the current speed is fine, then I would suggest against redoing work that has already been done :).
