# jhamon/text_analyzer.rb

## text_analyzer.rb
# Finds the most frequent digram (pair of adjacent words) in a text.
#
# The text is normalized once at construction time: it is lowercased and
# every character other than ASCII letters and whitespace is removed.
class TextAnalyzer
  # @param raw_text [String] text to analyze; the string passed in is
  #   left unmodified, so frozen strings are accepted
  def initialize(raw_text)
    @text = raw_text
    normalize
    @words = tokenize
  end

  # Splits the normalized text on runs of whitespace.
  #
  # @return [Array<String>] the words, in their original order
  def tokenize
    @text.split(" ")
  end

  # Replaces @text with a lowercased copy stripped of everything except
  # ASCII letters and whitespace.
  #
  # Assumption: ignore capitalization and punctuation
  # when analyzing word pair frequency.
  #
  # @return [String] the normalized text
  def normalize
    # Non-destructive downcase: the caller's string must not be mutated.
    @text = @text.downcase.gsub(/[^a-z\s]/, "")
  end

  # Counts every pair of adjacent words and stores the result in
  # @digram_count, keyed by "first second".
  #
  # @return [Hash{String => Integer}] occurrences per digram; empty when
  #   the text has fewer than two words
  def count_digrams
    @digram_count = Hash.new(0)
    # each_cons(2) yields only complete pairs, so the final word is never
    # paired with a missing successor.
    @words.each_cons(2) do |first, second|
      @digram_count["#{first} #{second}"] += 1
    end
    @digram_count
  end

  # Returns the digram with the highest count, counting digrams first if
  # that has not been done yet. When several digrams share the highest
  # count, one of them is returned.
  #
  # @return [String, nil] the winning digram, or nil when the text has
  #   fewer than two words
  def most_common_digram
    count_digrams unless @digram_count
    # max_by returns nil for an empty hash; destructuring nil leaves
    # digram as nil instead of raising.
    digram, _count = @digram_count.max_by { |_digram, count| count }
    digram
  end
end

# Command-line entry point: prints the most common digram of the file named
# by the first argument. Runs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  path = ARGV[0]
  # Without this guard a missing argument surfaces as an opaque TypeError.
  abort("Usage: ruby #{$PROGRAM_NAME} FILE") unless path

  text_analyzer = TextAnalyzer.new(File.read(path))
  puts text_analyzer.most_common_digram
end
	# Finds the most frequent digram (pair of adjacent words) in a text.
	#
	# The text is normalized once at construction time: it is lowercased and
	# every character other than ASCII letters and whitespace is removed.
	class TextAnalyzer
	  # @param raw_text [String] text to analyze; the string passed in is
	  #   left unmodified, so frozen strings are accepted
	  def initialize(raw_text)
	    @text = raw_text
	    normalize
	    @words = tokenize
	  end

	  # Splits the normalized text on runs of whitespace.
	  #
	  # @return [Array<String>] the words, in their original order
	  def tokenize
	    @text.split(" ")
	  end

	  # Replaces @text with a lowercased copy stripped of everything except
	  # ASCII letters and whitespace.
	  #
	  # Assumption: ignore capitalization and punctuation
	  # when analyzing word pair frequency.
	  #
	  # @return [String] the normalized text
	  def normalize
	    # Non-destructive downcase: the caller's string must not be mutated.
	    @text = @text.downcase.gsub(/[^a-z\s]/, "")
	  end

	  # Counts every pair of adjacent words and stores the result in
	  # @digram_count, keyed by "first second".
	  #
	  # @return [Hash{String => Integer}] occurrences per digram; empty when
	  #   the text has fewer than two words
	  def count_digrams
	    @digram_count = Hash.new(0)
	    # each_cons(2) yields only complete pairs, so the final word is never
	    # paired with a missing successor.
	    @words.each_cons(2) do |first, second|
	      @digram_count["#{first} #{second}"] += 1
	    end
	    @digram_count
	  end

	  # Returns the digram with the highest count, counting digrams first if
	  # that has not been done yet. When several digrams share the highest
	  # count, one of them is returned.
	  #
	  # @return [String, nil] the winning digram, or nil when the text has
	  #   fewer than two words
	  def most_common_digram
	    count_digrams unless @digram_count
	    # max_by returns nil for an empty hash; destructuring nil leaves
	    # digram as nil instead of raising.
	    digram, _count = @digram_count.max_by { |_digram, count| count }
	    digram
	  end
	end

	# Command-line entry point: prints the most common digram of the file named
	# by the first argument. Runs only when this file is executed directly.
	if $PROGRAM_NAME == __FILE__
	  path = ARGV[0]
	  # Without this guard a missing argument surfaces as an opaque TypeError.
	  abort("Usage: ruby #{$PROGRAM_NAME} FILE") unless path

	  text_analyzer = TextAnalyzer.new(File.read(path))
	  puts text_analyzer.most_common_digram
	end