Created
November 21, 2013 22:30
-
-
Save jhamon/7590964 to your computer and use it in GitHub Desktop.
To run from the command line, pass one filename as an argument, for example: `ruby text_analyzer.rb text_file_to_analyze.txt`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Finds the most frequent digram (pair of adjacent words) in a text.
#
# Capitalization and punctuation are ignored: the text is lowercased and
# every character other than a-z and whitespace is removed before the
# text is split into words.
class TextAnalyzer
  # @param raw_text [String] the text to analyze; the string passed in is
  #   left unmodified (frozen strings are accepted)
  def initialize(raw_text)
    @text = raw_text
    normalize
    @words = tokenize
  end

  # Splits the current text on runs of whitespace.
  #
  # @return [Array<String>] the words in order of appearance
  def tokenize
    @text.split(" ")
  end

  # Lowercases the text and strips everything except letters and whitespace.
  #
  # Assumption: ignore capitalization and punctuation when analyzing
  # word pair frequency.
  #
  # @return [String] the normalized text
  def normalize
    # Use the non-mutating downcase so the caller's string is never altered.
    # After downcasing, only a-z can remain as letters, so the character
    # class does not need A-Z.
    @text = @text.downcase.gsub(/[^a-z\s]/, "")
  end

  # Counts every pair of adjacent words and stores the result in
  # @digram_count, keyed by "first second".
  #
  # @return [Hash{String => Integer}] occurrences of each digram
  def count_digrams
    @digram_count = Hash.new(0)
    # each_cons(2) yields only real adjacent pairs, so the last word is never
    # paired with a nonexistent successor.
    @words.each_cons(2) do |first, second|
      @digram_count["#{first} #{second}"] += 1
    end
    @digram_count
  end

  # Returns the digram with the highest count. When several digrams share
  # the highest count, the one that appeared first in the text wins.
  #
  # @return [String, nil] the most common digram, or nil when the text has
  #   fewer than two words
  def most_common_digram
    count_digrams unless @digram_count
    # max_by returns nil for an empty hash; destructuring nil leaves
    # digram as nil instead of raising.
    digram, _count = @digram_count.max_by { |_digram, count| count }
    digram
  end
end
# Command-line entry point: reads the file named by the first argument and
# prints its most common digram. Runs only when this file is executed
# directly, not when it is required by another file.
if $PROGRAM_NAME == __FILE__
  # Without a filename, File.read(nil) would fail with an opaque TypeError;
  # print a usage message and exit with a non-zero status instead.
  abort "Usage: ruby #{File.basename(__FILE__)} text_file_to_analyze.txt" if ARGV.empty?

  raw_text = File.read(ARGV[0])
  text_analyzer = TextAnalyzer.new(raw_text)
  puts text_analyzer.most_common_digram
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment