Skip to content

Instantly share code, notes, and snippets.

@bradland
Created Sep 16, 2011
Embed
What would you like to do?
Text analyzer
#!/usr/bin/env ruby
# Text Analyzer
# Author: Zach
# Purpose: Use Ruby to analyze text files
# and generate statistics about their contents.
require 'cgi'
# Words excluded from keyword extraction, read one per line from
# conf/stop_words.txt (path is relative to the current working directory).
# Frozen so the shared list cannot be mutated by accident at runtime.
STOPWORDS = File.readlines('conf/stop_words.txt').map { |line| line.chomp }.freeze
# Character statistics for a piece of text.
#
# text - String to measure.
#
# Returns a Hash with :tot_chars (total length of the text) and
# :tot_chars_no_space (length once every whitespace run is removed).
def count_chars(text)
  without_whitespace = text.gsub(/\s+/, '')
  total = text.length
  { :tot_chars => total, :tot_chars_no_space => without_whitespace.length }
end
# Count the words, sentences, paragraphs and lines; return a hash with stats.
#
# text - String contents of the document being analyzed.
#
# Returns a Hash with :word_count, :sent_count, :para_count and :line_count
# (Integers) plus :keywords, an Array of the whitespace-separated words that
# are not listed in STOPWORDS. The stop-word comparison is exact, so case and
# attached punctuation matter.
def count_chunks(text)
  # Split once with the whitespace-aware default so the word count and the
  # keyword list agree (split(/\s+/) would yield a leading "" when the text
  # starts with whitespace).
  words = text.split
  # Ignore whitespace-only fragments so the newline after the final
  # terminator, or a run such as "...", is not counted as a sentence.
  sentences = text.split(/\.|\?|!/).reject { |fragment| fragment.strip.empty? }
  # Any run of two or more newlines is a single paragraph break.
  paragraphs = text.split(/\n{2,}/).reject { |chunk| chunk.strip.empty? }
  {
    :word_count => words.length,
    :sent_count => sentences.length,
    :para_count => paragraphs.length,
    # Count lines; text.size would be the number of characters.
    :line_count => text.lines.count,
    :keywords => words.reject { |word| STOPWORDS.include?(word) }
  }
end
# Summarize how much of the text consists of keywords and which keywords
# occur most often.
#
# word_count - Integer total number of words in the text.
# keywords   - Array of words considered meaningful.
#
# Returns a Hash with :pgw (keywords as a whole-number percentage of
# word_count, 0 when word_count is zero) and :most_common_words (up to ten
# keywords not in STOPWORDS, most frequent first; ties keep the order in
# which the words first appear).
def useful_words(word_count, keywords)
  total = word_count.to_f
  # An empty text gives 0.0 / 0.0 (NaN), and NaN.to_i raises
  # FloatDomainError, so report 0% instead of dividing.
  percentage = total.zero? ? 0 : ((keywords.length.to_f / total) * 100).to_i

  groups = (keywords - STOPWORDS).group_by { |word| word }
  # sort_by is not stable, so break ties on first-appearance position to keep
  # the ranking deterministic.
  ranked = groups.each_with_index.sort_by { |(_word, hits), position| [-hits.length, position] }
  top_words = ranked.first(10).map { |(word, _hits), _position| word }

  { :pgw => percentage, :most_common_words => top_words }
end
# Select candidate "ideal" sentences from a list of sentences.
#
# The sentences are ordered by length, a window is taken that starts at index
# (count / 7) and spans (count / 7) + 1 entries, and only the sentences in
# that window containing " is" or " are" followed by a non-word character
# are kept.
#
# sentences - Array of sentence Strings.
#
# Returns a Hash with the single key :ideal_sentences (Array of Strings).
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  offset = by_length.length / 7
  window = by_length[offset, offset + 1]
  { :ideal_sentences => window.grep(/\sis\W|\sare\W/) }
end
# Analyze every text file matching files/*.txt (relative to the current
# working directory).
#
# Returns an Array with one entry per file; each entry is
# [char_stats, chunk_stats, word_stats, best_sentences], i.e. the hashes
# returned by count_chars, count_chunks, useful_words and ideal_sentences.
def main
  Dir.glob("files/*.txt").map do |path|
    contents = File.read(path)

    # Collapse whitespace runs to single spaces before splitting the text
    # into sentences on ".", "?" and "!".
    flattened = contents.gsub(/\s+/, ' ').strip
    sentence_list = flattened.split(/\.|\?|!/)

    chars = count_chars(contents)
    chunks = count_chunks(contents)
    words = useful_words(chunks[:word_count], chunks[:keywords])
    best = ideal_sentences(sentence_list)

    [chars, chunks, words, best]
  end
end
# Run the analysis over all input files and send the report as a CGI response.
stats = main
# The stats contain text taken from the input files, so HTML-escape the
# inspected structure before interpolating it into the page; otherwise "<",
# ">" or "&" in a file would be interpreted as markup by the browser.
escaped_stats = CGI.escapeHTML(stats.inspect)
output = <<HTML
<html>
<body>
<pre>
Inaugural Speech Statistics
The total number of lines in the inaugural speech is
The total number of characters in the first part is
The total number of characters less whitespace is
The total number of words is
The total number of sentences is
The total number of paragraphs is
The average sentences per paragraph is
The average words per sentence is wordcount/sentcount
% of all words in the text are non-fluff words .
The ideal sentences are:
The top 10 most common words are:
#{escaped_stats}
</pre>
</body>
</html>
HTML
# cgi.out writes the response headers followed by the string returned from
# the block.
cgi = CGI.new
cgi.out do
  output
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment