Text analyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Text Analyzer
# Author: Zach
# Purpose: Analyze text files with Ruby and report
# statistics about their contents.
require 'cgi' | |
# Words ignored when picking keywords: one stop word per line of
# conf/stop_words.txt (the path is relative to the working directory).
# Kept as an Array because it is used with both include? and Array#-;
# frozen so the shared constant cannot be mutated by accident.
STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp).freeze
# Tally the characters in +text+.
#
# Returns a hash with:
#   :tot_chars          - length of the whole string
#   :tot_chars_no_space - length once every run of whitespace is removed
def count_chars(text)
  without_whitespace = text.gsub(/\s+/, '')
  stats = {}
  stats[:tot_chars] = text.length
  stats[:tot_chars_no_space] = without_whitespace.length
  stats
end
# Count the words, sentences, paragraphs and lines; return a hash with stats.
#
# text - the String contents of one file.
#
# Returns a hash with:
#   :word_count - number of whitespace-separated words
#   :sent_count - number of pieces produced by splitting on ".", "?" or "!"
#   :para_count - number of pieces produced by splitting on "\n\n"
#   :line_count - number of lines
#   :keywords   - the words that are not listed in STOPWORDS
def count_chunks(text)
  # Split once and reuse. A bare #split ignores leading whitespace, so no
  # empty "word" can leak into the keyword list.
  words = text.split
  {
    :word_count => words.length,
    :sent_count => text.split(/\.|\?|!/).length,
    :para_count => text.split(/\n\n/).length,
    # String#size is the character count; count the actual lines instead.
    :line_count => text.lines.count,
    :keywords => words.reject { |word| STOPWORDS.include?(word) },
  }
end
# Summarise how "useful" the vocabulary of a text is.
#
# word_count - total number of words in the text.
# keywords   - Array of words from the text (stop words are removed here
#              again, so an unfiltered list is accepted too).
#
# Returns a hash with:
#   :pgw               - percentage of words that are keywords, truncated
#                        to an Integer (0 when word_count is zero)
#   :most_common_words - up to ten most frequent keywords, most frequent first
def useful_words(word_count, keywords)
  total = word_count.to_f
  # Dividing by 0.0 gives NaN or Infinity, and calling to_i on either raises
  # FloatDomainError, so an empty text must be handled explicitly.
  percent = total.zero? ? 0 : ((keywords.length.to_f / total) * 100).to_i

  counts = Hash.new(0)
  (keywords - STOPWORDS).each { |word| counts[word] += 1 }
  # sort_by is not stable; the index tie-breaker keeps equally frequent
  # words in order of first appearance so the result is deterministic.
  ranked = counts.each_with_index.sort_by { |(_word, hits), index| [-hits, index] }

  {
    :pgw => percent,
    :most_common_words => ranked.first(10).map { |(word, _hits), _index| word }
  }
end
# Pick out candidate "ideal" sentences.
#
# The sentences are ordered from shortest to longest, a window of
# (count / 7) + 1 sentences starting at index count / 7 is taken, and only
# those containing whitespace + "is" or "are" + a non-word character are kept.
#
# Returns a hash with the single key :ideal_sentences.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  seventh = by_length.length / 7
  window = by_length[seventh, seventh + 1]
  chosen = window.select do |candidate|
    candidate =~ /\sis\W|\sare\W/
  end
  { :ideal_sentences => chosen }
end
# Analyze every text file matching files/*.txt and collect the statistics.
#
# Returns an Array with one entry per file; each entry is
# [char_stats, chunk_stats, word_stats, best_sentences], i.e. the hashes
# returned by count_chars, count_chunks, useful_words and ideal_sentences.
def main
  # Dir.glob order can depend on the file system on older Rubies; sort so the
  # files are reported in the same order on every run.
  Dir.glob("files/*.txt").sort.map do |text_file|
    text = File.read(text_file)
    # Collapse every run of whitespace to one space before cutting the text
    # into sentences at ".", "?" or "!".
    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    # Count the characters
    char_stats = count_chars(text)
    # Count the words, sentences and paragraphs
    chunk_stats = count_chunks(text)
    # Figure out the most useful words
    word_stats = useful_words(chunk_stats[:word_count], chunk_stats[:keywords])
    # Pick the most useful sentences
    best_sentences = ideal_sentences(sentences)

    # One result row per file
    [char_stats, chunk_stats, word_stats, best_sentences]
  end
end
# Run the analysis, build the report page and send it back through CGI.
stats = main

# The stats dump contains words and sentences taken from the analyzed files,
# so it is HTML-escaped before being placed in the page.
# NOTE(review): the report lines in the template have no values interpolated
# yet; only the raw stats dump at the bottom carries data.
output = <<HTML
<html>
<body>
<pre>
Inaugural Speech Statistics
The total number of lines in the inaugural speech is
The total number of characters in the first part is
The total number of characters less whitespace is
The total number of words is
The total number of sentences is
The total number of paragraphs is
The average sentences per paragraph is
The average words per sentence is wordcount/sentcount
% of all words in the text are non-fluff words .
The ideal sentences are:
The top 10 most common words are:
#{CGI.escapeHTML(stats.inspect)}
</pre>
</body>
</html>
HTML

cgi = CGI.new
cgi.out do
  output
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment