-
-
Save AndrewVos/1227784 to your computer and use it in GitHub Desktop.
Refactored TextAnalyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# Text Analyzer
# Author: Zach
# Purpose: Use Ruby to analyze text files
# and generate statistical information about them.
require 'cgi'
# Stop words ("fluff" words) that are excluded from the keyword statistics,
# read line by line from conf/stop_words.txt (relative to the working
# directory) with the line endings removed.
# File.readlines is used because File.read returns a single String, and String
# has no #map on Ruby 1.9 and later.
STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp).freeze
# Count the characters in text, whitespace included.
# Returns the count as an Integer.
def count_characters(text)
  text.length
end
# Count the characters in text, ignoring every whitespace character
# (spaces, tabs, newlines, ...).
# Returns the count as an Integer.
def count_characters_less_spaces(text)
  text.gsub(/\s+/, '').length
end
# Count the whitespace-separated words in text.
# Leading, trailing and repeated whitespace never produce extra words.
# Returns the count as an Integer.
def count_words(text)
  text.split.length
end
# Count the sentences in text.
# A sentence is any non-blank run of text delimited by '.', '?' or '!'.
# Consecutive terminators ("...", "?!") count as one delimiter, and blank
# fragments (e.g. whitespace after the final full stop) are not counted.
# Abbreviations and decimal numbers are still split at their dots.
# Returns the count as an Integer.
def count_sentences(text)
  text.split(/[.?!]+/).count { |fragment| fragment =~ /\S/ }
end
# Count the paragraphs in text.
# Paragraphs are separated by one or more blank lines. A blank line may
# contain whitespace (including the "\r" of a CRLF line ending), and
# leading, trailing or repeated blank lines never count as paragraphs.
# Returns the count as an Integer.
def count_paragraphs(text)
  text.split(/\n\s*\n/).count { |paragraph| paragraph =~ /\S/ }
end
# Count the lines in text by splitting on "\n".
# Blank lines inside the text are counted; trailing blank lines are not,
# because String#split drops trailing empty fields.
# Returns the count as an Integer.
def count_lines(text)
  text.split("\n").length
end
# Return the words of text that are not listed in STOPWORDS, in their
# original order (duplicates are kept).
# The comparison is exact, so case and attached punctuation matter.
# Splitting without a pattern ignores leading whitespace, so text that starts
# with whitespace does not yield an empty "word".
# Returns an Array of Strings.
def non_stopwords(text)
  text.split.reject { |word| STOPWORDS.include?(word) }
end
# Return the (up to) ten most frequent words of text that are not in
# STOPWORDS, most frequent first, joined with '-- '.
# Words that occur equally often are ordered alphabetically so the result is
# deterministic.
# Returns a String (empty when there are no such words).
def most_common_words(text)
  keywords = text.split - STOPWORDS
  ranked = keywords.group_by { |word| word }.sort_by { |word, hits| [-hits.length, word] }
  ranked.first(10).map(&:first).join('-- ')
  # TODO: a keyword percentage was sketched here but never finished:
  # ((keywords.length.to_f / word_count.to_f) * 100).to_i
end
# Pick "ideal" sentences from sentences (an Array of Strings).
# The sentences are sorted by length; starting at index (count / 7) a window
# of (count / 7) + 1 sentences is taken, and only those containing " is" or
# " are" followed by a non-word character are kept.
# NOTE(review): the one-seventh window looks like a tuning heuristic; confirm
# the intended range with the author.
# Returns an Array of Strings (possibly empty).
def ideal_sentences(sentences)
  by_length = sentences.sort_by { |sentence| sentence.length }
  window = by_length.length / 7
  # Array#slice takes (start, length), not (start, end).
  candidates = by_length.slice(window, window + 1)
  candidates.select { |sentence| sentence =~ /\sis\W|\sare\W/ }
end
# Collect the :text entry of every stats hash.
# Returns a Hash of the form { :names => [...] }, in the order of stats.
def file_names(stats)
  { :names => stats.map { |stat| stat[:text] } }
end
# Format numerator / denominator with two decimals.
# Returns 'n/a' when a value is missing or the denominator is zero.
def format_average(numerator, denominator)
  return 'n/a' unless numerator && denominator && denominator != 0

  format('%.2f', numerator.to_f / denominator)
end

# Format part / whole as a whole-number percentage.
# Returns 'n/a' when a value is missing or whole is zero.
def format_percentage(part, whole)
  return 'n/a' unless part && whole && whole != 0

  ((part.to_f / whole) * 100).to_i.to_s
end

# Build one plain-text report per entry of stats.
# Each entry is a Hash; the keys read here are :text (title), :chars,
# :chars2, :wrd_c (word count), :sent (sentence count), :para (paragraph
# count), :kword (Array of non-stop words), :bsent (Array of ideal sentences)
# and :words (most common words, already joined into a String).
# Averages use floating-point division and are reported as 'n/a' when they
# cannot be computed (missing value or division by zero).
# Returns an Array of Strings, one report per entry, each ending in "\n".
def build_reports(stats)
  stats.map do |stat|
    word_count = stat[:wrd_c]
    sentence_count = stat[:sent]
    paragraph_count = stat[:para]
    keyword_count = stat[:kword] && stat[:kword].length

    [
      "#{stat[:text]} Inaugural Speech - Analysis Results",
      "Total number of characters is: #{stat[:chars]}.",
      "Total number of characters less whitespace is: #{stat[:chars2]}",
      "Total number of words is: #{word_count || 'n/a'}.",
      "Total number of sentences is: #{sentence_count}",
      "Total number of paragraphs is: #{paragraph_count}",
      "The average sentences per paragraph is: #{format_average(sentence_count, paragraph_count)}",
      "The average words per sentence is: #{format_average(word_count, sentence_count)}",
      "#{format_percentage(keyword_count, word_count)} % of all words in the text are non-fluff words.",
      "The ideal sentences are: #{Array(stat[:bsent]).join('-- ')}",
      "The top 10 most common words are: #{stat[:words]}"
    ].join("\n") + "\n"
  end
end
# Analyze every .txt file in the files/ directory (relative to the working
# directory), in sorted path order.
# Returns an Array with one Hash per file, holding:
#   :chars  - character count            :chars2 - characters less whitespace
#   :wrd_c  - word count                 :sent   - sentence count
#   :para   - paragraph count            :lines  - line count
#   :kword  - Array of non-stop words    :words  - most common words (String)
#   :bsent  - Array of ideal sentences   :text   - file name without ".txt"
def collect_stats
  Dir.glob('files/*.txt').sort.map do |text_file|
    text = File.read(text_file)
    # Collapse all whitespace so each sentence is a single line of text.
    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    {
      :chars => count_characters(text),
      :chars2 => count_characters_less_spaces(text),
      # Stored under its own key; it used to share :words with the
      # most-common-words entry below and was silently overwritten.
      :wrd_c => count_words(text),
      :sent => count_sentences(text),
      :para => count_paragraphs(text),
      :lines => count_lines(text),
      :kword => non_stopwords(text),
      :words => most_common_words(text),
      :bsent => ideal_sentences(sentences),
      # Directory and extension are stripped from the file name, not from
      # the file's content.
      :text => File.basename(text_file, '.txt')
    }
  end
end
# Collect the statistics for every text file.
stats = collect_stats

# File names of the analyzed texts.
# NOTE(review): names is not used anywhere below; kept in case later code
# relies on it.
names = file_names(stats)

# Build the plain-text reports from the collected statistics.
reports = build_reports(stats)

header = <<HTML
<html>
<body>
<pre>
CS 132A Lab3
Inaugural Speech Analysis
HTML

# Tags are closed in the reverse order of the header so the page is well formed.
footer = <<HTML
</pre>
</body>
</html>
HTML

# reports is an Array: join it explicitly (interpolating the Array itself
# prints its inspect form on Ruby 1.9+) and escape it, because the text comes
# from arbitrary files and is embedded in HTML.
report_text = CGI.escapeHTML(reports.join("\n"))

output = <<OUT
#{header}
#{report_text}
#{footer}
OUT

# Send the page as the CGI response.
cgi = CGI.new
cgi.out do
  output
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment