bradland/lab3.cgi

## lab3.cgi
#!/usr/bin/env ruby
# Text Analyzer
# Author: Zach
# Purpose: Analyze the text files under files/ and report statistics about
# them (character, word, sentence and paragraph counts, keywords).

require 'cgi'

# Words excluded from the keyword statistics: one entry per line of
# conf/stop_words.txt (the path is relative to the working directory).
# Frozen so the shared list cannot be modified by accident.
STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp).freeze

# Measure the size of +text+ in characters.
#
# Returns a hash with:
#   :tot_chars          - length of the text
#   :tot_chars_no_space - length of the text once all whitespace is removed
def count_chars(text)
  without_whitespace = text.gsub(/\s+/, '')

  totals = {}
  totals[:tot_chars] = text.length
  totals[:tot_chars_no_space] = without_whitespace.length
  totals
end

# Count the words, sentences, paragraphs and lines of +text+ and collect its
# keywords; return a hash with the stats.
#
# text - String with the contents of one document.
#
# Returns a hash with:
#   :word_count - number of whitespace-separated tokens
#   :sent_count - number of non-blank fragments between runs of '.', '?', '!'
#   :para_count - number of non-blank chunks separated by blank lines
#   :line_count - number of lines
#   :keywords   - array of words, lowercased and with surrounding punctuation
#                 removed, that are not listed in STOPWORDS
def count_chunks(text)
  words = text.split

  # A run of terminators ("?!", "...") ends a single sentence, and a
  # whitespace-only leftover (e.g. the newline after the last period) is not
  # a sentence.
  sentences = text.split(/[.?!]+/).reject { |chunk| chunk.strip.empty? }

  # One or more blank lines separate paragraphs; \s also covers the "\r" of
  # Windows line endings and lines that hold only spaces.
  paragraphs = text.split(/\n\s*\n/).reject { |chunk| chunk.strip.empty? }

  # Hash lookup instead of scanning the whole stop list for every word.
  stop_lookup = {}
  STOPWORDS.each { |stop| stop_lookup[stop.downcase] = true }

  # Normalize before comparing so "The" and "the," are recognised as "the".
  normalized = words.map { |word| word.downcase.gsub(/\A[[:punct:]]+|[[:punct:]]+\z/, '') }
  keywords = normalized.reject { |word| word.empty? || stop_lookup.key?(word) }

  {
    :word_count => words.length,
    :sent_count => sentences.length,
    :para_count => paragraphs.length,
    :line_count => text.each_line.count,
    :keywords => keywords
  }
end

# Summarize how much of a text consists of keywords and which keywords occur
# most often.
#
# word_count - total number of words in the text
# keywords   - array of words taken from the text
#
# Returns a hash with:
#   :pgw               - keywords.length as a truncated integer percentage of
#                        word_count; 0 when word_count is zero
#   :most_common_words - up to ten distinct keywords not listed in STOPWORDS,
#                        most frequent first; equally frequent words keep the
#                        order in which they first appear
def useful_words(word_count, keywords)
  total = word_count.to_f
  # Dividing by zero gives NaN/Infinity, and Float#to_i raises
  # FloatDomainError for those, so an empty text is handled up front.
  percentage = total.zero? ? 0 : ((keywords.length.to_f / total) * 100).to_i

  occurrences = (keywords - STOPWORDS).group_by { |word| word }
  # The position is a tie-breaker: sort_by alone is not stable.
  ranked = occurrences.each_with_index.sort_by do |(_word, hits), position|
    [-hits.length, position]
  end

  {
    :pgw => percentage,
    :most_common_words => ranked.first(10).map { |(word, _hits), _position| word }
  }
end

# Select candidate "ideal" sentences.
#
# The sentences are ordered from shortest to longest; the window that starts
# at index (count / 7) and spans (count / 7) + 1 entries is then filtered down
# to the sentences containing whitespace + "is" or whitespace + "are" followed
# by a non-word character.
#
# Returns a hash with the matching sentences under :ideal_sentences.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  offset = by_length.length / 7
  window = by_length[offset, offset + 1]
  matches = window.grep(/\sis\W|\sare\W/)
  { :ideal_sentences => matches }
end

# Analyze every .txt file under files/ (relative to the working directory).
#
# Returns an array with one entry per file, in sorted path order. Each entry
# is [char_stats, chunk_stats, word_stats, best_sentences], i.e. the hashes
# returned by count_chars, count_chunks, useful_words and ideal_sentences.
def main
  # Dir.glob order depends on the platform on older Rubies; sorting keeps the
  # report order stable from run to run.
  Dir.glob("files/*.txt").sort.map do |text_file|
    text = File.read(text_file)

    # Collapse whitespace runs first so line wraps do not end up inside the
    # sentence fragments.
    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    char_stats = count_chars(text)
    chunk_stats = count_chunks(text)
    word_stats = useful_words(chunk_stats[:word_count], chunk_stats[:keywords])
    best_sentences = ideal_sentences(sentences)

    [char_stats, chunk_stats, word_stats, best_sentences]
  end
end

# Gather the statistics for every text file before building the page.
stats = main

# Report template. NOTE: the labelled lines are placeholders that do not show
# values yet; the collected statistics appear as a single inspected dump below
# them. The dump contains text taken from the analyzed files, so it is
# HTML-escaped to keep characters such as "<" and "&" from breaking the page.
output = <<HTML
<html>
  <body>
    <pre>
Inaugural Speech Statistics
The total number of lines in the inaugural speech is
The total number of characters in the first part is
The total number of characters less whitespace is
The total number of words is
The total number of sentences is
The total number of paragraphs is
The average sentences per paragraph is
The average words per sentence is wordcount/sentcount
% of all words in the text are non-fluff words .
The ideal sentences are:
The top 10 most common words are:
#{CGI.escapeHTML(stats.inspect)}
    </pre>
  </body>
</html>
HTML

# Send the page as the CGI response; CGI#out emits the headers and the body.
cgi = CGI.new
cgi.out { output }
	#!/usr/bin/env ruby
	# Text Analyzer
	# Author: Zach
	# Purpose: Analyze the text files under files/ and report statistics about
	# them (character, word, sentence and paragraph counts, keywords).

	require 'cgi'

	# Words excluded from the keyword statistics: one entry per line of
	# conf/stop_words.txt (the path is relative to the working directory).
	# Frozen so the shared list cannot be modified by accident.
	STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp).freeze

	# Measure the size of +text+ in characters.
	#
	# Returns a hash with:
	#   :tot_chars          - length of the text
	#   :tot_chars_no_space - length of the text once all whitespace is removed
	def count_chars(text)
	  without_whitespace = text.gsub(/\s+/, '')

	  totals = {}
	  totals[:tot_chars] = text.length
	  totals[:tot_chars_no_space] = without_whitespace.length
	  totals
	end

	# Count the words, sentences, paragraphs and lines of +text+ and collect its
	# keywords; return a hash with the stats.
	#
	# text - String with the contents of one document.
	#
	# Returns a hash with:
	#   :word_count - number of whitespace-separated tokens
	#   :sent_count - number of non-blank fragments between runs of '.', '?', '!'
	#   :para_count - number of non-blank chunks separated by blank lines
	#   :line_count - number of lines
	#   :keywords   - array of words, lowercased and with surrounding punctuation
	#                 removed, that are not listed in STOPWORDS
	def count_chunks(text)
	  words = text.split

	  # A run of terminators ("?!", "...") ends a single sentence, and a
	  # whitespace-only leftover (e.g. the newline after the last period) is not
	  # a sentence.
	  sentences = text.split(/[.?!]+/).reject { |chunk| chunk.strip.empty? }

	  # One or more blank lines separate paragraphs; \s also covers the "\r" of
	  # Windows line endings and lines that hold only spaces.
	  paragraphs = text.split(/\n\s*\n/).reject { |chunk| chunk.strip.empty? }

	  # Hash lookup instead of scanning the whole stop list for every word.
	  stop_lookup = {}
	  STOPWORDS.each { |stop| stop_lookup[stop.downcase] = true }

	  # Normalize before comparing so "The" and "the," are recognised as "the".
	  normalized = words.map { |word| word.downcase.gsub(/\A[[:punct:]]+|[[:punct:]]+\z/, '') }
	  keywords = normalized.reject { |word| word.empty? || stop_lookup.key?(word) }

	  {
	    :word_count => words.length,
	    :sent_count => sentences.length,
	    :para_count => paragraphs.length,
	    :line_count => text.each_line.count,
	    :keywords => keywords
	  }
	end

	# Summarize how much of a text consists of keywords and which keywords occur
	# most often.
	#
	# word_count - total number of words in the text
	# keywords   - array of words taken from the text
	#
	# Returns a hash with:
	#   :pgw               - keywords.length as a truncated integer percentage of
	#                        word_count; 0 when word_count is zero
	#   :most_common_words - up to ten distinct keywords not listed in STOPWORDS,
	#                        most frequent first; equally frequent words keep the
	#                        order in which they first appear
	def useful_words(word_count, keywords)
	  total = word_count.to_f
	  # Dividing by zero gives NaN/Infinity, and Float#to_i raises
	  # FloatDomainError for those, so an empty text is handled up front.
	  percentage = total.zero? ? 0 : ((keywords.length.to_f / total) * 100).to_i

	  occurrences = (keywords - STOPWORDS).group_by { |word| word }
	  # The position is a tie-breaker: sort_by alone is not stable.
	  ranked = occurrences.each_with_index.sort_by do |(_word, hits), position|
	    [-hits.length, position]
	  end

	  {
	    :pgw => percentage,
	    :most_common_words => ranked.first(10).map { |(word, _hits), _position| word }
	  }
	end

	# Select candidate "ideal" sentences.
	#
	# The sentences are ordered from shortest to longest; the window that starts
	# at index (count / 7) and spans (count / 7) + 1 entries is then filtered down
	# to the sentences containing whitespace + "is" or whitespace + "are" followed
	# by a non-word character.
	#
	# Returns a hash with the matching sentences under :ideal_sentences.
	def ideal_sentences(sentences)
	  by_length = sentences.sort_by { |sentence| sentence.length }
	  offset = by_length.length / 7
	  window = by_length.slice(offset, offset + 1)
	  # Unescaped "|" is regex alternation; an escaped "\|" would only match a
	  # literal pipe character.
	  { :ideal_sentences => window.select { |sentence| sentence =~ /\sis\W|\sare\W/ } }
	end

	# Analyze every .txt file under files/ (relative to the working directory).
	#
	# Returns an array with one entry per file, in sorted path order. Each entry
	# is [char_stats, chunk_stats, word_stats, best_sentences], i.e. the hashes
	# returned by count_chars, count_chunks, useful_words and ideal_sentences.
	def main
	  # Dir.glob order depends on the platform on older Rubies; sorting keeps the
	  # report order stable from run to run.
	  Dir.glob("files/*.txt").sort.map do |text_file|
	    text = File.read(text_file)

	    # Collapse whitespace runs first so line wraps do not end up inside the
	    # sentence fragments.
	    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

	    char_stats = count_chars(text)
	    chunk_stats = count_chunks(text)
	    word_stats = useful_words(chunk_stats[:word_count], chunk_stats[:keywords])
	    best_sentences = ideal_sentences(sentences)

	    [char_stats, chunk_stats, word_stats, best_sentences]
	  end
	end

	# Gather the statistics for every text file before building the page.
	stats = main

	# Report template. NOTE: the labelled lines are placeholders that do not show
	# values yet; the collected statistics appear as a single inspected dump below
	# them. The dump contains text taken from the analyzed files, so it is
	# HTML-escaped to keep characters such as "<" and "&" from breaking the page.
	#
	# "<<-" allows the indented terminator (a plain "<<HTML" would never be
	# closed by it); the gsub drops the one leading tab that the source
	# indentation adds to every line of the page.
	output = <<-HTML.gsub(/^\t/, '')
	<html>
	<body>
	<pre>
	Inaugural Speech Statistics
	The total number of lines in the inaugural speech is
	The total number of characters in the first part is
	The total number of characters less whitespace is
	The total number of words is
	The total number of sentences is
	The total number of paragraphs is
	The average sentences per paragraph is
	The average words per sentence is wordcount/sentcount
	% of all words in the text are non-fluff words .
	The ideal sentences are:
	The top 10 most common words are:
	#{CGI.escapeHTML(stats.inspect)}
	</pre>
	</body>
	</html>
	HTML

	# Send the page as the CGI response; CGI#out emits the headers and the body.
	cgi = CGI.new
	cgi.out { output }