lnaia/sample_output.txt

## sample_output.txt
+--------------------------------------------+--------+
|                      Metadata                       |
+--------------------------------------------+--------+
| Type                                       | Result |
+--------------------------------------------+--------+
| number_of_words                            | 454    |
| number_of_characters                       | 3150   |
| number_of_characters_excluding_white_space | 2690   |
| number_of_sentences                        | 54     |
| number_of_paragraphs                       | 5      |
| average_words_in_sentences                 | 8      |
| average_words_in_paragraphs                | 90     |
| average_sentences_in_paragraphs            | 10     |
| is_text_english_language?                  | false  |
+--------------------------------------------+--------+

+---------+-------+-------+
|      Top 10 words       |
+---------+-------+-------+
| Word    | Count | %     |
+---------+-------+-------+
| vitae   | 8     | 1.76% |
| ut      | 8     | 1.76% |
| vel     | 8     | 1.76% |
| sem     | 7     | 1.54% |
| iaculis | 7     | 1.54% |
| odio    | 7     | 1.54% |
| ac      | 7     | 1.54% |
| quis    | 6     | 1.32% |
| non     | 6     | 1.32% |
| at      | 6     | 1.32% |
| massa   | 6     | 1.32% |
+---------+-------+-------+

+-------+---------+---------+---------+
|     Top 3 Character Comparison      |
+-------+---------+---------+---------+
| Found | %       | English | %       |
+-------+---------+---------+---------+
| e     | 11.227% | e       | 12.702% |
| i     | 10.074% | t       | 9.056%  |
| u     | 8.922%  | a       | 8.167%  |
+-------+---------+---------+---------+

## text_metadata.rb
#!/usr/bin/ruby
#encoding: UTF-8

require 'terminal-table'
require 'awesome_print' # for debug

# Usage:
# echo "a e i o u" | ./text_metadata.rb
# cat <file> | ./text_metadata.rb

# Read every line offered by ARGF (files named on the command line, or
# standard input when none are given) and accumulate them in one string.
contents = ARGF.each_line.each_with_object('') { |line, buffer| buffer << line }

# Counts how often each word occurs and ranks the words by frequency.
#
# Words are compared exactly as given (case-sensitive, no normalisation).
#
# @param words [Array<String>] the tokens to count
# @return [Array<Hash>] one +{:word => String, :count => Integer}+ entry per
#   distinct word, highest count first; words with the same count keep the
#   order in which they first appeared in +words+
def word_occurrence(words)
  counts = words.each_with_object(Hash.new(0)) { |word, tally| tally[word] += 1 }
  # Hashes iterate in insertion order, so the running index records the first
  # appearance of each word and gives the sort a deterministic tie-break
  # (a plain sort followed by reverse leaves equal counts in arbitrary order).
  ranked = counts.each_with_index.sort_by { |(_word, count), position| [-count, position] }
  ranked.map { |(word, count), _position| {:word => word, :count => count} }
end

# Splits text into sentences.
#
# The text is cut at every sentence terminator (".", "!" or "?"). A fragment
# is kept only when, after surrounding whitespace is stripped, one of its
# lines begins with an ASCII capital letter; everything else (lower-case
# fragments, empty pieces between consecutive terminators) is discarded.
#
# @param text [String] the text to split
# @return [Array<String>] the stripped sentences, without their terminator
def sentences(text)
  text.split(/[.!?]/).map(&:strip).select { |fragment| fragment =~ /^[A-Z]/ }
end

# Computes the average number of words per group of text.
#
# A word is a run of word characters (+\w+). The result uses integer
# division, so any fractional part is dropped.
#
# @param groups_of_words [Array<String>] e.g. sentences or paragraphs
# @return [Integer] the truncated average, or 0 when there are no groups
def average_words(groups_of_words)
  return 0 if groups_of_words.empty?
  # scan collects the word tokens themselves; split(/\W+/) would add an empty
  # leading element whenever a group starts with punctuation or whitespace,
  # which inflates the count.
  total = groups_of_words.inject(0) { |sum, group| sum + group.scan(/\w+/).length }
  total / groups_of_words.length
end

# Computes the average number of sentences per paragraph, using +sentences+
# to split each paragraph. Integer division drops any fractional part.
#
# @param paragraphs [Array<String>] the paragraphs to inspect
# @return [Integer] the truncated average, or 0 when no sentence was found
def average_sentences(paragraphs)
  total = paragraphs.inject(0) { |running, paragraph| running + sentences(paragraph).length }
  return 0 unless total > 0
  total / paragraphs.length
end

# Prints the computed metadata to standard output as three tables separated
# by blank lines: the general figures, the most frequent words, and the top
# three characters next to the reference English letters.
#
# @param metadata [Hash] reads :number_of_words, :top_ten_words (entries with
#   :word and :count) and :top_characters (entries with :letter and
#   :percentage); every other key becomes a row of the first table
# @return [nil] the return value of +puts+
def display(metadata)
  detail_keys = [:top_ten_words, :top_characters]
  summary = Terminal::Table.new({:title => 'Metadata', :headings => %w(Type Result)}) do |table|
    metadata.each do |type, result|
      # The two detail collections get tables of their own below.
      next if detail_keys.include?(type)
      table.add_row [type, result]
    end
  end

  total_words = metadata[:number_of_words].to_i
  words = Terminal::Table.new({:title => 'Top 10 words', :headings => %w(Word Count %)}) do |table|
    metadata[:top_ten_words].each do |entry|
      share = (entry[:count].to_i * 100.0) / total_words
      table.add_row [entry[:word], entry[:count], sprintf('%.2f', share) + '%']
    end
  end

  reference = english_top_letters
  comparison_options = {:title => 'Top 3 Character Comparison', :headings => %w(Found % English %)}
  comparison = Terminal::Table.new(comparison_options) do |table|
    metadata[:top_characters][0 .. 2].each_with_index do |character, index|
      expected = reference[index]
      table.add_row [
        character[:letter],
        sprintf('%.3f', character[:percentage]) + '%',
        expected[:letter],
        sprintf('%.3f', expected[:percentage]) + '%'
      ]
    end
  end

  puts "#{summary}\n\n#{words}\n\n#{comparison}"
end

# Splits text into paragraphs, one per non-blank line.
#
# Lines may end in "\n", "\r\n" or a lone "\r", so text with Windows line
# endings no longer keeps a trailing "\r" on every paragraph. Lines that are
# empty or contain only whitespace act as separators and are dropped.
#
# @param text [String] the text to split
# @return [Array<String>] the paragraphs, in order, without line terminators
def paragraphs(text)
  text.split(/\r\n|\r|\n/).reject { |line| line.strip.empty? }
end

# Reference table of the ten most frequent letters in English, most frequent
# first, with the frequency of each expressed as a percentage.
#
# @return [Array<Hash>] +{:letter => String, :percentage => Float}+ entries
def english_top_letters
  # https://en.wikipedia.org/wiki/Letter_frequency
  [
    ['e', 12.702],
    ['t', 9.056],
    ['a', 8.167],
    ['o', 7.507],
    ['i', 6.966],
    ['n', 6.749],
    ['s', 6.327],
    ['h', 6.094],
    ['r', 5.987],
    ['d', 4.253]
  ].map { |letter, percentage| {:letter => letter, :percentage => percentage} }
end

# Computes the frequency of each letter in the text.
#
# Only alphabetic characters are counted and case is folded, so the
# percentages are shares of all letters and can be compared directly with a
# letter-frequency table. Punctuation, digits and whitespace are ignored.
#
# @param text [String] the text to analyse
# @return [Array<Hash>] one +{:letter => String, :count => Integer,
#   :percentage => Float}+ entry per distinct letter, most frequent first;
#   letters with equal counts keep the order in which they first appeared.
#   Empty when the text contains no letters.
def character_occurrence(text)
  letters = text.downcase.scan(/[[:alpha:]]/)
  counts = letters.each_with_object(Hash.new(0)) { |letter, tally| tally[letter] += 1 }
  # The insertion index breaks ties deterministically (first appearance wins).
  ranked = counts.each_with_index.sort_by { |(_letter, count), position| [-count, position] }
  ranked.map do |(letter, count), _position|
    {:letter => letter, :count => count, :percentage => count * 100.0 / letters.length}
  end
end


# Guesses whether a letter-frequency ranking looks like English.
#
# The three most frequent letters must be the same letters, in the same
# order, as the first three entries of +english_top_letters+, and each
# frequency must lie within +error_margin+ of the reference frequency.
#
# @param top_letters [Array<Hash>] entries with :letter and :percentage
#   (0-100 scale), most frequent first
# @param error_margin [Numeric] allowed deviation in percentage points; it
#   is on the same 0-100 scale as the frequencies (the former value of 0.015
#   was effectively an exact-match requirement)
# @return [Boolean] false when fewer than three letters are supplied
def is_english_language?(top_letters, error_margin = 1.5)
  reference = english_top_letters.first(3)
  candidates = top_letters.first(3)
  # Too little data to judge; in particular empty text is not English.
  return false if candidates.length < reference.length

  candidates.zip(reference).all? do |found, expected|
    found[:letter] == expected[:letter] &&
      (found[:percentage] - expected[:percentage]).abs <= error_margin
  end
end

# Gathers all statistics about the text into one hash.
#
# The hash keys are inserted in the order in which they are meant to be
# listed. Each helper result (words, sentences, paragraphs, character
# ranking) is computed once and reused.
#
# @param contents [String] the full text to analyse
# @return [Hash] counts and averages under symbol keys, plus
#   :top_ten_words (at most ten entries from +word_occurrence+),
#   :top_characters (the result of +character_occurrence+) and
#   :is_text_english_language? (the result of +is_english_language?+)
def calculate_metadata(contents)
  # scan yields only real word tokens; split(/\W+/) adds an empty leading
  # element when the text starts with a non-word character.
  words = contents.scan(/\w+/)
  sentence_list = sentences(contents)
  paragraph_list = paragraphs(contents)
  characters = character_occurrence(contents)

  {
    :number_of_words => words.length,
    :number_of_characters => contents.length,
    :number_of_characters_excluding_white_space => contents.gsub(/\s/, '').length,
    :number_of_sentences => sentence_list.length,
    :number_of_paragraphs => paragraph_list.length,
    :average_words_in_sentences => average_words(sentence_list),
    :average_words_in_paragraphs => average_words(paragraph_list),
    :average_sentences_in_paragraphs => average_sentences(paragraph_list),
    # first(10) keeps exactly ten entries; the range 0..10 kept eleven.
    :top_ten_words => word_occurrence(words).first(10),
    :top_characters => characters,
    :is_text_english_language? => is_english_language?(characters)
  }
end

# Entry point: compute the metadata for the text held in +contents+ and print it.
display(calculate_metadata(contents))
	+--------------------------------------------+--------+
	\| Metadata \|
	+--------------------------------------------+--------+
	\| Type \| Result \|
	+--------------------------------------------+--------+
	\| number_of_words \| 454 \|
	\| number_of_characters \| 3150 \|
	\| number_of_characters_excluding_white_space \| 2690 \|
	\| number_of_sentences \| 54 \|
	\| number_of_paragraphs \| 5 \|
	\| average_words_in_sentences \| 8 \|
	\| average_words_in_paragraphs \| 90 \|
	\| average_sentences_in_paragraphs \| 10 \|
	\| is_text_english_language? \| false \|
	+--------------------------------------------+--------+

	+---------+-------+-------+
	\| Top 10 words \|
	+---------+-------+-------+
	\| Word \| Count \| % \|
	+---------+-------+-------+
	\| vitae \| 8 \| 1.76% \|
	\| ut \| 8 \| 1.76% \|
	\| vel \| 8 \| 1.76% \|
	\| sem \| 7 \| 1.54% \|
	\| iaculis \| 7 \| 1.54% \|
	\| odio \| 7 \| 1.54% \|
	\| ac \| 7 \| 1.54% \|
	\| quis \| 6 \| 1.32% \|
	\| non \| 6 \| 1.32% \|
	\| at \| 6 \| 1.32% \|
	\| massa \| 6 \| 1.32% \|
	+---------+-------+-------+

	+-------+---------+---------+---------+
	\| Top 3 Character Comparison \|
	+-------+---------+---------+---------+
	\| Found \| % \| English \| % \|
	+-------+---------+---------+---------+
	\| e \| 11.227% \| e \| 12.702% \|
	\| i \| 10.074% \| t \| 9.056% \|
	\| u \| 8.922% \| a \| 8.167% \|
	+-------+---------+---------+---------+
	#!/usr/bin/ruby
	#encoding: UTF-8

	require 'terminal-table'
	require 'awesome_print' # for debug

	# Usage:
	# echo "a e i o u" | ./text_metadata.rb
	# cat <file> | ./text_metadata.rb

	# Read every line offered by ARGF (files named on the command line, or
	# standard input when none are given) and accumulate them in one string.
	# The block parameter is written with plain pipes; the escaped form "\|"
	# is not valid Ruby.
	contents = ''
	ARGF.each { |line| contents << line }

	# Counts how often each word occurs and ranks the words by frequency.
	#
	# Words are compared exactly as given (case-sensitive, no normalisation).
	#
	# @param words [Array<String>] the tokens to count
	# @return [Array<Hash>] one +{:word => String, :count => Integer}+ entry per
	#   distinct word, highest count first; words with the same count keep the
	#   order in which they first appeared in +words+
	def word_occurrence(words)
	  counts = words.each_with_object(Hash.new(0)) { |word, tally| tally[word] += 1 }
	  # Hashes iterate in insertion order, so the running index records the first
	  # appearance of each word and gives the sort a deterministic tie-break
	  # (a plain sort followed by reverse leaves equal counts in arbitrary order).
	  ranked = counts.each_with_index.sort_by { |(_word, count), position| [-count, position] }
	  ranked.map { |(word, count), _position| {:word => word, :count => count} }
	end

	# Splits text into sentences.
	#
	# The text is cut at every sentence terminator (".", "!" or "?"). A fragment
	# is kept only when, after surrounding whitespace is stripped, one of its
	# lines begins with an ASCII capital letter; everything else (lower-case
	# fragments, empty pieces between consecutive terminators) is discarded.
	#
	# @param text [String] the text to split
	# @return [Array<String>] the stripped sentences, without their terminator
	def sentences(text)
	  text.split(/[.!?]/).map(&:strip).select { |fragment| fragment =~ /^[A-Z]/ }
	end

	# Computes the average number of words per group of text.
	#
	# A word is a run of word characters (+\w+). The result uses integer
	# division, so any fractional part is dropped.
	#
	# @param groups_of_words [Array<String>] e.g. sentences or paragraphs
	# @return [Integer] the truncated average, or 0 when there are no groups
	def average_words(groups_of_words)
	  return 0 if groups_of_words.empty?
	  # scan collects the word tokens themselves; split(/\W+/) would add an empty
	  # leading element whenever a group starts with punctuation or whitespace,
	  # which inflates the count.
	  total = groups_of_words.inject(0) { |sum, group| sum + group.scan(/\w+/).length }
	  total / groups_of_words.length
	end

	# Computes the average number of sentences per paragraph, using +sentences+
	# to split each paragraph. Integer division drops any fractional part.
	# The block parameter uses plain pipes; the escaped form "\|" is not valid Ruby.
	#
	# @param paragraphs [Array<String>] the paragraphs to inspect
	# @return [Integer] the truncated average, or 0 when no sentence was found
	def average_sentences(paragraphs)
	  total = paragraphs.inject(0) { |running, paragraph| running + sentences(paragraph).length }
	  return 0 unless total > 0
	  total / paragraphs.length
	end

	# Prints the computed metadata to standard output as three tables separated
	# by blank lines: the general figures, the most frequent words, and the top
	# three characters next to the reference English letters.
	# Block parameters use plain pipes; the escaped form "\|" is not valid Ruby.
	#
	# @param metadata [Hash] reads :number_of_words, :top_ten_words (entries with
	#   :word and :count) and :top_characters (entries with :letter and
	#   :percentage); every other key becomes a row of the first table
	# @return [nil] the return value of +puts+
	def display(metadata)
	  detail_keys = [:top_ten_words, :top_characters]
	  summary = Terminal::Table.new({:title => 'Metadata', :headings => %w(Type Result)}) do |table|
	    metadata.each do |type, result|
	      # The two detail collections get tables of their own below.
	      next if detail_keys.include?(type)
	      table.add_row [type, result]
	    end
	  end
	
	  total_words = metadata[:number_of_words].to_i
	  words = Terminal::Table.new({:title => 'Top 10 words', :headings => %w(Word Count %)}) do |table|
	    metadata[:top_ten_words].each do |entry|
	      share = (entry[:count].to_i * 100.0) / total_words
	      table.add_row [entry[:word], entry[:count], sprintf('%.2f', share) + '%']
	    end
	  end
	
	  reference = english_top_letters
	  comparison_options = {:title => 'Top 3 Character Comparison', :headings => %w(Found % English %)}
	  comparison = Terminal::Table.new(comparison_options) do |table|
	    metadata[:top_characters][0 .. 2].each_with_index do |character, index|
	      expected = reference[index]
	      table.add_row [
	        character[:letter],
	        sprintf('%.3f', character[:percentage]) + '%',
	        expected[:letter],
	        sprintf('%.3f', expected[:percentage]) + '%'
	      ]
	    end
	  end
	
	  puts "#{summary}\n\n#{words}\n\n#{comparison}"
	end

	# Splits text into paragraphs, one per non-blank line.
	#
	# Lines may end in "\n", "\r\n" or a lone "\r", so text with Windows line
	# endings no longer keeps a trailing "\r" on every paragraph. Lines that are
	# empty or contain only whitespace act as separators and are dropped.
	#
	# @param text [String] the text to split
	# @return [Array<String>] the paragraphs, in order, without line terminators
	def paragraphs(text)
	  text.split(/\r\n|\r|\n/).reject { |line| line.strip.empty? }
	end

	# Reference table of the ten most frequent letters in English, most frequent
	# first, with the frequency of each expressed as a percentage.
	#
	# @return [Array<Hash>] +{:letter => String, :percentage => Float}+ entries
	def english_top_letters
	  # https://en.wikipedia.org/wiki/Letter_frequency
	  [
	    ['e', 12.702],
	    ['t', 9.056],
	    ['a', 8.167],
	    ['o', 7.507],
	    ['i', 6.966],
	    ['n', 6.749],
	    ['s', 6.327],
	    ['h', 6.094],
	    ['r', 5.987],
	    ['d', 4.253]
	  ].map { |letter, percentage| {:letter => letter, :percentage => percentage} }
	end

	# Computes the frequency of each letter in the text.
	#
	# Only alphabetic characters are counted and case is folded, so the
	# percentages are shares of all letters and can be compared directly with a
	# letter-frequency table. Punctuation, digits and whitespace are ignored.
	#
	# @param text [String] the text to analyse
	# @return [Array<Hash>] one +{:letter => String, :count => Integer,
	#   :percentage => Float}+ entry per distinct letter, most frequent first;
	#   letters with equal counts keep the order in which they first appeared.
	#   Empty when the text contains no letters.
	def character_occurrence(text)
	  letters = text.downcase.scan(/[[:alpha:]]/)
	  counts = letters.each_with_object(Hash.new(0)) { |letter, tally| tally[letter] += 1 }
	  # The insertion index breaks ties deterministically (first appearance wins).
	  ranked = counts.each_with_index.sort_by { |(_letter, count), position| [-count, position] }
	  ranked.map do |(letter, count), _position|
	    {:letter => letter, :count => count, :percentage => count * 100.0 / letters.length}
	  end
	end


	# Guesses whether a letter-frequency ranking looks like English.
	#
	# The three most frequent letters must be the same letters, in the same
	# order, as the first three entries of +english_top_letters+, and each
	# frequency must lie within +error_margin+ of the reference frequency.
	#
	# @param top_letters [Array<Hash>] entries with :letter and :percentage
	#   (0-100 scale), most frequent first
	# @param error_margin [Numeric] allowed deviation in percentage points; it
	#   is on the same 0-100 scale as the frequencies (the former value of 0.015
	#   was effectively an exact-match requirement)
	# @return [Boolean] false when fewer than three letters are supplied
	def is_english_language?(top_letters, error_margin = 1.5)
	  reference = english_top_letters.first(3)
	  candidates = top_letters.first(3)
	  # Too little data to judge; in particular empty text is not English.
	  return false if candidates.length < reference.length
	
	  candidates.zip(reference).all? do |found, expected|
	    found[:letter] == expected[:letter] &&
	      (found[:percentage] - expected[:percentage]).abs <= error_margin
	  end
	end

	# Gathers all statistics about the text into one hash.
	#
	# The hash keys are inserted in the order in which they are meant to be
	# listed. Each helper result (words, sentences, paragraphs, character
	# ranking) is computed once and reused.
	#
	# @param contents [String] the full text to analyse
	# @return [Hash] counts and averages under symbol keys, plus
	#   :top_ten_words (at most ten entries from +word_occurrence+),
	#   :top_characters (the result of +character_occurrence+) and
	#   :is_text_english_language? (the result of +is_english_language?+)
	def calculate_metadata(contents)
	  # scan yields only real word tokens; split(/\W+/) adds an empty leading
	  # element when the text starts with a non-word character.
	  words = contents.scan(/\w+/)
	  sentence_list = sentences(contents)
	  paragraph_list = paragraphs(contents)
	  characters = character_occurrence(contents)
	
	  {
	    :number_of_words => words.length,
	    :number_of_characters => contents.length,
	    :number_of_characters_excluding_white_space => contents.gsub(/\s/, '').length,
	    :number_of_sentences => sentence_list.length,
	    :number_of_paragraphs => paragraph_list.length,
	    :average_words_in_sentences => average_words(sentence_list),
	    :average_words_in_paragraphs => average_words(paragraph_list),
	    :average_sentences_in_paragraphs => average_sentences(paragraph_list),
	    # first(10) keeps exactly ten entries; the range 0..10 kept eleven.
	    :top_ten_words => word_occurrence(words).first(10),
	    :top_characters => characters,
	    :is_text_english_language? => is_english_language?(characters)
	  }
	end

	# Entry point: compute the metadata for the text held in +contents+ and print it.
	display(calculate_metadata(contents))