Last active
September 16, 2015 08:40
-
-
Save lnaia/bb4e7c0c07c7026a4082 to your computer and use it in GitHub Desktop.
Text metadata generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+--------------------------------------------+--------+
| Metadata |
+--------------------------------------------+--------+
| Type | Result |
+--------------------------------------------+--------+
| number_of_words | 454 |
| number_of_characters | 3150 |
| number_of_characters_excluding_white_space | 2690 |
| number_of_sentences | 54 |
| number_of_paragraphs | 5 |
| average_words_in_sentences | 8 |
| average_words_in_paragraphs | 90 |
| average_sentences_in_paragraphs | 10 |
| is_text_english_language? | false |
+--------------------------------------------+--------+
+---------+-------+-------+
| Top 10 words |
+---------+-------+-------+
| Word | Count | % |
+---------+-------+-------+
| vitae | 8 | 1.76% |
| ut | 8 | 1.76% |
| vel | 8 | 1.76% |
| sem | 7 | 1.54% |
| iaculis | 7 | 1.54% |
| odio | 7 | 1.54% |
| ac | 7 | 1.54% |
| quis | 6 | 1.32% |
| non | 6 | 1.32% |
| at | 6 | 1.32% |
| massa | 6 | 1.32% |
+---------+-------+-------+
+-------+---------+---------+---------+
| Top 3 Character Comparison |
+-------+---------+---------+---------+
| Found | % | English | % |
+-------+---------+---------+---------+
| e | 11.227% | e | 12.702% |
| i | 10.074% | t | 9.056% |
| u | 8.922% | a | 8.167% |
+-------+---------+---------+---------+
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
#encoding: UTF-8
require 'terminal-table'
require 'awesome_print' # for debug
# Usage:
# echo "a e i o u" | ./text_metadata.rb
# cat <file> | ./text_metadata.rb
# Read the whole input into one string. ARGF yields the lines of the files
# named on the command line, or of STDIN when no file is given.
contents = ''
ARGF.each_line { |line| contents << line }
# Counts how often each word occurs.
#
# @param words [Array<String>] tokens to count. Empty strings are ignored:
#   String#split emits a leading "" when the text starts with a separator,
#   and that token is not a word.
# @return [Array<Hash>] one {:word => String, :count => Integer} entry per
#   distinct word, most frequent first. Words with equal counts are ordered
#   alphabetically so the result does not depend on the sort implementation.
def word_occurrence(words)
  counts = Hash.new(0)
  words.each { |word| counts[word] += 1 unless word.empty? }

  entries = counts.map { |word, count| {:word => word, :count => count} }
  # Negated count gives descending frequency; the word itself breaks ties.
  entries.sort_by { |entry| [-entry[:count], entry[:word]] }
end
# Splits text into sentences.
#
# The text is cut at every run of sentence terminators (".", "!" or "?").
# A fragment is kept only when, after trimming surrounding whitespace, one of
# its lines begins with an ASCII capital letter; this drops empty fragments
# and pieces such as the "14" left over from "3.14".
#
# @param text [String] text to split.
# @return [Array<String>] trimmed sentences without their terminator.
def sentences(text)
  text.split(/[.!?]+/).each_with_object([]) do |fragment, found|
    candidate = fragment.strip
    # `^` matches at the start of any line inside the fragment, so a fragment
    # that spans a line break still counts when a later line is capitalised.
    found << candidate if candidate =~ /^[A-Z]/
  end
end
# Average number of words per group, using integer division (the fractional
# part is discarded).
#
# Words are runs of \w characters. They are collected with String#scan rather
# than split(/\W+/), because split yields a leading "" for a group that starts
# with punctuation (for example an opening quote) and that inflated the count.
#
# @param groups_of_words [Array<String>] sentences, paragraphs or similar.
# @return [Integer] truncated average; 0 for an empty list.
def average_words(groups_of_words)
  return 0 if groups_of_words.empty?

  total = groups_of_words.inject(0) { |sum, group| sum + group.scan(/\w+/).length }
  total / groups_of_words.length
end
# Average number of sentences per paragraph, using integer division (the
# fractional part is discarded). Sentences are found with `sentences`.
#
# @param paragraphs [Array<String>] paragraphs to inspect.
# @return [Integer] truncated average; 0 for an empty list.
def average_sentences(paragraphs)
  return 0 if paragraphs.empty?

  total = paragraphs.inject(0) { |sum, paragraph| sum + sentences(paragraph).length }
  total / paragraphs.length
end
# Prints the metadata as three terminal-table tables: the scalar values, the
# most frequent words with their share of all words, and the three most
# frequent characters next to the English reference frequencies.
#
# NOTE(review): a top-level `display` shadows Ruby's built-in Object#display;
# the name is kept because callers use it.
#
# @param metadata [Hash] must provide :number_of_words, :top_ten_words
#   (entries with :word and :count) and :top_characters (entries with :letter
#   and :percentage); every other key is printed as a Type/Result row.
# @return [nil] the result of `puts`.
def display(metadata)
  # These two entries get their own tables and are left out of the general one.
  separate_tables = [:top_ten_words, :top_characters]

  options = {:title => 'Metadata', :headings => %w(Type Result)}
  general_info = Terminal::Table.new(options) do |t|
    metadata.each do |type, result|
      t.add_row [type, result] unless separate_tables.include?(type)
    end
  end

  total_words = metadata[:number_of_words].to_i
  options = {:title => 'Top 10 words', :headings => %w(Word Count %)}
  top_words = Terminal::Table.new(options) do |t|
    metadata[:top_ten_words].each do |item|
      percentage = (item[:count].to_i * 100.0) / total_words
      t.add_row [item[:word], item[:count], format('%.2f%%', percentage)]
    end
  end

  reference_letters = english_top_letters
  options = {:title => 'Top 3 Character Comparison', :headings => %w(Found % English %)}
  character_comparison = Terminal::Table.new(options) do |t|
    metadata[:top_characters].first(3).each_with_index do |character, index|
      reference = reference_letters[index]
      t.add_row [
        character[:letter],
        format('%.3f%%', character[:percentage]),
        reference[:letter],
        format('%.3f%%', reference[:percentage])
      ]
    end
  end

  puts "#{general_info}\n\n#{top_words}\n\n#{character_comparison}"
end
# Splits text into paragraphs, one per non-blank line.
#
# Lines are separated on LF, CRLF or a lone CR. The separator is a Regexp: a
# String argument to String#split is matched literally, so "\r?\n" would only
# match the three characters CR, "?", LF. Lines that are empty or contain
# only whitespace are not paragraphs and are dropped.
#
# @param text [String] text to split.
# @return [Array<String>] paragraphs in order of appearance, not trimmed.
def paragraphs(text)
  text.split(/\r\n?|\n/).reject { |line| line.strip.empty? }
end
# Reference table of the ten most frequent letters in English text, most
# frequent first.
# Source: https://en.wikipedia.org/wiki/Letter_frequency
#
# @return [Array<Hash>] entries of the form
#   {:letter => String, :percentage => Float}; a new array on every call.
def english_top_letters
  [
    ['e', 12.702],
    ['t', 9.056],
    ['a', 8.167],
    ['o', 7.507],
    ['i', 6.966],
    ['n', 6.749],
    ['s', 6.327],
    ['h', 6.094],
    ['r', 5.987],
    ['d', 4.253]
  ].map { |letter, percentage| {:letter => letter, :percentage => percentage} }
end
# Frequency of every non-whitespace character in the text.
#
# NOTE(review): counting is case-sensitive and includes digits and
# punctuation, while the English reference table lists lowercase letters
# only - confirm whether the text should be normalised before counting.
#
# @param text [String] text to analyse.
# @return [Array<Hash>] entries of the form
#   {:letter => String, :count => Integer, :percentage => Float}, most
#   frequent first; equal counts are ordered by character so the result is
#   deterministic. The percentage is relative to all non-whitespace
#   characters. Empty when the text has no such characters.
def character_occurrence(text)
  characters = text.gsub(/\s/, '').chars.to_a
  total = characters.length
  counts = characters.each_with_object(Hash.new(0)) { |char, seen| seen[char] += 1 }

  entries = counts.map do |char, count|
    {:letter => char, :count => count, :percentage => count * 100.0 / total}
  end
  entries.sort_by { |entry| [-entry[:count], entry[:letter]] }
end
# Guesses whether text is English by comparing its three most frequent
# characters with the three most frequent English letters.
#
# The text counts as English only when, for each of the three ranks, the
# character equals the reference letter and the reference percentage lies
# within `error_margin` of the measured percentage.
#
# NOTE(review): `error_margin` is compared against values on a 0-100 scale,
# so the default of 0.015 is a very narrow window - confirm whether 1.5 was
# intended. The default is unchanged here.
#
# @param top_letters [Array<Hash>] entries with :letter and :percentage,
#   most frequent first.
# @param error_margin [Float] allowed absolute difference between measured
#   and reference percentage.
# @return [Boolean] false when fewer than three characters are available,
#   since there is nothing to compare (an empty list used to yield true).
def is_english_language?(top_letters, error_margin = 0.015)
  sample = top_letters.first(3)
  return false if sample.length < 3

  reference_letters = english_top_letters
  sample.each_with_index.all? do |found, index|
    reference = reference_letters[index]
    lower = found[:percentage] - error_margin
    upper = found[:percentage] + error_margin
    found[:letter] == reference[:letter] && reference[:percentage].between?(lower, upper)
  end
end
# Builds the metadata hash for a text. Key order is significant: `display`
# prints the scalar entries in this order.
#
# Words are runs of \w characters, collected with String#scan so that text
# starting with punctuation does not add an empty "word" to the count.
#
# @param contents [String] the text to analyse.
# @return [Hash] counts (words, characters, sentences, paragraphs), truncated
#   averages, :top_ten_words (at most 10 entries from `word_occurrence`),
#   :top_characters (all entries from `character_occurrence`) and the
#   :is_text_english_language? flag.
def calculate_metadata(contents)
  # Each helper result is computed once and reused below.
  words = contents.scan(/\w+/)
  sentence_list = sentences(contents)
  paragraph_list = paragraphs(contents)
  characters = character_occurrence(contents)

  {
    :number_of_words => words.length,
    :number_of_characters => contents.length,
    :number_of_characters_excluding_white_space => contents.gsub(/\s/, '').length,
    :number_of_sentences => sentence_list.length,
    :number_of_paragraphs => paragraph_list.length,
    :average_words_in_sentences => average_words(sentence_list),
    :average_words_in_paragraphs => average_words(paragraph_list),
    :average_sentences_in_paragraphs => average_sentences(paragraph_list),
    # first(10) yields ten entries; the range 0..10 used before yielded eleven.
    :top_ten_words => word_occurrence(words).first(10),
    :top_characters => characters,
    :is_text_english_language? => is_english_language?(characters)
  }
end
# Analyse the text read from the input and print the resulting tables.
display(calculate_metadata(contents))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment