Skip to content

Instantly share code, notes, and snippets.

@lnaia
Last active September 16, 2015 08:40
Show Gist options
  • Save lnaia/bb4e7c0c07c7026a4082 to your computer and use it in GitHub Desktop.
Save lnaia/bb4e7c0c07c7026a4082 to your computer and use it in GitHub Desktop.
Text metadata generator
+--------------------------------------------+--------+
| Metadata |
+--------------------------------------------+--------+
| Type | Result |
+--------------------------------------------+--------+
| number_of_words | 454 |
| number_of_characters | 3150 |
| number_of_characters_excluding_white_space | 2690 |
| number_of_sentences | 54 |
| number_of_paragraphs | 5 |
| average_words_in_sentences | 8 |
| average_words_in_paragraphs | 90 |
| average_sentences_in_paragraphs | 10 |
| is_text_english_language? | false |
+--------------------------------------------+--------+
+---------+-------+-------+
| Top 10 words |
+---------+-------+-------+
| Word | Count | % |
+---------+-------+-------+
| vitae | 8 | 1.76% |
| ut | 8 | 1.76% |
| vel | 8 | 1.76% |
| sem | 7 | 1.54% |
| iaculis | 7 | 1.54% |
| odio | 7 | 1.54% |
| ac | 7 | 1.54% |
| quis | 6 | 1.32% |
| non | 6 | 1.32% |
| at | 6 | 1.32% |
| massa | 6 | 1.32% |
+---------+-------+-------+
+-------+---------+---------+---------+
| Top 3 Character Comparison |
+-------+---------+---------+---------+
| Found | % | English | % |
+-------+---------+---------+---------+
| e | 11.227% | e | 12.702% |
| i | 10.074% | t | 9.056% |
| u | 8.922% | a | 8.167% |
+-------+---------+---------+---------+
#!/usr/bin/ruby
#encoding: UTF-8
require 'terminal-table'
require 'awesome_print' # for debug
# Usage:
# echo "a e i o u" | ./text_metadata.rb
# cat <file> | ./text_metadata.rb
# Accumulate every line supplied through ARGF (files named on the command
# line, or standard input when none are given) into one string.
contents = ''
ARGF.each_line do |input_line|
  contents << input_line
end
# Counts how many times each entry of +words+ occurs.
#
# words - Array of word strings.
#
# Returns an Array of {:word => String, :count => Integer} hashes ordered
# from the highest count to the lowest.
def word_occurrence(words)
  tally = words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
  entries = tally.map { |word, count| {:word => word, :count => count} }
  # Ascending sort followed by reverse puts the most frequent words first.
  entries.sort { |left, right| left[:count] <=> right[:count] }.reverse
end
# Splits +text+ on full stops and keeps the fragments that look like
# sentences.
#
# A fragment is kept (with surrounding whitespace stripped) when, after
# stripping, one of its lines begins with an uppercase ASCII letter; `^`
# anchors at every line start, not only at the start of the fragment.
#
# Returns an Array of Strings.
def sentences(text)
  text.split(/\./).each_with_object([]) do |fragment, kept|
    trimmed = fragment.strip
    kept << trimmed if trimmed.match(/^[A-Z]/)
  end
end
# Computes the average number of words per group (sentence, paragraph, ...).
#
# groups_of_words - Array of Strings, each holding one group of words.
#
# Words are runs of word characters (\w+). Scanning for words, rather than
# splitting on non-word characters, avoids counting a phantom empty word
# when a group starts with indentation, a quote or other punctuation.
#
# Returns the Integer average (integer division, so it is truncated), or 0
# when there are no groups or no words.
def average_words(groups_of_words)
  total = groups_of_words.inject(0) { |sum, group| sum + group.scan(/\w+/).length }
  total > 0 ? total / groups_of_words.length : 0
end
# Computes the average number of sentences per paragraph.
#
# paragraphs - Array of paragraph Strings; each one is passed to +sentences+.
#
# Returns the Integer average (integer division), or 0 when no sentences
# were found.
def average_sentences(paragraphs)
  total = paragraphs.inject(0) { |running, paragraph| running + sentences(paragraph).length }
  return 0 unless total > 0

  total / paragraphs.length
end
# Prints three Terminal::Table tables built from +metadata+, separated by
# blank lines:
#
# 1. every metadata entry except :top_ten_words and :top_characters;
# 2. the entries of :top_ten_words with their share of :number_of_words;
# 3. the first three entries of :top_characters next to the corresponding
#    entries of +english_top_letters+.
#
# metadata - Hash as produced by +calculate_metadata+.
def display(metadata)
  detail_keys = [:top_ten_words, :top_characters]

  general_info = Terminal::Table.new({:title => 'Metadata', :headings => %w(Type Result)}) do |t|
    metadata.each do |type, result|
      # The detail entries get their own tables below.
      next if detail_keys.include?(type)

      t.add_row [type, result]
    end
  end

  word_options = {:title => 'Top 10 words', :headings => %w(Word Count %)}
  top_words = Terminal::Table.new(word_options) do |t|
    metadata[:top_ten_words].each do |item|
      share = (item[:count].to_i * 100.0) / metadata[:number_of_words].to_i
      t.add_row [item[:word], item[:count], format('%.2f%%', share)]
    end
  end

  character_options = {:title => 'Top 3 Character Comparison', :headings => %w(Found % English %)}
  character_comparison = Terminal::Table.new(character_options) do |t|
    metadata[:top_characters].first(3).each_with_index do |character, index|
      t.add_row [
        character[:letter],
        format('%.3f%%', character[:percentage]),
        english_top_letters[index][:letter],
        format('%.3f%%', english_top_letters[index][:percentage])
      ]
    end
  end

  puts [general_info, top_words, character_comparison].map(&:to_s).join("\n\n")
end
# Splits +text+ into paragraphs, one per non-empty line.
#
# Both "\n" and "\r\n" line endings are recognised, so text with Windows
# line endings no longer yields paragraphs carrying a trailing "\r" or
# blank lines counted as paragraphs.
#
# Returns an Array of Strings without empty entries.
def paragraphs(text)
  text.split(/\r?\n/).reject(&:empty?)
end
# Reference table of ten English letters with their relative frequency in
# percent, listed from the highest frequency to the lowest.
#
# Returns an Array of {:letter => String, :percentage => Float} hashes.
def english_top_letters
  # https://en.wikipedia.org/wiki/Letter_frequency
  [
    ['e', 12.702],
    ['t', 9.056],
    ['a', 8.167],
    ['o', 7.507],
    ['i', 6.966],
    ['n', 6.749],
    ['s', 6.327],
    ['h', 6.094],
    ['r', 5.987],
    ['d', 4.253]
  ].map { |letter, percentage| {:letter => letter, :percentage => percentage} }
end
# Measures how often each non-whitespace character occurs in +text+.
#
# Returns an Array of {:letter, :count, :percentage} hashes ordered from the
# most frequent character to the least frequent one. :percentage is the
# character's share of all non-whitespace characters, in percent.
def character_occurrence(text)
  characters = text.gsub(/\s/, '').chars
  counts = characters.each_with_object(Hash.new(0)) { |character, tally| tally[character] += 1 }
  total = characters.length

  stats = counts.map do |character, count|
    {:letter => character, :count => count, :percentage => (count * 100.0 / total).to_f}
  end

  # Ascending sort followed by reverse puts the highest counts first.
  stats.sort { |left, right| left[:count] <=> right[:count] }.reverse
end
# Decides whether a character-frequency profile looks like English.
#
# top_letters - Array of {:letter, :percentage} hashes ordered from the most
#               frequent character down (the shape +character_occurrence+
#               returns).
#
# The three most frequent characters must be the same letters, in the same
# order, as the first three entries of +english_top_letters+, and each
# reference percentage must lie within +error_margin+ of the observed one.
#
# Returns false when fewer than three characters are available (for example
# for empty text); otherwise a full comparison could not be made and the
# remaining checks would pass by default.
def is_english_language?(top_letters)
  compared = 3
  # NOTE(review): the margin is applied to values expressed in percent, so
  # 0.015 means +/- 0.015 percentage points, which is very strict. Confirm
  # this is the intended tolerance.
  error_margin = 0.015

  candidates = top_letters.first(compared)
  return false if candidates.length < compared

  reference_letters = english_top_letters
  candidates.each_with_index.all? do |found, index|
    expected = reference_letters[index]
    lower = found[:percentage] - error_margin
    upper = found[:percentage] + error_margin
    found[:letter] == expected[:letter] && expected[:percentage].between?(lower, upper)
  end
end
# Builds the statistics hash for +contents+ that +display+ prints.
#
# contents - String holding the whole text to analyse.
#
# Words are runs of word characters (\w+); scanning for them avoids the
# empty leading "word" that splitting on \W+ produces when the text starts
# with a non-word character. :top_ten_words holds at most ten entries.
#
# Returns a Hash; the key order is the row order of the first table that
# +display+ prints.
def calculate_metadata(contents)
  words = contents.scan(/\w+/)
  sentence_list = sentences(contents)
  paragraph_list = paragraphs(contents)
  # Computed once and reused for both the table and the language check.
  top_characters = character_occurrence(contents)

  {
    :number_of_words => words.length,
    :number_of_characters => contents.length,
    :number_of_characters_excluding_white_space => contents.gsub(/\s/, '').length,
    :number_of_sentences => sentence_list.length,
    :number_of_paragraphs => paragraph_list.length,
    :average_words_in_sentences => average_words(sentence_list),
    :average_words_in_paragraphs => average_words(paragraph_list),
    :average_sentences_in_paragraphs => average_sentences(paragraph_list),
    :top_ten_words => word_occurrence(words).first(10),
    :top_characters => top_characters,
    :is_text_english_language? => is_english_language?(top_characters)
  }
end
# Entry point: compute the statistics for the collected input and print them.
display calculate_metadata(contents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment