Assignment 2 - Speech and Language Processing 1
@thomasbrus, last active March 15, 2016
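
# Tokenizer examples: print the word tokens produced for three sample sentences.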
require 'document_classifier'
example1 = "In other words, i'm fine already~ He's a very nice person... literally."
example2 = "So, i can get on with my life normally~XD oh, and my voice is not back yet."
example3 = "Perhaps a time capsule.........hmmmmmmmmm."
puts DocumentClassifier::Tokenizer.new(example1).words.join(', ')
puts DocumentClassifier::Tokenizer.new(example2).words.join(', ')
puts DocumentClassifier::Tokenizer.new(example3).words.join(', ')
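
# N-gram statistics: count the unique unigrams, bigrams and trigrams in the
# training corpus, list the most frequent words, and tally how many words
# occur exactly one, two, three and four times.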
require 'document_classifier'
corpus = Dir['data/blogs/train/*.txt'].reduce("") do |data, filename|
  data << File.read(filename)
end
tokenizer = DocumentClassifier::Tokenizer.new(corpus)
analyzer = DocumentClassifier::Analyzer.new(tokenizer.words)
unigrams_count = analyzer.unigrams.count
bigrams_count = analyzer.bigrams.count
trigrams_count = analyzer.trigrams.count
puts "Number of (unique) unigrams: #{unigrams_count}"
puts "Number of (unique) bigrams: #{bigrams_count}"
puts "Number of (unique) trigrams: #{trigrams_count}"
most_frequent_words = analyzer.unigrams.sort_by { |_, count| -count }.map(&:first)
puts "Top 10 most frequent words: #{most_frequent_words.take(10)}"
words_that_occur_once = analyzer.unigrams.select { |_, count| count == 1 }
words_that_occur_twice = analyzer.unigrams.select { |_, count| count == 2 }
words_that_occur_thrice = analyzer.unigrams.select { |_, count| count == 3 }
words_that_occur_four_times = analyzer.unigrams.select { |_, count| count == 4 }
puts "Number of words that occur only once: #{words_that_occur_once.count}"
puts "Number of words that occur only twice: #{words_that_occur_twice.count}"
puts "Number of words that occur only thrice: #{words_that_occur_thrice.count}"
puts "Number of words that occur only four times: #{words_that_occur_four_times.count}"
require 'document_classifier'
classifier = DocumentClassifier.new(%i(male female)) do |config|
  config.ignore_words = File.readlines('data/blogs/ignore_words.txt').map(&:strip)
  config.stop_words = File.readlines('data/blogs/stop_words.txt').map(&:strip)
end
def train(classifier, category, filename)
  puts "Training... #{filename}"
  classifier.train(category, File.read(filename))
end

def classify(classifier, filename)
  puts "Classifying... #{filename}"
  classifier.classify(File.read(filename))
end

def training_set(category)
  Dir.glob("data/blogs/train/#{{ male: 'M', female: 'F' }.fetch(category)}-*.txt")
end

def test_set(category)
  Dir.glob("data/blogs/test/#{{ male: 'M', female: 'F' }.fetch(category)}-*.txt")
end
training_set(:male).each { |filename| train(classifier, :male, filename) }
training_set(:female).each { |filename| train(classifier, :female, filename) }
male_predictions = test_set(:male).map { |filename| classify(classifier, filename) }
female_predictions = test_set(:female).map { |filename| classify(classifier, filename) }
total = male_predictions.count + female_predictions.count
correct = male_predictions.count(:male) + female_predictions.count(:female)
puts "Accuracy: #{Rational(correct, total).to_f}"
require 'document_classifier'
corpus1 = Dir['data/blogs/train/M-*.txt'].reduce("") do |data, filename|
  data << File.read(filename)
end

corpus2 = Dir['data/blogs/train/F-*.txt'].reduce("") do |data, filename|
  data << File.read(filename)
end
tokenizer1 = DocumentClassifier::Tokenizer.new(corpus1) do |config|
  config.ignore_words = File.readlines('data/blogs/ignore_words.txt').map(&:strip)
  config.stop_words = File.readlines('data/blogs/stop_words.txt').map(&:strip)
end

tokenizer2 = DocumentClassifier::Tokenizer.new(corpus2) do |config|
  config.ignore_words = File.readlines('data/blogs/ignore_words.txt').map(&:strip)
  config.stop_words = File.readlines('data/blogs/stop_words.txt').map(&:strip)
end

tokenizer3 = DocumentClassifier::Tokenizer.new(corpus1 + corpus2) do |config|
  config.ignore_words = File.readlines('data/blogs/ignore_words.txt').map(&:strip)
  config.stop_words = File.readlines('data/blogs/stop_words.txt').map(&:strip)
end
analyzer1 = DocumentClassifier::Analyzer.new(tokenizer1.words)
analyzer2 = DocumentClassifier::Analyzer.new(tokenizer2.words)
def conditional_probability(word, analyzer)
  Rational(analyzer.frequency(word), analyzer.word_count).to_f
end

# Rank every word in the combined corpus by how much more likely it is in one
# category than in the other; sorting by the negated ratio puts the most
# characteristic words first, so take(10) below picks out the top ten.
characteristic_male_words = tokenizer3.words.uniq.sort_by do |word|
  -(conditional_probability(word, analyzer1) / conditional_probability(word, analyzer2))
end

characteristic_female_words = tokenizer3.words.uniq.sort_by do |word|
  -(conditional_probability(word, analyzer2) / conditional_probability(word, analyzer1))
end

puts characteristic_male_words.take(10).inspect
puts characteristic_female_words.take(10).inspect
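
# Error analysis: retrain the classifier and list the test documents whose
# gender was predicted incorrectly.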
require 'document_classifier'
classifier = DocumentClassifier.new(%i(male female)) do |config|
  config.ignore_words = File.readlines('data/blogs/ignore_words.txt').map(&:strip)
  config.stop_words = File.readlines('data/blogs/stop_words.txt').map(&:strip)
end
def train(classifier, category, filename)
  classifier.train(category, File.read(filename))
end

def classify(classifier, filename)
  classifier.classify(File.read(filename))
end

def training_set(category)
  Dir.glob("data/blogs/train/#{{ male: 'M', female: 'F' }.fetch(category)}-*.txt")
end

def test_set(category)
  Dir.glob("data/blogs/test/#{{ male: 'M', female: 'F' }.fetch(category)}-*.txt")
end
training_set(:male).each { |filename| train(classifier, :male, filename) }
training_set(:female).each { |filename| train(classifier, :female, filename) }
incorrect_predictions1 = test_set(:male).select do |filename|
  classify(classifier, filename) == :female
end

incorrect_predictions2 = test_set(:female).select do |filename|
  classify(classifier, filename) == :male
end
puts "Incorrectly predicted:", (incorrect_predictions1 + incorrect_predictions2)
# A sample Gemfile
source "https://rubygems.org"
gem 'rake'
gem 'pry'
gem 'document_classifier', path: '~/Code/document-classifier'
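
# The document_classifier gem is resolved from a local checkout via the :path
# option above; running `bundle install` produces the Gemfile.lock below.
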
PATH
  remote: ~/Code/document-classifier
  specs:
    document_classifier (0.1.0)
      memoist (~> 0.12)

GEM
  remote: https://rubygems.org/
  specs:
    coderay (1.1.0)
    memoist (0.12.0)
    method_source (0.8.2)
    pry (0.10.1)
      coderay (~> 1.1.0)
      method_source (~> 0.8.1)
      slop (~> 3.4)
    rake (10.4.2)
    slop (3.6.0)

PLATFORMS
  ruby

DEPENDENCIES
  document_classifier!
  pry
  rake

BUNDLED WITH
   1.10.6
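
# Rakefile: a task that builds a vocabulary from the training corpus, keeping
# only words whose frequency falls within the configured bounds.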
require 'document_classifier'

namespace :vocabulary do
  desc <<-DESCRIPTION
    Generates a normalized vocabulary from a set of documents
  DESCRIPTION

  task :generate do
    corpus = Dir['data/blogs/train/*.txt'].reduce("") do |data, filename|
      data << File.read(filename)
    end

    tokenizer = DocumentClassifier::Tokenizer.new(corpus)
    analyzer = DocumentClassifier::Analyzer.new(tokenizer.words)

    min_frequency = Integer(ENV.fetch('MIN_FREQUENCY', 0))
    max_frequency = Integer(ENV.fetch('MAX_FREQUENCY', tokenizer.word_count))

    unigrams = analyzer.unigrams.select do |word, word_count|
      (min_frequency..max_frequency).include?(word_count)
    end

    vocabulary = unigrams.keys
    puts vocabulary
  end
end
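
# Example invocation (both frequency bounds are optional; the values here are
# illustrative only):
#   MIN_FREQUENCY=2 MAX_FREQUENCY=500 rake vocabulary:generate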