Skip to content

Instantly share code, notes, and snippets.

@thejefflarson
Created March 31, 2011 05:03
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thejefflarson/895845 to your computer and use it in GitHub Desktop.
Save thejefflarson/895845 to your computer and use it in GitHub Desktop.
# Naive Bayes classifier that labels a piece of text as :good or :bad.
#
# Training tweets for each category are read from
# "<ROOT>/models/<category>_words.train" (one tweet per line). When that file
# is missing they are fetched from SEARCH, using every "<adjective> <noun>"
# combination of the word lists below as a query, and then cached to the file.
#
# Depends on names defined outside this class: ROOT, RestClient, Crack::JSON.
class TweetClassifier
  GOOD = %w[superior quality excellent superb outstanding magnificent
            great exceptional marvelous wonderful awesome].freeze
  BAD = %w[inferior shoddy careless miserable horrible
           incompetent awful atrocious terrible pathetic crappy].freeze
  FACETS = %w[article story piece column].freeze
  # NOTE(review): confirm this endpoint is still being served.
  SEARCH = "http://search.twitter.com/search.json".freeze

  # Probability assumed for a feature before any evidence has been seen.
  ASSUMED_PROBABILITY = 0.5
  # How many observations the assumed probability counts for.
  ASSUMED_WEIGHT = 1

  # Builds the feature table by training on both categories.
  def initialize
    @features = {}
    @categories = []
    category(:good)
    category(:bad)
  end

  # Picks the category with the highest score for +doc+.
  #
  # @param doc [String] text to classify
  # @return [Symbol] the best category, or :unknown when no category scores
  #   above zero
  def classify(doc)
    best = :unknown
    best_score = 0
    @categories.each do |cat|
      score = probability(cat, doc)
      next unless score > best_score

      best_score = score
      best = cat
    end
    best
  end

  # Unnormalised posterior for +category+ given +doc+:
  # P(doc | category) * P(category).
  #
  # @param category [Symbol]
  # @param doc [String]
  # @return [Float] 0.0 when nothing has been trained at all
  def probability(category, doc)
    all = total
    # Nothing trained yet: avoid dividing by zero.
    return 0.0 if all == 0

    prior = category_count(category).to_f / all
    document_probability(category, doc) * prior
  end

  # Trains the classifier on every tweet available for category +sym+.
  #
  # @param sym [Symbol] category name
  # @return [Array<String>] the tweets that were used for training
  def category(sym)
    tweets = grab_tweets(sym)
    # Register once, up front, so a category with no tweets is still known.
    @categories << sym unless @categories.include?(sym)
    tweets.each do |tweet|
      as_features(tweet).each do |word|
        counts = (@features[word] ||= {})
        # Counts are kept as Floats so later divisions are not truncated.
        counts[sym] = (counts[sym] || 0.0) + 1
      end
    end
  end

  # Product of the weighted probabilities of every feature in +doc+.
  #
  # @return [Numeric] 1 for a document without usable features
  def document_probability(cat, doc)
    as_features(doc).inject(1) { |prob, feature| prob * weighted_probability(cat, feature) }
  end

  # Splits +doc+ on non-word characters and returns the lower-cased tokens
  # that are between 2 and 20 characters long.
  #
  # @param doc [String]
  # @return [Array<String>]
  def as_features(doc)
    doc.split(/\W+/)
       .select { |word| word.length.between?(2, 20) }
       .map { |word| word.downcase }
  end

  # Feature probability blended with ASSUMED_PROBABILITY, so that features
  # seen only a few times do not produce extreme values.
  #
  # @return [Float]
  def weighted_probability(cat, feature)
    seen = @categories.inject(0) { |sum, category| sum + feature_count(category, feature) }
    ((ASSUMED_WEIGHT * ASSUMED_PROBABILITY) + (seen * feature_probability(cat, feature))) /
      (ASSUMED_WEIGHT + seen)
  end

  # Share of the feature occurrences in +cat+ that are +feature+.
  #
  # @return [Numeric] 0 when the category has no recorded features
  def feature_probability(cat, feature)
    in_category = category_count(cat)
    return 0 if in_category == 0

    feature_count(cat, feature) / in_category
  end

  # Total number of feature occurrences recorded for category +sym+.
  def category_count(sym)
    @features.values.inject(0) { |memo, counts| memo + (counts[sym] || 0) }
  end

  # Total number of feature occurrences recorded across all categories.
  def total
    @categories.inject(0) { |sum, cat| sum + category_count(cat) }
  end

  # Number of times +feature+ was seen in category +sym+ (0 if never).
  def feature_count(sym, feature)
    counts = @features[feature]
    (counts && counts[sym]) || 0
  end

  # Search phrases used to collect "good" training tweets.
  def good_words
    facet GOOD
  end

  # Search phrases used to collect "bad" training tweets.
  def bad_words
    facet BAD
  end

  # Combines every word in +cons+ with every entry of FACETS.
  #
  # @param cons [Array<String>]
  # @return [Array<String>] phrases such as "great article"
  def facet(cons)
    cons.flat_map do |word|
      FACETS.map { |noun| "#{word} #{noun}" }
    end
  end

  # Returns the training tweets for category +cons+.
  #
  # Reads the cached training file when it exists. Otherwise runs one search
  # per phrase returned by "<cons>_words", strips newlines from each tweet and
  # caches the result.
  #
  # @param cons [Symbol] category name (:good or :bad)
  # @return [Array<String>]
  def grab_tweets(cons)
    training_file = File.join(ROOT, "models", "#{cons}_words.train")
    return File.read(training_file).split("\n") if File.exist?(training_file)

    tweets = send("#{cons}_words").flat_map do |word|
      resp = Crack::JSON.parse(
        RestClient.get(SEARCH, :params => {:q => word, :result_type => "mixed", :rpp => 100})
      )
      resp['results'].map { |tweet| tweet['text'].delete("\n") }
    end

    # Write the cache only after every request succeeded, so an aborted run
    # cannot leave a partial file that later runs would take as complete.
    unless tweets.empty?
      File.open(training_file, "w") { |f| tweets.each { |text| f.puts text } }
    end
    tweets
  end
end
@thejefflarson
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment