Created
March 31, 2011 05:03
-
-
Save thejefflarson/895845 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TweetClassifier | |
GOOD = %w[superior quality excellent superb outstanding magnificent | |
great exceptional marvelous wonderful awesome] | |
BAD = %w[inferior shoddy careless miserable horrible | |
incompetent awful atrocious terrible pathetic crappy] | |
FACETS = %w[article story piece column] | |
SEARCH = "http://search.twitter.com/search.json" | |
def initialize | |
@features = {} | |
@categories = [] | |
category(:good) | |
category(:bad) | |
end | |
def classify(doc) | |
max = 0 | |
best = :unknown | |
@categories.each do |cat| | |
p = probability(cat, doc) | |
if p > max | |
max = p | |
best = cat | |
end | |
end | |
best | |
end | |
def probability(category, doc) | |
category_probability = category_count(category) / total | |
document_probability(category, doc) / category_probability | |
end | |
def category(sym) | |
grab_tweets(sym).each do |tweet| | |
as_features(tweet).each do |word| | |
@features[word] ||= {} | |
@features[word][sym] ||= 0.to_f | |
@features[word][sym] += 1 | |
@categories << sym | |
@categories.uniq! | |
end | |
end | |
end | |
def document_probability(cat, doc) | |
as_features(doc).inject(1) { |prob, feature| prob * weighted_probability(cat, feature) } | |
end | |
def as_features(doc) | |
doc.split(/\W+/).map { |word| | |
next if word.length > 20 || word.length < 2 | |
word.downcase | |
}.compact | |
end | |
def weighted_probability(cat, feature) | |
assumed = 0.5 | |
weight = 1 | |
totals = @categories.inject(0) { |sum, category| sum + feature_count(category, feature) } | |
((weight * assumed) + (totals * feature_probability(cat, feature))) / (weight + totals) | |
end | |
def feature_probability(cat, feature) | |
return 0 if category_count(cat) == 0 | |
return feature_count(cat, feature) / category_count(cat) | |
end | |
def category_count(sym) | |
@features.values.reduce(0) do |memo, value| | |
memo + (value[sym] || 0) | |
end | |
end | |
def total | |
@categories.inject(0) {|sum, cat| sum + category_count(cat) } | |
end | |
def feature_count(sym, feature) | |
return @features[feature][sym] if @features[feature] && @features[feature][sym] | |
0 | |
end | |
def good_words | |
facet GOOD | |
end | |
def bad_words | |
facet BAD | |
end | |
def facet(cons) | |
cons.map { |word| | |
FACETS.map { |facet| "#{word} #{facet}"} | |
}.flatten | |
end | |
def grab_tweets(cons) | |
training_file = File.join(ROOT, "models", "#{cons}_words.train") | |
return File.read(training_file).split("\n") if File.exists? training_file | |
send("#{cons}_words").map { |word| | |
resp = Crack::JSON.parse RestClient.get(SEARCH, :params => {:q => word, :result_type => "mixed", :rpp => 100}) | |
resp['results'].map do |tweet| | |
text = tweet['text'].gsub("\n", "") | |
File.open(training_file, (File::WRONLY | File::APPEND | File::CREAT)) { |f| f.puts text } | |
text | |
end | |
}.flatten | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://www.lrec-conf.org/proceedings/lrec2010/pdf/385_Paper.pdf