# uberVU hackathon - Twitter Tag Cloud for Oscar Best Movie Nominees
# more info here: http://webmining.olariu.org/the-story-of-the-oscar-predictions
import urllib, urllib2, re
import json
from time import time
# using this POS tagger:
# http://jasonwiener.com/2006/01/20/simple-nlp-part-of-speech-tagger-in-python/
import NLPlib

def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
        print url
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response

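# illustrative call (the parameter values here are made up):
#   fetch_url('http://api.contextvoice.com/1.2/mentions/search/',
#             get={'q': 'Hugo', 'apikey': '...'})
# returns the decoded json response, or None if the request fails
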
def get_tweets(values):
    '''
    do a series of calls to ubervu's api to get all
    tweets matching the filtering options
    '''
    url = 'http://api.contextvoice.com/1.2/mentions/search/'
    data = []
    while True:
        response = fetch_url(url, values)
        if not response or response['total'] == 0:
            break
        data.extend(response['results'])
        # page backwards in time: the next call only asks for tweets
        # published strictly before the oldest one fetched so far
        oldest = min([t['published'] for t in response['results']])
        values.update({
            'until': oldest - 1,
        })
    return data

def tokenize(text):
    '''
    given a text, returns a list of words
    handles twitter-specific tokens
    '''
    text = text.lower()
    # Remove email addresses
    text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', '', text)
    # Remove twitter user names
    text = re.sub(r'(\A|\s)@(\w+)', r'\1', text)
    # Remove urls
    text = re.sub(r'\w+:\/\/\S+', r'', text)
    # Remove repeated (3+) letters: cooool --> cool, niiiiice --> niice
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    # Do it again in case we have coooooooollllllll --> cooll
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    words = re.findall(r'\w+', text)
    return words
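
# illustrative call (made-up input):
#   tokenize("Cooool!!! Watch http://t.co/abc @friend")
# returns ['cool', 'watch']
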
# get tweets
today = int(time())
tweets = {}
values = {
    'since': today - 7 * 86400,
    'until': today,
    'generator': 'twitter',
    'format': 'json',
    'language': 'english',
    'apikey': 'you\'ll have to get your own',
    'count': 100,
}

# the keywords tracked by ubervu
keywords = {
    'moneyball': "Moneyball movie OR oscar OR picture OR film",
    'hugo': "Hugo movie OR oscar OR picture OR film OR animation",
    'treelife': '"Tree of Life" movie OR oscar OR picture OR film',
    'midnight': '"Midnight in Paris" movie OR oscar OR picture OR film',
    'warhorse': '"War Horse" movie OR oscar OR picture OR film',
    'artist': '"The Artist" movie OR oscar OR picture OR film',
    'descendants': '"The Descendants" movie OR oscar OR picture OR film',
    'help': '"The help" movie OR oscar OR picture OR film',
    'loud': 'extremely loud incredibly close movie OR oscar OR picture OR film',
}

for movie, query in keywords.iteritems():
    values.update({
        'q': query,
        'until': today,
    })
    tweets.update({movie: get_tweets(values)})

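# tweets now maps each movie key to a list of mention dicts; each mention
# carries at least a 'content' string and a 'published' timestamp
# (the only two fields used by this script)
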
# get frequencies for words
freq = {}
for movie, tweet_list in tweets.iteritems():
    f = {}
    for tweet in tweet_list:
        for word in tokenize(tweet['content']):
            f[word] = f.get(word, 0) + 1
    freq[movie] = f

# build probabilities P(word|movie)
# acts as normalisation
prob = {}
min_frequency = 8
for movie, frequencies in freq.iteritems():
    for word, frequency in frequencies.iteritems():
        if frequency >= min_frequency:
            if word not in prob:
                prob[word] = {}
            prob[word][movie] = frequency * 100.0 / len(tweets[movie])

# invert probabilities
# from P(word|movie) build P(movie|word) using Bayes' theorem
# keep only words with the above probability over 55%
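# (note for clarity) with a uniform prior P(movie), Bayes' theorem gives
#   P(movie|word) = P(word|movie) / sum of P(word|movie) over the movies
# (here, over those movies where the word passed the frequency cutoff);
# the loop below computes this ratio, scaled to a percentage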
top_words = []
min_probability = 55
for word, f in prob.iteritems():
    s = 0
    maxmovie = ''
    maxprob = 0
    for movie, p in f.iteritems():
        s += p
        if maxprob < p:
            maxprob = p
            maxmovie = movie
    d = maxprob * 100 / s
    if d > min_probability:
        top_words.append((word, maxmovie, d))
top_words.sort(key=lambda x: -x[2])

# at this point I noticed a lot of noise, so I decided to keep only adjectives
# the jasonwiener tagger ships as a class named NLPlib inside NLPlib.py,
# so instantiate it through the module
tagger = NLPlib.NLPlib()
adjectives = []
for e in top_words:
    # tag() expects a list of words; e[0:1] is the single-word list
    if tagger.tag(e[0:1]) == ['JJ']:
        adjectives.append(e)

# generate data to use in wordle.net
data = {}
wordle_threshold = 50
for e in adjectives:
    if e[1] not in data:
        data[e[1]] = ''
    data[e[1]] += '%s:%s\n' % (e[0], int(e[2] - wordle_threshold))
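
# a minimal sketch (not part of the original gist) for writing one
# wordle.net input file per movie; the 'wordle_<movie>.txt' names are made up
for movie, weights in data.iteritems():
    out = open('wordle_%s.txt' % movie, 'w')
    out.write(weights)
    out.close()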