
uberVU hackathon - Twitter Tagcloud for Oscar Best Movie Nominees

oscars.py
# more info here: http://webmining.olariu.org/the-story-of-the-oscar-predictions
 
import re
import urllib, urllib2
import json
from time import time
 
# using this POS tagger:
# http://jasonwiener.com/2006/01/20/simple-nlp-part-of-speech-tagger-in-python/
import NLPlib
 
def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    print url
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
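 
# fetch_url returns the parsed JSON response (a dict) on success,
# or None on a network or JSON-decoding error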
 
def get_tweets(values):
    '''
    do a series of calls against ubervu's api to get all
    tweets matching the filtering options
    '''
    url = 'http://api.contextvoice.com/1.2/mentions/search/'
    data = []
    while True:
        response = fetch_url(url, values)
        if not response or response['total'] == 0:
            break
        data.extend(response['results'])
        val = min([t['published'] for t in response['results']])
        values.update({
            'until': val - 1,
        })
    return data
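 
# note: the api returns at most 'count' results per call, so get_tweets
# pages backwards in time: after each batch it moves 'until' to just
# before the oldest tweet seen, and stops when no results come back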
 
def tokenize(text):
    '''
    given a text, returns a list of words
    handles twitter-specific tokens
    '''
    text = text.lower()
    # Remove email addresses
    text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', '', text)
    # Remove twitter user names
    text = re.sub(r'(\A|\s)@(\w+)', r'\1', text)
    # Remove urls
    text = re.sub(r'\w+:\/\/\S+', r'', text)
    # Remove repeated (3+) letters: cooool --> cool, niiiiice --> niice
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    # Do it again in case we have coooooooollllllll --> cooll
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
 
    words = re.findall('\w+', text)
    return words
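 
# e.g. tokenize("Sooooo goooood!!! @friend http://t.co/abc")
# returns ['soo', 'good'] -- mention and url stripped, letter runs squeezed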
 
 
# get tweets
today = int(time())
tweets = {}
values = {
    'since': today - 7 * 86400,
    'until': today,
    'generator': 'twitter',
    'format': 'json',
    'language': 'english',
    'apikey': 'you\'ll have to get your own',
    'count': 100
}
# the keywords tracked by ubervu
keywords = {
    'moneyball': "Moneyball movie OR oscar OR picture OR film",
    'hugo': "Hugo movie OR oscar OR picture OR film OR animation",
    'treelife': '"Tree of Life" movie OR oscar OR picture OR film',
    'midnight': '"Midnight in Paris" movie OR oscar OR picture OR film',
    'warhorse': '"War Horse" movie OR oscar OR picture OR film',
    'artist': '"The Artist" movie OR oscar OR picture OR film',
    'descendants': '"The Descendants" movie OR oscar OR picture OR film',
    'help': '"The help" movie OR oscar OR picture OR film',
    'loud': 'extremely loud incredibly close movie OR oscar OR picture OR film',
}
for movie, query in keywords.iteritems():
    values.update({
        'q': query,
        'until': today
    })
    tweets.update({movie: get_tweets(values)})
 
# get frequencies for words
freq = {}
for movie, tweet_list in tweets.iteritems():
    f = {}
    for tweet in tweet_list:
        words = tokenize(tweet['content'])
        for word in words:
            f[word] = f.get(word, 0) + 1
    freq[movie] = f
 
# build probabilities P(word|movie)
# acts as normalisation
prob = {}
min_frequency = 8
for movie, frequencies in freq.iteritems():
    for word, frequency in frequencies.iteritems():
        if frequency >= min_frequency:
            if word not in prob:
                prob[word] = {}
            prob[word][movie] = frequency * 100.0 / len(tweets[movie])
# invert probabilities
# from P(word|movie) build P(movie|word) using Bayes' theorem
# keep only words for which this probability exceeds 55%
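# assuming a uniform prior P(movie), Bayes' theorem reduces to
# P(movie|word) = P(word|movie) / sum over movies of P(word|movie),
# which is what d = maxprob * 100 / s computes below (as a percentage)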
top_words = []
min_probability = 55
for word, f in prob.iteritems():
    s = 0
    maxmovie = ''
    maxprob = 0
    for movie, p in f.iteritems():
        s += p
        if maxprob < p:
            maxprob = p
            maxmovie = movie
    d = maxprob * 100 / s
    if d > min_probability:
        top_words.append((word, maxmovie, d))
top_words.sort(key=lambda x: -x[2])
 
 
# at this point I noticed a lot of noise, so I decided to keep only adjectives
tagger = NLPlib()
adjectives = []
for e in top_words:
    if tagger.tag([e[0]]) == ['JJ']:
        adjectives.append(e)
# generate data to use in wordle.net
data = {}
wordle_threshold = 50
for e in adjectives:
    if e[1] not in data:
        data[e[1]] = ''
    data[e[1]] += '%s:%s\n' % (e[0], int(e[2] - wordle_threshold))
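 
# e.g. to dump the generated tagclouds: each movie's block is a set of
# "word:weight" lines that can be pasted into wordle.net's advanced box
for movie, lines in data.iteritems():
    print '=== %s ===' % movie
    print lines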
