# uberVU hackathon - Twitter Tag Cloud for Oscar Best Movie Nominees
# more info here: http://webmining.olariu.org/the-story-of-the-oscar-predictions
import urllib, urllib2, re
import json
from time import time
# using this POS tagger:
# http://jasonwiener.com/2006/01/20/simple-nlp-part-of-speech-tagger-in-python/
import NLPlib

def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
        print url
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response

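# illustrative call (the parameter values here are made up):
#   fetch_url('http://api.contextvoice.com/1.2/mentions/search/',
#             get={'q': 'Hugo', 'apikey': '...'})
# returns the decoded json response, or None if the request fails
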
def get_tweets(values):
    '''
    do a series of calls to ubervu's api to get all
    tweets matching the filtering options
    '''
    url = 'http://api.contextvoice.com/1.2/mentions/search/'
    data = []
    while True:
        response = fetch_url(url, values)
        if not response or response['total'] == 0:
            break
        data.extend(response['results'])
        # page backwards in time: the next call only asks for tweets
        # published strictly before the oldest one fetched so far
        oldest = min([t['published'] for t in response['results']])
        values.update({
            'until': oldest - 1,
        })
    return data

def tokenize(text):
    '''
    given a text, returns a list of words
    handles twitter-specific tokens
    '''
    text = text.lower()
    # Remove email addresses
    text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', '', text)
    # Remove twitter user names
    text = re.sub(r'(\A|\s)@(\w+)', r'\1', text)
    # Remove urls
    text = re.sub(r'\w+:\/\/\S+', r'', text)
    # Remove repeated (3+) letters: cooool --> cool, niiiiice --> niice
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    # Do it again in case we have coooooooollllllll --> cooll
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    words = re.findall(r'\w+', text)
    return words
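
# illustrative call (made-up input):
#   tokenize("Cooool!!! Watch http://t.co/abc @friend")
# returns ['cool', 'watch']
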
# get tweets
today = int(time())
tweets = {}
values = {
    'since': today - 7 * 86400,
    'until': today,
    'generator': 'twitter',
    'format': 'json',
    'language': 'english',
    'apikey': 'you\'ll have to get your own',
    'count': 100,
}

# the keywords tracked by ubervu
keywords = {
    'moneyball': "Moneyball movie OR oscar OR picture OR film",
    'hugo': "Hugo movie OR oscar OR picture OR film OR animation",
    'treelife': '"Tree of Life" movie OR oscar OR picture OR film',
    'midnight': '"Midnight in Paris" movie OR oscar OR picture OR film',
    'warhorse': '"War Horse" movie OR oscar OR picture OR film',
    'artist': '"The Artist" movie OR oscar OR picture OR film',
    'descendants': '"The Descendants" movie OR oscar OR picture OR film',
    'help': '"The help" movie OR oscar OR picture OR film',
    'loud': 'extremely loud incredibly close movie OR oscar OR picture OR film',
}

for movie, query in keywords.iteritems():
    values.update({
        'q': query,
        'until': today,
    })
    tweets.update({movie: get_tweets(values)})

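# tweets now maps each movie key to a list of mention dicts; each mention
# carries at least a 'content' string and a 'published' timestamp
# (the only two fields used by this script)
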
# get frequencies for words
freq = {}
for movie, tweet_list in tweets.iteritems():
    f = {}
    for tweet in tweet_list:
        for word in tokenize(tweet['content']):
            f[word] = f.get(word, 0) + 1
    freq[movie] = f

# build probabilities P(word|movie)
# acts as normalisation
prob = {}
min_frequency = 8
for movie, frequencies in freq.iteritems():
    for word, frequency in frequencies.iteritems():
        if frequency >= min_frequency:
            if word not in prob:
                prob[word] = {}
            prob[word][movie] = frequency * 100.0 / len(tweets[movie])

# invert probabilities
# from P(word|movie) build P(movie|word) using Bayes' theorem
# keep only words with the above probability over 55%
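# (note for clarity) with a uniform prior P(movie), Bayes' theorem gives
#   P(movie|word) = P(word|movie) / sum of P(word|movie) over the movies
# (here, over those movies where the word passed the frequency cutoff);
# the loop below computes this ratio, scaled to a percentage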
top_words = []
min_probability = 55
for word, f in prob.iteritems():
    s = 0
    maxmovie = ''
    maxprob = 0
    for movie, p in f.iteritems():
        s += p
        if maxprob < p:
            maxprob = p
            maxmovie = movie
    d = maxprob * 100 / s
    if d > min_probability:
        top_words.append((word, maxmovie, d))
top_words.sort(key=lambda x: -x[2])

# at this point I noticed a lot of noise, so I decided to keep only adjectives
# the jasonwiener tagger ships as a class named NLPlib inside NLPlib.py,
# so instantiate it through the module
tagger = NLPlib.NLPlib()
adjectives = []
for e in top_words:
    # tag() expects a list of words; e[0:1] is the single-word list
    if tagger.tag(e[0:1]) == ['JJ']:
        adjectives.append(e)

# generate data to use in wordle.net
data = {}
wordle_threshold = 50
for e in adjectives:
    if e[1] not in data:
        data[e[1]] = ''
    data[e[1]] += '%s:%s\n' % (e[0], int(e[2] - wordle_threshold))
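
# a minimal sketch (not part of the original gist) for writing one
# wordle.net input file per movie; the 'wordle_<movie>.txt' names are made up
for movie, weights in data.iteritems():
    out = open('wordle_%s.txt' % movie, 'w')
    out.write(weights)
    out.close()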