# Histogram and keyword detection for tweets during the Real Madrid - Barcelona match
# more info at http://webmining.olariu.org/el-clasico-on-twitter
# this code is designed to be run in ipython
import urllib, urllib2, time, threading, Queue, re
from datetime import datetime
import simplejson as json
import matplotlib.pyplot as plt
import numpy as np
KEYWORDS = ['realmadrid', 'real madrid', 'fcbarcelona', 'barcelona',
            'forcabarca', 'el clasico', 'elclasico']
def fetch_url(url, get=None, post=None):
    user_agent = "Andrei Olariu's Web Mining for Dummies"
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
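# Illustrative call: fetch_url('http://search.twitter.com/search.json',
# {'q': 'el clasico'}) returns the parsed JSON response as a dict,
# or None if the request or the JSON parsing fails.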
def fetch_tweets():
    url = 'http://search.twitter.com/search.json'
    values = {
        'count': 100,
        'q': ' OR '.join(KEYWORDS) + ' -rt',
        'rpp': 100,
        'page': 1,
        'result_type': 'recent',
        'with_twitter_user_id': 'true',
        'lang': 'en',
    }
    response = fetch_url(url, values)
    if response and 'results' in response:
        return response['results']
    else:
        return []
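# Each result is a dict from the Search API; the fields used below are
# 'id', 'text' and 'created_at'.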
def monitor_twitter():
    start_at = time.time()
    id_cache = set([])
    while keep_monitoring:
        batch = fetch_tweets()
        # keep only tweets not seen in a previous poll
        batch = [t for t in batch if t['id'] not in id_cache]
        print len(batch)
        id_cache.update([t['id'] for t in batch])
        for t in batch:
            queue.put(t)
        time.sleep(interval)
# Start monitoring Twitter
# Use a thread and a queue to save tweets - easier to work with in ipython
queue = Queue.Queue()
keep_monitoring = True
interval = 30
threading.Thread(target = monitor_twitter).start()
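# The thread reads the globals keep_monitoring and interval on every pass,
# so both can be tweaked live from the ipython prompt.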
# Go watch the match and check every once in a while to see if I should
# adjust the interval
# Stop monitoring
keep_monitoring = False
# Get the tweets into a list
tweets = []
while queue.qsize():
    tweets.append(queue.get())
# Index tweets by minute relative to the start of the match
minutes = {}
start_of_match = datetime(2011, 12, 10, 21, 00, 00)
for t in tweets:
    t_time = datetime.strptime(t['created_at'], "%a, %d %b %Y %H:%M:%S +0000")
    minute = int((t_time - start_of_match).total_seconds() / 60)
    if minute not in minutes:
        minutes[minute] = []
    minutes[minute].append(t['text'].lower())
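# e.g. a tweet created at 21:07 UTC falls into minute 7 (start_of_match is
# in UTC, since created_at carries a +0000 offset)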
# Build the histogram for tweet volume
hist = []
# ... and the x-axis labels (minutes relative to start of match)
xaxis = range(min(minutes.keys()), max(minutes.keys()) + 1)
for i in xaxis:
    hist.append(len(minutes.get(i, [])))
# Build a dictionary {word: frequency} over all tweets
words = {}
for texts in minutes.itervalues():
    for text in texts:
        for w in re.findall(r'\w+', text):
            if len(w) > 3:
                words[w] = words.get(w, 0) + 1
# Convert it into a dictionary {word: word_index} and discard rare words
i = 0
temp = {}
for w, f in words.iteritems():
    if f > 25:
        temp[w] = i
        i += 1
words = temp
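# words now maps each frequent word to its column index in the counts
# matrix built below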
# Build a matrix where every element [i,j] is the number of times
# the word having word_index=j appears during the time interval
# having index i
# Every time interval is 5 minutes long
counts = [[0.0] * len(words) for i in xrange(len(xaxis[5::5]))]
for start in xaxis[5::5]:
    index = xaxis.index(start) / 5 - 1
    for i in range(start - 5, start):
        for text in minutes.get(i, []):
            for w in re.findall(r'\w+', text):
                if w in words:
                    counts[index][words[w]] += 1
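# e.g. counts[0] holds the word counts for the first five minutes,
# xaxis[0]..xaxis[4], the interval ending at xaxis[5]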
# Convert the dictionary of {word: word_index} into a list of words,
# where words[word_index] = word
temp = [0] * len(words)
for w, i in words.iteritems():
    temp[i] = w
words = temp
# Convert the matrix of counts to numpy
counts = np.matrix(counts)
# .. in order to easily compute means and standard deviations
# for every set of word frequencies
means = [counts[:,i].mean() for i in xrange(counts.shape[1])]
stds = [counts[:,i].std() for i in xrange(counts.shape[1])]
# Find spikes in word frequencies
# Computes a score = number of standard deviations from mean
awesome = {}
max_score = 0
for i in xrange(counts.shape[0]):
    for j in xrange(counts.shape[1]):
        score = (counts[i, j] - means[j]) / stds[j]
        if score > 3.5:
            x = xaxis[(i + 1) * 5]
            max_score = max(max_score, score)
            if x not in awesome:
                awesome[x] = []
            awesome[x].append((words[j], score))
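# awesome maps a minute on the x-axis to the [(word, score), ...] pairs that
# spiked during the five-minute interval ending at that minute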
# Create the chart
plt.figure()
# Plot the histogram
plt.fill_between(xaxis, hist, [0] * len(xaxis), facecolor='b', alpha=0.4)
plt.xlabel('Minutes from match start')
plt.ylabel('Tweets per minute')
plt.xticks([i for i in xaxis if i % 10 == 0])
# Plot the first+second half spans
plt.axvspan(0, 45 + 2, facecolor='0.5', alpha=0.2)
plt.axvspan(45 + 2 + 15, 90 + 15 + 5, facecolor='0.5', alpha=0.2)
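# (the +2 and +5 offsets presumably cover stoppage time, the +15 the
# half-time break)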
# Plot the goal lines
plt.axvline(x=1, color='r')
plt.axvline(x=30, color='r')
plt.axvline(x=53 + 2 + 15, color='r')
plt.axvline(x=66 + 2 + 15, color='r')
# Plot the words, based on the time when they spiked
for x, x_words in awesome.iteritems():
    y = max(hist[xaxis.index(x) - 5 : xaxis.index(x)])
    y = min(y, 50)
    for word, score in x_words:
        alpha = (score - 2) / (max_score - 2)
        plt.text(x - 8, y, word, alpha=alpha, rotation=45)
        y += 9
plt.show()