twitter_lsi.py by @cdfox, forked from anonymous/twitter_lsi.py, created June 17, 2011
Applying LSI to Twitter search

import json
import urllib
import urllib2
from gensim import corpora, models, similarities
import logging
import sys
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure
logging.root.setLevel(logging.INFO) # will suppress DEBUG level events

def main():
    num_tweets = int(sys.argv[1])
    query = sys.argv[2]
    num_lsi_topics = int(sys.argv[3])
    threshold = float(sys.argv[4])
    lsi_twitter(num_tweets, query, num_lsi_topics, threshold)

def lsi_twitter(num_tweets, query, num_lsi_topics, threshold):
    tweets = twitter_search(num_tweets, query)
    run_lsi(tweets, num_lsi_topics, threshold)

def twitter_search(num_tweets, query):
    search_url = 'http://search.twitter.com/search.json'
    params = urllib.urlencode({
        'rpp': num_tweets,
        'q': query,
        'lang': 'en',
    })
    url = '%s?%s' % (search_url, params)
    response = urllib2.urlopen(url, timeout=10)
    content = response.read()
    data = json.loads(content)
    tweets = [r['text'] for r in data['results']]
    return tweets
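
# Note: this hits the old unauthenticated v1 Twitter search API, which has
# since been retired. Its JSON responses were shaped roughly like
# {"results": [{"text": "..."}]}, which is what twitter_search unpacks above.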

def run_lsi(tweets, num_lsi_topics, threshold):
    # drop stopwords and hapaxes (words that occur only once in the corpus)
    stoplist = set("for a of the and to in you're should have will do it's this i your is what".split())
    all_tokens = [word for tweet in tweets for word in tweet.lower().split()]
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    # split each tweet on whitespace, filtering out stopwords and hapaxes
    split_tweets = [[word for word in tweet.lower().split()
                     if word not in stoplist and word not in tokens_once]
                    for tweet in tweets]
    # map words in the corpus to integer IDs, recording word frequency
    dictionary = corpora.Dictionary(split_tweets)
    #dictionary.save('twitter.dict') # store the dictionary, for future reference
    # convert each tweet to a sparse term vector via the dictionary
    corpus = [dictionary.doc2bow(st) for st in split_tweets]
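    # doc2bow returns each tweet as a sparse bag of words: (token_id, count)
    # pairs, e.g. ['win', 'fail', 'win'] -> [(id_win, 2), (id_fail, 1)]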
    #corpora.MmCorpus.serialize('twitter.mm', corpus) # store to disk, for later use
    # initialize the tf-idf model from the corpus
    tfidf = models.TfidfModel(corpus)
    # use the model to transform term vectors to tf-idf vectors
    corpus_tfidf = tfidf[corpus]
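    # corpus_tfidf yields one list of (token_id, tfidf_weight) pairs per tweet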
    # LSI model with num_lsi_topics latent topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, numTopics=num_lsi_topics)
    lsi_vecs = lsi[corpus_tfidf]
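    # each entry of lsi_vecs is a list of (topic_id, projection) pairs;
    # note that later gensim releases renamed numTopics to num_topics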
    vecs_by_topic = [[] for i in range(num_lsi_topics)]
    # group vecs by max projection along a topic axis,
    # discarding the ones near zero
    for i, vec in enumerate(lsi_vecs):
        if not vec: # skip tweets whose tokens were all filtered out
            continue
        (max_topic, max_weight) = max(vec, key=lambda (topic, weight): abs(weight))
        if abs(max_weight) > threshold:
            vecs_by_topic[max_topic].append((tweets[i], vec, max_weight))
    for topic, results in enumerate(vecs_by_topic):
        print '\nTopic %i:\n' % topic
        results.sort(key=lambda (tweet, vec, max_weight): max_weight)
        for result in results:
            print result[0]
            print result[1]
            print ''
    # scatter plot of the two topic projections (only meaningful for 2 topics)
    if num_lsi_topics == 2:
        fig = Figure(figsize=(6.0, 6.0))
        canvas = FigureCanvas(fig)
        ax = fig.add_subplot(111)
        xs = [dict(vec).get(0, 0.0) for vec in lsi_vecs]
        ys = [dict(vec).get(1, 0.0) for vec in lsi_vecs]
        ax.scatter(xs, ys)
        ax.set_title('Twitter LSI')
        ax.grid(True)
        ax.set_xlabel('topic 0')
        ax.set_ylabel('topic 1')
        canvas.print_figure('twitter_lsi')
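        # the call above writes twitter_lsi.png to the working directory:
        # with no extension given, print_figure falls back to the Agg
        # backend's default format, PNG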

def print_tfidf(tweets, corpus_tfidf, dictionary):
    # convert the streamed tf-idf corpus to a list of sparse vectors
    tfidf_list = list(corpus_tfidf)
    # invert token2id once so terms can be looked up by id
    id2token = dict((v, k) for (k, v) in dictionary.token2id.items())
    for i in range(len(tweets)):
        print 'Result %i: %s' % (i, tweets[i])
        tfidf_vec = tfidf_list[i]
        # highest tf-idf weights first
        tfidf_vec.sort(key=lambda term: term[1], reverse=True)
        #top_terms = tfidf_vec[:5]
        for term in tfidf_vec: #top_terms:
            print '%s: %f' % (id2token[term[0]], term[1])
        print ''
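
# print_tfidf is never called above; to inspect per-tweet tf-idf weights you
# could call print_tfidf(tweets, corpus_tfidf, dictionary) from inside run_lsi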

if __name__ == "__main__":
    main()
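
Example invocation (the script takes num_tweets, query, num_lsi_topics and
threshold, in that order; the query and values here are just an illustration):

    python twitter_lsi.py 50 "world cup" 2 0.1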

@windagardena commented:

I tried it, but why did it fail?
