twitter_lsi.py by @cdfox, forked from anonymous/twitter_lsi.py, created June 17, 2011
Applying LSI to Twitter search

import json
import urllib
import urllib2
from gensim import corpora, models, similarities
import logging
import sys
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure
logging.root.setLevel(logging.INFO) # will suppress DEBUG level events

def main():
    num_tweets = int(sys.argv[1])
    query = sys.argv[2]
    num_lsi_topics = int(sys.argv[3])
    threshold = float(sys.argv[4])
    lsi_twitter(num_tweets, query, num_lsi_topics, threshold)

def lsi_twitter(num_tweets, query, num_lsi_topics, threshold):
    tweets = twitter_search(num_tweets, query)
    run_lsi(tweets, num_lsi_topics, threshold)

def twitter_search(num_tweets, query):
    search_url = 'http://search.twitter.com/search.json'
    params = urllib.urlencode({
        'rpp': num_tweets,
        'q': query,
        'lang': 'en',
    })
    url = '%s?%s' % (search_url, params)
    response = urllib2.urlopen(url, timeout=10)
    content = response.read()
    data = json.loads(content)
    tweets = [r['text'] for r in data['results']]
    return tweets
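
# Note: this hits the old unauthenticated v1 Twitter search API, which has
# since been retired. Its JSON responses were shaped roughly like
# {"results": [{"text": "..."}]}, which is what twitter_search unpacks above.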

def run_lsi(tweets, num_lsi_topics, threshold):
    # drop stopwords and hapaxes (words that occur only once in the corpus)
    stoplist = set("for a of the and to in you're should have will do it's this i your is what".split())
    all_tokens = [word for tweet in tweets for word in tweet.lower().split()]
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    # split each tweet on whitespace, filtering out stopwords and hapaxes
    split_tweets = [[word for word in tweet.lower().split()
                     if word not in stoplist and word not in tokens_once]
                    for tweet in tweets]
    # map words in the corpus to integer IDs, recording word frequency
    dictionary = corpora.Dictionary(split_tweets)
    #dictionary.save('twitter.dict') # store the dictionary, for future reference
    # convert each tweet to a sparse term vector via the dictionary
    corpus = [dictionary.doc2bow(st) for st in split_tweets]
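    # doc2bow returns each tweet as a sparse bag of words: (token_id, count)
    # pairs, e.g. ['win', 'fail', 'win'] -> [(id_win, 2), (id_fail, 1)]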
    #corpora.MmCorpus.serialize('twitter.mm', corpus) # store to disk, for later use
    # initialize the tf-idf model from the corpus
    tfidf = models.TfidfModel(corpus)
    # use the model to transform term vectors to tf-idf vectors
    corpus_tfidf = tfidf[corpus]
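    # corpus_tfidf yields one list of (token_id, tfidf_weight) pairs per tweet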
    # LSI model with num_lsi_topics latent topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, numTopics=num_lsi_topics)
    lsi_vecs = lsi[corpus_tfidf]
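    # each entry of lsi_vecs is a list of (topic_id, projection) pairs;
    # note that later gensim releases renamed numTopics to num_topics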
    vecs_by_topic = [[] for i in range(num_lsi_topics)]
    # group vecs by max projection along a topic axis,
    # discarding the ones near zero
    for i, vec in enumerate(lsi_vecs):
        if not vec: # skip tweets whose tokens were all filtered out
            continue
        (max_topic, max_weight) = max(vec, key=lambda (topic, weight): abs(weight))
        if abs(max_weight) > threshold:
            vecs_by_topic[max_topic].append((tweets[i], vec, max_weight))
    for topic, results in enumerate(vecs_by_topic):
        print '\nTopic %i:\n' % topic
        results.sort(key=lambda (tweet, vec, max_weight): max_weight)
        for result in results:
            print result[0]
            print result[1]
            print ''
    # scatter plot of the two topic projections (only meaningful for 2 topics)
    if num_lsi_topics == 2:
        fig = Figure(figsize=(6.0, 6.0))
        canvas = FigureCanvas(fig)
        ax = fig.add_subplot(111)
        xs = [dict(vec).get(0, 0.0) for vec in lsi_vecs]
        ys = [dict(vec).get(1, 0.0) for vec in lsi_vecs]
        ax.scatter(xs, ys)
        ax.set_title('Twitter LSI')
        ax.grid(True)
        ax.set_xlabel('topic 0')
        ax.set_ylabel('topic 1')
        canvas.print_figure('twitter_lsi')
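        # the call above writes twitter_lsi.png to the working directory:
        # with no extension given, print_figure falls back to the Agg
        # backend's default format, PNG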

def print_tfidf(tweets, corpus_tfidf, dictionary):
    # convert the streamed tf-idf corpus to a list of sparse vectors
    tfidf_list = list(corpus_tfidf)
    # invert token2id once so terms can be looked up by id
    id2token = dict((v, k) for (k, v) in dictionary.token2id.items())
    for i in range(len(tweets)):
        print 'Result %i: %s' % (i, tweets[i])
        tfidf_vec = tfidf_list[i]
        # highest tf-idf weights first
        tfidf_vec.sort(key=lambda term: term[1], reverse=True)
        #top_terms = tfidf_vec[:5]
        for term in tfidf_vec: #top_terms:
            print '%s: %f' % (id2token[term[0]], term[1])
        print ''
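
# print_tfidf is never called above; to inspect per-tweet tf-idf weights you
# could call print_tfidf(tweets, corpus_tfidf, dictionary) from inside run_lsi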

if __name__ == "__main__":
    main()
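
Example invocation (the script takes num_tweets, query, num_lsi_topics and
threshold, in that order; the query and values here are just an illustration):

    python twitter_lsi.py 50 "world cup" 2 0.1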

@windagardena commented:

I tried it, but why did it fail?
