Skip to content

Instantly share code, notes, and snippets.

@vivekn
Created September 20, 2014 17:38
Show Gist options
  • Save vivekn/9dfd1f23ce111b12c8ef to your computer and use it in GitHub Desktop.
Save vivekn/9dfd1f23ce111b12c8ef to your computer and use it in GitHub Desktop.
from collections import defaultdict
# Load the baseline corpus (`tweets`) and the target corpus (`target`) as
# lists of lower-cased tweets. The text is split on line boundaries rather
# than on whitespace so that every list entry is a whole tweet; splitting on
# whitespace would turn each entry into a single word and make the per-tweet
# set() de-duplication used for the document counts a no-op.
# NOTE(review): assumes the input files hold one tweet per line -- confirm
# against how tweets.txt.aa / tweets.txt.ab were produced.
# The `with` blocks close each file handle as soon as it has been read.
with open('tweets.txt.aa') as tweets_file:
    tweets = tweets_file.read().lower().splitlines()
with open('tweets.txt.ab') as target_file:
    target = target_file.read().lower().splitlines()
# Mapping from hashtag to the number of entries in `tweets` that contain it.
# Each entry is reduced to a set of its words first, so a hashtag repeated
# inside a single entry is counted only once.
doc_ctr = defaultdict(int)
for tweet in tweets:
    for word in set(tweet.split()):
        # Only hashtags are candidate topics; startswith() states the intent
        # directly and cannot raise on an empty string.
        if word.startswith('#'):
            doc_ctr[word] += 1
# Mapping from hashtag to its total number of occurrences in `target`.
# Unlike the document counts, repeats inside a single entry are all counted
# because the words are not de-duplicated here.
term_ctr = defaultdict(int)
for tweet in target:
    for word in tweet.split():
        # Only hashtags are candidate topics; startswith() states the intent
        # directly and cannot raise on an empty string.
        if word.startswith('#'):
            term_ctr[word] += 1
def tfidf(word):
    """Return the trending score of *word*.

    The score is the word's count in ``term_ctr`` divided by one plus its
    count in ``doc_ctr``, so a hashtag that is frequent in the target set but
    rare in the baseline set scores highest. A word missing from either
    counter is treated as having a count of zero.

    The result is always a float.
    """
    # .get() is used instead of indexing so that scoring an unseen word does
    # not insert a zero entry into the defaultdict counters.
    # Adding one to the denominator avoids division by zero for hashtags
    # that never appear in the baseline set.
    return float(term_ctr.get(word, 0)) / (1 + doc_ctr.get(word, 0))
# Rank every hashtag seen in the target set by its score, highest first, and
# keep the ten best. Iterating the dict directly yields its keys.
trending_topics = sorted(term_ctr, key=tfidf, reverse=True)[:10]
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print("Top 10 trending topics")
print('\n'.join(trending_topics))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment