# vivekn/tt.py

## tt.py
from collections import defaultdict

# Load the baseline set (tweets) and the target set (target) as lists of
# lower-cased tweets. Each file is assumed to hold one tweet per line
# (TODO confirm against the data files). Splitting on line boundaries keeps
# every tweet intact; a bare split() would break the text into single words,
# so the per-tweet loops below would see one-word "tweets" and the
# per-tweet counts would degenerate into raw term counts.
with open('tweets.txt.aa') as baseline_file:
    tweets = baseline_file.read().lower().splitlines()
with open('tweets.txt.ab') as target_file:
    target = target_file.read().lower().splitlines()

# Baseline counts: hashtag -> number of entries in `tweets` that contain it.
doc_ctr = defaultdict(int)

for tweet in tweets:
    # set() collapses repeats so a hashtag counts at most once per entry.
    for token in set(tweet.split()):
        if not token.startswith('#'):
            continue
        doc_ctr[token] += 1

# Target counts: hashtag -> total number of occurrences across `target`
# (repeats within a single entry are all counted).
term_ctr = defaultdict(int)

for tweet in target:
    for tag in (token for token in tweet.split() if token.startswith('#')):
        term_ctr[tag] += 1

def tfidf(word):
    """Score how over-represented *word* is in the target set.

    Returns the word's count in ``term_ctr`` divided by one plus its count
    in ``doc_ctr``, always as a float. Despite the name, no logarithm is
    applied: this is a plain smoothed ratio.

    Counts are read with ``.get()`` so that scoring a word never inserts a
    zero entry into either defaultdict as a side effect.
    """
    target_count = term_ctr.get(word, 0)
    baseline_count = doc_ctr.get(word, 0)
    # Add-one smoothing avoids division by zero for words absent from doc_ctr.
    return target_count * 1.0 / (1 + baseline_count)

# Rank every hashtag seen in the target set by its score, highest first,
# and keep the best ten (fewer if the target set has fewer hashtags).
trending_topics = sorted(term_ctr, key=tfidf, reverse=True)[:10]

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print("Top 10 trending topics")
print('\n'.join(trending_topics))
	from collections import defaultdict

	# Load the baseline set (tweets) and the target set (target) as lists of
	# lower-cased tweets. Each file is assumed to hold one tweet per line
	# (TODO confirm against the data files). Splitting on line boundaries
	# keeps every tweet intact; a bare split() would break the text into
	# single words and make the per-tweet counting meaningless.
	with open('tweets.txt.aa') as baseline_file:
		tweets = baseline_file.read().lower().splitlines()
	with open('tweets.txt.ab') as target_file:
		target = target_file.read().lower().splitlines()

	# Baseline counts: hashtag -> number of entries in `tweets` that contain it.
	doc_ctr = defaultdict(int)

	for tweet in tweets:
		# set() collapses repeats so a hashtag counts at most once per entry.
		for token in set(tweet.split()):
			if not token.startswith('#'):
				continue
			doc_ctr[token] += 1

	# Target counts: hashtag -> total number of occurrences across `target`
	# (repeats within a single entry are all counted).
	term_ctr = defaultdict(int)

	for tweet in target:
		for tag in (token for token in tweet.split() if token.startswith('#')):
			term_ctr[tag] += 1

	def tfidf(word):
		"""Score how over-represented *word* is in the target set.

		Returns the word's count in ``term_ctr`` divided by one plus its
		count in ``doc_ctr``, always as a float. Despite the name, no
		logarithm is applied: this is a plain smoothed ratio.

		Counts are read with ``.get()`` so that scoring a word never inserts
		a zero entry into either defaultdict as a side effect.
		"""
		target_count = term_ctr.get(word, 0)
		baseline_count = doc_ctr.get(word, 0)
		# Add-one smoothing avoids division by zero for words absent from doc_ctr.
		return target_count * 1.0 / (1 + baseline_count)

	# Rank every hashtag seen in the target set by its score, highest first,
	# and keep the best ten (fewer if the target set has fewer hashtags).
	trending_topics = sorted(term_ctr, key=tfidf, reverse=True)[:10]

	# Single-argument print(...) behaves the same on Python 2 and Python 3,
	# whereas the bare print statement is a syntax error on Python 3.
	print("Top 10 trending topics")
	print('\n'.join(trending_topics))