@criccomini
Created January 17, 2010 19:01
# print a user's tweets
import simplejson
import urllib2

me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/peteskomoroch.json?count=200')
for tweet in simplejson.loads(me.read()):
    print tweet['text'].encode('utf-8')
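
# a minimal sketch (an assumption, not part of the original gist) of writing
# these same tweets out as the me.stream file the tf-idf script below reads,
# one tweet per line:
import simplejson
import urllib2

me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/peteskomoroch.json?count=200')
out = open('/Users/criccomi/twitter/me.stream', 'w')
for tweet in simplejson.loads(me.read()):
    # flatten embedded newlines so each tweet stays on a single line
    out.write(tweet['text'].encode('utf-8').replace('\n', ' ') + '\n')
out.close()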
# me is a file with a user's tweets (one tweet per line)
# garden hose is a file with one tweet per line, sampled from Twitter's public stream
# sw is a stopwords file (one word per line)
# computes tf-idf over the user's stream, then walks Twitter's stream and scores
# every tweet by cosine similarity against it; in short, it tries to find
# tweets related to your stream
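# for reference, the standard definitions the script below approximates
# (the cosine denominator here uses raw word counts, not true vector norms):
#   tf(w)    = count of w in me.stream / total words in me.stream
#   df(w)    = number of garden-hose tweets containing w
#   idf(w)   = log(N / (1 + df(w))), N = number of garden-hose tweets
#              (the +1 avoids division by zero for unseen words)
#   tfidf(w) = tf(w) * idf(w)
#   cosine(a, b) = (a . b) / (||a|| * ||b||)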
# tfidf on a user's profile
import string
import math
me = open('/Users/criccomi/twitter/me.stream', 'r')
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
sw = open('/Users/criccomi/twitter/stopwords.txt', 'r')
me_words = {}
me_tfs = {}
me_total = 0
gh_words = {}
gh_total = 0
me_tfidfs = {}
stop_words = {}
# load stop words
for word in sw:
    stop_words[word.strip()] = True
exclude = set(string.punctuation)
def pstrip(tostrip):
    return ''.join(ch for ch in tostrip if ch not in exclude)
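# e.g. pstrip("don't stop, @user!") -> 'dont stop user'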
# calculate tfs for all words in me.stream
# (strip "'s" before pstrip: pstrip removes the apostrophe, so stripping
# "'s" afterwards would never match anything)
for tweet in me:
    for word in pstrip(tweet.lower().replace("'s", '')).split():
        me_words[word] = me_words.get(word, 0) + 1
        me_total = me_total + 1
for word, count in me_words.items():
    if not stop_words.has_key(word):
        me_tfs[word] = float(count) / me_total
# count document frequencies for all words in garden.hose.stream
# (number of tweets each word appears in; the idfs are derived below)
for tweet in gh:
    distinct_words = {}
    for word in pstrip(tweet.lower().replace("'s", '')).split():
        distinct_words[word] = True
    for word, _ in distinct_words.items():
        gh_words[word] = gh_words.get(word, 0) + 1
    gh_total = gh_total + 1
# calculate important words in me.stream using tf x idf
for word, tf in me_tfs.items():
    if not stop_words.has_key(word):
        me_tfidfs[word] = tf * math.log(gh_total / float(1 + gh_words.get(word, 0)))
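# e.g. a word with tf 0.005 appearing in 100 of 100,000 garden-hose tweets
# scores 0.005 * log(100000 / 101) ~= 0.034 (numbers illustrative)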
# sort by tfidf ascending; prune words below the 0.02 cutoff from me_tfs
# and print the rest
tfidf_items = me_tfidfs.items()
tfidf_items.sort(key=lambda x: x[1])
for ws in tfidf_items:
    if ws[1] < 0.02:
        del me_tfs[ws[0]]
    else:
        print ws
# re-open to reset stream
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
cosims = {}
# compute cosine similarities between every tweet and my twitter feed
# (the denominator uses raw word counts rather than true vector norms,
# so the scores are relative rather than true cosines)
for tweet in gh:
    tweet_words = {}
    tweet_total = 0
    cosim_numerator = 0
    for word in pstrip(tweet.lower().replace("'s", '')).split():
        tweet_words[word] = tweet_words.get(word, 0) + 1
        tweet_total = tweet_total + 1
    for word, count in tweet_words.items():
        cosim_numerator = cosim_numerator + (float(count) / tweet_total) * me_tfs.get(word, 0)
    cosim_denominator = tweet_total * me_total
    if cosim_denominator > 0:
        cosim = cosim_numerator / cosim_denominator
        cosims[tweet.strip()] = cosim
# sort by cosim ascending (the most similar tweets print last)
cosims_items = cosims.items()
cosims_items.sort(key=lambda x: x[1])
for tweet, score in cosims_items:
    print score, tweet
# unshorten a user's urls
import simplejson
import re
import urllib2

me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/criccomini.json?count=200')
for tweet in simplejson.loads(me.read()):
    url = re.search(r"(?P<url>https?://[^\s]+)", tweet['text'])
    if url:
        url = url.group("url")
        # unshorten by following redirects to the final destination
        try:
            url = urllib2.urlopen(url).geturl()
            print url
        except Exception:
            # dead or malformed links are just skipped
            pass
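
# urlopen above issues a full GET just to learn the final url; a sketch of
# doing the same with a HEAD request instead, so the page body is never
# downloaded (the bit.ly url is hypothetical, and some shorteners may not
# answer HEAD):
class HeadRequest(urllib2.Request):
    def get_method(self):
        return 'HEAD'

final = urllib2.urlopen(HeadRequest('http://bit.ly/example')).geturl()
print final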