Skip to content

Instantly share code, notes, and snippets.

@bmentges
Created May 20, 2013 19:46
Show Gist options
  • Save bmentges/5614916 to your computer and use it in GitHub Desktop.
Save bmentges/5614916 to your computer and use it in GitHub Desktop.
My frequency implementation for Data Science in coursera.org
# coding: utf-8
import types
import sys
import json
import re
from collections import defaultdict
# Punctuation that is deleted from tweet text before it is split into words.
# The set is narrower than string.punctuation: the apostrophe, "@" and the
# backtick are not listed, so those characters are left in the text.
_PUNCTUATION = '!"#$%&()*+,-./:;<=>?[\\]^_{|}~'

# re.escape() neutralises the characters that are special inside a [...] class
# (backslash, "]", "^", "-"), so every listed character is matched literally.
regex = re.compile('[%s]' % re.escape(_PUNCTUATION))
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
    """Coerce ``s`` to the interpreter's native ``str`` type.

    On Python 2 the native ``str`` is a byte string: unicode text is encoded
    with ``encoding``, and byte strings (assumed to be UTF-8) are returned
    unchanged unless ``encoding`` is something other than UTF-8, in which case
    they are transcoded.  On Python 3 the native ``str`` is text: ``bytes``
    are decoded with ``encoding`` and everything else goes through ``str()``.

    Args:
        s: The object to convert.
        encoding: Codec used when encoding (Python 2) or decoding (Python 3).
        strings_only: When true, ``None`` and ``int`` values are returned
            unchanged instead of being converted.
        errors: Error handler passed to the codec (``'strict'``,
            ``'replace'``, ``'ignore'``, ...).

    Returns:
        A native ``str``, or ``s`` itself when ``strings_only`` is true and
        ``s`` is ``None`` or an ``int``.
    """
    # type(None) is used instead of types.NoneType, which is not available on
    # every Python version.
    if strings_only and isinstance(s, (type(None), int)):
        return s

    if str is not bytes:
        # Python 3: str is already text, so only raw bytes need decoding.
        if isinstance(s, bytes):
            return s.decode(encoding, errors)
        return str(s)

    # --- Python 2 only below: basestring/unicode exist solely there. ---
    if not isinstance(s, basestring):
        try:
            return str(s)
        except UnicodeEncodeError:
            if isinstance(s, Exception):
                # str() failed because an argument holds non-ASCII text;
                # convert the arguments one by one.  Use .args rather than
                # iterating the exception object, which only Python 2 allows.
                return ' '.join([smart_str(arg, encoding, strings_only,
                                           errors) for arg in s.args])
            return unicode(s).encode(encoding, errors)
    if isinstance(s, unicode):
        return s.encode(encoding, errors)
    if s and encoding != 'utf-8':
        # Byte strings are assumed to be UTF-8; transcode to the target codec.
        return s.decode('utf-8', errors).encode(encoding, errors)
    return s
class Tweet(object):
    """A single JSON-encoded record, exposing the words of its ``text`` field.

    Records that are not JSON objects, or that have no ``"text"`` key, are
    treated as carrying no words.
    """

    def __init__(self, raw_tweet):
        """Parse ``raw_tweet``, a JSON document given as a string.

        Raises:
            ValueError: If ``raw_tweet`` is not valid JSON (propagated from
                ``json.loads``).
        """
        self.raw_tweet = json.loads(raw_tweet)

    def is_tt(self):
        """Return True when the record is a JSON object with a ``"text"`` key."""
        payload = self.raw_tweet
        # The dict check matters: on a JSON string ``in`` is a substring test,
        # on a list a membership test (either can wrongly succeed), and on a
        # number it raises TypeError.
        return isinstance(payload, dict) and "text" in payload

    def get_tweet_ws(self):
        """Return the record's text as a list of punctuation-free words.

        Returns an empty list when ``is_tt()`` is false.
        """
        if not self.is_tt():
            return []
        text = self._rp(smart_str(self.raw_tweet['text']))
        # split() with no argument splits on any whitespace run, newlines
        # included, and never yields empty strings.
        return text.split()

    def _rp(self, s):
        """Return ``s`` with every character matched by ``regex`` removed."""
        return regex.sub('', s)
class FrequencyEngine(object):
    """Compute the relative frequency of every word in a collection of tweets.

    The running state is a dict of the form
    ``{"total_words": int, "words": defaultdict(int)}`` that is threaded
    through the ``reduce_*`` methods; each of them updates it in place and
    returns it.
    """

    def __init__(self, tweet_file):
        """Store the input and create an empty accumulator.

        Args:
            tweet_file: Iterable yielding one JSON document (as a string) per
                item, e.g. an open file with one record per line.
        """
        self.tweet_file = tweet_file
        self.word_frequency = {"total_words": 0, "words": defaultdict(int)}

    def reduce_words(self, accumulated, word):
        """Count one occurrence of ``word`` and return the accumulator."""
        accumulated["total_words"] += 1
        accumulated["words"][word] += 1
        return accumulated

    def reduce_tweets(self, accumulated, tweet):
        """Fold every word of the JSON record ``tweet`` into the accumulator.

        Blank items are skipped: they are not valid JSON, so parsing them
        would raise and abort the whole run.
        """
        if not tweet.strip():
            return accumulated
        # Explicit fold; the builtin reduce() only exists on Python 2.
        for word in Tweet(tweet).get_tweet_ws():
            accumulated = self.reduce_words(accumulated, word)
        return accumulated

    def reduce_frequency(self, accumulated, word):
        """Replace the count stored for ``word`` with count / total_words."""
        total = float(accumulated["total_words"])
        occurrences = float(accumulated["words"][word])
        accumulated["words"][word] = occurrences / total
        return accumulated

    def compute_frequency(self):
        """Read all of ``tweet_file`` and return the frequency table.

        Returns:
            ``self.word_frequency``, whose ``"words"`` values have been
            replaced in place by relative frequencies.  Because the counts
            are overwritten, this is meant to be called once per instance.
        """
        tally = self.word_frequency
        for tweet in self.tweet_file:
            tally = self.reduce_tweets(tally, tweet)
        # Snapshot the keys before rewriting the values.
        for word in list(tally["words"]):
            tally = self.reduce_frequency(tally, word)
        return tally
def main():
    """Print the relative frequency of each word in a tweet file.

    The file path is taken from ``sys.argv[1]``.  One line is written to
    standard output per distinct word: the word, a tab, and its frequency
    formatted with three decimals.
    """
    # The with-block closes the file even if parsing fails part-way through.
    with open(sys.argv[1]) as tweet_file:
        fq_stmt = FrequencyEngine(tweet_file).compute_frequency()

    # Keys are whitespace-free tokens already, so they are printed as-is.
    for word, frequency in fq_stmt["words"].items():
        # Single parenthesised argument: prints identically whether print is
        # a statement (Python 2) or a function (Python 3).
        print("%s\t%.3f" % (word, frequency))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment