bmentges/frequency.py

## frequency.py
# coding: utf-8
import types
import sys
import json
import re
from collections import defaultdict

# Punctuation characters that are deleted from tweet text before it is split
# into words. The apostrophe, "@" and the backtick are not in the set, so they
# survive the clean-up.
_PUNCTUATION = '!"#$%&()*+,-./:;<=>?[\\]^_{|}~'
# One character class matching any single character listed above.
regex = re.compile('[%s]' % re.escape(_PUNCTUATION))
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
    if strings_only and isinstance(s, (types.NoneType, int)):
        return s
    elif not isinstance(s, basestring):
        try:
            return str(s)
        except UnicodeEncodeError:
            if isinstance(s, Exception):
                return ' '.join([smart_str(arg, encoding, strings_only,
                        errors) for arg in s])
            return unicode(s).encode(encoding, errors)
    elif isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif s and encoding != 'utf-8':
        return s.decode('utf-8', errors).encode(encoding, errors)
    else:
        return s

class Tweet():
    """A single tweet decoded from its JSON representation.

    The decoded object is kept on ``raw_tweet``.
    """

    def __init__(self, raw_tweet):
        """Decode ``raw_tweet`` (a JSON document) with ``json.loads``."""
        self.raw_tweet = json.loads(raw_tweet)

    def is_tt(self):
        """Return True when the decoded tweet contains a "text" key."""
        return "text" in self.raw_tweet

    def get_tweet_ws(self):
        """Return the list of words in the tweet text.

        The text is passed through ``smart_str``, stripped of the characters
        matched by the module-level ``regex`` and split on whitespace.
        An entry without a "text" key yields an empty list.
        """
        if not self.is_tt():
            return []
        cleaned = self._rp(smart_str(self.raw_tweet['text']))
        tokens = cleaned.replace("\n", " ").split()
        return [token for token in tokens if token]

    def _rp(self, s):
        """Return ``s`` with every match of the module-level ``regex`` removed."""
        return regex.sub('', s)

class FrequencyEngine():
    """Compute the relative frequency of every word in a stream of tweets.

    ``tweet_file`` is any iterable that yields one JSON-encoded tweet per
    item (``main`` passes an open file, so each item is one line).
    """

    def __init__(self, tweet_file):
        self.tweet_file = tweet_file
        # Running tally: "total_words" counts every word seen and "words"
        # maps each word to its count. compute_frequency() later overwrites
        # each count with the word's frequency.
        self.word_frequency = {"total_words": 0, "words": defaultdict(int)}

    def reduce_words(self, accumulated, word):
        """Record one occurrence of ``word`` in ``accumulated`` and return it."""
        accumulated["total_words"] += 1
        accumulated["words"][word] += 1
        return accumulated

    def reduce_tweets(self, accumulated, tweet):
        """Add every word of the JSON-encoded ``tweet`` to ``accumulated``.

        Blank or whitespace-only entries are skipped: handing them to the
        JSON parser would raise ValueError and abort the whole run.
        """
        if not tweet.strip():
            return accumulated
        for word in Tweet(tweet).get_tweet_ws():
            accumulated = self.reduce_words(accumulated, word)
        return accumulated

    def reduce_frequency(self, accumulated, word):
        """Replace the stored count of ``word`` with count / total_words."""
        total = float(accumulated["total_words"])
        encountered = float(accumulated["words"][word])
        accumulated["words"][word] = encountered / total
        return accumulated

    def compute_frequency(self):
        """Consume ``tweet_file`` and return the frequency table.

        The returned object is ``self.word_frequency`` itself: a dict with
        "total_words" (int) and "words" (word -> frequency as a float).
        Counts are overwritten in place, so a second call would divide the
        already-normalised values again; call this once per engine.
        """
        tally = self.word_frequency
        for tweet in self.tweet_file:
            tally = self.reduce_tweets(tally, tweet)
        # Iterate over a snapshot of the keys because the values are
        # rewritten while walking them.
        for word in list(tally["words"]):
            tally = self.reduce_frequency(tally, word)
        return tally


def main():
    """Print the relative frequency of each word in a tweet file.

    The path of the tweet file is read from ``sys.argv[1]``. One line is
    printed per word: the word, a tab, and its frequency with three decimals.
    Exits with status 1 and a usage message when no path is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <tweet_file>\n" % sys.argv[0])
        sys.exit(1)
    # The context manager closes the file even if a tweet fails to parse.
    with open(sys.argv[1]) as tweet_file:
        fq_stmt = FrequencyEngine(tweet_file).compute_frequency()
    for word, frequency in fq_stmt["words"].items():
        for w in word.split():
            # Parenthesised single-argument form: prints the same text under
            # the Python 2 print statement.
            print("%s\t%.3f" % (w, frequency))

# Run the frequency report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
	# coding: utf-8
	import types
	import sys
	import json
	import re
	from collections import defaultdict

	# Punctuation characters that are deleted from tweet text before it is
	# split into words. The apostrophe, "@" and the backtick are not in the
	# set, so they survive the clean-up.
	_PUNCTUATION = '!"#$%&()*+,-./:;<=>?[\\]^_{|}~'
	# One character class matching any single character listed above.
	regex = re.compile('[%s]' % re.escape(_PUNCTUATION))
	def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
	if strings_only and isinstance(s, (types.NoneType, int)):
	return s
	elif not isinstance(s, basestring):
	try:
	return str(s)
	except UnicodeEncodeError:
	if isinstance(s, Exception):
	return ' '.join([smart_str(arg, encoding, strings_only,
	errors) for arg in s])
	return unicode(s).encode(encoding, errors)
	elif isinstance(s, unicode):
	return s.encode(encoding, errors)
	elif s and encoding != 'utf-8':
	return s.decode('utf-8', errors).encode(encoding, errors)
	else:
	return s

	class Tweet():
	    """A single tweet decoded from its JSON representation.

	    The decoded object is kept on ``raw_tweet``.
	    """

	    def __init__(self, raw_tweet):
	        """Decode ``raw_tweet`` (a JSON document) with ``json.loads``."""
	        self.raw_tweet = json.loads(raw_tweet)

	    def is_tt(self):
	        """Return True when the decoded tweet contains a "text" key."""
	        return "text" in self.raw_tweet

	    def get_tweet_ws(self):
	        """Return the list of words in the tweet text.

	        The text is passed through ``smart_str``, stripped of the
	        characters matched by the module-level ``regex`` and split on
	        whitespace. An entry without a "text" key yields an empty list.
	        """
	        if not self.is_tt():
	            return []
	        cleaned = self._rp(smart_str(self.raw_tweet['text']))
	        tokens = cleaned.replace("\n", " ").split()
	        return [token for token in tokens if token]

	    def _rp(self, s):
	        """Return ``s`` with every match of the module-level ``regex`` removed."""
	        return regex.sub('', s)

	class FrequencyEngine():
	    """Compute the relative frequency of every word in a stream of tweets.

	    ``tweet_file`` is any iterable that yields one JSON-encoded tweet per
	    item (``main`` passes an open file, so each item is one line).
	    """

	    def __init__(self, tweet_file):
	        self.tweet_file = tweet_file
	        # Running tally: "total_words" counts every word seen and "words"
	        # maps each word to its count. compute_frequency() later
	        # overwrites each count with the word's frequency.
	        self.word_frequency = {"total_words": 0, "words": defaultdict(int)}

	    def reduce_words(self, accumulated, word):
	        """Record one occurrence of ``word`` in ``accumulated`` and return it."""
	        accumulated["total_words"] += 1
	        accumulated["words"][word] += 1
	        return accumulated

	    def reduce_tweets(self, accumulated, tweet):
	        """Add every word of the JSON-encoded ``tweet`` to ``accumulated``.

	        Blank or whitespace-only entries are skipped: handing them to the
	        JSON parser would raise ValueError and abort the whole run.
	        """
	        if not tweet.strip():
	            return accumulated
	        for word in Tweet(tweet).get_tweet_ws():
	            accumulated = self.reduce_words(accumulated, word)
	        return accumulated

	    def reduce_frequency(self, accumulated, word):
	        """Replace the stored count of ``word`` with count / total_words."""
	        total = float(accumulated["total_words"])
	        encountered = float(accumulated["words"][word])
	        accumulated["words"][word] = encountered / total
	        return accumulated

	    def compute_frequency(self):
	        """Consume ``tweet_file`` and return the frequency table.

	        The returned object is ``self.word_frequency`` itself: a dict with
	        "total_words" (int) and "words" (word -> frequency as a float).
	        Counts are overwritten in place, so a second call would divide the
	        already-normalised values again; call this once per engine.
	        """
	        tally = self.word_frequency
	        for tweet in self.tweet_file:
	            tally = self.reduce_tweets(tally, tweet)
	        # Iterate over a snapshot of the keys because the values are
	        # rewritten while walking them.
	        for word in list(tally["words"]):
	            tally = self.reduce_frequency(tally, word)
	        return tally


	def main():
	    """Print the relative frequency of each word in a tweet file.

	    The path of the tweet file is read from ``sys.argv[1]``. One line is
	    printed per word: the word, a tab, and its frequency with three
	    decimals. Exits with status 1 and a usage message when no path is
	    given.
	    """
	    if len(sys.argv) < 2:
	        sys.stderr.write("usage: %s <tweet_file>\n" % sys.argv[0])
	        sys.exit(1)
	    # The context manager closes the file even if a tweet fails to parse.
	    with open(sys.argv[1]) as tweet_file:
	        fq_stmt = FrequencyEngine(tweet_file).compute_frequency()
	    for word, frequency in fq_stmt["words"].items():
	        for w in word.split():
	            # Parenthesised single-argument form: prints the same text
	            # under the Python 2 print statement.
	            print("%s\t%.3f" % (w, frequency))

	# Run the frequency report only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()