Created
May 19, 2011 23:12
-
-
Save jbenet/982006 to your computer and use it in GitHub Desktop.
Zipf Distribution python module (counts text occurrences)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# The MIT License
#
# Copyright (c) 2011 Juan Batiz-Benet
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from __future__ import with_statement | |
import sys | |
import string | |
import operator | |
import optparse | |
# Version string of this module.
__version__ = "0.1"
# Every ASCII punctuation mark, digit and whitespace character; each one is
# turned into a single space by fix_whitespace_and_strip_punctuation().
_REPLACE_CHARS = frozenset(
    string.punctuation + string.digits + string.whitespace
)


def fix_whitespace_and_strip_punctuation(s):
    """Return *s* with punctuation, digits and whitespace replaced by spaces.

    Each matching character becomes exactly one space, so the result has the
    same length as the input and runs of separators become runs of spaces.
    All other characters (including non-ASCII letters) pass through unchanged.

    The mapping is done per character instead of with ``string.maketrans`` and
    a two-argument ``translate`` call, because that combination only exists
    for Python 2 byte strings; this form also accepts unicode text and runs
    on Python 3.
    """
    return "".join(" " if ch in _REPLACE_CHARS else ch for ch in s)
class ZipfDistribution(object):
    """Case-insensitive table of word occurrence counts.

    Words are lower-cased before they are stored in ``self.words``, a plain
    dict mapping each word to its integer count. Iterating the object yields
    ``(word, count)`` pairs ordered from the highest count to the lowest.
    """

    def __init__(self, text=None):
        """Create an empty distribution; count *text* first if it is given."""
        self.words = {}
        if text:
            self.countText(text)

    # Item access must be spelled __getitem__/__setitem__: __get__/__set__
    # are the descriptor hooks and do not make ``zipf[word]`` work.
    def __getitem__(self, word):
        """Return the count stored for *word*, or 0 if it was never seen."""
        return self.words.get(word.lower(), 0)

    def __setitem__(self, word, value):
        """Store *value* as the count for *word*.

        Raises:
            TypeError: if *value* is not an ``int``.
        """
        word = word.lower()
        if not isinstance(value, int):
            raise TypeError("Occurrences must be an integer.")
        self.words[word] = value

    def __iter__(self):
        """Iterate over ``(word, count)`` pairs, highest count first.

        Words with equal counts keep the iteration order of ``self.words``.
        """
        ranked = sorted(
            self.words.items(), key=operator.itemgetter(1), reverse=True
        )
        return iter(ranked)

    def __len__(self):
        """Return the number of distinct words."""
        return len(self.words)

    def increment(self, word):
        """Add one occurrence of *word* (case-insensitive)."""
        word = word.lower()
        self.words[word] = self.words.get(word, 0) + 1

    def occurrenceCount(self, word=None):
        """Return the count for *word*, or the total of all counts.

        With ``word=None`` the counts of every stored word are summed.
        """
        if word is None:
            return sum(self.words.values())
        return self[word]

    def countText(self, text):
        """Count every word of *text* into this distribution.

        The text is cleaned by ``fix_whitespace_and_strip_punctuation`` and
        split on single spaces; the empty strings produced by consecutive
        spaces are skipped.
        """
        cleaned = fix_whitespace_and_strip_punctuation(text)
        for token in cleaned.split(" "):
            if token:
                self.increment(token)

    def topWordsCoveringPercentage(self, percent):
        """Return the most frequent words that together cover *percent*.

        *percent* is a fraction of the total occurrence count (``0.5`` means
        half of all occurrences). Words are taken in descending count order
        until their combined counts reach that share.
        """
        remaining = percent * self.occurrenceCount()
        covering = []
        # A zero (or negative) target is already met by no words at all.
        if remaining <= 0:
            return covering
        for word, occurrences in self:
            covering.append(word)
            remaining -= occurrences
            if remaining <= 0:
                break
        return covering
def distributionForText(str):
    """Build a ZipfDistribution that has counted the words of the given text.

    NOTE: the parameter name shadows the ``str`` builtin inside this
    function; it is kept unchanged so the signature stays the same.
    """
    distribution = ZipfDistribution(str)
    return distribution
def distributionForFile(filename):
    """Build a ZipfDistribution from the file at *filename*.

    The file is read line by line, so it is never held in memory as a whole.
    """
    distribution = ZipfDistribution()
    with open(filename) as source:
        for text_line in source:
            distribution.countText(text_line)
    return distribution
def printStats(zipf):
    """Print summary statistics for *zipf* to standard output.

    Three lines are written: the number of distinct words, the total number
    of occurrences, and how many of the most frequent words are needed to
    cover half of all occurrences.
    """
    # Single-argument print(...) with %-formatting produces the same text on
    # Python 2 and Python 3; the bare print statement only parses on Python 2.
    print('Distinct Words: %s' % len(zipf))
    print('Total Words: %s' % zipf.occurrenceCount())
    print('Half-Covering Words: %s' % len(zipf.topWordsCoveringPercentage(0.5)))
def printDistribution(zipf):
    """Print every word of *zipf* with its count, then the summary stats.

    One ``<count> <word>`` line is written per word, in the order that
    iterating *zipf* yields them, followed by a separator line and the
    output of ``printStats``.
    """
    # Single-argument print(...) with %-formatting produces the same text on
    # Python 2 and Python 3; the bare print statement only parses on Python 2.
    print('Occurrences of Words:')
    for word, occurrence in zipf:
        print('%s %s' % (occurrence, word))
    print('----------------------')
    printStats(zipf)
def main():
    """Command-line entry point.

    With a filename argument, print the word distribution of that file.
    With no argument, read the text from standard input instead.
    """
    if len(sys.argv) > 1:
        filename = sys.argv[1]
        # The doubled space after "in" matches what the original
        # print statement (literal ending in a space + separator) emitted.
        print('Zipf Distribution of words in  %s' % filename)
        print('')
        try:
            zipf = distributionForFile(filename)
        except (IOError, OSError):
            # Only I/O failures mean the file could not be read. Anything
            # else is a programming error and should surface with its
            # traceback rather than be reported as a file problem.
            print('Error opening file: %s' % filename)
            print('Usage:  %s <filename> (or use stdin)' % sys.argv[0])
            return
        printDistribution(zipf)
    else:
        zipf = ZipfDistribution()
        for line in sys.stdin:
            zipf.countText(line)
        printDistribution(zipf)
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment