Skip to content

Instantly share code, notes, and snippets.

@jbenet
Created May 19, 2011 23:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbenet/982006 to your computer and use it in GitHub Desktop.
Save jbenet/982006 to your computer and use it in GitHub Desktop.
Zipf Distribution python module (counts text occurrences)
#!/usr/bin/env python
# The MIT License
#
# Copyright (c) 2011 Juan Batiz-Benet
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from __future__ import with_statement
import sys
import string
import operator
import optparse
__version__ = '0.1'
# Characters treated as word separators: punctuation, digits and whitespace.
_SEPARATOR_CHARS = string.punctuation + string.digits + string.whitespace
# Code-point table used for text strings (Python 3 ``str`` / Python 2 ``unicode``).
_TEXT_SEPARATOR_TABLE = dict((ord(ch), u" ") for ch in _SEPARATOR_CHARS)


def fix_whitespace_and_strip_punctuation(s):
    """Return a copy of ``s`` with every separator character turned into a space.

    Each ASCII punctuation character, digit and whitespace character (the
    contents of ``string.punctuation``, ``string.digits`` and
    ``string.whitespace``) is replaced by a single ``" "``; all other
    characters are kept unchanged, so the result has the same length as the
    input.

    Works for text strings as well as byte strings, on both Python 2 and
    Python 3 (``string.maketrans`` only exists on Python 2 and only handles
    byte strings).
    """
    if isinstance(s, bytes):
        # Byte strings (Python 2 ``str`` / Python 3 ``bytes``) are translated
        # with a 256-entry byte table; the factory lives in a different place
        # depending on the interpreter version.
        maketrans = getattr(bytes, "maketrans", None) or string.maketrans
        separators = _SEPARATOR_CHARS
        if not isinstance(separators, bytes):
            separators = separators.encode("ascii")
        return s.translate(maketrans(separators, b" " * len(separators)))
    # Text strings are translated with a {code point: replacement} mapping.
    return s.translate(_TEXT_SEPARATOR_TABLE)
class ZipfDistribution(object):
    """Case-insensitive counter of word occurrences.

    Counts are kept in ``self.words``, a dict mapping each lower-cased word to
    the number of times it was seen.  Instances support ``dist[word]`` lookup
    (0 for unknown words), ``dist[word] = n`` assignment, ``len(dist)`` (number
    of distinct words) and iteration over ``(word, count)`` pairs from most to
    least frequent.
    """

    def __init__(self, text=None):
        """Create an empty distribution, optionally counting ``text`` right away."""
        self.words = {}
        if text:
            self.countText(text)

    def __getitem__(self, word):
        """Return the number of occurrences of ``word`` (0 if never seen)."""
        return self.words.get(word.lower(), 0)

    def __setitem__(self, word, value):
        """Set the occurrence count of ``word``.

        Raises:
            TypeError: if ``value`` is not an ``int``.
        """
        word = word.lower()
        if not isinstance(value, int):
            raise TypeError("Occurrences must be an integer.")
        self.words[word] = value

    # The accessors were originally published under the descriptor-protocol
    # names, which never made ``dist[word]`` work.  The old names are kept as
    # aliases so explicit ``dist.__get__(word)`` calls keep working.
    __get__ = __getitem__
    __set__ = __setitem__

    def __iter__(self):
        """Iterate over ``(word, count)`` pairs, most frequent first.

        Words with equal counts are ordered alphabetically so that the
        result does not depend on dict ordering.
        """
        ranked = sorted(self.words.items(),
                        key=lambda item: (-item[1], item[0]))
        return iter(ranked)

    def __len__(self):
        """Return the number of distinct words."""
        return len(self.words)

    def increment(self, word):
        """Add one occurrence of ``word`` (case-insensitive)."""
        word = word.lower()
        self.words[word] = self.words.get(word, 0) + 1

    def occurrenceCount(self, word=None):
        """Return the count for ``word``, or the total count when ``word`` is None."""
        if word is None:
            return sum(self.words.values())
        return self[word]

    def countText(self, text):
        """Count every word of ``text``.

        Punctuation, digits and whitespace are first turned into spaces by
        ``fix_whitespace_and_strip_punctuation``; the remaining runs of
        characters are the words.
        """
        cleaned = fix_whitespace_and_strip_punctuation(text)
        # split() without arguments drops the empty strings that consecutive
        # separators would otherwise produce.
        for word in cleaned.split():
            self.increment(word)

    def topWordsCoveringPercentage(self, percent):
        """Return the most frequent words that together cover ``percent``.

        ``percent`` is a fraction of the total occurrence count (``0.5``
        means half of all occurrences).  Words are taken in iteration order
        until their summed counts reach that share.  Returns an empty list
        for an empty distribution or a non-positive ``percent``.
        """
        words = []
        occurrences_to_cover = percent * self.occurrenceCount()
        for word, occurrences in self:
            # Checked before taking a word so that a request for 0% (or
            # less) does not return the top word anyway.
            if occurrences_to_cover <= 0:
                break
            words.append(word)
            occurrences_to_cover -= occurrences
        return words
def distributionForText(str):
    """Build and return a ZipfDistribution from the text passed as ``str``."""
    distribution = ZipfDistribution(str)
    return distribution
def distributionForFile(filename):
    """Build a ZipfDistribution from the file at ``filename``.

    The file is opened with the default mode and fed to the distribution one
    line at a time, so it is never read into memory as a whole.
    """
    distribution = ZipfDistribution()
    with open(filename) as source:
        for text_line in source:
            distribution.countText(text_line)
    return distribution
def printStats(zipf):
    """Print summary statistics of ``zipf`` to standard output.

    Three lines are written: the number of distinct words, the total number
    of word occurrences, and how many of the top words are needed to cover
    half of all occurrences.
    """
    # Each print() gets a single pre-formatted string so the output is the
    # same under the Python 2 print statement and the Python 3 function.
    print('Distinct Words: %s' % len(zipf))
    print('Total Words: %s' % zipf.occurrenceCount())
    print('Half-Covering Words: %s' % len(zipf.topWordsCoveringPercentage(0.5)))
def printDistribution(zipf):
    """Print every word of ``zipf`` with its count, followed by the statistics.

    One ``<count> <word>`` line is written per word in the iteration order of
    ``zipf``, then a separator line and the output of ``printStats``.
    """
    # Each print() gets a single pre-formatted string so the output is the
    # same under the Python 2 print statement and the Python 3 function.
    print('Occurrences of Words:')
    for word, occurrence in zipf:
        print('%s %s' % (occurrence, word))
    print('----------------------')
    printStats(zipf)
def main():
    """Command-line entry point.

    With a filename as first argument, print the word distribution of that
    file; without arguments, read the text from standard input instead.  If
    the file cannot be read, an error message and a usage line are printed.
    """
    if len(sys.argv) > 1:
        filename = sys.argv[1]
        print('Zipf Distribution of words in  %s' % filename)
        print('')
        try:
            # Only the file access is guarded: any other failure is a real
            # bug and must not be reported as "Error opening file".
            zipf = distributionForFile(filename)
        except (IOError, OSError):
            print('Error opening file: %s' % filename)
            print('Usage:  %s <filename> (or use stdin)' % sys.argv[0])
        else:
            printDistribution(zipf)
    else:
        zipf = ZipfDistribution()
        for line in sys.stdin:
            zipf.countText(line)
        printDistribution(zipf)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment