jbenet/zipf.py

## zipf.py
#!/usr/bin/env python

# The MIT License
#
# Copyright (c) 2011 Juan Batiz-Benet
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


from __future__ import with_statement

import sys
import string
import operator
import optparse

__version__ = '0.1'

def fix_whitespace_and_strip_punctuation(s):
  """Return `s` with punctuation, digits and whitespace replaced by spaces.

  Every character found in string.punctuation, string.digits or
  string.whitespace becomes a single space; all other characters are kept,
  so the result has the same length as the input.
  """
  replace_set = frozenset(
      string.punctuation + string.digits + string.whitespace)
  # A per-character join is used instead of the two-argument
  # str.translate(table, deletechars) call: that form only exists for
  # Python 2 byte strings and raises TypeError for unicode text and on
  # Python 3.
  return ''.join([' ' if ch in replace_set else ch for ch in s])


class ZipfDistribution(object):
  """Case-insensitive table of word occurrence counts.

  Words are lower-cased before being stored in the `words` dict, which maps
  each word to its number of occurrences.  Iterating the distribution yields
  `(word, occurrences)` pairs from most to least frequent.
  """

  def __init__(self, text=None):
    """Create an empty distribution and, if `text` is truthy, count its words."""
    self.words = {}
    if text:
      self.countText(text)

  # Item access has to be implemented with __getitem__/__setitem__: the
  # `self[word]` lookup in occurrenceCount never consults the descriptor
  # hooks __get__/__set__.
  def __getitem__(self, word):
    """Return the occurrences recorded for `word`, or 0 if it was never seen."""
    return self.words.get(word.lower(), 0)

  def __setitem__(self, word, value):
    """Set the number of occurrences of `word`.

    Raises TypeError if `value` is not an integer.
    """
    word = word.lower()
    if not isinstance(value, int):
      raise TypeError("Occurrences must be an integer.")
    self.words[word] = value

  def __iter__(self):
    """Iterate over `(word, occurrences)` pairs, most frequent first.

    Words with equal counts are ordered alphabetically so that the result
    does not depend on dict ordering.
    """
    # items() rather than iteritems() so this also runs on Python 3.
    ranked = sorted(self.words.items(), key=lambda pair: (-pair[1], pair[0]))
    return iter(ranked)

  def __len__(self):
    """Return the number of distinct words."""
    return len(self.words)

  def increment(self, word):
    """Add one occurrence of `word` (case-insensitive)."""
    word = word.lower()
    self.words[word] = self.words.get(word, 0) + 1

  def occurrenceCount(self, word=None):
    """Return the occurrences of `word`, or of all words when `word` is None."""
    if word is None:
      return sum(self.words.values())
    return self[word]

  def countText(self, text):
    """Count every word of `text` into this distribution.

    Punctuation, digits and whitespace act as word separators.
    """
    for word in fix_whitespace_and_strip_punctuation(text).split(" "):
      # Consecutive separators produce empty strings; skip them.
      if word:
        self.increment(word)

  def topWordsCoveringPercentage(self, percent):
    """Return the most frequent words that together cover `percent` of the text.

    `percent` is a fraction of the total occurrence count (0.5 means half).
    Words are taken in descending frequency until their combined occurrences
    reach that share; nothing is returned when there is nothing to cover.
    """
    words = []
    remaining = percent * self.occurrenceCount()
    for word, occurrences in self:
      # Test before appending so a zero (or negative) target yields no words.
      if remaining <= 0:
        break
      words.append(word)
      remaining -= occurrences
    return words


def distributionForText(str):
  """Build a ZipfDistribution from the words of the given text."""
  distribution = ZipfDistribution(text=str)
  return distribution

def distributionForFile(filename):
  """Build a ZipfDistribution by counting the file `filename` line by line."""
  distribution = ZipfDistribution()
  with open(filename) as source:
    for text_line in source:
      distribution.countText(text_line)
  return distribution


def printStats(zipf):
  """Print summary statistics of the distribution `zipf` to stdout.

  Reports the number of distinct words, the total number of occurrences and
  how many of the top words are needed to cover half of all occurrences.
  """
  # Single-argument print(...) with %-formatting produces the same output as
  # the Python 2 print statement and is also valid syntax on Python 3.
  print('Distinct Words: %s' % len(zipf))
  print('Total Words: %s' % zipf.occurrenceCount())
  print('Half-Covering Words: %s' % len(zipf.topWordsCoveringPercentage(0.5)))

def printDistribution(zipf):
  """Print each word of `zipf` with its count, followed by summary statistics.

  Lines are written as "<occurrences> <word>" in the order in which `zipf`
  yields its `(word, occurrences)` pairs.
  """
  # Single-argument print(...) keeps the Python 2 output unchanged while also
  # being valid syntax on Python 3.
  print('Occurrences of Words:')
  for word, occurrence in zipf:
    print('%s %s' % (occurrence, word))
  print('----------------------')
  printStats(zipf)


def main():
  """Command-line entry point.

  With a filename argument, print the word distribution of that file;
  without arguments, read the text from stdin instead.
  """
  if len(sys.argv) > 1:
    filename = sys.argv[1]
    # The doubled space matches what the original print statement emitted
    # (trailing space in the literal plus the argument separator).
    print('Zipf Distribution of words in  %s' % filename)
    print('')
    try:
      zipf = distributionForFile(filename)
    except (IOError, OSError):
      # Only I/O failures are reported as a file problem; any other exception
      # is a real bug and is allowed to propagate with its traceback.
      print('Error opening file: %s' % filename)
      print('Usage:  %s <filename> (or use stdin)' % sys.argv[0])
      return
    printDistribution(zipf)

  else:
    zipf = ZipfDistribution()
    for line in sys.stdin:
      zipf.countText(line)
    printDistribution(zipf)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
  main()
	#!/usr/bin/env python

	# The MIT License
	#
	# Copyright (c) 2011 Juan Batiz-Benet
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	# THE SOFTWARE.


	from __future__ import with_statement

	import sys
	import string
	import operator
	import optparse

	__version__ = '0.1'

	def fix_whitespace_and_strip_punctuation(s):
	  """Return `s` with punctuation, digits and whitespace replaced by spaces.

	  Every character found in string.punctuation, string.digits or
	  string.whitespace becomes a single space; all other characters are kept,
	  so the result has the same length as the input.
	  """
	  replace_set = frozenset(
	      string.punctuation + string.digits + string.whitespace)
	  # A per-character join is used instead of the two-argument
	  # str.translate(table, deletechars) call: that form only exists for
	  # Python 2 byte strings and raises TypeError for unicode text and on
	  # Python 3.
	  return ''.join([' ' if ch in replace_set else ch for ch in s])




	class ZipfDistribution(object):
	  """Case-insensitive table of word occurrence counts.

	  Words are lower-cased before being stored in the `words` dict, which maps
	  each word to its number of occurrences.  Iterating the distribution yields
	  `(word, occurrences)` pairs from most to least frequent.
	  """

	  def __init__(self, text=None):
	    """Create an empty distribution and, if `text` is truthy, count its words."""
	    self.words = {}
	    if text:
	      self.countText(text)

	  # Item access has to be implemented with __getitem__/__setitem__: the
	  # `self[word]` lookup in occurrenceCount never consults the descriptor
	  # hooks __get__/__set__.
	  def __getitem__(self, word):
	    """Return the occurrences recorded for `word`, or 0 if it was never seen."""
	    return self.words.get(word.lower(), 0)

	  def __setitem__(self, word, value):
	    """Set the number of occurrences of `word`.

	    Raises TypeError if `value` is not an integer.
	    """
	    word = word.lower()
	    if not isinstance(value, int):
	      raise TypeError("Occurrences must be an integer.")
	    self.words[word] = value

	  def __iter__(self):
	    """Iterate over `(word, occurrences)` pairs, most frequent first.

	    Words with equal counts are ordered alphabetically so that the result
	    does not depend on dict ordering.
	    """
	    # items() rather than iteritems() so this also runs on Python 3.
	    ranked = sorted(self.words.items(), key=lambda pair: (-pair[1], pair[0]))
	    return iter(ranked)

	  def __len__(self):
	    """Return the number of distinct words."""
	    return len(self.words)

	  def increment(self, word):
	    """Add one occurrence of `word` (case-insensitive)."""
	    word = word.lower()
	    self.words[word] = self.words.get(word, 0) + 1

	  def occurrenceCount(self, word=None):
	    """Return the occurrences of `word`, or of all words when `word` is None."""
	    if word is None:
	      return sum(self.words.values())
	    return self[word]

	  def countText(self, text):
	    """Count every word of `text` into this distribution.

	    Punctuation, digits and whitespace act as word separators.
	    """
	    for word in fix_whitespace_and_strip_punctuation(text).split(" "):
	      # Consecutive separators produce empty strings; skip them.
	      if word:
	        self.increment(word)

	  def topWordsCoveringPercentage(self, percent):
	    """Return the most frequent words that together cover `percent` of the text.

	    `percent` is a fraction of the total occurrence count (0.5 means half).
	    Words are taken in descending frequency until their combined occurrences
	    reach that share; nothing is returned when there is nothing to cover.
	    """
	    words = []
	    remaining = percent * self.occurrenceCount()
	    for word, occurrences in self:
	      # Test before appending so a zero (or negative) target yields no words.
	      if remaining <= 0:
	        break
	      words.append(word)
	      remaining -= occurrences
	    return words





	def distributionForText(str):
	  """Build a ZipfDistribution from the words of the given text."""
	  distribution = ZipfDistribution(text=str)
	  return distribution

	def distributionForFile(filename):
	  """Build a ZipfDistribution by counting the file `filename` line by line."""
	  distribution = ZipfDistribution()
	  with open(filename) as source:
	    for text_line in source:
	      distribution.countText(text_line)
	  return distribution




	def printStats(zipf):
	  """Print summary statistics of the distribution `zipf` to stdout.

	  Reports the number of distinct words, the total number of occurrences and
	  how many of the top words are needed to cover half of all occurrences.
	  """
	  # Single-argument print(...) with %-formatting produces the same output as
	  # the Python 2 print statement and is also valid syntax on Python 3.
	  print('Distinct Words: %s' % len(zipf))
	  print('Total Words: %s' % zipf.occurrenceCount())
	  print('Half-Covering Words: %s' % len(zipf.topWordsCoveringPercentage(0.5)))

	def printDistribution(zipf):
	  """Print each word of `zipf` with its count, followed by summary statistics.

	  Lines are written as "<occurrences> <word>" in the order in which `zipf`
	  yields its `(word, occurrences)` pairs.
	  """
	  # Single-argument print(...) keeps the Python 2 output unchanged while also
	  # being valid syntax on Python 3.
	  print('Occurrences of Words:')
	  for word, occurrence in zipf:
	    print('%s %s' % (occurrence, word))
	  print('----------------------')
	  printStats(zipf)



	def main():
	  """Command-line entry point.

	  With a filename argument, print the word distribution of that file;
	  without arguments, read the text from stdin instead.
	  """
	  if len(sys.argv) > 1:
	    filename = sys.argv[1]
	    # The doubled space matches what the original print statement emitted
	    # (trailing space in the literal plus the argument separator).
	    print('Zipf Distribution of words in  %s' % filename)
	    print('')
	    try:
	      zipf = distributionForFile(filename)
	    except (IOError, OSError):
	      # Only I/O failures are reported as a file problem; any other exception
	      # is a real bug and is allowed to propagate with its traceback.
	      print('Error opening file: %s' % filename)
	      print('Usage:  %s <filename> (or use stdin)' % sys.argv[0])
	      return
	    printDistribution(zipf)

	  else:
	    zipf = ZipfDistribution()
	    for line in sys.stdin:
	      zipf.countText(line)
	    printDistribution(zipf)



	# Run the command-line interface only when executed as a script.
	if __name__ == "__main__":
	  main()