SethMMorton/countwords.py

## countwords.py
#! /usr/bin/env python

from __future__ import print_function, division
import sys
import re
from subprocess import Popen, PIPE
from collections import defaultdict
from operator import itemgetter

######################################################
# Counts and reports the N most common words in a file
######################################################

# Words too common to be interesting; they are never counted.
ignoreset = set(['the', 'of', 'and', 'a', 'to', 'is', 'in', 'that', 'for',
                 'as', 'by', 'this', 'be', 'with', 'are', 'from', 'can', 'on',
                 'thus', 'an', 'these', 'using', 'when', 'due', 'at', 'or',
                 'it', 'we', 'have', 'not', 'there', 'where', 'was', 'been',
                 'such', 'they', 'al', 'et', 'used', 'into', 'one', 'two',
                 'which', 'first', 'second', 'third', 'three', 'i', 's', 'has',
                 'their', 'both', 'because', 'then', 'also', 'well', 'than',
                 'very', 'only', 'e', 'see', 'fig', '_', 'if', 'so', 'here',
                 'whereas'])

# A "word" is a run of ASCII letters, underscores and slashes.
WORD_RE = re.compile(r'[a-zA-Z_/]+')


def count_words(text, ignore=None):
    """Tally the words found in *text*, ignoring case.

    text   -- the text to scan; bytes (as returned by a subprocess pipe)
              are decoded as UTF-8 with undecodable bytes replaced.
    ignore -- collection of lowercase words to skip (default: ignoreset).

    Returns a dict mapping each lowercase word to its number of occurrences.
    """
    if ignore is None:
        ignore = ignoreset
    # Popen.communicate() hands back bytes on Python 3; a str pattern
    # cannot be matched against bytes, so decode first.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'replace')
    word_count = defaultdict(int)
    for word in WORD_RE.findall(text):
        lw = word.lower()
        if lw in ignore:
            continue
        word_count[lw] += 1
    return dict(word_count)


def top_words(word_count, nwords):
    """Return the *nwords* most frequent (word, count) pairs.

    Pairs are ordered by descending count; words with equal counts are
    ordered alphabetically so the result is deterministic.  A non-positive
    *nwords* yields an empty list.
    """
    if nwords <= 0:
        return []
    # Sort alphabetically first, then rely on the stability of list.sort()
    # (which holds with reverse=True) to keep that order within each count.
    ranked = sorted(word_count.items(), key=itemgetter(0))
    ranked.sort(key=itemgetter(1), reverse=True)
    return ranked[:nwords]


def format_report(ranked):
    """Return the report lines (header first) for (word, count) pairs."""
    lines = ['Word  {0:14} Count'.format('')]
    lines.extend('{0:20} {1:d}'.format(word, count) for word, count in ranked)
    return lines


def detex_file(file_to_count):
    """Run ``detex`` on *file_to_count* and return its raw standard output.

    detex removes the LaTeX commands (and reads the file for us).  Exits
    the program with a message if the detex executable cannot be started.
    """
    try:
        detex = Popen(['detex', file_to_count], stdout=PIPE)
    except OSError:
        sys.exit('detex may not be installed on your system...')
    text, _ = detex.communicate()
    return text


def main(argv=None):
    """Command-line entry point: ``countwords.py FILE NWORDS``.

    argv -- argument list including the program name (default: sys.argv).

    Exits with an explanatory message when an argument is missing or when
    NWORDS is not an integer.
    """
    if argv is None:
        argv = sys.argv

    # Grab the file
    try:
        file_to_count = argv[1]
    except IndexError:
        sys.exit('Need a file to count the words in')

    # Grab the number of words
    try:
        nwords = int(argv[2])
    except IndexError:
        sys.exit('Need the number of most popular words to display')
    except ValueError:
        sys.exit('The number of words must be an integer')

    # Strip the LaTeX, count what is left and print exactly nwords rows
    text = detex_file(file_to_count)
    for line in format_report(top_words(count_words(text), nwords)):
        print(line)


if __name__ == '__main__':
    main()
	#! /usr/bin/env python

	from __future__ import print_function, division
	import sys
	import re
	from subprocess import Popen, PIPE
	from collections import defaultdict
	from operator import itemgetter

	######################################################
	# Counts and reports the N most common words in a file
	######################################################

	# Words too common to be interesting; they are never counted.
	ignoreset = set(['the', 'of', 'and', 'a', 'to', 'is', 'in', 'that', 'for',
	                 'as', 'by', 'this', 'be', 'with', 'are', 'from', 'can', 'on',
	                 'thus', 'an', 'these', 'using', 'when', 'due', 'at', 'or',
	                 'it', 'we', 'have', 'not', 'there', 'where', 'was', 'been',
	                 'such', 'they', 'al', 'et', 'used', 'into', 'one', 'two',
	                 'which', 'first', 'second', 'third', 'three', 'i', 's', 'has',
	                 'their', 'both', 'because', 'then', 'also', 'well', 'than',
	                 'very', 'only', 'e', 'see', 'fig', '_', 'if', 'so', 'here',
	                 'whereas'])

	# A "word" is a run of ASCII letters, underscores and slashes.
	WORD_RE = re.compile(r'[a-zA-Z_/]+')


	def count_words(text, ignore=None):
	    """Tally the words found in *text*, ignoring case.

	    text   -- the text to scan; bytes (as returned by a subprocess pipe)
	              are decoded as UTF-8 with undecodable bytes replaced.
	    ignore -- collection of lowercase words to skip (default: ignoreset).

	    Returns a dict mapping each lowercase word to its number of occurrences.
	    """
	    if ignore is None:
	        ignore = ignoreset
	    # Popen.communicate() hands back bytes on Python 3; a str pattern
	    # cannot be matched against bytes, so decode first.
	    if isinstance(text, bytes):
	        text = text.decode('utf-8', 'replace')
	    word_count = defaultdict(int)
	    for word in WORD_RE.findall(text):
	        lw = word.lower()
	        if lw in ignore:
	            continue
	        word_count[lw] += 1
	    return dict(word_count)


	def top_words(word_count, nwords):
	    """Return the *nwords* most frequent (word, count) pairs.

	    Pairs are ordered by descending count; words with equal counts are
	    ordered alphabetically so the result is deterministic.  A non-positive
	    *nwords* yields an empty list.
	    """
	    if nwords <= 0:
	        return []
	    # Sort alphabetically first, then rely on the stability of list.sort()
	    # (which holds with reverse=True) to keep that order within each count.
	    ranked = sorted(word_count.items(), key=itemgetter(0))
	    ranked.sort(key=itemgetter(1), reverse=True)
	    return ranked[:nwords]


	def format_report(ranked):
	    """Return the report lines (header first) for (word, count) pairs."""
	    # Two spaces after "Word" keep "Count" aligned with the count column.
	    lines = ['Word  {0:14} Count'.format('')]
	    lines.extend('{0:20} {1:d}'.format(word, count) for word, count in ranked)
	    return lines


	def detex_file(file_to_count):
	    """Run ``detex`` on *file_to_count* and return its raw standard output.

	    detex removes the LaTeX commands (and reads the file for us).  Exits
	    the program with a message if the detex executable cannot be started.
	    """
	    try:
	        detex = Popen(['detex', file_to_count], stdout=PIPE)
	    except OSError:
	        sys.exit('detex may not be installed on your system...')
	    text, _ = detex.communicate()
	    return text


	def main(argv=None):
	    """Command-line entry point: ``countwords.py FILE NWORDS``.

	    argv -- argument list including the program name (default: sys.argv).

	    Exits with an explanatory message when an argument is missing or when
	    NWORDS is not an integer.
	    """
	    if argv is None:
	        argv = sys.argv

	    # Grab the file
	    try:
	        file_to_count = argv[1]
	    except IndexError:
	        sys.exit('Need a file to count the words in')

	    # Grab the number of words
	    try:
	        nwords = int(argv[2])
	    except IndexError:
	        sys.exit('Need the number of most popular words to display')
	    except ValueError:
	        sys.exit('The number of words must be an integer')

	    # Strip the LaTeX, count what is left and print exactly nwords rows
	    text = detex_file(file_to_count)
	    for line in format_report(top_words(count_words(text), nwords)):
	        print(line)


	if __name__ == '__main__':
	    main()