Skip to content

Instantly share code, notes, and snippets.

@SethMMorton
Created June 9, 2012 16:49
Show Gist options
  • Save SethMMorton/2901755 to your computer and use it in GitHub Desktop.
Save SethMMorton/2901755 to your computer and use it in GitHub Desktop.
Count the most common words in a [LaTeX] file, ignoring common words (the, an, etc...)
#! /usr/bin/env python
from __future__ import print_function, division
import sys
import re
from subprocess import Popen, PIPE
from collections import defaultdict
from operator import itemgetter
######################################################
# Counts and reports the N most common words in a file
######################################################
# The input file is first run through the external ``detex`` program to
# strip LaTeX markup; words on the ignore list below are never tallied.
#
# Usage: <this script> FILE NWORDS

# A "word" is any run of ASCII letters, underscores and slashes.
WORD_PATTERN = re.compile(r'[a-zA-Z_/]+')

# Lower-cased words that are too common to be interesting.
IGNORED_WORDS = frozenset([
    'the', 'of', 'and', 'a', 'to', 'is', 'in', 'that', 'for',
    'as', 'by', 'this', 'be', 'with', 'are', 'from', 'can', 'on',
    'thus', 'an', 'these', 'using', 'when', 'due', 'at', 'or',
    'it', 'we', 'have', 'not', 'there', 'where', 'was', 'been',
    'such', 'they', 'al', 'et', 'used', 'into', 'one', 'two',
    'which', 'first', 'second', 'third', 'three', 'i', 's', 'has',
    'their', 'both', 'because', 'then', 'also', 'well', 'than',
    'very', 'only', 'e', 'see', 'fig', '_', 'if', 'so', 'here',
    'whereas',
])


def parse_args(argv):
    """Return ``(file_to_count, nwords)`` read from the command line.

    argv -- full argument vector, program name first (``sys.argv`` layout).

    Exits through ``sys.exit`` with an explanatory message when the file
    name or the word count is missing, or when the count is not an integer.
    """
    # Grab the file
    try:
        file_to_count = argv[1]
    except IndexError:
        sys.exit('Need a file to count the words in')
    # Grab the number of words
    try:
        nwords = int(argv[2])
    except IndexError:
        sys.exit('Need the number of most popular words to display')
    except ValueError:
        sys.exit('The number of words must be an integer')
    return file_to_count, nwords


def read_detexed(file_to_count):
    """Run ``detex`` on *file_to_count* and return what it wrote to stdout.

    Exits through ``sys.exit`` when the ``detex`` program cannot be started.
    """
    try:
        detex = Popen(['detex', file_to_count], stdout=PIPE)
    except OSError:
        sys.exit('detex may not be installed on your system...')
    text, _ = detex.communicate()
    # On Python 3 the pipe yields bytes, which a str regex cannot search;
    # on Python 2 the output already is a str and is left untouched.
    if not isinstance(text, str):
        text = text.decode('utf-8', 'replace')
    return text


def count_words(text, ignored=IGNORED_WORDS):
    """Return a mapping of lower-cased word -> number of occurrences.

    text    -- the string to scan for words (see ``WORD_PATTERN``).
    ignored -- container of lower-cased words to leave out of the tally.
    """
    word_count = defaultdict(int)
    for word in WORD_PATTERN.findall(text):
        lw = word.lower()
        if lw not in ignored:
            word_count[lw] += 1
    return word_count


def top_words(word_count, nwords):
    """Return at most *nwords* ``(word, count)`` pairs, most frequent first.

    Words with equal counts come out in alphabetical order, so the result
    does not depend on dict iteration order.  A negative *nwords* is
    treated as zero.
    """
    # Two stable passes: alphabetical first, then by descending count, which
    # keeps the alphabetical order among equal counts.
    ranked = sorted(word_count.items(), key=itemgetter(0))
    ranked.sort(key=itemgetter(1), reverse=True)
    # Slicing yields exactly nwords rows (the old loop printed one extra).
    return ranked[:max(nwords, 0)]


def format_report(ranked):
    """Return the report lines: a header, then one line per pair in *ranked*."""
    lines = ['Word {0:14} Count'.format('')]
    lines.extend('{0:20} {1:d}'.format(word, count) for word, count in ranked)
    return lines


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``."""
    if argv is None:
        argv = sys.argv
    file_to_count, nwords = parse_args(argv)
    # First remove LaTeX commands (simultaneously reading the file)
    text = read_detexed(file_to_count)
    # Now count, rank, and print up to the number requested
    for line in format_report(top_words(count_words(text), nwords)):
        print(line)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment