Skip to content

Instantly share code, notes, and snippets.

@finbarrtimbers
Created July 3, 2014 00:51
Show Gist options
  • Save finbarrtimbers/313527ca3b923b68a7a4 to your computer and use it in GitHub Desktop.
A script used in my research to extract wordlists from software engineering textbooks.
#!/usr/local/bin/python
import fileinput
import json
import os
import string
import sys
from collections import Counter

from nltk import word_tokenize
def getWordLists(fileName):
    """Return the set of unique, lowercased words in the file *fileName*.

    Punctuation characters are stripped from the text before tokenizing.
    NOTE: despite the plural name, a single set is returned; the name is
    kept for backward compatibility with existing callers.
    """
    with open(fileName, 'r') as f:
        text = f.read()
    # Filter punctuation character-by-character. The original
    # text.translate(None, string.punctuation) is Python-2-only and
    # raises TypeError on Python 3; this form works on both.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)
    words = word_tokenize(text)
    # Set comprehension drops duplicate tokens.
    return {word.lower() for word in words}
def removeStopWords(wordList, stopWordsFile="stop-words.txt"):
    """Remove the stop words listed in *stopWordsFile* from *wordList*.

    *wordList* is a set of words; it is mutated in place and also
    returned for convenience.
    """
    with open(stopWordsFile, 'r') as f:
        stopWords = word_tokenize(f.read())
    # One C-level set operation replaces the original per-word
    # try: remove / except KeyError: pass loop — identical result
    # (absent stop words are simply ignored).
    wordList.difference_update(stopWords)
    return wordList
if __name__ == "__main__":
    # For each text file named on the command line, write its
    # de-duplicated, stop-word-filtered word list (one word per line)
    # to 'word-list-<basename>.txt'.
    for textFile in sys.argv[1:]:
        print("Generating word frequency list...")
        wordList = getWordLists(textFile)
        print("Removing stop words...")
        wordList = removeStopWords(wordList)
        # os.path.splitext handles any extension; the original
        # textFile[:-4] silently mangled names whose suffix was not
        # exactly three characters long.
        base = os.path.splitext(textFile)[0]
        wordListFileName = 'word-list-' + base + '.txt'
        # Text mode ('w'), not binary ('wb'): we are writing str data.
        with open(wordListFileName, 'w') as wordListFile:
            wordListFile.write("".join(word + "\n" for word in wordList))

TODO

  • Split the logic into "retrieving data" and "processing data." Right now, the code has big, multi-purpose functions.

  • Refactor the code, bringing it in line with PEP 8, and (maybe?) add assertion statements to act as unit tests.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment