-
Split the logic into "retrieving data" and "processing data." Right now, the code has big, multi-purpose functions.
-
Refactor the code to bring it in line with PEP 8, and consider adding assertion statements to act as lightweight unit tests.
Created
July 3, 2014 00:51
-
-
Save finbarrtimbers/313527ca3b923b68a7a4 to your computer and use it in GitHub Desktop.
A script used in my research to extract wordlists from software engineering textbooks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
import fileinput
import json
import os
import string
import sys
from collections import Counter

from nltk import word_tokenize
def getWordLists(fileName):
    """Return the set of distinct, lowercased word tokens in a text file.

    All punctuation characters are stripped before tokenizing.

    Args:
        fileName: path of a plain-text file to read.

    Returns:
        A set of unique, lowercased words (duplicates are dropped).
    """
    with open(fileName, 'r') as f:
        text = f.read()
    # Filter punctuation character-by-character. Unlike the Python-2-only
    # two-argument form text.translate(None, ...), this works on both
    # Python 2 and Python 3.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)
    words = word_tokenize(text)
    # A set comprehension lowercases and de-duplicates in one pass.
    return {word.lower() for word in words}
def removeStopWords(wordList, stopWordsFile="stop-words.txt"):
    """Remove stop words from wordList in place and return it.

    Args:
        wordList: set of words to filter (mutated in place).
        stopWordsFile: path of a file whose tokens are the stop words.

    Returns:
        The same set object, with every stop word removed.
    """
    with open(stopWordsFile, 'r') as f:
        stopWords = word_tokenize(f.read())
    for stopWord in stopWords:
        # set.discard() is a no-op for absent members, so the original
        # try/remove/except-KeyError dance is unnecessary.
        wordList.discard(stopWord)
    return wordList
if __name__ == "__main__":
    # Each command-line argument is a text file to turn into a word list.
    for textFile in sys.argv[1:]:
        print("Generating word frequency list...")
        wordList = getWordLists(textFile)
        print("Removing stop words...")
        wordList = removeStopWords(wordList)
        # Derive the output name from the input's base name whatever its
        # extension (the old textFile[:-4] silently assumed a 4-character
        # suffix such as ".txt").
        base = os.path.splitext(textFile)[0]
        wordListFileName = 'word-list-' + base + '.txt'
        # Text mode ('w'), not binary ('wb'): we are writing str data,
        # which 'wb' rejects on Python 3. One joined write replaces the
        # quadratic string += loop.
        with open(wordListFileName, 'w') as wordListFile:
            wordListFile.write("".join(word + "\n" for word in wordList))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment