Last active
August 29, 2015 14:09
-
-
Save ethanwillis/ec89af9bc0cc32d2f5c2 to your computer and use it in GitHub Desktop.
Final version of the n-grams script with CSV output and multi-query support
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
from string import maketrans | |
def ngramsProgram(directory, ngramQueries):
    """Count n-gram query matches in every file of a directory and save a CSV.

    Args:
        directory: Path of the directory whose files form the corpus.
        ngramQueries: List of queries, each a list of words, e.g.
            [['future'], ['scientists', 'say']].

    Each file yields one result row: the file name followed by the match
    count for each query, in query order. All rows are then handed to
    saveResultsAsCSV.
    """
    searchResults = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs.
    for txtFile in sorted(os.listdir(directory)):
        filePath = os.path.join(directory, txtFile)
        # Skip sub-directories and other non-regular entries: open() would
        # fail on them and abort the whole run.
        if not os.path.isfile(filePath):
            continue
        # The context manager closes the file even if read() raises.
        with open(filePath, 'r') as openTxtFile:
            txtFileContents = openTxtFile.read()
        # Process: count the n-grams in this file matching each query.
        queryResults = findNGramsMatching(txtFileContents, ngramQueries)
        # Prepend the file name so each row identifies its source file.
        searchResults.append([txtFile] + list(queryResults))
    # Output: save the per-file counts to disk.
    saveResultsAsCSV(searchResults)
def findNGrams(corpus, n):
    """Return every run of n consecutive tokens in corpus.

    Args:
        corpus: Text to tokenize.
        n: Window size (number of tokens per n-gram).

    Returns:
        A list of n-grams in order of appearance, each a list of n tokens.
        The list is empty when n < 1 or the corpus has fewer than n tokens.
    """
    if n < 1:
        return []
    # split() with no argument splits on any whitespace run, so repeated
    # spaces, tabs and newlines never produce empty-string tokens that
    # would sit inside an n-gram and prevent it from matching.
    tokenizedCorpus = corpus.split()
    # Slide a window of size n over the tokens; the range is empty when
    # there are fewer than n tokens.
    return [tokenizedCorpus[i:i + n]
            for i in range(len(tokenizedCorpus) - n + 1)]
def preprocess(corpus):
    """Normalize corpus text before tokenizing.

    Each of the punctuation characters , ' " . - and each newline, carriage
    return or tab is replaced by a single space, then the text is
    lower-cased.

    Args:
        corpus: Raw text.

    Returns:
        The normalized text.
    """
    # One space per replaced character. Plain str.replace is used instead
    # of a translation table, which requires the source and target strings
    # to have equal length.
    for separator in ',\'".-\n\r\t':
        corpus = corpus.replace(separator, ' ')
    # normalize for case
    return corpus.lower()
def findNGramsMatching(corpus, ngramQueries):
    """Count, for each query, how many n-grams of the corpus equal it.

    Args:
        corpus: Raw text to search; it is passed through preprocess first.
        ngramQueries: Iterable of queries, each a sequence of words.

    Returns:
        A list of ints, one match count per query, in query order.
    """
    corpus = preprocess(corpus)
    queryResults = []
    for ngramQuery in ngramQueries:
        # preprocess lower-cases the corpus, so fold the query the same way;
        # otherwise a query containing capitals could never match.
        target = [word.lower() for word in ngramQuery]
        # get all n-grams in the corpus that are the size of this query
        ngrams = findNGrams(corpus, len(target))
        # Plain equality replaces cmp(), which exists only in Python 2.
        numMatches = sum(1 for ngram in ngrams if ngram == target)
        queryResults.append(numMatches)
    return queryResults
# Function to save search results to disk as a comma-delimited CSV file.
def saveResultsAsCSV(results, header=None, outputPath='summary.csv'):
    """Write a header row followed by one row per result.

    Args:
        results: Iterable of rows, each a list such as
            [filename, count, count, ...].
        header: Column titles for the first row. Defaults to the titles of
            the three queries this script searches for.
        outputPath: File to write. Defaults to 'summary.csv' in the current
            working directory.
    """
    import sys

    if header is None:
        header = ['filename', 'future', 'scientists say', 'The United States']
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; the wrong mode either raises TypeError or
    # corrupts line endings.
    if sys.version_info[0] >= 3:
        outputFile = open(outputPath, 'w', newline='')
    else:
        outputFile = open(outputPath, 'wb')
    with outputFile:
        fileWriter = csv.writer(outputFile, delimiter=',')
        # write csv file headers
        fileWriter.writerow(header)
        for result in results:
            fileWriter.writerow(result)
# Script entry: count these n-gram queries in every file of the directory
# below. Each query is a list of words.
ngramQueries = [
    ['future'],
    ['scientists', 'say'],
    ['the', 'united', 'states'],
]
directoryPath = "Texts_4"
ngramsProgram(directoryPath, ngramQueries)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment