"""
NATURAL LANGUAGE PROCESSING
using the nltk library, i am going to read a pdf file and then remove all the
stop words from the pdf file and then save the rest of the document in a file.
"""
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from concurrent import futures
import PyPDF2
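
# NOTE: the nltk stopwords corpus must be available locally. If it is not,
# download it once before running this script:
#
#     import nltk
#     nltk.download("stopwords")
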
def yieldTextFromPdf(pdfname):
    """
    A generator that reads the PDF and yields the text of each page,
    one page at a time.
    :param pdfname: the name of the PDF file.
    :yield: the text of a single page.
    """
    # Open the PDF file as you would any other binary file.
    file_descriptor = open(pdfname, "rb")
    # Use the imported PyPDF2 module to parse the binary data.
    pdf_file = PyPDF2.PdfFileReader(file_descriptor)
    # Loop over the pages of the PDF, extracting and yielding each page's text.
    for i in range(pdf_file.numPages):
        page = pdf_file.getPage(i)
        yield page.extractText()
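
# Example usage (a minimal sketch; "sample.pdf" is a hypothetical file name):
#
#     for page_text in yieldTextFromPdf("sample.pdf"):
#         print(page_text[:80])  # preview the first 80 characters of each page
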
def length(pdfname):
    """Return the number of pages in the PDF."""
    return PyPDF2.PdfFileReader(open(pdfname, "rb")).numPages

def detectLanguage(text):
    """
    Use the nltk stopword corpora to detect the language the text is written in.
    :param text: the text to inspect.
    :return: the name of the language with the most stopword hits.
    """
    # Using the wordpunct_tokenize function, generate a list of the words in
    # the text, then lowercase them all, since the stopword corpora are lowercase.
    tokens = wordpunct_tokenize(text)
    words = {x.lower() for x in tokens}
    # Score each language by how many of its stopwords occur in the word set;
    # the language with the highest number of stopword occurrences is taken to
    # be the language of the text.
    languages = {x: len(words.intersection(set(stopwords.words(x)))) for x in stopwords.fileids()}
    return max(languages, key=languages.get)
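
# Example (a minimal sketch): English stopwords such as "the", "is" and "over"
# dominate the sentence below, so English gets the highest score.
#
#     detectLanguage("the quick brown fox is jumping over the lazy dog")
#     # -> 'english'
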
def removeStopWords(text):
    """
    Remove the stop words from the text extracted from the PDF.
    :param text: the text obtained from the PDF file.
    :return: the list of words remaining after the stop words are removed.
    """
    tokens = set(wordpunct_tokenize(text))
    # Use a set for O(1) membership tests, and compare lowercased tokens,
    # since the stopword lists are all lowercase.
    stopwordsSet = set(stopwords.words(detectLanguage(text)))
    return [x for x in tokens if x.lower() not in stopwordsSet and len(x) > 1]
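
# Example (a minimal sketch): "this", "is" and "a" are English stopwords
# ("a" is also filtered by the len(x) > 1 check), so only the content words
# survive. The order is unspecified because the tokens pass through a set.
#
#     removeStopWords("this is a test sentence")
#     # -> e.g. ['test', 'sentence']
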
def main(pdfname):
    """
    The main function that does all the work: drive the page generator and
    submit each page's text to a worker thread, so the pages are processed
    concurrently and read faster, then write the filtered words to a file.
    :param pdfname: the name of the PDF file.
    :return: None; the result is saved to the file "filtered".
    """
    with futures.ThreadPoolExecutor(max_workers=length(pdfname)) as executor:
        # Submit one removeStopWords job per page of the PDF.
        future_list = [executor.submit(removeStopWords, i) for i in yieldTextFromPdf(pdfname)]
        with open("filtered", "w") as file:
            # Write out each page's remaining words as its worker finishes.
            for f in futures.as_completed(future_list):
                for i in f.result():
                    file.write(i + "\n")
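
# Example (a minimal sketch; the PDF must exist on disk):
#
#     main("french.pdf")  # writes the filtered words to a file named "filtered"
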
pdfLoc = "danny.pdf"
pdf_location = "french.pdf"
if __name__ == '__main__':
    main(pdf_location)
# This code has been tested and debugged.
# It should be free of any errors.
# (c) Ayikpah Danny Mcwaves.