"""
NATURAL LANGUAGE PROCESSING
using the nltk library, i am going to read a pdf file and then remove all the
stop words from the pdf file and then save the rest of the document in a file.
"""
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from concurrent import futures
import PyPDF2
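
# NOTE: the nltk stopwords corpus must be available locally. If it is not,
# download it once before running this script:
#
#     import nltk
#     nltk.download("stopwords")
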
def yieldTextFromPdf(pdfname):
    """
    A generator that reads the PDF and yields the text of each page,
    one page at a time.
    :param pdfname: the name of the PDF file.
    :yield: the text of a single page.
    """
    # Open the PDF file as you would any other binary file.
    file_descriptor = open(pdfname, "rb")
    # Use the imported PyPDF2 module to parse the binary data.
    pdf_file = PyPDF2.PdfFileReader(file_descriptor)
    # Loop over the pages of the PDF, extracting and yielding each page's text.
    for i in range(pdf_file.numPages):
        page = pdf_file.getPage(i)
        yield page.extractText()
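
# Example usage (a minimal sketch; "sample.pdf" is a hypothetical file name):
#
#     for page_text in yieldTextFromPdf("sample.pdf"):
#         print(page_text[:80])  # preview the first 80 characters of each page
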
def length(pdfname):
    """Return the number of pages in the PDF."""
    return PyPDF2.PdfFileReader(open(pdfname, "rb")).numPages

def detectLanguage(text):
    """
    Use the nltk stopword corpora to detect the language the text is written in.
    :param text: the text to inspect.
    :return: the name of the language with the most stopword hits.
    """
    # Using the wordpunct_tokenize function, generate a list of the words in
    # the text, then lowercase them all, since the stopword corpora are lowercase.
    tokens = wordpunct_tokenize(text)
    words = {x.lower() for x in tokens}
    # Score each language by how many of its stopwords occur in the word set;
    # the language with the highest number of stopword occurrences is taken to
    # be the language of the text.
    languages = {x: len(words.intersection(set(stopwords.words(x)))) for x in stopwords.fileids()}
    return max(languages, key=languages.get)
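
# Example (a minimal sketch): English stopwords such as "the", "is" and "over"
# dominate the sentence below, so English gets the highest score.
#
#     detectLanguage("the quick brown fox is jumping over the lazy dog")
#     # -> 'english'
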
def removeStopWords(text):
    """
    Remove the stop words from the text extracted from the PDF.
    :param text: the text obtained from the PDF file.
    :return: the list of words remaining after the stop words are removed.
    """
    tokens = set(wordpunct_tokenize(text))
    # Use a set for O(1) membership tests, and compare lowercased tokens,
    # since the stopword lists are all lowercase.
    stopwordsSet = set(stopwords.words(detectLanguage(text)))
    return [x for x in tokens if x.lower() not in stopwordsSet and len(x) > 1]
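
# Example (a minimal sketch): "this", "is" and "a" are English stopwords
# ("a" is also filtered by the len(x) > 1 check), so only the content words
# survive. The order is unspecified because the tokens pass through a set.
#
#     removeStopWords("this is a test sentence")
#     # -> e.g. ['test', 'sentence']
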
def main(pdfname):
    """
    The main function that does all the work: drive the page generator and
    submit each page's text to a worker thread, so the pages are processed
    concurrently and read faster, then write the filtered words to a file.
    :param pdfname: the name of the PDF file.
    :return: None; the result is saved to the file "filtered".
    """
    with futures.ThreadPoolExecutor(max_workers=length(pdfname)) as executor:
        # Submit one removeStopWords job per page of the PDF.
        future_list = [executor.submit(removeStopWords, i) for i in yieldTextFromPdf(pdfname)]
        with open("filtered", "w") as file:
            # Write out each page's remaining words as its worker finishes.
            for f in futures.as_completed(future_list):
                for i in f.result():
                    file.write(i + "\n")
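
# Example (a minimal sketch; the PDF must exist on disk):
#
#     main("french.pdf")  # writes the filtered words to a file named "filtered"
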
pdfLoc = "danny.pdf"
pdf_location = "french.pdf"
if __name__ == '__main__':
    main(pdf_location)
# This code has been tested and debugged.
# It should be free of any errors.
# (c) Ayikpah Danny Mcwaves.