Last active
September 6, 2020 02:38
-
-
Save akash-ch2812/ac663173ae98c8f0b250cc8ad2f4717c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import PyPDF2
import spacy
# spaCy English model (large); loaded once at module level so the custom
# sentence-boundary component can be registered on its pipeline below
nlp = spacy.load('en_core_web_lg')
# method for reading a pdf file
def readPdfFile(filename, folder_name):
    """Read a PDF from ``<cwd>/<folder_name>/<filename>`` and return its text.

    Each page is extracted, lowercased, and stripped of newlines. A fixed
    amount of leading boilerplate is dropped from every page (850 chars on
    page 0, 115 chars on every later page — presumably a repeated header;
    TODO confirm these offsets against the documents being processed).

    Parameters:
        filename (str): name of the PDF file.
        folder_name (str): folder under the current working directory.

    Returns:
        str: the concatenated, cleaned text of all pages ("" for an
        empty PDF).
    """
    # os.path.join is portable; the original "\\" concatenation was
    # Windows-only
    data_path = os.path.join(os.getcwd(), folder_name, filename)

    # context manager guarantees the file handle is closed (the original
    # leaked it)
    text = []
    with open(data_path, mode="rb") as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        # traverse through each page and store cleaned data as a list element
        for page_num in range(pdf_reader.numPages):
            current_page = pdf_reader.getPage(page_num)
            text.append(current_page.extractText().replace("\n", "").lower())

    # guard: a PDF with no pages would otherwise crash on text[0]
    if not text:
        return ""

    # drop per-page boilerplate: first 850 chars of page 0, first 115 of
    # each remaining page, then join into a single string
    cleaned_pages = [text[0][850:]] + [page[115:] for page in text[1:]]
    return "".join(cleaned_pages)
# customer sentence segmenter for creating spacy document object | |
def setCustomBoundaries(doc): | |
# traversing through tokens in document object | |
for token in doc[:-1]: | |
if token.text == ';': | |
doc[token.i + 1].is_sent_start = True | |
if token.text == ".": | |
doc[token.i + 1].is_sent_start = False | |
return doc | |
# create spacy document object from pdf text
def getSpacyDocument(pdf_text, nlp):
    """Run *pdf_text* through the *nlp* pipeline and return the resulting
    spaCy document object."""
    return nlp(pdf_text)
# register setCustomBoundaries so it runs before the dependency parser,
# letting the custom ';'/'.' boundaries override the default segmentation
# NOTE(review): passing the function object is the spaCy v2 add_pipe API;
# spaCy v3 expects a registered string name — confirm the spaCy version
nlp.add_pipe(setCustomBoundaries, before='parser')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment