Last active
September 6, 2020 02:38
-
-
Save akash-ch2812/ac663173ae98c8f0b250cc8ad2f4717c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import PyPDF2
import spacy
# spaCy English model (large); loaded once at module level so the custom
# sentence-boundary component can be registered on its pipeline below
nlp = spacy.load('en_core_web_lg')
# method for reading a pdf file
def readPdfFile(filename, folder_name):
    """Read a PDF from ``<cwd>/<folder_name>/<filename>`` and return its text.

    Each page is extracted, lowercased, and stripped of newlines. A fixed
    amount of leading boilerplate is dropped from every page (850 chars on
    page 0, 115 chars on every later page — presumably a repeated header;
    TODO confirm these offsets against the documents being processed).

    Parameters:
        filename (str): name of the PDF file.
        folder_name (str): folder under the current working directory.

    Returns:
        str: the concatenated, cleaned text of all pages ("" for an
        empty PDF).
    """
    # os.path.join is portable; the original "\\" concatenation was
    # Windows-only
    data_path = os.path.join(os.getcwd(), folder_name, filename)

    # context manager guarantees the file handle is closed (the original
    # leaked it)
    text = []
    with open(data_path, mode="rb") as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        # traverse through each page and store cleaned data as a list element
        for page_num in range(pdf_reader.numPages):
            current_page = pdf_reader.getPage(page_num)
            text.append(current_page.extractText().replace("\n", "").lower())

    # guard: a PDF with no pages would otherwise crash on text[0]
    if not text:
        return ""

    # drop per-page boilerplate: first 850 chars of page 0, first 115 of
    # each remaining page, then join into a single string
    cleaned_pages = [text[0][850:]] + [page[115:] for page in text[1:]]
    return "".join(cleaned_pages)
# customer sentence segmenter for creating spacy document object | |
def setCustomBoundaries(doc): | |
# traversing through tokens in document object | |
for token in doc[:-1]: | |
if token.text == ';': | |
doc[token.i + 1].is_sent_start = True | |
if token.text == ".": | |
doc[token.i + 1].is_sent_start = False | |
return doc | |
# create spacy document object from pdf text
def getSpacyDocument(pdf_text, nlp):
    """Run *pdf_text* through the *nlp* pipeline and return the resulting
    spaCy document object."""
    return nlp(pdf_text)
# register setCustomBoundaries so it runs before the dependency parser,
# letting the custom ';'/'.' boundaries override the default segmentation
# NOTE(review): passing the function object is the spaCy v2 add_pipe API;
# spaCy v3 expects a registered string name — confirm the spaCy version
nlp.add_pipe(setCustomBoundaries, before='parser')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment