Skip to content

Instantly share code, notes, and snippets.

@akash-ch2812
Last active September 6, 2020 02:38
Show Gist options
  • Save akash-ch2812/ac663173ae98c8f0b250cc8ad2f4717c to your computer and use it in GitHub Desktop.
Save akash-ch2812/ac663173ae98c8f0b250cc8ad2f4717c to your computer and use it in GitHub Desktop.
import os

import PyPDF2
import spacy
# spaCy English model (large), loaded once when this module is imported;
# the 'en_core_web_lg' model package must already be installed
nlp = spacy.load('en_core_web_lg')
# method for reading a pdf file and returning its cleaned text as one string
def readPdfFile(filename, folder_name, first_page_skip=850, other_pages_skip=115):
    """Extract the lower-cased text of a PDF, trimming a prefix from each page.

    Every page is extracted, stripped of newline characters and lower-cased.
    A fixed number of leading characters is then dropped from each page
    (presumably a repeated header/boilerplate -- tune the two skip values to
    the layout of the documents being read) and the pages are concatenated.

    Args:
        filename: Name of the PDF file inside ``folder_name``.
        folder_name: Folder, relative to the current working directory,
            that contains the PDF.
        first_page_skip: Number of leading characters dropped from the first
            page. Defaults to 850, the previously hard-coded value.
        other_pages_skip: Number of leading characters dropped from every
            page after the first. Defaults to 115, the previously
            hard-coded value.

    Returns:
        The cleaned text of all pages joined into a single string, or an
        empty string when the PDF contains no pages.

    Raises:
        FileNotFoundError: If no file exists at the constructed path.
    """
    # os.path.join picks the right separator for the host OS instead of "\\"
    pdf_path = os.path.join(os.getcwd(), folder_name, filename)

    # keep the file open while pages are extracted; the with-block closes it
    # afterwards even if extraction raises
    with open(pdf_path, mode="rb") as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        # one cleaned string per page, in page order
        text = [
            pdf_reader.getPage(page_number).extractText().replace("\n", "").lower()
            for page_number in range(pdf_reader.numPages)
        ]

    # a PDF without pages has no first page to slice
    if not text:
        return ""

    # the first page and the remaining pages use different prefix lengths
    first_page = text[0][first_page_skip:]
    rest_pages = [page[other_pages_skip:] for page in text[1:]]

    # single string containing the full cleaned text
    return "".join([first_page] + rest_pages)
# custom sentence segmenter operating on a spaCy document object
def setCustomBoundaries(doc):
    """Flag sentence starts so ';' opens a new sentence and '.' does not.

    For every token except the last, the token that follows it is marked:
    after a ';' the next token gets ``is_sent_start = True``; after a '.'
    the next token gets ``is_sent_start = False``. Tokens following any
    other text are left untouched.

    Args:
        doc: The document whose tokens are inspected and flagged in place.
            Tokens must expose ``text``, ``i`` and a writable
            ``is_sent_start``.

    Returns:
        The same ``doc`` object, so the function can be used as a pipeline
        component.
    """
    # the last token is skipped because it has no successor to flag
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
        elif token.text == ".":
            doc[token.i + 1].is_sent_start = False
    return doc
# create spacy document object from pdf text
def getSpacyDocument(pdf_text, nlp):
    """Run ``pdf_text`` through the given pipeline and return the result.

    Args:
        pdf_text: The text to process.
        nlp: The callable pipeline applied to ``pdf_text``. This parameter
            shadows the module-level ``nlp`` inside the function.

    Returns:
        Whatever ``nlp(pdf_text)`` returns (a document object when ``nlp``
        is a spaCy pipeline).
    """
    main_doc = nlp(pdf_text)
    return main_doc
# adding setCustomBoundaries to the pipeline, placed before the 'parser' component
# NOTE(review): passing the function object directly is the spaCy v2 calling
# convention; spaCy v3 expects the name of a registered component instead --
# confirm which spaCy version this script targets.
nlp.add_pipe(setCustomBoundaries, before='parser')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment