Skip to content

Instantly share code, notes, and snippets.

@benmccloskey
Last active February 25, 2023 09:36
Show Gist options
  • Save benmccloskey/bf6aaee84327d4270922a53e9ee23d7f to your computer and use it in GitHub Desktop.
Save benmccloskey/bf6aaee84327d4270922a53e9ee23d7f to your computer and use it in GitHub Desktop.
import re
import PyPDF2
import spacy
class PdfParser():
    """Extract plain text and date mentions from a PDF file.

    Text is pulled out of the PDF with PyPDF2; date mentions are taken from
    the entities that the spaCy ``en_core_web_lg`` pipeline labels ``DATE``.
    """

    # Matches page counters such as "3 of 12" (case-insensitive "of").
    _PAGE_COUNTER = re.compile(r"\d{1,3} of \d{1,3}", re.IGNORECASE)
    # Matches strings that start with four dot-separated groups of 1-3
    # digits. Presumably these are IP-address-like tokens that the entity
    # recognizer mislabels as dates -- TODO confirm the intent.
    _DOTTED_QUAD = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')

    def __init__(self, file_path):
        """Store the location of the PDF to parse.

        Args:
            file_path: Path of the PDF file. The file is not opened here;
                it is opened each time ``pdf_reader`` runs.
        """
        self.file_path = file_path

    @staticmethod
    def _clean_text(raw: str) -> str:
        """Collapse whitespace in *raw* and remove "N of M" page counters."""
        # Every run of whitespace (including non-breaking spaces and
        # newlines) becomes a single space.
        flattened = ' '.join(raw.replace(u'\xa0', ' ').strip().split())
        # Only the counter itself is removed; the spaces that surrounded it
        # are left in place, so a doubled space can remain at that spot.
        return PdfParser._PAGE_COUNTER.sub('', flattened)

    def pdf_reader(self) -> str:
        """Return the text of every page as one cleaned-up string.

        Pages are extracted in order, joined, whitespace-normalized, and
        stripped of "N of M" page counters.

        Returns:
            The document text on a single line.

        Raises:
            OSError: If ``self.file_path`` cannot be opened for reading.
        """
        # NOTE(review): PdfFileReader / numPages / getPage / extractText are
        # the legacy PyPDF2 names. They are kept so behavior is unchanged on
        # the PyPDF2 version this file was written against; newer releases
        # rename them (PdfReader / pages / extract_text) -- confirm the
        # pinned version before upgrading.
        # Extraction happens inside the ``with`` block so the file is still
        # open while pages are read, and is always closed afterwards.
        with open(self.file_path, 'rb') as handle:
            pdf = PyPDF2.PdfFileReader(handle)
            pages = [pdf.getPage(i).extractText() for i in range(pdf.numPages)]
        return self._clean_text('\n'.join(pages))

    def date_extractor(self) -> list:
        """Return the distinct date mentions found in the PDF text.

        The text from ``pdf_reader`` is run through the spaCy
        ``en_core_web_lg`` pipeline, and the text of every entity labelled
        ``DATE`` is collected. Entities that start with a dotted quad of
        digits are discarded.

        Returns:
            A list of unique date strings in descending string order. The
            sort compares the raw strings, so the order is lexicographic,
            not chronological.
        """
        text = self.pdf_reader()
        # The pipeline is loaded on every call, as in the original code.
        nlp = spacy.load('en_core_web_lg')
        doc = nlp(text)
        dates = {ent.text for ent in doc.ents if ent.label_ == 'DATE'}
        kept = {date for date in dates if not self._DOTTED_QUAD.match(date)}
        return sorted(kept, reverse=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment