Skip to content

Instantly share code, notes, and snippets.

@shandanjay
Forked from benmccloskey/pdf_parser
Created October 29, 2022 13:24
Show Gist options
  • Save shandanjay/3f1b4d46d9f884ca74882c5c99ec460a to your computer and use it in GitHub Desktop.
Save shandanjay/3f1b4d46d9f884ca74882c5c99ec460a to your computer and use it in GitHub Desktop.
import re
import PyPDF2
import spacy
class PdfParser:
    """Extract plain text and date-like entities from a PDF file.

    The file at ``file_path`` is read lazily: nothing is opened until
    ``pdf_reader`` (or ``date_extractor``, which calls it) is invoked.
    """

    # Matches page counters such as "3 of 12" so they can be stripped
    # from the extracted text.
    _PAGE_NUMBER_PATTERN = re.compile(r"\d{1,3} of \d{1,3}", re.IGNORECASE)

    # Matches strings that start with four dot-separated groups of 1-3 digits
    # (e.g. "192.168.0.1"); such strings are discarded from the date results.
    _DOTTED_QUAD_PATTERN = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')

    # spaCy entity labels that are treated as dates.
    _DATE_LABELS = ('DATE',)

    def __init__(self, file_path):
        """Remember the path of the PDF to parse.

        Args:
            file_path: Path of the PDF file; passed unchanged to ``open``.
        """
        self.file_path = file_path

    def pdf_reader(self) -> str:
        """Return the text of every page as one whitespace-normalized string.

        Non-breaking spaces are turned into regular spaces, all runs of
        whitespace (including page breaks) are collapsed to a single space,
        and "N of M" page counters are removed.

        Returns:
            The cleaned text of the whole document.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        # The context manager guarantees the handle is closed even when the
        # PDF library raises; extraction must happen while it is still open.
        with open(self.file_path, 'rb') as pdf_file:
            if hasattr(PyPDF2, 'PdfReader'):
                # Newer PyPDF2 releases expose PdfReader / pages /
                # extract_text and reject the legacy camelCase names.
                pdf = PyPDF2.PdfReader(pdf_file)
                page_texts = [page.extract_text() for page in pdf.pages]
            else:
                # Legacy PyPDF2 API, kept for old installations.
                pdf = PyPDF2.PdfFileReader(pdf_file)
                page_texts = [
                    pdf.getPage(index).extractText()
                    for index in range(pdf.numPages)
                ]

        content = '\n'.join(page_texts)
        # split() with no arguments collapses every whitespace run, so the
        # page separators above end up as single spaces.
        content = ' '.join(content.replace(u'\xa0', ' ').strip().split())
        return self._PAGE_NUMBER_PATTERN.sub('', content)

    def date_extractor(self) -> list:
        """Return the distinct DATE entities found in the PDF text.

        The text from ``pdf_reader`` is run through the spaCy
        ``en_core_web_lg`` pipeline (loaded on every call). Entities labelled
        ``DATE`` are de-duplicated, and entries that start with a dotted
        quad of digits are dropped (presumably IP-like strings mislabelled
        as dates; confirm against real documents).

        Returns:
            The remaining entity strings, sorted in descending
            lexicographic (string) order, not chronological order.
        """
        text = self.pdf_reader()
        nlp = spacy.load('en_core_web_lg')
        doc = nlp(text)
        dates = {ent.text for ent in doc.ents if ent.label_ in self._DATE_LABELS}
        filtered_dates = {
            date for date in dates
            if not self._DOTTED_QUAD_PATTERN.match(date)
        }
        return sorted(filtered_dates, reverse=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment