Skip to content

Instantly share code, notes, and snippets.

@gchristian
Last active May 24, 2021 18:24
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save gchristian/d17fc5bf566e7e510fd7fdacefa8221e to your computer and use it in GitHub Desktop.
Save gchristian/d17fc5bf566e7e510fd7fdacefa8221e to your computer and use it in GitHub Desktop.
parse pdf into smaller pdfs based on key value
# Split a PDF into smaller PDFs: a new output file starts on every page that contains the break-point phrase, and each file is named after the text that follows that phrase on the same line.
import PyPDF2
import pdfplumber
def group_pages_by_break(page_texts, break_point):
    """Group consecutive page indices into named sections.

    A page whose text contains *break_point* starts a new section; the
    section name is the text that follows the first occurrence of
    *break_point* on that page, up to the end of the line, stripped.

    Args:
        page_texts: Iterable of per-page text, in page order. An item may be
            ``None`` or empty (a page with no extractable text); such a page
            never starts a section and is kept with the current one.
        break_point: Phrase that marks the first page of a section.

    Yields:
        ``(name, page_indices)`` tuples in document order. Pages that appear
        before the first break point are folded into the first section, and
        nothing is yielded if no page contains *break_point*.
    """
    name = None
    indices = []
    for index, text in enumerate(page_texts):
        # Text extraction can yield None for image-only/blank pages; treating
        # that as "" avoids a TypeError on the membership test below.
        text = text or ""
        if break_point in text:
            # Close the running section only once it has a name; until then
            # the leading pages stay attached to the first named section.
            if indices and name is not None:
                yield name, indices
                indices = []
            name = text.partition(break_point)[2].partition("\n")[0].strip()
        indices.append(index)
    if indices and name is not None:
        yield name, indices


def split_pdf(pdf_path, break_point):
    """Write one ``<name>.pdf`` per section of *pdf_path*.

    Sections are determined by ``group_pages_by_break`` from the text
    pdfplumber extracts; pages are copied with PyPDF2. Output files are
    written to the current working directory, and a later section with the
    same name overwrites an earlier one.

    Args:
        pdf_path: Path of the PDF to split.
        break_point: Phrase that marks the first page of a section.
    """
    reader = PyPDF2.PdfFileReader(pdf_path)
    with pdfplumber.open(pdf_path) as pdf:
        # Generator keeps extraction lazy, so each file is written as soon as
        # its section is complete.
        page_texts = (page.extract_text() for page in pdf.pages)
        for name, indices in group_pages_by_break(page_texts, break_point):
            writer = PyPDF2.PdfFileWriter()
            for index in indices:
                writer.addPage(reader.getPage(index))
            # NOTE(review): name comes verbatim from the PDF text; it is not
            # sanitised, so path separators or an empty value end up in the
            # output path as-is.
            with open(name + ".pdf", 'wb') as fh:
                writer.write(fh)


if __name__ == '__main__':
    pdf_path = 'MBA Report Creator.pdf'
    pdf_break_point = 'Student_Number '
    split_pdf(pdf_path, pdf_break_point)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment