Skip to content

Instantly share code, notes, and snippets.

@AhmedSamara
Created December 15, 2018 20:21
Show Gist options
  • Save AhmedSamara/dcfec9aa7cf4fd7b5ce40fead2205b90 to your computer and use it in GitHub Desktop.
Save AhmedSamara/dcfec9aa7cf4fd7b5ce40fead2205b90 to your computer and use it in GitHub Desktop.
question_parser.py
import PyPDF2
import subprocess
# Copy every page of 'presentations.pdf' whose text mentions "question"
# (case-insensitive) into a new file, 'questions.pdf'.
#
# Each page is written on its own to the scratch file 'buffer.pdf', which the
# external `pdftotext` command converts to 'buffer.txt'; that text is what gets
# searched. Both scratch files are created in the current working directory
# and are overwritten on every iteration.
#
# Raises subprocess.CalledProcessError if `pdftotext` exits with a non-zero
# status for any page.
with open('presentations.pdf', 'rb') as pdfIn:
    reader = PyPDF2.PdfFileReader(pdfIn)
    writer = PyPDF2.PdfFileWriter()

    for i in range(reader.numPages):
        page = reader.getPage(i)

        # page.extractText() did not give usable results, so the page is
        # exported as its own single-page PDF and converted with pdftotext.
        bufWriter = PyPDF2.PdfFileWriter()
        bufWriter.addPage(page)
        with open('buffer.pdf', 'wb') as bufFile:
            bufWriter.write(bufFile)

        # Put the text of the page into a txt file. check_call (rather than
        # call) stops on a failed conversion; otherwise buffer.txt would still
        # hold the previous page's text and this page would be classified
        # using stale data.
        subprocess.check_call(['pdftotext', 'buffer.pdf', 'buffer.txt'])

        with open('buffer.txt', 'r') as bufTxt:
            pageTxt = bufTxt.read()

        # Keep the page only if its text contains "question".
        if "question" in pageTxt.lower():
            writer.addPage(page)

    # Write the result while the input file is still open, matching the
    # original ordering (input closed only after the output was written).
    # NOTE(review): PyPDF2 page objects appear to read from the source stream
    # lazily, so closing pdfIn before this write would likely fail - confirm.
    with open('questions.pdf', 'wb') as fileOutput:
        writer.write(fileOutput)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment