Created
December 15, 2018 20:21
-
-
Save AhmedSamara/dcfec9aa7cf4fd7b5ce40fead2205b90 to your computer and use it in GitHub Desktop.
question_parser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Copy every page of presentations.pdf that mentions "question" to questions.pdf.

Each page is written to a scratch PDF (buffer.pdf), converted to text with the
external ``pdftotext`` command (buffer.txt), and kept when that text contains
the word "question" in any letter case.
"""
import subprocess

import PyPDF2

# Context managers close every file even when a page fails part-way through.
# The input stays open until the output has been written, matching the order
# of the original script (PyPDF2 pages presumably still reference the input
# stream at write time -- confirm before reordering).
with open('presentations.pdf', 'rb') as pdfIn:
    reader = PyPDF2.PdfFileReader(pdfIn)
    writer = PyPDF2.PdfFileWriter()

    for i in range(reader.numPages):
        page = reader.getPage(i)

        # page.extractText() didn't work here, so pdftotext is used instead.
        # Get this page as its own file so it can be parsed.
        bufWriter = PyPDF2.PdfFileWriter()
        bufWriter.addPage(page)
        with open('buffer.pdf', 'wb') as bufFile:
            bufWriter.write(bufFile)

        # Put the text of the page into a txt file. check_call raises
        # CalledProcessError on a non-zero exit status, so a failed conversion
        # can no longer leave the previous page's buffer.txt behind to be
        # matched by mistake.
        subprocess.check_call(['pdftotext', 'buffer.pdf', 'buffer.txt'])

        # If that .txt contains "question" then keep the page.
        with open('buffer.txt', 'r') as bufTxt:
            pageTxt = bufTxt.read()
        if "question" in pageTxt.lower():
            writer.addPage(page)

    with open('questions.pdf', 'wb') as fileOutput:
        writer.write(fileOutput)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment