Skip to content

Instantly share code, notes, and snippets.

@mbstacy
Created December 12, 2019 23:14
Show Gist options
  • Save mbstacy/8c25e554ee47a95af9dca0ef5c42ef59 to your computer and use it in GitHub Desktop.
Save mbstacy/8c25e554ee47a95af9dca0ef5c42ef59 to your computer and use it in GitHub Desktop.
import PyPDF2
from sys import argv
def getTextPdf(filename):
    '''
    Return the text of every page of the PDF at ``filename`` as one string.

    The file is opened in binary mode and parsed with
    ``PyPDF2.PdfFileReader``; each page's ``extractText()`` result is
    concatenated in page order with no separator between pages.

    NOTE(review): ``PdfFileReader`` / ``numPages`` / ``getPage`` /
    ``extractText`` are the legacy camelCase PyPDF2 names -- confirm the
    installed PyPDF2 version still provides them before upgrading.
    '''
    with open(filename, 'rb') as stream:
        reader = PyPDF2.PdfFileReader(stream)
        # Pull the text out page by page, in order, while the file is open.
        page_texts = [
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        ]
        return "".join(page_texts)
def getFirstPageTextPdf(filename):
    '''
    Return the text extracted from the first page of the PDF at ``filename``.

    Only page index 0 is requested from ``PyPDF2.PdfFileReader``; the rest
    of the document is never asked for.
    '''
    with open(filename, 'rb') as stream:
        # Extraction happens while the underlying file is still open.
        first_page = PyPDF2.PdfFileReader(stream).getPage(0)
        return first_page.extractText()
if __name__ == "__main__":
    # Command-line entry point: print "TRUE: <path>" when the first page of
    # the PDF named by the first argument begins with "Authors".
    if len(argv) < 2:
        # Exit with a usage message instead of an IndexError traceback when
        # no PDF path was supplied.
        raise SystemExit("usage: {0} FILE.pdf".format(argv[0]))
    pdf_path = argv[1]
    first_page_text = getFirstPageTextPdf(pdf_path)
    # Only the first nine characters are searched, so "Authors" (7 chars)
    # may be preceded by at most two other characters.
    if "Authors" in first_page_text[:9]:
        print("TRUE: {0}".format(pdf_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment