Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Demonstration of how to extract attachments from PDF files using Python 3 and PyPDF2.
import PyPDF2
def getAttachments(reader):
    """
    Retrieves the file attachments of the PDF as a dictionary of file names
    and the file data as a bytestring.

    The embedded-files name tree is walked recursively, so both a flat
    ``/Names`` array and a tree split into ``/Kids`` sub-nodes are handled.
    Entries whose file specification carries no embedded stream
    (no ``/EF`` -> ``/F``) are skipped. If the same name occurs more than
    once, the last occurrence wins.

    :param reader: object exposing ``trailer["/Root"]`` (the document
        catalog), e.g. a ``PyPDF2.PdfFileReader``
    :return: dictionary of filenames and bytestrings; empty when the
        document declares no embedded files
    """
    def dereference(obj):
        # Array elements are used as stored, so an indirect reference has to
        # be followed by hand; objects without getObject() pass through.
        getter = getattr(obj, "getObject", None)
        return getter() if callable(getter) else obj

    def collect(node, found):
        # A name-tree node carries a /Names array of alternating
        # (name, file specification) entries and/or /Kids sub-nodes.
        if "/Names" in node:
            entries = node["/Names"]
            # Pair entries positionally instead of searching with .index():
            # that lookup is quadratic and always resolves a repeated name
            # to its first occurrence.
            for name, spec in zip(entries[::2], entries[1::2]):
                if not isinstance(name, str):
                    continue
                spec = dereference(spec)
                if "/EF" in spec and "/F" in spec["/EF"]:
                    found[name] = spec["/EF"]["/F"].getData()
        if "/Kids" in node:
            for kid in node["/Kids"]:
                collect(dereference(kid), found)

    attachments = {}
    catalog = reader.trailer["/Root"]
    # A document without attachments simply lacks these keys.
    if "/Names" in catalog and "/EmbeddedFiles" in catalog["/Names"]:
        collect(catalog["/Names"]["/EmbeddedFiles"], attachments)
    return attachments
# Demo driver: replace 'YOURPDFPATH' with the path of the PDF to inspect.
# The attachments are read while the file is open; the context manager then
# closes the handle instead of leaking it.
with open('YOURPDFPATH', 'rb') as handler:
    reader = PyPDF2.PdfFileReader(handler)
    dictionary = getAttachments(reader)
print(dictionary)

# Write every attachment into the current working directory.
for fName, fData in dictionary.items():
    # Attachment names come from the PDF itself and cannot be trusted: keep
    # only the final path component (either separator style) so that a name
    # such as '../../x' cannot write outside the working directory.
    safeName = fName.replace('\\', '/').rsplit('/', 1)[-1]
    if safeName in ('', '.', '..'):
        # Nothing usable is left to serve as a file name.
        continue
    with open(safeName, 'wb') as outfile:
        outfile.write(fData)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment