Skip to content

Instantly share code, notes, and snippets.

@AnthonyZJiang
Created March 11, 2024 11:22
Show Gist options
  • Save AnthonyZJiang/a99391f383ec045bdac9a50f75229469 to your computer and use it in GitHub Desktop.
Save AnthonyZJiang/a99391f383ec045bdac9a50f75229469 to your computer and use it in GitHub Desktop.
Python script to extract PDF files embedded in Microsoft Word *.docx files
import io
from glob import glob
from zipfile import ZipFile

import olefile
# CLSID stored on the root entry of an embedded "Adobe Acrobat Document" OLE object
ACROBAT_CLSID = "B801CA65-A1FC-11D0-85AD-444553540000"


def iter_embedded_pdfs(docx_path):
    """Yield the raw bytes of each PDF embedded in the .docx file at *docx_path*.

    The document is opened as a ZIP archive and only entries under
    ``word/embeddings/`` are inspected. An entry is yielded when it is an OLE
    file whose root CLSID is the Adobe Acrobat Document CLSID, it has a
    ``CONTENTS`` stream, and that stream starts with ``%PDF-``. Everything
    else is skipped silently.

    Errors raised while opening or reading the archive, or while parsing an
    OLE entry, propagate to the caller.
    """
    with ZipFile(docx_path, "r") as archive:
        for entry in archive.infolist():
            if not entry.filename.startswith("word/embeddings/"):
                continue
            # The OLE parser needs random access to the data, so hand it an
            # in-memory copy rather than the archive's member stream.
            ole_stream = io.BytesIO(archive.read(entry))
            if not olefile.isOleFile(ole_stream):
                continue
            # The context manager closes the OLE object on every path,
            # including the early `continue`s below.
            with olefile.OleFileIO(ole_stream) as ole:
                if ole.root.clsid != ACROBAT_CLSID:
                    continue
                if not ole.exists("CONTENTS"):
                    continue
                pdf_data = ole.openstream("CONTENTS").read()
            # Only accept payloads that really carry a PDF header
            if pdf_data[:5] == b"%PDF-":
                yield pdf_data


# How many PDF documents have we saved
pdf_count = 0

# Loop through all the .docx files in the current folder
for filename in glob("*.docx"):
    try:
        for pdf_data in iter_embedded_pdfs(filename):
            pdf_filename = "Document %d.pdf" % (pdf_count + 1)
            # Save the PDF
            with open(pdf_filename, "wb") as output_file:
                output_file.write(pdf_data)
            # Count only after the write succeeded so the total stays accurate
            pdf_count += 1
    except Exception as exc:
        # Best effort: one bad document must not stop the others. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through,
        # and the reason is reported instead of being discarded.
        print("Unable to open '%s': %s" % (filename, exc))

print("Extracted %d PDF documents" % pdf_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment