# AnthonyZJiang/extract_pdf.py

## extract_pdf.py
import olefile
from zipfile import ZipFile
from glob import glob

# CLSID for Adobe Acrobat Document (compared against the OLE root entry)
ACROBAT_CLSID = "B801CA65-A1FC-11D0-85AD-444553540000"


def iter_embedded_pdfs(docx_path):
    """Yield the bytes of each Acrobat PDF embedded in a .docx file.

    The document is opened as a ZIP archive and every entry under
    ``word/embeddings/`` is examined.  An entry is yielded only when it is
    an OLE file whose root CLSID equals ``ACROBAT_CLSID``, it contains a
    ``CONTENTS`` stream, and that stream starts with ``%PDF-``.

    Args:
        docx_path: Path of the .docx file to scan.

    Yields:
        The raw ``CONTENTS`` stream of each embedded PDF, as ``bytes``.

    Raises:
        Any error raised by ``ZipFile`` or ``olefile`` while reading the
        document; nothing is caught here so the caller can report it.
    """
    with ZipFile(docx_path, "r") as archive:
        for entry in archive.infolist():
            if not entry.filename.startswith("word/embeddings/"):
                continue

            with archive.open(entry.filename) as embedded:
                if not olefile.isOleFile(embedded):
                    continue

                ole = olefile.OleFileIO(embedded)
                try:
                    if ole.root.clsid != ACROBAT_CLSID:
                        continue
                    if not ole.exists("CONTENTS"):
                        continue
                    pdf_data = ole.openstream("CONTENTS").read()
                finally:
                    # Runs on the skip paths too, so the OLE reader is
                    # always released.
                    ole.close()

            # Only hand back streams that really carry a %PDF- header
            if pdf_data[0:5] == b"%PDF-":
                yield pdf_data


# How many PDF documents have we saved
pdf_count = 0

# Loop through all the .docx files in the current folder
for filename in glob("*.docx"):
    try:
        for pdf_data in iter_embedded_pdfs(filename):
            pdf_filename = "Document %d.pdf" % (pdf_count + 1)

            # Save the PDF
            with open(pdf_filename, "wb") as output_file:
                output_file.write(pdf_data)

            # Count the PDF only after it has been written, so a failed
            # write neither skips a number nor inflates the final total.
            pdf_count += 1

    except Exception as error:
        # Best effort: report the failing document and move on to the next
        # one.  Unlike a bare ``except:``, this lets Ctrl-C and SystemExit
        # stop the script.
        print("Unable to open '%s': %s" % (filename, error))

print("Extracted %d PDF documents" % pdf_count)
	# Extraction script (repeated copy), re-indented so that Python can parse it.
import olefile
from zipfile import ZipFile
from glob import glob

# CLSID for Adobe Acrobat Document (compared against the OLE root entry)
ACROBAT_CLSID = "B801CA65-A1FC-11D0-85AD-444553540000"


def iter_embedded_pdfs(docx_path):
    """Yield the bytes of each Acrobat PDF embedded in a .docx file.

    The document is opened as a ZIP archive and every entry under
    ``word/embeddings/`` is examined.  An entry is yielded only when it is
    an OLE file whose root CLSID equals ``ACROBAT_CLSID``, it contains a
    ``CONTENTS`` stream, and that stream starts with ``%PDF-``.

    Args:
        docx_path: Path of the .docx file to scan.

    Yields:
        The raw ``CONTENTS`` stream of each embedded PDF, as ``bytes``.

    Raises:
        Any error raised by ``ZipFile`` or ``olefile`` while reading the
        document; nothing is caught here so the caller can report it.
    """
    with ZipFile(docx_path, "r") as archive:
        for entry in archive.infolist():
            if not entry.filename.startswith("word/embeddings/"):
                continue

            with archive.open(entry.filename) as embedded:
                if not olefile.isOleFile(embedded):
                    continue

                ole = olefile.OleFileIO(embedded)
                try:
                    if ole.root.clsid != ACROBAT_CLSID:
                        continue
                    if not ole.exists("CONTENTS"):
                        continue
                    pdf_data = ole.openstream("CONTENTS").read()
                finally:
                    # Runs on the skip paths too, so the OLE reader is
                    # always released.
                    ole.close()

            # Only hand back streams that really carry a %PDF- header
            if pdf_data[0:5] == b"%PDF-":
                yield pdf_data


# How many PDF documents have we saved
pdf_count = 0

# Loop through all the .docx files in the current folder
for filename in glob("*.docx"):
    try:
        for pdf_data in iter_embedded_pdfs(filename):
            pdf_filename = "Document %d.pdf" % (pdf_count + 1)

            # Save the PDF
            with open(pdf_filename, "wb") as output_file:
                output_file.write(pdf_data)

            # Count the PDF only after it has been written, so a failed
            # write neither skips a number nor inflates the final total.
            pdf_count += 1

    except Exception as error:
        # Best effort: report the failing document and move on to the next
        # one.  Unlike a bare ``except:``, this lets Ctrl-C and SystemExit
        # stop the script.
        print("Unable to open '%s': %s" % (filename, error))

print("Extracted %d PDF documents" % pdf_count)