Skip to content

Instantly share code, notes, and snippets.

@jlumbroso
Created November 3, 2023 21:59
Show Gist options
  • Save jlumbroso/3142108d7884d1c7d1dba1a4b10aa9af to your computer and use it in GitHub Desktop.
Save jlumbroso/3142108d7884d1c7d1dba1a4b10aa9af to your computer and use it in GitHub Desktop.
A Python script to recursively decrypt PDF files using `qpdf --decrypt`, handling files without passwords and overwriting the originals if decryption is successful.
#!/usr/bin/env python3
"""
decrypt_all_pdfs.py
Author: Jérémie Lumbroso <lumbroso@seas.upenn.edu>
Date: November 3, 2023
Description:
This script recursively finds all PDF files within a specified directory path,
checks if they are encrypted (without a password), and attempts to decrypt them
using `qpdf --decrypt`. It may require modifications if PDF files are password-protected.
This script is distributed under the MIT License.
Usage:
python decrypt_all_pdfs.py /path/to/pdf/directory
"""
import glob
import os
import shutil
import subprocess
import sys
import tempfile
# MIT License
# Copyright (c) 2023 Jérémie Lumbroso
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
def detect_pdf_decryption_status(pdf_path):
    """Report whether ``pdftotext`` can process a PDF without complaint.

    The PDF is copied into a throwaway directory and ``pdftotext`` is run on
    that copy, so neither the original file nor its directory is written to.

    Args:
        pdf_path: Path to the PDF file to probe.

    Returns:
        True when ``pdftotext`` produced its text file and wrote nothing to
        stderr; False otherwise. ``decrypt_all_pdfs`` treats True as "not
        encrypted or already decrypted".

    Raises:
        OSError: If the PDF cannot be copied (for example it does not exist)
            or ``pdftotext`` cannot be launched.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Fixed names inside the scratch directory: the input's own name can
        # then never collide with the output file or confuse the tool.
        temp_pdf_path = shutil.copy(pdf_path, os.path.join(tmpdirname, 'input.pdf'))
        text_file_path = os.path.join(tmpdirname, 'output.txt')

        # Pass the output path explicitly rather than relying on pdftotext's
        # own rule for deriving it from the input name, and capture stderr
        # in memory instead of going through a scratch file.
        result = subprocess.run(
            ['pdftotext', temp_pdf_path, text_file_path],
            stderr=subprocess.PIPE,
        )

        text_file_created = os.path.exists(text_file_path)
        stderr_output = bool(result.stderr)

        # The temporary directory is removed when the ``with`` block exits.
        return text_file_created and not stderr_output
def decrypt_pdf(pdf_path):
    """Decrypt a PDF in place using ``qpdf --decrypt``.

    The work happens on a copy inside a temporary directory; the original is
    only overwritten after qpdf has exited successfully and its output file
    exists.

    Args:
        pdf_path: Path to the PDF file to decrypt. Its contents are replaced
            by the decrypted version on success.

    Raises:
        RuntimeError: If ``qpdf`` is not found on the system PATH.
        subprocess.CalledProcessError: If ``qpdf`` exits with a non-zero
            status.
        FileNotFoundError: If ``pdf_path`` does not exist, or qpdf exited
            cleanly but produced no output file.
    """
    # Ensure qpdf is available before doing any file work.
    if shutil.which("qpdf") is None:
        raise RuntimeError("qpdf is not installed or not found in system PATH.")

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Work on a private copy so qpdf never touches the original directly.
        temp_pdf_path = shutil.copy(pdf_path, tmpdirname)
        decrypted_pdf_path = os.path.splitext(temp_pdf_path)[0] + '.decrypted.pdf'

        # NOTE(review): check=True treats every non-zero exit as failure.
        # qpdf's manual describes a "succeeded with warnings" exit status;
        # such files are left untouched here -- confirm that is intended.
        subprocess.run(
            ['qpdf', '--decrypt', temp_pdf_path, decrypted_pdf_path],
            check=True,
        )

        if not os.path.exists(decrypted_pdf_path):
            raise FileNotFoundError("Decryption failed, decrypted file not found.")

        # copyfile() transfers contents only. shutil.copy() would also copy
        # the permission bits of the freshly created qpdf output onto the
        # original, silently changing (often loosening) its file mode.
        shutil.copyfile(decrypted_pdf_path, pdf_path)
    # The temporary directory and its contents are cleaned up automatically.
def decrypt_all_pdfs(path):
    """Recursively decrypt, in place, every ``*.pdf`` file found under ``path``.

    Each candidate is first probed with ``detect_pdf_decryption_status``;
    only files that fail the probe are passed to ``decrypt_pdf``. Progress
    and per-file errors are printed, and a failure on one file never stops
    the rest of the batch.

    Args:
        path: Directory to search. Matching is done with the recursive glob
            pattern ``**/*.pdf``.

    Returns:
        None.
    """
    pattern = os.path.join(path, '**/*.pdf')
    for pdf_file in glob.glob(pattern, recursive=True):
        # The glob also matches directories and dangling symlinks whose name
        # ends in ".pdf"; copying those would raise and abort the whole run.
        if not os.path.isfile(pdf_file):
            print(f"Skipping (not a regular file): {pdf_file}")
            continue

        # Per-file boundary: report a failed probe (unreadable file, tool
        # cannot be launched, ...) and move on to the next file.
        try:
            already_readable = detect_pdf_decryption_status(pdf_file)
        except Exception as e:
            print(f"An error occurred while checking {pdf_file}: {e}")
            continue

        if already_readable:
            print(f"File is not encrypted or already decrypted: {pdf_file}")
            continue

        print(f"Decrypting: {pdf_file}")
        try:
            decrypt_pdf(pdf_file)
        except Exception as e:
            print(f"An error occurred while decrypting {pdf_file}: {e}")
        else:
            print(f"Decryption successful for: {pdf_file}")
# Script entry point: decrypt every PDF below the chosen directory.
if __name__ == "__main__":
    # Use the first command line argument if provided, otherwise default to
    # the current working directory.
    path_to_pdfs = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()

    # A missing or non-directory path would make the recursive glob match
    # nothing and the script exit silently; fail loudly instead.
    if not os.path.isdir(path_to_pdfs):
        sys.exit(f"Not a directory: {path_to_pdfs}")

    decrypt_all_pdfs(path_to_pdfs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment