Last active
October 10, 2023 20:39
-
-
Save Jonty/7c0e29119c05d0591375fe8e94d5708f to your computer and use it in GitHub Desktop.
The Deredactatron: De-redacts PDFs redacted by drawing rectangles over the text. Needs Python 3 and PyMuPDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# De-redacts PDFs that have been redacted by drawing PDF-native rectangles over the text.
# This removes ALL rectangles, so it might also nuke any shape diagrams a bit.
# python deredactatron.py a_redacted_doc.pdf
# Outputs: deredacted-a_redacted_doc.pdf
import os
import re
import sys

import fitz  # PyMuPDF==1.17.5
# Matches one PDF rectangle-path operator together with its four numeric
# operands and the line ending, e.g. `36.95 806.58 149.32 -14.6 re\n`.
# It is a *bytes* pattern because content streams are binary data and are not
# guaranteed to be valid UTF-8.
RECT_OP = re.compile(
    rb"-?\d+(\.\d+)? -?\d+(\.\d+)? -?\d+(\.\d+)? -?\d+(\.\d+)? re\r?\n"
)


def strip_rectangles(stream):
    """Return the content stream *stream* with every rectangle operator removed.

    Args:
        stream: Raw (decompressed) bytes of a PDF page content stream.

    Returns:
        A new bytes object in which every `x y w h re` line has been deleted.
        All other bytes are passed through untouched.
    """
    return RECT_OP.sub(b"", stream)


def deredacted_path(path):
    """Return the output path for the input file *path*.

    The `deredacted-` prefix is applied to the file name only, so an input
    such as `docs/a.pdf` is written to `docs/deredacted-a.pdf` instead of the
    nonexistent directory `deredacted-docs/`.
    """
    directory, filename = os.path.split(path)
    return os.path.join(directory, "deredacted-" + filename)


def main(argv):
    """Remove every drawn rectangle from the PDF named by `argv[1]`.

    Args:
        argv: Command-line argument list; `argv[1]` is the PDF to process.

    Exits with a usage message when no input file is given.
    """
    if len(argv) < 2:
        sys.exit("usage: python deredactatron.py a_redacted_doc.pdf")

    source = argv[1]
    doc = fitz.open(source)
    try:
        for page in doc:
            # A page may be made of several content streams; rewrite each one.
            for xref in page._getContents():
                doc.updateStream(xref, strip_rectangles(doc.xrefStream(xref)))
        doc.save(deredacted_path(source), clean=True)
    finally:
        doc.close()


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment