-
-
Save wecacuee/f1e92d421312b7c7c1907667f4f3a318 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""Anonymize a PDF by blanking document metadata and annotation info.

Usage:
    python <this script> INPUT.pdf [OUTPUT.pdf]

If OUTPUT.pdf is omitted, the result is written next to the input with
'.anon' inserted before the file extension (e.g. 'paper.pdf' becomes
'paper.anon.pdf').
"""
try:
    import fitz
except ImportError:
    import subprocess
    # Best-effort install; the original error is still re-raised so the
    # user re-runs the script once the package is available.
    print('Trying to install pip install PyMuPDF')
    subprocess.call("pip install PyMuPDF".split())
    print('Try pip install PyMuPDF')
    raise

import os
import sys

if len(sys.argv) < 2:
    raise ValueError("Please provide a pdf to anonymize")

# The input path must be read before the default output name is derived
# from it (previously it was assigned afterwards, causing a NameError).
filename = sys.argv[1]
if len(sys.argv) < 3:
    # Only touch the final extension: str.replace would also rewrite
    # '.pdf' inside directory names and would return the input path
    # unchanged when the extension is not exactly '.pdf'.
    root, ext = os.path.splitext(filename)
    outfilename = root + '.anon' + ext
else:
    outfilename = sys.argv[2]

doc = fitz.open(filename)

# Blank every document-level metadata entry except the ones listed here.
METADATA_TO_KEEP = ['format']  # retain some metadata
metadata = doc.metadata
for k in metadata:
    if k not in METADATA_TO_KEEP:
        metadata[k] = ''
doc.set_metadata(metadata)
# NOTE(review): this only rewrites the entries exposed via doc.metadata;
# XML (XMP) metadata, if present, is presumably left untouched -- confirm
# against the PyMuPDF docs whether it should be deleted as well.

# Annotation info fields listed in FIELDS_TO_KEEP are preserved; every
# other field is overwritten with its entry in FIELD_DEFAULTS, falling
# back to FIELD_DEFAULTS['other'] (empty strings were reported not to
# change the annotations, hence the non-empty placeholders).
FIELDS_TO_KEEP = ['content']
FIELD_DEFAULTS = {'modDate': "D:20000000000000-00'00'",
                  'other': 'X'}

for page in doc:
    for annot in page.annots():
        info = annot.info
        for k in info:
            # remove all annotation info other than the content
            if k not in FIELDS_TO_KEEP:
                info[k] = FIELD_DEFAULTS.get(k, FIELD_DEFAULTS['other'])
        annot.set_info(info)
        annot.update()

# Dump what is left so the user can verify nothing identifying remains.
print("Printing annotations. Check for retained info in the pdf.")
for page in doc:
    for annot in page.annots():
        print(annot.info)

doc.save(outfilename)
MIT License | |
Copyright (c) 2024 Vikas Dhiman | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. |
I came across this gist because I had the same problem as the OP in https://softwarerecs.stackexchange.com/questions/31971/pdf-anonymizer-remove-potentially-identifying-metadata and, in particular, wanted to remove author information from all annotations, which is how I ended up at this gist.
I can reproduce the first problem reported by @jellepoland and found more bugs in the code. However, I don't know what @jellepoland means by his second bullet. Could you elaborate on this problem?
- the code does remove the names from annotations made inside a document viewer pdf
I am able to review the code and fix the problems. I am running Ubuntu 22.04.5 LTS (Ubuntu Budgie), with Python 3.10.12. This should also work for @jellepoland with Fedora.
@wecacuee could you please add a license to this gist? I would be glad to fork and send pull requests, provided it is FOSS licensed.
@vanschnapen I added MIT License. Please feel free to fork and update.
Updated to remove all info other than the content from annotations.
Thanks @wecacuee! With your commit 5d06efff181f5b99dea6234d01df617c21ecc14b the most urgent fix is done. Empty strings, such as the earlier title="", were not accepted and left the annotations unchanged.
I will continue with my changes in the fork and inform you.
On Linux (Fedora) the script is not working as expected:
- `frontend` and `fritz` had to be installed manually.