Skip to content

Instantly share code, notes, and snippets.

@eng-rodrigocunha
Created February 18, 2023 17:57
Show Gist options
  • Save eng-rodrigocunha/34731505fdd8624c53df19143ab57868 to your computer and use it in GitHub Desktop.
Save eng-rodrigocunha/34731505fdd8624c53df19143ab57868 to your computer and use it in GitHub Desktop.
Redact sensitive content from a PDF
#!pip install pdf-redactor
import re
from datetime import datetime, timezone

import pdf_redactor
## Set options.
options = pdf_redactor.RedactorOptions()

# Metadata filters: each key names a field of the PDF's Document Information
# dictionary and maps to a list of functions that receive the current value
# and return the replacement (returning None drops the field).
options.metadata_filters = {
    # Turn the Title into uppercase. The guard matters because a PDF may have
    # no Title at all, in which case the filter is handed None and a bare
    # `.upper()` call would raise AttributeError.
    "Title": [lambda value: value.upper() if value is not None else None],
    # Set some values, overriding any value present in the PDF.
    "Producer": [lambda value: "Rodrigo Cunha"],
    # Timezone-aware UTC timestamp; `datetime.utcnow()` is deprecated and
    # returns a naive value that readers may interpret as local time.
    "CreationDate": [lambda value: datetime.now(timezone.utc)],
    # Clear all other fields.
    "DEFAULT": [lambda value: None],
}

# Discard any XMP (XML) metadata.
options.xmp_filters = [lambda xml: None]

# Content filters: (compiled pattern, replacement function) pairs applied to
# the text of the document. The look-behinds keep the label itself out of the
# match so that only the number following it is replaced.
options.content_filters = [
    (
        # Any run of digits that follows "<letter>: " (e.g. "CPF: 123...").
        # The class is spelled `A-Za-z` because the range `A-z` would also
        # accept the punctuation characters between "Z" and "a".
        re.compile(r"(?<=(([A-Za-zÀ-ú]): ))(\d+)"),
        lambda m: "x" * 11,
    ),
    (
        # Number that follows "RG: MG-".
        # NOTE(review): the unescaped `.` separators match any character, not
        # only a literal dot; left as-is so that numbers written with other
        # separators are still redacted.
        re.compile(r"(?<=(RG: MG-))((\d+).(\d+).(\d+))"),
        lambda m: "xx.xxx.xxx",
    ),
    (
        # Same as above for the label written without the colon ("RG MG-").
        re.compile(r"(?<=(RG MG-))((\d+).(\d+).(\d+))"),
        lambda m: "xx.xxx.xxx",
    ),
]

# Source PDF and destination for the redacted copy, given as file names.
options.input_stream = "TERMO_UTILIZACAO.pdf"
options.output_stream = "TERMO_UTILIZACAO_redact.pdf"

# Perform the redaction, reading the input file and writing the output file.
pdf_redactor.redactor(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment