# eng-rodrigocunha/pdf_reduct.py

## pdf_reduct.py
#!pip install pdf-redactor

import re
from datetime import datetime, timezone

import pdf_redactor

## Set options.

options = pdf_redactor.RedactorOptions()

# Each metadata key maps to a list of filter functions; a filter receives the
# current field value and returns the value that should replace it.
options.metadata_filters = {
    # Perform some field filtering --- turn the Title into uppercase.
    # The None guard keeps this filter from raising AttributeError when no
    # value is passed in (presumably the case for a PDF without a Title ---
    # confirm against pdf_redactor).
    "Title": [lambda value: value.upper() if value is not None else None],

    # Set some values, overriding any value present in the PDF.
    "Producer": [lambda value: "Rodrigo Cunha"],
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since
    # Python 3.12 and returns a naive datetime.
    "CreationDate": [lambda value: datetime.now(timezone.utc)],

    # Clear all other fields.
    "DEFAULT": [lambda value: None],
}

# The single XMP filter discards whatever XMP document it is given.
options.xmp_filters = [lambda xml: None]

# Each content filter is a (compiled pattern, replacement function) pair; the
# function receives the match object and returns the replacement text.
#
# NOTE(review): the patterns are intentionally kept exactly as they were.
# "A-z" also spans the ASCII punctuation between "Z" and "a", and the "."
# separators in the RG patterns are unescaped (they match any character).
# Tightening either would stop redacting text that is redacted today, so
# confirm the intended scope before changing them.
options.content_filters = [
    (
        # A run of digits directly preceded by one character of the class,
        # a colon and a space becomes eleven "x" characters.
        re.compile(r"(?<=(([A-zÀ-ú]): ))(\d+)"),
        lambda m: "x" * 11,
    ),
    (
        # Three digit groups following the literal prefix "RG: MG-".
        re.compile(r"(?<=(RG: MG-))((\d+).(\d+).(\d+))"),
        lambda m: "xx.xxx.xxx",
    ),
    (
        # Same as above for the prefix written without the colon.
        re.compile(r"(?<=(RG MG-))((\d+).(\d+).(\d+))"),
        lambda m: "xx.xxx.xxx",
    ),
]

# File names are assigned here instead of stream objects: the source PDF and
# the redacted copy to produce.
options.input_stream = "TERMO_UTILIZACAO.pdf"
options.output_stream = "TERMO_UTILIZACAO_redact.pdf"

# Perform the redaction with the options configured above.
pdf_redactor.redactor(options)
	#!pip install pdf-redactor

	import re
	from datetime import datetime, timezone

	import pdf_redactor

	## Set options.

	options = pdf_redactor.RedactorOptions()

	# Each metadata key maps to a list of filter functions; a filter receives
	# the current field value and returns the value that should replace it.
	options.metadata_filters = {
		# Perform some field filtering --- turn the Title into uppercase.
		# The None guard keeps this filter from raising AttributeError when no
		# value is passed in (presumably the case for a PDF without a Title ---
		# confirm against pdf_redactor).
		"Title": [lambda value: value.upper() if value is not None else None],

		# Set some values, overriding any value present in the PDF.
		"Producer": [lambda value: "Rodrigo Cunha"],
		# Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since
		# Python 3.12 and returns a naive datetime.
		"CreationDate": [lambda value: datetime.now(timezone.utc)],

		# Clear all other fields.
		"DEFAULT": [lambda value: None],
	}

	# The single XMP filter discards whatever XMP document it is given.
	options.xmp_filters = [lambda xml: None]

	# Each content filter is a (compiled pattern, replacement function) pair;
	# the function receives the match object and returns the replacement text.
	#
	# NOTE(review): the patterns are intentionally kept exactly as they were.
	# "A-z" also spans the ASCII punctuation between "Z" and "a", and the "."
	# separators in the RG patterns are unescaped (they match any character).
	# Tightening either would stop redacting text that is redacted today, so
	# confirm the intended scope before changing them.
	options.content_filters = [
		(
			# A run of digits directly preceded by one character of the class,
			# a colon and a space becomes eleven "x" characters.
			re.compile(r"(?<=(([A-zÀ-ú]): ))(\d+)"),
			lambda m: "x" * 11,
		),
		(
			# Three digit groups following the literal prefix "RG: MG-".
			re.compile(r"(?<=(RG: MG-))((\d+).(\d+).(\d+))"),
			lambda m: "xx.xxx.xxx",
		),
		(
			# Same as above for the prefix written without the colon.
			re.compile(r"(?<=(RG MG-))((\d+).(\d+).(\d+))"),
			lambda m: "xx.xxx.xxx",
		),
	]

	# File names are assigned here instead of stream objects: the source PDF
	# and the redacted copy to produce.
	options.input_stream = "TERMO_UTILIZACAO.pdf"
	options.output_stream = "TERMO_UTILIZACAO_redact.pdf"

	# Perform the redaction with the options configured above.
	pdf_redactor.redactor(options)