Skip to content

Instantly share code, notes, and snippets.

@welmends
Last active August 24, 2022 20:35
Show Gist options
  • Save welmends/49bd9eecf46060b89620b807f9f5e8f4 to your computer and use it in GitHub Desktop.
Save welmends/49bd9eecf46060b89620b807f9f5e8f4 to your computer and use it in GitHub Desktop.
Tool to swap text from pdf files using Python
'''
config.yml
    path: /path/to/folder
    text1:
        old: "xxxxx"
        new: "yyyyy"
'''
'''
my_config.yml
    path: /path/to/folder
    text1:
        old: "(1602975)"
        new: "(0000000)"
    text2:
        old: "(06983770366\\040\\055\\040João\\040Wellington\\040Mendes\\040de\\040Souza)"
        new: "()"
'''
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
import os
import codecs
import yaml
PATH = ""
TEXTS = []
def create_wm_file():
    """Create ``wm.pdf`` in the current directory.

    The file is drawn with a reportlab canvas and contains a single filled
    1x1 rectangle anchored at the origin.
    """
    watermark = canvas.Canvas("wm.pdf")
    watermark.rect(0, 0, 1, 1, fill=1)
    watermark.save()
def delete_wm_file():
    """Delete ``wm.pdf`` from the current directory.

    Raises:
        FileNotFoundError: if ``wm.pdf`` does not exist.
    """
    # os.unlink is the same operation as os.remove under its POSIX name.
    os.unlink("wm.pdf")
def generate_editable_pdf(filepath):
    """Write a copy of ``filepath`` to ``tmp.pdf`` with ``wm.pdf`` merged in.

    Page 0 of ``wm.pdf`` is merged onto every page of the input document and
    the result is written to ``tmp.pdf`` in the current directory.
    NOTE(review): the merge presumably exists to make PyPDF2 rewrite each
    page's content so that a later textual search/replace can match it --
    confirm against PyPDF2 behaviour.

    Args:
        filepath: path of the PDF to read. The file itself is not modified.
    """
    # Both inputs are opened with context managers so their handles are
    # released when this function returns; a handle left open can prevent the
    # caller from replacing or deleting the source file afterwards.
    with open("wm.pdf", "rb") as wm_stream, open(filepath, "rb") as input_stream:
        wm = PdfFileReader(wm_stream)
        output_file = PdfFileWriter()
        input_file = PdfFileReader(input_stream)

        for page_number in range(input_file.getNumPages()):
            input_page = input_file.getPage(page_number)
            input_page.mergePage(wm.getPage(0))
            output_file.addPage(input_page)

        # PyPDF2 readers pull objects from their streams on demand, so the
        # output must be written while both input streams are still open.
        with open("tmp.pdf", "wb") as output_stream:
            output_file.write(output_stream)
def remove_refs_from_pdf(pdf_path="tmp.pdf", texts=None):
    """Apply every (old, new) text replacement to a PDF file in place.

    The file is read as raw bytes and decoded as ISO-8859-1, which maps each
    byte to exactly one character, so the content round-trips unchanged
    except for the replaced text.

    Args:
        pdf_path: file to rewrite. Defaults to ``tmp.pdf``.
        texts: iterable of ``[old, new]`` pairs. Defaults to the module-level
            ``TEXTS`` list.

    Raises:
        UnicodeEncodeError: if a replacement contains a character that cannot
            be encoded as ISO-8859-1. The file is left untouched in that case.
    """
    if texts is None:
        texts = TEXTS

    # get data
    with open(pdf_path, "rb") as pdf:
        data = pdf.read().decode("ISO8859-1")

    # update data
    for old, new in texts:
        data = data.replace(old, new)

    # Encode before reopening for writing: opening with "wb" truncates the
    # file, so an encoding failure after that point would destroy it.
    encoded = data.encode("ISO8859-1")

    # write updated data
    with open(pdf_path, "wb") as pdf:
        pdf.write(encoded)
def count_total_valid_files(files, base_path=None):
    """Return the entries of ``files`` that are existing PDF files.

    Despite its name, this returns the list of matching paths, not a count.

    Args:
        files: iterable of paths, either relative to ``base_path`` or already
            usable as given.
        base_path: directory that entries may be relative to. Defaults to the
            module-level ``PATH``.

    Returns:
        A list of paths, in input order, that point to regular files with a
        ``.pdf`` extension (matched case-insensitively).
    """
    if base_path is None:
        base_path = PATH

    files_updated = []
    for f in files:
        filepath = os.path.join(base_path, f)
        if not os.path.isfile(filepath):
            # The entry may already include the base directory (e.g. paths
            # built from os.walk on a relative root); joining again would
            # produce a non-existent "root/root/..." path, so use it as given.
            filepath = f
        _, ext = os.path.splitext(filepath)
        if os.path.isfile(filepath) and ext.lower() == ".pdf":
            files_updated.append(filepath)
    return files_updated
def read_config_file(config_path="config.yml"):
    """Load the YAML config into the module-level ``PATH`` and ``TEXTS``.

    The config must be a mapping with a ``path`` key; every other key must map
    to an entry with ``old`` and ``new`` values. ``PATH`` is set to the
    ``path`` value and ``TEXTS`` is filled with one ``[old, new]`` pair per
    remaining key, in file order.

    Args:
        config_path: YAML file to read. Defaults to ``config.yml``.

    Raises:
        SystemExit: with status -1, after printing the expected layout, if the
            file cannot be read, is not valid YAML, or lacks the structure
            described above.
    """
    global PATH, TEXTS
    import sys  # local import: keeps this block independent of the file header

    try:
        with open(config_path, "r") as stream:
            yaml_data = yaml.safe_load(stream)
        path = yaml_data["path"]
        texts = [
            [entry["old"], entry["new"]]
            for key, entry in yaml_data.items()
            if key != "path"
        ]
    # Only the failures a bad or missing config can cause are handled here;
    # a bare ``except`` would also swallow KeyboardInterrupt and SystemExit.
    except (OSError, yaml.YAMLError, KeyError, TypeError, AttributeError) as err:
        print("Error: config file must have the following structure")
        print("config.yml")
        print(" path: /path/to/folder")
        print(" text1:")
        print(" old: 'xxxxxx'")
        print(" new: 'yyyyyy'")
        print("...")
        print("Reason: {!r}".format(err))
        sys.exit(-1)

    PATH = path
    # Replace the contents in place so repeated calls do not accumulate
    # duplicate pairs, while existing references to the list stay valid.
    TEXTS[:] = texts
if __name__ == "__main__":
    # Entry point: load the config, collect every PDF under PATH (recursively)
    # and rewrite each one in place with the configured text replacements.
    read_config_file()
    files = [
        os.path.join(dp, f)
        for dp, _, fn in os.walk(os.path.expanduser(PATH))
        for f in fn
    ]
    files = count_total_valid_files(files)

    create_wm_file()
    try:
        for counter, filepath in enumerate(files, start=1):
            print("> editing file [{}/{}]".format(counter, len(files)))
            generate_editable_pdf(filepath)
            remove_refs_from_pdf()
            # Overwrite the original only after the edited copy is complete.
            os.replace("tmp.pdf", filepath)
    finally:
        # Remove the helper PDF even when processing a file fails.
        delete_wm_file()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment