Last active
August 24, 2022 20:35
-
-
Save welmends/49bd9eecf46060b89620b807f9f5e8f4 to your computer and use it in GitHub Desktop.
Tool to swap text from pdf files using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
config.yml
    path: /path/to/folder
    text1:
        old: "xxxxx"
        new: "yyyyy"
'''
''' | |
my_config.yml | |
path: /path/to/folder | |
text1: | |
old: "(1602975)" | |
new: "(0000000)" | |
text2: | |
old: "(06983770366\\040\\055\\040João\\040Wellington\\040Mendes\\040de\\040Souza)" | |
new: "()" | |
''' | |
from PyPDF2 import PdfFileWriter, PdfFileReader | |
from reportlab.pdfgen import canvas | |
import os | |
import codecs | |
import yaml | |
PATH = "" | |
TEXTS = [] | |
def create_wm_file():
    """Write the helper PDF "wm.pdf" into the current working directory.

    The file is a single reportlab canvas holding one filled 1x1 rectangle
    drawn at the origin. generate_editable_pdf() merges its first page onto
    every page of the document being processed.
    """
    watermark = canvas.Canvas("wm.pdf")
    # Filled rectangle of width 1 and height 1 anchored at (0, 0).
    watermark.rect(0, 0, 1, 1, fill=1)
    watermark.save()
def delete_wm_file():
    """Remove the helper file "wm.pdf" from the current working directory.

    Any error from os.remove (for example when the file does not exist)
    propagates to the caller.
    """
    wm_path = "wm.pdf"
    os.remove(wm_path)
def generate_editable_pdf(filepath):
    """Re-serialize the PDF at *filepath* into "tmp.pdf" with "wm.pdf" merged in.

    Every page of the input is merged with page 0 of "wm.pdf" (which must
    already exist in the current working directory) and added to a new
    PyPDF2 writer. The result is written to "tmp.pdf" in the current working
    directory; the input file itself is not modified here.

    Args:
        filepath: Path of the PDF to read.
    """
    # Context managers guarantee both input handles are closed once the
    # output is written. A handle left open on `filepath` can prevent the
    # caller from replacing that file afterwards (notably on Windows).
    with open("wm.pdf", "rb") as wm_stream, open(filepath, "rb") as input_stream:
        # The merged-in page is the same for every input page, so fetch it once.
        wm_page = PdfFileReader(wm_stream).getPage(0)
        input_file = PdfFileReader(input_stream)
        output_file = PdfFileWriter()
        for page_number in range(input_file.getNumPages()):
            input_page = input_file.getPage(page_number)
            input_page.mergePage(wm_page)
            output_file.addPage(input_page)
        # Write while the inputs are still open: the writer may still need to
        # pull page data from the source streams at this point.
        with open("tmp.pdf", "wb") as output_stream:
            output_file.write(output_stream)
def remove_refs_from_pdf():
    """Apply every [old, new] pair in TEXTS to the raw contents of "tmp.pdf".

    The file in the current working directory is read as bytes and decoded
    as ISO-8859-1, so each byte maps to exactly one character and decoding
    cannot fail. Each pair from the module-level TEXTS list is then applied
    in order with str.replace, and the result is encoded back to ISO-8859-1
    and written over "tmp.pdf".

    Raises:
        UnicodeEncodeError: if a replacement introduces a character that
            ISO-8859-1 cannot represent. "tmp.pdf" is left untouched then.
    """
    # Read the whole file; the context manager closes it even on error.
    with open("tmp.pdf", "rb") as source:
        data = source.read().decode("ISO8859-1")

    # Plain substring replacement over the entire file content.
    # NOTE(review): a replacement whose length differs from the original
    # changes byte positions inside the PDF - confirm the target viewers
    # tolerate that.
    for pair in TEXTS:
        data = data.replace(pair[0], pair[1])

    # Encode before reopening for writing: opening with "wb" truncates the
    # file, so a failed encode must happen first to avoid losing the content.
    encoded = data.encode("ISO8859-1")
    with open("tmp.pdf", "wb") as target:
        target.write(encoded)
def count_total_valid_files(files):
    """Return the entries of *files* that are existing ".pdf" files.

    Each entry is joined onto the module-level PATH with os.path.join. A
    joined path is kept when it names an existing regular file and its
    extension is exactly ".pdf" (the comparison is case-sensitive).

    Despite the name, the result is the list of joined paths, not a count.
    """
    def _is_pdf_file(candidate):
        # Same order as before: existence check first, then the extension.
        return os.path.isfile(candidate) and os.path.splitext(candidate)[1] == ".pdf"

    joined_paths = (os.path.join(PATH, name) for name in files)
    return [candidate for candidate in joined_paths if _is_pdf_file(candidate)]
def read_config_file():
    """Load "config.yml" from the current directory into PATH and TEXTS.

    The value of the top-level "path" key is stored in the global PATH.
    Every other top-level key must map to a mapping with "old" and "new"
    entries; each one is appended to the global TEXTS list as a two-item
    list [old, new], in file order.

    Raises:
        SystemExit: with code -1, after printing the expected layout, when
            the file is not valid YAML or does not have that structure.
    """
    global PATH, TEXTS
    with open("config.yml", 'r') as stream:
        try:
            yaml_data = yaml.safe_load(stream)
            path = yaml_data["path"]
            if not isinstance(path, str):
                raise TypeError("'path' must be a string")
            pairs = []
            for key, entry in yaml_data.items():
                if key == "path":
                    continue
                old, new = entry["old"], entry["new"]
                # Both values are later handed to str.replace, which only
                # accepts strings; reject anything else here, up front.
                if not (isinstance(old, str) and isinstance(new, str)):
                    raise TypeError(
                        "'old' and 'new' of {!r} must be strings".format(key)
                    )
                pairs.append([old, new])
        # Only parse/structure problems are reported as a config error, so
        # things like KeyboardInterrupt are no longer swallowed.
        except (yaml.YAMLError, AttributeError, KeyError, TypeError, ValueError) as error:
            print("Error: config file must have the following structure")
            print("config.yml")
            print("  path: /path/to/folder")
            print("  text1:")
            print("    old: 'xxxxxx'")
            print("    new: 'yyyyyy'")
            print("...")
            print("Reason: {!r}".format(error))
            # Same effect as exit(-1), without relying on the site builtin.
            raise SystemExit(-1)
    # Publish the results only after the whole file validated.
    PATH = path
    TEXTS.extend(pairs)
if __name__ == "__main__":
    # Entry point: load the config, collect every PDF below PATH, then rewrite
    # each one in place via the temporary files "wm.pdf" and "tmp.pdf" that
    # live in the current working directory.
    read_config_file()

    # Resolve the configured folder to an absolute path first.
    # count_total_valid_files() joins every entry onto PATH again, and
    # os.path.join only leaves the entry unchanged when it is already
    # absolute. With a relative PATH the folder prefix would be applied twice
    # and the resulting paths would generally not exist.
    root = os.path.abspath(os.path.expanduser(PATH))
    files = [os.path.join(dp, f) for dp, dn, fn in os.walk(root) for f in fn]
    files = count_total_valid_files(files)

    create_wm_file()
    try:
        for counter, filepath in enumerate(files, start=1):
            print("> editing file [{}/{}]".format(counter, len(files)))
            generate_editable_pdf(filepath)
            remove_refs_from_pdf()
            # Overwrite the original with the processed copy.
            os.replace("tmp.pdf", filepath)
    finally:
        # Always remove the helper PDF, even if a file failed to process.
        delete_wm_file()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment