Skip to content

Instantly share code, notes, and snippets.

@philshem
Last active May 21, 2019 09:24
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save philshem/dd9c6f564cadfd294ffb383ecd925309 to your computer and use it in GitHub Desktop.
Python3 script to clean non-ascii characters from the PDF "Title" metadata field.
# Requires Python 3.x and one third-party module: `pip install pdfrw`
# The PDFs must be in a folder named `pdfs`, relative to the current working directory.
import os
from pdfrw import PdfReader, PdfWriter
from glob import glob
import unicodedata
def edit_title_metadata(inpdf):
    """Write a copy of *inpdf* whose "Title" metadata is reduced to ASCII.

    The PDF is read with pdfrw, its Info.Title entry is decoded, passed
    through ``unicode_normalize`` and stored back, and the result is written
    to the path produced by ``rename_file``. The input file itself is not
    overwritten as long as ``rename_file`` returns a different path.

    Args:
        inpdf: Path of the PDF file to read.

    Returns:
        True when a cleaned copy was written. False when the PDF has no Info
        dictionary or no Title entry; nothing is written in that case, so the
        caller can report the file instead of crashing on it.
    """
    trailer = PdfReader(inpdf)
    info = trailer.Info
    # pdfrw dictionaries yield None for absent keys, so a PDF without
    # metadata would otherwise fail with AttributeError on `.Title`/`.decode`.
    title = info.Title if info is not None else None
    if title is None:
        return False
    info.Title = unicode_normalize(title.decode())
    PdfWriter(rename_file(inpdf), trailer=trailer).write()
    return True
def rename_file(inpdf):
    """Return the output path for *inpdf*, with ``_CLEAN`` before the extension.

    Only the final extension is considered, so a ``.pdf`` substring elsewhere
    in the path (for example in a directory name) is left untouched. The
    extension's original letter case is kept, which means ``x.PDF`` maps to
    ``x_CLEAN.PDF`` rather than back onto the input path.

    Args:
        inpdf: Path of the source PDF.

    Returns:
        The path with ``_CLEAN`` inserted immediately before the extension
        (or appended when the path has no extension).
    """
    root, ext = os.path.splitext(inpdf)
    return root + '_CLEAN' + ext
def unicode_normalize(s):
    """Return *s* as ASCII bytes, dropping whatever cannot be represented.

    The text is first decomposed with NFKD normalization, so accented letters
    split into a base letter plus combining marks and compatibility
    characters expand to their plain equivalents. Encoding to ASCII with the
    ``ignore`` error handler then discards every remaining non-ASCII code
    point.

    Args:
        s: The text to clean (a ``str``).

    Returns:
        A ``bytes`` object containing only ASCII characters.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only
def get_pdfs():
    """Return the paths matching ``*.pdf`` inside the ``pdfs`` folder.

    The folder is looked up relative to the current working directory, and
    the paths are returned in whatever order ``glob`` yields them.
    """
    pattern = os.path.join('pdfs', '*.pdf')
    return glob(pattern)
def main():
    """Clean the title metadata of every PDF found by ``get_pdfs``.

    Files whose path already contains ``_CLEAN`` are skipped so that the
    output of an earlier run is not processed again. For each remaining file
    one line is printed: ``OK:`` followed by the path when
    ``edit_title_metadata`` returns a truthy value, ``WARNING:`` followed by
    the path otherwise.
    """
    for path in get_pdfs():
        if '_CLEAN' in path:
            continue
        label = 'OK:' if edit_title_metadata(path) else 'WARNING:'
        print(label, path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment