Skip to content

Instantly share code, notes, and snippets.

@frankgeerlings
Created September 9, 2021 07:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frankgeerlings/e9d59e1ecbce1adadf40c042fb398cc0 to your computer and use it in GitHub Desktop.
Save frankgeerlings/e9d59e1ecbce1adadf40c042fb398cc0 to your computer and use it in GitHub Desktop.
Bundle, rectify and OCR (sometimes poorly) scanned multi-page PDFs in current directory
import os, subprocess
from glob import glob
from pprint import pprint
# Take a one-off snapshot of the current directory only (first os.walk entry,
# no recursion): its sub-directory names and its plain file names.
_, directories, files = next(os.walk(os.curdir))
def raad_taal_en_titel(directory):
    """Guess the OCR language and the title from a file or directory name.

    A name of the form ``"<language> <title>"`` whose first word is one of
    the recognised language codes (``eng``, ``deu``, ``nld``) is split into
    that code and the rest of the name. Any other name is used as the title
    unchanged, with ``nld`` as the language.

    Args:
        directory: Name of a directory, or of a file without its extension.

    Returns:
        A ``(language, title)`` tuple of strings.
    """
    taal, spatie, titel = directory.partition(' ')
    # `spatie` is empty when the name contains no space, i.e. there is no
    # separate language prefix to strip.
    if spatie and taal in ('eng', 'deu', 'nld'):
        return (taal, titel)
    return ('nld', directory)
# Scans that consist of a single file are not worth a directory of their own,
# so they are picked up straight from the current directory. The original note
# says these end in upper-case ".PDF"; lower-case ".pdf" is accepted as well.
for bestandsnaam in files:
    if not bestandsnaam.endswith(('.PDF', '.pdf')):
        continue
    # Drop the four-character extension to get the "<language> <title>" part.
    taal, titel = raad_taal_en_titel(bestandsnaam[:-4])
    print(titel)
    # Pass the file name exactly as it exists on disk. Rebuilding it with a
    # hard-coded ".PDF" suffix would point at a non-existent file for inputs
    # that end in lower-case ".pdf" on case-sensitive filesystems.
    ocrmypdf = ['ocrmypdf', '-l', taal, '--skip-text', '--deskew', bestandsnaam, f'{titel}.pdf']
    subprocess.run(ocrmypdf)
# Every sub-directory holds the separate PDFs of one document: bundle them
# into a single "<title>.pdf" in the current directory and OCR that in place.
for directory in directories:
    # glob returns matches in arbitrary order, so sort to get a deterministic
    # (lexicographic) page order. The set removes duplicates that appear when
    # both patterns match the same file on case-insensitive matching.
    paginas = sorted(set(glob(directory + '/*.PDF') + glob(directory + '/*.pdf')))
    if not paginas:
        print(f"De directory {directory} bevat geen PDFs")
        continue
    taal, titel = raad_taal_en_titel(directory)
    print(titel)
    doel = f"{titel}.pdf"
    pdfunite = ['pdfunite'] + paginas + [doel]
    ocrmypdf = ['ocrmypdf', '-l', taal, '--skip-text', '--deskew', doel, doel]
    if subprocess.run(pdfunite).returncode != 0:
        # Without a bundled file there is nothing sensible to OCR.
        print(f"pdfunite is mislukt voor {directory}, OCR wordt overgeslagen")
        continue
    subprocess.run(ocrmypdf)

# Author's notes (shell commands, previously kept as no-op string statements):
#   pdfunite {directory}/*.{pdf,PDF} {directory}.pdf
#   exec ocrmypdf -l nld --skip-text --deskew {directory}.pdf --sidecar {directory}.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment