Skip to content

Instantly share code, notes, and snippets.

@lkwg82
Created April 19, 2020 20:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lkwg82/34e9e04d8f1da0cb33d9f5bab6ca2fb0 to your computer and use it in GitHub Desktop.
Save lkwg82/34e9e04d8f1da0cb33d9f5bab6ca2fb0 to your computer and use it in GitHub Desktop.
Bash script that uses ocrmypdf to convert scanned PDFs without searchable text into PDFs with searchable text
#!/bin/bash
# Shell options for the top-level script:
#   errexit - stop at the first command that fails
#   xtrace  - print every command before it is executed
set -o errexit -o xtrace
#######################################
# OCR a single PDF in place with the jbarlow83/ocrmypdf Docker image.
#
# Before the OCR result replaces the input, the original is preserved next to
# it as "<file>.backup_ocrmypdf.pdf" (hard link, falling back to a copy).
# The file is skipped when
#   * its name ends in ".backup_ocrmypdf.pdf" (it is itself a backup),
#   * a backup for it already exists, or
#   * exiftool reports a "Creator Tool" line containing ": ocrmypdf".
#
# Arguments:
#   $1 - path of the PDF; it is handed to the container as-is, with $PWD
#        mounted at /data, so it has to be reachable relative to $PWD
# Outputs:
#   progress messages on stdout; the captured stderr of the container is
#   printed on stdout when it exits with a code other than 0 or 6
# Returns:
#   0 when the file was converted or skipped, and also when ocrmypdf itself
#   failed (best effort, so one bad PDF does not stop a batch);
#   non-zero when the backup, replace or cleanup step fails
#######################################
function convertPdf(){
  local file=$1
  local suffix=".backup_ocrmypdf.pdf"
  local backup="$file$suffix"

  printf '%s\n' "$file"
  echo
  echo
  echo

  # Literal suffix comparison: a regex would treat the dots as wildcards and
  # skip unrelated files such as "my_backup_ocrmypdf.pdf".
  if [[ "$file" == *"$suffix" ]]; then
    echo "skip backup file"
    return 0
  fi

  if [[ -f "$backup" ]]; then
    echo "INFO seems already ocred"
    return 0
  fi

  # Use grep's exit status directly; "grep -q" prints nothing, so testing its
  # output for non-emptiness could never succeed.
  if exiftool "$file" | grep -q '^Creator Tool.*: ocrmypdf'; then
    echo "INFO: $file already ocred"
    return 0
  fi

  echo "NEED ocr: $file"

  # The result is written next to the input so the final mv stays on the same
  # file system.
  local ocrTemp
  ocrTemp="$(dirname -- "$file")/.temp_ocrmypdf_$(basename -- "$file")"

  local stderrLog
  stderrLog=$(mktemp) || return 1

  # Capture the exit status without relying on (or changing) errexit.
  local exitCode=0
  docker run --rm -i \
    --user "$(id -u):$(id -g)" \
    --workdir /data \
    -v "$PWD:/data" \
    jbarlow83/ocrmypdf "$file" "$ocrTemp" 2> "$stderrLog" || exitCode=$?

  # https://ocrmypdf.readthedocs.io/en/latest/advanced.html#return-code-policy
  echo "$exitCode"

  if [[ "$exitCode" -eq 0 ]]; then
    rm -f -- "$stderrLog"
    # Keep the original first, then move the OCR result into place.
    ln -v -- "$file" "$backup" || cp -v -- "$file" "$backup" || return 1
    mv -v -- "$ocrTemp" "$file" || return 1
    return 0
  fi

  if [[ "$exitCode" -eq 6 ]]; then
    # Code 6 is treated as "nothing to do" (see the return-code policy above).
    echo "already ocred"
  else
    cat -- "$stderrLog"
  fi
  rm -f -- "$stderrLog"

  # Remove a partial result left behind by a failed run.
  if [[ -f "$ocrTemp" ]]; then
    rm -v -- "$ocrTemp" || return 1
  fi
  return 0
}
# Export the function so the bash child processes started by xargs can call it.
export -f convertPdf
cd beruflich || { echo "ERROR: cannot change into directory 'beruflich'" >&2; exit 1; }
# Run up to 5 conversions in parallel, one PDF per bash invocation.
# Names travel NUL-delimited and reach convertPdf as a positional argument,
# so quotes, backslashes or whitespace in a file name are never parsed as
# shell code.
find . -type f -name "*.pdf" -print0 \
  | xargs -0 -P5 -I{} bash -c 'convertPdf "$1"' _ {}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment