Last active
December 27, 2018 07:26
-
-
Save temberature/f51216a84d31fdcc6d295572daad4fed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Create a searchable PDF from a scanned PDF.
#
# Usage: <this-script> input.pdf
#
# Pipeline:
#   1. Ghostscript renders every page of the input to a 300 dpi JPEG.
#   2. tesseract OCRs each page image into an hOCR file.
#   3. hocr2pdf combines each page image with its hOCR text into a one-page PDF.
#   4. Ghostscript merges the per-page PDFs into <name>_searchable.pdf, written
#      to the directory the script was started from.
#
# bash tut: http://linuxconfig.org/bash-scripting-tutorial
# Linux PDF,OCR: http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/

# Stop at the first failing command or unset variable instead of carrying on
# and deleting files in the wrong directory.
set -eu

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ]; then
  printf 'Usage: %s input.pdf\n' "${0##*/}" >&2
  exit 2
fi

start_dir=$(pwd)

# Accept both absolute and relative input paths.
case $1 in
  /*) input=$1 ;;
  *) input=$start_dir/$1 ;;
esac
[ -f "$input" ] || die "no such file: $input"

printf 'Will create a searchable PDF for %s\n' "$input"

# File name without directory and without its last extension.
name=${input##*/}
name=${name%.*}

# Work in a freshly created, uniquely named scratch directory so that an
# existing file or directory called "$name" is never reused or deleted.
workdir=$(mktemp -d "$start_dir/ocr_pages.XXXXXX") \
  || die "cannot create a working directory in $start_dir"
trap 'rm -rf -- "$workdir"' EXIT
# Turn signals into a normal exit so the EXIT trap above also runs for them.
trap 'exit 1' HUP INT TERM

# splitting to individual pages
gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
  -o "$workdir/out_%04d.jpg" -f "$input"

# process each page
for page in "$workdir"/out_*.jpg; do
  # An unmatched glob stays literal; skip it.
  [ -e "$page" ] || continue
  stem=${page%.*}

  # extract text
  # NOTE(review): newer tesseract releases spell this option "--psm" - confirm
  # against the installed version.
  tesseract -l eng -psm 3 "$page" "$stem" hocr

  # remove the "<?xml" line, it disturbed hocr2pdf
  # (grep exits 1 when it selects no lines; only a status above 1 is an error)
  grep -v '<?xml' "$stem.hocr" > "$stem.noxml" || [ "$?" -eq 1 ]
  rm -f -- "$stem.hocr"

  # create a searchable page
  hocr2pdf -i "$page" -s -o "$stem.pdf" < "$stem.noxml"
  rm -f -- "$stem.noxml" "$page"
done

# combine all pages back to a single file
# from http://www.ehow.com/how_6874571_merge-pdf-files-ghostscript.html
# The zero-padded page numbers make the glob expand in page order.
set -- "$workdir"/out_*.pdf
[ -e "$1" ] || die "no pages were produced from $input"
gs -dCompatibilityLevel=1.4 -dNOPAUSE -dBATCH -q -sDEVICE=pdfwrite \
  "-sOutputFile=$start_dir/${name}_searchable.pdf" "$@"

# The scratch directory is removed by the EXIT trap.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Create a searchable PDF from a scanned PDF, using tesseract's own PDF output.
#
# Usage: <this-script> input.pdf
#
# Pipeline:
#   1. Ghostscript renders every page of the input to a 300 dpi JPEG.
#   2. tesseract OCRs each page image and writes a one-page PDF for it.
#   3. Ghostscript merges the per-page PDFs into <name>_searchable.pdf, written
#      to the directory the script was started from.
#
# bash tut: http://linuxconfig.org/bash-scripting-tutorial
# Linux PDF,OCR: http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/

# Stop at the first failing command or unset variable instead of carrying on
# and deleting files in the wrong directory.
set -eu

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ]; then
  printf 'Usage: %s input.pdf\n' "${0##*/}" >&2
  exit 2
fi

start_dir=$(pwd)

# Accept both absolute and relative input paths.
case $1 in
  /*) input=$1 ;;
  *) input=$start_dir/$1 ;;
esac
[ -f "$input" ] || die "no such file: $input"

printf 'Will create a searchable PDF for %s\n' "$input"

# File name without directory and without its last extension.
name=${input##*/}
name=${name%.*}

# Work in a freshly created, uniquely named scratch directory so that an
# existing file or directory called "$name" is never reused or deleted.
workdir=$(mktemp -d "$start_dir/ocr_pages.XXXXXX") \
  || die "cannot create a working directory in $start_dir"
trap 'rm -rf -- "$workdir"' EXIT
# Turn signals into a normal exit so the EXIT trap above also runs for them.
trap 'exit 1' HUP INT TERM

# splitting to individual pages
gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
  -o "$workdir/out_%04d.jpg" -f "$input"

# process each page
for page in "$workdir"/out_*.jpg; do
  # An unmatched glob stays literal; skip it.
  [ -e "$page" ] || continue

  # extract text; the "pdf" config makes tesseract write <output base>.pdf
  # NOTE(review): newer tesseract releases spell this option "--psm" - confirm
  # against the installed version.
  tesseract -l eng -psm 3 "$page" "${page%.*}" pdf
  rm -f -- "$page"
done

# combine all pages back to a single file
# from http://www.ehow.com/how_6874571_merge-pdf-files-ghostscript.html
# The zero-padded page numbers make the glob expand in page order.
set -- "$workdir"/out_*.pdf
[ -e "$1" ] || die "no pages were produced from $input"
gs -dCompatibilityLevel=1.4 -dNOPAUSE -dBATCH -q -sDEVICE=pdfwrite \
  "-sOutputFile=$start_dir/${name}_searchable.pdf" "$@"

# The scratch directory is removed by the EXIT trap.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment