davidpfahler/ocr.sh

## ocr.sh
#!/bin/sh
#
# ocr.sh - build a searchable (OCR'd) copy of a scanned PDF.
#
# Usage: ocr.sh FILE
#
# FILE may be given relative to the current directory or as an absolute
# path. The result is written to
#   ./__searchable__/<base name of FILE without its extension>.pdf
#
# Steps:
#   1. Ghostscript renders every page of FILE to a 300 dpi JPEG.
#   2. tesseract OCRs each JPEG (language "deu") into a one-page PDF.
#   3. Ghostscript merges the one-page PDFs into a single output PDF.
#
# A scratch directory named after FILE (minus extension) is created in the
# current directory and removed again when the script finishes or fails.
# The script refuses to run if that directory already exists, so that it
# never deletes a directory it did not create.
#
# Requires: gs (Ghostscript) and tesseract with the "deu" language data.

set -u

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  printf 'Usage: %s FILE\n' "${0##*/}" >&2
  exit 2
fi

start_dir=$(pwd) || die "cannot determine the current directory"

# Absolute paths are used as given; only relative ones get the cwd prefix.
case $1 in
  /*) y=$1 ;;
  *) y="$start_dir/$1" ;;
esac
[ -f "$y" ] || die "no such file: $y"

printf '%s\n' "Will create a searchable PDF for $y"

x=$(basename "$y")
name=${x%.*}
# A name such as ".pdf" leaves nothing once the extension is stripped.
[ -n "$name" ] || die "cannot derive an output name from: $x"

mkdir -p __searchable__ || die "cannot create output directory __searchable__"

# Plain mkdir (no -p) on purpose: it fails when the directory already
# exists, which protects pre-existing data from the rm -rf in cleanup.
workdir="$start_dir/$name"
mkdir "$workdir" || die "cannot create scratch directory: $workdir"

# Leave the scratch directory and remove it.
cleanup() {
  # Best effort: the rm below uses an absolute path, so a failed cd is harmless.
  cd "$start_dir" 2>/dev/null || :
  rm -rf -- "$workdir"
}
trap cleanup EXIT
# Turn signals into a normal exit so the EXIT trap gets a chance to run.
trap 'exit 1' HUP INT TERM

cd "$workdir" || die "cannot enter scratch directory: $workdir"

# splitting to individual pages
gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
  -o out_%04d.jpg -f "$y" || die "gs failed to render: $y"

# process each page
pages=0
for f in *.jpg; do
  # An unmatched glob stays literal; skip it instead of feeding it to tesseract.
  [ -e "$f" ] || continue
  # extract text: writes ${f%.*}.pdf next to the image
  tesseract "$f" "${f%.*}" -l deu --psm 3 pdf || die "tesseract failed on: $f"
  rm -f -- "$f"
  pages=$((pages + 1))
done
[ "$pages" -gt 0 ] || die "no pages were rendered from: $y"

# combine all pages back to a single file; the glob sorts out_0001.pdf,
# out_0002.pdf, ... so the original page order is kept
gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
  -sOutputFile="../__searchable__/${name}.pdf" *.pdf \
  || die "gs failed to assemble: __searchable__/${name}.pdf"

# Done: tidy up right away and disarm the traps so they cannot fire later.
cleanup
trap - EXIT HUP INT TERM
	#!/bin/sh
	#
	# ocr.sh - build a searchable (OCR'd) copy of a scanned PDF.
	#
	# Usage: ocr.sh FILE
	#
	# FILE may be given relative to the current directory or as an absolute
	# path. The result is written to
	#   ./__searchable__/<base name of FILE without its extension>.pdf
	#
	# Steps:
	#   1. Ghostscript renders every page of FILE to a 300 dpi JPEG.
	#   2. tesseract OCRs each JPEG (language "deu") into a one-page PDF.
	#   3. Ghostscript merges the one-page PDFs into a single output PDF.
	#
	# A scratch directory named after FILE (minus extension) is created in the
	# current directory and removed again when the script finishes or fails.
	# The script refuses to run if that directory already exists, so that it
	# never deletes a directory it did not create.
	#
	# Requires: gs (Ghostscript) and tesseract with the "deu" language data.

	set -u

	# Print a message to stderr and abort with status 1.
	die() {
	  printf '%s\n' "$*" >&2
	  exit 1
	}

	if [ "$#" -lt 1 ] || [ -z "$1" ]; then
	  printf 'Usage: %s FILE\n' "${0##*/}" >&2
	  exit 2
	fi

	start_dir=$(pwd) || die "cannot determine the current directory"

	# Absolute paths are used as given; only relative ones get the cwd prefix.
	case $1 in
	  /*) y=$1 ;;
	  *) y="$start_dir/$1" ;;
	esac
	[ -f "$y" ] || die "no such file: $y"

	printf '%s\n' "Will create a searchable PDF for $y"

	x=$(basename "$y")
	name=${x%.*}
	# A name such as ".pdf" leaves nothing once the extension is stripped.
	[ -n "$name" ] || die "cannot derive an output name from: $x"

	mkdir -p __searchable__ || die "cannot create output directory __searchable__"

	# Plain mkdir (no -p) on purpose: it fails when the directory already
	# exists, which protects pre-existing data from the rm -rf in cleanup.
	workdir="$start_dir/$name"
	mkdir "$workdir" || die "cannot create scratch directory: $workdir"

	# Leave the scratch directory and remove it.
	cleanup() {
	  # Best effort: the rm below uses an absolute path, so a failed cd is harmless.
	  cd "$start_dir" 2>/dev/null || :
	  rm -rf -- "$workdir"
	}
	trap cleanup EXIT
	# Turn signals into a normal exit so the EXIT trap gets a chance to run.
	trap 'exit 1' HUP INT TERM

	cd "$workdir" || die "cannot enter scratch directory: $workdir"

	# splitting to individual pages
	gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
	  -o out_%04d.jpg -f "$y" || die "gs failed to render: $y"

	# process each page
	pages=0
	for f in *.jpg; do
	  # An unmatched glob stays literal; skip it instead of feeding it to tesseract.
	  [ -e "$f" ] || continue
	  # extract text: writes ${f%.*}.pdf next to the image
	  tesseract "$f" "${f%.*}" -l deu --psm 3 pdf || die "tesseract failed on: $f"
	  rm -f -- "$f"
	  pages=$((pages + 1))
	done
	[ "$pages" -gt 0 ] || die "no pages were rendered from: $y"

	# combine all pages back to a single file; the glob sorts out_0001.pdf,
	# out_0002.pdf, ... so the original page order is kept
	gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
	  -sOutputFile="../__searchable__/${name}.pdf" *.pdf \
	  || die "gs failed to assemble: __searchable__/${name}.pdf"

	# Done: tidy up right away and disarm the traps so they cannot fire later.
	cleanup
	trap - EXIT HUP INT TERM