kanru/pdf-ocr.sh

## pdf-ocr.sh
#! /bin/sh
# Batch OCR pdf files to text files
#
# Copyright (C) 2012  Kan-Ru Chen <kanru@kanru.info>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Prerequisite:
#  - tesseract 3.00
#  - Chinese (Traditional) language data for Tesseract
#  - mupdf-tools
#  - imagemagick
#
# You can get the tesseract files from
# https://code.google.com/p/tesseract-ocr/downloads/list

TESSERACT=api/tesseract
LANGDATA=chi_tra

usage() {
    echo "pdf-ocr.sh [pdf file]"
}

if [ x"$1" = x"" ]; then
    usage;
    exit 1;
fi

PDF="$1"
TEMP=`mktemp -d pdf-ocr.XXXXXXXXXX`

pdfdraw -o $TEMP/page%d.png -r 300 $PDF
for png in `ls $TEMP/page*.png`; do
    convert $png $png.tif
done
for tif in `ls $TEMP/page*.tif`; do
    echo "OCRing $tif"
    $TESSERACT $tif $tif -l $LANGDATA
done
for txt in `ls $TEMP/page*.txt`; do
    (cat $txt;echo ) >> $PDF.txt
done

rm -rf $TEMP
	#! /bin/sh
	# Batch OCR pdf files to text files
	#
	# Copyright (C) 2012 Kan-Ru Chen <kanru@kanru.info>
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.

	# Prerequisite:
	# - tesseract 3.00
	# - Chinese (Traditional) language data for Tesseract
	# - mupdf-tools
	# - imagemagick
	#
	# You can get the tesseract files from
	# https://code.google.com/p/tesseract-ocr/downloads/list

	TESSERACT=api/tesseract
	LANGDATA=chi_tra

	usage() {
	echo "pdf-ocr.sh [pdf file]"
	}

	if [ x"$1" = x"" ]; then
	usage;
	exit 1;
	fi

	PDF="$1"
	TEMP=`mktemp -d pdf-ocr.XXXXXXXXXX`

	pdfdraw -o $TEMP/page%d.png -r 300 $PDF
	for png in `ls $TEMP/page*.png`; do
	convert $png $png.tif
	done
	for tif in `ls $TEMP/page*.tif`; do
	echo "OCRing $tif"
	$TESSERACT $tif $tif -l $LANGDATA
	done
	for txt in `ls $TEMP/page*.txt`; do
	(cat $txt;echo ) >> $PDF.txt
	done

	rm -rf $TEMP