Skip to content

Instantly share code, notes, and snippets.

@yantonov
Created March 22, 2012 05:27
Show Gist options
  • Save yantonov/2156367 to your computer and use it in GitHub Desktop.
Save yantonov/2156367 to your computer and use it in GitHub Desktop.
OCR multipage pdf file
#!/bin/bash
#
# Optical character recognition for multipage pdf file
#
#
# Prerequisites:
# pdftk, cuneiform, imagemagick
#
#
# Disclaimer
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Name of the per-run scratch directory. Epoch seconds alone collide when two
# runs start within the same second (they would then share, and later delete,
# one directory), so the PID of this shell is appended as well.
tempDirectory="tmp-$(date +%s)-$$"
# Default recognition language code.
defaultLanguage="rus"
# Names of the sub-directories used for the intermediate artifacts.
pagesDirectory="pages"
imagesDirectory="pages-images"
textDirectory="text"
# Progress messages and the separator line printed around them.
startProcessingMessage="Start processing"
endProcessingMessage="OK"
DELIMITER="---"
# Print the absolute path of the scratch directory for this run.
# Globals: tempDirectory (read)
# Outputs: the path on stdout
get_working_directory_path() {
  printf '%s\n' "/tmp/${tempDirectory}"
}
# Print the path of the directory that holds the single-page PDF files.
# Globals: pagesDirectory (read)
# Outputs: "<working directory>/<pagesDirectory>" on stdout
get_pages_directory_path() {
  local working_root
  working_root=$(get_working_directory_path)
  printf '%s/%s\n' "$working_root" "$pagesDirectory"
}
# Print the path of the directory that holds the rendered page images.
# Globals: imagesDirectory (read)
# Outputs: "<working directory>/<imagesDirectory>" on stdout
get_images_directory_path() {
  local working_root
  working_root=$(get_working_directory_path)
  printf '%s/%s\n' "$working_root" "$imagesDirectory"
}
# Print the path of the directory that holds the per-page OCR text files.
# Globals: textDirectory (read)
# Outputs: "<working directory>/<textDirectory>" on stdout
get_text_directory_path() {
  local working_root
  working_root=$(get_working_directory_path)
  printf '%s/%s\n' "$working_root" "$textDirectory"
}
# Print the default recognition language code.
# Globals: defaultLanguage (read)
# Outputs: the value of defaultLanguage on stdout, exactly as stored
get_default_language() {
  # Quoted so the value is never word-split or expanded as a glob pattern.
  printf '%s\n' "$defaultLanguage"
}
# Create a single directory and log the action.
# Arguments: $1 - path of the directory to create
# Outputs:   a description line and the equivalent mkdir command on stdout
# Returns:   the exit status of mkdir
create_directory() {
  # Local, so callers' variables named "path" are not overwritten.
  local dir_path=$1
  echo "creating directory \"$dir_path\""
  echo "mkdir $dir_path"
  # Quoted so a path containing whitespace stays one argument; "--" keeps a
  # path starting with "-" from being parsed as an option.
  mkdir -- "$dir_path"
}
# Recursively delete a directory and log the action.
# Arguments: $1 - path of the directory to delete
# Outputs:   a description line and the equivalent rm command on stdout;
#            a diagnostic on stderr when no path is given
# Returns:   1 when the path is empty, otherwise the exit status of rm
remove_directory() {
  # Local, so callers' variables named "path" are not overwritten.
  local dir_path=$1
  if [[ -z "$dir_path" ]]; then
    echo "remove_directory: no path given" >&2
    return 1
  fi
  echo "removing directory \"$dir_path\""
  echo "rm -rf $dir_path"
  # Quoted: an unquoted expansion would split on whitespace and expand globs,
  # deleting paths other than the one requested.
  rm -rf -- "$dir_path"
}
# Split a PDF into one file per page using "pdftk ... burst".
# Arguments: $1 - input PDF file
#            $2 - directory that receives the page_%04d.pdf files
# Outputs:   a description line and the pdftk command on stdout
# Returns:   the exit status of pdftk
split_multipage_pdf_to_separate_pages() {
  # Locals keep the script-level inputFile / pagesDirectory values intact.
  local input_file=$1
  local pages_dir=$2
  echo "splitting $input_file to separate pages"
  echo "pdftk $input_file burst output $pages_dir/page_%04d.pdf"
  # Quoted so file and directory names containing whitespace stay intact.
  pdftk "$input_file" burst output "$pages_dir/page_%04d.pdf"
}
# Render every "*.pdf" file of a directory to a PNG image with ImageMagick.
# Arguments: $1 - directory containing the single-page PDF files
#            $2 - directory that receives "<pdf file name>.png"
# Outputs:   each convert command on stdout
# Returns:   0 when every conversion succeeded (or there was nothing to do),
#            otherwise the status of the last failed convert call
generate_images_for_all_pages() {
  local pages_dir=$1
  local images_dir=$2
  local page image
  local status=0
  # Plain quoted glob instead of eval: no code execution from the path and no
  # splitting of names that contain whitespace.
  for page in "$pages_dir"/*.pdf; do
    # An unmatched glob stays literal; skip it instead of passing it on.
    [[ -e "$page" ]] || continue
    image="$images_dir/${page##*/}.png"
    echo "convert -density 300 $page $image"
    # Keep going after a failed page, but remember the failure.
    convert -density 300 "$page" "$image" || status=$?
  done
  return "$status"
}
# Run cuneiform on every "*.png" file of a directory.
# Arguments: $1 - directory containing the page images
#            $2 - directory that receives "<image file name>.txt"
#            $3 - recognition language passed to "cuneiform -l"
#                 (defaults to "rus" when omitted or empty)
# Outputs:   each cuneiform command on stdout
# Returns:   0 when every page was processed successfully (or there was
#            nothing to do), otherwise the status of the last failed call
ocr_images() {
  local images_dir=$1
  local text_dir=$2
  # The language argument used to be accepted but ignored in favour of a
  # hard-coded "rus"; it is honoured now.
  local ocr_language=${3:-rus}
  local image text
  local status=0
  # Plain quoted glob instead of eval: no code execution from the path and no
  # splitting of names that contain whitespace.
  for image in "$images_dir"/*.png; do
    # An unmatched glob stays literal; skip it instead of passing it on.
    [[ -e "$image" ]] || continue
    text="$text_dir/${image##*/}.txt"
    echo "cuneiform -l $ocr_language -o $text $image"
    # Keep going after a failed page, but remember the failure.
    cuneiform -l "$ocr_language" -o "$text" "$image" || status=$?
  done
  return "$status"
}
# Append every "*.txt" file of a directory to one output file, in glob
# (sorted) order, with two newlines written after each page.
# Arguments: $1 - directory containing the per-page text files
#            $2 - output file (created if missing, appended to otherwise)
# Returns:   0 on success, non-zero when the output cannot be written or a
#            page cannot be read
merge_text_from_separate_pages() {
  local text_dir=$1
  local output_file=$2
  local text
  # Make sure the output exists (and is writable) even when there are no pages.
  : >> "$output_file" || return
  # Plain quoted glob instead of eval: no code execution from the path and no
  # splitting of names that contain whitespace.
  for text in "$text_dir"/*.txt; do
    # An unmatched glob stays literal; skip it instead of cat-ing it.
    [[ -e "$text" ]] || continue
    { cat -- "$text" && printf '\n\n'; } >> "$output_file" || return
  done
  return 0
}
# Print a message framed by delimiter lines.
# Globals:   DELIMITER (read)
# Arguments: the message; when several words are passed as separate
#            arguments they are joined with single spaces
# Outputs:   delimiter, message, delimiter - one per line on stdout
outputMessage() {
  # "$*" rather than $1: a caller passing an unquoted multi-word message still
  # gets the whole text printed, and nothing is glob-expanded.
  printf '%s\n' "$DELIMITER" "$*" "$DELIMITER"
}
# Run the whole pipeline: split the PDF into pages, render the pages to
# images, OCR the images and merge the recognized text into one file.
# Globals:   startProcessingMessage, endProcessingMessage (read)
# Arguments: $1 - input PDF file
#            $2 - output text file
# Returns:   0 on success; non-zero when a directory cannot be created, the
#            PDF cannot be split or the text cannot be merged. Failures while
#            rendering or recognizing pages are reported on stderr, but the
#            remaining steps are still attempted.
run() {
  local input_pdf=$1
  local output_file=$2
  # Deliberately not named like the script-level pagesDirectory /
  # imagesDirectory / textDirectory variables: the path getters read those,
  # so assigning to them here would corrupt later lookups.
  local work_dir pages_dir images_dir text_dir language
  local status=0

  work_dir=$(get_working_directory_path)
  pages_dir=$(get_pages_directory_path)
  images_dir=$(get_images_directory_path)
  text_dir=$(get_text_directory_path)
  language=$(get_default_language)

  outputMessage "$startProcessingMessage"

  # Nothing to clean up yet if the scratch directory cannot be created.
  create_directory "$work_dir" || return

  create_directory "$pages_dir" \
    && split_multipage_pdf_to_separate_pages "$input_pdf" "$pages_dir" \
    && create_directory "$images_dir" \
    || status=$?

  if (( status == 0 )); then
    generate_images_for_all_pages "$pages_dir" "$images_dir" \
      || echo "warning: some pages could not be converted to images" >&2
    create_directory "$text_dir" || status=$?
  fi

  if (( status == 0 )); then
    ocr_images "$images_dir" "$text_dir" "$language" \
      || echo "warning: some pages could not be recognized" >&2
    merge_text_from_separate_pages "$text_dir" "$output_file" || status=$?
  fi

  # Always remove the scratch directory, also after a failure.
  remove_directory "$work_dir"

  if (( status != 0 )); then
    echo "processing failed" >&2
    return "$status"
  fi
  outputMessage "$endProcessingMessage"
}
#
# main
#
# Exactly two arguments are required: the input PDF and the output text file.
if [[ $# -ne 2 ]]; then
  {
    echo "Usage: $(basename -- "$0") {inputFile} {outputFile}"
    echo "inputFile - input pdf file"
    echo "outputFile - output text file"
  } >&2
  # E_BADARGS is not defined by the visible script, so "exit $E_BADARGS"
  # degraded to a plain "exit" with status 0; fall back to 65 when it is unset.
  exit "${E_BADARGS:-65}"
fi
inputFile=$1
outputFile=$2
# Quoted tests so paths containing whitespace are checked as single names.
if [[ ! -e "$inputFile" ]]; then
  echo "input file \"$inputFile\" does not exists" >&2
  exit 1
fi
# Refuse to touch an existing output file.
if [[ -e "$outputFile" ]]; then
  echo "output file \"$outputFile\" already exists" >&2
  exit 1
fi
run "$inputFile" "$outputFile"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment