Created
March 22, 2012 05:27
-
-
Save yantonov/2156367 to your computer and use it in GitHub Desktop.
OCR multipage pdf file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Optical character recognition for multipage pdf file
#
#
# Prerequisites:
# pdftk, cuneiform, imagemagick
#
#
# Disclaimer
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# --- Script-wide configuration -------------------------------------------
# Note: some helper functions in this file assign to pagesDirectory,
# imagesDirectory and textDirectory, so these cannot be made readonly.

# Name of the per-run scratch directory: "tmp-" plus the current Unix time.
tempDirectory="tmp-$(date +%s)"
# Language code handed out by get_default_language.
defaultLanguage='rus'
# Names of the sub-directories used by the *_directory_path helpers.
pagesDirectory='pages'
imagesDirectory='pages-images'
textDirectory='text'
# Banner texts printed at the start and at the end of a run.
startProcessingMessage='Start processing'
endProcessingMessage='OK'
# Separator line printed by outputMessage around each banner text.
DELIMITER='---'
# Print the path of the scratch directory for this run:
# "/tmp/" followed by the value of the global tempDirectory.
get_working_directory_path() {
  printf '%s\n' "/tmp/$tempDirectory"
}
# Print the path of the directory for the single-page PDFs:
# the working directory joined with the global pagesDirectory.
get_pages_directory_path() {
  local base
  base=$(get_working_directory_path)
  printf '%s/%s\n' "$base" "$pagesDirectory"
}
# Print the path of the directory for the rendered page images:
# the working directory joined with the global imagesDirectory.
get_images_directory_path() {
  local base
  base=$(get_working_directory_path)
  printf '%s/%s\n' "$base" "$imagesDirectory"
}
# Print the path of the directory for the per-page OCR text files:
# the working directory joined with the global textDirectory.
get_text_directory_path() {
  local base
  base=$(get_working_directory_path)
  printf '%s/%s\n' "$base" "$textDirectory"
}
# Print the value of the global defaultLanguage on stdout.
# The expansion is quoted so the value is emitted verbatim; unquoted it
# would be word-split and glob-expanded before being printed.
get_default_language() {
  printf '%s\n' "$defaultLanguage"
}
#######################################
# Create a single directory, logging the action first.
# Arguments: $1 - path of the directory to create
# Outputs:   a description and the mkdir command line on stdout
# Returns:   the exit status of mkdir
#######################################
create_directory() {
  # local: do not leak "path" into the caller's scope.
  local path=$1
  echo "creating directory \"$path\""
  echo "mkdir $path"
  # Quoted so a path containing spaces creates one directory, not several;
  # "--" keeps a path starting with "-" from being read as an option.
  mkdir -- "$path"
}
#######################################
# Recursively remove a directory, logging the action first.
# Arguments: $1 - path of the directory to remove
# Outputs:   a description and the rm command line on stdout;
#            an error on stderr when no path is given
# Returns:   1 when the path is empty, otherwise the exit status of rm
#######################################
remove_directory() {
  # local: do not leak "path" into the caller's scope.
  local path=$1
  # Refuse an empty argument instead of running a destructive command on it.
  if [[ -z "$path" ]]; then
    echo "remove_directory: no path given" >&2
    return 1
  fi
  echo "removing directory \"$path\""
  echo "rm -rf $path"
  # Quoted: an unquoted path with spaces would be split into several
  # operands and could delete unrelated files.
  rm -rf -- "$path"
}
#######################################
# Split a PDF into one file per page using "pdftk ... burst".
# Arguments: $1 - input PDF file
#            $2 - directory that receives the page_%04d.pdf files
# Outputs:   a description and the pdftk command line on stdout
# Returns:   the exit status of pdftk
#######################################
split_multipage_pdf_to_separate_pages() {
  # local: a plain assignment here would overwrite the global
  # pagesDirectory that get_pages_directory_path reads.
  local inputFile=$1
  local pagesDirectory=$2
  echo "splitting $inputFile to separate pages"
  echo "pdftk $inputFile burst output $pagesDirectory/page_%04d.pdf"
  # Quoted so file names and directories containing spaces stay single words.
  pdftk "$inputFile" burst output "$pagesDirectory/page_%04d.pdf"
}
#######################################
# Render every *.pdf in a directory to a PNG image at 300 dpi with
# ImageMagick's convert.
# Arguments: $1 - directory containing the single-page PDF files
#            $2 - directory that receives "<pdf file name>.png"
# Outputs:   each convert command line on stdout
# Returns:   the exit status of the last convert call (0 when no PDF exists)
#######################################
generate_images_for_all_pages() {
  # local: plain assignments would overwrite the globals of the same name.
  local pagesDirectory=$1
  local imagesDirectory=$2
  local page target
  # Glob directly instead of eval "echo ...": no re-parsing of the path as
  # shell code, and file names with spaces stay intact.
  for page in "$pagesDirectory"/*.pdf; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$page" ]] || continue
    target="$imagesDirectory/${page##*/}.png"
    echo "convert -density 300 $page $target"
    convert -density 300 "$page" "$target"
  done
}
#######################################
# Run cuneiform OCR on every *.png in a directory.
# Arguments: $1 - directory containing the page images
#            $2 - directory that receives "<image file name>.txt"
#            $3 - recognition language passed to "cuneiform -l"
#                 (falls back to "rus" when omitted or empty)
# Outputs:   each cuneiform command line on stdout
# Returns:   the exit status of the last cuneiform call (0 when no image exists)
#######################################
ocr_images() {
  # local: plain assignments would overwrite the globals of the same name.
  local imagesDirectory=$1
  local textDirectory=$2
  # The language argument used to be accepted but ignored ("rus" was
  # hard-coded); it is honoured now, with "rus" kept as the fallback.
  local language=${3:-rus}
  local image target
  # Glob directly instead of eval "echo ...": no re-parsing of the path as
  # shell code, and file names with spaces stay intact.
  for image in "$imagesDirectory"/*.png; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$image" ]] || continue
    target="$textDirectory/${image##*/}.txt"
    echo "cuneiform -l $language -o $target $image"
    cuneiform -l "$language" -o "$target" "$image"
  done
}
#######################################
# Append every *.txt file of a directory to one output file, in glob
# (sorted) order, each followed by two empty lines.
# Arguments: $1 - directory containing the per-page text files
#            $2 - output file (appended to; created if missing)
# Returns:   the status of the last append (0 when no text file exists)
#######################################
merge_text_from_separate_pages() {
  # local: plain assignments would overwrite the globals of the same name.
  local textDirectory=$1
  local outputFile=$2
  local text
  # Glob directly instead of eval "echo ...": no re-parsing of the path as
  # shell code, and file names with spaces stay intact.
  for text in "$textDirectory"/*.txt; do
    # An unmatched glob is left as the literal pattern; skip it so that no
    # stray blank lines are written after a failed cat.
    [[ -e "$text" ]] || continue
    {
      cat -- "$text"
      printf '\n\n'
    } >> "$outputFile"
  done
}
#######################################
# Print a message framed by a DELIMITER line above and below.
# Globals:   DELIMITER (read)
# Arguments: the message; several arguments are joined with single spaces,
#            so a message that a caller passed unquoted is still printed whole
# Outputs:   three lines on stdout
#######################################
outputMessage() {
  # printf with quoted expansions prints the text verbatim; an unquoted
  # echo would collapse whitespace and expand glob characters.
  printf '%s\n' "$DELIMITER"
  printf '%s\n' "$*"
  printf '%s\n' "$DELIMITER"
}
#######################################
# Full OCR pipeline: split the PDF into pages, render each page to an
# image, OCR each image, merge the text, then remove the scratch directory.
# Globals:   startProcessingMessage, endProcessingMessage (read)
# Arguments: $1 - input PDF file
#            $2 - output text file
# Outputs:   progress messages of the individual steps on stdout
# Returns:   non-zero when the working directory cannot be created or the
#            PDF cannot be split; otherwise the status of the final message
#######################################
run() {
  local inputPdf=$1
  local outputFile=$2
  # The local names deliberately differ from the globals pagesDirectory,
  # imagesDirectory and textDirectory: the path helpers read those globals,
  # and a same-named local here would shadow them inside the helpers.
  local workDir pagesDir imagesDir textDir ocrLanguage
  workDir=$(get_working_directory_path)
  pagesDir=$(get_pages_directory_path)
  imagesDir=$(get_images_directory_path)
  textDir=$(get_text_directory_path)
  ocrLanguage=$(get_default_language)

  # Quoted: the message contains a space and must arrive as one argument.
  outputMessage "$startProcessingMessage"

  # Nothing useful can happen without the scratch directory.
  create_directory "$workDir" || return
  create_directory "$pagesDir"
  # Without pages there is nothing to OCR: clean up and report failure.
  if ! split_multipage_pdf_to_separate_pages "$inputPdf" "$pagesDir"; then
    remove_directory "$workDir"
    return 1
  fi

  # The remaining steps stay best-effort, as before.
  create_directory "$imagesDir"
  generate_images_for_all_pages "$pagesDir" "$imagesDir"
  create_directory "$textDir"
  ocr_images "$imagesDir" "$textDir" "$ocrLanguage"
  merge_text_from_separate_pages "$textDir" "$outputFile"
  remove_directory "$workDir"

  outputMessage "$endProcessingMessage"
}
#
# main
#
# Entry point: expects exactly two arguments, the input PDF and the output
# text file. Refuses to run when the input is missing or the output already
# exists, so an existing file is never appended to by accident.
if [[ $# -ne 2 ]]; then
  {
    echo "Usage: $(basename -- "$0") {inputFile} {outputFile}"
    echo "inputFile - input pdf file"
    echo "outputFile - output text file"
  } >&2
  # E_BADARGS is not defined anywhere visible in this file, so a bare
  # "exit $E_BADARGS" exited with status 0; fall back to 65 when it is unset.
  exit "${E_BADARGS:-65}"
fi

inputFile=$1
outputFile=$2

# Quoted tests: paths containing spaces must stay a single word.
if [[ ! -e "$inputFile" ]]; then
  echo "input file \"$inputFile\" does not exists" >&2
  exit 1
fi

if [[ -e "$outputFile" ]]; then
  echo "output file \"$outputFile\" already exists" >&2
  exit 1
fi

run "$inputFile" "$outputFile"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment