trufanov-nok/OCR_Into_Djvu.pl

## OCR_Into_Djvu.pl
#!/usr/bin/perl

# based on
# https://en.wikisource.org/wiki/Help:DjVu_files/OCR_with_Tesseract

# requirements:
# external tools run by this script: djvused, ddjvu, tesseract, hocr2djvused
# sudo apt install python2
# sudo rm /usr/bin/python
# sudo ln /usr/bin/python2 /usr/bin/python
# sudo pip2 install lxml python-djvulibre

# OCR every page of a DjVu file with Tesseract and write the resulting
# text back into that file through djvused.
#
# Usage: OCR_Into_Djvu.pl FILE.djvu
#
# FILE.djvu is modified in place.  A missing argument prints a message on
# STDERR and exits with status 1.  A page whose processing fails is
# reported on STDERR and skipped; the script then exits with status 1
# once all remaining pages have been handled.
use strict;
use warnings;
use File::Temp qw(tempdir);

# Run an external command WITHOUT a shell (so file names containing
# quotes, spaces or shell metacharacters are passed through verbatim),
# return everything it wrote to stdout, and die if it could not be
# started or exited with a non-zero status.
sub run_command {
  my @cmd = @_;

  open( my $out, '-|', @cmd ) or die "cannot run $cmd[0]: $!\n";
  my $output = do { local $/; <$out> };

  # close() on a pipe returns false when the child exited non-zero; in
  # that case $! is 0 and the wait status is in $?.
  close($out)
    or die( $!
      ? "error reading from $cmd[0]: $!\n"
      : "$cmd[0] exited with status " . ( $? >> 8 ) . "\n" );

  return defined $output ? $output : '';
}

# Language specification handed to tesseract's -l option.
my $lang = "eng+rus";

if ( @ARGV < 1 ) {
  print STDERR "give a DJVU file as 1st argument \n";
  exit 1;
}

my $inputdjvu = $ARGV[0];

# Private scratch directory: avoids the fixed, predictable /tmp names
# (which collide between concurrent runs) and is removed automatically
# when the script ends.
my $tmpdir   = tempdir( 'ocr_into_djvu.XXXXXX', TMPDIR => 1, CLEANUP => 1 );
my $imagetmp = "$tmpdir/temp.tif";
my $djvutmp  = "$tmpdir/outdjvu";

print "processing of $inputdjvu\n";

# calculate the number of pages
my $nbpages = run_command( 'djvused', $inputdjvu, '-e', 'n' );
$nbpages =~ s/\s+\z//;
$nbpages =~ /\A\d+\z/
  or die "unexpected page count from djvused: '$nbpages'\n";
print "number of pages: $nbpages\n";

my $failed = 0;

for my $i ( 1 .. $nbpages ) {
  print "OCR de la page $i\n";

  # Any failure aborts this page only, so stale output from the previous
  # page can never be written into the current one.
  my $ok = eval {

    # page extraction as an image
    run_command( 'ddjvu', '-format=tiff', '-mode=black', "-page=$i",
      $inputdjvu, $imagetmp );

    # tesseract appends ".hocr" to the output base name
    my $ocrbase = "$tmpdir/outocr$i";
    run_command( 'tesseract', $imagetmp, $ocrbase, '-l', $lang, 'hocr' );
    print "OCR done\n";

    # create djvused script from hOCR, then replace its first line so
    # that it selects the page being processed
    my @script = split /^/, run_command( 'hocr2djvused', "$ocrbase.hocr" );
    $script[0] = "select $i\n" if @script;

    open( my $fh, '>', $djvutmp ) or die "cannot write $djvutmp: $!\n";
    print {$fh} @script or die "cannot write $djvutmp: $!\n";
    close($fh) or die "cannot write $djvutmp: $!\n";

    # writing the text in the DJVU file
    run_command( 'djvused', $inputdjvu, '-f', $djvutmp, '-s' );
    1;
  };

  if ( !$ok ) {
    warn "page $i skipped: $@";
    $failed++;
  }
}

exit 1 if $failed;
	#!/usr/bin/perl

	# based on
	# https://en.wikisource.org/wiki/Help:DjVu_files/OCR_with_Tesseract

	# requirements:
	# external tools run by this script: djvused, ddjvu, tesseract, hocr2djvused
	# sudo apt install python2
	# sudo rm /usr/bin/python
	# sudo ln /usr/bin/python2 /usr/bin/python
	# sudo pip2 install lxml python-djvulibre

	# OCR every page of a DjVu file with Tesseract and write the resulting
	# text back into that file through djvused.
	#
	# Usage: OCR_Into_Djvu.pl FILE.djvu
	#
	# FILE.djvu is modified in place.  A missing argument prints a message on
	# STDERR and exits with status 1.  A page whose processing fails is
	# reported on STDERR and skipped; the script then exits with status 1
	# once all remaining pages have been handled.
	use strict;
	use warnings;
	use File::Temp qw(tempdir);

	# Run an external command WITHOUT a shell (so file names containing
	# quotes, spaces or shell metacharacters are passed through verbatim),
	# return everything it wrote to stdout, and die if it could not be
	# started or exited with a non-zero status.
	sub run_command {
	  my @cmd = @_;

	  open( my $out, '-|', @cmd ) or die "cannot run $cmd[0]: $!\n";
	  my $output = do { local $/; <$out> };

	  # close() on a pipe returns false when the child exited non-zero; in
	  # that case $! is 0 and the wait status is in $?.
	  close($out)
	    or die( $!
	      ? "error reading from $cmd[0]: $!\n"
	      : "$cmd[0] exited with status " . ( $? >> 8 ) . "\n" );

	  return defined $output ? $output : '';
	}

	# Language specification handed to tesseract's -l option.
	my $lang = "eng+rus";

	if ( @ARGV < 1 ) {
	  print STDERR "give a DJVU file as 1st argument \n";
	  exit 1;
	}

	my $inputdjvu = $ARGV[0];

	# Private scratch directory: avoids the fixed, predictable /tmp names
	# (which collide between concurrent runs) and is removed automatically
	# when the script ends.
	my $tmpdir   = tempdir( 'ocr_into_djvu.XXXXXX', TMPDIR => 1, CLEANUP => 1 );
	my $imagetmp = "$tmpdir/temp.tif";
	my $djvutmp  = "$tmpdir/outdjvu";

	print "processing of $inputdjvu\n";

	# calculate the number of pages
	my $nbpages = run_command( 'djvused', $inputdjvu, '-e', 'n' );
	$nbpages =~ s/\s+\z//;
	$nbpages =~ /\A\d+\z/
	  or die "unexpected page count from djvused: '$nbpages'\n";
	print "number of pages: $nbpages\n";

	my $failed = 0;

	for my $i ( 1 .. $nbpages ) {
	  print "OCR de la page $i\n";

	  # Any failure aborts this page only, so stale output from the previous
	  # page can never be written into the current one.
	  my $ok = eval {

	    # page extraction as an image
	    run_command( 'ddjvu', '-format=tiff', '-mode=black', "-page=$i",
	      $inputdjvu, $imagetmp );

	    # tesseract appends ".hocr" to the output base name
	    my $ocrbase = "$tmpdir/outocr$i";
	    run_command( 'tesseract', $imagetmp, $ocrbase, '-l', $lang, 'hocr' );
	    print "OCR done\n";

	    # create djvused script from hOCR, then replace its first line so
	    # that it selects the page being processed
	    my @script = split /^/, run_command( 'hocr2djvused', "$ocrbase.hocr" );
	    $script[0] = "select $i\n" if @script;

	    open( my $fh, '>', $djvutmp ) or die "cannot write $djvutmp: $!\n";
	    print {$fh} @script or die "cannot write $djvutmp: $!\n";
	    close($fh) or die "cannot write $djvutmp: $!\n";

	    # writing the text in the DJVU file
	    run_command( 'djvused', $inputdjvu, '-f', $djvutmp, '-s' );
	    1;
	  };

	  if ( !$ok ) {
	    warn "page $i skipped: $@";
	    $failed++;
	  }
	}

	exit 1 if $failed;