ryanfb/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Lace hOCR + PDF recombination

Use the lace branch of my fork of HocrConverter: https://github.com/ryanfb/HocrConverter/tree/lace (run git pull first to make sure you have the latest changes).
Download and compile jbig2enc inside your script path (the directory SCRIPT_PATH points to in lace2pdf.sh), then modify its pdf.py to use 300 dpi instead of 72.
Example run:
./lace2pdf.sh xenophon04xeno

Warning: hocrcombine.sh is a hack that does no XML parsing. I wrote it because hocr-combine in hocr-tools depends on a deprecated Python library, and this was easier than fixing that. If head or tail reports an "illegal option" error while it runs, your combined output is probably broken.

  
## hocrcombine.sh
#!/bin/bash
#
# hocrcombine.sh - merge several single-page hOCR files into one document.
#
# Usage: hocrcombine.sh PAGE.html [PAGE.html ...] > combined.html
#
# No XML parsing is done. Every input is assumed to start with HEADER_LENGTH
# lines of boilerplate and to end with FOOTER_LENGTH closing lines. The header
# and footer are copied from the FIRST file only; the lines in between are
# copied from every file, in argument order. Progress goes to stderr so that
# stdout carries nothing but the combined document.

HEADER_LENGTH=9
FOOTER_LENGTH=2

#######################################
# Write the combined document for the given files to stdout.
# Globals:   HEADER_LENGTH, FOOTER_LENGTH (read)
# Arguments: one or more hOCR files
# Returns:   0 on success, 64 when called without files, 1 when a file
#            cannot be read (nothing is written to stdout in either case)
#######################################
hocr_combine() {
  if (( $# == 0 )); then
    printf 'usage: %s PAGE.html [PAGE.html ...]\n' "${0##*/}" >&2
    return 64
  fi

  local page total first last header footer

  # Validate everything up front so a bad argument never leaves a
  # half-written document behind.
  for page in "$@"; do
    if [[ ! -f "$page" || ! -r "$page" ]]; then
      printf '%s: cannot read %s\n' "${0##*/}" "$page" >&2
      return 1
    fi
  done

  header=$(head -n "$HEADER_LENGTH" < "$1")
  footer=$(tail -n "$FOOTER_LENGTH" < "$1")

  # Quoted, so line breaks survive and tokens such as "<?xml" are never
  # treated as glob patterns.
  printf '%s\n' "$header"

  first=$(( HEADER_LENGTH + 1 ))
  for page in "$@"; do
    # awk's NR also counts a final line that lacks a trailing newline,
    # which "wc -l" would miss.
    total=$(awk 'END { print NR }' < "$page")
    last=$(( total - FOOTER_LENGTH ))
    if (( last < first )); then
      # Header and footer only: nothing to copy. Computing head/tail counts
      # here is what used to end in an "illegal option" error.
      printf '%s: %d lines, no body to copy\n' "$page" "$total" >&2
      continue
    fi
    printf '%s: copying lines %d-%d\n' "$page" "$first" "$last" >&2
    sed -n "${first},${last}p" < "$page"
  done

  printf '%s\n' "$footer"
}

# Must stay the last command: its return value is the script's exit status.
hocr_combine "$@"

## lace2pdf.sh
#!/bin/bash
#
# lace2pdf.sh - recombine Lace hOCR output with a book's page images into a
# single PDF that has a text layer.
#
# Usage: lace2pdf.sh BOOK_ID          (e.g. ./lace2pdf.sh xenophon04xeno)
#
# Expects in the current directory:
#   BOOK_ID.pdf            the page-image PDF
#   *_BOOK_ID_*.tar.gz     one or more archives; the last one in name order
#                          is used
# Writes BOOK_ID_ocr_merged.pdf, plus intermediate files (burst pages, JPEGs,
# BOOK_ID_combined.html, BOOK_ID_ocr_text.pdf, BOOK_ID_jbig2.pdf, jbig2/).
#
# NOTE: the BOOK_ID_*.pdf and *.jpg globs below also match files left behind
# by an earlier run, so start from a clean working directory.

# Directory holding hocrcombine.sh, mergepdf.py, Cardo104s.ttf, HocrConverter/
# and jbig2enc/. Can be overridden from the environment.
SCRIPT_PATH="${SCRIPT_PATH:-/Users/ryan/mess/current}"
# Not referenced anywhere in this script; presumably where the archives are
# downloaded from - TODO confirm before removing.
HEML_PATH="heml.mta.ca/lace/static/Tars"

# Print a message to stderr and abort the script.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

#######################################
# Print the last archive (in glob/name order) matching *_BOOK_*.tar.gz in
# the current directory.
# Arguments: $1 - book identifier
# Returns:   1 (and prints nothing) when no archive matches
#######################################
latest_archive() {
  local book=$1
  local -a found
  found=( *_"$book"_*.tar.gz )
  # An unmatched glob stays as the literal pattern, which does not exist.
  [[ -e "${found[0]}" ]] || return 1
  printf '%s\n' "${found[${#found[@]}-1]}"
}

#######################################
# Run the whole pipeline for one book.
# Globals:   SCRIPT_PATH (read)
# Arguments: $1 - book identifier
# Returns:   64 on a usage error; exits 1 via die() when a step fails
#######################################
lace2pdf() {
  if (( $# != 1 )) || [[ -z "$1" ]]; then
    printf 'usage: %s BOOK_ID\n' "${0##*/}" >&2
    return 64
  fi

  local book=$1 archive prefix

  [[ -f "${book}.pdf" ]] || die "missing ${book}.pdf"
  archive=$(latest_archive "$book") \
    || die "no archive matching *_${book}_*.tar.gz"

  # First archive member whose path contains "combined".
  prefix=$(tar tzf "$archive" | grep combined | head -n 1)
  [[ -n "$prefix" ]] || die "no 'combined' entry in $archive"

  tar xzvf "$archive" "$prefix" || die "cannot extract $prefix from $archive"

  pdftk "${book}.pdf" burst output "${book}_%04d.pdf" \
    || die "pdftk burst failed"

  "$SCRIPT_PATH/hocrcombine.sh" "$prefix"*.html > "${book}_combined.html" \
    || die "hocrcombine.sh failed"

  # One page image per burst PDF; ::: hands the glob to parallel directly
  # instead of parsing ls output.
  parallel -u --eta convert -density 300 {} {.}.jpg ::: "$book"_*.pdf \
    || die "page image conversion failed"

  python "$SCRIPT_PATH/HocrConverter/HocrConverter.py" -v \
    -i "${book}_combined.html" -o "${book}_ocr_text.pdf" \
    -f "$SCRIPT_PATH/Cardo104s.ttf" -m -V -c "$book"_*.jpg \
    || die "HocrConverter failed"

  mkdir -p jbig2 || die "cannot create jbig2/"
  # Subshell: the directory change cannot leak into the final merge step.
  (
    cd jbig2 || exit 1
    "$SCRIPT_PATH/jbig2enc/src/jbig2" -s -p -v ../*.jpg || exit 1
    python "$SCRIPT_PATH/jbig2enc/pdf.py" output > "../${book}_jbig2.pdf"
  ) || die "jbig2 encoding failed"

  python "$SCRIPT_PATH/mergepdf.py" \
    "${book}_jbig2.pdf" "${book}_ocr_text.pdf" "${book}_ocr_merged.pdf" \
    || die "mergepdf.py failed"
}

# Must stay the last command: its return value is the script's exit status.
lace2pdf "$@"

## mergepdf.py
#!/usr/bin/env python
from PyPDF2 import PdfFileWriter, PdfFileReader
# from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
import sys
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

def merge_pdfs(image_path, text_path, output_path):
	"""Overlay each page of the text PDF onto the matching image page.

	image_path  -- PDF whose pages are used as the base of every output page
	text_path   -- PDF whose pages are merged on top, matched by page index
	output_path -- file the merged PDF is written to

	When the image PDF has more pages than the text PDF, the last text page
	is reused for every remaining image page. Raises ValueError when there
	are image pages but the text PDF has no pages at all.
	"""
	# open() rather than the Python-2-only file() builtin. The inputs stay
	# open until the output has been written, in case the readers still pull
	# page data from them at write time (NOTE(review): confirm for PyPDF2).
	with open(image_path, "rb") as image_file, open(text_path, "rb") as text_file:
		existing_image_pdf = PdfFileReader(image_file)
		existing_text_pdf = PdfFileReader(text_file)

		image_length = existing_image_pdf.getNumPages()
		text_length = existing_text_pdf.getNumPages()
		if image_length > 0 and text_length == 0:
			raise ValueError("%s has no pages to overlay" % text_path)

		output = PdfFileWriter()
		# add the text onto the existing page image
		for i in range(image_length):
			image_page = existing_image_pdf.getPage(i)
			text_page = existing_text_pdf.getPage(min(i, text_length - 1))
			image_page.mergePage(text_page)
			output.addPage(image_page)

		# finally, write "output" to a real file
		with open(output_path, "wb") as output_stream:
			output.write(output_stream)


if __name__ == "__main__":
	if len(sys.argv) < 4:
		sys.exit("usage: %s IMAGE_PDF TEXT_PDF OUTPUT_PDF" % sys.argv[0])
	merge_pdfs(sys.argv[1], sys.argv[2], sys.argv[3])
	#!/bin/bash
	#
	# hocrcombine.sh - merge several single-page hOCR files into one document.
	#
	# Usage: hocrcombine.sh PAGE.html [PAGE.html ...] > combined.html
	#
	# No XML parsing is done. Every input is assumed to start with HEADER_LENGTH
	# lines of boilerplate and to end with FOOTER_LENGTH closing lines. The header
	# and footer are copied from the FIRST file only; the lines in between are
	# copied from every file, in argument order. Progress goes to stderr so that
	# stdout carries nothing but the combined document.

	HEADER_LENGTH=9
	FOOTER_LENGTH=2

	#######################################
	# Write the combined document for the given files to stdout.
	# Globals:   HEADER_LENGTH, FOOTER_LENGTH (read)
	# Arguments: one or more hOCR files
	# Returns:   0 on success, 64 when called without files, 1 when a file
	#            cannot be read (nothing is written to stdout in either case)
	#######################################
	hocr_combine() {
	  if (( $# == 0 )); then
	    printf 'usage: %s PAGE.html [PAGE.html ...]\n' "${0##*/}" >&2
	    return 64
	  fi

	  local page total first last header footer

	  # Validate everything up front so a bad argument never leaves a
	  # half-written document behind.
	  for page in "$@"; do
	    if [[ ! -f "$page" || ! -r "$page" ]]; then
	      printf '%s: cannot read %s\n' "${0##*/}" "$page" >&2
	      return 1
	    fi
	  done

	  header=$(head -n "$HEADER_LENGTH" < "$1")
	  footer=$(tail -n "$FOOTER_LENGTH" < "$1")

	  # Quoted, so line breaks survive and tokens such as "<?xml" are never
	  # treated as glob patterns.
	  printf '%s\n' "$header"

	  first=$(( HEADER_LENGTH + 1 ))
	  for page in "$@"; do
	    # awk's NR also counts a final line that lacks a trailing newline,
	    # which "wc -l" would miss.
	    total=$(awk 'END { print NR }' < "$page")
	    last=$(( total - FOOTER_LENGTH ))
	    if (( last < first )); then
	      # Header and footer only: nothing to copy.
	      printf '%s: %d lines, no body to copy\n' "$page" "$total" >&2
	      continue
	    fi
	    printf '%s: copying lines %d-%d\n' "$page" "$first" "$last" >&2
	    # A real pipeline/redirect here; the escaped "\|" in the garbled copy
	    # was passed to tail as a literal argument.
	    sed -n "${first},${last}p" < "$page"
	  done

	  printf '%s\n' "$footer"
	}

	# Must stay the last command: its return value is the script's exit status.
	hocr_combine "$@"
	#!/bin/bash
	#
	# lace2pdf.sh - recombine Lace hOCR output with a book's page images into a
	# single PDF that has a text layer.
	#
	# Usage: lace2pdf.sh BOOK_ID          (e.g. ./lace2pdf.sh xenophon04xeno)
	#
	# Expects in the current directory:
	#   BOOK_ID.pdf            the page-image PDF
	#   *_BOOK_ID_*.tar.gz     one or more archives; the last one in name order
	#                          is used
	# Writes BOOK_ID_ocr_merged.pdf, plus intermediate files (burst pages, JPEGs,
	# BOOK_ID_combined.html, BOOK_ID_ocr_text.pdf, BOOK_ID_jbig2.pdf, jbig2/).
	#
	# NOTE: the BOOK_ID_*.pdf and *.jpg globs below also match files left behind
	# by an earlier run, so start from a clean working directory.

	# Directory holding hocrcombine.sh, mergepdf.py, Cardo104s.ttf, HocrConverter/
	# and jbig2enc/. Can be overridden from the environment.
	SCRIPT_PATH="${SCRIPT_PATH:-/Users/ryan/mess/current}"
	# Not referenced anywhere in this script; presumably where the archives are
	# downloaded from - TODO confirm before removing.
	HEML_PATH="heml.mta.ca/lace/static/Tars"

	# Print a message to stderr and abort the script.
	die() {
	  printf '%s: %s\n' "${0##*/}" "$*" >&2
	  exit 1
	}

	#######################################
	# Print the last archive (in glob/name order) matching *_BOOK_*.tar.gz in
	# the current directory. (The garbled copy had lost both "*" wildcards.)
	# Arguments: $1 - book identifier
	# Returns:   1 (and prints nothing) when no archive matches
	#######################################
	latest_archive() {
	  local book=$1
	  local -a found
	  found=( *_"$book"_*.tar.gz )
	  # An unmatched glob stays as the literal pattern, which does not exist.
	  [[ -e "${found[0]}" ]] || return 1
	  printf '%s\n' "${found[${#found[@]}-1]}"
	}

	#######################################
	# Run the whole pipeline for one book.
	# Globals:   SCRIPT_PATH (read)
	# Arguments: $1 - book identifier
	# Returns:   64 on a usage error; exits 1 via die() when a step fails
	#######################################
	lace2pdf() {
	  if (( $# != 1 )) || [[ -z "$1" ]]; then
	    printf 'usage: %s BOOK_ID\n' "${0##*/}" >&2
	    return 64
	  fi

	  local book=$1 archive prefix

	  [[ -f "${book}.pdf" ]] || die "missing ${book}.pdf"
	  archive=$(latest_archive "$book") \
	    || die "no archive matching *_${book}_*.tar.gz"

	  # First archive member whose path contains "combined".
	  prefix=$(tar tzf "$archive" | grep combined | head -n 1)
	  [[ -n "$prefix" ]] || die "no 'combined' entry in $archive"

	  tar xzvf "$archive" "$prefix" || die "cannot extract $prefix from $archive"

	  pdftk "${book}.pdf" burst output "${book}_%04d.pdf" \
	    || die "pdftk burst failed"

	  "$SCRIPT_PATH/hocrcombine.sh" "$prefix"*.html > "${book}_combined.html" \
	    || die "hocrcombine.sh failed"

	  # One page image per burst PDF; ::: hands the glob to parallel directly
	  # instead of parsing ls output.
	  parallel -u --eta convert -density 300 {} {.}.jpg ::: "$book"_*.pdf \
	    || die "page image conversion failed"

	  python "$SCRIPT_PATH/HocrConverter/HocrConverter.py" -v \
	    -i "${book}_combined.html" -o "${book}_ocr_text.pdf" \
	    -f "$SCRIPT_PATH/Cardo104s.ttf" -m -V -c "$book"_*.jpg \
	    || die "HocrConverter failed"

	  mkdir -p jbig2 || die "cannot create jbig2/"
	  # Subshell: the directory change cannot leak into the final merge step.
	  (
	    cd jbig2 || exit 1
	    "$SCRIPT_PATH/jbig2enc/src/jbig2" -s -p -v ../*.jpg || exit 1
	    python "$SCRIPT_PATH/jbig2enc/pdf.py" output > "../${book}_jbig2.pdf"
	  ) || die "jbig2 encoding failed"

	  python "$SCRIPT_PATH/mergepdf.py" \
	    "${book}_jbig2.pdf" "${book}_ocr_text.pdf" "${book}_ocr_merged.pdf" \
	    || die "mergepdf.py failed"
	}

	# Must stay the last command: its return value is the script's exit status.
	lace2pdf "$@"
	#!/usr/bin/env python
	from PyPDF2 import PdfFileWriter, PdfFileReader
	# from pyPdf import PdfFileWriter, PdfFileReader
	import StringIO
	import sys
	from reportlab.pdfgen import canvas
	from reportlab.lib.pagesizes import letter

	def merge_pdfs(image_path, text_path, output_path):
		"""Overlay each page of the text PDF onto the matching image page.

		image_path  -- PDF whose pages are used as the base of every output page
		text_path   -- PDF whose pages are merged on top, matched by page index
		output_path -- file the merged PDF is written to

		When the image PDF has more pages than the text PDF, the last text page
		is reused for every remaining image page. Raises ValueError when there
		are image pages but the text PDF has no pages at all.
		"""
		# open() rather than the Python-2-only file() builtin. The inputs stay
		# open until the output has been written, in case the readers still pull
		# page data from them at write time (NOTE(review): confirm for PyPDF2).
		with open(image_path, "rb") as image_file, open(text_path, "rb") as text_file:
			existing_image_pdf = PdfFileReader(image_file)
			existing_text_pdf = PdfFileReader(text_file)

			image_length = existing_image_pdf.getNumPages()
			text_length = existing_text_pdf.getNumPages()
			if image_length > 0 and text_length == 0:
				raise ValueError("%s has no pages to overlay" % text_path)

			output = PdfFileWriter()
			# add the text onto the existing page image; the three statements
			# below form the loop body (their indentation was lost in this copy)
			for i in range(image_length):
				image_page = existing_image_pdf.getPage(i)
				text_page = existing_text_pdf.getPage(min(i, text_length - 1))
				image_page.mergePage(text_page)
				output.addPage(image_page)

			# finally, write "output" to a real file
			with open(output_path, "wb") as output_stream:
				output.write(output_stream)


	if __name__ == "__main__":
		if len(sys.argv) < 4:
			sys.exit("usage: %s IMAGE_PDF TEXT_PDF OUTPUT_PDF" % sys.argv[0])
		merge_pdfs(sys.argv[1], sys.argv[2], sys.argv[3])