Skip to content

Instantly share code, notes, and snippets.

@fiee
Created September 30, 2011 08:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fiee/1253113 to your computer and use it in GitHub Desktop.
Save fiee/1253113 to your computer and use it in GitHub Desktop.
run GOCR and clean up afterwards
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Replace common (G)OCR errors (in German)
Usage: python ocerrors.py inputfile [outputfile]
"""
# The script runs unchanged on Python 2.7 and Python 3: it uses only
# single-argument print(...) calls, ``except ... as`` and io.open.
import io
import locale
import os
import re
import shutil
import sys

# Bodies of the character classes for German lower- and upper-case letters.
# They are unicode literals (not raw byte strings) so that the umlauts are
# text, not UTF-8 bytes, on Python 2 as well.
_LOWER = u'a-zäöüß'
_UPPER = u'A-ZÄÖÜ'

# (pattern, replacement) pairs, applied strictly in this order by clean_text().
RULES = (
    # --- combine lines -----------------------------------------------------
    # A line ending in sentence punctuation ends a paragraph: mark it.
    (r'([^.][.!?])\n', r'\1\n###\n'),
    # Blank lines separate paragraphs as well: mark them.
    (r'\n\n+', r'\n###\n'),
    # Every remaining line break is only a line wrap: join with a space.
    (r'\n+', u' '),
    # Turn the paragraph markers into one empty line.
    (r'###\s*', r'\n\n'),
    # --- typical (G)OCR errors ----------------------------------------------
    # Stray underscores are treated as errors and dropped, except next to
    # < or >, where the quote rules further down still need them.
    (r'(?<![<>])_(?![<>])', u''),
    (r'(\w)N([%s])' % _LOWER, r'\1v\2'),                    # N -> v
    (r'(\w)F([%s])' % _LOWER, r'\1f\2'),                    # F -> f
    (r'(\w)[5S]([%s])' % _LOWER, r'\1s\2'),                 # 5/S -> s
    (r'(\w)2([%s])' % _LOWER, r'\1z\2'),                    # 2 -> z
    (r'(\w)0([%s])' % _LOWER, r'\1o\2'),                    # 0 -> o
    (r'(\w)[1I]([%s ])' % _LOWER, r'\1l\2'),                # 1/I -> l
    (r'([%s])l([%s])' % (_UPPER, _UPPER), r'\1I\2'),        # l -> I
    (r'([%s])0([%s])' % (_UPPER, _UPPER), r'\1O\2'),        # 0 -> O
    (r'lc([^h])', r'k\1'),                                  # lc -> k
    (r'[1Il]C', u'K'),                                      # 1C/IC/lC -> K
    # o/O after digits is a zero.  One pass fixes only the first o of a run
    # ("1oo" -> "10o"), hence the rule is listed twice.
    (r'(\d+)[oO]', r'\g<1>0'),
    (r'(\d+)[oO]', r'\g<1>0'),
    (r'[ \t]+', u' '),                                      # collapse blanks
    # Two apostrophes inside a word are the dots of an ü.  A directly
    # preceding "u" is consumed as well, so "fu''r" and "f''r" both become
    # "für"; the lookarounds require word characters on both sides without
    # consuming them.
    (r"(?:u|(?<=\w))\s?'\s?'\s?(?=\w)", u'ü'),
    (u'(ie|ei|eu|au|äu|ai|rö)ss', u'\\1ß'),                 # manuscript ß
    (u'(<<|_<|<_|«)', u'„'),                                # opening quotes
    (u'\\s?(>>|_>|>_|»)', u'“'),                            # closing quotes
    (r'\n\. ?\n', u''),                                     # lone "." lines
)


def clean_text(text):
    """Return *text* with hyphenation removed and all RULES applied in order.

    *text* must be a unicode string.  Patterns are compiled with re.UNICODE
    only; the re.LOCALE flag is not used because Python 3 rejects it for
    str patterns.

    Raises re.error (with the offending pattern in the message) if a rule
    cannot be compiled or applied.
    """
    text = text.replace('-\n', '')  # remove hyphenation
    for pattern, replacement in RULES:
        try:
            text = re.compile(pattern, re.UNICODE).sub(replacement, text)
        except re.error as ex:
            raise re.error('%s (rule %r)' % (ex, pattern))
    return text


def backup_name(path):
    """Return the backup file name for *path*.

    A trailing ".txt" is replaced by ".bak"; any other name just gets ".bak"
    appended.  Only the end of the path is touched, so ".txt" inside a
    directory name is left alone.
    """
    suffix = '.txt'
    if path.endswith(suffix):
        path = path[:-len(suffix)]
    return path + '.bak'


def main(argv=None):
    """Run the clean-up as a command line program and return the exit status.

    argv defaults to sys.argv.  Exit status: 0 on success, 1 if no input file
    was given, 2 if the input is not a file, 3 if a replacement rule failed.
    Without an output file name the input is renamed to its backup name and
    the cleaned text is written under the original name.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print(__doc__)
        return 1
    infilename = argv[1]
    if not os.path.isfile(infilename):
        print(u"%s is not a file!" % infilename)
        return 2

    # Decode explicitly as UTF-8 (the encoding the text is written back in);
    # text mode also normalises \r\n and \r line ends to \n.
    with io.open(infilename, 'r', encoding='utf-8') as infile:
        text = infile.read()

    # Process before touching any file, so that a failing rule does not
    # leave the input renamed without a result.
    try:
        text = clean_text(text)
    except re.error as ex:
        print(ex)
        return 3

    if len(argv) > 2:
        print(argv)
        outfilename = argv[2]
    else:
        shutil.move(infilename, backup_name(infilename))
        outfilename = infilename

    with open(outfilename, 'wb') as outfile:
        outfile.write(text.encode('utf-8'))
    return 0


if __name__ == '__main__':
    sys.exit(main())
#!/bin/bash
# Run GOCR over the PBM scans in pbm/, then clean up, spell-check and open
# the recognised text.
#
# Usage: <this script> [prefix]
#   prefix  only scans named pbm/<prefix>*.pbm are processed and the text
#           goes to <prefix>.txt; without a prefix every pbm/*.pbm is
#           processed and the text goes to noname.txt.
prefix=${1:-noname}
inputfilter=$prefix
if [ "$prefix" = "noname" ]
then
    inputfilter=
fi
outputfile=$prefix.txt
found=0
for scan in pbm/"$inputfilter"*.pbm
do
    # An unmatched glob is left as the literal pattern; never pass that to gocr.
    [ -e "$scan" ] || continue
    found=1
    echo "processing $scan to $outputfile"
    # Output is appended so that all pages end up in one text file.
    # NOTE(review): the meaning of -l/-d/-m/-p is not visible here; see gocr(1).
    gocr -i "$scan" -f UTF8 -l 0 -d -1 -m 130 -p ~/.db/ >> "$outputfile"
done
if [ "$found" -eq 0 ]
then
    echo "no scans match pbm/$inputfilter*.pbm" >&2
    exit 1
fi
python ocerrors.py "$outputfile"
aspell check "$outputfile" -lde
edit "$outputfile"
#!/bin/bash
# Convert every *.tif in the current directory to pbm/<name>.pbm and move the
# originals to tif/.
# gocr understands only PBM pictures
# -p: do not fail when the directories already exist from an earlier run.
mkdir -p tif pbm
for tif in *.tif
do
    # An unmatched glob is left as the literal "*.tif"; skip it.
    [ -e "$tif" ] || continue
    pbm=${tif%tif}pbm
    echo "converting $tif to $pbm ..."
    convert "$tif" "pbm/$pbm"
    # As before, the original is moved whether or not the conversion succeeded.
    mv -- "$tif" tif/
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment