Skip to content

Instantly share code, notes, and snippets.

@acdha
Created March 17, 2014 22:49
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save acdha/9610005 to your computer and use it in GitHub Desktop.
Save acdha/9610005 to your computer and use it in GitHub Desktop.
Fragment of code used to process images with Tesseract OCR
def ocr_file(filename, languages, output_base, temp_dir):
    """Run Tesseract on one image and store the derived OCR artifacts.

    Tesseract is launched with ``temp_dir`` as its working directory and its
    combined stdout/stderr is stored as a ``.log`` file. The hOCR document it
    writes is then stored bz2-compressed, followed by a plain-text extraction
    (one paragraph per ``<p>`` element) and a JSON document holding the page
    size and the bounding boxes of every recognised word.

    NOTE(review): ``item_id``, ``group`` and ``index`` (used to build the
    storage paths) as well as ``log``, ``OCR_STORAGE``, ``ContentFile``,
    ``TESSERACT_CONFIG``, ``UTF8_PARSER`` and ``inner_text`` are not defined
    in this fragment; they must be supplied by the enclosing scope.

    :param filename: path of the image handed to ``tesseract``
    :param languages: iterable of language codes, joined with ``+`` for ``-l``
    :param output_base: output base name handed to ``tesseract``; the hOCR
        result is read from ``<temp_dir>/<output_base>.html``
    :param temp_dir: working directory of the ``tesseract`` process
    :returns: ``None``. When the page carries no ``bbox`` a warning is logged
        and the word-coordinate file is not written.
    :raises subprocess.CalledProcessError: if ``tesseract`` exits non-zero
    :raises AssertionError: if the hOCR does not contain exactly one
        ``.ocr_page``, the page box does not start at ``0 0``, or a word
        element has no ``bbox``
    """
    # Every artifact of this page shares the same storage prefix.
    storage_prefix = '%s/%s/%s' % (item_id, group, index)

    log.info("Launching tesseract on %s", filename)
    output = subprocess.check_output(['tesseract', filename, output_base,
                                      '-l', '+'.join(languages),
                                      TESSERACT_CONFIG],
                                     cwd=temp_dir,
                                     stderr=subprocess.STDOUT)

    # check_output() returns bytes, so the log must be written in binary mode
    # (text mode only worked where str and bytes are the same type).
    with OCR_STORAGE.open(storage_prefix + '.log', 'wb') as log_f:
        log_f.write(output)

    log.info("Processing hOCR output")
    hocr_file = os.path.join(temp_dir, '%s.html' % output_base)
    with open(hocr_file, 'rb') as f:
        hocr_bytes = f.read()

    OCR_STORAGE.save(storage_prefix + '.html.bz2',
                     ContentFile(bz2.compress(hocr_bytes)))

    log.info("Extracting plain text")
    # Kludge around https://bugs.launchpad.net/ubuntu/+source/tesseract/+bug/1094145
    # Round-tripping through unicode replaces invalid byte sequences so the
    # parser always receives well-formed UTF-8.
    clean_bytes = hocr_bytes.decode("utf-8", "replace").encode("utf-8")
    html = lxml.html.document_fromstring(clean_bytes, parser=UTF8_PARSER)

    # Extract the text for Solr: one entry per non-empty paragraph.
    paragraphs = []
    for para in html.cssselect('p'):
        para_text = u" ".join(
            el.text for el in para.iterdescendants() if el.text).strip()
        if para_text:
            paragraphs.append(para_text)
    text = u"\n\n".join(paragraphs)

    OCR_STORAGE.save(storage_prefix + '.txt.bz2',
                     ContentFile(bz2.compress(text.encode("utf-8"))))

    log.info("Extracting word coordinates")
    pages = html.cssselect('.ocr_page')
    assert len(pages) == 1, "expected exactly one .ocr_page element"

    page_bbox = _parse_hocr_bbox(pages[0].attrib['title'])
    if page_bbox is None:
        # The rest of the function consistently uses ``log``; the original
        # called an otherwise unused ``LOGGER`` here.
        log.warning('Page did not contain bounding box information - no word coordinates!')
        return

    assert page_bbox[0] == 0, "page bbox does not start at x=0"
    assert page_bbox[1] == 0, "page bbox does not start at y=0"

    page_width = page_bbox[2]
    page_height = page_bbox[3]

    word_coords = defaultdict(list)
    for word in html.cssselect('.ocrx_word,.ocr_word'):
        term = inner_text(word)
        # Parse the title the same way as for the page so that trailing
        # properties (e.g. "bbox 1 2 3 4; x_wconf 90") do not break int().
        word_bbox = _parse_hocr_bbox(word.attrib['title'])
        assert word_bbox is not None, "word element without bbox"
        word_coords[term].append(word_bbox)

    coordinates = {"height": page_height, "width": page_width,
                   "words": word_coords}
    coord_file = storage_prefix + '.word_coordinates.json.bz2'
    # bz2.compress() needs bytes; dumps() may return text.
    coord_data = bz2.compress(simplejson.dumps(coordinates).encode("utf-8"))
    OCR_STORAGE.save(coord_file, ContentFile(coord_data))


def _parse_hocr_bbox(title):
    """Return the ``bbox`` values of an hOCR ``title`` attribute, or ``None``.

    ``title`` is treated as a ``;``-separated list of properties such as
    ``bbox 0 0 100 200; x_wconf 93``. The (up to) four tokens following the
    ``bbox`` keyword are converted to ``int`` and returned as a real list so
    the result can be indexed and JSON-serialised.
    """
    for prop in title.split(";"):
        fields = prop.split()
        if fields and fields[0] == 'bbox':
            return [int(value) for value in fields[1:5]]
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment