Skip to content

Instantly share code, notes, and snippets.



Created Mar 17, 2014
What would you like to do?
Fragment of code used to process images with Tesseract OCR
def ocr_file(filename, languages, output_base, temp_dir):
    """Run Tesseract on one image and store its log, hOCR, text and word boxes.

    Steps performed:

    1. Launch ``tesseract`` on ``filename`` and keep its console output as a
       ``.log`` file.
    2. Read the hOCR file ``<temp_dir>/<output_base>.html`` and store it
       bz2-compressed as ``.html.bz2``.
    3. Join the text found under every ``<p>`` element into paragraphs
       separated by blank lines and store it as ``.txt.bz2``.
    4. Collect the bounding box of every ``.ocrx_word`` / ``.ocr_word``
       element, grouped by word, and store them together with the page width
       and height as ``.word_coordinates.json.bz2``.

    Args:
        filename: Path of the image handed to ``tesseract``.
        languages: Iterable of language codes; joined with ``+`` for ``-l``.
        output_base: Output base name handed to ``tesseract``; the hOCR file
            is expected at ``<temp_dir>/<output_base>.html``.
        temp_dir: Directory in which the hOCR output is looked up.

    Returns:
        None. Returns early (after logging a warning) when the page element
        carries no ``bbox`` entry, in which case no coordinates are stored.

    Raises:
        subprocess.CalledProcessError: If ``tesseract`` exits non-zero.
        AssertionError: If the hOCR does not contain exactly one page, the
            page box does not start at (0, 0), or a word title does not start
            with ``bbox``.

    NOTE(review): the published fragment was damaged (call prefixes and some
    statements were missing), so the following pieces are reconstructions to
    be confirmed against the original: the ``LOGGER.info`` calls, the
    ``default_storage.open`` / ``default_storage.save`` calls (assumed from
    the use of ``ContentFile``), ``stderr``/``cwd`` for the subprocess, the
    explicit lxml parser, and the ``break``/``else`` around the bbox search.
    ``item_id``, ``group`` and ``index`` are not parameters; they are expected
    to come from the enclosing scope, as in the original fragment.
    """
    LOGGER.info("Launching tesseract on %s", filename)
    # cwd=temp_dir so the relative output_base lands where it is read below;
    # stderr is merged so the saved log contains Tesseract's messages.
    output = subprocess.check_output(
        ['tesseract', filename, output_base,
         '-l', '+'.join(languages), TESSERACT_CONFIG],
        stderr=subprocess.STDOUT, cwd=temp_dir)

    # check_output returns bytes, so the log is written in binary mode.
    log_name = '%s/%s/%s.log' % (item_id, group, index)
    with default_storage.open(log_name, 'wb') as log_f:
        log_f.write(output)

    LOGGER.info("Processing hOCR output")
    hocr_file = os.path.join(temp_dir, '%s.html' % output_base)
    with open(hocr_file, 'rb') as f:
        hocr_bytes = f.read()

    default_storage.save('%s/%s/%s.html.bz2' % (item_id, group, index),
                         ContentFile(bz2.compress(hocr_bytes)))

    LOGGER.info("Extracting plain text")
    # Kludge: round-trip through unicode with "replace" so that invalid UTF-8
    # byte sequences in the OCR output cannot break the HTML parser.
    clean_bytes = hocr_bytes.decode("utf-8", "replace").encode("utf-8")
    html = lxml.html.document_fromstring(
        clean_bytes, parser=lxml.html.HTMLParser(encoding="utf-8"))

    # Extract the text for Solr: one entry per <p>, empty paragraphs dropped.
    paragraphs = [
        u" ".join(el.text for el in p.iterdescendants() if el.text).strip()
        for p in html.cssselect('p')
    ]
    text = u"\n\n".join(para for para in paragraphs if para)

    default_storage.save('%s/%s/%s.txt.bz2' % (item_id, group, index),
                         ContentFile(bz2.compress(text.encode("utf-8"))))

    LOGGER.info("Extracting word coordinates")
    pages = html.cssselect('.ocr_page')
    assert len(pages) == 1, "expected exactly one .ocr_page element"
    page_elem = pages[0]

    # The page title is a ';'-separated property list; one entry is expected
    # to look like "bbox x0 y0 x1 y1".
    page_info = [part.strip() for part in page_elem.attrib['title'].split(";")]
    for part in page_info:
        if part.startswith('bbox'):
            # A real list (not a lazy map object) so it can be indexed below.
            page_bbox = [int(value) for value in part.split()[1:5]]
            break
    else:
        LOGGER.warning('Page did not contain bounding box information - no word coordinates!')
        return

    assert page_bbox[0] == 0, "page bounding box must start at x=0"
    assert page_bbox[1] == 0, "page bounding box must start at y=0"
    page_width = page_bbox[2]
    page_height = page_bbox[3]

    # Map each recognised word to the list of boxes where it occurs.
    word_coords = defaultdict(list)
    for word_elem in html.cssselect('.ocrx_word,.ocr_word'):
        term = inner_text(word_elem)
        bbox = word_elem.attrib['title'].split()
        assert bbox[0] == 'bbox', "word title must start with 'bbox'"
        # Lists (not map objects) so the structure is JSON-serialisable.
        word_coords[term].append([int(value) for value in bbox[1:5]])

    coordinates = {"height": page_height, "width": page_width,
                   "words": word_coords}

    coord_file = "%s/%s/%s.word_coordinates.json.bz2" % (item_id, group, index)
    # bz2.compress needs bytes, so the JSON text is encoded first.
    coord_data = bz2.compress(simplejson.dumps(coordinates).encode("utf-8"))
    default_storage.save(coord_file, ContentFile(coord_data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment