# Created September 12, 2019
# hOCR output parsing from pytesseract
#!/usr/bin/env python
# coding: utf-8
# dependencies
import pytesseract
from bs4 import BeautifulSoup
from PIL import Image
import re
import numpy as np
import math
import cv2
import itertools
from collections import namedtuple
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# utility methods
def pre_process(image):
    """Clean a single page image before it is handed to OCR.

    The image is converted to greyscale, binarised at a fixed threshold and
    then morphologically closed with a 2x2 kernel.

    Args:
        image: A PIL image (anything supporting ``convert('L')``).

    Returns:
        A new PIL image built from the cleaned array; the input image is
        not modified.
    """
    # Read in as greyscale
    grey = np.array(image.convert('L'))
    # Binarise: values above 127 become 255, everything else 0.
    # The first return value (the threshold actually used) is not needed.
    _, binary = cv2.threshold(grey, 127, 255, cv2.THRESH_BINARY)
    # Invert so that dark pixels become the bright foreground the
    # morphological operation works on.
    inverted = 255 - binary
    # Closing = dilation followed by erosion
    kernel = np.ones((2, 2), np.uint8)
    closed = cv2.morphologyEx(inverted, cv2.MORPH_CLOSE, kernel)
    # Undo the inversion before handing the result back
    return Image.fromarray(255 - closed)
def gray2rgb(image):
    """Return ``image`` as an array converted with ``cv2.COLOR_GRAY2RGB``.

    Args:
        image: Anything ``np.array`` accepts (the callers in this file pass
            PIL images and numpy arrays).

    Returns:
        The numpy array produced by ``cv2.cvtColor``.
    """
    return cv2.cvtColor(np.array(image), cv2.COLOR_GRAY2RGB)
# main methods
# Rectangle described by its four edges. ``tag_bbox`` fills the fields, in this
# order, from the four integers of an hOCR ``bbox`` entry.
BoundingBox = namedtuple('BoundingBox', 'left top right bottom')
class Word:
    """A single word parsed from an hOCR ``ocrx_word`` element."""

    # Class-level defaults; every instance overwrites them in __init__.
    text = None  # text content of the tag
    conf = None  # value returned by tag_conf(tag)
    bbox = None  # value returned by tag_bbox(tag)
    id = None    # the tag's id attribute with the 'word_' prefix removed

    def __init__(self, tag):
        """Build a word from a parsed hOCR tag.

        Args:
            tag: Parsed element exposing ``.text`` and ``.get(name)`` for
                its ``title`` and ``id`` attributes.
        """
        self.text = tag.text
        self.conf = tag_conf(tag)
        self.bbox = tag_bbox(tag)
        self.id = tag.get('id').replace('word_', '')

    def __str__(self):
        """Return the word's text."""
        return self.text

    def __repr__(self):
        """Return a short debugging label containing the word id."""
        return f'Word<{self.id}>'

    def get_numeric(self):
        """Return the word's text passed through ``convert_to_numeric``."""
        return convert_to_numeric(self.text)
class Block:
bbox = None
text = None
words = []
def __init__(self, word: Word):
self.bbox = word.bbox
self.text = word.text
self.words = [word]
def add(self, word):
self.bbox = BoundingBox(
min(word.bbox.left, self.bbox.left),
max(word.bbox.right, self.bbox.right),
max(word.bbox.bottom, self.bbox.bottom))
self.text += ' ' + word.text
def get_numeric(self):
return convert_to_numeric(self.text)
class Line:
    """One text line parsed from an hOCR ``ocr_line`` element."""

    # Class-level defaults; every instance overwrites them in __init__.
    bbox = None    # value returned by tag_bbox(tag)
    words = None   # list of Word, in document order
    blocks = None  # list of Block built from horizontally close words
    id = None      # the tag's id attribute with the 'line_' prefix removed

    def __init__(self, tag, h_toll=30):
        """Parse a line tag and group its words into blocks.

        Args:
            tag: Parsed element exposing ``.get(name)`` and
                ``.find_all(name, css_class)``.
            h_toll: Horizontal tolerance. A word whose left edge is less
                than this far from the current block's right edge joins
                that block; otherwise it starts a new one.
        """
        self.bbox = tag_bbox(tag)
        self.words = [Word(word) for word in tag.find_all('span', 'ocrx_word')]
        self.id = tag.get('id').replace('line_', '')
        self.blocks = self._group_words(self.words, h_toll)

    @staticmethod
    def _group_words(words, h_toll):
        """Split ``words`` into blocks wherever the horizontal gap is >= ``h_toll``."""
        blocks = []
        block = None
        for word in words:
            if block is not None and (word.bbox.left - block.bbox.right) < h_toll:
                block.add(word)
            else:
                # First word of the line, or the gap is too wide: new block.
                block = Block(word)
                blocks.append(block)
        return blocks

    def __repr__(self):
        """Return a short debugging label containing the line id."""
        return f'Line<{self.id}>'

    def __str__(self):
        """Return the line's words joined with single spaces."""
        return ' '.join([str(word) for word in self.words])

    def get_text(self):
        """Return the same text as ``str(self)``."""
        return str(self)
class Page:
    """A page image together with the lines pytesseract recognised on it."""

    # Class-level defaults; every instance overwrites them in __init__.
    image = None  # the image handed to the constructor
    bbox = None   # value returned by tag_bbox for the ocr_page element
    lines = None  # list of Line, in document order
    id = None     # caller-supplied identifier

    def __init__(self, image: 'Image.Image', id=0, h_toll=30):
        """Run OCR on ``image`` and parse the resulting hOCR.

        Args:
            image: Image passed to ``pytesseract.image_to_pdf_or_hocr``.
            id: Identifier stored on the page and shown by ``repr``.
            h_toll: Horizontal tolerance forwarded to every ``Line``.

        Raises:
            ValueError: If the hOCR output has no ``ocr_page`` element.
        """
        self.id = id
        self.image = image
        hocr = pytesseract.image_to_pdf_or_hocr(
            image, lang='eng', config='--oem 3 --psm 4', extension='hocr')
        soup = BeautifulSoup(hocr, 'html.parser')
        page = soup.find('div', 'ocr_page')
        if page is None:
            raise ValueError('hOCR output contains no ocr_page element')
        self.bbox = tag_bbox(page)
        self.lines = [Line(line, h_toll) for line in page.find_all('span', 'ocr_line')]

    def __repr__(self):
        """Return a short debugging label containing the page id."""
        return f'Page<{self.id}>'

    def get_words(self):
        """Yield every word of every line, in order."""
        for line in self.lines:
            yield from line.words

    def get_blocks(self):
        """Yield every block of every line, in order."""
        for line in self.lines:
            yield from line.blocks

    def plot_blocks(self):
        """Return a copy of the page with every block highlighted.

        Blocks whose text converts to a number use the first colour,
        (255, 0, 0); all other blocks use the second, (0, 255, 0).
        """
        img = gray2rgb(self.image)
        colors = [(255, 0, 0), (0, 255, 0)]
        for block in self.get_blocks():
            # isnan is True (index 1) when the block text is not numeric.
            color = colors[int(np.isnan(block.get_numeric()))]
            draw_rect(img, block.bbox, color, 0.2)
        return Image.fromarray(img)

    def plot_words(self):
        """Return a copy of the page with every word highlighted."""
        img = gray2rgb(self.image)
        for word in self.get_words():
            draw_rect(img, word.bbox, (100, 100, 0), 0.2)
        return Image.fromarray(img)

    def plot_lines(self):
        """Return a copy of the page with every line highlighted."""
        img = gray2rgb(self.image)
        for line in self.lines:
            draw_rect(img, line.bbox, (0, 100, 100), 0.2)
        return Image.fromarray(img)

    def get_text(self):
        """Return the text of all lines joined with newlines."""
        return '\n'.join([line.get_text() for line in self.lines])
def tag_bbox(tag):
    """Extract the bounding box from a tag's ``title`` attribute.

    Args:
        tag: Object exposing ``.get('title')``; the title must contain
            ``bbox`` followed by four space-separated non-negative integers.

    Returns:
        A ``BoundingBox`` built from the four integers, in the order they
        appear in the title.

    Raises:
        ValueError: If the title is missing or contains no ``bbox`` entry.
    """
    title = tag.get('title')
    match = re.search(r'bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)', title or '')
    if match is None:
        raise ValueError(f'no bbox entry in title: {title!r}')
    return BoundingBox(*(int(value) for value in match.groups()))
def tag_conf(tag):
    """Extract the ``x_wconf`` value from a tag's ``title`` attribute.

    Args:
        tag: Object exposing ``.get('title')``; the title must contain
            ``x_wconf`` followed by a non-negative integer.

    Returns:
        The integer that follows ``x_wconf``.

    Raises:
        ValueError: If the title is missing or contains no ``x_wconf`` entry.
    """
    title = tag.get('title')
    match = re.search(r'x_wconf ([0-9]+)', title or '')
    if match is None:
        raise ValueError(f'no x_wconf entry in title: {title!r}')
    return int(match.group(1))
def draw_rect(img, bb, color, alpha):
    """Draw a translucent filled rectangle with a solid outline onto ``img``.

    Args:
        img: Image array; it is modified in place.
        bb: Object with ``left``, ``top``, ``right`` and ``bottom`` attributes.
        color: Colour passed to ``cv2.rectangle``.
        alpha: Weight of the filled rectangle in the blend; the original
            pixels get weight ``1 - alpha``.

    Returns:
        The same ``img`` object that was passed in.
    """
    top_left = (bb.left, bb.top)
    bottom_right = (bb.right, bb.bottom)
    # Paint the fill on a copy, then blend that copy back into img.
    overlay = img.copy()
    cv2.rectangle(overlay, top_left, bottom_right, color, -1)
    cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0, img)
    # Outline of thickness 2, drawn after blending so it stays opaque.
    cv2.rectangle(img, top_left, bottom_right, color, 2)
    return img
# Characters dropped before parsing; '(' becomes '-' so that an
# accounting-style "(500)" parses as -500.
_NUMERIC_CLEANUP = str.maketrans(
    {' ': None, '$': None, '|': None, ',': None, ')': None, '(': '-'})


def convert_to_numeric(value):
    """Convert ``value`` to a float if possible.

    ``value`` is turned into a string, spaces and the characters ``$ | , )``
    are removed, ``(`` is replaced by ``-``, and the result is parsed with
    ``float``.

    Args:
        value: Any object; ``str(value)`` is what gets parsed.

    Returns:
        The parsed float, or ``numpy.nan`` when the cleaned text is not a
        valid float literal.
    """
    cleaned = str(value).translate(_NUMERIC_CLEANUP)
    try:
        return float(cleaned)
    except ValueError:
        return np.nan
# Sample pages; exactly one Image.open line is active at a time.
#image = Image.open('working/png/00053475_12.png')
#image = Image.open('working/png/00178090_12.png')
image = Image.open('working/png/00468115_12.png')
#image = Image.open('working/png/00477955_10.png')
#image = Image.open('working/png/00542515_12.png')
#image = Image.open('working/png/00553535_11.png')
#image = Image.open('working/png/00782931_11.png')
#image = Image.open('working/png/00030177_17.png')
#image = image.crop((200,200, 2250,3200))
# Run OCR on the selected image and parse the hOCR into lines/words/blocks.
page = Page(image)
#img = np.array(page.image)
#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
# In[19]:
#img = np.array(page.image)
#for block in page.get_blocks():
# draw_rect(img, block.bbox, (0,0,0), 1)
#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
#numeric_blocks = [block for block in page.get_blocks() if not np.isnan(block.get_numeric())]
# Paint every detected block with colour 0 at full opacity, then average the
# result along each axis to get one value per column and one per row.
img = np.array(page.image)
#for block in numeric_blocks:
for block in page.get_blocks():
    draw_rect(img, block.bbox, 0, 1)

# Reduce over dimension 0 -> a single row with one average per column.
h_concentration = np.reshape(
    cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S),
    page.bbox.right)
# Reduce over dimension 1 -> a single column with one average per row.
v_concentration = np.reshape(
    cv2.reduce(img, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32S),
    page.bbox.bottom)
#img = np.array(page.image)
# Visualise the two profiles as full-length lines blended over the page.
img = gray2rgb(page.image)
overlay = img.copy()
alpha = 0.4
threshold = 230

# One vertical line per column: (0, 255, 0) where the column average is at or
# below the threshold, colour 0 otherwise.
for col, mean in enumerate(h_concentration):
    stroke = (0, 255, 0) if int(mean) <= threshold else 0
    cv2.line(overlay, (col, 0), (col, page.bbox.bottom), stroke)
# NOTE(review): blending once after the loop is inferred - confirm placement.
cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0, img)

# One horizontal line per row: (255, 0, 0) where the row average is at or
# below the threshold, colour 0 otherwise.
for row, mean in enumerate(v_concentration):
    stroke = (255, 0, 0) if int(mean) <= threshold else 0
    cv2.line(overlay, (0, row), (page.bbox.right, row), stroke)
cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0, img)
#! pip install scikit-learn
from sklearn.cluster import KMeans
# Cluster the OCR lines by vertical position and tint each cluster.
# The centroid is the midpoint between a line's top and bottom edges.
y_centroids = [(line.bbox.top + line.bbox.bottom) // 2 for line in page.lines]
# Column vector: one sample per line, one feature per sample.
y_centroids = np.reshape(y_centroids, (-1, 1))
colors = [(255,0,0), (0,255,0), (0,0,255),(100,100,0),(100,0,100),(0,100,100),(100,100,100), (255,255,0), (0,255,255), (0,150,150), (150,150,150)]
# One colour per cluster, and never more clusters than there are lines.
n_clusters = min(len(colors), len(page.lines))
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(y_centroids)
img = gray2rgb(image)
for line, label in zip(page.lines, kmeans.labels_):
    draw_rect(img, line.bbox, colors[label], 0.2)
import os
import cv2
import imutils
# Exploratory version of pre_process_image + find_text_boxes, run inline.
img = np.array(image)
morph_size=(4, 12)
# Height window for accepted contours; same values as the defaults of
# find_text_boxes.
min_text_height_limit = 6
max_text_height_limit = 40
# Otsu threshold
img = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# dilate the text to make it solid spot
cpy = img.copy()
struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
img = ~cpy
contours = cv2.findContours(img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# The contour list is the second-to-last element of the returned tuple on
# both OpenCV 3 (3 values) and OpenCV 4 (2 values).
contours = contours[-2]
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
    box = cv2.boundingRect(contour)
    h = box[3]
    if min_text_height_limit < h < max_text_height_limit:
        boxes.append(box)
img = gray2rgb(image)
for box in boxes:
    # boundingRect yields (x, y, w, h); BoundingBox wants the four edges.
    b = BoundingBox(box[0],box[1], box[0]+box[2], box[1]+box[3])
    draw_rect(img, b, (255,0,0), 0.2)
def pre_process_image(img, morph_size=(8, 8)):
    """Binarise ``img`` and dilate its dark regions.

    Args:
        img: Image array passed straight to ``cv2.threshold`` (no colour
            conversion is done here).
        morph_size: Size of the rectangular structuring element used for
            the dilation.

    Returns:
        The thresholded image with the dilation applied to its inverse and
        the result inverted back.
    """
    # Binary threshold with the Otsu flag set
    _, binary = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Dilate the inverted image, then flip back
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    dilated = cv2.dilate(~binary, kernel, anchor=(-1, -1), iterations=1)
    return ~dilated
def find_text_boxes(pre, min_text_height_limit=6, max_text_height_limit=40):
    """Return the bounding rectangles of contours with a plausible text height.

    Args:
        pre: Image passed to ``cv2.findContours``.
        min_text_height_limit: Exclusive lower bound on the rectangle height.
        max_text_height_limit: Exclusive upper bound on the rectangle height.

    Returns:
        A list of the ``cv2.boundingRect`` results (indexed as
        ``(x, y, w, h)`` by the callers) whose height lies strictly between
        the two limits.
    """
    # Looking for the text spots contours
    found = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # The contour list is the second-to-last element of the returned tuple on
    # both OpenCV 3 (3 values) and OpenCV 4 (2 values).
    contours = found[-2]
    # Keep only the rectangles whose height fits the text size assumptions
    rects = (cv2.boundingRect(contour) for contour in contours)
    return [box for box in rects
            if min_text_height_limit < box[3] < max_text_height_limit]
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
    """Group boxes into table rows by their vertical position.

    Boxes whose ``y // cell_threshold`` is equal land in the same row. Rows
    with fewer than ``min_columns`` boxes are discarded.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` tuples.
        cell_threshold: Bucket size used to decide which boxes share a row.
        min_columns: Minimum number of boxes a row needs to be kept.

    Returns:
        A list of rows ordered by the ``y`` of each row's first cell; each
        row is a list of boxes in ascending tuple order (so by ``x`` first).
    """
    # Clustering the bounding boxes by their row bucket
    rows = {}
    for box in boxes:
        (_, y, _, _) = box
        rows.setdefault(y // cell_threshold, []).append(box)

    # Drop sparse rows and sort the cells of each remaining row by x coord
    table_cells = [sorted(cells) for cells in rows.values()
                   if len(cells) >= min_columns]
    # Sorting rows by the y coord of their first cell
    table_cells.sort(key=lambda row: row[0][1])
    return table_cells
def build_lines(table_cells):
    """Compute the grid lines that outline a table.

    Args:
        table_cells: Rows of ``(x, y, w, h)`` boxes, each row ordered left
            to right and the rows ordered top to bottom (the shape returned
            by ``find_table_in_boxes``). ``None`` and empty input are
            accepted.

    Returns:
        ``(hor_lines, ver_lines)``. Each is a list of ``(x1, y1, x2, y2)``
        tuples: one horizontal line along the top of every row plus one
        along the bottom of the table, and one vertical line at the left of
        every cell of the first row plus one along the right of the table.
    """
    if not table_cells:
        return [], []

    # Right edge of the table: the furthest any row's last cell reaches.
    max_x = max(row[-1][0] + row[-1][2] for row in table_cells)
    # Bottom edge of the table: the lowest point of any cell in the last row.
    max_y = max(box[1] + box[3] for box in table_cells[-1])

    first_row = table_cells[0]
    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
    ver_lines = [(box[0], box[1], box[0], max_y) for box in first_row]

    # Closing lines on the right and at the bottom of the table.
    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))
    hor_lines.append((first_row[0][0], max_y, max_x, max_y))
    return hor_lines, ver_lines
# if __name__ == "__main__":
# in_file = os.path.join("data", "page.jpg")
# pre_file = os.path.join("data", "pre.png")
# out_file = os.path.join("data", "out.png")
# img = cv2.imread(os.path.join(in_file))
# pre_processed = pre_process_image(img, pre_file)
# text_boxes = find_text_boxes(pre_processed)
# cells = find_table_in_boxes(text_boxes)
# hor_lines, ver_lines = build_lines(cells)
# # Visualize the result
# vis = img.copy()
# # for box in text_boxes:
# # (x, y, w, h) = box
# # cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)
# for line in hor_lines:
# [x1, y1, x2, y2] = line
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
# for line in ver_lines:
# [x1, y1, x2, y2] = line
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
# cv2.imwrite(out_file, vis)
# Detect the table grid on the loaded image and draw it.
img = np.array(image)
pre_processed = pre_process_image(img, morph_size=(10, 10))
text_boxes = find_text_boxes(
    pre_processed, min_text_height_limit=18, max_text_height_limit=100)
cells = find_table_in_boxes(text_boxes, cell_threshold=100, min_columns=2)
hor_lines, ver_lines = build_lines(cells)

vis = gray2rgb(img.copy())
# for box in text_boxes:
#     (x, y, w, h) = box
#     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)
# Horizontal lines first, then vertical ones, all in the same colour.
for x1, y1, x2, y2 in hor_lines + ver_lines:
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
