mndrake/hocr_parse.py

## hocr_parse.py
#!/usr/bin/env python
# coding: utf-8

# In[11]:


# dependencies
import pytesseract
from bs4 import BeautifulSoup
from PIL import Image
import re
import numpy as np
import math
import cv2
import itertools
from collections import namedtuple
import matplotlib.pyplot as plt

# Notebook-only: get_ipython() is neither defined nor imported in this file,
# so this line only works inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[12]:


# utility methods
def pre_process(image):
    """
    Clean up a page image before OCR.

    The image is converted to mode 'L' (greyscale), binarised with a fixed
    threshold of 127 and then morphologically closed with a 2x2 kernel.
    A new PIL image is returned.
    """
    grey = np.array(image.convert('L'))

    # Fixed-level binarisation; the first return value of cv2.threshold is
    # not needed here.
    _, binary = cv2.threshold(grey, 127, 255, cv2.THRESH_BINARY)

    # The closing is applied to the inverted image (dark pixels become the
    # bright foreground) and the inversion is undone afterwards.
    structuring_element = np.ones((2, 2), np.uint8)
    closed = cv2.morphologyEx(255 - binary, cv2.MORPH_CLOSE, structuring_element)

    return Image.fromarray(255 - closed)


def gray2rgb(image):
    """Return *image* as a NumPy array converted with ``cv2.COLOR_GRAY2RGB``."""
    return cv2.cvtColor(np.array(image), cv2.COLOR_GRAY2RGB)


# In[13]:


# main methods
# Rectangle described by four fields, in the same order as the four integers
# of an hOCR "bbox" entry (see tag_bbox).
BoundingBox = namedtuple('BoundingBox', 'left top right bottom')

class Word:
    """A single recognised word read from an hOCR word tag."""

    # Class-level placeholders; __init__ overwrites all four on the instance.
    text = None
    conf = None
    bbox = None
    id = None

    def __init__(self, tag):
        """Populate text, confidence, bounding box and id from *tag*."""
        self.text = tag.text
        self.conf = tag_conf(tag)
        self.bbox = tag_bbox(tag)
        # Keep only what follows the 'word_' prefix of the element id.
        raw_id = tag.get('id')
        self.id = raw_id.replace('word_', '')

    def __repr__(self):
        return 'Word<{}>'.format(self.id)

    def __str__(self):
        return self.text

    def get_numeric(self):
        """Return the word's text parsed by convert_to_numeric."""
        return convert_to_numeric(self.text)


class Block:
    """
    A group of words that is treated as one unit.

    Attributes:
        bbox: bounding box enclosing every word added so far.
        text: the words' text joined by single spaces.
        words: the member words in the order they were added.
    """

    bbox = None
    text = None
    # Not a list on purpose: a mutable class-level default would be one
    # object shared by every instance that had not assigned its own.
    words = None

    def __init__(self, word: 'Word'):
        """Start a block that contains only *word*."""
        self.bbox = word.bbox
        self.text = word.text
        self.words = [word]

    def add(self, word):
        """Append *word* and grow the bounding box so that it covers it."""
        self.bbox = BoundingBox(
            min(word.bbox.left, self.bbox.left),
            min(word.bbox.top, self.bbox.top),
            max(word.bbox.right, self.bbox.right),
            max(word.bbox.bottom, self.bbox.bottom))
        self.text += ' ' + word.text
        self.words.append(word)

    def get_numeric(self):
        """Return the block's joined text parsed by convert_to_numeric."""
        return convert_to_numeric(self.text)


class Line:
    """
    One OCR text line: its words and the blocks they are grouped into.

    Attributes:
        bbox: bounding box read from the line tag.
        words: Word objects built from the tag's ``ocrx_word`` spans.
        blocks: Block objects produced by grouping neighbouring words.
        id: the tag id with the 'line_' prefix removed.
    """

    bbox = None
    words = None
    blocks = None
    id = None

    def __init__(self, tag, h_toll=30):
        """
        Args:
            tag: parsed hOCR line element.
            h_toll: horizontal tolerance, in bbox coordinate units; a word
                whose left edge is less than this far from the right edge
                of the current block is merged into that block.
        """
        self.bbox = tag_bbox(tag)
        self.words = [Word(word) for word in tag.find_all('span', 'ocrx_word')]
        self.id = tag.get('id').replace('line_', '')
        self.blocks = self._group_words(self.words, h_toll)

    @staticmethod
    def _group_words(words, h_toll):
        """Group consecutive words whose horizontal gap is below *h_toll*."""
        # A line without any word spans yields no blocks instead of failing
        # on words[0].
        if not words:
            return []

        blocks = []
        block = Block(words[0])
        for word in words[1:]:
            if (word.bbox.left - block.bbox.right) < h_toll:
                block.add(word)
            else:
                blocks.append(block)
                block = Block(word)
        blocks.append(block)
        return blocks

    def __repr__(self):
        return f'Line<{self.id}>'

    def __str__(self):
        return ' '.join([str(word) for word in self.words])

    def get_text(self):
        """Return the line's words joined by single spaces."""
        return str(self)


class Page:
    """
    OCR result for a single page image.

    Constructing a Page runs Tesseract on the image, parses the returned
    hOCR and builds one Line per ``ocr_line`` span.

    Attributes:
        id: caller-supplied identifier (defaults to 0).
        image: the image that was passed in.
        bbox: bounding box read from the ``ocr_page`` element.
        lines: Line objects in the order find_all returns them.
    """

    image = None
    bbox = None
    lines = None
    id = None

    def __init__(self, image: 'Image.Image', id=0, h_toll=30):
        """
        Args:
            image: PIL image to recognise.
            id: identifier, used in ``repr``.
            h_toll: horizontal tolerance forwarded to every Line.

        Raises:
            ValueError: if the hOCR output has no ``ocr_page`` element.
        """
        self.id = id
        self.image = image
        hocr = pytesseract.image_to_pdf_or_hocr(image, lang='eng', config='--oem 3 --psm 4', extension='hocr')
        soup = BeautifulSoup(hocr, 'html.parser')
        page = soup.find('div', 'ocr_page')
        if page is None:
            raise ValueError('hOCR output contains no ocr_page element')
        self.bbox = tag_bbox(page)
        self.lines = [Line(line, h_toll) for line in page.find_all('span', 'ocr_line')]

    def __repr__(self):
        return f'Page<{self.id}>'

    def get_words(self):
        """Yield every word of every line, line by line."""
        for line in self.lines:
            yield from line.words

    def get_blocks(self):
        """Yield every block of every line, line by line."""
        for line in self.lines:
            yield from line.blocks

    def _plot(self, items, pick_color):
        """Draw each item's bbox on an RGB copy of the page image."""
        # NOTE(review): gray2rgb uses COLOR_GRAY2RGB, so self.image is
        # presumably single-channel — confirm.
        img = gray2rgb(self.image)
        for item in items:
            draw_rect(img, item.bbox, pick_color(item), 0.2)
        return Image.fromarray(img)

    def plot_blocks(self):
        """Return the page with blocks tinted: numeric (255,0,0), others (0,255,0)."""
        colors = [(255,0,0), (0,255,0)]
        return self._plot(
            self.get_blocks(),
            lambda block: colors[int(np.isnan(block.get_numeric()))])

    def plot_words(self):
        """Return the page with every word box tinted (100,100,0)."""
        return self._plot(self.get_words(), lambda word: (100,100,0))

    def plot_lines(self):
        """Return the page with every line box tinted (0,100,100)."""
        return self._plot(self.lines, lambda line: (0,100,100))

    def get_text(self):
        """Return the text of all lines joined by newlines."""
        return '\n'.join([line.get_text() for line in self.lines])


def tag_bbox(tag):
    """
    Read the bounding box from a tag's ``title`` attribute.

    The title is searched for ``bbox`` followed by four unsigned integers,
    which become left, top, right and bottom.

    Raises:
        ValueError: if the tag has no title or the title has no bbox entry.
    """
    title = tag.get('title')
    # A missing title is treated like a title without a bbox entry.
    match = re.search(r'bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)', title or '')
    if match is None:
        raise ValueError(f'no bbox entry in title: {title!r}')
    values = [int(x) for x in match.groups()]
    return BoundingBox(*values)


def tag_conf(tag):
    """
    Read the word confidence from a tag's ``title`` attribute.

    The title is searched for ``x_wconf`` followed by an unsigned integer,
    which is returned as an int.

    Raises:
        ValueError: if the tag has no title or the title has no x_wconf entry.
    """
    title = tag.get('title')
    # A missing title is treated like a title without an x_wconf entry.
    match = re.search(r'x_wconf ([0-9]+)', title or '')
    if match is None:
        raise ValueError(f'no x_wconf entry in title: {title!r}')
    return int(match.group(1))


def draw_rect(img, bb, color, alpha):
    """
    Draw the rectangle *bb* onto *img* in place and return *img*.

    A filled rectangle is blended in with weight *alpha*, then an outline of
    thickness 2 is drawn on top in the same colour.
    """
    top_left = (bb.left, bb.top)
    bottom_right = (bb.right, bb.bottom)

    # Fill on a copy so the fill can be blended with the untouched image.
    filled = img.copy()
    cv2.rectangle(filled, top_left, bottom_right, color, -1)
    cv2.addWeighted(filled, alpha, img, 1 - alpha, 0, img)

    cv2.rectangle(img, top_left, bottom_right, color, 2)
    return img


def convert_to_numeric(value):
    """
    Parse *value* as a float, tolerating common amount formatting.

    The value is converted with ``str`` and cleaned before parsing: spaces,
    "$", "|", "," and ")" are removed and "(" becomes a minus sign, so
    "(1,000)" parses as -1000.0.

    Returns:
        The parsed float, or ``numpy.nan`` if the cleaned text is not a
        valid number.
    """
    cleaned = (str(value).replace(" ", "").replace("$", "").replace("|", "")
               .replace(",", "").replace("(", "-").replace(")", ""))
    try:
        # Built-in float: the np.float alias no longer exists in current
        # NumPy, which made every call fall through to nan.
        return float(cleaned)
    except ValueError:
        return np.nan


# In[14]:


# Sample page to work on. Exactly one line is active; the other paths are
# kept commented out so they can be swapped in by hand.
#image = Image.open('working/png/00053475_12.png')
#image = Image.open('working/png/00178090_12.png')
image = Image.open('working/png/00468115_12.png')
#image = Image.open('working/png/00477955_10.png')
#image = Image.open('working/png/00542515_12.png')
#image = Image.open('working/png/00553535_11.png')
#image = Image.open('working/png/00782931_11.png')


# In[15]:


#image = Image.open('working/png/00030177_17.png')
#image = image.crop((200,200, 2250,3200))


# In[16]:


# Run OCR on the chosen image and parse the result into a Page
# (default id and h_toll).
page = Page(image)


# In[17]:


#print(page.get_text())
#page.image


# In[18]:


#img = np.array(page.image)
#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
#plt.plot(h_concentration[0])


# In[19]:


#page.plot_blocks()


# In[20]:


#img = np.array(page.image)

#for block in page.get_blocks():
#    draw_rect(img, block.bbox, (0,0,0), 1)


#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
#plt.plot(h_concentration[0])
#Image.fromarray(img)


# In[21]:


#numeric_blocks = [block for block in page.get_blocks() if not np.isnan(block.get_numeric())]

# Start from a canvas shaped like the page image with every pixel set to 255.
img = np.array(page.image)
img.fill(255)

# Paint every block's bounding box with colour 0 at alpha 1 (fully opaque).
# NOTE(review): a scalar colour presumably relies on the image being
# single-channel — confirm.
#for block in numeric_blocks:
for block in page.get_blocks():
    draw_rect(img, block.bbox, 0, 1)

#Image.fromarray(img)

# Average along dim 0 (one value per column), then flatten to 1-D.
# NOTE(review): the reshape assumes the image width equals page.bbox.right
# — confirm.
h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
h_concentration = np.reshape(h_concentration, page.bbox.right)
#plt.plot(h_concentration[0])

# Average along dim 1 (one value per row), then flatten to 1-D.
# NOTE(review): the reshape assumes the image height equals page.bbox.bottom
# — confirm.
v_concentration = cv2.reduce(img, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
v_concentration = np.reshape(v_concentration, page.bbox.bottom)
plt.plot(v_concentration)


# In[22]:


# Display the page with its blocks tinted (numeric and non-numeric blocks
# get different colours, see Page.plot_blocks).
page.plot_blocks()
#page.plot_words()


# In[23]:


#img = np.array(page.image)
img = gray2rgb(page.image)

overlay = img.copy()

alpha = 0.4
threshold = 230

# One vertical line per x position: colour 0 where the column average is
# above the threshold, (0,255,0) otherwise.
for i, x in enumerate(h_concentration):
   color = 0 if int(x) > threshold else (0,255,0)
   cv2.line(overlay, (i,0), (i,page.bbox.bottom), color)

cv2.addWeighted(overlay, alpha, img, 1-alpha, 0, img)

# One horizontal line per y position: colour 0 where the row average is
# above the threshold, (255,0,0) otherwise.
for i, x in enumerate(v_concentration):
    color = 0 if int(x) > threshold else (255,0,0)
    cv2.line(overlay, (0,i), (page.bbox.right,i), color)

# NOTE(review): overlay still holds the vertical lines drawn above, so they
# are blended into img a second time here — confirm this is intended.
cv2.addWeighted(overlay, alpha, img, 1-alpha, 0, img)

Image.fromarray(img)


# In[ ]:


# In[ ]:


# In[ ]:


# In[24]:


# Display the block overlay again; as a bare expression the returned image
# is only shown when this runs as a notebook cell.
page.plot_blocks()


# In[25]:


#! pip install scikit-learn


# In[ ]:


# In[26]:


from sklearn.cluster import KMeans


# In[27]:


# Vertical midpoint of every line's bounding box. The parentheses matter:
# without them only `bottom` is halved before `top` is added.
y_centroids = [(line.bbox.top + line.bbox.bottom) // 2 for line in page.lines]
# Reshape into a single-column 2-D array, one row per line, for KMeans.fit.
y_centroids = np.reshape(y_centroids, (-1,1))


# In[28]:


y_centroids.shape


# In[41]:


# Cluster the lines by vertical position. At most 11 clusters are requested,
# and never more than there are lines, since KMeans cannot fit more clusters
# than samples.
kmeans = KMeans(n_clusters=min(11, len(y_centroids)), random_state=0).fit(y_centroids)


# In[42]:


#page.lines


# In[43]:


#kmeans.labels_


# In[44]:


# Tint each line's bounding box with the colour of its k-means cluster.
img = gray2rgb(image)
# Indexed by cluster label; the list has 11 entries, so labels 0..10 are
# supported.
colors = [(255,0,0), (0,255,0), (0,0,255),(100,100,0),(100,0,100),(0,100,100),(100,100,100), (255,255,0), (0,255,255), (0,150,150), (150,150,150)]
# kmeans.labels_ is indexed by the line's position in page.lines.
for i, line in enumerate(page.lines):
    color = colors[kmeans.labels_[i]]
    draw_rect(img, line.bbox, color, 0.2)

Image.fromarray(img)


# In[33]:


import os
import cv2
import imutils


# In[49]:


img = np.array(image)
morph_size=(4, 12)
min_text_height_limit=20
max_text_height_limit=100

# Otsu threshold
# NOTE(review): with THRESH_OTSU set, OpenCV presumably chooses the
# threshold itself and the 250 is ignored — confirm.
img = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

# dilate the text to make it solid spot
# (the dilation is applied to the inverted image, which is flipped back after)
cpy = img.copy()
struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
img = ~cpy

# findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
# (image, contours, hierarchy) in 3.x; index -2 is the contour list in
# every case, whereas index 0 is the image under 3.x.
contours = cv2.findContours(img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours = contours[-2]

# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
    box = cv2.boundingRect(contour)
    h = box[3]

    if min_text_height_limit < h < max_text_height_limit:
        boxes.append(box)

img = gray2rgb(image)

# boundingRect gives (x, y, w, h); BoundingBox wants the two corners.
for box in boxes:
    b = BoundingBox(box[0],box[1], box[0]+box[2], box[1]+box[3])
    draw_rect(img, b, (255,0,0), 0.2)

Image.fromarray(img)


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[79]:


def pre_process_image(img, morph_size=(8, 8)):
    """
    Binarise *img* and dilate its inverse with a rectangular kernel.

    Args:
        img: image array accepted by ``cv2.threshold`` with THRESH_OTSU.
        morph_size: size of the rectangular structuring element.

    Returns:
        The thresholded image after inverting, dilating once and
        inverting back.
    """
    # NOTE(review): with THRESH_OTSU set, OpenCV presumably chooses the
    # threshold itself and the 250 is ignored — confirm.
    _, binary = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    # Dilate the inverted image so that the dark regions are the ones that
    # grow, then undo the inversion.
    dilated = cv2.dilate(~binary, kernel, anchor=(-1, -1), iterations=1)
    return ~dilated


def find_text_boxes(pre, min_text_height_limit=6, max_text_height_limit=40):
    """
    Find bounding boxes of contours whose height looks like text.

    Args:
        pre: image passed to ``cv2.findContours``.
        min_text_height_limit: boxes must be strictly taller than this.
        max_text_height_limit: boxes must be strictly shorter than this.

    Returns:
        List of ``(x, y, w, h)`` tuples from ``cv2.boundingRect``.
    """
    found = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; index -2 is the contour list in
    # every case, whereas index 0 is the image under 3.x.
    contours = found[-2]

    candidates = (cv2.boundingRect(contour) for contour in contours)
    return [box for box in candidates
            if min_text_height_limit < box[3] < max_text_height_limit]


def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
    """
    Group boxes into table rows by their quantised y coordinate.

    Args:
        boxes: iterable of ``(x, y, w, h)`` tuples.
        cell_threshold: bucket size; boxes whose ``y // cell_threshold``
            is equal end up in the same row.
        min_columns: rows with fewer boxes than this are dropped.

    Returns:
        List of rows ordered by the y of each row's first box; every row is
        a list of boxes in sorted (tuple) order, i.e. by x first.
    """
    # The previous column dictionary was written under the row key and never
    # read, so only rows are clustered here.
    rows = {}
    for box in boxes:
        (_x, y, _w, _h) = box
        rows.setdefault(y // cell_threshold, []).append(box)

    # Keep rows with enough cells, ordering the cells inside each row.
    table_cells = [sorted(row) for row in rows.values() if len(row) >= min_columns]
    # Order the rows top to bottom by their first cell.
    table_cells.sort(key=lambda r: r[0][1])

    return table_cells


def build_lines(table_cells):
    """
    Turn rows of ``(x, y, w, h)`` cells into grid line segments.

    Returns a pair ``(hor_lines, ver_lines)`` of ``(x1, y1, x2, y2)``
    tuples, or two empty lists when *table_cells* is None or empty.
    """
    if table_cells is None or len(table_cells) == 0:
        return [], []

    first_row = table_cells[0]
    last_row = table_cells[-1]

    # Right edge: taken from the row whose last cell is the widest.
    widest_tail = max(table_cells, key=lambda row: row[-1][2])[-1]
    max_x = widest_tail[0] + widest_tail[2]

    # Bottom edge: taken from the tallest cell of the last row.
    tallest = max(last_row, key=lambda cell: cell[3])
    max_y = tallest[1] + tallest[3]

    # One horizontal line at the top of each row, plus the closing bottom line.
    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
    hor_lines.append((first_row[0][0], max_y, max_x, max_y))

    # One vertical line at the left of each first-row cell, plus the closing
    # right line starting at the y of the first row's last cell.
    ver_lines = [(cell[0], cell[1], cell[0], max_y) for cell in first_row]
    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))

    return hor_lines, ver_lines


# if __name__ == "__main__":
#     in_file = os.path.join("data", "page.jpg")
#     pre_file = os.path.join("data", "pre.png")
#     out_file = os.path.join("data", "out.png")

#     img = cv2.imread(os.path.join(in_file))

#     pre_processed = pre_process_image(img, pre_file)
#     text_boxes = find_text_boxes(pre_processed)
#     cells = find_table_in_boxes(text_boxes)
#     hor_lines, ver_lines = build_lines(cells)

#     # Visualize the result
#     vis = img.copy()

#     # for box in text_boxes:
#     #     (x, y, w, h) = box
#     #     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

#     for line in hor_lines:
#         [x1, y1, x2, y2] = line
#         cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

#     for line in ver_lines:
#         [x1, y1, x2, y2] = line
#         cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

#     cv2.imwrite(out_file, vis)


# In[91]:


# Detect a table grid on the sample page and draw it.
img = np.array(image)

# Kernel size, height limits and cell threshold are all overridden here
# rather than left at the function defaults.
pre_processed = pre_process_image(img, morph_size=(10, 10))
text_boxes = find_text_boxes(pre_processed, min_text_height_limit=18, max_text_height_limit=100)
cells = find_table_in_boxes(text_boxes,cell_threshold=100, min_columns=2)
hor_lines, ver_lines = build_lines(cells)

# NOTE(review): gray2rgb uses COLOR_GRAY2RGB, so img is presumably
# single-channel — confirm.
vis = gray2rgb(img.copy())

# for box in text_boxes:
#     (x, y, w, h) = box
#     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

# Draw every grid segment in colour (0, 0, 255) with thickness 1.
for line in hor_lines:
    [x1, y1, x2, y2] = line
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

for line in ver_lines:
    [x1, y1, x2, y2] = line
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

Image.fromarray(vis)


# In[ ]:
	#!/usr/bin/env python
	# coding: utf-8

	# In[11]:


	# dependencies
	import pytesseract
	from bs4 import BeautifulSoup
	from PIL import Image
	import re
	import numpy as np
	import math
	import cv2
	import itertools
	from collections import namedtuple
	import matplotlib.pyplot as plt

	# Notebook-only: get_ipython() is neither defined nor imported in this file,
	# so this line only works inside an IPython/Jupyter session.
	get_ipython().run_line_magic('matplotlib', 'inline')


	# In[12]:


	# utility methods
	def pre_process(image):
	    """
	    Clean up a page image before OCR.

	    The image is converted to mode 'L' (greyscale), binarised with a fixed
	    threshold of 127 and then morphologically closed with a 2x2 kernel.
	    A new PIL image is returned.
	    """
	    grey = np.array(image.convert('L'))

	    # Fixed-level binarisation; the first return value of cv2.threshold is
	    # not needed here.
	    _, binary = cv2.threshold(grey, 127, 255, cv2.THRESH_BINARY)

	    # The closing is applied to the inverted image (dark pixels become the
	    # bright foreground) and the inversion is undone afterwards.
	    structuring_element = np.ones((2, 2), np.uint8)
	    closed = cv2.morphologyEx(255 - binary, cv2.MORPH_CLOSE, structuring_element)

	    return Image.fromarray(255 - closed)


	def gray2rgb(image):
	    """Return *image* as a NumPy array converted with ``cv2.COLOR_GRAY2RGB``."""
	    return cv2.cvtColor(np.array(image), cv2.COLOR_GRAY2RGB)


	# In[13]:


	# main methods
	# Rectangle described by four fields, in the same order as the four integers
	# of an hOCR "bbox" entry (see tag_bbox).
	BoundingBox = namedtuple('BoundingBox', 'left top right bottom')

	class Word:
	    """A single recognised word read from an hOCR word tag."""

	    # Class-level placeholders; __init__ overwrites all four on the instance.
	    text = None
	    conf = None
	    bbox = None
	    id = None

	    def __init__(self, tag):
	        """Populate text, confidence, bounding box and id from *tag*."""
	        self.text = tag.text
	        self.conf = tag_conf(tag)
	        self.bbox = tag_bbox(tag)
	        # Keep only what follows the 'word_' prefix of the element id.
	        self.id = tag.get('id').replace('word_', '')

	    def __str__(self):
	        return self.text

	    def __repr__(self):
	        return f'Word<{self.id}>'

	    def get_numeric(self):
	        """Return the word's text parsed by convert_to_numeric."""
	        return convert_to_numeric(self.text)


	class Block:
	    """
	    A group of words that is treated as one unit.

	    Attributes:
	        bbox: bounding box enclosing every word added so far.
	        text: the words' text joined by single spaces.
	        words: the member words in the order they were added.
	    """

	    bbox = None
	    text = None
	    # Not a list on purpose: a mutable class-level default would be one
	    # object shared by every instance that had not assigned its own.
	    words = None

	    def __init__(self, word: 'Word'):
	        """Start a block that contains only *word*."""
	        self.bbox = word.bbox
	        self.text = word.text
	        self.words = [word]

	    def add(self, word):
	        """Append *word* and grow the bounding box so that it covers it."""
	        self.bbox = BoundingBox(
	            min(word.bbox.left, self.bbox.left),
	            min(word.bbox.top, self.bbox.top),
	            max(word.bbox.right, self.bbox.right),
	            max(word.bbox.bottom, self.bbox.bottom))
	        self.text += ' ' + word.text
	        self.words.append(word)

	    def get_numeric(self):
	        """Return the block's joined text parsed by convert_to_numeric."""
	        return convert_to_numeric(self.text)


	class Line:
	    """
	    One OCR text line: its words and the blocks they are grouped into.

	    Attributes:
	        bbox: bounding box read from the line tag.
	        words: Word objects built from the tag's ``ocrx_word`` spans.
	        blocks: Block objects produced by grouping neighbouring words.
	        id: the tag id with the 'line_' prefix removed.
	    """

	    bbox = None
	    words = None
	    blocks = None
	    id = None

	    def __init__(self, tag, h_toll=30):
	        """
	        Args:
	            tag: parsed hOCR line element.
	            h_toll: horizontal tolerance, in bbox coordinate units; a word
	                whose left edge is less than this far from the right edge
	                of the current block is merged into that block.
	        """
	        self.bbox = tag_bbox(tag)
	        self.words = [Word(word) for word in tag.find_all('span', 'ocrx_word')]
	        self.id = tag.get('id').replace('line_', '')
	        self.blocks = self._group_words(self.words, h_toll)

	    @staticmethod
	    def _group_words(words, h_toll):
	        """Group consecutive words whose horizontal gap is below *h_toll*."""
	        # A line without any word spans yields no blocks instead of failing
	        # on words[0].
	        if not words:
	            return []

	        blocks = []
	        block = Block(words[0])
	        for word in words[1:]:
	            if (word.bbox.left - block.bbox.right) < h_toll:
	                block.add(word)
	            else:
	                blocks.append(block)
	                block = Block(word)
	        blocks.append(block)
	        return blocks

	    def __repr__(self):
	        return f'Line<{self.id}>'

	    def __str__(self):
	        return ' '.join([str(word) for word in self.words])

	    def get_text(self):
	        """Return the line's words joined by single spaces."""
	        return str(self)


	class Page:
	    """
	    OCR result for a single page image.

	    Constructing a Page runs Tesseract on the image, parses the returned
	    hOCR and builds one Line per ``ocr_line`` span.

	    Attributes:
	        id: caller-supplied identifier (defaults to 0).
	        image: the image that was passed in.
	        bbox: bounding box read from the ``ocr_page`` element.
	        lines: Line objects in the order find_all returns them.
	    """

	    image = None
	    bbox = None
	    lines = None
	    id = None

	    def __init__(self, image: 'Image.Image', id=0, h_toll=30):
	        """
	        Args:
	            image: PIL image to recognise.
	            id: identifier, used in ``repr``.
	            h_toll: horizontal tolerance forwarded to every Line.

	        Raises:
	            ValueError: if the hOCR output has no ``ocr_page`` element.
	        """
	        self.id = id
	        self.image = image
	        hocr = pytesseract.image_to_pdf_or_hocr(image, lang='eng', config='--oem 3 --psm 4', extension='hocr')
	        soup = BeautifulSoup(hocr, 'html.parser')
	        page = soup.find('div', 'ocr_page')
	        if page is None:
	            raise ValueError('hOCR output contains no ocr_page element')
	        self.bbox = tag_bbox(page)
	        self.lines = [Line(line, h_toll) for line in page.find_all('span', 'ocr_line')]

	    def __repr__(self):
	        return f'Page<{self.id}>'

	    def get_words(self):
	        """Yield every word of every line, line by line."""
	        for line in self.lines:
	            yield from line.words

	    def get_blocks(self):
	        """Yield every block of every line, line by line."""
	        for line in self.lines:
	            yield from line.blocks

	    def _plot(self, items, pick_color):
	        """Draw each item's bbox on an RGB copy of the page image."""
	        # NOTE(review): gray2rgb uses COLOR_GRAY2RGB, so self.image is
	        # presumably single-channel — confirm.
	        img = gray2rgb(self.image)
	        for item in items:
	            draw_rect(img, item.bbox, pick_color(item), 0.2)
	        return Image.fromarray(img)

	    def plot_blocks(self):
	        """Return the page with blocks tinted: numeric (255,0,0), others (0,255,0)."""
	        colors = [(255,0,0), (0,255,0)]
	        return self._plot(
	            self.get_blocks(),
	            lambda block: colors[int(np.isnan(block.get_numeric()))])

	    def plot_words(self):
	        """Return the page with every word box tinted (100,100,0)."""
	        return self._plot(self.get_words(), lambda word: (100,100,0))

	    def plot_lines(self):
	        """Return the page with every line box tinted (0,100,100)."""
	        return self._plot(self.lines, lambda line: (0,100,100))

	    def get_text(self):
	        """Return the text of all lines joined by newlines."""
	        return '\n'.join([line.get_text() for line in self.lines])


	def tag_bbox(tag):
	    """
	    Read the bounding box from a tag's ``title`` attribute.

	    The title is searched for ``bbox`` followed by four unsigned integers,
	    which become left, top, right and bottom.

	    Raises:
	        ValueError: if the tag has no title or the title has no bbox entry.
	    """
	    title = tag.get('title')
	    # A missing title is treated like a title without a bbox entry.
	    match = re.search(r'bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)', title or '')
	    if match is None:
	        raise ValueError(f'no bbox entry in title: {title!r}')
	    values = [int(x) for x in match.groups()]
	    return BoundingBox(*values)


	def tag_conf(tag):
	    """
	    Read the word confidence from a tag's ``title`` attribute.

	    The title is searched for ``x_wconf`` followed by an unsigned integer,
	    which is returned as an int.

	    Raises:
	        ValueError: if the tag has no title or the title has no x_wconf entry.
	    """
	    title = tag.get('title')
	    # A missing title is treated like a title without an x_wconf entry.
	    match = re.search(r'x_wconf ([0-9]+)', title or '')
	    if match is None:
	        raise ValueError(f'no x_wconf entry in title: {title!r}')
	    return int(match.group(1))


	def draw_rect(img, bb, color, alpha):
	    """
	    Draw the rectangle *bb* onto *img* in place and return *img*.

	    A filled rectangle is blended in with weight *alpha*, then an outline of
	    thickness 2 is drawn on top in the same colour.
	    """
	    top_left = (bb.left, bb.top)
	    bottom_right = (bb.right, bb.bottom)

	    # Fill on a copy so the fill can be blended with the untouched image.
	    filled = img.copy()
	    cv2.rectangle(filled, top_left, bottom_right, color, -1)
	    cv2.addWeighted(filled, alpha, img, 1 - alpha, 0, img)

	    cv2.rectangle(img, top_left, bottom_right, color, 2)
	    return img


	def convert_to_numeric(value):
	    """
	    Parse *value* as a float, tolerating common amount formatting.

	    The value is converted with ``str`` and cleaned before parsing: spaces,
	    "$", "|", "," and ")" are removed and "(" becomes a minus sign, so
	    "(1,000)" parses as -1000.0.

	    Returns:
	        The parsed float, or ``numpy.nan`` if the cleaned text is not a
	        valid number.
	    """
	    cleaned = (str(value).replace(" ", "").replace("$", "").replace("|", "")
	               .replace(",", "").replace("(", "-").replace(")", ""))
	    try:
	        # Built-in float: the np.float alias no longer exists in current
	        # NumPy, which made every call fall through to nan.
	        return float(cleaned)
	    except ValueError:
	        return np.nan


	# In[14]:


	# Sample page to work on. Exactly one line is active; the other paths are
	# kept commented out so they can be swapped in by hand.
	#image = Image.open('working/png/00053475_12.png')
	#image = Image.open('working/png/00178090_12.png')
	image = Image.open('working/png/00468115_12.png')
	#image = Image.open('working/png/00477955_10.png')
	#image = Image.open('working/png/00542515_12.png')
	#image = Image.open('working/png/00553535_11.png')
	#image = Image.open('working/png/00782931_11.png')


	# In[15]:


	#image = Image.open('working/png/00030177_17.png')
	#image = image.crop((200,200, 2250,3200))


	# In[16]:


	# Run OCR on the chosen image and parse the result into a Page
	# (default id and h_toll).
	page = Page(image)


	# In[17]:


	#print(page.get_text())
	#page.image


	# In[18]:


	#img = np.array(page.image)
	#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
	#plt.plot(h_concentration[0])


	# In[19]:


	#page.plot_blocks()


	# In[20]:


	#img = np.array(page.image)

	#for block in page.get_blocks():
	# draw_rect(img, block.bbox, (0,0,0), 1)


	#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
	#plt.plot(h_concentration[0])
	#Image.fromarray(img)


	# In[21]:


	#numeric_blocks = [block for block in page.get_blocks() if not np.isnan(block.get_numeric())]

	# Start from a canvas shaped like the page image with every pixel set to 255.
	img = np.array(page.image)
	img.fill(255)

	# Paint every block's bounding box with colour 0 at alpha 1 (fully opaque).
	# NOTE(review): a scalar colour presumably relies on the image being
	# single-channel — confirm.
	#for block in numeric_blocks:
	for block in page.get_blocks():
	    draw_rect(img, block.bbox, 0, 1)

	#Image.fromarray(img)

	# Average along dim 0 (one value per column), then flatten to 1-D.
	# NOTE(review): the reshape assumes the image width equals page.bbox.right
	# — confirm.
	h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
	h_concentration = np.reshape(h_concentration, page.bbox.right)
	#plt.plot(h_concentration[0])

	# Average along dim 1 (one value per row), then flatten to 1-D.
	# NOTE(review): the reshape assumes the image height equals page.bbox.bottom
	# — confirm.
	v_concentration = cv2.reduce(img, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
	v_concentration = np.reshape(v_concentration, page.bbox.bottom)
	plt.plot(v_concentration)


	# In[22]:


	# Display the page with its blocks tinted (numeric and non-numeric blocks
	# get different colours, see Page.plot_blocks).
	page.plot_blocks()
	#page.plot_words()


	# In[23]:


	#img = np.array(page.image)
	img = gray2rgb(page.image)

	overlay = img.copy()

	alpha = 0.4
	threshold = 230

	# One vertical line per x position: colour 0 where the column average is
	# above the threshold, (0,255,0) otherwise.
	for i, x in enumerate(h_concentration):
	    color = 0 if int(x) > threshold else (0,255,0)
	    cv2.line(overlay, (i,0), (i,page.bbox.bottom), color)

	cv2.addWeighted(overlay, alpha, img, 1-alpha, 0, img)

	# One horizontal line per y position: colour 0 where the row average is
	# above the threshold, (255,0,0) otherwise.
	for i, x in enumerate(v_concentration):
	    color = 0 if int(x) > threshold else (255,0,0)
	    cv2.line(overlay, (0,i), (page.bbox.right,i), color)

	# NOTE(review): overlay still holds the vertical lines drawn above, so they
	# are blended into img a second time here — confirm this is intended.
	cv2.addWeighted(overlay, alpha, img, 1-alpha, 0, img)

	Image.fromarray(img)


	# In[ ]:





	# In[ ]:





	# In[ ]:





	# In[24]:


	# Display the block overlay again; as a bare expression the returned image
	# is only shown when this runs as a notebook cell.
	page.plot_blocks()


	# In[25]:


	#! pip install scikit-learn


	# In[ ]:





	# In[26]:


	from sklearn.cluster import KMeans


	# In[27]:


	# Vertical midpoint of every line's bounding box. The parentheses matter:
	# without them only `bottom` is halved before `top` is added.
	y_centroids = [(line.bbox.top + line.bbox.bottom) // 2 for line in page.lines]
	# Reshape into a single-column 2-D array, one row per line, for KMeans.fit.
	y_centroids = np.reshape(y_centroids, (-1,1))


	# In[28]:


	y_centroids.shape


	# In[41]:


	# Cluster the lines by vertical position. At most 11 clusters are requested,
	# and never more than there are lines, since KMeans cannot fit more clusters
	# than samples.
	kmeans = KMeans(n_clusters=min(11, len(y_centroids)), random_state=0).fit(y_centroids)


	# In[42]:


	#page.lines


	# In[43]:


	#kmeans.labels_


	# In[44]:


	# Tint each line's bounding box with the colour of its k-means cluster.
	img = gray2rgb(image)
	# Indexed by cluster label; the list has 11 entries, so labels 0..10 are
	# supported.
	colors = [(255,0,0), (0,255,0), (0,0,255),(100,100,0),(100,0,100),(0,100,100),(100,100,100), (255,255,0), (0,255,255), (0,150,150), (150,150,150)]
	# kmeans.labels_ is indexed by the line's position in page.lines.
	for i, line in enumerate(page.lines):
	    color = colors[kmeans.labels_[i]]
	    draw_rect(img, line.bbox, color, 0.2)

	Image.fromarray(img)


	# In[33]:


	import os
	import cv2
	import imutils


	# In[49]:


	img = np.array(image)
	morph_size=(4, 12)
	min_text_height_limit=20
	max_text_height_limit=100

	# Otsu threshold
	# NOTE(review): with THRESH_OTSU set, OpenCV presumably chooses the
	# threshold itself and the 250 is ignored — confirm.
	img = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

	# dilate the text to make it solid spot
	# (the dilation is applied to the inverted image, which is flipped back after)
	cpy = img.copy()
	struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
	cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
	img = ~cpy

	# findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
	# (image, contours, hierarchy) in 3.x; index -2 is the contour list in
	# every case, whereas index 0 is the image under 3.x.
	contours = cv2.findContours(img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
	contours = contours[-2]

	# Getting the texts bounding boxes based on the text size assumptions
	boxes = []
	for contour in contours:
	    box = cv2.boundingRect(contour)
	    h = box[3]

	    if min_text_height_limit < h < max_text_height_limit:
	        boxes.append(box)

	img = gray2rgb(image)

	# boundingRect gives (x, y, w, h); BoundingBox wants the two corners.
	for box in boxes:
	    b = BoundingBox(box[0],box[1], box[0]+box[2], box[1]+box[3])
	    draw_rect(img, b, (255,0,0), 0.2)

	Image.fromarray(img)


	# In[ ]:





	# In[ ]:





	# In[ ]:





	# In[ ]:





	# In[79]:



	def pre_process_image(img, morph_size=(8, 8)):
	    """
	    Binarise *img* and dilate its inverse with a rectangular kernel.

	    Args:
	        img: image array accepted by ``cv2.threshold`` with THRESH_OTSU.
	        morph_size: size of the rectangular structuring element.

	    Returns:
	        The thresholded image after inverting, dilating once and
	        inverting back.
	    """
	    # NOTE(review): with THRESH_OTSU set, OpenCV presumably chooses the
	    # threshold itself and the 250 is ignored — confirm.
	    _, binary = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

	    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
	    # Dilate the inverted image so that the dark regions are the ones that
	    # grow, then undo the inversion.
	    dilated = cv2.dilate(~binary, kernel, anchor=(-1, -1), iterations=1)
	    return ~dilated


	def find_text_boxes(pre, min_text_height_limit=6, max_text_height_limit=40):
	    """
	    Find bounding boxes of contours whose height looks like text.

	    Args:
	        pre: image passed to ``cv2.findContours``.
	        min_text_height_limit: boxes must be strictly taller than this.
	        max_text_height_limit: boxes must be strictly shorter than this.

	    Returns:
	        List of ``(x, y, w, h)`` tuples from ``cv2.boundingRect``.
	    """
	    found = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
	    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
	    # (image, contours, hierarchy) in 3.x; index -2 is the contour list in
	    # every case, whereas index 0 is the image under 3.x.
	    contours = found[-2]

	    candidates = (cv2.boundingRect(contour) for contour in contours)
	    return [box for box in candidates
	            if min_text_height_limit < box[3] < max_text_height_limit]


	def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
	    """
	    Group boxes into table rows by their quantised y coordinate.

	    Args:
	        boxes: iterable of ``(x, y, w, h)`` tuples.
	        cell_threshold: bucket size; boxes whose ``y // cell_threshold``
	            is equal end up in the same row.
	        min_columns: rows with fewer boxes than this are dropped.

	    Returns:
	        List of rows ordered by the y of each row's first box; every row is
	        a list of boxes in sorted (tuple) order, i.e. by x first.
	    """
	    # The previous column dictionary was written under the row key and never
	    # read, so only rows are clustered here.
	    rows = {}
	    for box in boxes:
	        (_x, y, _w, _h) = box
	        rows.setdefault(y // cell_threshold, []).append(box)

	    # Keep rows with enough cells, ordering the cells inside each row.
	    table_cells = [sorted(row) for row in rows.values() if len(row) >= min_columns]
	    # Order the rows top to bottom by their first cell.
	    table_cells.sort(key=lambda r: r[0][1])

	    return table_cells


	def build_lines(table_cells):
	    """
	    Turn rows of ``(x, y, w, h)`` cells into grid line segments.

	    Returns a pair ``(hor_lines, ver_lines)`` of ``(x1, y1, x2, y2)``
	    tuples, or two empty lists when *table_cells* is None or empty.
	    """
	    if table_cells is None or len(table_cells) == 0:
	        return [], []

	    first_row = table_cells[0]
	    last_row = table_cells[-1]

	    # Right edge: taken from the row whose last cell is the widest.
	    widest_tail = max(table_cells, key=lambda row: row[-1][2])[-1]
	    max_x = widest_tail[0] + widest_tail[2]

	    # Bottom edge: taken from the tallest cell of the last row.
	    tallest = max(last_row, key=lambda cell: cell[3])
	    max_y = tallest[1] + tallest[3]

	    # One horizontal line at the top of each row, plus the closing bottom line.
	    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
	    hor_lines.append((first_row[0][0], max_y, max_x, max_y))

	    # One vertical line at the left of each first-row cell, plus the closing
	    # right line starting at the y of the first row's last cell.
	    ver_lines = [(cell[0], cell[1], cell[0], max_y) for cell in first_row]
	    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))

	    return hor_lines, ver_lines


	# if __name__ == "__main__":
	# in_file = os.path.join("data", "page.jpg")
	# pre_file = os.path.join("data", "pre.png")
	# out_file = os.path.join("data", "out.png")

	# img = cv2.imread(os.path.join(in_file))

	# pre_processed = pre_process_image(img, pre_file)
	# text_boxes = find_text_boxes(pre_processed)
	# cells = find_table_in_boxes(text_boxes)
	# hor_lines, ver_lines = build_lines(cells)

	# # Visualize the result
	# vis = img.copy()

	# # for box in text_boxes:
	# # (x, y, w, h) = box
	# # cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

	# for line in hor_lines:
	# [x1, y1, x2, y2] = line
	# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

	# for line in ver_lines:
	# [x1, y1, x2, y2] = line
	# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

	# cv2.imwrite(out_file, vis)


	# In[91]:


	# Detect a table grid on the sample page and draw it.
	img = np.array(image)

	# Kernel size, height limits and cell threshold are all overridden here
	# rather than left at the function defaults.
	pre_processed = pre_process_image(img, morph_size=(10, 10))
	text_boxes = find_text_boxes(pre_processed, min_text_height_limit=18, max_text_height_limit=100)
	cells = find_table_in_boxes(text_boxes,cell_threshold=100, min_columns=2)
	hor_lines, ver_lines = build_lines(cells)

	# NOTE(review): gray2rgb uses COLOR_GRAY2RGB, so img is presumably
	# single-channel — confirm.
	vis = gray2rgb(img.copy())

	# for box in text_boxes:
	#     (x, y, w, h) = box
	#     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

	# Draw every grid segment in colour (0, 0, 255) with thickness 1.
	for line in hor_lines:
	    [x1, y1, x2, y2] = line
	    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

	for line in ver_lines:
	    [x1, y1, x2, y2] = line
	    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

	Image.fromarray(vis)


	# In[ ]: