simonmesmith/pytesseract_tablereader.md

## pytesseract_tablereader.md

      
    Raw
  

              pytesseract_tablereader.md
            
          
    Pytesseract tablereader

Here's some code that tries to solve a problem that I think I've identified with regard to converting tables in images into dataframes that we can work with programmatically.
Right now, I see two main solutions on the market. One is to use third-party APIs like Amazon's and Google's products in this area, which can get expensive at scale. Another is to use or build upon complex code that uses image processing libraries like OpenCV to find gridlines and use these to determine table rows and columns.
My hypothesis is that we can use only Pytesseract to read tables, since it provides coordinates of text in images, and tables follow a standard structure (rows and columns). I've been working on the code here accordingly.
Usage is very simple:

```python
img = "/path-to-your-image.jpg"
df = read(img)
```

  
## pytesseract_tablereader.py
import cv2
import numpy as np
from PIL import Image
import pandas as pd
import pytesseract
from sklearn.cluster import AgglomerativeClustering

# With thanks to these resources:
# - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/
# - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica

def read(image_path: str, distance_threshold=25.0) -> pd.DataFrame:
    """Read a table from an image into a dataframe using only Pytesseract.

    The image is preprocessed, OCR'd into word-level data, and the words are
    grouped into table rows by clustering their "top" coordinates. Within a
    row, words are ordered left to right and a new cell is started whenever
    a word has ``word_num == 1``.

    Args:
        image_path: Path to the image file containing the table.
        distance_threshold: Passed through to ``get_row_max_tops``, where it
            is used as the clustering distance threshold on "top" values.

    Returns:
        A dataframe with one row per detected table row and one column per
        detected cell. An empty dataframe is returned when no text is found.
    """

    # Preprocess the image.
    img = preprocess(image_path)

    # Read the image into a Pytesseract data frame.
    img_df = pytesseract.image_to_data(img, output_type="data.frame")

    # Drop any blank text. dropna() removes entries with missing text; the
    # strip() check also removes whitespace-only "words", which would
    # otherwise end up as empty fragments inside cells. Text is cast to str
    # first because pandas may parse an all-numeric text column as numbers,
    # which " ".join() below cannot handle (note that such values are
    # rendered the way pandas parsed them, e.g. 1.50 becomes "1.5").
    img_df = img_df.dropna()
    img_df = img_df.assign(text=img_df["text"].astype(str))
    img_df = img_df[img_df["text"].str.strip() != ""].copy()

    # Nothing was recognized, so there is no table to build.
    if img_df.empty:
        return pd.DataFrame()

    # Add row numbers to the dataframe. We cluster words according to their
    # "top" value and get the max "top" value for each row (sorted
    # ascending). A word belongs to the first row whose max top is >= the
    # word's own top, which is exactly what a left-sided binary search finds.
    row_max_tops = get_row_max_tops(img_df, distance_threshold)
    max_tops = [row_max_top for _, row_max_top in row_max_tops]
    img_df["row_number"] = np.searchsorted(
        max_tops, img_df["top"].to_numpy(), side="left")

    # Sort the dataframe by row number, left, and word_num so we can build
    # table content logically.
    img_df = img_df.sort_values(["row_number", "left", "word_num"])

    # Build the table content one row at a time. groupby keeps the sorted
    # order of the words within each row.
    table_content = []
    for _, row_words in img_df.groupby("row_number", sort=True):
        row_content = []
        cell_content = []
        for word_num, text in zip(row_words["word_num"], row_words["text"]):
            # A word numbered 1 marks the start of a new cell, so flush the
            # cell we have been accumulating (if any).
            if word_num == 1 and cell_content:
                row_content.append(" ".join(cell_content))
                cell_content = []
            cell_content.append(text)
        row_content.append(" ".join(cell_content))
        table_content.append(row_content)

    # Convert the table content to a dataframe, and return it.
    return pd.DataFrame(table_content)

def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
    """Cluster words into rows by "top" value and return each row's max top.

    Args:
        img_df: Dataframe with a numeric "top" column, one entry per word.
        distance_threshold: Distance threshold handed to
            ``AgglomerativeClustering``; with complete linkage and Manhattan
            distance, clusters further apart than this are not merged.

    Returns:
        A list of ``(row_index, row_max_top)`` tuples sorted ascending by
        ``row_max_top``, with ``row_index`` counting from 0. The list is
        empty when ``img_df`` has no entries.
    """

    tops = img_df["top"].tolist()

    # AgglomerativeClustering needs at least two samples. With zero or one
    # word the answer is trivial: no rows, or a single row whose max top is
    # that word's top.
    if len(tops) < 2:
        return list(enumerate(tops))

    # Cluster rows by top values. Only the "top" value matters here, so the
    # samples are one-dimensional (a single column of y values).
    clustering_options = {
        "n_clusters": None,
        "linkage": "complete",
        "distance_threshold": distance_threshold,
    }
    try:
        # Newer scikit-learn releases name the distance parameter "metric".
        row_clusters = AgglomerativeClustering(
            metric="manhattan", **clustering_options)
    except TypeError:
        # Older scikit-learn releases only know it as "affinity".
        row_clusters = AgglomerativeClustering(
            affinity="manhattan", **clustering_options)
    labels = row_clusters.fit_predict(
        np.asarray(tops, dtype=float).reshape(-1, 1))

    # Find the max top value within each cluster.
    max_top_by_label = {}
    for label, top in zip(labels, tops):
        max_top_by_label[label] = max(top, max_top_by_label.get(label, top))

    # Return the row index and max top for each row, sorted ascending.
    return list(enumerate(sorted(max_top_by_label.values())))

def preprocess(image_path: str, line_length: int = 40, line_thickness: int = 2) -> np.ndarray:
    """Load an image and prepare it for OCR by removing table gridlines.

    The image is converted to grayscale, binarized with Otsu thresholding,
    and then horizontal and vertical lines are detected on the inverted
    image and painted over in white on the binarized image.

    Args:
        image_path: Path to the image file.
        line_length: Length in pixels of the rectangular structuring element
            used to detect lines (``line_length x 1`` for horizontal lines,
            ``1 x line_length`` for vertical lines).
        line_thickness: Stroke thickness in pixels used when painting over
            the detected lines.

    Returns:
        The binarized image with detected lines painted white.

    Raises:
        ValueError: If no image could be read from ``image_path``.
    """

    # Get the image. cv2.imread signals failure by returning None rather
    # than raising, so check explicitly to give a clear error message.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(
            f"Could not read image at {image_path!r}; check that the file "
            "exists and is a supported image format")

    # Convert the image to grayscale.
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Remove backgrounds.
    bg_free_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)[1]

    # Create an inverse image to use for finding lines. It is computed once,
    # so painting over horizontal lines does not affect vertical detection.
    inverted_img = cv2.bitwise_not(bg_free_img)

    # Remove horizontal lines, then vertical lines.
    for kernel_size in ((line_length, 1), (1, line_length)):
        _erase_lines(bg_free_img, inverted_img, kernel_size, line_thickness)

    # Return the output image.
    return bg_free_img


def _erase_lines(target_img, inverted_img, kernel_size, thickness):
    """Paint over, in place on target_img, the lines found in inverted_img."""

    # Morphological opening with a long, thin kernel keeps only shapes that
    # can contain the kernel, i.e. long straight strokes in that direction.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    lines_only = cv2.morphologyEx(
        inverted_img, cv2.MORPH_OPEN, kernel, iterations=2)

    # findContours returns either two or three items; the contours are the
    # first item of a 2-tuple and the second item otherwise.
    found = cv2.findContours(
        lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Draw each contour in white to erase the line it outlines.
    for contour in contours:
        cv2.drawContours(target_img, [contour], -1, (255, 255, 255), thickness)
	import cv2
	import numpy as np
	from PIL import Image
	import pandas as pd
	import pytesseract
	from sklearn.cluster import AgglomerativeClustering

	# With thanks to these resources:
	# - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/
	# - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica

	def read(image_path: str, distance_threshold=25.0) -> pd.DataFrame:
	    """Read a table from an image into a dataframe using only Pytesseract.

	    The image is preprocessed, OCR'd into word-level data, and the words are
	    grouped into table rows by clustering their "top" coordinates. Within a
	    row, words are ordered left to right and a new cell is started whenever
	    a word has ``word_num == 1``.

	    Args:
	        image_path: Path to the image file containing the table.
	        distance_threshold: Passed through to ``get_row_max_tops``, where it
	            is used as the clustering distance threshold on "top" values.

	    Returns:
	        A dataframe with one row per detected table row and one column per
	        detected cell. An empty dataframe is returned when no text is found.
	    """

	    # Preprocess the image.
	    img = preprocess(image_path)

	    # Read the image into a Pytesseract data frame.
	    img_df = pytesseract.image_to_data(img, output_type="data.frame")

	    # Drop any blank text. dropna() removes entries with missing text; the
	    # strip() check also removes whitespace-only "words", which would
	    # otherwise end up as empty fragments inside cells. Text is cast to str
	    # first because pandas may parse an all-numeric text column as numbers,
	    # which " ".join() below cannot handle (note that such values are
	    # rendered the way pandas parsed them, e.g. 1.50 becomes "1.5").
	    img_df = img_df.dropna()
	    img_df = img_df.assign(text=img_df["text"].astype(str))
	    img_df = img_df[img_df["text"].str.strip() != ""].copy()

	    # Nothing was recognized, so there is no table to build.
	    if img_df.empty:
	        return pd.DataFrame()

	    # Add row numbers to the dataframe. We cluster words according to their
	    # "top" value and get the max "top" value for each row (sorted
	    # ascending). A word belongs to the first row whose max top is >= the
	    # word's own top, which is exactly what a left-sided binary search finds.
	    row_max_tops = get_row_max_tops(img_df, distance_threshold)
	    max_tops = [row_max_top for _, row_max_top in row_max_tops]
	    img_df["row_number"] = np.searchsorted(
	        max_tops, img_df["top"].to_numpy(), side="left")

	    # Sort the dataframe by row number, left, and word_num so we can build
	    # table content logically.
	    img_df = img_df.sort_values(["row_number", "left", "word_num"])

	    # Build the table content one row at a time. groupby keeps the sorted
	    # order of the words within each row.
	    table_content = []
	    for _, row_words in img_df.groupby("row_number", sort=True):
	        row_content = []
	        cell_content = []
	        for word_num, text in zip(row_words["word_num"], row_words["text"]):
	            # A word numbered 1 marks the start of a new cell, so flush the
	            # cell we have been accumulating (if any).
	            if word_num == 1 and cell_content:
	                row_content.append(" ".join(cell_content))
	                cell_content = []
	            cell_content.append(text)
	        row_content.append(" ".join(cell_content))
	        table_content.append(row_content)

	    # Convert the table content to a dataframe, and return it.
	    return pd.DataFrame(table_content)

	def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
	    """Cluster words into rows by "top" value and return each row's max top.

	    Args:
	        img_df: Dataframe with a numeric "top" column, one entry per word.
	        distance_threshold: Distance threshold handed to
	            ``AgglomerativeClustering``; with complete linkage and Manhattan
	            distance, clusters further apart than this are not merged.

	    Returns:
	        A list of ``(row_index, row_max_top)`` tuples sorted ascending by
	        ``row_max_top``, with ``row_index`` counting from 0. The list is
	        empty when ``img_df`` has no entries.
	    """

	    tops = img_df["top"].tolist()

	    # AgglomerativeClustering needs at least two samples. With zero or one
	    # word the answer is trivial: no rows, or a single row whose max top is
	    # that word's top.
	    if len(tops) < 2:
	        return list(enumerate(tops))

	    # Cluster rows by top values. Only the "top" value matters here, so the
	    # samples are one-dimensional (a single column of y values).
	    clustering_options = {
	        "n_clusters": None,
	        "linkage": "complete",
	        "distance_threshold": distance_threshold,
	    }
	    try:
	        # Newer scikit-learn releases name the distance parameter "metric".
	        row_clusters = AgglomerativeClustering(
	            metric="manhattan", **clustering_options)
	    except TypeError:
	        # Older scikit-learn releases only know it as "affinity".
	        row_clusters = AgglomerativeClustering(
	            affinity="manhattan", **clustering_options)
	    labels = row_clusters.fit_predict(
	        np.asarray(tops, dtype=float).reshape(-1, 1))

	    # Find the max top value within each cluster.
	    max_top_by_label = {}
	    for label, top in zip(labels, tops):
	        max_top_by_label[label] = max(top, max_top_by_label.get(label, top))

	    # Return the row index and max top for each row, sorted ascending.
	    return list(enumerate(sorted(max_top_by_label.values())))

	def preprocess(image_path: str, line_length: int = 40, line_thickness: int = 2) -> np.ndarray:
	    """Load an image and prepare it for OCR by removing table gridlines.

	    The image is converted to grayscale, binarized with Otsu thresholding,
	    and then horizontal and vertical lines are detected on the inverted
	    image and painted over in white on the binarized image.

	    Args:
	        image_path: Path to the image file.
	        line_length: Length in pixels of the rectangular structuring element
	            used to detect lines (``line_length x 1`` for horizontal lines,
	            ``1 x line_length`` for vertical lines).
	        line_thickness: Stroke thickness in pixels used when painting over
	            the detected lines.

	    Returns:
	        The binarized image with detected lines painted white.

	    Raises:
	        ValueError: If no image could be read from ``image_path``.
	    """

	    # Get the image. cv2.imread signals failure by returning None rather
	    # than raising, so check explicitly to give a clear error message.
	    img = cv2.imread(image_path)
	    if img is None:
	        raise ValueError(
	            f"Could not read image at {image_path!r}; check that the file "
	            "exists and is a supported image format")

	    # Convert the image to grayscale.
	    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    # Remove backgrounds.
	    bg_free_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)[1]

	    # Create an inverse image to use for finding lines. It is computed once,
	    # so painting over horizontal lines does not affect vertical detection.
	    inverted_img = cv2.bitwise_not(bg_free_img)

	    # Remove horizontal lines, then vertical lines.
	    for kernel_size in ((line_length, 1), (1, line_length)):
	        _erase_lines(bg_free_img, inverted_img, kernel_size, line_thickness)

	    # Return the output image.
	    return bg_free_img


	def _erase_lines(target_img, inverted_img, kernel_size, thickness):
	    """Paint over, in place on target_img, the lines found in inverted_img."""

	    # Morphological opening with a long, thin kernel keeps only shapes that
	    # can contain the kernel, i.e. long straight strokes in that direction.
	    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
	    lines_only = cv2.morphologyEx(
	        inverted_img, cv2.MORPH_OPEN, kernel, iterations=2)

	    # findContours returns either two or three items; the contours are the
	    # first item of a 2-tuple and the second item otherwise.
	    found = cv2.findContours(
	        lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	    contours = found[0] if len(found) == 2 else found[1]

	    # Draw each contour in white to erase the line it outlines.
	    for contour in contours:
	        cv2.drawContours(target_img, [contour], -1, (255, 255, 255), thickness)