# kayalshri/aws-textract-table.py

## aws-textract-table.py
# Copyright 2010-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# This file is licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License. A copy of the
# License is located at
#
# http://aws.amazon.com/apache2.0/
#
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint


def get_rows_columns_map(table_result, blocks_map):
    """Group the text of a table's cells by row index and column index.

    Args:
        table_result: Block dict of the table. Its relationships of type
            CHILD list the ids of the blocks it contains.
        blocks_map: Mapping of block id to block dict, used to resolve the
            ids found in the relationships.

    Returns:
        A dict ``{row_index: {column_index: text}}`` where ``text`` is the
        value ``get_text`` produces for each child block of type CELL.
        A block without a 'Relationships' entry yields an empty dict.

    Raises:
        KeyError: If a referenced child id is missing from ``blocks_map``.
    """
    rows = {}
    # 'Relationships' may be missing (get_text guards against the same
    # case); treat that as "no cells" instead of raising KeyError.
    for relationship in table_result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] != 'CELL':
                continue
            # Create the row on first sight, then store the cell text.
            row = rows.setdefault(cell['RowIndex'], {})
            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    """Return the text carried by the child blocks of ``result``.

    Every WORD child contributes its ``Text`` followed by a space, and every
    SELECTION_ELEMENT child whose ``SelectionStatus`` is ``SELECTED``
    contributes ``'X '``. A block without a 'Relationships' entry gives ``''``.

    Args:
        result: Block dict whose CHILD relationships are walked.
        blocks_map: Mapping of block id to block dict.

    Returns:
        The concatenated text, including the trailing space after each piece.
    """
    if 'Relationships' not in result:
        return ''

    pieces = []
    for relation in result['Relationships']:
        if relation['Type'] != 'CHILD':
            continue
        for block_id in relation['Ids']:
            child = blocks_map[block_id]
            kind = child['BlockType']
            if kind == 'WORD':
                pieces.append(child['Text'] + ' ')
            if kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                # A ticked selection element is rendered as an X mark.
                pieces.append('X ')
    return ''.join(pieces)


def get_table_csv_results(file_name):
    """Run Textract table analysis on a local image and return CSV text.

    Args:
        file_name: Path of the image file whose bytes are sent to Textract.

    Returns:
        The CSV text of every TABLE block in the response, each followed by
        two newlines, or the string "<b> NO Table FOUND </b>" when the
        response contains no TABLE block.
    """
    # Read the whole image; it is submitted to Textract as raw bytes.
    with open(file_name, 'rb') as image_file:
        image_bytes = bytearray(image_file.read())
        print('Image loaded', file_name)

    # Request table analysis only.
    textract = boto3.client('textract')
    response = textract.analyze_document(
        Document={'Bytes': image_bytes},
        FeatureTypes=['TABLES'],
    )

    # Dump the returned blocks to stdout for inspection.
    blocks = response['Blocks']
    pprint(blocks)

    # Index every block by id and pick out the tables.
    blocks_map = {block['Id']: block for block in blocks}
    table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

    if not table_blocks:
        return "<b> NO Table FOUND </b>"

    # Tables are numbered starting from 1 in the output.
    return ''.join(
        generate_table_csv(table, blocks_map, number) + '\n\n'
        for number, table in enumerate(table_blocks, start=1)
    )

def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table block as CSV text.

    Args:
        table_result: Block dict of the table to render.
        blocks_map: Mapping of block id to block dict.
        table_index: Number used in the "Table: Table_<n>" heading.

    Returns:
        A string made of the heading line, a blank line, one line per table
        row (every cell followed by a comma), and three trailing newlines.
        Cells containing a comma, a double quote or a line break are wrapped
        in double quotes, with embedded quotes doubled.
    """
    rows = get_rows_columns_map(table_result, blocks_map)

    def escape_field(value):
        # Quote fields that a CSV reader would otherwise split or merge:
        # embedded commas, double quotes or line breaks.
        field = '{}'.format(value)
        if any(ch in field for ch in ',"\r\n'):
            return '"' + field.replace('"', '""') + '"'
        return field

    parts = ['Table: Table_{}\n\n'.format(table_index)]

    # Emit rows and columns in index order rather than relying on the order
    # in which the cells happened to be listed.
    for row_index in sorted(rows):
        cols = rows[row_index]
        for col_index in sorted(cols):
            parts.append(escape_field(cols[col_index]) + ',')
        parts.append('\n')

    parts.append('\n\n\n')
    return ''.join(parts)

def main(file_name):
    """Extract the tables of ``file_name`` and write them to ``output.csv``.

    Args:
        file_name: Path of the image file to analyze.
    """
    table_csv = get_table_csv_results(file_name)

    output_file = 'output.csv'

    # Overwrite any previous output. The encoding is pinned to UTF-8 so that
    # non-ASCII cell text does not fail on platforms whose default encoding
    # cannot represent it.
    with open(output_file, "wt", encoding="utf-8") as fout:
        fout.write(table_csv)

    # show the results
    print('CSV OUTPUT FILE: ', output_file)


if __name__ == "__main__":
    # Exit with a usage hint instead of an IndexError traceback when the
    # image path argument is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: python {} <image-file>'.format(sys.argv[0]))
    file_name = sys.argv[1]
    main(file_name)
	# Copyright 2010-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
	#
	# This file is licensed under the Apache License, Version 2.0 (the "License").
	# You may not use this file except in compliance with the License. A copy of the
	# License is located at
	#
	# http://aws.amazon.com/apache2.0/
	#
	# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
	# OF ANY KIND, either express or implied. See the License for the specific
	# language governing permissions and limitations under the License.

	import webbrowser, os
	import json
	import boto3
	import io
	from io import BytesIO
	import sys
	from pprint import pprint


	def get_rows_columns_map(table_result, blocks_map):
	    """Group the text of a table's cells by row index and column index.

	    Args:
	        table_result: Block dict of the table. Its relationships of type
	            CHILD list the ids of the blocks it contains.
	        blocks_map: Mapping of block id to block dict, used to resolve
	            the ids found in the relationships.

	    Returns:
	        A dict ``{row_index: {column_index: text}}`` where ``text`` is the
	        value ``get_text`` produces for each child block of type CELL.
	        A block without a 'Relationships' entry yields an empty dict.
	    """
	    rows = {}
	    # 'Relationships' may be missing (get_text guards against the same
	    # case); treat that as "no cells" instead of raising KeyError.
	    for relationship in table_result.get('Relationships', []):
	        if relationship['Type'] != 'CHILD':
	            continue
	        for child_id in relationship['Ids']:
	            cell = blocks_map[child_id]
	            if cell['BlockType'] != 'CELL':
	                continue
	            # Create the row on first sight, then store the cell text.
	            row = rows.setdefault(cell['RowIndex'], {})
	            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
	    return rows


	def get_text(result, blocks_map):
	    """Return the text carried by the child blocks of ``result``.

	    Every WORD child contributes its ``Text`` followed by a space, and
	    every SELECTION_ELEMENT child whose ``SelectionStatus`` is
	    ``SELECTED`` contributes ``'X '``. A block without a 'Relationships'
	    entry gives ``''``.

	    Args:
	        result: Block dict whose CHILD relationships are walked.
	        blocks_map: Mapping of block id to block dict.

	    Returns:
	        The concatenated text, including the trailing space after each
	        piece.
	    """
	    text = ''
	    if 'Relationships' in result:
	        for relationship in result['Relationships']:
	            if relationship['Type'] == 'CHILD':
	                for child_id in relationship['Ids']:
	                    word = blocks_map[child_id]
	                    if word['BlockType'] == 'WORD':
	                        text += word['Text'] + ' '
	                    if word['BlockType'] == 'SELECTION_ELEMENT':
	                        # A ticked selection element is rendered as an X.
	                        if word['SelectionStatus'] == 'SELECTED':
	                            text += 'X '
	    return text


	def get_table_csv_results(file_name):
	    """Run Textract table analysis on a local image and return CSV text.

	    Args:
	        file_name: Path of the image file whose bytes are sent to
	            Textract.

	    Returns:
	        The CSV text of every TABLE block in the response, each followed
	        by two newlines, or the string "<b> NO Table FOUND </b>" when the
	        response contains no TABLE block.
	    """
	    with open(file_name, 'rb') as file:
	        img_test = file.read()
	        bytes_test = bytearray(img_test)
	        print('Image loaded', file_name)

	    # process using image bytes
	    # get the results
	    client = boto3.client('textract')

	    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

	    # Get the text blocks
	    blocks = response['Blocks']
	    pprint(blocks)

	    # Index every block by id and collect the TABLE blocks.
	    blocks_map = {}
	    table_blocks = []
	    for block in blocks:
	        blocks_map[block['Id']] = block
	        if block['BlockType'] == "TABLE":
	            table_blocks.append(block)

	    if len(table_blocks) <= 0:
	        return "<b> NO Table FOUND </b>"

	    # Tables are numbered starting from 1 in the output.
	    csv = ''
	    for index, table in enumerate(table_blocks):
	        csv += generate_table_csv(table, blocks_map, index + 1)
	        csv += '\n\n'

	    return csv

	def generate_table_csv(table_result, blocks_map, table_index):
	    """Render one table block as CSV text.

	    Args:
	        table_result: Block dict of the table to render.
	        blocks_map: Mapping of block id to block dict.
	        table_index: Number used in the "Table: Table_<n>" heading.

	    Returns:
	        A string made of the heading line, a blank line, one line per
	        table row (every cell followed by a comma), and three trailing
	        newlines. Cells containing a comma, a double quote or a line
	        break are wrapped in double quotes, with embedded quotes doubled.
	    """
	    rows = get_rows_columns_map(table_result, blocks_map)

	    def escape_field(value):
	        # Quote fields that a CSV reader would otherwise split or merge:
	        # embedded commas, double quotes or line breaks.
	        field = '{}'.format(value)
	        if any(ch in field for ch in ',"\r\n'):
	            return '"' + field.replace('"', '""') + '"'
	        return field

	    parts = ['Table: Table_{}\n\n'.format(table_index)]

	    # Emit rows and columns in index order rather than relying on the
	    # order in which the cells happened to be listed.
	    for row_index in sorted(rows):
	        cols = rows[row_index]
	        for col_index in sorted(cols):
	            parts.append(escape_field(cols[col_index]) + ',')
	        parts.append('\n')

	    parts.append('\n\n\n')
	    return ''.join(parts)

	def main(file_name):
	    """Extract the tables of ``file_name`` and write them to ``output.csv``.

	    Args:
	        file_name: Path of the image file to analyze.
	    """
	    table_csv = get_table_csv_results(file_name)

	    output_file = 'output.csv'

	    # Overwrite any previous output. The encoding is pinned to UTF-8 so
	    # that non-ASCII cell text does not fail on platforms whose default
	    # encoding cannot represent it.
	    with open(output_file, "wt", encoding="utf-8") as fout:
	        fout.write(table_csv)

	    # show the results
	    print('CSV OUTPUT FILE: ', output_file)


	if __name__ == "__main__":
	    # Exit with a usage hint instead of an IndexError traceback when the
	    # image path argument is missing.
	    if len(sys.argv) < 2:
	        sys.exit('Usage: python {} <image-file>'.format(sys.argv[0]))
	    file_name = sys.argv[1]
	    main(file_name)