Created
November 19, 2019 05:27
-
-
Save kayalshri/6e7e5c55433733ff591b2d2b849afdfa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2010-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# This file is licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License. A copy of the
# License is located at
#
# http://aws.amazon.com/apache2.0/
#
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import csv
import io
import json
import os
import sys
import webbrowser
from io import BytesIO
from pprint import pprint

import boto3
def get_rows_columns_map(table_result, blocks_map):
    """Group the text of a table's cells by row index and column index.

    Args:
        table_result: Block dict for one table. Its optional
            ``'Relationships'`` list is scanned for entries whose ``'Type'``
            is ``'CHILD'``; their ``'Ids'`` name the child blocks.
        blocks_map: Mapping of block id to block dict, used to resolve the
            child ids.

    Returns:
        A dict ``{row_index: {col_index: text}}`` built from every child
        block whose ``'BlockType'`` is ``'CELL'``. Empty when the table has
        no relationships or no CELL children.

    Raises:
        KeyError: If a child id is missing from ``blocks_map`` or a block
            lacks one of the keys read here.
    """
    rows = {}
    # A table block without any relationships simply has no cells; treat a
    # missing (or None) 'Relationships' entry the same way get_text does
    # instead of failing with KeyError.
    for relationship in table_result.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] != 'CELL':
                continue
            # Create the row on first sight, then store the cell text under
            # its column index.
            row = rows.setdefault(cell['RowIndex'], {})
            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows
def get_text(result, blocks_map):
    """Return the text carried by the child blocks of ``result``.

    Args:
        result: Block dict whose optional ``'Relationships'`` list is
            scanned for ``'CHILD'`` entries.
        blocks_map: Mapping of block id to block dict, used to resolve the
            child ids.

    Returns:
        A string made of each ``'WORD'`` child's ``'Text'`` followed by a
        single space, with ``'X '`` added for every ``'SELECTION_ELEMENT'``
        child whose ``'SelectionStatus'`` is ``'SELECTED'``, in child order.
        Returns ``''`` when there are no relationships. Note that a
        non-empty result always ends with a space.

    Raises:
        KeyError: If a child id is missing from ``blocks_map`` or a block
            lacks one of the keys read here.
    """
    # Collect the fragments and join once rather than growing a string with
    # '+=' inside nested loops.
    pieces = []
    for relationship in result.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_map[child_id]
            block_type = child['BlockType']
            if block_type == 'WORD':
                pieces.append(child['Text'] + ' ')
            elif block_type == 'SELECTION_ELEMENT':
                # Only ticked selection elements contribute a marker.
                if child['SelectionStatus'] == 'SELECTED':
                    pieces.append('X ')
    return ''.join(pieces)
def get_table_csv_results(file_name):
    """Analyze an image file with Textract and return its tables as CSV text.

    Reads ``file_name`` as bytes, sends it to the ``textract`` client's
    ``analyze_document`` call with the ``TABLES`` feature, pretty-prints the
    returned blocks, and renders every ``TABLE`` block through
    ``generate_table_csv``.

    Args:
        file_name: Path of the document image to read.

    Returns:
        The concatenated per-table CSV text (each table followed by two
        newlines), or the string ``"<b> NO Table FOUND </b>"`` when the
        response contains no TABLE block.
    """
    with open(file_name, 'rb') as image_file:
        document_bytes = bytearray(image_file.read())
        print('Image loaded', file_name)

    # Send the raw image bytes to the service and fetch the detected blocks.
    textract = boto3.client('textract')
    response = textract.analyze_document(
        Document={'Bytes': document_bytes},
        FeatureTypes=['TABLES'],
    )
    blocks = response['Blocks']
    pprint(blocks)

    # Index every block by id so relationships can be resolved later, and
    # pick out the table blocks in response order.
    blocks_map = {block['Id']: block for block in blocks}
    table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

    if not table_blocks:
        return "<b> NO Table FOUND </b>"

    # Tables are numbered starting at 1.
    return ''.join(
        generate_table_csv(table, blocks_map, number) + '\n\n'
        for number, table in enumerate(table_blocks, start=1)
    )
def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table block as a titled chunk of CSV text.

    Args:
        table_result: Block dict for the table, passed on to
            ``get_rows_columns_map``.
        blocks_map: Mapping of block id to block dict.
        table_index: Number used to build the ``Table_<n>`` title.

    Returns:
        A string starting with ``'Table: Table_<n>\\n\\n'``, followed by one
        line per row (cells in ascending column order, each followed by a
        comma), and ending with three extra newlines.
    """
    rows = get_rows_columns_map(table_result, blocks_map)
    table_id = 'Table_' + str(table_index)

    buffer = io.StringIO()
    buffer.write('Table: {0}\n\n'.format(table_id))

    # csv.writer quotes cells that contain commas, quotes or newlines, which
    # plain string concatenation left unescaped (corrupting the columns).
    writer = csv.writer(buffer, lineterminator='\n')

    # Emit rows and columns in index order rather than relying on the order
    # in which cells happened to be inserted into the dicts.
    for row_index in sorted(rows):
        cols = rows[row_index]
        cells = [cols[col_index] for col_index in sorted(cols)]
        # The extra empty field reproduces the trailing comma that every row
        # has always ended with, so existing consumers see the same layout.
        writer.writerow(cells + [''])

    buffer.write('\n\n\n')
    return buffer.getvalue()
def main(file_name):
    """Extract the tables of ``file_name`` and save them to ``output.csv``.

    Args:
        file_name: Path of the document image handed to
            ``get_table_csv_results``.
    """
    destination = 'output.csv'
    csv_text = get_table_csv_results(file_name)

    # Overwrite any previous output with the fresh result.
    with open(destination, "wt") as handle:
        handle.write(csv_text)

    # Tell the user where the result was written.
    print('CSV OUTPUT FILE: ', destination)
if __name__ == "__main__":
    # The image path is expected as the first command-line argument; exit
    # with a usage message instead of an IndexError traceback when missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: python {} <image_file>'.format(sys.argv[0]))
    file_name = sys.argv[1]
    main(file_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment