# @MichaelWalker-git, created April 25, 2024 15:39
# Combine raw Textract output files into a single JSON file and pull out all of the LINE text.
import json
import os
# I tried doing this in a Lambda for 110 Textract files, but it exceeded the 10 GB memory limit.
# I also needed to rename all of my files with a .json extension, because Textract does not save
# its output as .json (the objects have no file extension at all).
#
# Lambda approach:
# combined_data = []
# response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
# if 'Contents' in response:
#     files = sorted(response['Contents'], key=lambda x: x['Key'])
#     for file in files:
#         original_key = file['Key']
#         if original_key.endswith('.json'):
#             try:
#                 file_response = s3_client.get_object(Bucket=bucket, Key=original_key)
#                 # Read and decode the file content
#                 file_content = file_response['Body'].read().decode('utf-8')
#                 data = json.loads(file_content)
#                 combined_data.append(data)
#                 print(f"Content of {original_key}:")
#                 print(file_content)
#                 print("\n")  # For better separation between files
#             except Exception as e:
#                 print(f"Error reading file {original_key}: {str(e)}")
# # Each file is assumed to be a valid JSON object
# return combined_data
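
# A minimal lower-memory sketch of the same Lambda idea (my own untested variant, not the
# original approach): instead of keeping every full Textract response in memory, read each
# object, keep only its LINE text, and let the rest be garbage-collected. The bucket and
# prefix arguments are placeholders; it assumes the outputs were already renamed to .json.
def combine_line_text_from_s3(bucket, prefix):
    import boto3  # imported locally so the rest of this script runs without boto3 installed

    s3_client = boto3.client("s3")
    all_text = ""
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
    for obj in sorted(response.get("Contents", []), key=lambda x: x["Key"]):
        key = obj["Key"]
        if not key.endswith(".json"):
            continue
        body = s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8")
        data = json.loads(body)
        # Same traversal as the local code below: ExpenseDocuments -> Blocks -> LINE text
        for doc in data.get("ExpenseDocuments", []):
            for block in doc.get("Blocks", []):
                if block.get("BlockType") == "LINE":
                    all_text += block.get("Text", "") + " "
    return all_text
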
# Local approach: grab all of the .json files in this folder and combine them.
# combined_data = []
# for file in os.listdir():
#     if file.endswith(".json"):
#         with open(file, "r") as f:
#             data = json.load(f)
#             combined_data.append(data)
#
# with open("combined.json", "w") as f:
#     json.dump(combined_data, f, indent=4)
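
# A runnable version of the commented-out local step above (assuming the renamed .json
# Textract outputs sit in the current working directory). Guarded so an existing
# combined.json is not rebuilt or overwritten.
if not os.path.exists("combined.json"):
    combined_data = []
    for file in sorted(os.listdir()):
        if file.endswith(".json") and file != "combined.json":
            with open(file, "r") as f:
                combined_data.append(json.load(f))

    with open("combined.json", "w") as f:
        json.dump(combined_data, f, indent=4)
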
all_text = ""

# Walk combined.json: for every document whose "Blocks" list is non-empty, collect the
# "Text" of each block whose "BlockType" is "LINE".
with open("combined.json", "r") as f:
    combined_data = json.load(f)
print(combined_data)

# Each entry may contain "ExpenseDocuments", and each of those carries its own "Blocks".
for data in combined_data:
    if "ExpenseDocuments" in data:
        for doc in data["ExpenseDocuments"]:
            if "Blocks" in doc:
                for block in doc["Blocks"]:
                    if block["BlockType"] == "LINE":
                        all_text += block["Text"] + " "

print(all_text)
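
# For reference, an abridged sketch of the shape this loop expects. The field names come
# from the code above; the sample value is made up for illustration only:
# {
#     "ExpenseDocuments": [
#         {"Blocks": [{"BlockType": "LINE", "Text": "INVOICE #1234"}, ...]}
#     ]
# }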
# Write all of the extracted text to a plain-text file
with open("all_text.txt", "w") as f:
    f.write(all_text)