Created
April 25, 2024 15:39
-
-
Save MichaelWalker-git/1e71ae61da8f9f429c77d59d458148d6 to your computer and use it in GitHub Desktop.
Combine raw textract files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I tried doing this in a Lambda for 110 Textract files, but it exceeded the 10 GB memory limit.
# Lambda approach
# combined_data = []
# response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
# if 'Contents' in response:
#     files = sorted(response['Contents'], key=lambda x: x['Key'])
#     for file in files:
#         # I also needed to rename all my files to .json, because Textract
#         # doesn't save its output as JSON files (no file extension)
#         original_key = file['Key']
#         if original_key.endswith('.json'):
#             try:
#                 file_response = s3_client.get_object(Bucket=bucket, Key=original_key)
#                 # Read and decode the file content
#                 file_content = file_response['Body'].read().decode('utf-8')
#                 data = json.loads(file_content)
#                 combined_data.append(data)
#                 print(f"Content of {original_key}:")
#                 print(file_content)
#                 print("\n")  # For better separation between files
#             except Exception as e:
#                 print(f"Error reading file {original_key}: {str(e)}")
# # Assuming each file is a valid JSON object
# return combined_data
# LOCAL Approach
# Grab all json files in this local folder
# combined_data = []
# for file in os.listdir():
#     if file.endswith(".json"):
#         with open(file, "r") as f:
#             data = json.load(f)
#             combined_data.append(data)
#
# with open("combined.json", "w") as f:
#     json.dump(combined_data, f, indent=4)
import json


def extract_line_text(combined_data):
    """Return the text of every LINE block found in the combined responses.

    Walks each entry of *combined_data*, looks for an ``ExpenseDocuments``
    list, then for a ``Blocks`` list inside each document, and collects the
    ``Text`` of every block whose ``BlockType`` is ``"LINE"``.

    Args:
        combined_data: Iterable of parsed JSON objects, as loaded from
            ``combined.json``. Entries that are not dicts, or that lack
            ``ExpenseDocuments`` / ``Blocks``, are skipped.

    Returns:
        A single string in which each LINE text is followed by one space
        (so a non-empty result ends with a trailing space). Empty string
        when no LINE blocks are found.
    """
    pieces = []
    for data in combined_data:
        if not isinstance(data, dict):
            continue
        for doc in data.get("ExpenseDocuments") or []:
            for block in doc.get("Blocks") or []:
                # Skip malformed blocks instead of aborting the whole run.
                if block.get("BlockType") == "LINE" and "Text" in block:
                    pieces.append(block["Text"] + " ")
    # Join once at the end rather than growing a string inside the loops.
    return "".join(pieces)


def main():
    """Read ``combined.json``, extract the LINE text, and save it.

    Prints the loaded data and the extracted text, then writes the text to
    ``all_text.txt`` in the current working directory.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open("combined.json", "r", encoding="utf-8") as f:
        combined_data = json.load(f)
    print(combined_data)

    all_text = extract_line_text(combined_data)
    print(all_text)

    # Write all the text to a txt file
    with open("all_text.txt", "w", encoding="utf-8") as f:
        f.write(all_text)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment