# @MichaelWalker-git, created April 25, 2024 15:39
# Combine raw Textract output files into a single JSON file and pull out all of the LINE text.
import json
import os
# I tried doing this in a Lambda for 110 Textract files, but it exceeded the 10 GB memory limit.
# I also needed to rename all of my files with a .json extension, because Textract does not save
# its output as .json (the objects have no file extension at all).
#
# Lambda approach:
# combined_data = []
# response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
# if 'Contents' in response:
#     files = sorted(response['Contents'], key=lambda x: x['Key'])
#     for file in files:
#         original_key = file['Key']
#         if original_key.endswith('.json'):
#             try:
#                 file_response = s3_client.get_object(Bucket=bucket, Key=original_key)
#                 # Read and decode the file content
#                 file_content = file_response['Body'].read().decode('utf-8')
#                 data = json.loads(file_content)
#                 combined_data.append(data)
#                 print(f"Content of {original_key}:")
#                 print(file_content)
#                 print("\n")  # For better separation between files
#             except Exception as e:
#                 print(f"Error reading file {original_key}: {str(e)}")
# # Each file is assumed to be a valid JSON object
# return combined_data
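
# A minimal lower-memory sketch of the same Lambda idea (my own untested variant, not the
# original approach): instead of keeping every full Textract response in memory, read each
# object, keep only its LINE text, and let the rest be garbage-collected. The bucket and
# prefix arguments are placeholders; it assumes the outputs were already renamed to .json.
def combine_line_text_from_s3(bucket, prefix):
    import boto3  # imported locally so the rest of this script runs without boto3 installed

    s3_client = boto3.client("s3")
    all_text = ""
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
    for obj in sorted(response.get("Contents", []), key=lambda x: x["Key"]):
        key = obj["Key"]
        if not key.endswith(".json"):
            continue
        body = s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8")
        data = json.loads(body)
        # Same traversal as the local code below: ExpenseDocuments -> Blocks -> LINE text
        for doc in data.get("ExpenseDocuments", []):
            for block in doc.get("Blocks", []):
                if block.get("BlockType") == "LINE":
                    all_text += block.get("Text", "") + " "
    return all_text
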
# Local approach: grab all of the .json files in this folder and combine them.
# combined_data = []
# for file in os.listdir():
#     if file.endswith(".json"):
#         with open(file, "r") as f:
#             data = json.load(f)
#             combined_data.append(data)
#
# with open("combined.json", "w") as f:
#     json.dump(combined_data, f, indent=4)
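
# A runnable version of the commented-out local step above (assuming the renamed .json
# Textract outputs sit in the current working directory). Guarded so an existing
# combined.json is not rebuilt or overwritten.
if not os.path.exists("combined.json"):
    combined_data = []
    for file in sorted(os.listdir()):
        if file.endswith(".json") and file != "combined.json":
            with open(file, "r") as f:
                combined_data.append(json.load(f))

    with open("combined.json", "w") as f:
        json.dump(combined_data, f, indent=4)
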
all_text = ""

# Walk combined.json: for every document whose "Blocks" list is non-empty, collect the
# "Text" of each block whose "BlockType" is "LINE".
with open("combined.json", "r") as f:
    combined_data = json.load(f)
print(combined_data)

# Each entry may contain "ExpenseDocuments", and each of those carries its own "Blocks".
for data in combined_data:
    if "ExpenseDocuments" in data:
        for doc in data["ExpenseDocuments"]:
            if "Blocks" in doc:
                for block in doc["Blocks"]:
                    if block["BlockType"] == "LINE":
                        all_text += block["Text"] + " "

print(all_text)
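
# For reference, an abridged sketch of the shape this loop expects. The field names come
# from the code above; the sample value is made up for illustration only:
# {
#     "ExpenseDocuments": [
#         {"Blocks": [{"BlockType": "LINE", "Text": "INVOICE #1234"}, ...]}
#     ]
# }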
# Write all of the extracted text to a plain-text file
with open("all_text.txt", "w") as f:
    f.write(all_text)