Last active
September 17, 2019 04:46
-
-
Save songthamtung/4dc348b4a1eaf36082e4ef06eb04157c to your computer and use it in GitHub Desktop.
AWS Python SDK Textract x s3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Detects text in a document stored in an S3 bucket. | |
import boto3 | |
import sys | |
from time import sleep | |
import math | |
import pandas as pd | |
if __name__ == "__main__": | |
bucket='your_bucket_name' | |
ACCESS_KEY='your_access_key' | |
SECRET_KEY='your_secret_key' | |
client = boto3.client('textract', | |
region_name='your_region', | |
aws_access_key_id=ACCESS_KEY, | |
aws_secret_access_key=SECRET_KEY) | |
s3 = boto3.resource('s3', | |
aws_access_key_id=ACCESS_KEY, | |
aws_secret_access_key=SECRET_KEY) | |
your_bucket = s3.Bucket(bucket) | |
extracted_data = [] | |
for s3_file in your_bucket.objects.all(): | |
print(s3_file) | |
# use textract to process s3 file | |
response = client.detect_document_text( | |
Document={'S3Object': {'Bucket': bucket, 'Name': s3_file.key}}) | |
blocks=response['Blocks'] | |
for block in blocks: | |
if block['BlockType'] != 'PAGE': | |
print('Detected: ' + block['Text']) | |
print('Confidence: ' + "{:.2f}".format(block['Confidence']) + "%") | |
# Example case where you want to extract words with # | |
if("#" in block['Text']): | |
words = block['Text'].split() | |
for word in words: | |
if("#" in word): | |
extracted_data.append({"word" : word, "file" : s3_file.key, "confidence": "{:.2f}".format(block['Confidence']) + "%"}) | |
# sleep 2 seconds to prevent ProvisionedThroughputExceededException | |
sleep(2) | |
df = pd.DataFrame(extracted_data) | |
df = df.drop_duplicates() | |
df.to_csv('output.csv') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment