Last active
October 31, 2023 11:29
-
-
Save aribornstein/b18b6b6b46ed0715510fc95b32b55f15 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import time | |
import pandas as pd | |
from requests import get, post | |
def extract_value(value):
    """
    Helper method to extract a cell value from a Form Recognizer field.

    `value` is one field dictionary from the response; its 'type' entry
    selects which key holds the extracted content.

    Returns:
        - the raw entry for scalar types (number -> 'text', string ->
          'valueString', date -> 'valueDate', time -> 'valueTime',
          phoneNumber -> 'valuePhoneNumber');
        - "key:value " pairs concatenated for 'object' fields;
        - items joined with "; " for 'array' fields;
        - None for a missing (None) field or an unsupported type.
    """
    # A field that could not be extracted may be absent (None); there is
    # nothing to read from it.
    if value is None:
        return None

    scalar_keys = {
        'number': 'text',
        'string': 'valueString',
        'date': 'valueDate',
        'time': 'valueTime',
        'phoneNumber': 'valuePhoneNumber',
    }
    field_type = value['type']

    if field_type in scalar_keys:
        return value[scalar_keys[field_type]]

    if field_type == 'object':
        parts = []
        for key, child in value['valueObject'].items():
            text = extract_value(child)
            # Skip children that are missing or of an unsupported type
            # instead of failing on str + None.
            if text is not None:
                parts.append(key + ":" + str(text) + " ")
        return "".join(parts)

    if field_type == 'array':
        items = (extract_value(item) for item in value['valueArray'])
        # join() places the separator only between items, so no trailing
        # "; " has to be sliced off afterwards.
        return "; ".join(str(item) for item in items if item is not None)

    print("Skipping Unsupported Type")
    return None
def recognizer2DF(post_url, apim_key, headers, data_bytes, confidence_threshold = 0, query_interval=5):
    """
    Submit a table or form to the recognizer asynchronously and process the response.

    Parameters:
        post_url: URL the document bytes are POSTed to.
        apim_key: subscription key, sent as 'Ocp-Apim-Subscription-Key'
            when polling for the result.
        headers: request headers for the initial POST.
        data_bytes: raw bytes of the document to analyze.
        confidence_threshold: optional; only extracted fields whose
            confidence is strictly greater than this value are kept.
        query_interval: seconds to wait between checks of whether the
            job is done.

    Returns:
        A pandas DataFrame with one row per analyzed document, or None if
        the submission, the analysis, or the processing failed (a message
        describing the failure is printed in that case).
    """
    try:
        # Submit Async Table Job to Form Recognizer Endpoint
        resp = post(url = post_url, data = data_bytes, headers = headers)
        if resp.status_code != 202:
            print("POST analyze failed:\n%s" % resp.text)
            return None

        # Query Submit Table Job
        get_url = resp.headers["operation-location"]
        poll_headers = {"Ocp-Apim-Subscription-Key": apim_key}
        resp_json = json.loads(get(url = get_url, headers = poll_headers).text)
        # A freshly queued job may still be "notStarted"; keep polling for
        # that status too, and sleep *before* each re-query so no extra
        # delay is added once the job has finished.
        while resp_json["status"] in ("notStarted", "running"):
            time.sleep(query_interval)
            resp_json = json.loads(get(url = get_url, headers = poll_headers).text)

        if resp_json["status"] == "succeeded":
            # Process Documents
            docs = []
            for doc in resp_json['analyzeResult']['documentResults']:
                row = {}
                for key, field in doc['fields'].items():
                    # Fields may be None when nothing was extracted; skip
                    # them rather than failing the whole document.
                    if field is None or 'confidence' not in field:
                        continue
                    if field['confidence'] > confidence_threshold:
                        row[key] = extract_value(field)
                docs.append(row)
            return pd.DataFrame(docs)

        # "failed" or any other unexpected terminal status.
        print("Layout analyze failed:\n%s" % resp_json)
    except Exception as e:
        # Best-effort helper: report the problem and fall through to None.
        print("Code Failed analyze failed:\n%s" % str(e))
    return None
# Subscription key, endpoint URL and input document for the request.
# NOTE(review): `endpoint` is passed unchanged as the POST URL, so it
# presumably has to be the full analyze URL and not only the resource
# base address -- confirm against the service documentation.
apim_key = r"<Subscription Key>"
endpoint = r"<endpoint>"
source = r"<Image or PDF Source path>"
headers = {
    # Request headers
    'Content-Type': r'<form file type - application/pdf, image/jpeg, image/png, or image/tiff>',
    'Ocp-Apim-Subscription-Key': apim_key,
}
with open(source, "rb") as f:
    data_bytes = f.read()
df = recognizer2DF(endpoint, apim_key, headers, data_bytes)
# recognizer2DF returns None when the request or analysis failed (it has
# already printed the reason), so only write the CSV on success.
if df is not None:
    df.to_csv("form_data.csv") # can now be processed with excel
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I am getting this error when running the code with my apim_key and endpoint.
POST analyze failed:
{"error":{"code":"404","message": "Resource not found"}}
Could anyone please suggest what the values in headers should look like? My df from recognizer2DF comes back empty because the function is unable to find the resource.