Skip to content

Instantly share code, notes, and snippets.

@daveebbelaar
Created July 13, 2024 07:08
Show Gist options
  • Save daveebbelaar/05fe2c89adb6824807cc57aa64dc2d4b to your computer and use it in GitHub Desktop.
Save daveebbelaar/05fe2c89adb6824807cc57aa64dc2d4b to your computer and use it in GitHub Desktop.
A service class for interacting with Azure Document Intelligence API.
import logging
import requests
import time
from typing import Union, Dict
from config.settings import get_settings
class DocumentIntelligenceService:
    """
    A service class for interacting with Azure Document Intelligence API.

    This class provides methods to analyze documents using Azure's Document
    Intelligence service. A document is submitted with a POST request, and the
    result is then polled with GET requests until the reported status is
    "succeeded" or "failed".
    """

    # (connect, read) timeout in seconds applied to every HTTP request so that
    # a stalled connection cannot block the caller forever.
    REQUEST_TIMEOUT = (10, 120)
    # Seconds to sleep before each poll of the analysis result.
    POLL_INTERVAL = 2
    # Upper bound (seconds) on the total time spent polling. None keeps polling
    # until the service reports "succeeded" or "failed".
    MAX_WAIT_SECONDS = None

    def __init__(self):
        """
        Initialize the DocumentIntelligenceService with API credentials and endpoint.

        The API key and endpoint are read from the object returned by
        get_settings().
        """
        settings = get_settings()
        self.key = settings.document_intelligence.api_key
        self.endpoint = settings.document_intelligence.endpoint
        self.api_version = "2024-02-29-preview"  # Currently only available in East US, West US2, and West Europe

    def analyze(
        self,
        source: Union[str, bytes],
        is_url: bool = True,
        model_id: str = "prebuilt-layout",
    ) -> Dict:
        """
        Analyze a document using Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            Dict: The analysis results. The operation is returned both when its
            status is "succeeded" and when it is "failed".

        Raises:
            ValueError: If the Operation-Location header is missing in the response.
            requests.HTTPError: If the API request fails.
            requests.Timeout: If a request exceeds REQUEST_TIMEOUT.
            TimeoutError: If MAX_WAIT_SECONDS is set and polling exceeds it.
        """
        result_id = self._submit_analysis(source, is_url, model_id)
        return self._get_analysis_results(result_id, model_id)

    def _build_url(self, path: str) -> str:
        """
        Build a documentModels URL for the configured endpoint and API version.

        Args:
            path (str): The part of the URL that follows "documentModels/".

        Returns:
            str: The full request URL including the query string.
        """
        # Strip trailing slashes so an endpoint configured as "https://host/"
        # does not yield "https://host//documentintelligence/...".
        base = f"{self.endpoint}".rstrip("/")
        return (
            f"{base}/documentintelligence/documentModels/{path}"
            f"?api-version={self.api_version}&outputContentFormat=markdown"
        )

    @staticmethod
    def _build_payload(source: Union[str, bytes], is_url: bool) -> Dict:
        """
        Build the JSON request body for an analyze request.

        Args:
            source (Union[str, bytes]): A URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.

        Returns:
            Dict: {"urlSource": ...} or {"base64Source": ...}.
        """
        if is_url:
            return {"urlSource": source}
        if isinstance(source, (bytes, bytearray)):
            # base64.b64encode() returns bytes, which the JSON encoder rejects;
            # base64 text is pure ASCII, so decoding is lossless.
            source = bytes(source).decode("ascii")
        return {"base64Source": source}

    @staticmethod
    def _extract_result_id(operation_location: str) -> str:
        """
        Extract the result ID from an Operation-Location header value.

        Returns:
            str: The last path segment of the URL, without its query string.
        """
        return operation_location.split("/")[-1].split("?")[0]

    def _submit_analysis(
        self, source: Union[str, bytes], is_url: bool, model_id: str
    ) -> str:
        """
        Submit a document for analysis to Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            str: The result ID for the submitted analysis.

        Raises:
            ValueError: If the Operation-Location header is missing in the response.
            requests.HTTPError: If the API request fails.
            requests.Timeout: If the request exceeds REQUEST_TIMEOUT.
        """
        url = self._build_url(f"{model_id}:analyze")
        headers = {
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Key": self.key,
        }
        payload = self._build_payload(source, is_url)
        logging.info("Submitting document for analysis")
        response = requests.post(
            url, headers=headers, json=payload, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        operation_location = response.headers.get("Operation-Location")
        if not operation_location:
            raise ValueError("Operation-Location header is missing in the response.")
        return self._extract_result_id(operation_location)

    def _get_analysis_results(self, result_id: str, model_id: str) -> Dict:
        """
        Retrieve the analysis results from Azure Document Intelligence.

        Polls the result URL, sleeping POLL_INTERVAL seconds before every
        request, until the response status is "succeeded" or "failed".

        Args:
            result_id (str): The ID of the analysis result to retrieve.
            model_id (str): The ID of the model used for analysis.

        Returns:
            Dict: The analysis results.

        Raises:
            requests.HTTPError: If the API request fails.
            requests.Timeout: If a request exceeds REQUEST_TIMEOUT.
            TimeoutError: If MAX_WAIT_SECONDS is set and polling exceeds it.
        """
        url = self._build_url(f"{model_id}/analyzeResults/{result_id}")
        headers = {"Ocp-Apim-Subscription-Key": self.key}
        started = time.monotonic()
        while True:
            logging.info("Waiting for analysis to complete.")
            time.sleep(self.POLL_INTERVAL)
            response = requests.get(
                url, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            data = response.json()
            if data.get("status") in ("succeeded", "failed"):
                return data
            # Opt-in deadline: with the default of None the loop keeps polling
            # until the service reports one of the two statuses above.
            if (
                self.MAX_WAIT_SECONDS is not None
                and time.monotonic() - started >= self.MAX_WAIT_SECONDS
            ):
                raise TimeoutError(
                    f"Analysis {result_id} did not finish within "
                    f"{self.MAX_WAIT_SECONDS} seconds."
                )
if __name__ == "__main__":
    # Example usage of the DocumentIntelligenceService: analyze a public PDF
    # with the default model and print the main parts of the response.
    client = DocumentIntelligenceService()
    analysis_results = client.analyze(
        source="https://s2.q4cdn.com/299287126/files/doc_financials/2024/ar/Amazon-com-Inc-2023-Annual-Report.pdf"
    )
    print(analysis_results.keys())
    analyze_result = analysis_results.get("analyzeResult")
    if analyze_result is None:
        # analyze() also returns operations whose status is "failed"; such a
        # response may not carry "analyzeResult", so report it rather than
        # crash with a KeyError.
        print(
            f"Analysis did not produce a result "
            f"(status={analysis_results.get('status')!r}): "
            f"{analysis_results.get('error')}"
        )
    else:
        print(analyze_result.keys())
        print(analyze_result.get("content"))
        # Default to an empty list in case the response has no "tables" key.
        print(analyze_result.get("tables", []))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment