Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jayantkodwani/a2f59a11f0ef4364d22940d434053e37 to your computer and use it in GitHub Desktop.
Save jayantkodwani/a2f59a11f0ef4364d22940d434053e37 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Spyder Editor
INTRODUCTION
This is a script file to perform the following operations:
1.Use Microsoft Azure Computer vision Cognitive service to extract text from images (OCR) stored on Blob storage or any link.
2.Export the extracted text to an MS Excel file on your desktop for further analysis
pip install --upgrade azure-cognitiveservices-vision-computervision
pip install pillow
pip install azure-storage-blob
"""
""" IMPORT LIBRARIES """
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from array import array
import os
from PIL import Image
import sys
import time
import pandas as pd
from requests import get, post
import json
""" CREATE A PYTHON DATAFRAME TO STORE OUTPUT """
output=pd.DataFrame()
""" AUTHENTICATE TO COMPUTER VISION """
subscription_key = "<<REPLACE_subscription_key>>"
endpoint = "<<REPLACE_endpoint>>"
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
""" CALL THE API AND GET RESULTS (TEXT IN AN IMAGE), THEN PRINT THE RESULTS LINE BY LINE """
print("===== Batch Read File - remote =====")
# Get an image with handwritten text, REPLACE image link as per your requirements
remote_image_handw_text_url = "https://jayantml1356189034.blob.core.windows.net/jayantcontainer/0001.jpg"
# Call API with URL and raw response (allows you to get the operation location)
recognize_handw_results = computervision_client.read(remote_image_handw_text_url, raw=True)
# Get the operation location (URL with an ID at the end) from the response
operation_location_remote = recognize_handw_results.headers["Operation-Location"]
# Grab the ID from the URL
operation_id = operation_location_remote.split("/")[-1]
# Call the "GET" API and wait for it to retrieve the results
while True:
get_handw_text_results = computervision_client.get_read_result(operation_id)
if get_handw_text_results.status not in ['notStarted', 'running']:
break
time.sleep(1)
# Print the detected text, line by line
if get_handw_text_results.status == OperationStatusCodes.succeeded:
for text_result in get_handw_text_results.analyze_result.read_results:
for line in text_result.lines:
print(line.text)
ser = pd.Series(line.text)
output = output.append (ser,ignore_index=True)
#print(line.bounding_box)
print()
# Export the Output to MS Excel with custom sheet name
output.to_excel("Words.xlsx", sheet_name="WordList")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment