# Source: GitHub gist navanchauhan/5fc602b1e023b60a66bc63bd4eecd4f8
# (created November 8, 2022 06:25).
import os
import sys
import time
from array import array
from io import BytesIO

import fitz
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from PIL import Image
from tqdm import tqdm
# Credentials for the Computer Vision service. Both values are empty
# placeholders here; presumably they must be filled in with a real key and
# endpoint URL before the script can make any request.
subscription_key = ""
endpoint = ""
# Module-level client built from the credentials above; created at import time.
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
# Pause length between OCR requests, in seconds. The student tier nominally
# allows 20 calls per minute, but requests reportedly fail without a cooldown.
cooldown = 20 # seconds, 20 calls per minute for student tier but doesn't work without cooldown
def replace_markdown_chars(text: str) -> str:
    """Collapse stray spaces that split Markdown punctuation.

    The substitutions are applied in order, each over the whole string:
    ``"* *"`` -> ``"**"``, ``"[ "`` -> ``"["``, ``" ]"`` -> ``"]"`` and
    ``" ."`` -> ``"."``.

    Args:
        text: The text to clean up.

    Returns:
        The text with the substitutions applied.
    """
    # Order matters only in that it mirrors the original call sequence.
    replacements = (
        ("* *", "**"),
        ("[ ", "["),
        (" ]", "]"),
        (" .", "."),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def text2mdtxt(text):
    """Convert line-oriented text to Markdown text.

    Lines are handled as follows:

    * A line starting with ``#`` is a header: any pending paragraph text is
      flushed, then the header is written on its own line (with ``" #"``
      collapsed to ``"#"``).
    * A line containing the pilcrow sign is a paragraph break: pending text
      is flushed followed by a blank line. The pilcrow line itself is not
      emitted.
    * Any other non-blank line is stripped and joined onto the pending
      paragraph with a single trailing space.

    Blank lines are ignored (indexing ``line[0]`` on them used to raise
    ``IndexError``).

    Args:
        text: The raw text, one visual line per line.

    Returns:
        The Markdown text after ``replace_markdown_chars`` clean-up.
    """
    chunks = []
    pending = ""
    for line in text.splitlines():
        if line.startswith("#"):  # Header; startswith is safe on empty lines
            chunks.append(pending + "\n")
            pending = ""
            chunks.append(line.replace(" #", "#") + "\n")
        elif "¶" in line:
            # Paragraph separator: close the paragraph with a blank line.
            chunks.append(pending + "\n\n")
            pending = ""
        else:
            stripped = line.strip()
            if stripped:
                pending += stripped + " "
    # Flush whatever paragraph text is still pending at end of input.
    chunks.append(pending + "\n")
    return replace_markdown_chars("".join(chunks))
def get_images_from_pdf(pdf_file):
    """Extract the raw bytes of every image embedded in a PDF.

    Args:
        pdf_file: Path of the PDF document to open.

    Returns:
        A list with the encoded bytes of each embedded image, in page order.
        The list is empty when the document has no embedded images.
    """
    images = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_file) as doc:
        for page_index, page in tqdm(enumerate(doc), total=len(doc)):
            image_list = page.get_images()
            # Report how many images were found on this page.
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for img in image_list:
                # The first item of each entry is the image's XREF.
                xref = img[0]
                base_image = doc.extract_image(xref)
                images.append(base_image["image"])
    return images
def get_text_from_pdf(fname: str):
    """OCR every image embedded in a PDF and return the recognised text.

    Each image is submitted to the Read API through the module-level
    ``computervision_client``; the function then polls until the operation
    leaves the ``notStarted``/``running`` states and sleeps ``cooldown``
    seconds before sending the next image.

    Args:
        fname: Path of the PDF document.

    Returns:
        The recognised text, one detected line per line. Images whose read
        operation does not succeed contribute nothing.
    """
    collected = []
    images = get_images_from_pdf(fname)
    for image in tqdm(images):
        # OCR: read the image with the Read API (local stream). The raw
        # response is requested so the operation location header is available.
        read_response = computervision_client.read_in_stream(BytesIO(image), raw=True)
        # Get the operation location (URL with an ID at the end) from the response
        read_operation_location = read_response.headers["Operation-Location"]
        # Grab the ID from the URL
        operation_id = read_operation_location.split("/")[-1]
        # Poll the "GET" API until the operation has finished.
        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status not in ['notStarted', 'running']:
                break
            time.sleep(1)
        # Append the detected text, line by line
        if read_result.status == OperationStatusCodes.succeeded:
            for text_result in read_result.analyze_result.read_results:
                for line in text_result.lines:
                    collected.append(line.text + "\n")
        # Now, sleep for cooldown period
        time.sleep(cooldown)
    return "".join(collected)
if __name__ == "__main__":
    # PDF to run OCR on.
    fname = "PDFs/zamn.pdf"
    txt = get_text_from_pdf(fname)
    # open("") cannot succeed, so the output file is named after the input
    # PDF and created in the current working directory.
    out_name = os.path.splitext(os.path.basename(fname))[0] + ".txt"
    with open(out_name, "w", encoding="utf-8") as f:
        f.write(txt)
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment