Skip to content

Instantly share code, notes, and snippets.

@navanchauhan
Created November 8, 2022 06:25
Embed
What would you like to do?
import fitz
from PIL import Image
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from array import array
import os
import sys
import time
from io import BytesIO
from tqdm import tqdm
subscription_key = ""
endpoint = ""
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
cooldown = 20 # seconds, 20 calls per minute for student tier but doesn't work without cooldown
def replace_markdown_chars(text):
text = text.replace("* *","**")
text = text.replace("[ ", "[")
text = text.replace(" ]", "]")
text = text.replace(" .", ".")
return text
def text2mdtxt(text):
"""Converts text to Markdown text."""
txt_block = ""
incomplete_line = ""
for line in text.splitlines():
if line[0] == '#': # Header
txt_block += incomplete_line + "\n"
incomplete_line = ""
line = line.replace(' #', '#')
txt_block += line + "\n"
elif "¶" in line.replace(' ',''):
txt_block += incomplete_line + "\n\n"
incomplete_line = ""
else:
line = line.strip()
incomplete_line += line + " "
txt_block += incomplete_line + "\n"
return replace_markdown_chars(txt_block)
def get_images_from_pdf(pdf_file):
pdf_file = fitz.open(pdf_file)
images = []
# Get Pages
for page_index in tqdm(range(len(pdf_file))):
page = pdf_file[page_index]
image_list = page.get_images()
# printing number of images found in this page
if image_list:
print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
else:
print("[!] No images found on page", page_index)
for image_index, img in enumerate(page.get_images(), start=1):
# get the XREF of the image
xref = img[0]
# extract the image bytes
base_image = pdf_file.extract_image(xref)
image_bytes = base_image["image"]
# get the image extension
image_ext = base_image["ext"]
images.append(image_bytes)
return images
def get_text_from_pdf(fname: str):
all_txt = ""
images = get_images_from_pdf(fname)
for image in tqdm(images):
'''
OCR: Read File using the Read API, extract text - local
This example will extract text in an image
This API call can also extract handwriting style text
'''
# Call API with file and raw response (allows you to get the operation location)
read_response = computervision_client.read_in_stream(BytesIO(image), raw=True)
# Get the operation location (URL with an ID at the end) from the response
read_operation_location = read_response.headers["Operation-Location"]
# Grab the ID from the URL
operation_id = read_operation_location.split("/")[-1]
# Call the "GET" API and wait for it to retrieve the results
while True:
read_result = computervision_client.get_read_result(operation_id)
if read_result.status not in ['notStarted', 'running']:
break
time.sleep(1)
# Append the detected text, line by line
if read_result.status == OperationStatusCodes.succeeded:
for text_result in read_result.analyze_result.read_results:
for line in text_result.lines:
all_txt += line.text + "\n"
# Now, sleep for cooldown period
time.sleep(cooldown)
return all_txt
if __name__ == "__main__":
fname = "PDFs/zamn.pdf"
txt = get_text_from_pdf(fname)
with open("output.md","w") as f:
f.write(text2mdtxt(txt))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment