Skip to content

Instantly share code, notes, and snippets.

@mancap314
Last active December 22, 2023 09:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mancap314/2a6fa4b0b9be1a1c2a7aa27981adc6ed to your computer and use it in GitHub Desktop.
Save mancap314/2a6fa4b0b9be1a1c2a7aa27981adc6ed to your computer and use it in GitHub Desktop.
Single local PDF extraction with Document AI processor
"""
Command-line tool to run OCR with a Document AI processor over a single local PDF file.

Install the dependencies with:
`pip install python-dotenv google-cloud google-cloud-documentai`

Also create a .env file to populate the default values, containing the following
keys: GCP_PROJECT_ID, PROCESSOR_ID, PROCESSOR_LOCATION, corresponding to an OCR
processor created on the Document AI platform of your GCP project.
"""
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from typing import Union
import os
import argparse
from dotenv import load_dotenv
def ocr_extract(
    file_path: Union[str, os.PathLike],
    mime_type: str,
    project_id: str,
    processor_id: str,
    processor_location: str,
) -> str:
    """Send one local file to a Document AI processor and return its text.

    Args:
        file_path: Path of the local file to upload; read fully into memory.
        mime_type: MIME type declared for the uploaded content
            (e.g. "application/pdf").
        project_id: GCP project that owns the processor.
        processor_id: ID of the Document AI processor to call.
        processor_location: Location of the processor; also used as the
            prefix of the API endpoint host name.

    Returns:
        The ``text`` attribute of the document returned by the processor.
    """
    # The API host is derived from the processor location.
    endpoint = f"{processor_location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # The processor has to be created in the Cloud Console beforehand.
    processor_name = client.processor_path(project_id, processor_location, processor_id)

    # Pull the raw bytes of the file into memory.
    with open(file_path, "rb") as handle:
        payload = handle.read()

    # Wrap the bytes in a RawDocument and build the processing request around it.
    process_request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
    )

    # Run the processor and keep only the extracted text of the result.
    response = client.process_document(request=process_request)
    extracted_text = response.document.text
    print("[INFO] ocr_extract(): Document processing complete.")
    return extracted_text
def validate_file(f):
    """Check that ``f`` is an existing regular file (argparse ``type=`` callback).

    Args:
        f: Path given on the command line.

    Returns:
        ``f`` unchanged when it points to an existing file.

    Raises:
        argparse.ArgumentTypeError: If ``f`` is not an existing file.
    """
    if not os.path.isfile(f):
        # The f-string already interpolates the path. Calling .format() on the
        # result would re-parse it and break on paths containing '{' or '}'.
        raise argparse.ArgumentTypeError(f"{f} does not exist")
    return f
if __name__ == "__main__":
    # Load the .env file (if any) so its values can serve as argument defaults.
    load_dotenv()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file-path",
        help="Path of the file to extract",
        type=validate_file,
        dest="file_path",
        required=True,
    )
    parser.add_argument(
        "-m", "--mime-type",
        help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
        type=str,
        dest="mime_type",
        required=False,
        default="application/pdf",
    )
    parser.add_argument(
        "-p", "--project-id",
        help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
        type=str,
        dest="project_id",
        required=False,
        default=os.getenv("GCP_PROJECT_ID"),
    )
    parser.add_argument(
        "-r", "--processor-id",
        help="ID of the Processor created on the GCloud Document AI console",
        type=str,
        dest="processor_id",
        required=False,
        default=os.getenv("PROCESSOR_ID"),
    )
    parser.add_argument(
        "-l", "--location",
        help="Location of the Document AI Processor. 'eu' or 'us'.",
        type=str,
        dest="processor_location",
        required=False,
        default=os.getenv("PROCESSOR_LOCATION"),
    )
    args = parser.parse_args()

    # The environment-backed defaults are None when the variable is unset.
    # Fail early with a clear message instead of passing None on to
    # ocr_extract(), which would build an endpoint like
    # "None-documentai.googleapis.com".
    env_backed = (
        ("-p/--project-id", "GCP_PROJECT_ID", args.project_id),
        ("-r/--processor-id", "PROCESSOR_ID", args.processor_id),
        ("-l/--location", "PROCESSOR_LOCATION", args.processor_location),
    )
    missing = [
        f"{flag} (or env var {env_var})"
        for flag, env_var, value in env_backed
        if not value
    ]
    if missing:
        parser.error("missing required value(s): " + ", ".join(missing))

    text = ocr_extract(
        file_path=args.file_path,
        mime_type=args.mime_type,
        project_id=args.project_id,
        processor_id=args.processor_id,
        processor_location=args.processor_location,
    )
    # Do whatever locally with the text then...
    print(text[:100])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment