mancap314/single_local_documentai_ocr.py

## single_local_documentai_ocr.py
"""
Command line tool to run OCR with a Document AI processor over a single local PDF file
`pip install python-dotenv google-cloud google-cloud-documentai`
Also create a .env file to populate default values, containing the following
keys: GCP_PROJECT_ID, PROCESSOR_ID, PROCESSOR_LOCATION corresponding to an OCR
processor created on the Document AI platform of your GCP project.
"""

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from typing import Union
import os
import argparse
from dotenv import load_dotenv


def ocr_extract(
        file_path: Union[str, os.PathLike],
        mime_type: str,
        project_id: str,
        processor_id: str,
        processor_location: str,
) -> str:
    """Send one local file to a Document AI processor and return its text.

    Args:
        file_path: Path of the local file to process.
        mime_type: MIME type of the file, e.g. "application/pdf".
        project_id: ID of the GCP project that owns the processor.
        processor_id: ID of the Document AI processor to call.
        processor_location: Location of the processor. It is used both as
            the prefix of the API endpoint host name and as part of the
            processor resource name.

    Returns:
        The ``text`` attribute of the document returned by the processor.

    Raises:
        ValueError: If ``project_id``, ``processor_id`` or
            ``processor_location`` is empty or None.
        OSError: If the file cannot be opened or read.

    Errors raised by the Document AI client are not caught here and
    propagate to the caller unchanged.
    """
    # Fail fast on missing settings: otherwise a None value would be
    # interpolated into the endpoint ("None-documentai.googleapis.com") and
    # the resource name, and only surface later as an opaque API error.
    settings = {
        "project_id": project_id,
        "processor_id": processor_id,
        "processor_location": processor_location,
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            f"Missing required Document AI setting(s): {', '.join(missing)}"
        )

    # The client must target the endpoint of the processor's location.
    endpoint = f"{processor_location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # The processor must already exist (create it in the Cloud Console first).
    resource_name = docai_client.processor_path(
        project_id,
        processor_location,
        processor_id
    )

    # Read the whole file into memory and close it before the API call, so
    # the file handle is not held open for the duration of the request.
    with open(file_path, "rb") as source_file:
        file_content = source_file.read()

    # Wrap the raw bytes and build the processing request.
    raw_document = documentai.RawDocument(
        content=file_content,
        mime_type=mime_type
    )
    request = documentai.ProcessRequest(
        name=resource_name,
        raw_document=raw_document
    )

    # Synchronous call: returns once the processor has handled the document.
    result = docai_client.process_document(request=request)
    print("[INFO] ocr_extract(): Document processing complete.")

    return result.document.text


def validate_file(f):
    """Check that a command-line path names an existing file.

    Intended for use as an argparse ``type=`` callback.

    Args:
        f: Path as given on the command line.

    Returns:
        ``f`` unchanged when ``os.path.isfile(f)`` is true.

    Raises:
        argparse.ArgumentTypeError: If ``f`` is not an existing file.
    """
    if not os.path.isfile(f):
        # The f-string already yields the final message. Calling .format()
        # on it would re-interpret any braces contained in the path and
        # raise KeyError/IndexError instead of the intended argparse error.
        raise argparse.ArgumentTypeError(f"{f} does not exist")
    return f


if __name__ == "__main__":
    # Load the .env file (if any) so the environment-backed defaults below
    # can be resolved.
    load_dotenv()

    # Settings that may come from the environment. An empty value is treated
    # as unset, in which case the matching option becomes mandatory on the
    # command line instead of silently defaulting to None.
    env_project_id = os.getenv("GCP_PROJECT_ID") or None
    env_processor_id = os.getenv("PROCESSOR_ID") or None
    env_processor_location = os.getenv("PROCESSOR_LOCATION") or None

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file-path",
        help="Path of the file to extract",
        type=validate_file,
        dest="file_path",
        required=True,
    )
    parser.add_argument(
        "-m", "--mime-type",
        help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
        type=str,
        dest="mime_type",
        required=False,
        default="application/pdf",
    )
    parser.add_argument(
        "-p", "--project-id",
        help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
        type=str,
        dest="project_id",
        required=env_project_id is None,
        default=env_project_id,
    )
    parser.add_argument(
        "-r", "--processor-id",
        help="ID of the Processor created on the GCloud  Document AI console",
        type=str,
        dest="processor_id",
        required=env_processor_id is None,
        default=env_processor_id,
    )
    parser.add_argument(
        "-l", "--location",
        help="Location of the Document AI Processor. 'eu' or 'us'.",
        type=str,
        dest="processor_location",
        required=env_processor_location is None,
        default=env_processor_location,
    )

    args = parser.parse_args()

    text = ocr_extract(
        file_path=args.file_path,
        mime_type=args.mime_type,
        project_id=args.project_id,
        processor_id=args.processor_id,
        processor_location=args.processor_location,
    )
    # Placeholder for real post-processing: show the first 100 characters.
    print(text[:100])
	"""
	Command line tool to run OCR with a Document AI processor over a single local PDF file
	`pip install python-dotenv google-cloud google-cloud-documentai`
	Also create a .env file to populate default values, containing the following
	keys: GCP_PROJECT_ID, PROCESSOR_ID, PROCESSOR_LOCATION corresponding to an OCR
	processor created on the Document AI platform of your GCP project.
	"""

	from google.api_core.client_options import ClientOptions
	from google.cloud import documentai
	from typing import Union
	import os
	import argparse
	from dotenv import load_dotenv


	def ocr_extract(
	        file_path: Union[str, os.PathLike],
	        mime_type: str,
	        project_id: str,
	        processor_id: str,
	        processor_location: str,
	) -> str:
	    """Send one local file to a Document AI processor and return its text.

	    Args:
	        file_path: Path of the local file to process.
	        mime_type: MIME type of the file, e.g. "application/pdf".
	        project_id: ID of the GCP project that owns the processor.
	        processor_id: ID of the Document AI processor to call.
	        processor_location: Location of the processor. It is used both as
	            the prefix of the API endpoint host name and as part of the
	            processor resource name.

	    Returns:
	        The ``text`` attribute of the document returned by the processor.

	    Raises:
	        ValueError: If ``project_id``, ``processor_id`` or
	            ``processor_location`` is empty or None.
	        OSError: If the file cannot be opened or read.

	    Errors raised by the Document AI client are not caught here and
	    propagate to the caller unchanged.
	    """
	    # Fail fast on missing settings: otherwise a None value would be
	    # interpolated into the endpoint ("None-documentai.googleapis.com") and
	    # the resource name, and only surface later as an opaque API error.
	    settings = {
	        "project_id": project_id,
	        "processor_id": processor_id,
	        "processor_location": processor_location,
	    }
	    missing = [name for name, value in settings.items() if not value]
	    if missing:
	        raise ValueError(
	            f"Missing required Document AI setting(s): {', '.join(missing)}"
	        )

	    # The client must target the endpoint of the processor's location.
	    endpoint = f"{processor_location}-documentai.googleapis.com"
	    docai_client = documentai.DocumentProcessorServiceClient(
	        client_options=ClientOptions(api_endpoint=endpoint)
	    )

	    # The full resource name of the processor, e.g.:
	    # projects/project-id/locations/location/processors/processor-id
	    # The processor must already exist (create it in the Cloud Console first).
	    resource_name = docai_client.processor_path(
	        project_id,
	        processor_location,
	        processor_id
	    )

	    # Read the whole file into memory and close it before the API call, so
	    # the file handle is not held open for the duration of the request.
	    with open(file_path, "rb") as source_file:
	        file_content = source_file.read()

	    # Wrap the raw bytes and build the processing request.
	    raw_document = documentai.RawDocument(
	        content=file_content,
	        mime_type=mime_type
	    )
	    request = documentai.ProcessRequest(
	        name=resource_name,
	        raw_document=raw_document
	    )

	    # Synchronous call: returns once the processor has handled the document.
	    result = docai_client.process_document(request=request)
	    print("[INFO] ocr_extract(): Document processing complete.")

	    return result.document.text


	def validate_file(f):
	    """Check that a command-line path names an existing file.

	    Intended for use as an argparse ``type=`` callback.

	    Args:
	        f: Path as given on the command line.

	    Returns:
	        ``f`` unchanged when ``os.path.isfile(f)`` is true.

	    Raises:
	        argparse.ArgumentTypeError: If ``f`` is not an existing file.
	    """
	    if not os.path.isfile(f):
	        # The f-string already yields the final message. Calling .format()
	        # on it would re-interpret any braces contained in the path and
	        # raise KeyError/IndexError instead of the intended argparse error.
	        raise argparse.ArgumentTypeError(f"{f} does not exist")
	    return f


	if __name__ == "__main__":
	    # Load the .env file (if any) so the environment-backed defaults below
	    # can be resolved.
	    load_dotenv()

	    # Settings that may come from the environment. An empty value is treated
	    # as unset, in which case the matching option becomes mandatory on the
	    # command line instead of silently defaulting to None.
	    env_project_id = os.getenv("GCP_PROJECT_ID") or None
	    env_processor_id = os.getenv("PROCESSOR_ID") or None
	    env_processor_location = os.getenv("PROCESSOR_LOCATION") or None

	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-f", "--file-path",
	        help="Path of the file to extract",
	        type=validate_file,
	        dest="file_path",
	        required=True,
	    )
	    parser.add_argument(
	        "-m", "--mime-type",
	        help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
	        type=str,
	        dest="mime_type",
	        required=False,
	        default="application/pdf",
	    )
	    parser.add_argument(
	        "-p", "--project-id",
	        help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
	        type=str,
	        dest="project_id",
	        required=env_project_id is None,
	        default=env_project_id,
	    )
	    parser.add_argument(
	        "-r", "--processor-id",
	        help="ID of the Processor created on the GCloud Document AI console",
	        type=str,
	        dest="processor_id",
	        required=env_processor_id is None,
	        default=env_processor_id,
	    )
	    parser.add_argument(
	        "-l", "--location",
	        help="Location of the Document AI Processor. 'eu' or 'us'.",
	        type=str,
	        dest="processor_location",
	        required=env_processor_location is None,
	        default=env_processor_location,
	    )

	    args = parser.parse_args()

	    text = ocr_extract(
	        file_path=args.file_path,
	        mime_type=args.mime_type,
	        project_id=args.project_id,
	        processor_id=args.processor_id,
	        processor_location=args.processor_location,
	    )
	    # Placeholder for real post-processing: show the first 100 characters.
	    print(text[:100])