Skip to content

Instantly share code, notes, and snippets.

@mancap314
Last active December 22, 2023 10:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mancap314/a96ae779eee07e6b9c940e8b72bb9c87 to your computer and use it in GitHub Desktop.
Save mancap314/a96ae779eee07e6b9c940e8b72bb9c87 to your computer and use it in GitHub Desktop.
"""
Command line tool for processing of a batch of PDF files stored on gcloud storage with a Document
AI processor.
`pip install python-dotenv google-cloud google-cloud-documentai
google-cloud-documentai-toolbox google-cloud-storage`
Also create a .env file for populating default values, containing the following
keys: GCP_PROJECT_ID, PROCESSOR_ID, PROCESSOR_LOCATION, GCS_DOCUMENT_BUCKET and
GCS_TEXT_BUCKET. The first three identify an OCR processor created on the
Document AI platform of your GCP project. The last two keys are the names of
Google Cloud Storage buckets (without the 'gs://' prefix): the first holds the
stored PDF documents, the second receives the processing results.
"""
from typing import Optional
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud import documentai_toolbox
import os
from dotenv import load_dotenv
import argparse
def batch_process_toolbox(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    field_mask: Optional[str] = None,
):
    """Batch-process documents stored on GCS with a Document AI processor.

    Submits a long-running batch process request to Document AI and prints the
    first 100 characters of each resulting document's text.

    Args:
        project_id: GCP project ID hosting the Document AI processor.
        location: Processor location, e.g. "us" or "eu".
        processor_id: ID of the Document AI processor to use.
        gcs_input_uri: Either a single document URI (e.g. gs://bucket/file.pdf)
            or a directory prefix (e.g. gs://bucket/dir/) to process entirely.
        gcs_output_uri: GCS URI prefix where JSON results are written.
        processor_version_id: Optional specific processor version; the
            processor's default version is used when omitted.
        input_mime_type: MIME type of a single input document (ignored for
            directory prefixes). See Document AI supported file types.
        field_mask: Comma-separated fields to keep in the output Document
            objects, e.g. "text,pages.pageNumber".

    Raises:
        ValueError: If any required argument is missing or empty.
    """
    # Fail fast with a clear message instead of an opaque API error when a
    # required value is missing (e.g. an unset .env key passed through as None).
    required = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": gcs_output_uri,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        raise ValueError(f"Missing required argument(s): {', '.join(missing)}")

    # The api_endpoint must be set for any location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if not gcs_input_uri.endswith("/") and "." in gcs_input_uri:
        # URI looks like a single file: process that individual document.
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(
            gcs_documents=gcs_documents
        )
    else:
        # URI is a prefix: process every document under it.
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Where to write the JSON results on Cloud Storage.
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=gcs_output_config
    )

    if processor_version_id:
        # Full resource name of a specific processor version:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # Full resource name of the processor (default version):
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # batch_process_documents returns a Long-Running Operation (LRO); the
    # toolbox helper below blocks until it completes and loads the results.
    operation = client.batch_process_documents(request)

    # Operation name format:
    # projects/{project_id}/locations/{location}/operations/{operation_id}
    documents = documentai_toolbox.document.Document.from_batch_process_operation(
        location=location, operation_name=operation.operation.name
    )

    for document in documents:
        # Print the OCR text, truncated at 100 characters for brevity.
        print("The document contains the following text:")
        print(document.text[:100])
if __name__ == "__main__":
load_dotenv()
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input-uri",
help="Input URI of GCloud Storage, e.g. gs://truc/[/bidule.pdf]",
type=str,
dest="input_uri",
required=False,
default=os.getenv("GCS_DOCUMENT_BUCKET")
)
parser.add_argument("-o", "--output-uri",
help="Output URI of GCloud Storage, e.g. gs://truc/[/bidule.pdf]",
type=str,
dest="output_uri",
required=False,
default=os.getenv("GCS_TEXT_BUCKET")
)
parser.add_argument("-m", "--mime-type",
help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
type=str,
dest="mime_type",
required=False,
default="application/pdf"
)
parser.add_argument("-p", "--project-id",
help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
type=str,
dest="project_id",
required=False,
default=os.getenv("GCP_PROJECT_ID")
)
parser.add_argument("-r", "--processor-id",
help="ID of the Processor created on the GCloud Document AI console",
type=str,
dest="processor_id",
required=False,
default=os.getenv("PROCESSOR_ID")
)
parser.add_argument("-l", "--location",
help="Location of the Document AI Processor. 'eu' or 'us'.",
type=str,
dest="processor_location",
required=False,
default=os.getenv("PROCESSOR_LOCATION")
)
parser.add_argument("-f", "--field-mask",
help="Fields to return in the document object",
type=str,
dest="field_mask",
required=False,
default="text,pages.pageNumber"
)
args = parser.parse_args()
input_uri = args.input_uri
output_uri = args.output_uri
mime_type = args.mime_type
project_id = args.project_id
processor_id = args.processor_id
processor_location = args.processor_location
field_mask = args.field_mask
batch_process_toolbox(
project_id=project_id,
location=processor_location,
processor_id=processor_id,
gcs_input_uri=input_uri,
gcs_output_uri=output_uri,
input_mime_type=mime_type,
field_mask=field_mask,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment