Skip to content

Instantly share code, notes, and snippets.

@mancap314
Last active December 22, 2023 10:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mancap314/a96ae779eee07e6b9c940e8b72bb9c87 to your computer and use it in GitHub Desktop.
Save mancap314/a96ae779eee07e6b9c940e8b72bb9c87 to your computer and use it in GitHub Desktop.
"""
Command line tool for processing of a batch of PDF files stored on gcloud storage with a Document
AI processor.
`pip install python-dotenv google-cloud google-cloud-documentai
google-cloud-documentai-toolbox google-cloud-storage`
Also create a .env file for populating default values, containing the following
keys: GCP_PROJECT_ID, PROCESSOR_ID, PROCESSOR_LOCATION, GCS_DOCUMENT_BUCKET and
GCS_TEXT_BUCKET. The first three identify an OCR processor created on the
Document AI platform of your GCP project. The last two keys are the names of
Google Cloud Storage buckets (without the 'gs://' prefix): the first holds the
stored PDF documents, the second receives the processing results.
"""
from typing import Optional
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud import documentai_toolbox
import os
from dotenv import load_dotenv
import argparse
def batch_process_toolbox(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    field_mask: Optional[str] = None,
):
    """Batch-process documents stored on GCS with a Document AI processor.

    Submits a long-running batch process request to Document AI and prints the
    first 100 characters of each resulting document's text.

    Args:
        project_id: GCP project ID hosting the Document AI processor.
        location: Processor location, e.g. "us" or "eu".
        processor_id: ID of the Document AI processor to use.
        gcs_input_uri: Either a single document URI (e.g. gs://bucket/file.pdf)
            or a directory prefix (e.g. gs://bucket/dir/) to process entirely.
        gcs_output_uri: GCS URI prefix where JSON results are written.
        processor_version_id: Optional specific processor version; the
            processor's default version is used when omitted.
        input_mime_type: MIME type of a single input document (ignored for
            directory prefixes). See Document AI supported file types.
        field_mask: Comma-separated fields to keep in the output Document
            objects, e.g. "text,pages.pageNumber".

    Raises:
        ValueError: If any required argument is missing or empty.
    """
    # Fail fast with a clear message instead of an opaque API error when a
    # required value is missing (e.g. an unset .env key passed through as None).
    required = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": gcs_output_uri,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        raise ValueError(f"Missing required argument(s): {', '.join(missing)}")

    # The api_endpoint must be set for any location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if not gcs_input_uri.endswith("/") and "." in gcs_input_uri:
        # URI looks like a single file: process that individual document.
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(
            gcs_documents=gcs_documents
        )
    else:
        # URI is a prefix: process every document under it.
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Where to write the JSON results on Cloud Storage.
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=gcs_output_config
    )

    if processor_version_id:
        # Full resource name of a specific processor version:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # Full resource name of the processor (default version):
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # batch_process_documents returns a Long-Running Operation (LRO); the
    # toolbox helper below blocks until it completes and loads the results.
    operation = client.batch_process_documents(request)

    # Operation name format:
    # projects/{project_id}/locations/{location}/operations/{operation_id}
    documents = documentai_toolbox.document.Document.from_batch_process_operation(
        location=location, operation_name=operation.operation.name
    )

    for document in documents:
        # Print the OCR text, truncated at 100 characters for brevity.
        print("The document contains the following text:")
        print(document.text[:100])
if __name__ == "__main__":
load_dotenv()
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input-uri",
help="Input URI of GCloud Storage, e.g. gs://truc/[/bidule.pdf]",
type=str,
dest="input_uri",
required=False,
default=os.getenv("GCS_DOCUMENT_BUCKET")
)
parser.add_argument("-o", "--output-uri",
help="Output URI of GCloud Storage, e.g. gs://truc/[/bidule.pdf]",
type=str,
dest="output_uri",
required=False,
default=os.getenv("GCS_TEXT_BUCKET")
)
parser.add_argument("-m", "--mime-type",
help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
type=str,
dest="mime_type",
required=False,
default="application/pdf"
)
parser.add_argument("-p", "--project-id",
help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
type=str,
dest="project_id",
required=False,
default=os.getenv("GCP_PROJECT_ID")
)
parser.add_argument("-r", "--processor-id",
help="ID of the Processor created on the GCloud Document AI console",
type=str,
dest="processor_id",
required=False,
default=os.getenv("PROCESSOR_ID")
)
parser.add_argument("-l", "--location",
help="Location of the Document AI Processor. 'eu' or 'us'.",
type=str,
dest="processor_location",
required=False,
default=os.getenv("PROCESSOR_LOCATION")
)
parser.add_argument("-f", "--field-mask",
help="Fields to return in the document object",
type=str,
dest="field_mask",
required=False,
default="text,pages.pageNumber"
)
args = parser.parse_args()
input_uri = args.input_uri
output_uri = args.output_uri
mime_type = args.mime_type
project_id = args.project_id
processor_id = args.processor_id
processor_location = args.processor_location
field_mask = args.field_mask
batch_process_toolbox(
project_id=project_id,
location=processor_location,
processor_id=processor_id,
gcs_input_uri=input_uri,
gcs_output_uri=output_uri,
input_mime_type=mime_type,
field_mask=field_mask,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment