Skip to content

Instantly share code, notes, and snippets.

@rasmi
Created November 30, 2018 18:00
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rasmi/835a25e36ffff1c5d460dd2b2819299a to your computer and use it in GitHub Desktop.
Save rasmi/835a25e36ffff1c5d460dd2b2819299a to your computer and use it in GitHub Desktop.
Google Cloud Vision API Document OCR
#!/usr/bin/env python
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform OCR on a directory of PDF documents in Google Cloud Storage.
Example Usage:
GCS_BUCKET="gs://my-bucket"
PDF_INPUT_DIRECTORY="${GCS_BUCKET}/pdf-input-directory"
OCR_OUTPUT_DIRECTORY="${GCS_BUCKET}/ocr-output-directory"
TEXT_OUTPUT_DIRECTORY="${GCS_BUCKET}/text-output-directory"
# To kick off an OCR process, run:
python document_ocr.py run-ocr \
--input-directory $PDF_INPUT_DIRECTORY \
--output-directory $OCR_OUTPUT_DIRECTORY
# Wait until OCR is complete.
# To write the plaintext output to GCS, run:
python document_ocr.py convert-ocr-output \
--input-directory $OCR_OUTPUT_DIRECTORY \
--output-directory $TEXT_OUTPUT_DIRECTORY
# To print the plaintext output locally, run:
python document_ocr.py print-ocr-output \
--input-directory $OCR_OUTPUT_DIRECTORY
Based on
https://github.com/GoogleCloudPlatform/python-docs-samples/blob/8f28cb6fc85335b0b4fb0b0a113d0248852ac94d/vision/cloud-client/detect/detect.py#L665
"""
import argparse
import os
import re
from google.cloud import storage
from google.cloud import vision
from google.protobuf import json_format
def gcs_bucket_and_prefix(gcs_path):
    """Split a ``gs://`` path into its bucket name and object prefix.

    Args:
        gcs_path: A Cloud Storage path such as ``gs://my-bucket/some/dir``.
            A bare bucket (``gs://my-bucket`` or ``gs://my-bucket/``) is
            accepted and yields an empty prefix.

    Returns:
        A ``(bucket_name, prefix)`` tuple of strings.

    Raises:
        ValueError: If ``gcs_path`` does not start with ``gs://<bucket>``.
    """
    match = re.match(r'gs://([^/]+)(?:/(.*))?', gcs_path)
    if match is None:
        raise ValueError(
            'Expected a path of the form gs://bucket/prefix, got: '
            '{!r}'.format(gcs_path))
    # The prefix group is None when nothing follows the bucket name.
    return (match.group(1), match.group(2) or '')
def list_blobs(gcs_directory):
    """Return the blobs stored under a Cloud Storage directory.

    Args:
        gcs_directory: A ``gs://bucket/prefix`` path. Every object whose name
            starts with the prefix is listed.

    Returns:
        A list of blobs whose names do not end in ``/`` (names ending in
        ``/`` are treated as directory placeholders and dropped).
    """
    storage_client = storage.Client()
    bucket_name, prefix = gcs_bucket_and_prefix(gcs_directory)
    # Pass the name positionally: newer google-cloud-storage releases call
    # this parameter ``bucket_or_name``, so the ``bucket_name=`` keyword
    # is not portable across library versions.
    bucket = storage_client.get_bucket(bucket_name)
    return [
        blob for blob in bucket.list_blobs(prefix=prefix)
        if not blob.name.endswith('/')
    ]
def write_string_to_gcs(string, uri):
    """Upload a string as the contents of a Cloud Storage object.

    Args:
        string: The data to upload.
        uri: Destination object path of the form ``gs://bucket/object-name``.
    """
    storage_client = storage.Client()
    bucket_name, object_name = gcs_bucket_and_prefix(uri)
    # Pass the name positionally: newer google-cloud-storage releases call
    # this parameter ``bucket_or_name``, so the ``bucket_name=`` keyword
    # is not portable across library versions.
    bucket = storage_client.get_bucket(bucket_name)
    bucket.blob(object_name).upload_from_string(string)
def blob_uri(blob):
    """Build the ``gs://bucket/object`` URI that addresses ``blob``."""
    bucket_name = blob.bucket.name
    object_name = blob.name
    return 'gs://{}/{}'.format(bucket_name, object_name)
def filename_extension(path):
    """Split the final component of ``path`` into name and extension.

    The split happens at the LAST dot, so names containing several dots
    (``my.report.pdf``) or none at all (``README``) are handled instead of
    raising ``ValueError``.

    Args:
        path: A file path or object name; only its basename is examined.

    Returns:
        A ``(filename, extension)`` tuple. The extension excludes the dot
        and is the empty string when the basename contains no dot.
    """
    full_filename = os.path.basename(path)
    filename, dot, extension = full_filename.rpartition('.')
    if not dot:
        # No dot at all: the whole basename is the filename.
        return (full_filename, '')
    return (filename, extension)
def create_ocr_request(source_blob, output_directory,
                       mime_type='application/pdf', batch_size=100):
    """Build an asynchronous document-OCR request for one source file.

    Args:
        source_blob: The GCS blob to run OCR on.
        output_directory: ``gs://`` directory that receives the JSON output.
            Output objects are prefixed with ``<source filename>-``.
        mime_type: MIME type of the source file. Supported mime_types are
            'application/pdf' and 'image/tiff'. Defaults to PDF.
        batch_size: How many pages should be grouped into each JSON output
            file (max 100). Defaults to 100.

    Returns:
        A ``vision.types.AsyncAnnotateFileRequest`` requesting
        DOCUMENT_TEXT_DETECTION on the source file.
    """
    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # Source file input configuration.
    gcs_source = vision.types.GcsSource(uri=blob_uri(source_blob))
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source,
        mime_type=mime_type)

    # Destination file output configuration.
    filename, _ = filename_extension(source_blob.name)
    destination_uri = '{}/{}-'.format(output_directory.rstrip('/'), filename)
    gcs_destination = vision.types.GcsDestination(uri=destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination,
        batch_size=batch_size)

    return vision.types.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)
def async_detect_document(input_directory, output_directory):
    """Submit an asynchronous OCR request for every file in a GCS directory.

    One request is sent per blob found under ``input_directory``; the
    results of the calls are not kept and nothing is waited on here.
    """
    annotator = vision.ImageAnnotatorClient()
    for source_blob in list_blobs(input_directory):
        ocr_request = create_ocr_request(source_blob, output_directory)
        annotator.async_batch_annotate_files(requests=[ocr_request])
def read_ocr_output(blob):
    """Download one OCR output blob and return its text.

    The blob's JSON is parsed into an ``AnnotateFileResponse``; the full-text
    annotation of each contained response is joined with newlines.
    """
    raw_json = blob.download_as_string()
    file_response = json_format.Parse(
        raw_json, vision.types.AnnotateFileResponse())
    page_texts = [
        page_response.full_text_annotation.text
        for page_response in file_response.responses
    ]
    return '\n'.join(page_texts)
def convert_ocr_output(input_directory, output_directory):
    """Write the text of each OCR output blob to a ``.txt`` object on GCS.

    Each blob under ``input_directory`` produces
    ``<output_directory>/<blob filename>.txt``.
    """
    for ocr_blob in list_blobs(input_directory):
        text = read_ocr_output(ocr_blob)
        stem, _ = filename_extension(ocr_blob.name)
        target_uri = '{}/{}.txt'.format(output_directory.rstrip('/'), stem)
        write_string_to_gcs(text, target_uri)
def print_ocr_output(input_directory):
    """Print the text of every OCR output blob under ``input_directory``."""
    for ocr_blob in list_blobs(input_directory):
        text = read_ocr_output(ocr_blob)
        print(text)
if __name__ == '__main__':
    # Command-line entry point: dispatch to one of the three OCR commands.
    parser = argparse.ArgumentParser()
    # Restrict the command to the known set so a typo is reported instead of
    # silently doing nothing.
    parser.add_argument(
        'command',
        choices=['run-ocr', 'convert-ocr-output', 'print-ocr-output'])
    parser.add_argument('--input-directory', required=True)
    parser.add_argument('--output-directory')
    args = parser.parse_args()

    # Validate with parser.error rather than assert: asserts are stripped
    # when Python runs with -O, and parser.error prints a usage message.
    if args.command == 'run-ocr':
        if not args.output_directory:
            parser.error('--output-directory is required to run OCR.')
        async_detect_document(args.input_directory, args.output_directory)
    elif args.command == 'convert-ocr-output':
        if not args.output_directory:
            parser.error(
                '--output-directory is required to convert OCR output.')
        convert_ocr_output(args.input_directory, args.output_directory)
    elif args.command == 'print-ocr-output':
        print_ocr_output(args.input_directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment