atuyosi/convert_image_and_ocr.py

## convert_image_and_ocr.py
from pdf2image import convert_from_path, convert_from_bytes

import tempfile,os,sys
from pathlib import Path
import io
from time import sleep
from google.protobuf.json_format import MessageToJson
from google.cloud import vision
from retry import retry

# tries must be positive: the retry package counts down to zero, so a negative
# value other than the documented -1 also means "retry forever".
@retry(tries=3, delay=1)
def call_api_with_image(content):
    """Run Cloud Vision text detection on one image and return the JSON text.

    The GOOGLE_APPLICATION_CREDENTIALS environment variable is expected to be
    set beforehand, e.g.

        $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"

    Args:
        content: Encoded image bytes, sent as the ``content`` field of the
            request (the script in this file sends PNG data).

    Returns:
        The API response serialized to a JSON string by ``MessageToJson``.

    Raises:
        Exception: Any error raised while creating the client, calling the
            API or serializing the response. It is logged and re-raised; the
            ``retry`` decorator makes at most 3 attempts, 1 second apart.
    """
    try:
        client = vision.ImageAnnotatorClient()
        response = client.text_detection({'content': content})
        return MessageToJson(response)
    except Exception as e:
        print("exception in {0}, {1}".format("call_api", e))
        # Bare raise keeps the original traceback for the retry decorator.
        raise


# Entries directly under ./pdf; each sub-directory is processed in turn.
# NOTE: this runs at import time, so ./pdf must exist when the module loads.
subdir_list = os.listdir('./pdf')

p = Path('./pdf')

if __name__ == '__main__':

    # For every pdf/<subdir>/<name>.pdf, OCR each page and write the API
    # response to json/<subdir>/<name>_<page>.json (pages numbered from 1).
    for sd in subdir_list:

        # os.listdir() also returns plain files; only directories hold PDFs.
        if not (p / sd).is_dir():
            continue

        out_dir = Path('json') / sd
        # Creates 'json' as well when missing and tolerates existing folders.
        os.makedirs(out_dir.as_posix(), exist_ok=True)

        pdf_list = [path.as_posix() for path in (p / sd).glob("*.pdf")]

        for filename in pdf_list:

            print("processing pdf: {0}".format(filename))
            # The page images must be consumed inside this block: the
            # temporary directory is deleted as soon as the block exits.
            with tempfile.TemporaryDirectory() as path:
                images_from_path = convert_from_path(filename, output_folder=path)

                for j, image in enumerate(images_from_path, start=1):
                    temp = io.BytesIO()
                    image.save(temp, format="png")
                    sleep(0.1)

                    # After save() the stream position is at the end, so use
                    # getvalue() rather than read().
                    json_data = call_api_with_image(temp.getvalue())

                    # Derive the name from the file stem; replacing 'pdf' and
                    # splitting on '.' anywhere in the path corrupts names
                    # that contain "pdf" or extra dots.
                    page_name = "{0}_{1}.json".format(Path(filename).stem, j)
                    output_filename = (out_dir / page_name).as_posix()
                    print("output: {0}".format(output_filename))
                    with open(output_filename, 'w', encoding='utf-8') as output:
                        output.write(json_data)
	from pdf2image import convert_from_path, convert_from_bytes

	import tempfile,os,sys
	from pathlib import Path
	import io
	from time import sleep
	from google.protobuf.json_format import MessageToJson
	from google.cloud import vision
	from retry import retry

	# tries must be positive: the retry package counts down to zero, so a
	# negative value other than the documented -1 also means "retry forever".
	@retry(tries=3, delay=1)
	def call_api_with_image(content):
	    """Run Cloud Vision text detection on one image and return the JSON text.

	    The GOOGLE_APPLICATION_CREDENTIALS environment variable is expected to
	    be set beforehand, e.g.

	        $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"

	    Args:
	        content: Encoded image bytes, sent as the ``content`` field of the
	            request.

	    Returns:
	        The API response serialized to a JSON string by ``MessageToJson``.

	    Raises:
	        Exception: Any error raised while creating the client, calling the
	            API or serializing the response. It is logged and re-raised;
	            the ``retry`` decorator makes at most 3 attempts, 1 second
	            apart.
	    """
	    try:
	        client = vision.ImageAnnotatorClient()
	        response = client.text_detection({'content': content})
	        return MessageToJson(response)
	    except Exception as e:
	        print("exception in {0}, {1}".format("call_api", e))
	        # Bare raise keeps the original traceback for the retry decorator.
	        raise


	# Entries directly under ./pdf; each sub-directory is processed in turn.
	# NOTE: this runs at import time, so ./pdf must exist when the module loads.
	subdir_list = os.listdir('./pdf')

	p = Path('./pdf')

	if __name__ == '__main__':

	    # For every pdf/<subdir>/<name>.pdf, OCR each page and write the API
	    # response to json/<subdir>/<name>_<page>.json (pages numbered from 1).
	    for sd in subdir_list:

	        # os.listdir() also returns plain files; only directories hold PDFs.
	        if not (p / sd).is_dir():
	            continue

	        out_dir = Path('json') / sd
	        # Creates 'json' as well when missing and tolerates existing folders.
	        os.makedirs(out_dir.as_posix(), exist_ok=True)

	        pdf_list = [path.as_posix() for path in (p / sd).glob("*.pdf")]

	        for filename in pdf_list:

	            print("processing pdf: {0}".format(filename))
	            # The page images must be consumed inside this block: the
	            # temporary directory is deleted as soon as the block exits.
	            with tempfile.TemporaryDirectory() as path:
	                images_from_path = convert_from_path(filename, output_folder=path)

	                for j, image in enumerate(images_from_path, start=1):
	                    temp = io.BytesIO()
	                    image.save(temp, format="png")
	                    sleep(0.1)

	                    # After save() the stream position is at the end, so
	                    # use getvalue() rather than read().
	                    json_data = call_api_with_image(temp.getvalue())

	                    # Derive the name from the file stem; replacing 'pdf'
	                    # and splitting on '.' anywhere in the path corrupts
	                    # names that contain "pdf" or extra dots.
	                    page_name = "{0}_{1}.json".format(Path(filename).stem, j)
	                    output_filename = (out_dir / page_name).as_posix()
	                    print("output: {0}".format(output_filename))
	                    with open(output_filename, 'w', encoding='utf-8') as output:
	                        output.write(json_data)