Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
陸自イラク日報PDFのOCRスクリプト
from pdf2image import convert_from_path, convert_from_bytes
import tempfile,os,sys
from pathlib import Path
import io
from time import sleep
from google.protobuf.json_format import MessageToJson
from google.cloud import vision
from retry import retry
# A negative ``tries`` makes the retry package loop forever (its counter only
# stops at exactly zero), so a permanent failure such as bad credentials would
# hang the script. Use a bounded number of attempts instead.
@retry(tries=3, delay=1)
def call_api_with_image(content):
    """Run Google Cloud Vision text detection on one image and return JSON.

    The GOOGLE_APPLICATION_CREDENTIALS environment variable must already
    point at a service-account key file, e.g.::

        $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"

    Args:
        content: Raw bytes of the encoded image (the caller passes PNG data).

    Returns:
        The API response serialized to a JSON string by ``MessageToJson``.

    Raises:
        Exception: Any error raised while creating the client, calling the
            API or serializing the response. It is printed and re-raised so
            the ``retry`` decorator can try again (up to 3 attempts, 1 second
            apart) before the error reaches the caller.
    """
    try:
        client = vision.ImageAnnotatorClient()
        response = client.text_detection({
            'content': content,
        })
        # NOTE(review): MessageToJson expects a raw protobuf message; confirm
        # the installed google-cloud-vision release returns one.
        return MessageToJson(response)
    except Exception as e:
        print("exception in {0}, {1}".format("call_api", e))
        # Bare raise keeps the original traceback intact.
        raise
# Entries of ./pdf; each sub-directory is scanned for *.pdf files below.
subdir_list = os.listdir('./pdf')
p = Path('./pdf')


if __name__ == '__main__':
    # OCR results mirror the input layout: ./pdf/<subdir>/<name>.pdf becomes
    # ./json/<subdir>/<name>_<page>.json (pages are numbered from 1).
    json_root = Path('json')
    json_root.mkdir(exist_ok=True)

    for sd in subdir_list:
        pdf_dir = p / sd
        # Stray files directly under ./pdf cannot contain PDFs; skip them
        # instead of creating an empty output directory for each one.
        if not pdf_dir.is_dir():
            continue

        out_dir = json_root / sd
        out_dir.mkdir(exist_ok=True)

        # Glob inside the sub-directory so that special characters in its
        # name are not interpreted as a glob pattern.
        for pdf_path in pdf_dir.glob("*.pdf"):
            filename = pdf_path.as_posix()
            print("processing pdf: {0}".format(filename))

            # pdf2image writes the rendered pages into the temporary folder,
            # so every page must be processed before the folder is removed.
            with tempfile.TemporaryDirectory() as path:
                images_from_path = convert_from_path(filename, output_folder=path)
                for j, image in enumerate(images_from_path, start=1):
                    temp = io.BytesIO()
                    image.save(temp, format="png")
                    # Short pause between consecutive API requests.
                    sleep(0.1)
                    # After save() the stream position is at the end, so use
                    # getvalue() rather than read() to get the PNG bytes.
                    json_data = call_api_with_image(temp.getvalue())

                    # Build the output path from the file stem rather than
                    # with str.replace('pdf', 'json') / split('.'): those
                    # corrupt names that contain "pdf" or extra dots.
                    output_path = out_dir / "{0}_{1}.json".format(pdf_path.stem, j)
                    output_filename = output_path.as_posix()
                    print("output: {0}".format(output_filename))
                    with open(output_filename, 'w', encoding='utf-8') as output:
                        output.write(json_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment