Created
April 22, 2018 11:50
-
-
Save atuyosi/c9d14478d2ac569e9ed17ac151c14e71 to your computer and use it in GitHub Desktop.
陸自イラク日報PDFのOCRスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pdf2image import convert_from_path, convert_from_bytes | |
import tempfile,os,sys | |
from pathlib import Path | |
import io | |
from time import sleep | |
from google.protobuf.json_format import MessageToJson | |
from google.cloud import vision | |
from retry import retry | |
# The original passed tries=-3. The retry package only gives up when its
# countdown reaches exactly zero, so a negative count retries forever; a
# positive count bounds the number of attempts.
@retry(tries=3, delay=1)
def call_api_with_image(content):
    """Run Cloud Vision text detection on one image and return the result as JSON.

    Assumes the GOOGLE_APPLICATION_CREDENTIALS environment variable has been
    set beforehand, e.g.::

        $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"

    Args:
        content: Image data, sent as the ``content`` field of the request
            (the caller in this file passes PNG bytes).

    Returns:
        The API response serialized to a JSON string by ``MessageToJson``.

    Raises:
        Exception: Whatever the client or serializer raised. The error is
            printed on every failed attempt and propagates to the caller
            once the ``retry`` decorator has used up its 3 attempts
            (1 second apart).
    """
    try:
        client = vision.ImageAnnotatorClient()
        response = client.text_detection({'content': content})
        return MessageToJson(response)
    except Exception as e:
        print("exception in {0}, {1}".format("call_api", e))
        # Bare raise re-raises the active exception for the retry decorator
        # without adding this frame's re-raise to the traceback.
        raise
# Input tree: ./pdf/<subdir>/*.pdf ; output tree: json/<subdir>/<stem>_<page>.json
PDF_ROOT = Path('./pdf')
JSON_ROOT = Path('json')

# Pause (seconds) before each API request.
REQUEST_INTERVAL_SEC = 0.1


def json_output_path(json_dir, pdf_path, page_number):
    """Return the JSON output path for one page of a PDF.

    The name is built from the PDF's stem (file name without its final
    suffix), so dots or the substring "pdf" elsewhere in the path cannot
    corrupt the result.

    Args:
        json_dir: Directory the JSON file belongs in (str or Path).
        pdf_path: Path of the source PDF (str or Path).
        page_number: Page number used as the file-name suffix.

    Returns:
        ``Path`` of the form ``<json_dir>/<pdf stem>_<page_number>.json``.
    """
    file_name = "{0}_{1}.json".format(Path(pdf_path).stem, page_number)
    return Path(json_dir) / file_name


def ocr_pdf(pdf_path, json_dir):
    """Render every page of one PDF, OCR it, and write one JSON file per page.

    Args:
        pdf_path: ``Path`` of the PDF to process.
        json_dir: Existing directory that receives the per-page JSON files.
    """
    print("processing pdf: {0}".format(pdf_path.as_posix()))
    with tempfile.TemporaryDirectory() as work_dir:
        pages = convert_from_path(pdf_path.as_posix(), output_folder=work_dir)
        # Pages are numbered from 1 in the output file names.
        for page_number, image in enumerate(pages, start=1):
            with io.BytesIO() as buffer:
                image.save(buffer, format="png")
                # After save() the stream position is at the end, so take
                # the whole buffer with getvalue() instead of read().
                png_bytes = buffer.getvalue()

            sleep(REQUEST_INTERVAL_SEC)
            json_data = call_api_with_image(png_bytes)

            out_path = json_output_path(json_dir, pdf_path, page_number)
            print("output: {0}".format(out_path.as_posix()))
            with open(out_path.as_posix(), 'w', encoding='utf-8') as output:
                output.write(json_data)


def main():
    """OCR every PDF under ``./pdf/<subdir>/`` into ``json/<subdir>/``.

    Raises:
        FileNotFoundError: If the ``./pdf`` directory does not exist.
    """
    JSON_ROOT.mkdir(exist_ok=True)

    # Only real directories are categories; stray files in ./pdf are skipped.
    subdirs = sorted(entry for entry in PDF_ROOT.iterdir() if entry.is_dir())
    for subdir in subdirs:
        json_dir = JSON_ROOT / subdir.name
        json_dir.mkdir(exist_ok=True)
        # Glob inside the subdirectory so its name is never treated as a pattern.
        for pdf_path in sorted(subdir.glob("*.pdf")):
            ocr_pdf(pdf_path, json_dir)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment