Skip to content

Instantly share code, notes, and snippets.

@g-leech

g-leech/gcp_ocr.py Secret

Created Jun 4, 2021
Embed
What would you like to do?
import os
#from google.cloud import vision
# Service-account key file; its path is exported through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
CRED = "/mnt/c/Users/techn/code/gcloud/My First Project-752b5f9e522b.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CRED

# Root directory that is scanned for notebook folders.
PATH = "/mnt/e/Takeout/Drive"

# Every directory path produced by walking PATH, skipping any path that
# contains "ipynb", then dropping the first remaining entry (os.walk yields
# the top directory first, so that is normally PATH itself).
l = [
    dirpath
    for dirpath, _dirnames, _filenames in os.walk(PATH)
    if "ipynb" not in dirpath
][1:]

# One complete os.walk() listing per remaining directory.
files_nested = [list(os.walk(directory)) for directory in l]

# NOTE(review): tree[0][0] is the directory path *string*, so tree[0][0][1]
# is just its second character and is truthy for any path of length >= 2.
# This filter therefore keeps everything; possibly tree[0][2] (the file
# names) was intended -- confirm before changing.
files_nested = [tree for tree in files_nested if tree[0][0][1]]
def process_files(fs):
    """OCR the images found directly inside each notebook directory.

    Args:
        fs: Iterable of ``list(os.walk(directory))`` results. Only the first
            ``(dirpath, dirnames, filenames)`` entry of each listing is used,
            i.e. the files sitting directly in that directory; files in
            nested sub-directories are not processed here.

    Returns:
        Dict mapping each directory's base name to a dict of
        ``{filename: value returned by detect_text}``.

    Side effects:
        Truncates and rewrites ``results.txt`` in the current working
        directory; the open file object is passed to ``detect_text`` for
        every image.
    """
    books = {}
    # Explicit UTF-8: OCR output can contain non-ASCII characters, and the
    # platform default encoding may not be able to represent them.
    with open("results.txt", "w", encoding="utf-8") as f:
        for walked in fs:
            top_entry = walked[0]      # the directory itself
            path = top_entry[0]        # dirpath
            images = top_entry[2]      # filenames directly inside dirpath
            # basename/join respect the platform's path separator instead of
            # assuming "/".
            notebook_name = os.path.basename(path)
            books[notebook_name] = {
                img: detect_text(os.path.join(path, img), f)
                for img in images
            }
    return books
def detect_text(path, f):
    """Run document OCR on one image file and record the recognised text.

    The image bytes are sent to Google Cloud Vision's
    ``document_text_detection`` with the ``en-t-i0-handwrit`` language hint.
    The recognised strings are printed and each one is written to ``f`` on
    its own line.

    Args:
        path: Filesystem path of the image to read.
        f: Writable text file object that receives one line per annotation.

    Returns:
        List of the ``description`` strings taken from the response's
        ``text_annotations`` (empty if there are none), so callers can keep
        the text for each image.
    """
    # Imported at call time rather than at module load.
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    hint = {"language_hints": ["en-t-i0-handwrit"]}
    response = client.document_text_detection(image=image, image_context=hint)

    if response.error.message:
        # Best effort: report which file the API flagged and carry on.
        print(path)

    descriptions = [text.description for text in response.text_annotations]
    print(descriptions)
    f.writelines("%s\n" % item for item in descriptions)
    return descriptions
# Script entry point: run OCR over every collected notebook directory.
# The dict returned by process_files is not kept here; process_files opens
# results.txt for the text output.
process_files(files_nested)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment