Skip to content

Instantly share code, notes, and snippets.

@jmgibson71
Created February 15, 2021 14:40
Show Gist options
  • Save jmgibson71/1c780548b664c099f08be083bd1e1bcd to your computer and use it in GitHub Desktop.
Save jmgibson71/1c780548b664c099f08be083bd1e1bcd to your computer and use it in GitHub Desktop.
Google Vision document finder
#!/usr/bin/python
from pathlib import Path
from datetime import datetime
from google.cloud import vision
# Directory that receives the log file written by label_maker().
LOG_PATH = Path("/home/me/logs")
# Root of the picture tree that get_file() searches recursively.
PATH_TO_IMAGES = Path("/data/Pictures/Takeout")
# Single Vision client, built when the module is loaded and shared by every request.
client = vision.ImageAnnotatorClient()
# Label descriptions that hint an image shows a document rather than a photo.
# Labels are lower-cased before they are compared, so every entry must be
# lower-case. Blank entries are deliberately absent: they could only match an
# empty label and would pad the denominator of the match ratio.
POTENTIAL_DOCUMENT = [
    'font',
    'material property',
    'parallel',
    'stationery',
    'recipe',
    'paper',
    'paper product',
    'letter',
    'document',
    'post-it note',
    'screenshot',
]
def is_paper(labels: list, keywords=None):
    """Decide whether a collection of image labels suggests a document.

    Args:
        labels: Label descriptions for one image. They are compared verbatim
            against the keywords, so pass them lower-cased.
        keywords: Optional keyword collection to match against. Defaults to
            the module-level ``POTENTIAL_DOCUMENT`` list.

    Returns:
        A ``(is_document, match_ratio)`` tuple. ``is_document`` is True when
        at least one label is a keyword. ``match_ratio`` is the fraction of
        distinct keywords found among the labels, or 0.0 when there are no
        usable keywords.
    """
    if keywords is None:
        keywords = POTENTIAL_DOCUMENT
    # Blank keywords carry no signal: dropping them stops an empty label from
    # counting as a hit and keeps them out of the ratio's denominator.
    known = {word for word in keywords if word}
    # Set intersection counts each keyword once, so repeated labels cannot
    # push the ratio above 1.
    matched = known.intersection(labels or ())
    match_per = len(matched) / len(known) if known else 0.0
    return bool(matched), match_per
def get_file(root=None, suffixes=(".jpg", ".png")):
    """Yield the image files found anywhere beneath a directory.

    Args:
        root: Directory to search recursively. Defaults to the module-level
            ``PATH_TO_IMAGES``.
        suffixes: File extensions (with the leading dot) to accept. Matching
            is case-insensitive.

    Yields:
        A ``Path`` for every regular file whose extension is in ``suffixes``.
    """
    if root is None:
        root = PATH_TO_IMAGES
    wanted = {suffix.lower() for suffix in suffixes}
    for p in Path(root).glob("**/*.*"):
        # The glob also returns directories (and dangling symlinks) whose
        # names happen to end in an image suffix; those cannot be opened and
        # read later, so only real files are passed on.
        if p.suffix.lower() in wanted and p.is_file():
            yield p
def label_maker():
    """Label every image from get_file() and log the likely documents.

    Each image is sent to the Vision label-detection endpoint through the
    module-level ``client``. Its lower-cased label descriptions are handed to
    ``is_paper``. Images judged to be documents are reported on stdout and
    written to a new log file under ``LOG_PATH`` as
    ``<path>: <match ratio> <labels>``.
    """
    # str(datetime.now()) contains a space and colons, which are awkward in
    # shells and rejected by some filesystems; use a filename-safe stamp.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # Opening the log fails if its directory is missing, so create it first.
    LOG_PATH.mkdir(parents=True, exist_ok=True)
    log = LOG_PATH / f"potential_documents_{stamp}.log"
    # Image paths may contain non-ASCII characters; do not rely on the locale.
    with log.open('w', encoding="utf-8") as logfh:
        for f in get_file():
            print(f"Examining: {f}")
            content = f.read_bytes()
            image = vision.Image(content=content)
            response = client.label_detection(image=image)
            # A failed annotation is reported in the response's error status
            # rather than raised; without this check it would look exactly
            # like an image that simply has no labels.
            if response.error.message:
                print(f"{f}: label detection failed: {response.error.message}")
                continue
            labels = [x.description.lower() for x in response.label_annotations]
            potential, percentage = is_paper(labels)
            if potential:
                print(f"{f} is probably a document.")
                logfh.write(f"{f}: {percentage} {labels}\n")
# Start the scan only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    label_maker()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment