-
-
Save jpf/9cdc33632d83b3ce7e0361a1d732a55b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import datetime | |
import os | |
import shelve | |
import requests | |
class CustomVisionResult:
    """Wrap one prediction payload returned by the Custom Vision service.

    ``payload`` must be a mapping with the keys ``id``, ``project``,
    ``iteration``, ``created`` and ``predictions``.  Every entry of
    ``predictions`` must provide ``tagName`` and ``probability``.

    The per-tag probabilities are collected in ``self.probabilities`` and
    can also be read as attributes, e.g. ``result.krazy``.

    Raises:
        KeyError: if ``payload`` lacks one of the keys listed above.
    """

    def __init__(self, payload):
        self._id = payload['id']
        self._project = payload['project']
        self._iteration = payload['iteration']
        self._created = payload['created']
        self.probabilities = {
            prediction['tagName']: prediction['probability']
            for prediction in payload['predictions']
        }

    def __getattr__(self, key):
        """Return the probability recorded for the tag named ``key``.

        Python only calls this hook after normal attribute lookup failed.

        Raises:
            AttributeError: if no prediction with that tag name exists, so
                that ``hasattr`` and ``getattr(obj, name, default)`` behave
                as they do for ordinary objects.
        """
        # Go through __dict__ rather than ``self.probabilities``: on an
        # instance whose __init__ has not run, the latter would re-enter
        # __getattr__ and recurse without end.
        probabilities = self.__dict__.get('probabilities', {})
        if key in probabilities:
            return probabilities[key]
        raise AttributeError(
            '{!r} object has no attribute or tag {!r}'.format(type(self).__name__, key)
        )
class CustomVision:
    """Small client for the Custom Vision classification endpoints.

    Args:
        prediction_key: value sent in the ``Prediction-Key`` request header.
        prediction_endpoint: URL prefix of the prediction API; it is
            concatenated directly with ``project_id``, so it must end in
            a slash.
        project_id: project identifier appended to the endpoint.
        iteration: iteration name inserted into the request URLs.
        cache: optional mutable mapping (a dict or an open shelf) keyed by
            image URL that stores raw prediction payloads.  A private dict
            is created when omitted.
    """

    def __init__(self, prediction_key=None, prediction_endpoint=None, project_id=None, iteration=None, cache=None):
        self.base_url = prediction_endpoint + project_id
        self.iteration = iteration
        # Keep the mapping on the instance: a ``{}`` default would be shared
        # by every instance, and relying on a module-level ``cache`` global
        # only works when the caller happens to define one.
        self.cache = {} if cache is None else cache
        self.session = requests.Session()
        self.session.headers.update({'Prediction-Key': prediction_key})

    def classify(self, thing, is_image=False):
        """Classify ``thing`` and return a ``CustomVisionResult``.

        Args:
            thing: an image URL (the default), or the image content itself
                when ``is_image`` is true.
            is_image: upload ``thing`` to the ``/image`` endpoint instead of
                submitting it as a URL.  Uploads are never cached.

        URL results are looked up in, and stored into, ``self.cache``.
        """
        if is_image:
            url = '{}/classify/iterations/{}/image'.format(self.base_url, self.iteration)
            payload = self.session.post(url, files={'file': thing}).json()
            return CustomVisionResult(payload)

        if thing in self.cache:
            return CustomVisionResult(self.cache[thing])

        url = '{}/classify/iterations/{}/url'.format(self.base_url, self.iteration)
        print('Classifying: {}'.format(thing))
        payload = self.session.post(url, json={'Url': thing}).json()
        # Build the result before caching, so a payload that cannot be
        # parsed (for example an error response) is never stored and
        # replayed on later runs.
        result = CustomVisionResult(payload)
        self.cache[thing] = payload
        return result
# Directory that receives the downloaded page thumbnails.
thumbnail_dir = 'thumbnails'
try:
    os.mkdir(thumbnail_dir)
except OSError:
    # An already existing directory is fine; anything else (a plain file in
    # the way, missing permissions) is a real error and must not be hidden.
    if not os.path.isdir(thumbnail_dir):
        raise

# Plain-HTTP requests made through ``session`` are routed via localhost:3030.
# NOTE(review): presumably a local caching proxy - confirm it is running
# before starting the script.
proxies = {'http': 'http://localhost:3030'}
session = requests.Session()
session.proxies = proxies

# JSON description of the newspaper title whose issues are crawled.
start_url = 'http://chroniclingamerica.loc.gov/lccn/sn84026749.json'
def loc_url_to_thumbnail(url):
    """Return the thumbnail image URL for a page's ``.json`` URL.

    Every ``.json`` occurrence is removed from ``url`` and the fixed
    92x120 image path is appended.
    """
    page_url = url.replace('.json', '')
    return '/'.join([page_url, 'image_92x120_from_0,0_to_5698,6998.jpg'])
def loc_url_to_filename(url):
    """Derive a flat ``.jpeg`` file name from a page's ``.json`` URL.

    The ``.json`` marker and the URL scheme are removed, and every slash
    becomes a dash.  Both ``http://`` and ``https://`` are stripped so the
    result never contains a colon, which is not a valid file-name character
    on every platform.
    """
    stem = url.replace('.json', '')
    for scheme in ('https://', 'http://'):
        stem = stem.replace(scheme, '')
    return stem.replace('/', '-') + '.jpeg'
def save_as_filename(url, filename):
    """Download ``url`` into ``filename`` unless that file already exists.

    The request goes through the module-level ``session``.  Nothing is
    written when the server answers with an error status: the error body
    would otherwise be stored under an image name and, because existing
    files are skipped, never be replaced by the real image.
    """
    if os.path.isfile(filename):
        return
    print('Saving: {}'.format(filename))
    r = session.get(url)
    if not r.ok:
        print('Skipping {}: HTTP status {}'.format(url, r.status_code))
        return
    with open(filename, 'wb') as fh:
        fh.write(r.content)
# Iteration name passed to the CustomVision client and used in the cache name.
iteration = 'Iteration6'

# TODO: Move this to CustomVision class
# One shelf file per iteration; writeback=True keeps accessed entries in
# memory until the shelf is synced or closed.
cache_file = 'customvision-%s.cache' % iteration
cache = shelve.open(cache_file, writeback=True)

# Arguments in order: prediction key, prediction endpoint, project id,
# iteration, cache.
cs = CustomVision(
    '<YOUR PREDICTION KEY HERE>',
    'https://westus2.api.cognitive.microsoft.com/customvision/v3.0/Prediction/',
    '<YOUR PROJECT ID HERE>',
    iteration,
    cache,
)
# Walk every Sunday issue published on or after start_date, classify each
# page thumbnail and keep the ones that score at least 0.7 for the target tag.
start_date = '1916-04-23'
try:
    newspaper = session.get(start_url).json()
    for issue in newspaper['issues']:
        # Dates are ISO formatted (YYYY-MM-DD), so comparing the strings
        # orders them chronologically.
        if issue['date_issued'] < start_date:
            continue
        issue_date = datetime.datetime.strptime(issue['date_issued'], '%Y-%m-%d')
        # weekday() does not depend on the locale (strftime('%A') does);
        # Monday is 0, so Sunday is 6.
        if issue_date.weekday() != 6:
            continue
        pages = session.get(issue['url']).json()['pages']
        for page in pages:
            url = page['url']
            filename = os.path.join(thumbnail_dir, loc_url_to_filename(url))
            thumbnail = loc_url_to_thumbnail(url)
            try:
                prediction = cs.classify(thumbnail)
            except Exception as e:
                # Best effort: report the page that failed and carry on.
                print('Classification failed: {}'.format(thumbnail))
                print(str(e))
                continue
            # Note: This assumes that the tag you're looking for is named "krazy"
            # A result without that tag counts as probability 0 and is skipped.
            if prediction.probabilities.get('krazy', 0) < 0.7:
                continue
            save_as_filename(thumbnail, filename)
finally:
    # Persist the predictions gathered so far even when the crawl aborts;
    # close() writes pending entries back before releasing the shelf.
    cache.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment