Skip to content

Instantly share code, notes, and snippets.

@ResidentMario
Last active February 19, 2020 19:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ResidentMario/f934504ecb70a87879661c1c0170f533 to your computer and use it in GitHub Desktop.
Save ResidentMario/f934504ecb70a87879661c1c0170f533 to your computer and use it in GitHub Desktop.
import sys
import os
import pandas as pd
import requests
from tqdm import tqdm
import ratelim
from checkpoints import checkpoints
# Switch on the `checkpoints` integration at import time.
# NOTE(review): presumably this is what makes `Series.safe_map` and
# `checkpoints.results` (both used in `download`) available -- confirm
# against the `checkpoints` package documentation.
checkpoints.enable()
def download(categories):
    """Download the Open Images (2018_04) training images boxed with ``categories``.

    The function reads three metadata CSVs from Google Cloud Storage, keeps
    the bounding-box annotation rows whose label belongs to one of the
    requested categories, looks up each row's ``OriginalURL``, downloads it
    through ``_download_image`` and writes the response body to ``temp/``
    under the last path component of the URL.

    Parameters
    ----------
    categories : list of str
        Class names as they appear in the second column of
        ``class-descriptions-boxable.csv`` (read here as ``LabelName``).
        The lookup uses ``.loc``, so a name absent from that file is
        expected to raise ``KeyError``.

    Notes
    -----
    * The URL series has one entry per matching annotation row, so an image
      with several matching rows is requested once per row.
    * Downloads go through ``Series.safe_map`` from the ``checkpoints``
      package; presumably an exception raised mid-run leaves partial results
      in ``checkpoints.results`` so a later run can resume (TODO confirm).
    """
    # Download the metadata
    kwargs = {'header': None, 'names': ['LabelID', 'LabelName']}
    orig_url = "https://storage.googleapis.com/openimages/2018_04/class-descriptions-boxable.csv"
    class_names = pd.read_csv(orig_url, **kwargs)
    orig_url = "https://storage.googleapis.com/openimages/2018_04/train/train-annotations-bbox.csv"
    train_boxed = pd.read_csv(orig_url)
    orig_url = "https://storage.googleapis.com/openimages/2018_04/train/train-images-boxable-with-rotation.csv"
    image_ids = pd.read_csv(orig_url)

    # Get category IDs for the given categories and sub-select train_boxed
    # with them. In the annotation file the `LabelName` column is matched
    # against these IDs, not against the human-readable names.
    label_values = set(class_names.set_index('LabelName').loc[categories, 'LabelID'])
    relevant_training_images = train_boxed[train_boxed.LabelName.isin(label_values)]

    # One URL per matching annotation row, indexed by ImageID.
    relevant_flickr_urls = (relevant_training_images.set_index('ImageID')
                            .join(image_ids.set_index('ImageID'))
                            .loc[:, 'OriginalURL'])

    # Start from prior results if they exist, otherwise start from scratch:
    # the progress bar only counts what is still left to fetch.
    if checkpoints.results is None:
        remaining_todo = len(relevant_flickr_urls)
    else:
        remaining_todo = len(relevant_flickr_urls) - len(checkpoints.results)

    # Download the images. The context manager closes the bar on exit, so no
    # explicit close() is needed.
    with tqdm(total=remaining_todo) as progress_bar:
        relevant_image_requests = relevant_flickr_urls.safe_map(
            lambda url: _download_image(url, progress_bar)
        )

    # Write the images to files.
    os.makedirs("temp/", exist_ok=True)
    # `.items()` replaces `.iteritems()`, which pandas 2.0 removed.
    for (_, r), (_, url) in zip(relevant_image_requests.items(),
                                relevant_flickr_urls.items()):
        image_name = url.split("/")[-1]
        _write_image_file(r, image_name)
@ratelim.patient(5, 5)
def _download_image(url, pbar, timeout=30):
    """Download a single image from a URL and advance the progress bar.

    Calls are throttled by ``ratelim.patient(5, 5)`` -- presumably at most
    5 calls per 5-second window, waiting rather than failing when the limit
    is hit (confirm against the ``ratelim`` documentation).

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    pbar : object
        Progress bar exposing ``update(n)``; advanced by one on success only.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled server would block
        the whole download run indefinitely.

    Returns
    -------
    requests.Response
        The successful response; the image bytes are in ``.content``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    pbar.update(1)
    return r
def _write_image_file(r, image_name):
    """Save the body of response ``r`` as ``temp/<image_name>``.

    ``r`` only needs a ``content`` attribute holding bytes. The ``temp/``
    directory must already exist; an existing file of the same name is
    overwritten.
    """
    with open(f"temp/{image_name}", "wb") as out_file:
        out_file.write(r.content)
if __name__ == '__main__':
    # Every command-line argument after the script name is a category name.
    download(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment