Skip to content

Instantly share code, notes, and snippets.

@Franklin-Yao
Last active June 21, 2022 09:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Franklin-Yao/71d5988b6540b6b25da1e8c451cc7f5c to your computer and use it in GitHub Desktop.
Save Franklin-Yao/71d5988b6540b6b25da1e8c451cc7f5c to your computer and use it in GitHub Desktop.
yfcc100m downloader
# Prerequisite: pip install parlai
# Before running, download the YFCC100M hash file from https://drive.google.com/file/d/1u_u6MPx1i3AVuw0CkjyAC8udvcid3vX_/view?usp=sharing
from parlai.core.build_data import download_multiprocess
# Directory the downloaded JPEGs are written to.
image_path = '/mnt/data/dataset/yfcc'
# Local copy of the hash listing; each line is tab-separated and the photo
# hash is read from the second field.
hash_file_path = '/home/franklin/fewshot/dataset/yfcc-100m/yfcc100m_hash'
# Base URL under which the images are addressed by hash.
image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'


def read_hashes(lines):
    """Return the photo hash found in the second tab-separated field of each line.

    Args:
        lines: Any iterable of text lines (an open file works, so the listing
            is streamed rather than loaded with ``readlines()``).

    Returns:
        A list of non-empty hashes, in input order. Lines that are blank,
        have no second tab-separated field, or whose second field is empty
        are skipped instead of raising ``IndexError``.
    """
    hashes = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or malformed line (e.g. a trailing newline at EOF).
            continue
        p_hash = fields[1].strip()
        if p_hash:
            hashes.append(p_hash)
    return hashes


def build_image_url(p_hash):
    """Return the download URL for the image identified by ``p_hash``.

    The URL nests the file under two directories named after characters
    0-2 and 3-5 of the hash: ``<prefix>/<h[:3]>/<h[3:6]>/<h>.jpg``.
    """
    return f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"


def main():
    """Read the hash listing, derive one URL per hash, and download them all.

    Images are saved into ``image_path`` as ``<hash>.jpg``.
    """
    import os

    with open(hash_file_path, encoding='utf-8') as f:
        hashes = read_hashes(f)
    # Counts usable hashes (skipped lines are not included).
    print(f'number of images: {len(hashes)}')
    # For a quick smoke test, slice here, e.g. hashes = hashes[:1000].

    image_urls = [build_image_url(h) for h in hashes]

    # NOTE(review): nothing visible here shows download_multiprocess creating
    # the target directory, so make sure it exists before downloading.
    os.makedirs(image_path, exist_ok=True)

    print('[downloading images to {}]'.format(image_path))
    download_multiprocess(
        image_urls, image_path, dest_filenames=[f"{h}.jpg" for h in hashes]
    )


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment