# Franklin-Yao/download_yfcc100m.py

## download_yfcc100m.py
# pip install parlai
# download hash file from https://drive.google.com/file/d/1u_u6MPx1i3AVuw0CkjyAC8udvcid3vX_/view?usp=sharing
#
# Reads a tab-separated hash listing, builds one image URL per hash and hands
# the URL list to ParlAI's download_multiprocess.

# Destination path passed to download_multiprocess.
image_path = '/mnt/data/dataset/yfcc'
# Listing file; the second tab-separated column of each line is used as the hash.
hash_path = '/home/franklin/fewshot/dataset/yfcc-100m/yfcc100m_hash'
# URL prefix; each image URL is <prefix>/<hash[:3]>/<hash[3:6]>/<hash>.jpg.
image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'


def parse_hashes(lines):
    """Return the hash column of every usable line in *lines*.

    Each line is split on tabs and its second field, stripped of surrounding
    whitespace, is taken as the hash.

    Args:
        lines: Any iterable of strings (an open text file works, so the
            listing can be streamed instead of loaded in one piece).

    Returns:
        A list of hash strings in input order. Lines that have no second
        field, or whose second field is empty (blank lines, a trailing
        newline at end of file), are skipped rather than raising IndexError.
    """
    hashes = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or tab-less line: there is no hash column to read.
            continue
        p_hash = fields[1].strip()
        if p_hash:
            hashes.append(p_hash)
    return hashes


def build_image_url(p_hash):
    """Return the download URL for the image identified by *p_hash*.

    The first three and the next three characters of the hash form the two
    directory levels below ``image_prefix``.
    """
    return f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"


def main():
    """Read the hash listing and download every listed image."""
    # Imported here so the helpers above can be imported without ParlAI.
    from parlai.core.build_data import download_multiprocess

    with open(hash_path) as f:
        hashes = parse_hashes(f)
    print(f'number of images: {len(hashes)}')
    # For a quick trial run, truncate the list here, e.g. hashes = hashes[:1000]

    image_urls = [build_image_url(h) for h in hashes]

    print('[downloading images to {}]'.format(image_path))
    download_multiprocess(
        image_urls, image_path, dest_filenames=[f"{h}.jpg" for h in hashes]
    )


if __name__ == '__main__':
    main()
	# pip install parlai
	# download hash file from https://drive.google.com/file/d/1u_u6MPx1i3AVuw0CkjyAC8udvcid3vX_/view?usp=sharing
#
# Reads a tab-separated hash listing, builds one image URL per hash and hands
# the URL list to ParlAI's download_multiprocess.

# Destination path passed to download_multiprocess.
image_path = '/mnt/data/dataset/yfcc'
# Listing file; the second tab-separated column of each line is used as the hash.
hash_path = '/home/franklin/fewshot/dataset/yfcc-100m/yfcc100m_hash'
# URL prefix; each image URL is <prefix>/<hash[:3]>/<hash[3:6]>/<hash>.jpg.
image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'


def parse_hashes(lines):
    """Return the hash column of every usable line in *lines*.

    Each line is split on tabs and its second field, stripped of surrounding
    whitespace, is taken as the hash.

    Args:
        lines: Any iterable of strings (an open text file works, so the
            listing can be streamed instead of loaded in one piece).

    Returns:
        A list of hash strings in input order. Lines that have no second
        field, or whose second field is empty (blank lines, a trailing
        newline at end of file), are skipped rather than raising IndexError.
    """
    hashes = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or tab-less line: there is no hash column to read.
            continue
        p_hash = fields[1].strip()
        if p_hash:
            hashes.append(p_hash)
    return hashes


def build_image_url(p_hash):
    """Return the download URL for the image identified by *p_hash*.

    The first three and the next three characters of the hash form the two
    directory levels below ``image_prefix``.
    """
    return f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"


def main():
    """Read the hash listing and download every listed image."""
    # Imported here so the helpers above can be imported without ParlAI.
    from parlai.core.build_data import download_multiprocess

    with open(hash_path) as f:
        hashes = parse_hashes(f)
    print(f'number of images: {len(hashes)}')
    # For a quick trial run, truncate the list here, e.g. hashes = hashes[:1000]

    image_urls = [build_image_url(h) for h in hashes]

    print('[downloading images to {}]'.format(image_path))
    download_multiprocess(
        image_urls, image_path, dest_filenames=[f"{h}.jpg" for h in hashes]
    )


if __name__ == '__main__':
    main()