Skip to content

Instantly share code, notes, and snippets.

@jianjieluo
Last active September 27, 2020 08:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jianjieluo/7b5f07992fe408a6201a1a90d25dc8ce to your computer and use it in GitHub Desktop.
Save jianjieluo/7b5f07992fe408a6201a1a90d25dc8ce to your computer and use it in GitHub Desktop.
Download images or GIFs from a list of URLs
# Standard-library and third-party imports used by the downloader script.
import os
import sys
import csv
import argparse
import numpy as np
import urllib.request
#from urllib.request import urlopen
import socket
# Process-wide default timeout (in seconds) applied to every socket created
# after this call, including the ones urllib opens for each download.
socket.setdefaulttimeout(10)
from urllib.parse import quote
import urllib.request, urllib.error
import requests
from tqdm import tqdm
# Root directory for downloaded files; main() joins it with the name of the
# URL list (its basename up to the first '.') to form the output directory.
OUTPUT_ROOT = 'gifs'
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Recognised options:
        --url    (str, default 'data/splits/val.txt')
        --begin  (int, default 0)
        --end    (int, default 80000)

    Returns:
        The ``argparse.Namespace`` holding ``url``, ``begin`` and ``end``.
    """
    cli = argparse.ArgumentParser(description='Arg parser')

    # (flag, default value, converter) for every supported option.
    option_table = (
        ('--url', 'data/splits/val.txt', str),
        ('--begin', 0, int),
        ('--end', 80000, int),
    )
    for flag, fallback, converter in option_table:
        cli.add_argument(flag, default=fallback, type=converter)

    return cli.parse_args()
def main():
    """Download the URLs listed in the ``--url`` file, one file per line.

    The list file is read line by line; line ``i`` (0-based) is downloaded
    when ``begin <= i <= end`` (both bounds inclusive).  Files are written to
    ``OUTPUT_ROOT/<split>/<split>_<i>.<ext>`` where ``<split>`` is the list
    file's basename up to its first ``.`` and ``<ext>`` is ``gif`` or
    ``gifv`` (taken from the URL when it ends in one of those, ``gif``
    otherwise).  Targets that already exist are skipped, so the script can be
    re-run to resume.

    Failures are reported on stdout and recorded in the current directory:
        bad_gid_HTTP_<begin>.txt  -- ``<gid>\\t<status code>`` per HTTP error
        bad_gid_URL_<begin>.txt   -- ``<gid>`` per URL error
    Any other exception is only printed.  A partially written file left by a
    failed or interrupted download is removed so that a later run retries it
    instead of treating it as already downloaded.
    """
    args = parse_args()
    print(args)

    # urlretrieve() goes through the globally installed opener, so installing
    # one here makes every download carry this User-Agent header.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
    urllib.request.install_opener(opener)

    with open(args.url) as fid:
        lines = [line.strip() for line in fid]

    split = os.path.basename(args.url).split('.')[0]
    output_dir = os.path.join(OUTPUT_ROOT, split)
    os.makedirs(output_dir, exist_ok=True)

    # Inclusive index window clipped to the lines that actually exist, so the
    # progress bar total matches the number of iterations performed.
    first = max(args.begin, 0)
    last = min(args.end, len(lines) - 1)
    total_len = max(last - first + 1, 0)

    http_log_path = 'bad_gid_HTTP_' + str(args.begin) + '.txt'
    url_log_path = 'bad_gid_URL_' + str(args.begin) + '.txt'

    # Context managers guarantee the failure logs are flushed and closed even
    # when the run is aborted part-way through.
    with open(http_log_path, 'w') as http_log, \
            open(url_log_path, 'w') as url_log, \
            tqdm(total=total_len, ascii=True) as pbar:
        for i in range(first, last + 1):
            pbar.update(1)
            gid = '%s_%d' % (split, i)
            gif_url = lines[i]

            url_ext = gif_url.split('.')[-1]
            ext = url_ext if url_ext in ('gifv', 'gif') else 'gif'
            out_path = os.path.join(output_dir, gid + '.' + ext)
            if os.path.exists(out_path):
                continue

            downloaded = False
            try:
                urllib.request.urlretrieve(quote(gif_url, safe=':/=&?'), out_path)
                downloaded = True
            except urllib.error.HTTPError as e:
                print('HTTPError: {}'.format(e.code))
                http_log.write(gid + '\t' + str(e.code) + '\n')
            except urllib.error.URLError as e:
                print('URLError: {}'.format(e.reason))
                url_log.write(gid + '\n')
            except Exception:
                # Catch-all for timeouts, malformed URLs, etc.  Deliberately
                # not a bare ``except`` so Ctrl-C / SystemExit still stop the
                # run.
                print('Can not download the ' + gid + ' gif')
            finally:
                if not downloaded:
                    # out_path did not exist before this attempt, so anything
                    # there now is a truncated download; drop it (best effort)
                    # so the exists-check above does not skip it next time.
                    try:
                        os.remove(out_path)
                    except OSError:
                        pass
    print('finish')
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment