Skip to content

Instantly share code, notes, and snippets.

@makefile
Created July 21, 2021 04:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save makefile/797abe7f2ca46405a63b0c27299e4ec2 to your computer and use it in GitHub Desktop.
Save makefile/797abe7f2ca46405a63b0c27299e4ec2 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# requirements: requests
# conda/pip install requests
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests==1.2.3  # version for Python 2.6, but it has problems with SSL
import os
from contextlib import closing
import threading
import time
import sys
import requests
from requests.adapters import HTTPAdapter
# Older Python versions do not support SNI for SSL connections and fail with
# "hostname doesn't match"; see
# https://docs.python-requests.org/zh_CN/latest/community/faq.html
# Raise explicitly instead of using `assert`, which is stripped under `python -O`.
if sys.version_info < (2, 7, 9):
    raise RuntimeError('Python >= 2.7.9 is required (SNI support for HTTPS)')

# Fail with a usage message instead of a bare IndexError when arguments are missing.
if len(sys.argv) < 3:
    sys.exit('usage: %s <id_url_file> <out_dir> [thread_num]' % sys.argv[0])

# Session used for every request; both schemes retry a failed connection twice.
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=2))
s.mount('https://', HTTPAdapter(max_retries=2))

id_url_file = sys.argv[1]
out_dir = sys.argv[2]  # e.g. './images'

# Number of worker threads; optional third command-line argument.
thread_num = 20
if len(sys.argv) > 3:
    thread_num = int(sys.argv[3])

# Timeout in seconds passed to each request (connect and read).
timeout = 5
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}

# makedirs (unlike mkdir) also creates missing parent directories.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
def download(img_url, img_name):
    """Download ``img_url`` into ``out_dir`` as ``<img_name>.jpg``.

    Uses the module-level session ``s`` together with ``headers`` and
    ``timeout``. Nothing is downloaded when the target file already exists.
    Non-2xx responses and responses whose ``content-length`` header is
    missing or zero are reported on stdout and skipped.

    Args:
        img_url: URL of the image to fetch.
        img_name: Base name (without extension) of the file to write.

    Returns:
        None. Failures are reported with ``print``; errors raised by the
        request itself propagate to the caller.
    """
    target = os.path.join(out_dir, img_name + ".jpg")
    # Test the exact path that is written below. The earlier check omitted
    # the ".jpg" suffix, so already-downloaded images were never skipped.
    if os.path.isfile(target):
        return
    with closing(s.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
        rc = r.status_code
        if rc < 200 or rc > 299:
            print('returnCode%s\t%s\t%s' % (rc, img_name, img_url))
            return
        content_length = int(r.headers.get('content-length', '0'))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return
        try:
            with open(target, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
        except Exception:
            print('savefail\t%s' % img_url)
            # Drop the truncated file so a later run does not mistake it for
            # a finished download and skip it.
            try:
                os.remove(target)
            except OSError:
                pass
def get_imgurl_generate(path=None):
    """Yield ``[img_name, img_url]`` pairs read from a tab-separated file.

    Each line is expected to hold exactly two tab-separated fields. Blank
    lines and malformed lines are reported on stdout and skipped. A progress
    message is printed every 500 lines.

    Args:
        path: File to read. Defaults to the module-level ``id_url_file``.

    Yields:
        A two-element list ``[img_name, img_url]`` for every valid line.
    """
    if path is None:
        path = id_url_file
    with open(path, 'r') as f:
        for index, line in enumerate(f, 1):
            if index % 500 == 0:
                print('execute %s line at %s' % (index, time.time()))
            # Strip first: a raw line still carries its newline, so testing
            # it for emptiness before stripping can never succeed.
            line = line.strip()
            if not line:
                print('line %s is empty "\t"' % index)
                continue
            imgs = line.split('\t')
            if len(imgs) != 2:
                print('line %s splite error' % index)
                continue
            # Defensive: reject pairs with an empty name or URL.
            if not imgs[0] or not imgs[1]:
                print('line %s img is empty' % index)
                continue
            # No try/except around the yield: a bare except here would
            # swallow GeneratorExit and break closing the generator.
            yield imgs
# Serializes access to the shared generator across worker threads.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull (name, url) pairs from ``imgs`` and download each.

    Items are taken from the shared iterator while holding ``lock``. The
    worker stops once the iterator is exhausted. A failed download is
    reported on stdout and does not stop the worker; the worker pauses one
    second after every item.
    """
    current = threading.current_thread
    print('thread %s is running...' % current().name)

    exhausted = object()
    while True:
        with lock:
            pair = next(imgs, exhausted)
        if pair is exhausted:
            break
        img_name, img_url = pair

        try:
            download(img_url, img_name)
        except Exception as err:
            print('exceptfail\t%s\t%s' % (img_url, err))
        time.sleep(1)  # seconds

    print('thread %s is end...' % current().name)
# A single generator is shared by all workers so each input line is handled once.
img_gen = get_imgurl_generate()
for i in range(thread_num):
    t = threading.Thread(
        target=loop,
        args=(img_gen,),
        name='LoopThread %s' % i,
    )
    t.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment