Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
多线程下载器
import re
from os import path
import requests
from threading import Lock
from six.moves.urllib.parse import unquote, urlparse
from concurrent.futures import ThreadPoolExecutor, wait
class Downloader(object):
    """Download a file over HTTP in parallel byte-range chunks.

    The target file is pre-allocated on disk and every worker thread fetches
    one contiguous byte range and writes it at the matching offset.

    Args:
        workers_num: Number of worker threads, which is also the maximum
            number of byte ranges a download is split into.
    """

    # Size of the pieces streamed from the socket to disk by each worker.
    _CHUNK_SIZE = 64 * 1024

    # --- Content-Disposition grammar (RFC 6266 / RFC 5987) -----------------
    _TOKEN = r"[-!#-'*+.\dA-Z^-z|~]+"
    _QDTEXT = r"[]-~\t !#-[]"
    _QUOTED_PAIR = r"\\[\t !-~]"
    _QUOTED = "(?:" + _QDTEXT + "|" + _QUOTED_PAIR + ")*"
    _MIME_CHARSET = r"[-!#-&+\dA-Z^-z]+"
    # RFC 5646 language tag.  The region subtag is optional; requiring it
    # would reject common tags such as plain "en".
    _LANGUAGE = (
        r"(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}(?:-[A-Za-z]{3}){0,2})?|[A-Za-z]{4,8})"
        r"(?:-[A-Za-z]{4})?"
        r"(?:-(?:[A-Za-z]{2}|\d{3}))?"
        r"(?:-(?:[\dA-Za-z]{5,8}|\d[\dA-Za-z]{3}))*"
        r"(?:-[\dA-WY-Za-wy-z](?:-[\dA-Za-z]{2,8})+)*"
        r"(?:-[Xx](?:-[\dA-Za-z]{1,8})+)?"
        r"|[Xx](?:-[\dA-Za-z]{1,8})+"
        r"|[Ee][Nn]-[Gg][Bb]-[Oo][Ee][Dd]"
        r"|[Ii]-[Aa][Mm][Ii]"
        r"|[Ii]-[Bb][Nn][Nn]"
        r"|[Ii]-[Dd][Ee][Ff][Aa][Uu][Ll][Tt]"
        r"|[Ii]-[Ee][Nn][Oo][Cc][Hh][Ii][Aa][Nn]"
        r"|[Ii]-[Hh][Aa][Kk]"
        r"|[Ii]-[Kk][Ll][Ii][Nn][Gg][Oo][Nn]"
        r"|[Ii]-[Ll][Uu][Xx]"
        r"|[Ii]-[Mm][Ii][Nn][Gg][Oo]"
        r"|[Ii]-[Nn][Aa][Vv][Aa][Jj][Oo]"
        r"|[Ii]-[Pp][Ww][Nn]"
        r"|[Ii]-[Tt][Aa][Oo]"
        r"|[Ii]-[Tt][Aa][Yy]"
        r"|[Ii]-[Tt][Ss][Uu]"
        r"|[Ss][Gg][Nn]-[Bb][Ee]-[Ff][Rr]"
        r"|[Ss][Gg][Nn]-[Bb][Ee]-[Nn][Ll]"
        r"|[Ss][Gg][Nn]-[Cc][Hh]-[Dd][Ee]"
    )
    # Percent-encoded octets may use upper- or lower-case hex digits.
    _VALUE_CHARS = r"(?:%[\dA-Fa-f]{2}|[-!#$&+.\dA-Z^-z|~])*"
    _FILENAME = "[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]"

    # One disposition parameter.  Capturing groups, in order:
    #   1: filename as a bare token      2: filename as a quoted-string body
    #   3: charset of filename*          4: percent-encoded value of filename*
    # The last two alternatives consume unrelated parameters without capture.
    _DISPOSITION_PARM = (
        r'{filename}\s*=\s*(?:({token})|"({quoted})")'
        r"|{filename}\*\s*=\s*({charset})'(?:{language})?'({value})"
        r'|{token}\s*=\s*(?:{token}|"{quoted}")'
        r"|{token}\*\s*=\s*{charset}'(?:{language})?'{value}"
    ).format(
        filename=_FILENAME,
        token=_TOKEN,
        quoted=_QUOTED,
        charset=_MIME_CHARSET,
        language=_LANGUAGE,
        value=_VALUE_CHARS,
    )

    # The parameter pattern appears twice, so groups 1-4 belong to the first
    # parameter and groups 5-8 to the subsequent (repeated) parameters.
    _DISPOSITION_RE = re.compile(
        r"(?:{token}\s*;\s*)?(?:{parm})(?:\s*;\s*(?:{parm}))*|{token}".format(
            token=_TOKEN, parm=_DISPOSITION_PARM
        )
    )
    _QUOTED_PAIR_RE = re.compile(r"\\(.)")

    def __init__(self, workers_num=8):
        self.session = requests.session()
        self.pool = ThreadPoolExecutor(max_workers=workers_num)
        self.workers_num = workers_num

    @staticmethod
    def _decode_ext_value(value, charset):
        """Decode an RFC 5987 percent-encoded value using ``charset``.

        Returns None when the charset is unknown or the bytes cannot be
        decoded, so the caller can fall back to another file name source.
        """
        try:
            if isinstance(value, bytes):
                # Python 2: the native str is bytes and unquote() takes no
                # ``encoding`` argument, so decode after unquoting.
                return unquote(value).decode(charset)
            return unquote(value, encoding=charset)
        except (LookupError, UnicodeError):
            return None

    @classmethod
    def _filename_from_disposition(cls, header):
        """Extract the file name from a Content-Disposition header value.

        ``filename*`` (extended notation) takes precedence over ``filename``,
        and later parameters take precedence over the first one.

        Returns:
            The file name as given by the server (not yet reduced to a base
            name), or None when the header carries no usable file name.
        """
        m = cls._DISPOSITION_RE.match(header)
        if not m:
            return None
        if m.group(8) is not None:
            return cls._decode_ext_value(m.group(8), m.group(7))
        if m.group(4) is not None:
            return cls._decode_ext_value(m.group(4), m.group(3))
        if m.group(6) is not None:
            # Raw replacement string: a plain '\1' would be the control
            # character chr(1), not a back-reference.
            return cls._QUOTED_PAIR_RE.sub(r"\1", m.group(6))
        if m.group(5) is not None:
            return m.group(5)
        if m.group(2) is not None:
            return cls._QUOTED_PAIR_RE.sub(r"\1", m.group(2))
        return m.group(1)

    @staticmethod
    def _split_ranges(file_size, parts):
        """Split ``file_size`` bytes into at most ``parts`` inclusive ranges.

        Returns:
            A list of ``(start, end)`` tuples with inclusive ends, as used by
            the HTTP ``Range`` header.  The last range absorbs the remainder.
            The list is empty for an empty file, and never contains more
            ranges than there are bytes.
        """
        if file_size <= 0:
            return []
        parts = max(1, min(parts, file_size))
        part_size = file_size // parts
        ranges = []
        for index in range(parts):
            start = index * part_size
            if index == parts - 1:
                end = file_size - 1
            else:
                end = start + part_size - 1
            ranges.append((start, end))
        return ranges

    def get_file_name(self, url):
        """Return the local file name to use for ``url``.

        The name is taken from the Content-Disposition header of a HEAD
        response when present, otherwise from the last segment of the URL
        path.  Only the base name is kept, so a server-supplied name cannot
        point outside the current directory.
        """
        rsp = self.session.head(url, allow_redirects=True)
        name = None
        header = rsp.headers.get('Content-Disposition')
        if header is not None:
            name = self._filename_from_disposition(header)
        if name:
            name = path.basename(name)
        if not name:
            name = path.basename(unquote(urlparse(url).path))
        return name

    def get_file_size(self, url):
        """Return the size in bytes reported for ``url`` by a HEAD request.

        Redirects are followed by the session, which resolves relative
        ``Location`` values and bounds the number of hops.

        Raises:
            requests.HTTPError: If the final response has an error status.
            KeyError: If the response has no Content-Length header.
        """
        rsp = self.session.head(url, allow_redirects=True)
        rsp.raise_for_status()
        return int(rsp.headers["content-length"])

    def handler(self, url, file_name, start, end):
        """Download bytes ``start``..``end`` (inclusive) into ``file_name``.

        The file must already exist; the data is written at offset ``start``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        headers = {'Range': 'bytes={}-{}'.format(start, end)}
        rsp = self.session.get(url, headers=headers, stream=True)
        try:
            rsp.raise_for_status()
            with open(file_name, 'rb+') as f:
                f.seek(start)
                if rsp.status_code == 206:
                    for chunk in rsp.iter_content(chunk_size=self._CHUNK_SIZE):
                        f.write(chunk)
                else:
                    # The server ignored the Range header and sent the whole
                    # entity; keep only this worker's slice so the other
                    # parts of the file are not overwritten.
                    f.write(rsp.content[start:end + 1])
        finally:
            rsp.close()

    def run(self, url):
        """Download ``url`` into the current directory using all workers.

        Raises:
            Exception: Whatever a worker raised (for example
                requests.HTTPError) is re-raised once all workers finished,
                instead of leaving a silently incomplete file behind.
        """
        file_name = self.get_file_name(url)
        file_size = self.get_file_size(url)
        # Pre-allocate the file so every worker can seek to its own offset.
        with open(file_name, 'wb+') as f:
            f.truncate(file_size)
        futures = [
            self.pool.submit(self.handler, url, file_name, start, end)
            for start, end in self._split_ranges(file_size, self.workers_num)
        ]
        wait(futures)
        for future in futures:
            # result() re-raises any exception raised inside the worker.
            future.result()
if __name__ == "__main__":
    import time

    # Demo run: fetch a few sample files one after another and print the
    # total elapsed wall-clock time in seconds.
    sample_urls = (
        "http://demo.borland.com/testsite/downloads/downloadfile.php?file=Small.zip&cd=attachment+filename",
        "http://static-aliyun-doc.oss-cn-hangzhou.aliyuncs.com/download/pdf/DNHCS_MGW1842487_zh-CN_cn_181112170048_public_92408e650bbaaab8b146f371082a0ac3.pdf",
        "https://p.pstatp.com/origin/ff670000482866725305",
    )
    downloader = Downloader()
    started_at = time.time()
    for sample_url in sample_urls:
        downloader.run(sample_url)
    finished_at = time.time()
    print(finished_at - started_at)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.