多线程下载器
import re | |
from os import path | |
import requests | |
from threading import Lock | |
from six.moves.urllib.parse import unquote, urlparse | |
from concurrent.futures import ThreadPoolExecutor, wait | |
class Downloader(object):
    """Download a URL in parallel byte ranges using a thread pool.

    The file name is taken from the ``Content-Disposition`` response header
    (RFC 6266 / RFC 5987 syntax) when present, otherwise from the last path
    segment of the URL.  The target file is pre-allocated to its final size
    and each worker writes its own byte range into it.

    Args:
        workers_num: Number of worker threads, and the maximum number of
            byte ranges a download is split into.
    """

    # Size of the pieces streamed from the response body to disk.
    _CHUNK_SIZE = 64 * 1024

    # --- Content-Disposition grammar ------------------------------------
    # Raw strings are used throughout so that regex escapes such as ``\d``
    # and ``\s`` are not (mis)interpreted as Python string escapes.
    _TOKEN = r"[-!#-'*+.\dA-Z^-z|~]+"
    # Any character allowed unescaped inside a quoted-string: everything
    # printable except the double quote and the backslash.
    _QDTEXT = r'[]-~\t !#-[]'
    _MIME_CHARSET = r'[-!#-&+\dA-Z^-z]+'
    # Language tag (RFC 5646).  The region subtag is optional, so plain tags
    # such as ``en`` are accepted.
    _LANGUAGE = (
        r'(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}(?:-[A-Za-z]{3}){,2})?|[A-Za-z]{4,8})'
        r'(?:-[A-Za-z]{4})?'
        r'(?:-(?:[A-Za-z]{2}|\d{3}))?'
        r'(?:-(?:[\dA-Za-z]{5,8}|\d[\dA-Za-z]{3}))*'
        r'(?:-[\dA-WY-Za-wy-z](?:-[\dA-Za-z]{2,8})+)*'
        r'(?:-[Xx](?:-[\dA-Za-z]{1,8})+)?'
        r'|[Xx](?:-[\dA-Za-z]{1,8})+'
        r'|[Ee][Nn]-[Gg][Bb]-[Oo][Ee][Dd]'
        r'|[Ii]-[Aa][Mm][Ii]'
        r'|[Ii]-[Bb][Nn][Nn]'
        r'|[Ii]-[Dd][Ee][Ff][Aa][Uu][Ll][Tt]'
        r'|[Ii]-[Ee][Nn][Oo][Cc][Hh][Ii][Aa][Nn]'
        r'|[Ii]-[Hh][Aa][Kk]'
        r'|[Ii]-[Kk][Ll][Ii][Nn][Gg][Oo][Nn]'
        r'|[Ii]-[Ll][Uu][Xx]'
        r'|[Ii]-[Mm][Ii][Nn][Gg][Oo]'
        r'|[Ii]-[Nn][Aa][Vv][Aa][Jj][Oo]'
        r'|[Ii]-[Pp][Ww][Nn]'
        r'|[Ii]-[Tt][Aa][Oo]'
        r'|[Ii]-[Tt][Aa][Yy]'
        r'|[Ii]-[Tt][Ss][Uu]'
        r'|[Ss][Gg][Nn]-[Bb][Ee]-[Ff][Rr]'
        r'|[Ss][Gg][Nn]-[Bb][Ee]-[Nn][Ll]'
        r'|[Ss][Gg][Nn]-[Cc][Hh]-[Dd][Ee]'
    )
    # Percent-encoded value; hex digits are accepted in either case.
    _VALUE_CHARS = r'(?:%[\dA-Fa-f][\dA-Fa-f]|[-!#$&+.\dA-Z^-z|~])*'
    # One disposition parameter.  Capturing groups, in order:
    #   1: filename as a bare token      2: filename as a quoted string
    #   3: filename* charset             4: filename* percent-encoded value
    # Parameters other than filename / filename* are matched but not captured.
    _DISPOSITION_PARM = (
        r'[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]\s*=\s*'
        r'(?:({token})|"((?:{qdtext}|\\[\t !-~])*)")'
        r"|[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]\*\s*=\s*"
        r"({mime_charset})'(?:{language})?'({value_chars})"
        r'|{token}\s*=\s*(?:{token}|"(?:{qdtext}|\\[\t !-~])*")'
        r"|{token}\*\s*=\s*{mime_charset}'(?:{language})?'{value_chars}"
    ).format(
        token=_TOKEN,
        qdtext=_QDTEXT,
        mime_charset=_MIME_CHARSET,
        language=_LANGUAGE,
        value_chars=_VALUE_CHARS,
    )
    # Whole header.  The parameter pattern appears twice, so groups 1-4
    # belong to the first parameter and groups 5-8 to the last later one.
    _CONTENT_DISPOSITION_RE = re.compile(
        r'(?:{token}\s*;\s*)?(?:{parm})(?:\s*;\s*(?:{parm}))*|{token}'.format(
            token=_TOKEN, parm=_DISPOSITION_PARM
        )
    )
    # A backslash-escaped character inside a quoted string.
    _QUOTED_PAIR_RE = re.compile(r'\\(.)')

    def __init__(self, workers_num=8):
        self.session = requests.session()
        self.pool = ThreadPoolExecutor(max_workers=workers_num)
        self.workers_num = workers_num

    @staticmethod
    def _decode_ext_value(value, charset):
        """Percent-decode a ``filename*`` value using the declared charset."""
        try:
            try:
                return unquote(value, encoding=charset, errors='replace')
            except TypeError:
                # Python 2: unquote() has no ``encoding`` argument and
                # returns a byte string that still has to be decoded.
                return unquote(value).decode(charset, 'replace')
        except LookupError:
            # Charset unknown to Python: fall back to unquote()'s default.
            return unquote(value)

    @classmethod
    def _filename_from_disposition(cls, header):
        """Extract the file name from a ``Content-Disposition`` header value.

        Returns the (possibly empty) name, or ``None`` when the header
        carries no ``filename`` / ``filename*`` parameter or cannot be parsed.
        """
        m = cls._CONTENT_DISPOSITION_RE.match(header)
        if not m:
            return None
        # Precedence: the extended ``filename*`` form wins over the plain
        # form, and a later parameter wins over the first one.
        if m.group(8) is not None:
            return cls._decode_ext_value(m.group(8), m.group(7))
        if m.group(4) is not None:
            return cls._decode_ext_value(m.group(4), m.group(3))
        if m.group(6) is not None:
            return cls._QUOTED_PAIR_RE.sub(r'\1', m.group(6))
        if m.group(5) is not None:
            return m.group(5)
        if m.group(2) is not None:
            return cls._QUOTED_PAIR_RE.sub(r'\1', m.group(2))
        return m.group(1)

    @staticmethod
    def _split_ranges(file_size, parts):
        """Split ``file_size`` bytes into at most ``parts`` inclusive ranges.

        Returns a list of ``(start, end)`` tuples suitable for an HTTP
        ``Range: bytes=start-end`` header; the list is empty for an empty
        file.  The last range absorbs the remainder of the division.
        """
        if file_size <= 0:
            return []
        # Never create more ranges than there are bytes, otherwise the
        # per-range size would be zero and the ranges would be invalid.
        parts = max(1, min(parts, file_size))
        size = file_size // parts
        ranges = []
        for i in range(parts):
            start = i * size
            # Byte ranges are inclusive, so the last byte is file_size - 1.
            end = file_size - 1 if i == parts - 1 else start + size - 1
            ranges.append((start, end))
        return ranges

    def get_file_name(self, url):
        """Return the local file name to use for ``url``.

        Issues a HEAD request (following redirects) and prefers the name in
        the ``Content-Disposition`` header; falls back to the unquoted last
        path segment of ``url``.  Only the base name is kept, so a
        server-supplied name cannot point into another directory.
        """
        rsp = self.session.head(url, allow_redirects=True)
        header = rsp.headers.get('Content-Disposition')
        name = self._filename_from_disposition(header) if header else None
        if name:
            name = path.basename(name)
        if not name:
            name = path.basename(unquote(urlparse(url).path))
        return name

    def get_file_size(self, url):
        """Return the size in bytes reported for ``url`` by a HEAD request.

        Redirects are followed by the HTTP library, which resolves relative
        ``Location`` headers and bounds the number of hops.

        Raises:
            KeyError: The final response has no ``Content-Length`` header.
            requests.HTTPError: The final response is a 4xx/5xx error, whose
                length would otherwise be mistaken for the file size.
        """
        rsp = self.session.head(url, allow_redirects=True)
        rsp.raise_for_status()
        return int(rsp.headers["content-length"])

    def handler(self, url, file_name, start, end):
        """Download bytes ``start``..``end`` (inclusive) into ``file_name``.

        ``file_name`` must already exist; the data is written at offset
        ``start``.  The body is streamed to disk in chunks rather than being
        held in memory.
        """
        headers = {'Range': 'bytes={}-{}'.format(start, end)}
        rsp = self.session.get(url, headers=headers, stream=True)
        try:
            rsp.raise_for_status()
            if rsp.status_code != 206 and start != 0:
                # The server ignored the Range header and is sending the
                # whole file.  Writing that at a non-zero offset would
                # corrupt the output, so only the worker that owns offset 0
                # stores the full body; the others drop their response.
                return
            with open(file_name, 'rb+') as f:
                f.seek(start)
                for chunk in rsp.iter_content(chunk_size=self._CHUNK_SIZE):
                    f.write(chunk)
        finally:
            rsp.close()

    def run(self, url):
        """Download ``url`` into the current directory using all workers.

        Raises whatever exception the first failing worker raised, instead
        of silently leaving a partially written file behind.
        """
        file_name = self.get_file_name(url)
        file_size = self.get_file_size(url)
        # Pre-allocate the file so every worker can seek to its own offset.
        with open(file_name, 'wb') as f:
            f.truncate(file_size)
        futures = [
            self.pool.submit(self.handler, url, file_name, start, end)
            for start, end in self._split_ranges(file_size, self.workers_num)
        ]
        wait(futures)
        for future in futures:
            # result() re-raises any exception raised inside the worker.
            future.result()
if __name__ == "__main__":
    import time

    # Sample URLs downloaded one after another to time the downloader.
    sample_urls = (
        "http://demo.borland.com/testsite/downloads/downloadfile.php?file=Small.zip&cd=attachment+filename",
        "http://static-aliyun-doc.oss-cn-hangzhou.aliyuncs.com/download/pdf/DNHCS_MGW1842487_zh-CN_cn_181112170048_public_92408e650bbaaab8b146f371082a0ac3.pdf",
        "https://p.pstatp.com/origin/ff670000482866725305",
    )

    downloader = Downloader()
    started_at = time.time()
    for sample_url in sample_urls:
        downloader.run(sample_url)
    finished_at = time.time()
    # Total wall-clock seconds spent on all downloads.
    print(finished_at - started_at)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment