Created
May 4, 2018 08:48
-
-
Save lttzzlll/646d0b419724d9045faa75765b341fe8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures
import os
import urllib.request

import requests
# HTTP headers sent with every request: a browser-style User-Agent string.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
}

# Pages fetched concurrently by the thread pool at the bottom of the script.
URLS = [
    'http://www.jianshu.com/',
    'http://www.163.com/',
    'http://www.baidu.com/',
    'http://www.qq.com/',
    'http://www.google.com/',
    'http://www.xiaomi.com',
    'http://taobao.com',
    'http://tmall.com',
    'http://jd.com',
    'http://apple.com',
    'http://bing.com',
]
def load_url(url, timeout=30):
    """Fetch a single page and return its body as text.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        The response text when the status code is 200. For any other
        status code, or when the request or decoding fails, the problem is
        printed and an empty string is returned, so callers can always
        take ``len()`` of the result.
    """
    try:
        resp = requests.get(url, timeout=timeout, headers=headers)
        if resp.status_code == 200:
            # Decode with the encoding detected from the body itself.
            resp.encoding = resp.apparent_encoding
            return resp.text
        print(url, resp.status_code)
    except Exception as exc:
        # Best-effort fetch: one bad site must not abort the whole run, but
        # the failure is reported instead of being silently discarded.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        print(url, exc)
    return ''
def save(path, text):
    """Write ``text`` to the file at ``path`` encoded as UTF-8.

    The file is opened in ``'w'`` mode, so existing content is replaced.

    Args:
        path: Destination file path.
        text: String to write.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
# Run the downloads on a thread pool. Using the executor as a context
# manager ensures the worker threads are cleaned up when the block exits.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Start one load per URL and remember which URL each future belongs to.
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    # as_completed yields each future as soon as it finishes, so lines are
    # printed in completion order rather than in the order of URLS.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        print(url, len(future.result()))

    # Alternative 1: executor.map returns results in the same order as URLS.
    # for url, txt in zip(URLS, executor.map(load_url, URLS)):
    #     print(url, len(txt))

    # Alternative 2 (body for the as_completed loop): report per-URL
    # exceptions and save each page to disk.
    #     try:
    #         data = future.result()
    #     except Exception as exc:
    #         print('%r generated an exception: %s' % (url, exc))
    #     else:
    #         print('%r page is %d length' % (url, len(data)))
    #         save(os.path.join(r'\\ccpsofsep\am_s1\users\v-taotli\LearnPython\tmp',
    #                           url.split('.')[1] + '.html'), data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
如果修改成 `for url, txt in zip(URLS, executor.map(load_url, URLS)):` 的写法,
生成结果的顺序与输入 URL 的顺序相同。
所以 executor.submit 更具灵活性(配合 as_completed 可以按完成的先后顺序处理结果),executor.map 则更适合需要按输入顺序获取结果的情况。