Created
April 25, 2018 06:27
-
-
Save lttzzlll/8481172c9d048a57a949301b3aaf1da6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures
import os
import urllib.request

import requests
# Request headers sent with every fetch; the User-Agent makes the request
# look like it comes from a desktop browser.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    ),
}

# Pages to download.
URLS = [
    'http://www.jianshu.com/',
    'http://www.163.com/',
    'http://www.baidu.com/',
    'http://www.qq.com/',
    'http://www.google.com/',
]
# Retrieve a single page and return its contents
def load_url(url, timeout):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout passed through to ``requests.get``.

    Returns:
        The decoded page text when the server answers with status 200,
        otherwise an empty string (the failure is printed, not raised).
    """
    try:
        resp = requests.get(url, timeout=timeout, headers=headers)
    except requests.RequestException as exc:
        # Best-effort download: report the failure instead of silently
        # hiding it, and keep the "empty string on failure" contract.
        print(url, 'request failed:', exc)
        return ''

    if resp.status_code != 200:
        print(url, resp.status_code)
        return ''

    # Decode with the encoding guessed from the body rather than the one
    # taken from the response headers.
    resp.encoding = resp.apparent_encoding
    return resp.text
def save(path, text):
    """Write *text* to the file at *path* as UTF-8, replacing any old content.

    Missing parent directories are created first so that saving into a
    fresh output folder does not fail with ``FileNotFoundError``.

    Args:
        path: Destination file path.
        text: String to write.
    """
    parent = os.path.dirname(path)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
# Download every page in URLS concurrently and save each non-empty result
# as "<name>.html" in the output directory.
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            continue

        print('%r page is %d length' % (url, len(data)))
        if not data:
            # load_url returns '' when the fetch failed; do not leave a
            # misleading zero-byte file behind.
            continue

        # The file name is the second dot-separated part of the URL,
        # e.g. 'http://www.jianshu.com/' -> 'jianshu.html'.
        save(os.path.join(r'\\ccpsofsep\am_s1\users\v-taotli\LearnPython\tmp',
                          url.split('.')[1] + '.html'), data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
使用 requests 的时候记得要加上 headers 信息来模拟浏览器行为,虽然这样仍然可能会被封。