Created
September 20, 2017 13:46
-
-
Save anonymous/4a7f7d2c1efbbaaa515f0031cea9c4b2 to your computer and use it in GitHub Desktop.
FuckDemo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
import random | |
import pymongo | |
import requests | |
def get_user_agent():
    """Return one User-Agent string chosen at random from a fixed pool."""
    # Pool of User-Agent values to rotate through.
    candidates = ('GoogleBot', 'BaiduBot')
    picked = random.choice(candidates)
    return picked
def get_proxy():
    """Return the proxy mapping to use for a request.

    Placeholder implementation: the proxy could instead be looked up in a
    database, read from a file, or fetched from an API. Whatever the source,
    the function only has to return an HTTP proxy mapping shaped like the
    one built here (only the ``'http'`` key is populated).
    """
    proxy_map = dict(http='127.0.0.1')
    return proxy_map
def store_item(item):
    """Insert one crawled item into the ``demo`` collection of the ``demo`` database.

    Connects with ``pymongo.MongoClient()`` using its default connection
    settings.

    Args:
        item: dict describing one crawled page. The driver adds an ``_id``
            key to it in place when none is present.
    """
    client = pymongo.MongoClient()
    try:
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3.0 and removed in PyMongo 4.0.
        client['demo']['demo'].insert_one(item)
    finally:
        # Every call opens its own client; close it so connections and
        # monitor threads do not pile up across a crawl.
        client.close()
if __name__ == '__main__':
    # Pages to crawl: each one is fetched in turn until all are done.
    # range() rather than xrange() so the script runs on Python 2 and 3 alike.
    page_list = ['https://blog.fiht.me/page/%s' % i for i in range(3)]
    for url in page_list:
        # Pick a fresh User-Agent and proxy for every request.
        headers = {"User-Agent": get_user_agent()}
        # Without a timeout, requests waits indefinitely on a stalled server
        # or proxy, which would hang the whole crawl.
        response = requests.get(url=url, headers=headers,
                                proxies=get_proxy(), timeout=10)
        # Persist the URL reported by the response together with its body text.
        item = {"url": response.url, "text": response.text}
        store_item(item)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment