@Yxnt
Last active April 18, 2018 06:15
Simple crawler base class
import json

import redis
import requests
from bs4 import BeautifulSoup
from requests import request

from config.config import Config

pool = redis.ConnectionPool(host=Config.redis_host, port=Config.redis_port, db=Config.redis_db,
                            password=Config.redis_pass)


class Spider(object):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    r = redis.Redis(connection_pool=pool)
    bs4 = BeautifulSoup
    cookie = None

    def __init__(self, login_url: str, domain: str, login: bool, loginparams: dict, loginmethod: str):
        self._login_url = login_url      # login endpoint URL
        self._islogin = login            # whether login is required
        self._domain = domain            # site domain
        self._loginmethod = loginmethod  # HTTP method of the login endpoint
        self._loginparams = loginparams  # login parameters
        self.headers['Referer'] = self._domain  # set the Referer header
        if self._islogin is True:
            status = self.login()
            if status is not True:
                raise Exception('Login Failed')

    def login(self):
        cookie = self.__checkcookie()
        if cookie is None:
            if self._loginmethod == 'GET':
                resp = request(self._loginmethod, self._login_url, params=self._loginparams, headers=self.headers)
            else:
                resp = request(self._loginmethod, self._login_url, json=self._loginparams, headers=self.headers)
            print(resp.json())  # debug: print the login response body
            if resp.status_code == 200:
                cookie = requests.utils.dict_from_cookiejar(resp.cookies)  # convert the cookiejar to a dict
                self.__savecookie(cookie)
            else:
                return False
        self.cookie = cookie
        return True

    def __checkcookie(self):
        # Return the cached cookie dict for this domain, or None if nothing is cached.
        result = self.r.get(self._domain)
        if not result:
            return None
        result = json.loads(result.decode('utf8'))
        return result

    def __savecookie(self, cookie):
        # Cache the cookie dict for 3 hours; nx=True keeps an existing entry untouched.
        self.r.set(self._domain, json.dumps(cookie), ex=10800, nx=True)
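
The class reads its Redis connection settings from config.config.Config, which is not included in the gist. A minimal sketch of what that module is assumed to provide (attribute names taken from the code above, values are placeholders):

    # Hypothetical config/config.py assumed by the gist; adjust the values to your Redis setup.
    class Config:
        redis_host = '127.0.0.1'
        redis_port = 6379
        redis_db = 0
        redis_pass = None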
Yxnt commented Apr 18, 2018:
from spiders import Spider
import requests


class test(Spider):
    def parser(self, url):
        # Fetch the page with the cached login cookie and parse it with BeautifulSoup.
        resp = requests.get(url, cookies=self.cookie, headers=self.headers)
        soup = self.bs4(resp.text, 'lxml')
        return soup


if __name__ == '__main__':
    login_params = {
        'username': 'abcd',
        'password': 'abcd'
    }
    t = test(login_url='http://test.com/login',
             login=True, domain='http://test.com/', loginmethod='POST', loginparams=login_params)
    t.parser('http://test.com/product/abc')
