A Python scraper for a dozen-plus P2P lending sites
This scraper collects the product name, term, interest rate, and bonus-rate information for P2P products from well-known Chinese lending platforms such as Xiaoying (小赢理财), iQianjin (爱钱进), 9fpuhui (玖富普惠), and Jimu Box (积木盒子).
The approach: fetch pages with requests, then locate the target fields with bs4, lxml (XPath), or pyquery (CSS) selectors, or read the data directly from the site's JSON endpoints; stage the rows in a numpy array; finally append everything to a CSV with pandas for further analysis.
The page URLs were captured with Chrome DevTools; the app endpoints were captured with the Stream packet-capture app.
This scraper is for technical exchange only; please do not use it commercially.
If you have any questions, feel free to leave me a message.
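
All of the site functions below share one skeleton: fetch a page (or JSON endpoint) with a random User-Agent, pull the fields out with a selector, stack the rows in a numpy array, and append them to p2p.csv with pandas. Here is a minimal sketch of that skeleton; the URL, XPath expressions, and column names are placeholders, not any real site's:

import requests
import numpy as np
import pandas as pd
from lxml import etree
from fake_useragent import UserAgent

def get_example_site():
    headers = {'User-Agent': UserAgent().random}  # random UA so the request looks like a browser
    html = requests.get('https://example.com/products', headers=headers).text  # placeholder URL
    tree = etree.HTML(html)
    rows = np.array(['ExampleSite', 'timestamp', ''])  # banner row naming the site
    rows = np.row_stack((rows, ['name', 'term', 'rate']))  # header row
    for item in tree.xpath('//div[@class="product"]'):  # placeholder XPath
        name = ','.join(item.xpath('./h3/text()'))  # xpath() returns a list; join it into a str
        term = ','.join(item.xpath('./span[@class="term"]/text()'))
        rate = ','.join(item.xpath('./span[@class="rate"]/text()'))
        rows = np.row_stack((rows, [name, term, rate]))
    pd.DataFrame(rows).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append a snapshot to the shared CSV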
# -*- coding: utf-8 -*-  # declare the file encoding so the Chinese text is handled correctly
import requests  # fetch web pages
from lxml import etree  # parse pages and locate fields with XPath
import json  # decode JSON responses
from bs4 import BeautifulSoup  # parse pages with soup selectors
from fake_useragent import UserAgent  # random UA strings to dodge anti-scraping checks
import time  # sleep between requests so the server does not block us
import random  # randomize the sleep intervals
import pandas as pd  # compute and persist the results
from pyquery import PyQuery  # locate fields with CSS (jQuery-style) selectors
import numpy as np  # stage and grow the rows in an array
import datetime  # timestamp the saved rows

nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current system date + time
nowdate = datetime.datetime.now().date()  # current system date
nowTime_hms = datetime.datetime.now().strftime('%H:%M:%S')  # current system time (H:M:S)
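
# Note: every get_* function below appends to the same p2p.csv in three kinds of rows:
# a banner row [site name, timestamp, ...], a header row naming the columns, and one
# data row per product. With mode='a', repeated runs accumulate dated snapshots.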
# Fetch product data from Xiaoying (小赢理财)
def get_xiaoying():
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'referer': 'https://www.yingzt.com/invest/list',
        'x-requested-with': 'XMLHttpRequest'
    }  # pose as a browser
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    # product list, page 1
    url_1 = 'https://www.yingzt.com/invest/list/'
    r = requests.get(url_1, headers=headers).text
    # parse the page with lxml's etree.HTML
    s = etree.HTML(r)
    info = np.array(['小赢理财', nowTime, ''])  # banner row marking the site name
    for i in range(10):
        path_name = '//*[@id="contentList"]/ul/li[{}]/div[1]/div/a/text()'.format(i + 1)
        path_month = '//*[@id="contentList"]/ul/li[{}]/div[1]/ul/li[2]/p[1]/span/text()'.format(i + 1)
        path_price = '//*[@id="contentList"]/ul/li[{}]/div[1]/ul/li[1]/p[1]/text()'.format(i + 1)
        # xpath() returns a list; join it into a str
        name_ = ','.join(s.xpath(path_name))
        month_ = ','.join(s.xpath(path_month))
        price_ = ','.join(s.xpath(path_price))
        print(name_, month_, price_)
        info = np.row_stack((info, [name_, month_, price_]))
    # pages after the first
    for p in range(2, 10):
        t = random.randint(1, 5)
        time.sleep(t)
        print("page", p)
        url_p = 'https://www.yingzt.com/invest/apiListV2?app_ver=2&loanGroup=1&period=ALL&interest=ALL&repay=ALL&order=&orderBy=&p1={}&_fromAjax_=1&_csrfToken_=d41d8cd98f00b204e9800998ecf8427e&_=1527152826113'.format(p)
        content = requests.get(url_p, headers=headers)
        r2 = content.text
        json_content = json.loads(r2)
        soup = BeautifulSoup(json_content['data']['html'], 'html.parser')
        p_list = soup.find_all(class_="fl card-info")
        # stop if the page returned no products
        if len(p_list):
            # loop over the products and pull name, rate, and term
            for each_p_list in p_list:
                rate = each_p_list.find(class_="light-txt").string
                time2 = each_p_list.find(class_="big-txt").string
                p_name = each_p_list.find(class_="weak-fontc").string
                print(p_name, time2, rate)
                info = np.row_stack((info, [p_name, time2, rate]))
        else:
            break
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# iqianjin.com (爱钱进)
def get_iqianjin():
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.iqianjin.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.iqianjin.com/',
        'X-Requested-With': 'XMLHttpRequest'
    }  # pose as a browser
    url2 = 'http://www.iqianjin.com/criteriaPlanPlus/planData?_=1527177318602'  # hidden JS endpoint (keyword planData?) with the fixed-term products
    url3 = 'http://www.iqianjin.com/criteriaDemand/data?_=1527177318601'  # hidden JS endpoint (keyword data?) with the demand-deposit products
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    # fetch the response text of both endpoints
    net_content_html2 = requests.get(url2, headers=headers)
    net_content_html3 = requests.get(url3, headers=headers)
    net_content2 = net_content_html2.text
    net_content3 = net_content_html3.text
    json_content1 = json.loads(net_content3)
    print('demand product: lock-up period, rate, new-user bonus')
    print(json_content1['bean']['fullTimeDesc'], json_content1['bean']['avgYield'], json_content1['bean']['interestlimit'])
    info = np.array(['爱钱进', nowTime, ''])
    # title = ['(零存宝) term', 'rate', 'new-user bonus']
    info = np.row_stack((info, ['(零存宝) term (days)', 'rate', 'new-user bonus']))
    info = np.row_stack((info, [json_content1['bean']['fullTimeDesc'], json_content1['bean']['avgYield'], json_content1['bean']['interestlimit']]))
    json_content = json.loads(net_content2)
    print('整存宝 (fixed-term products: term, rate, new-user bonus)')
    product_info = [(item.get('period', 'NA'), item.get('basicProfileRate', 'NA'), item.get('extraReward', 'NA'))
                    for item in json_content['bean']]
    info = np.row_stack((info, ['(定存宝) term (months)', 'rate %', 'new-user bonus %']))
    info = np.row_stack((info, product_info))
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
    for result in product_info:
        info = list(result)
        print(info[0], info[1], info[2])
# Lujinfu (陆金服)
def get_lup2p():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'lup2p.com',
        'Referer': 'http://lup2p.com/',
        'Upgrade-Insecure-Requests': '1'
    }  # pose as a browser
    url = 'https://www.lup2p.com/'  # product listing page
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    # fetch the page text
    net_content_html = requests.get(url, headers=headers)
    net_content = net_content_html.text
    s = etree.HTML(net_content)
    path_name = '//*[@id="p2p-list"]/div/div[2]/ul/li/a/@title'
    path_period = '//*[@id="p2p-list"]/div/div[2]/ul/li/ul/li[2]/p/text()'
    path_rate = '//*[@id="p2p-list"]/div/div[2]/ul/li/ul/li[1]/p/text()'
    name_ = ','.join(s.xpath(path_name))  # join the list into a str
    period_withb = ','.join(s.xpath(path_period))
    rate_withb = ','.join(s.xpath(path_rate))
    period_ = period_withb.strip()  # strip surrounding whitespace
    rate_ = rate_withb.strip()
    print(name_, period_, rate_)
    info = np.array(['陆金所', nowTime, ''])
    info = np.row_stack((info, ['name', 'term (months)', 'rate %']))
    info = np.row_stack((info, [name_, period_.replace('个月', ''), rate_.replace('%', '')]))
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Fetch data from 9fpuhui (玖富普惠); the number of 优选 (premium) pages to scrape is configurable
def get_9fph(page=5):
    headers = {
        'Referer': 'https://8.9fpuhui.com/productplan/',
        'Host': '8.9fpuhui.com',
        'Origin': 'https://8.9fpuhui.com',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'text/html, */*; q=0.01'
    }  # pose as a browser
    url = 'https://8.9fpuhui.com/productPlan/productPlanListData.html'  # JS endpoint behind the product list
    url_h = 'https://www.9fpuhui.com/'  # 9fpuhui home page
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    payload = {
        'queryProfit': '',
        'queryPeriod': '',
        'queryProductCode': '',
        'showType': 'K',
        'page': 0,
        'orderby': '0',
        'orderByType': 'desc',
        'productType': ''
    }
    # fetch the home page
    h_content_html = requests.post(url_h, headers=headers)
    h_content = h_content_html.text
    s1 = etree.HTML(h_content)
    info = np.array(['玖富普惠', nowTime, '', ''])
    # new-user products
    n_url = 'https://8.9fpuhui.com//greenHandsProductPlan/productPlan.html'  # new-user product endpoint
    n_content_html = requests.post(n_url, headers=headers).text
    json_n = json.loads(n_content_html)
    result = [(item.get('productName', 'NA'), item.get('period', 'NA'), item.get('standardProfit', 'NA'), item.get('plusProfit', 'NA')) for item in json_n]
    info = np.row_stack((info, ['name', 'term (days)', 'rate %', 'bonus']))
    info = np.row_stack((info, result))
    for productinfo in result:
        p_info = list(productinfo)
        print(p_info[0], p_info[1], p_info[2], p_info[3])
    # special-offer products
    path_name = '//div[@class ="subprod bg_ff box-bor po_re"]/div/div/a/text()'
    path_period = '//p[@class ="fr"]/em/text()'
    path_rate = '//div[@class ="fl"]/em/text()'
    name_1_1 = ",".join(s1.xpath(path_name))
    period_1_1 = s1.xpath(path_period)[2]
    rate_1_1 = s1.xpath(path_rate)[1].replace("%", "")
    print(name_1_1, period_1_1, rate_1_1)
    info = np.row_stack((info, [name_1_1, period_1_1, rate_1_1, 0]))
    # 宝贝计划 products
    payload['productType'] = "BBJH"
    net_content_html = requests.post(url, data=payload, headers=headers)
    bbjh_content = net_content_html.text
    s2 = etree.HTML(bbjh_content)
    path_name = '//div[@class="opname clearfix"]/em/a/text()'
    path_period = '//li[@class="opinfo-li-r"]/span/text()'
    path_rate = '//div[@class="oplixi clearfix"]/h2/em/text()'
    for i in range(6):
        name_2 = s2.xpath(path_name)[i]
        period_2 = s2.xpath(path_period)[i]
        rate_2 = s2.xpath(path_rate)[i]
        print(name_2, period_2, rate_2)
        info = np.row_stack((info, [name_2, period_2, rate_2, 0]))
    # 优选计划 (premium) products
    for p in range(page):
        payload['page'] = p
        payload['productType'] = "yx"
        net_content_html = requests.post(url, data=payload, headers=headers)
        net_content = net_content_html.text
        s3 = etree.HTML(net_content)
        path_name = '//div[@class="opname clearfix"]/em/a/text()'
        path_period = '//li[@class="opinfo-li-r"]/span/text()'
        path_rate = '//div[@class="oplixi clearfix"]/h2/em/text()'
        t = random.randint(1, 5)
        print("Fetching page {}; the next page follows in {} seconds".format(p + 1, t))
        time.sleep(t)
        for i in range(6):
            name_3 = s3.xpath(path_name)[i]
            period_3 = s3.xpath(path_period)[i]
            rate_3 = s3.xpath(path_rate)[i]
            print(name_3, period_3, rate_3)
            info = np.row_stack((info, [name_3, period_3, rate_3, 0]))
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Jimu Box (积木盒子)
def get_jimu(num=20, page=3):
    headers = {
        'Host': 'box.jimu.com',
        'Referer': 'https://www.jimu.com/',
        'Upgrade-Insecure-Requests': '1'
    }  # pose as a browser
    url_q = 'https://box.jimu.com/Venus/List'  # 轻松投 (easy-invest) listing
    # fetch the page text
    html_content_q = requests.get(url_q, headers=headers)
    content_q = html_content_q.text
    q = etree.HTML(content_q)
    jpyq = PyQuery(content_q)
    infoj = np.array(['积木盒子', nowTime, '', '', ''])
    infoj = np.row_stack((infoj, ['name', 'term (months)', 'rate', 'bonus', 'open status']))
    # 轻松投 products
    for i in range(num):
        path_name = '//html/body/div[4]/div[2]/div/a[{}]/div/div[1]/text()'.format(i + 1)
        path_period = '/html/body/div[4]/div[2]/div/a[{}]/div/div[3]/div[2]/div[1]/text()'.format(i + 1)
        path_status = '/html/body/div[4]/div[2]/div/a[{}]/div/div[3]/div[3]/div[1]/text()'.format(i + 1)
        name = ','.join(q.xpath(path_name))
        period = ','.join(q.xpath(path_period)).strip()
        status = ','.join(q.xpath(path_status))
        ratejpyq = jpyq(
            'body > div.container.venus-container > div.project-container > div > a:nth-child({}) > div > div.info > div.rate > div.num.invest-item-profit'.format(
                i + 1)).text()
        rate = ratejpyq.split('+')  # "base+bonus%" or just "base%"
        if len(rate) == 2:
            print(name, period, rate[0], rate[1].replace('%', ''), status)
            infoj = np.row_stack((infoj, (name, period.replace(',', ''), rate[0], rate[1].replace('%', ''), status)))
        else:
            print(name, period, rate[0].replace('%', ''), status)
            infoj = np.row_stack((infoj, (name, period.replace(',', ''), rate[0].replace('%', ''), '0', status)))
    # 自选投 (self-pick) products
    infoj = np.row_stack((infoj, ['name', 'term (months)', 'rate', 'bonus', 'funding progress / amount raised (10k yuan)']))
    for page_z in range(page):
        t = random.randint(1, 5)
        print('Fetching page {}; the next page follows in {} seconds'.format(page_z + 1, t))
        time.sleep(t)
        url_z = 'https://box.jimu.com/Project/List?rate=&guarantee=&range=&page={}&category=&status='.format(page_z + 1)  # 自选投 listing
        html_content_z = requests.get(url_z, headers=headers)
        content_z = html_content_z.text
        z = etree.HTML(content_z)
        jpyz = PyQuery(content_z)
        for a in range(12):
            path_name_1 = "/html/body/div[6]/div/div[{}]/a/div/div[1]/div[1]/text()".format(a + 1)
            path_period_1 = "/html/body/div[6]/div/div[{}]/a/div/div[4]/div/div[3]/div[1]/span/text()".format(a + 1)
            path_status_1 = "/html/body/div[6]/div/div[{}]/a/div/p/span[1]/text()".format(a + 1)
            name_1 = ','.join(z.xpath(path_name_1)).strip()
            ratejpyz = jpyz(
                'body > div.container.project-list > div > div:nth-child({}) > a > div:nth-child(1) > div.invest-item-features > div > div.invest-item-feature.invest-item-rate > div:nth-child(1) > span'.format(
                    a + 1)).text()
            period_1 = ','.join(z.xpath(path_period_1)).strip()
            status_1 = ','.join(z.xpath(path_status_1)).strip()
            infoj = np.row_stack((infoj, (name_1, period_1, ratejpyz.replace('%', ''), '0', status_1)))
            print(name_1, period_1, ratejpyz.replace('%', ''), status_1)
    f = pd.DataFrame.from_dict(infoj)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Paipaidai (拍拍贷)
def get_ppd():
    headers = {
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://tz.ppdai.com',
        'Referer': 'https://tz.ppdai.com/resplendent/list'
    }  # pose as a browser
    url = 'https://tz.ppdai.com/api/dplan/productNoAuth/queryProductLists'  # product list endpoint
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    payload = {
        'appid': 'h5',
        'source': '1',
        'version': '1'
    }
    h_content_html = requests.post(url, data=json.dumps(payload), headers=headers)
    h_content = h_content_html.text
    json_content = json.loads(h_content)
    info = np.array(['【拍拍贷】', nowTime, '', ''])
    info = np.row_stack((info, ['name', 'term (days)', 'rate %', 'bonus %']))
    # note: 'produts' (sic) is the key name as returned by the endpoint
    result = [(item.get('title', 'NA'), item.get('days', 'NA'), item.get('rate', 'NA'), item.get('addInterestRate', 'NA')) for item in json_content["resultContent"]['produts']]
    info = np.row_stack((info, result))
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
    for productinfo in result:
        print(productinfo)
# Xin'erfu (信而富)
def get_xinerfu(pro_num=11):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.crfchina.com',
        'Upgrade-Insecure-Requests': '1'
    }  # pose as a browser
    url = 'https://www.crfchina.com/financeproduct.html'  # product listing page
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    h_content_html = requests.get(url, headers=headers)
    h_content = h_content_html.text
    h = etree.HTML(h_content)
    info = np.array(['信而富', nowTime, '', '', ''])
    info = np.row_stack((info, ['name', 'term (days)', 'rate lower bound %', 'rate upper bound %', 'mean rate']))
    for i in range(pro_num):
        name = ",".join(h.xpath('/html/body/div[1]/div/div[3]/div[2]/ul[{}]/li[1]/a/text()'.format(i + 1)))  # name
        period = ",".join(h.xpath('/html/body/div[1]/div/div[3]/div[2]/ul[{}]/li[4]/text()'.format(i + 1)))  # term in days
        rate = ",".join(h.xpath('/html/body/div[1]/div/div[3]/div[2]/ul[{}]/li[5]/*/text()'.format(i + 1)))  # rate range
        rate2 = rate.split(',')
        meanrate = (float(rate2[0]) + float(rate2[1])) / len(rate2)  # midpoint of the rate range
        print(name, period, rate2[0], rate2[1], meanrate)
        info = np.row_stack((info, [name, period, rate2[0], rate2[1], meanrate]))
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Renrendai (人人贷)
def get_rrd():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.renrendai.com',
        'Upgrade-Insecure-Requests': '1'
    }  # pose as a browser
    url = 'https://www.renrendai.com/'  # home page carrying the product data
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    # fetch the home page
    h_content_html = requests.get(url, headers=headers)
    h_content = h_content_html.text
    h = etree.HTML(h_content)
    info = np.array(['人人贷', nowTime, '', ''])
    info = np.row_stack((info, ['name', 'term (months)', 'rate %', 'bonus %']))
    # new-user exclusive product
    name = ",".join(h.xpath('/html/body/div[4]/div[3]/div[3]/div[1]/div/span/text()'))  # name
    period = ",".join(h.xpath('/html/body/div[4]/div[3]/div[3]/div[2]/div[2]/div[1]/text()'))  # term
    rate = ",".join(h.xpath('/html/body/div[4]/div[3]/div[3]/div[2]/div[1]/div[1]/text()'))  # rate
    print(name, period.replace('个月', ''), rate)
    info = np.row_stack((info, [name, period.replace('个月', ''), rate, '0']))
    # 优选计划 (premium plan)
    name = ",".join(h.xpath('/html/body/div[4]/div[4]/div[1]/div/span/text()'))  # name
    period = ",".join(h.xpath('/html/body/div[4]/div[4]/div[2]/div[2]/div[1]/text()'))  # term
    rate1 = ",".join(h.xpath('/html/body/div[4]/div[4]/div[2]/div[1]/div[1]/text()'))  # base rate
    rate2 = ",".join(h.xpath('/html/body/div[4]/div[4]/div[2]/div[1]/div[1]/*/text()'))  # bonus rate
    rate = (rate1.replace("%", "") + rate2.replace("%", "")).replace(",+", " ")
    info = np.row_stack((info, [name, period.replace('个月', ''), rate1, rate2.replace("%", "").replace(",+", " ")]))
    print(name, period.replace('个月', ''), rate)
    # U计划 (U plan)
    for u in range(6):
        period = ",".join(h.xpath('/html/body/div[4]/div[5]/div[2]/ul/li[{}]/a/p[1]/span/em/text()'.format(u + 1)))  # term
        rate = ",".join(h.xpath('/html/body/div[4]/div[5]/div[2]/ul/li[{}]/a/p[2]/span[1]/i/text()'.format(u + 1)))  # base rate
        rates = rate.split(',+')  # "base,+bonus" when a bonus exists
        if len(rates) == 2:
            info = np.row_stack((info, ["U计划", period.replace('个月', ''), rates[0], rates[1]]))
        else:
            info = np.row_stack((info, ["U计划", period.replace('个月', ''), rates[0], '']))
        print("U计划", period.replace('个月', ''), rate.replace(",+", " "))
    # 薪计划 (salary plan)
    name = ",".join(h.xpath('/html/body/div[4]/div[6]/div[1]/div[1]/div/div/text()'))  # name
    period = ",".join(h.xpath('/html/body/div[4]/div[6]/div[1]/div[2]/div[3]/div[1]/text()'))  # term
    rate = ",".join(h.xpath('/html/body/div[4]/div[6]/div[1]/div[2]/div[1]/div[1]/text()'))  # rate
    print(name, period.replace('个月', ''), rate)
    info = np.row_stack((info, [name, period.replace('个月', ''), rate, '0']))
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Wangxin Puhui (网信普惠)
def get_wxph(page=5):
    info = np.array(['网信普惠', nowTime, '', ''])
    info = np.row_stack((info, ['name 1', 'name 2', 'term (months)', 'rate %']))
    for p in range(page):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.firstp2p.cn',
            'Upgrade-Insecure-Requests': '1'
        }  # pose as a browser
        url = 'https://www.firstp2p.cn/deals?p={}'.format(p + 1)  # product listing page
        ua = UserAgent()
        headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
        # fetch the page
        h_content_html = requests.get(url, headers=headers)
        h_content = h_content_html.text
        jpy = PyQuery(h_content)
        t = random.randint(1, 5)
        print("Downloading page {}; the next page follows in {} seconds".format(p + 1, t))
        time.sleep(t)
        # 智多鑫 products (first page only)
        if p == 0:
            for i in range(3):
                jpyzdx = jpy("#duotou > div.ph_zdxlist > div:nth-child({})".format(i + 1))
                name1 = jpyzdx('div > div.con_l > h3 > a').text()
                rate1 = jpyzdx('div>div>div>p>span>i').text()
                period1 = jpyzdx('div> p> em').text()
                rate2 = rate1.replace('%', '').split('~ ')  # rate is a "low ~ high" range
                rate3 = (float(rate2[0]) + float(rate2[1])) / len(rate2)  # midpoint of the range
                period2 = period1.replace('天可申请转让/退出', '').replace('天', '')
                period3 = float(period2) / 30  # days -> months
                info = np.row_stack((info, [name1, '', period3, rate3]))
                print(name1, period3, rate3)
        # consumer and business loans
        for i in range(10):
            jpyp2p = jpy("#conbd > div:nth-child({})".format(i + 1))
            name = jpyp2p('div > div.con_l > h3 > a').text().replace(' ', '')
            namegyl = jpyp2p('div > div.con_l > h3 > span').text().replace(' ', '')
            rate = jpyp2p('div>div>div>p>span>i').text().replace('进度条', '')
            period = jpyp2p('div> p> em').text()
            info = np.row_stack((info, [name, namegyl, period.replace('个月', ''), rate]))
            print(name, namegyl, period.replace('个月', ''), rate)
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Weidai (微贷网)
def get_weidai(p_yx=5, p_sb=5):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.weidai.com.cn',
        'Referer': 'https://www.weidai.com.cn/list/showBidList.html',
        'X-Requested-With': 'XMLHttpRequest'
    }  # pose as a browser
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    info = np.array(['微贷网', nowTime, '', '', ''])
    info = np.row_stack((info, ['name', 'term (days)', 'term (months)', 'rate %', 'bonus %']))
    # X计划 (X plan)
    for pe in [3, 6, 12]:
        url_xplan = 'https://www.weidai.com.cn/bid/intelligent/info?_api=bid.info&_mock=false&period={}&_stamp=1527673964336'.format(pe)  # product endpoint
        xplan_content_html = requests.get(url_xplan, headers=headers)
        xplan_content = xplan_content_html.text
        xplan_json = json.loads(xplan_content)
        period = xplan_json['data']['month']
        baserate = xplan_json['data']['baseRate']
        addrate = xplan_json['data']['addRate']
        print("X计划 0 {} {} {}".format(period, baserate, addrate))
        info = np.row_stack((info, ['X计划', '0', period, baserate * 100, addrate * 100]))
        time.sleep(1)
    # 优选智投 (premium) products
    for page in range(p_yx):  # the premium list only has 2 pages
        t = random.randint(1, 5)
        time.sleep(t)
        url_yx = 'https://www.weidai.com.cn/list/goodsList?type=0&periodType=0&page={}&rows=10&goodsType=PACKAGE'.format(page + 1)  # product endpoint
        yx_content_html = requests.get(url_yx, headers=headers)
        yx_content = yx_content_html.text
        yx_json = json.loads(yx_content)
        if yx_json['resultCode'] == '1000':  # status code is OK; go on to the data
            if len(yx_json['data']['data']):  # the data list is not empty
                info1 = [(item.get('goodsTitle', 'NA'), item.get('days', 'NA'), item.get('month', 'NA'), item.get('baseRate', 'NA'), item.get('addRate', 'NA')) for item in yx_json['data']['data']]
                info = np.row_stack((info, np.array(info1)))
                print(info1)
            else:
                print('empty data list')
                break
        else:  # unexpected status code: report it and stop
            print('bad response status')
            break
    # 散标 (individual loans)
    for page in range(p_sb):
        t = random.randint(1, 5)
        time.sleep(t)
        url_sb = 'https://www.weidai.com.cn/list/goodsList?type=0&periodType=0&sort=0&page={}&rows=10&goodsType=BIDDING'.format(page + 1)  # product endpoint
        sb_content_html = requests.get(url_sb, headers=headers)
        sb_content = sb_content_html.text
        sb_json = json.loads(sb_content)
        info2 = [(item.get('goodsTitle', 'NA'), item.get('days', 'NA'), item.get('month', 'NA'), item.get('baseRate', 'NA'), item.get('addRate', 'NA')) for item in sb_json["data"]['data']]
        info = np.row_stack((info, np.array(info2)))
        for sb_info in info2:
            info_d = list(sb_info)
            name = info_d[0].replace(" ", "")
            print(name, info_d[1], info_d[2], info_d[3], info_d[4])
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Xinghuo (星火)
def get_xinghuo():
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'xinghuo.yixin.com',
        'Referer': 'https://xinghuo.yixin.com/finance/productList.html',
        'X-Requested-With': 'XMLHttpRequest'
    }  # pose as a browser
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    info = np.array(['星火', nowTime, '', '', ''])
    info = np.row_stack((info, ['name', 'term (months)', 'rate %', 'bonus %', 'remaining quota (yuan)']))
    url_ltb = 'https://xinghuo.yixin.com/webapi/product/list?version=20185253674&category=1&isAnXin=0&_=1527734045796'  # product endpoint
    url_yyb = 'https://xinghuo.yixin.com/webapi/product/list?version=20185253674&category=8&_=1527734045792'  # product endpoint
    ltb_content_html = requests.get(url_ltb, headers=headers)
    ltb_content = ltb_content_html.text
    ltb_json = json.loads(ltb_content)
    data_ltb = [(item.get('productName', 'NA'), round(item.get('productPeriod', 'NA') / 30, 2), item.get('annualRate', 'NA'), item.get('floatAnnualRate', 'NA'), item.get('productQuota', 'NA')) for item in ltb_json['data']]
    info = np.row_stack((info, np.array(data_ltb)))
    for ltb in data_ltb:
        print(ltb[0], ltb[1], ltb[2], ltb[3], ltb[4])
    yyb_content_html = requests.get(url_yyb, headers=headers)
    yyb_content = yyb_content_html.text
    yyb_json = json.loads(yyb_content)
    data_yyb = [(item.get('productName', 'NA'), item.get('productPeriod', 'NA'), item.get('annualRate', 'NA'), item.get('floatAnnualRate', 'NA'), item.get('productQuota', 'NA')) for item in yyb_json['data']]
    info = np.row_stack((info, np.array(data_yyb)))
    for yyb in data_yyb:
        print(yyb[0], yyb[1], yyb[2], yyb[3], yyb[4])
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Toumi (投米)
def get_toumi():
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.itoumi.com',
        'Referer': 'https://www.itoumi.com/p2p.shtml',
        'X-Requested-With': 'XMLHttpRequest'
    }  # pose as a browser
    ua = UserAgent()
    headers['User-Agent'] = ua.random  # rotate a random User-Agent so the site does not flag the scraper
    url_tm = 'https://www.itoumi.com/indexPc/selectProduct.json?currentpage=1'  # product endpoint
    tm_content_html = requests.get(url_tm, headers=headers)
    tm_content = tm_content_html.text
    info = np.array(['投米', nowTime, '', '', ''])
    info = np.row_stack((info, ['product name', 'term (months)', 'rate %', 'bonus %', 'purchasable']))
    tm_json = json.loads(tm_content)
    data_tm = [(item.get('subProductName', 'NA'), round(float(item.get('runDays', 'NA')) / 30, 2), item.get('annualRate', 'NA'), item.get('floatAnnualRate', 'NA'), item.get('canBuy', 'NA')) for item in tm_json['data']['list']]
    info = np.row_stack((info, np.array(data_tm)))
    for tm in data_tm:
        print(tm[0], tm[1], tm[2], tm[3], tm[4])
    f = pd.DataFrame.from_dict(info)  # wrap the array in a DataFrame
    f.to_csv('p2p.csv', encoding='utf-8-sig', mode='a')  # append the rows to the CSV
# Zhiwang (指旺); this block uses the app's API, captured from the iOS client
url_newuser_product_list = 'https://www.91zhiwang.com/api/product/list?device_guid=1A605A25-8F7E-448A-88EF-0D74AD64E5D0&device_model=iPhone7%2C2&device_name=iPhone&sn=e0cb94bb6411bc050a5d9febc87a1763&timestamp=1529284324564.817&user_id=0'
headers = {
    'Host': 'www.91zhiwang.com',
    'Accept': '*/*',
    'User-Agent': 'ZW/4.7.3 (iPhone; iOS 11.4; Scale/2.00)',
    'Accept-Language': 'en-CN;q=1, zh-Hans-CN;q=0.9',
    'Accept-Encoding': 'br, gzip, deflate',
    'Connection': 'keep-alive'
}

def get_product_info():
    if jsoninfo['return_code'] == 0:  # the response status is OK
        try:
            pro_group = jsoninfo['product_categories'][0]['product_groups']  # alias the groups for readability
            for i in range(len(pro_group)):  # walk every product group
                if pro_group[i]['group_name'] != '':  # skip unnamed groups
                    products = pro_group[i]['products']  # alias the products for readability
                    if len(products) > 1:  # several products in the group: iterate over them
                        for i2 in range(len(products)):
                            name = products[i2]['name']
                            period = products[i2]['product_list_items'][1]['value']
                            rate1 = products[i2]['product_list_items'][0]['value']
                            rate2 = products[i2]['product_list_items'][0]['extra'].replace('%', '')
                            # the icon list tells us whether coupons / red packets apply
                            if len(products[i2]['prod_list_icons']) == 2:  # both icons present
                                quan = 'red packet and coupon usable'
                            if len(products[i2]['prod_list_icons']) == 1:
                                quan1 = products[i2]['prod_list_icons']  # the single icon URL
                                quanq = 'https://static2.yixin.com/public/zhiwang/2017-04-20-13e4746c-a392-4533-949a-652b7c64b951.png'
                                quanh = 'https://static2.yixin.com/public/zhiwang/2017-04-20-94e6ac88-83c6-464f-a4f5-dec8732f126c.png'
                                if quanq in quan1:  # coupon icon
                                    quan = 'coupon usable'
                                if quanh in quan1:  # red-packet icon
                                    quan = 'red packet usable'
                            if len(products[i2]['prod_list_icons']) == 0:  # no icons: neither applies
                                quan = 'no coupon usable'
                            print(name, period, rate1, rate2, quan)
                    if len(products) == 1:  # a single product: no loop needed
                        name = products[0].get('name')
                        period = products[0]['product_list_items'][1].get('value')
                        rate1 = products[0]['annual_rate_info'].get('annual_rate_str')
                        rate2 = ((products[0]['annual_rate_info'].get('added_annual_rate_str')).replace('+', '')).replace('%', '')
                        if pro_group[i]['group_name'] == '福卡专享':
                            quan = 'no coupon usable'
                        if pro_group[i]['group_name'] != '福卡专享':
                            if len(products[0]['prod_list_icons']) == 2:  # both icons present
                                quan = 'red packet and coupon usable'
                            if len(products[0]['prod_list_icons']) == 1:
                                quan1 = products[0]['prod_list_icons']  # the single icon URL
                                quanq = 'https://static2.yixin.com/public/zhiwang/2017-04-20-13e4746c-a392-4533-949a-652b7c64b951.png'
                                quanh = 'https://static2.yixin.com/public/zhiwang/2017-04-20-94e6ac88-83c6-464f-a4f5-dec8732f126c.png'
                                if quanq in quan1:  # coupon icon
                                    quan = 'coupon usable'
                                if quanh in quan1:  # red-packet icon
                                    quan = 'red packet usable'
                            if len(products[0]['prod_list_icons']) == 0:  # no icons: neither applies
                                quan = 'no coupon usable'
                        print(name, period, rate1, rate2, quan)
        except:  # catch any parsing error (a missing key or an out-of-range index) and report it
            print('list index out of range')
    else:
        print('json error')

# fetch the product list shown to users who are not logged in
html = requests.get(url_newuser_product_list, headers=headers)
jsoninfo = json.loads(html.text)
get_product_info()
time.sleep(5)
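
# Note: unlike the other sites, this Zhiwang block runs at module level, so it
# executes on every run regardless of which functions are enabled under __main__.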
if __name__ == '__main__':
    get_xinghuo()  # Xinghuo data
    # get_toumi()  # Toumi data
    # get_xiaoying()  # Xiaoying: product name, term, rate
    # get_iqianjin()  # iQianjin data
    # get_lup2p()  # Lujinfu data
    # get_9fph(5)  # 9fpuhui; pass how many pages of 优选 products you want, usually 5-10, default 5
    # get_jimu()  # Jimu Box; pass the number of 轻松投 products and the number of 自选投 pages, defaults 20 and 3
    # get_ppd()  # Paipaidai: name, term, rate, bonus
    # get_xinerfu()  # Xin'erfu; note the term is in days, the rate is a range, and the new-user row is the 3-month product; pass the number of products on the page, default 11
    # get_rrd()  # Renrendai; note 散标 (individual loans) and promotions are not scraped and must be checked manually; columns: name, term, rate, bonus (if any)
    # get_wxph(10)  # Wangxin Puhui; pass the number of pages to download, usually 5 or 10, default 5
    # get_weidai(10, 10)  # Weidai; pass the number of 优选 pages (usually 2 or 3) and 散标 pages (usually 5 or 10, default 5)