Skip to content

Instantly share code, notes, and snippets.

@wgzhao
Created August 16, 2018 03:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wgzhao/31b539004356662d88723b26843d5ccc to your computer and use it in GitHub Desktop.
Save wgzhao/31b539004356662d88723b26843d5ccc to your computer and use it in GitHub Desktop.
从北京新发地(xinfadi.com.cn) 获取每个类目中有典型代表的商品(SKU)的2018年历史价格,然后验证是否处于上涨趋势
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import requests
from urllib.parse import quote
"""
从北京新发地(xinfadi.com.cn) 获取每个类目中有典型代表的商品(SKU)的2018年历史价格,然后验证是否处于上涨趋势
"""
# Scrape the 2018 price history of representative products (SKUs) in each
# category from Beijing Xinfadi (xinfadi.com.cn) and export it to CSV/Excel.
# This script only collects and saves the data; no trend check is done here.

# Mapping from category index to category name.
cate_map = {1: '蔬菜', 2: '水果', 3: '肉禽蛋', 4: '水产', 5: '粮油'}
# Representative SKUs sampled for each category index.
sku_list = {
    1: ['白菜', '大蒜', '葱', '尖椒', '丝瓜', '茄子', '黄瓜'],
    2: ['苹果', '西瓜', '雪花梨', '水蜜桃', '哈密瓜'],
    3: ['鸡蛋', '鸭蛋', '松花蛋', '五花肉', '肉鸡', '肥牛', '羊肉'],
    4: ['草鱼', '胖头鱼', '鲫鱼', '黄鳝', '多宝鱼', '基围虾'],
    5: ['东北大米', '河北小米', '小米面', '金龙鱼调和油'],
}
query_string = 'begintime=2018-01-01&endtime=2018-08-15'
url_temp = 'http://xinfadi.com.cn/marketanalysis/{}/list/{}.shtml?'
# Column names assigned to the combined table, in scraped-column order.
RESULT_COLUMNS = ['品名', '最低价', '平均价', '最高价', '规格', '单位', '发布日期']
# Seconds to wait for the pager request before giving up.
REQUEST_TIMEOUT = 30


def build_url(category, page, sku):
    """Return the listing URL for one category, page number and SKU.

    The URL is rebuilt from scratch on every call and always carries the
    date-range query string, so no parameters leak from one SKU (or page)
    into the next request.

    Args:
        category: Category index (a key of ``sku_list``).
        page: 1-based page number of the listing.
        sku: Product name; it is percent-encoded before being appended.

    Returns:
        The complete URL as a string.
    """
    return (url_temp.format(category, page)
            + query_string
            + "&prodname={}".format(quote(sku)))


def parse_page_count(href):
    """Extract the page number from a pager link.

    Args:
        href: Link target whose last path segment looks like ``12.shtml``,
            optionally followed by a query string. Both relative and
            absolute links are accepted.

    Returns:
        The page number as an int.

    Raises:
        ValueError: If the last path segment does not start with an integer.
    """
    path = href.split('?', 1)[0]
    # Only look at the last path segment so dots in a host name are ignored.
    file_name = path.rsplit('/', 1)[-1]
    return int(file_name.split('.', 1)[0])


def read_price_table(url):
    """Read the ``hq_table`` table at *url* and drop its eighth column.

    The first table row is skipped, so the columns are labelled 0..7 and
    label 7 is the one removed, leaving the seven columns that
    ``RESULT_COLUMNS`` names.

    Args:
        url: Listing page URL.

    Returns:
        A DataFrame with the remaining seven columns.
    """
    table = pd.read_html(url, attrs={'class': 'hq_table'}, skiprows=1)[0]
    return table.drop(columns=[7])


def count_pages(url):
    """Return how many listing pages exist for the query behind *url*.

    The count is taken from the "last page" link inside the pager element.
    When the pager or that link is missing, a single page is assumed.

    Args:
        url: URL of the first listing page.

    Returns:
        The number of pages (at least 1).
    """
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    pager = bs(html, 'lxml').find('div', attrs={'class': 'manu'})
    if pager is None:
        return 1
    last_link = pager.find('a', attrs={'title': '尾页'})
    if last_link is None:
        return 1
    href = last_link.get('href')
    if not href:
        return 1
    return parse_page_count(href)


def collect_prices(skus_by_category, *, read_table=read_price_table,
                   page_counter=count_pages):
    """Collect the price rows of every SKU into one DataFrame.

    Args:
        skus_by_category: Mapping of category index to a list of SKU names.
        read_table: Callable taking a URL and returning that page's table.
        page_counter: Callable taking the first-page URL and returning the
            total number of pages.

    Returns:
        A DataFrame whose columns are ``RESULT_COLUMNS``. It is empty when
        nothing was collected.
    """
    frames = []
    for category, skus in skus_by_category.items():
        for sku in skus:
            print(sku)
            first_url = build_url(category, 1, sku)
            # Page 1 is always read; the pager then tells how many follow.
            frames.append(read_table(first_url))
            for page in range(2, page_counter(first_url) + 1):
                frames.append(read_table(build_url(category, page, sku)))

    if not frames:
        return pd.DataFrame(columns=RESULT_COLUMNS)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    result = pd.concat(frames, ignore_index=True)
    result.columns = RESULT_COLUMNS
    return result


def main():
    """Scrape every configured SKU and write the CSV and Excel exports."""
    result = collect_prices(sku_list)
    result.to_csv('./xinfadi_sku_2018.csv', index=False)
    result.to_excel('./xinfadi_sku_2018.xlsx', index=False)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment