Created
April 14, 2017 03:54
-
-
Save fzls/1174abab07ffcc87b42683be8efb53f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import inspect | |
import json | |
import logging.handlers | |
import re | |
import sys | |
import time | |
import timeit | |
import types | |
from random import randint | |
from urllib.parse import parse_qs, urlparse, urlencode | |
import requests | |
import xlwt | |
from pypinyin import lazy_pinyin | |
ezxf = xlwt.easyxf | |
from bs4 import BeautifulSoup, Tag | |
### log 相关设置 | |
# 设置时间格式 | |
DATE_TIME_FORMAT = '%Y-%m-%d_%H-%M-%S' | |
logFormatter = logging.Formatter('%(threadName)10s %(asctime)s %(levelname)s [line:%(lineno)d] %(message)s') | |
log = logging.getLogger(__name__) | |
fileHandler = logging.handlers.RotatingFileHandler("full_logs.log", maxBytes=(15 * 1024 * 1024), backupCount=7, | |
encoding='utf-8') | |
# fileHandler = logging.FileHandler("full_logs.log", encoding='utf-8') | |
fileHandler.setFormatter(logFormatter) | |
fileHandler.setLevel(logging.DEBUG) | |
log.addHandler(fileHandler) | |
consoleHandler = logging.StreamHandler(sys.stdout) | |
consoleHandler.setFormatter(logFormatter) | |
consoleHandler.setLevel(logging.ERROR) | |
log.addHandler(consoleHandler) | |
def eye_catching_logging(msg='', logger=log.info): | |
dashes = '-' * 50 | |
msg = '%s %s %s' % (dashes, str(msg).title(), dashes) | |
logger(msg) | |
log.eye_catching_logging = eye_catching_logging | |
def list_debug(l: list): | |
line_number = inspect.stack()[1][2] | |
log.eye_catching_logging('called from [line:%s]' % (line_number)) | |
posfix = 'OF PRINTING LIST with size of [{length}]'.format(length=len(l)) | |
log.eye_catching_logging('{position} {posfix}'.format(position='start', posfix=posfix)) | |
for v in l: | |
log.debug(v) | |
log.eye_catching_logging('{position} {posfix}'.format(position='end', posfix=posfix)) | |
log.list_debug = list_debug | |
def json_debug(var): | |
line_number = inspect.stack()[1][2] | |
log.eye_catching_logging('called from [line:%s]' % (line_number)) | |
log.debug(json.dumps(var, ensure_ascii=False, indent=2)) | |
pass | |
log.json_debug = json_debug | |
### requests 设置 | |
meituan_waimai_url = 'http://waimai.meituan.com' | |
headers = { | |
# "Host": "waimai.meituan.com", | |
"Connection": "keep-alive", | |
"Cache-Control": "max-age=0", | |
"Upgrade-Insecure-Requests": "1", | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36", | |
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", | |
"DNT": "1", | |
"Accept-Encoding": "gzip, deflate, sdch", | |
"Referer": "http://waimai.meituan.com", | |
"Accept-Language": "en,zh-CN;q=0.8,zh;q=0.6,zh-TW;q=0.4,en-GB;q=0.2,ja;q=0.2", | |
"Cookie": | |
"BAIDUID=5365A55222D580D81C224BB2827B9BBD:FG=1; " | |
"PSTM=1488433005; BIDUPSID=31FB76D71AEF46DDEDAB7059DACCD5B6; " | |
"BDUSS=mpZeEQybW9uTzlmRUowTEl0UnlXQ3FtMWdWSHFlV0s2OGVGdHE5QUpGM1NtLXRZSVFBQUFBJCQAAAAAAAAAAAEAAAAx-PUbt-fWrsHo6eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANIOxFjSDsRYc; " | |
"cflag=15%3A3; " | |
"BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; " | |
"w_uuid=kmIPS3P4B38ydNA-3Yl9Vn5TzEGkTuG59D2C4pKPtlt6Y37qEPCb7eO4xLiGV-3x; " | |
"rvd=27303032; " | |
"abt=1489757394.0%7CBDE; " | |
"rvct=59; " | |
"ci=20; " | |
"__mta=142562829.1489572822289.1489757414882.1489757425452.4; " | |
"uuid=aaed622404d6d7fea9a0.1489572821.0.0.0; " | |
"oc=VGWS9bnksswT-gz8M7PP-Ne2q5g07ua8nfubsP9-rEXUnmQrxtF3jDBs3H6L0QNcaHHphDGejyd7HvPsGDgzOvM4voY-wYZmDkVOx-pagiHERjILjD3HpqtkPiYMhhKpblRJbv9DTGRCNtag10fg6NGJIbzp5xyspZpoaiF5aVc; " | |
"__utma=211559370.1817606946.1489557637.1489572806.1489757397.2; " | |
"__utmz=211559370.1489572806.1.1.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search; " | |
"__utmv=211559370.|1=city=hangzhou=1^3=dealtype=11=1; " | |
"_ga=GA1.2.1817606946.1489557637; " | |
"w_cid=440111; " | |
"w_cpy_cn=\"%E7%99%BD%E4%BA%91%E5%8C%BA\"; " | |
"w_cpy=baiyunqu; " | |
"waddrname=\"%E7%BE%8E%E4%BC%98%E4%B9%90%28%E9%92%9F%E8%90%BD%E6%BD%AD%E5%BA%97%29\"; " | |
"w_geoid=ws0th9r1hpzn; " | |
"w_ah=\"23.38598694652319,113.41067299246788,%E7%BE%8E%E4%BC%98%E4%B9%90%28%E9%92%9F%E8%90%BD%E6%BD%AD%E5%BA%97%29|23.38598694652319,113.41067299246788,%E6%9C%AA%E7%9F%A5|20.559364277869463,109.84969820827246,%E6%B9%9B%E6%B1%9F%E5%B8%82%24%E9%9B%B7%E5%B7%9E%E5%B8%82%24%24%E7%BE%8E%E4%BC%98%E4%B9%90%28%E9%BE%99%E9%97%A81%E5%BA%97%29|20.336991380900145,110.18095470964909,%E6%B9%9B%E6%B1%9F%E5%B8%82%24%E5%BE%90%E9%97%BB%E5%8E%BF%24%24%E7%BE%8E%E4%BC%98%E4%B9%90%28%E9%87%91%E8%B4%B8%E5%BA%97%29|30.084730871021748,120.06984293460846,%E6%9D%AD%E5%B7%9E%E7%BE%8E%E4%BC%98%E4%B9%90%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8|21.270870845764875,110.33403977751732,%E7%BE%8E%E4%BC%98%E4%B9%90%28%E9%BA%BB%E7%AB%A01%E5%BA%97%29|23.30090392380953,113.36225274950266,%E7%BE%8E%E4%BC%98%E4%B9%90%28%E5%A4%AA%E5%92%8C%E5%BA%97%29|23.176579847931862,113.26622597873211,%E5%B0%8A%E5%AE%9D%E6%AF%94%E8%90%A8%28%E6%9C%BA%E5%9C%BA%E8%B7%AF%E5%BA%97%29|23.18809188902378,113.27573172748089,%E5%A4%96%E5%A9%86%E5%AE%B6|23.178626876324415,113.26014876365662,%E6%96%B0%E5%AE%A2%E5%AE%B6%28%E8%BF%9C%E6%99%AF%E8%B7%AF%E5%BA%97%29\";" | |
"JSESSIONID=196iown7kardy1amnqmutiy82x; _" | |
"ga=GA1.3.1817606946.1489557637; _" | |
"gat=1; __" | |
"mta=142562829.1489572822289.1489757425452.1490092529677.5; " | |
"w_utmz=\"utm_campaign=(direct)&utm_source=(direct)&utm_medium=(none)&utm_content=(none)&utm_term=(none)\"; " | |
"w_visitid=c5ba70c7-4160-42b9-b168-0a9eabc359b8" | |
} | |
session = requests.session() | |
session.headers = headers | |
## xls heading相关设置 | |
shop_heading = [ | |
'origin_price', | |
'price', | |
'id', | |
'name', | |
'isSellOut', | |
'month_sold_count', | |
'description', | |
'zan', | |
'minCount', | |
] | |
SHOP_NAME = 'shop_name' | |
shops_heading = [SHOP_NAME] | |
shops_heading += shop_heading | |
shops_info_heading = [ | |
'name', | |
'address', | |
'lat', | |
'lng', | |
'month_sale_count', | |
'start_price', | |
'send_price', | |
'send_time', | |
'url', | |
] | |
heading_tpye = { | |
SHOP_NAME: 'text', | |
'origin_price': 'float', | |
'price': 'float', | |
'id': 'int', | |
'name': 'text', | |
'isSellOut': 'bool', | |
'month_sold_count': 'int', | |
'description': 'text', | |
'zan': 'int', | |
'minCount': 'int', | |
'address': 'text', | |
'lat': 'float', | |
'lng': 'float', | |
'month_sale_count': 'text', | |
'start_price': 'text', | |
'send_price': 'text', | |
'send_time': 'text', | |
'url': 'text', | |
} | |
heading_cn = { | |
SHOP_NAME: '商铺名', | |
'origin_price': '原价', | |
'price': '现价', | |
'id': 'id', | |
'name': '名称', | |
'isSellOut': '售罄', | |
'month_sold_count': '月销量', | |
'description': '描述', | |
'zan': '点赞数', | |
'minCount': '单次最小购买量', | |
'address': '地址', | |
'lat': '纬度', | |
'lng': '经度', | |
'month_sale_count': '月销量', | |
'start_price': '起送价', | |
'send_price': '配送费', | |
'send_time': '平均送餐时间', | |
'url': '{current_time}:当前店铺网址(PS:美团外卖为防爬虫,每隔一段时间店铺的id会更新,对应的网址也会变化)'.format(current_time=time.ctime()), | |
} | |
def change_type(val, type_): | |
if type_ == 'bool': | |
return bool(val) | |
elif type_ == 'float' and val is not None: | |
return float(val) | |
elif type_ == 'int' and val is not None: | |
return int(val) | |
elif not val: | |
return '' | |
else: | |
return str(val) | |
type_fmt = { | |
'text': ezxf(), | |
'float': ezxf(num_format_str='#,##0.00'), | |
'int': ezxf(num_format_str='#,##0'), | |
'bool': ezxf(num_format_str='#,##0'), | |
} | |
###################################################################### | |
class CityIdName(object): | |
def __init__(self, city_id: str, city_name: str): | |
self.id = city_id | |
self.name = city_name | |
def __str__(self): | |
return 'id: %s, name: %s' % (self.id, self.name) | |
class Shop(object): | |
def __init__(self, name: str, address: str, lat: str, lng: str, geo_hash='', urls=None): | |
self.name = name | |
self.address = address | |
self.lat = lat | |
self.lng = lng | |
self.geo_hash = geo_hash | |
if urls: | |
self.urls = urls | |
else: | |
self.urls = [] | |
self.month_sale_count = None | |
def __str__(self): | |
# TODO: google it for if it can strify automatically | |
return 'name: %s, address: %s, lat: %s, lng: %s, geo_hash: %s, url: %s' % ( | |
self.name, self.address, self.lat, self.lng, self.geo_hash, self.urls) | |
class MeituanCrawler(object): | |
def __init__(self): | |
self.shop_index = 0 | |
self.wb = xlwt.Workbook(encoding='utf-8') | |
def export_shop_to_xls_sheet(self, parsed_info, sheet_name): | |
if len(parsed_info) == 0: | |
return | |
ws = self.wb.add_sheet(sheet_name) # type: xlwt.Worksheet | |
# 写入heading | |
row = 0 | |
for col, h in enumerate(shop_heading): | |
ws.write(row, col, heading_cn[h]) | |
# 写入记录 | |
for food in parsed_info: | |
row += 1 | |
for col, h in enumerate(shop_heading): | |
ws.write(row, col, change_type(food[h], heading_tpye[h]), type_fmt[heading_tpye[h]]) | |
pass | |
def export_one_shop(self, parsed_info, unique_file_name): | |
log.eye_catching_logging('开始准备导出{filename}'.format(filename=unique_file_name)) | |
self.export_shop_to_xls_sheet(parsed_info, unique_file_name) | |
log.eye_catching_logging('完成导出{filename}'.format(filename=unique_file_name)) | |
log.debug('') | |
pass | |
def get_sheet_name(self, filename): | |
return re.sub(r'\[|\]|:|\\|\?|/*|\x00', '', filename)[:31] | |
def parse_shop_page(self, shop: Shop): | |
return self.parse_urls([shop.url], shop.name, shop.address) | |
def parse_urls(self, urls, name='商铺', address=''): | |
parsed_infos = [] | |
for idx, shop_url in enumerate(urls): | |
res = session.get(shop_url) | |
soup = BeautifulSoup(res.text, 'lxml') | |
# 获取基本信息 | |
food_data_nodes = soup.find_all('script', {'type': 'text/template', 'id': re.compile('foodcontext-\d+')}) | |
parsed_info = [] | |
for food_data_node in food_data_nodes: # type: Tag | |
# TODO: add class product | |
try: | |
food = json.loads(food_data_node.string) | |
except json.decoder.JSONDecodeError as e: | |
log.error(e) | |
log.error(food_data_node) | |
log.eye_catching_logging('提取 [{name}] 的信息'.format(name=food.get('name'))) | |
container = food_data_node.parent # type: Tag | |
# ------------------------如果存在食物描述,则获取------------------- | |
food_description = container.find('div', { | |
'class': 'description' | |
}) | |
if food_description and food_description.string: | |
food_description = food_description.string.strip() | |
else: | |
food_description = None | |
log.debug('描述 : %s' % food_description) | |
# ------------------------如果存在点赞数,则获取------------------- | |
food_zan = container.find('div', { | |
'class': 'zan-count' | |
}) | |
if food_zan and food_zan.span and food_zan.span.string: | |
food_zan = food_zan.span.string.strip()[1:-1] | |
else: | |
food_zan = None | |
log.debug('点赞数 : %s' % food_zan) | |
# ------------------------如果存在当月销量,则获取------------------- | |
food_sold_count = container.find('div', { | |
'class': 'sold-count' | |
}) | |
if food_sold_count and food_sold_count.span and food_sold_count.span.string: | |
# 原始的数据类似于: 月售12份 | |
food_sold_count = food_sold_count.span.string.strip() | |
# 从中提取销售量 | |
cnt_pattern = '月售(\d+)份' | |
food_sold_count = re.search(cnt_pattern, food_sold_count).group(1) | |
else: | |
food_sold_count = None | |
log.debug("count : %s", food_sold_count) | |
parsed_info.append({ | |
'id': food.get('id'), | |
'name': food.get('name'), | |
'price': food.get('price'), | |
'origin_price': food.get('origin_price'), | |
'minCount': food.get('minCount'), | |
'isSellOut': food['sku'][0]['isSellOut'], | |
'description': food_description, | |
'zan': food_zan, | |
'month_sold_count': food_sold_count | |
}) | |
pass | |
if parsed_info: | |
parsed_infos.append(parsed_info) | |
# 从详情页中获取商铺名 | |
details_list = soup.select('div.details .list .na')[0] # type: Tag | |
shop_name = details_list.find_all('span')[0].string.strip() | |
# TODO: 从详情页中获取商铺地址 | |
# shop_address_div = soup.select('div.rest-info-thirdpart')[0] | |
# shop_address = shop_address_div.string.strip().replace('商家地址:', '') | |
self.shop_index += 1 | |
shop_unique_name = '{idx}_{shop_name}@{shop_address}'.format(idx=self.shop_index, | |
shop_name=shop_name, | |
shop_address=address.replace('$', '')) | |
if idx > 0: | |
shop_unique_name += '_{index}'.format(index=idx) | |
shop_unique_name += '_商品信息' | |
unique_file_name = self.get_sheet_name(shop_unique_name) | |
# 将当前商家导出 | |
self.export_one_shop(parsed_info, unique_file_name) | |
return parsed_infos | |
def is_shop_in_this_city(self, address: str, shop_name: str, city_name: str): | |
return shop_name in address and city_name in address | |
def export_all_to_xls_sheet(self, parsed_infos, sheet_name): | |
if len(parsed_infos) == 0: | |
return | |
ws = self.wb.add_sheet(sheet_name) | |
# 写入heading | |
row = 0 | |
for col, h in enumerate(shops_heading): | |
ws.write(row, col, heading_cn[h]) | |
# 写入记录 | |
for shop_name, parsed_info in parsed_infos.items(): | |
for food in parsed_info: | |
row += 1 | |
# 添加商店名信息 | |
food[SHOP_NAME] = shop_name | |
# 将数据一行行写入表单 | |
for col, h in enumerate(shops_heading): | |
ws.write(row, col, change_type(food[h], heading_tpye[h]), type_fmt[heading_tpye[h]]) | |
pass | |
log.eye_catching_logging('成功导出为{sheetname}表单'.format(sheetname=sheet_name)) | |
pass | |
def export_all_shops(self, parsed_infos: dict, filename): | |
log.eye_catching_logging('开始准备导出{filename}'.format(filename=filename)) | |
self.export_all_to_xls_sheet(parsed_infos, filename) | |
log.eye_catching_logging('完成导出{filename}'.format(filename=filename)) | |
log.info('') | |
pass | |
def is_the_shop_we_want(self, res_name, shop_name): | |
return shop_name in res_name | |
def parse_shops_and_export(self, shops: list, shop_name: str): | |
if len(shops) == 0: | |
log.eye_catching_logging('商家列表为空') | |
return | |
log.eye_catching_logging('开始解析商家页面') | |
parsed_infos = {} | |
for shop in shops: | |
for idx, parsed_info in enumerate(self.parse_shop_page(shop)): | |
shop_unique_name = '{name}@{address}'.format(name=shop.name, address=shop.address.replace('$', '')) | |
if idx > 0: | |
shop_unique_name += '_{index}'.format(index=idx) | |
parsed_infos[shop_unique_name] = parsed_info | |
# log.json_debug(parsed_infos) | |
# 将本次获取的商家信息导出到单个汇总的表单 | |
filepath = '{shop_name}_商品信息_汇总'.format(shop_name=shop_name) | |
self.export_all_shops(parsed_infos, self.get_sheet_name(filepath)) | |
return parsed_infos | |
def export_shops_info_to_xls_sheet(self, shops, sheetname): | |
ws = self.wb.add_sheet(sheetname) # type: xlwt.Worksheet | |
row = 0 | |
for col, h in enumerate(shops_info_heading): | |
ws.write(row, col, heading_cn[h]) | |
ws.write(row, len(shops_info_heading), 'geo_hash') | |
for shop in shops: | |
row += 1 | |
for col, h in enumerate(shops_info_heading): | |
# NOTE: 这里需要str,由其针对urls:list,否则会产生错误 | |
ws.write(row, col, change_type(shop.__getattribute__(h), heading_tpye[h]), type_fmt[heading_tpye[h]]) | |
ws.write(row, len(shops_info_heading), str(shop.geo_hash)) | |
log.eye_catching_logging('成功导出为{sheetname}表单'.format(sheetname=sheetname)) | |
pass | |
def remove_duplicate_shops(self, shops): | |
unique_shops = [] | |
visited_urls = {} | |
for shop in shops: | |
if not visited_urls.get(shop.url): | |
unique_shops.append(shop) | |
visited_urls[shop.url] = True | |
return unique_shops | |
def filter_out_shop_with_no_urls(self, shops): | |
filtered = list(filter(lambda shop: shop.urls, shops)) | |
log.eye_catching_logging('美团上该品牌开设{number_of_shop}家的店面如下所示'.format(number_of_shop=len(filtered))) | |
log.list_debug(filtered) | |
return filtered | |
def get_striped_str(self, res_tag_with_str): | |
if res_tag_with_str and res_tag_with_str.string: | |
res_tag_with_str = res_tag_with_str.string.replace('\n', '').strip() | |
else: | |
res_tag_with_str = '' | |
return res_tag_with_str | |
def get_shop_with_url_by_geo_hash_and_name(self, shop: Shop): | |
query = {'keyword': shop.name} | |
meituan_search_api = 'http://waimai.meituan.com/search/{geo_hash}/rt?{query}'.format( | |
geo_hash=shop.geo_hash, query=urlencode(query)) | |
res = session.get(meituan_search_api) | |
log.eye_catching_logging('start find shop url in search results') | |
soup = BeautifulSoup(res.text, 'lxml') | |
res_lis = soup.find_all('li', {'class': 'rest-list'}) | |
# log.debug(res_lis) | |
if len(res_lis) == 0: | |
log.eye_catching_logging() | |
log.warning( | |
'fail to find [{shop}] url in : {search_url}'.format(search_url=meituan_search_api, shop=shop.address)) | |
log.eye_catching_logging() | |
shops_in_res = [] | |
for res_li in res_lis: | |
res_name = res_li.find('p', {'class': 'name'}).string.replace('\n', '').strip() | |
res_path = res_li.find('a')['href'] | |
res_total = self.get_striped_str(res_li.find('span', {'class': 'total'})).replace('月售', '') | |
res_start_price = self.get_striped_str(res_li.find('span', {'class': 'start-price'})).replace('起送', '') | |
res_send_price = self.get_striped_str(res_li.find('span', {'class': 'send-price'})).replace('配送费', '') | |
res_send_time = self.get_striped_str(res_li.find('p', {'class': 'send-time'})).replace('平均送餐时间:', '') | |
log.debug([res_total, res_start_price, res_send_price, res_send_time]) | |
# 检查该搜索结果是否为我们想要的结果 : 看品牌名是否在搜索结果的标题中 | |
if self.is_the_shop_we_want(res_name, shop.name): | |
log.debug('SUCCESSED in finding url in : {search_url}'.format(search_url=meituan_search_api)) | |
log.debug("result shop name is : %s" % res_name) | |
log.debug("result shop path is : %s" % res_path) | |
if res_path: | |
shop_url = '{host}{path}'.format(host=meituan_waimai_url, path=res_path) | |
shop_in_res = Shop(res_name, shop.address, shop.lat, shop.lng, shop.geo_hash) | |
# TODO: add elems | |
shop_in_res.url = shop_url | |
shop_in_res.month_sale_count = res_total | |
shop_in_res.start_price = res_start_price | |
shop_in_res.send_price = res_send_price | |
shop_in_res.send_time = res_send_time | |
log.debug('url is : {url}'.format(url=shop_url)) | |
shops_in_res.append(shop_in_res) | |
# log.debug(shop) | |
return shops_in_res | |
pass | |
def batch_get_shop_with_url_by_geo_hash_and_name(self, shops): | |
shops_exists_in_meituan = [] | |
for shop in shops: | |
if shop.geo_hash: | |
shops_exists_in_meituan += self.get_shop_with_url_by_geo_hash_and_name(shop) | |
return shops_exists_in_meituan | |
def fetch_geo_hash_for_shops(self, shops: list): | |
log.eye_catching_logging('start fetch geo hash') | |
meituan_calc_geo_hash_api = 'http://waimai.meituan.com/geo/geohash' | |
for idx, shop in enumerate(shops): | |
query = { | |
'lat': shop.lat, | |
'lng': shop.lng, | |
'addr': shop.address, | |
'from': 'm', | |
} | |
MAX_TRIES = 5 | |
while MAX_TRIES: | |
res = session.get(meituan_calc_geo_hash_api, params=query) | |
shop.geo_hash = res.cookies.get('w_geoid') | |
if shop.geo_hash: | |
break | |
# if goes here, access limit is exceed | |
up = urlparse(res.url) | |
returned_params = parse_qs(up.query) | |
log.debug(res.url) | |
log.debug('returned_params is : %s' % returned_params) | |
wait_for = randint(3, 4) | |
log.warning( | |
'access limit is exceed when fetching [%dth] shop named [%s], wait for %ds, remaing try time: %d' % ( | |
idx, shop.address, wait_for, MAX_TRIES)) | |
time.sleep(wait_for) | |
MAX_TRIES -= 1 | |
log.eye_catching_logging('Geo hash fetched') | |
log.list_debug(shops) | |
def add_lng_lat_by_address(self, addresses: list, shop_name=''): | |
bdmap_address_to_lng_lat_api = 'http://api.map.baidu.com/geocoder/v2/' | |
application_key = 'Eze6dPlb3bnUrihPNaaKljdUosb4G41B' | |
shops = [] | |
for address in addresses: | |
query = { | |
'output': 'json', | |
'address': address, | |
'ak': application_key | |
} | |
res = session.get(bdmap_address_to_lng_lat_api, params=query).json() | |
log.debug(res) | |
location = res['result']['location'] | |
shop = Shop(shop_name, address, location['lat'], location['lng']) | |
shops.append(shop) | |
log.list_debug(shops) | |
return shops | |
def find_possiable_addresses(self, cid_name: CityIdName, shop_name: str): | |
def find_res_upper_limit(wd, cid, rn_l=0, rn_h=130): | |
from functools import lru_cache | |
@lru_cache() | |
def _try(wd, cid, rn, max_try=5): | |
res = session.get('http://map.baidu.com/su', params={ | |
"wd": wd, | |
"cid": cid, | |
"rn": rn, | |
"type": "0", | |
}) | |
res.encoding = 'utf-8' | |
try: | |
return len(res.json().get('s')) != 0 | |
except Exception as e: | |
log.eye_catching_logging(str(e), log.error) | |
log.eye_catching_logging("sleep for 0.5s", log.error) | |
time.sleep(0.5) | |
if max_try <= 0: | |
log.eye_catching_logging('address get failed with max_try times', log.error) | |
return False | |
return _try(wd, cid, rn, max_try - 1) | |
@lru_cache() | |
def _rel(wd, cid, rn): | |
# TTT for -1, TTF for 0, TFF,FFF for 1 | |
if _try(wd, cid, rn): | |
if _try(wd, cid, rn + 1): | |
return -1 | |
else: | |
return 0 | |
else: | |
return 1 | |
rn_l = 0 | |
rn_h = 130 | |
if _try(wd, cid, rn_h): | |
print('found at high', rn_h) | |
return rn_h | |
while True: | |
rn = int((rn_l + rn_h) / 2) | |
r = _rel(wd, cid, rn) | |
if r == 0: | |
return rn | |
elif r == -1: | |
rn_l = rn + 1 | |
else: | |
rn_h = rn - 1 | |
if rn_l > rn_h: | |
return -1 | |
bdmap_find_address_by_name_api = 'http://map.baidu.com/su' | |
# note: 这个数可能会引起问题,之前65在广东华莱士会出现问题,其他地方不会 | |
# 采用二分法找到合适的结果数 | |
result_number = find_res_upper_limit(shop_name, cid_name.id, 0, 130) | |
log.eye_catching_logging('合适的结果数为:%d' % result_number, log.error) | |
query = { | |
"wd": shop_name, | |
"cid": cid_name.id, | |
"rn": result_number, | |
"type": "0", | |
} | |
while True: | |
res = session.get(bdmap_find_address_by_name_api, params=query) | |
# print(test.status_code) | |
res.encoding = 'utf-8' | |
try: | |
json_res = res.json() | |
break | |
except Exception as e: | |
log.eye_catching_logging(str(e), log.error) | |
log.eye_catching_logging("sleep for 0.5s", log.error) | |
time.sleep(0.5) | |
addresses = [] | |
for address in json_res['s']: | |
if not self.is_shop_in_this_city(address, shop_name, cid_name.name): | |
continue | |
addresses.append(address) | |
log.eye_catching_logging('api response for %s' % cid_name) | |
log.debug(json.dumps(json_res, ensure_ascii=False, indent=2)) | |
log.eye_catching_logging('%s' % cid_name) | |
log.list_debug(addresses) | |
log.eye_catching_logging('total %s' % (len(addresses))) | |
return addresses | |
def get_city_id_and_name(self, city_name: str): | |
""" | |
:param city_name: 城市名称 | |
:return: [id:int, name:str] | |
""" | |
ID = 0 | |
NAME = 1 | |
with open('BaiduMap_cityCode_1102.txt', encoding='utf-8') as city_ids: | |
import csv | |
for city_id_name in csv.reader(city_ids): | |
if city_name in city_id_name[NAME]: | |
log.eye_catching_logging('city id found') | |
log.eye_catching_logging(city_id_name, log.error) | |
return CityIdName(*city_id_name) | |
log.eye_catching_logging('%s not found in city list' % city_name, log.error) | |
return [0, 'not found'] | |
def collect_shop_urls(self, city_name, shop_name): | |
## 处理逻辑 | |
# 1. 获取该城市对应的id,组成tuple | |
cid_name = self.get_city_id_and_name(city_name) | |
# 2. 利用百度地图接口找到该城市存在的该品牌的店的地址作为初始结果集 | |
addresses = self.find_possiable_addresses(cid_name, shop_name) | |
shops_exists_in_meituan_unique = self.fetch_shop_url_by_address(addresses, city_name, shop_name) | |
return shops_exists_in_meituan_unique | |
def fetch_shop_url_by_address(self, addresses, city_name, shop_name): | |
# 3. 利用百度的坐标反查接口获取这些地址对应的坐标值 | |
shops = self.add_lng_lat_by_address(addresses, shop_name) | |
# 4. 利用坐标值和地址,通过美团外卖的接口获取该店(所在区域)在美团外卖内部系统内的地理哈希值 | |
self.fetch_geo_hash_for_shops(shops) | |
# 5. 利用该地理哈希值和商店名称,通过美团的搜索接口尝试获取其店铺网址,获取其中所有的在美团上开设的该店铺信息 | |
shops_exists_in_meituan = self.batch_get_shop_with_url_by_geo_hash_and_name(shops) | |
# 6. 对获取的店铺进行去重(根据URL) | |
shops_exists_in_meituan_unique = self.remove_duplicate_shops(shops_exists_in_meituan) | |
# 7. 将所有商店必要的统计信息导入到一个表单内 | |
shops_info_sheet_name = '{city_name}_{shop_name}'.format(city_name=city_name, shop_name=shop_name) | |
self.export_shops_info_to_xls_sheet(shops_exists_in_meituan_unique, self.get_sheet_name(shops_info_sheet_name)) | |
return shops_exists_in_meituan_unique | |
def run_crawler_and_export_with_shop_urls(self, urls, city_name, shop_name): | |
# 根据url获取店铺地址 | |
addresses = self.get_addresses_by_urls(urls) | |
# 根据地址获取店铺相关信息及url | |
shops_exists_in_meituan = self.fetch_shop_url_by_address(addresses, city_name, shop_name) | |
parse_shops_info = self.parse_shops_and_export(shops_exists_in_meituan, '') | |
def run_crawler_and_export(self, city_name, shop_name): | |
# 获取在该城市范围内该商店在美团上所开设的所有店铺的网址等信息 | |
shops_exists_in_meituan = self.collect_shop_urls(city_name, shop_name) | |
# 对这些找到的店铺抓取其页面数据 | |
parse_shops_info = self.parse_shops_and_export(shops_exists_in_meituan, shop_name) | |
def run(self, city_name='湛江', shop_name='美优乐', ids=''): | |
""" | |
根据输入的城市名和商店名,找到该城市内该商店在美团所开设的所有店铺的商品的信息列表,并导出为xls文件 | |
:return: | |
""" | |
# 1. 获取url | |
# 2. 爬取内容 | |
# 3. 开发前端: 试试用Python写GUI | |
## 从用户获取城市和商店名 | |
# city_name = '湛江' | |
# shop_name = '美优乐' | |
log.eye_catching_logging('开始抓取[{city}]:[{shop}]'.format(city=city_name, shop=shop_name), log.error) | |
if ids: | |
shop_url = 'http://waimai.meituan.com/restaurant/{id}' | |
ids = ids.split(',') | |
urls = list(map(lambda x: shop_url.format(id=x.strip()), ids)) | |
self.run_crawler_and_export_with_shop_urls(urls, city_name.strip(), shop_name.strip()) | |
else: | |
self.run_crawler_and_export(city_name.strip(), shop_name.strip()) | |
log.eye_catching_logging('完成抓取[{city}]:[{shop}]'.format(city=city_name, shop=shop_name), log.error) | |
# TODO: 设置不同类型的格式,以及每个列对应的类型 | |
# global data_dir | |
# # 确保数据文件夹已创建 | |
# data_dir = './结果/{shopname}'.format(shopname=shop_name, time=time.strftime(DATE_TIME_FORMAT)) | |
# | |
# if not os.path.exists(data_dir): | |
# os.makedirs(data_dir) | |
saved_file = '{current_time}_{location}_{shop}.xls'.format( | |
current_time=time.strftime(DATE_TIME_FORMAT), | |
location=city_name, | |
shop=shop_name | |
) | |
res = [self.wb, saved_file] | |
# log.info(res) | |
return res | |
def get_addresses_by_urls(self, urls: list): | |
addresses = [] | |
# 从每个页面中抓取地址信息,并返回 | |
for shop_url in urls: | |
res = session.get(shop_url) | |
soup = BeautifulSoup(res.text, 'lxml') | |
shop_address_div = soup.select('div.rest-info-thirdpart')[0] | |
shop_address = shop_address_div.string.strip().replace('商家地址:', '') | |
addresses.append(shop_address) | |
return addresses | |
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
def export_shop_to_xls_eleme(self, shops): | |
wb = xlwt.Workbook(encoding='utf-8') | |
if len(shops) == 0: | |
wb.add_sheet('没有找到相关结果') | |
return wb | |
def patch_with_helper(worksheet): | |
# 添加辅助方法 | |
def write_float(self, row, col, val): | |
if val is None: | |
val = 0.0 | |
self.write(row, col, float(val), type_fmt['float']) | |
def write_int(self, row, col, val): | |
if val is None: | |
val = 0 | |
self.write(row, col, int(val), type_fmt['int']) | |
def write_bool(self, row, col, val): | |
if val is None: | |
val = False | |
self.write(row, col, bool(val), type_fmt['bool']) | |
worksheet.write_float = types.MethodType(write_float, worksheet) | |
worksheet.write_int = types.MethodType(write_int, worksheet) | |
worksheet.write_bool = types.MethodType(write_bool, worksheet) | |
return worksheet | |
# -------------------商家汇总信息-------------------------------- | |
ws = wb.add_sheet('商家汇总信息') # type: xlwt.Worksheet | |
ws = patch_with_helper(ws) | |
# 写入heading | |
ws.write(0, 0, '商铺名') | |
ws.write(0, 1, '地址') | |
ws.write(0, 2, '纬度') | |
ws.write(0, 3, '经度') | |
ws.write(0, 4, '月售单数') | |
ws.write(0, 5, '起送价') | |
ws.write(0, 6, '费送费') | |
ws.write(0, 7, '平均送达时间') | |
ws.write(0, 8, '评分') | |
ws.write(0, 9, '评分数') | |
ws.write(0, 10, '评分高于周边商家比例') | |
ws.write(0, 11, '好评率') | |
ws.write(0, 12, '菜品评分') | |
ws.write(0, 13, '服务态度评分') | |
ws.write(0, 14, '店铺网址') | |
# 写入记录 | |
row = 0 | |
for shop in shops: # type: dict | |
row += 1 | |
ws.write(row, 0, shop.get('name')) | |
ws.write(row, 1, shop.get('address')) | |
ws.write_float(row, 2, shop.get('latitude')) | |
ws.write_float(row, 3, shop.get('longitude')) | |
ws.write_int(row, 4, shop.get('recent_order_num')) | |
ws.write_int(row, 5, shop.get('float_minimum_order_amount')) | |
ws.write_int(row, 6, shop.get('float_delivery_fee')) | |
ws.write_int(row, 7, shop.get('order_lead_time')) | |
ws.write_float(row, 8, shop.get('rating')) | |
ws.write_int(row, 9, shop.get('rating_count')) | |
ws.write_float(row, 10, shop.get('compare_rating')) | |
ws.write_float(row, 11, shop.get('positive_rating')) | |
ws.write_float(row, 12, shop.get('food_score')) | |
ws.write_float(row, 13, shop.get('service_score')) | |
ws.write(row, 14, shop.get('url')) | |
# -------------------各家店商品信息-------------------------------- | |
for idx, shop in enumerate(shops): | |
ws = wb.add_sheet(self.get_sheet_name('{index}_{name}@{address}'.format(index=idx, name=shop.get('name'), | |
address=shop.get( | |
'address')))) # type: xlwt.Worksheet | |
ws = patch_with_helper(ws) | |
menu_for_shops_heading = [ | |
'original_price', | |
'price', | |
'item_id', | |
'name', | |
'sold_out', | |
'month_sales', | |
'packing_fee', | |
'rating', | |
'rating_count', | |
'description' | |
] | |
# 写入heading | |
ws.write(0, 0, '原价') | |
ws.write(0, 1, '现价') | |
ws.write(0, 2, 'id') | |
ws.write(0, 3, '名称') | |
ws.write(0, 4, '售罄') | |
ws.write(0, 5, '月销量') | |
ws.write(0, 6, '打包费') | |
ws.write(0, 7, '评分') | |
ws.write(0, 8, '评分数') | |
ws.write(0, 9, '描述') | |
ws.write(0, 10, '规格') | |
# 写入记录 | |
row = 0 | |
for sub_menu in shop.get('menu'): # type: dict | |
for food in sub_menu.get('foods'): # type: dict | |
for specfood in food.get('specfoods'): # type: dict | |
row += 1 | |
ws.write_float(row, 0, specfood.get('original_price')) | |
ws.write_float(row, 1, specfood.get('price')) | |
ws.write_int(row, 2, specfood.get('food_id')) | |
ws.write(row, 3, specfood.get('name')) | |
ws.write_bool(row, 4, specfood.get('sold_out')) | |
ws.write_int(row, 5, specfood.get('recent_popularity')) | |
ws.write_float(row, 6, specfood.get('packing_fee')) | |
ws.write_float(row, 7, food.get('rating')) | |
ws.write_int(row, 8, food.get('rating_count')) | |
ws.write(row, 9, food.get('description')) | |
ws.write(row, 10, str(specfood.get('specs'))) | |
# -------------------各菜品信息(总计)-------------------------------- | |
# ---计算统计信息 | |
food_infos = self.compute_food_statistics(shops) # -------导出统计信息 | |
ws = wb.add_sheet('菜品统计信息') # type: xlwt.Worksheet | |
ws = patch_with_helper(ws) | |
# 写入heading | |
ws.write(0, 0, '名称') | |
ws.write(0, 1, '总销量') | |
ws.write(0, 2, '总销售额') | |
ws.write(0, 3, '平均销量') | |
ws.write(0, 4, '平均销售额') | |
ws.write(0, 5, '平均价格') | |
ws.write(0, 6, '各门店总计展现次数(每个规格作为计数单位)') | |
# 写入记录 | |
row = 0 | |
for food_info in food_infos: # type: dict | |
row += 1 | |
ws.write(row, 0, food_info.get('name')) | |
ws.write_int(row, 1, food_info.get('total_sold_count')) | |
ws.write_float(row, 2, food_info.get('total_sold_money')) | |
ws.write_float(row, 3, food_info.get('average_sold_count')) | |
ws.write_float(row, 4, food_info.get('average_sold_money')) | |
ws.write_float(row, 5, food_info.get('average_price')) | |
ws.write_int(row, 6, food_info.get('occur_times')) | |
return wb | |
pass | |
def compute_food_statistics(self, shops): | |
foods = [] | |
for shop in shops: # type: dict | |
for sub_menu in shop.get('menu'): # type: dict | |
for food in sub_menu.get('foods'): # type: dict | |
for specfood in food.get('specfoods'): # type: dict | |
foods.append({ | |
'name': specfood.get('name'), | |
'price': float(specfood.get('price')), | |
'sold_count': int(specfood.get('recent_popularity')), | |
'food_id': int(specfood.get('food_id')), | |
'item_id': int(food.get('item_id')) | |
}) | |
import itertools | |
from operator import itemgetter | |
food_infos = [] | |
foods.sort(key=itemgetter("name")) | |
for name, foods_with_same_name in itertools.groupby(foods, lambda food: food['name']): | |
foods_with_same_name = list(foods_with_same_name) | |
total_sold_count = sum([food['sold_count'] for food in foods_with_same_name]) | |
total_sold_money = sum([food['sold_count'] * food['price'] for food in foods_with_same_name]) | |
total_count = len(foods_with_same_name) | |
food_infos.append({ | |
"name": name, | |
"total_sold_count": total_sold_count, | |
"total_sold_money": total_sold_money, | |
"average_sold_count": total_sold_count / total_count, | |
"average_sold_money": total_sold_money / total_count, | |
"average_price": sum([food['price'] for food in foods_with_same_name]) / total_count, | |
"occur_times": total_count, | |
}) | |
# food_infos.sort(key=itemgetter("")) | |
return food_infos | |
def run_eleme(self, city_name, brand_name, ids: str): | |
if not ids: | |
# 使用城市和商家名进行搜素 | |
cid_name = self.get_city_id_and_name(city_name) | |
## TODO: 使用采集到的离线商铺信息进行匹配shops_exists_in_eleme_unique | |
addresses = self.find_possiable_addresses(cid_name, brand_name) | |
shops = self.add_lng_lat_by_address(addresses, brand_name) | |
# 根据坐标获取商铺url | |
shops_exists_in_eleme = self.batch_get_shop_with_url_by_lat_and_lng(shops, brand_name) | |
# 移除重复结果 | |
shops_exists_in_eleme_unique = self.remove_duplicate_shops_eleme(shops_exists_in_eleme) | |
else: | |
# 使用商家名和ids进行搜索 | |
# 根据ids构造shops | |
# 'brand': brand_name, | |
# 'latitude': shop.lat, | |
# 'longitude': shop.lng, | |
# 'id': id, | |
# 'url': 'https://www.ele.me/shop/{id}'.format(id=id) | |
shop_url = 'https://www.ele.me/shop/{id}' | |
shops_exists_in_eleme_unique = [] | |
for id in ids.split(','): | |
shops_exists_in_eleme_unique.append({ | |
'brand': brand_name, | |
'latitude': 0, | |
'longitude': 0, | |
'id': id.strip(), | |
'url': shop_url.format(id=id.strip()) | |
}) | |
# 解析每个页面,将相关信息添加到shop中 | |
self.parse_shops_eleme(shops_exists_in_eleme_unique, brand_name) | |
# 导出结果 | |
filename = ''.join(lazy_pinyin('{current_time}_{location}_{shop}.xls'.format( | |
current_time=time.strftime(DATE_TIME_FORMAT), | |
location=city_name, | |
shop=brand_name | |
))) | |
return [self.export_shop_to_xls_eleme(shops_exists_in_eleme_unique), filename] | |
pass | |
def batch_get_shop_with_url_by_lat_and_lng(self, shops, brand_name): | |
# https://mainsite-restapi.ele.me/shopping/restaurants/search | |
# extras[]:activity | |
# keyword:外婆家 +++++++++++++ | |
# latitude:30.262373 +++++++++++++ | |
# limit:100 | |
# longitude:120.12105 +++++++++++++ | |
# offset:0 | |
# terminal:web | |
# ?extras%5B%5D=activity&keyword=%E5%A4%96%E5%A9%86%E5%AE%B6&latitude=30.262373&limit=100&longitude=120.12105&offset=0&terminal=web | |
# result => restaurant_with_foods->[0,1,2..n]->restaurant->id | |
# url => https://www.ele.me/shop/{id} | |
api = 'https://mainsite-restapi.ele.me/shopping/restaurants/search' | |
query = { | |
'extras[]': 'activity', | |
'limit': 100, | |
'offset': 0, | |
'terminal': 'web', | |
} | |
shops_with_url = [] | |
for shop in shops: | |
query['keyword'] = brand_name | |
query['latitude'] = shop.lat | |
query['longitude'] = shop.lng | |
res = requests.get(api, params=query) | |
res = res.json() | |
ids = set() | |
for restaurant in res['restaurant_with_foods']: | |
ids.add(restaurant['restaurant']['id']) | |
for id in ids: | |
shops_with_url.append({ | |
'brand': brand_name, | |
'latitude': shop.lat, | |
'longitude': shop.lng, | |
'id': id, | |
'url': 'https://www.ele.me/shop/{id}'.format(id=id) | |
}) | |
return shops_with_url | |
pass | |
def remove_duplicate_shops_eleme(self, shops): | |
unique_shops = [] | |
visited_urls = {} | |
for shop in shops: | |
if not visited_urls.get(shop['url']): | |
unique_shops.append(shop) | |
visited_urls[shop['url']] = True | |
return unique_shops | |
pass | |
def parse_shops_eleme(self, shops_exists_in_eleme_unique, brand_name): | |
_session = requests.session() | |
from multiprocessing.dummy import Pool as ThreadPool | |
pool = ThreadPool(8) | |
# TODO: replace with pool | |
for shop in shops_exists_in_eleme_unique: # type: dict | |
# 商家信息:https://mainsite-restapi.ele.me/shopping/restaurant/1387370?extras%5B%5D=activity&extras%5B%5D=license&extras%5B%5D=identification&extras%5B%5D=albums&extras%5B%5D=flavors&latitude=30.262373&longitude=120.12105 | |
# 坐标参数影响 [distance] | |
# NOTE:根据该接口的结果中的latitude和longitude得到其坐标 | |
info_api = 'https://mainsite-restapi.ele.me/shopping/restaurant/{id}'.format(id=shop['id']) | |
query = { | |
'extras[]': [ | |
'activity', | |
'license', | |
'identification', | |
'albums', | |
'flavors', | |
], | |
'latitude': shop.get('latitude', 0), | |
'longitude': shop.get('longitude', 0) | |
} | |
shop_info = _session.get(info_api, params=query).json() | |
shop.update(shop_info) | |
# 商家评价信息: https://mainsite-restapi.ele.me/ugc/v1/restaurants/1387370/rating_scores?latitude=30.262373&longitude=120.12105 | |
# 坐标影响结果中的 compare_rating 字段 | |
rating_api = 'https://mainsite-restapi.ele.me/ugc/v1/restaurants/{id}/rating_scores'.format(id=shop['id']) | |
query = { | |
# 若直接输入店铺ids,则无该信息 | |
'latitude': shop.get('latitude', 0), | |
'longitude': shop.get('longitude', 0) | |
} | |
shop_ratings = _session.get(rating_api, params=query).json() | |
shop.update(shop_ratings) | |
# 菜品信息: https://mainsite-restapi.ele.me/shopping/v2/menu?restaurant_id=1387370 | |
menu_api = 'https://mainsite-restapi.ele.me/shopping/v2/menu' | |
query = { | |
'restaurant_id': shop['id'] | |
} | |
while True: | |
menu = _session.get(menu_api, params=query).json() | |
if type(menu) is list: | |
# 正常情况下返回list | |
break | |
else: | |
# 若返回结果为dict,则说明操作过频繁,休息一会儿 | |
# {'name': 'SERVICE_REJECTED', 'message': '操作太频繁啦,请休息一下再试。'} | |
time.sleep(1) | |
pass | |
shop['menu'] = menu | |
def timer(func, *args, **kwargs): | |
ran_time = timeit.timeit(func, number=1) | |
log.critical('method %s run %s seconds' % (func, ran_time)) | |
ban_cnt = 0 | |
def get_eleme_ids(): | |
global ban_cnt | |
api = 'https://mainsite-restapi.ele.me/shopping/restaurant/{id}?latitude=30.262373&longitude=120.12105' | |
# get start id | |
# 目前观测到的最后一个六位数的id为 580703 | |
start_id = get_start_id_from_file() | |
id = start_id | |
valid_shops = [] | |
start_at = time.time() | |
_session = requests.session() | |
from multiprocessing.dummy import Pool as ThreadPool | |
pool = ThreadPool(8) | |
def _process_for_id(id): | |
global ban_cnt | |
prefix = '[%9d] : ' % id | |
res = _session.get(api.format(id=id)).json() # type: dict | |
# 每次请求相隔0.1s恰好不会被ban, 0.5s屏蔽率为1% | |
# time.sleep(0.05) | |
if res.get('name') == 'RESTAURANT_NOT_FOUND': | |
# log.error(prefix + res.get('message')) | |
# time.sleep(0.05) | |
pass | |
elif res.get('name') == 'SYSTEM_ERROR': | |
# log.error(prefix + res.get('message')) | |
pass | |
elif res.get('name') == 'OpenAPI 测试餐厅': | |
pass | |
elif res.get('name') == 'SERVICE_REJECTED': | |
# log.error(prefix + res.get('message')+' :: 休息0.05s') | |
# add mutex | |
ban_cnt += 1 | |
# time.sleep(0.05) | |
return _process_for_id(id) | |
else: | |
# http://api.map.baidu.com/geocoder/v2/?output=json&ak=Eze6dPlb3bnUrihPNaaKljdUosb4G41B&location=30.271933,120.1195 | |
# 根据坐标添加地理信息 | |
# {"status":0,"result":{"location":{"lng":120.11949999999993,"lat":30.271933048715849},"formatted_address":"浙江省杭州市西湖区西溪路","business":"西溪,西湖,古荡","addressComponent":{"country":"中国","country_code":0,"province":"浙江省","city":"杭州市","district":"西湖区","adcode":"330106","street":"西溪路","street_number":"","direction":"","distance":""},"pois":[],"poiRegions":[],"sematic_description":"秦亭山北123米","cityCode":179}} | |
shop = { | |
'id': id, | |
'name': res.get('name'), | |
'address': res.get('address'), | |
'latitude': res.get('latitude'), | |
'longitude': res.get('longitude'), | |
# 'city_code': _session.get(baidu_api).json()['result']['cityCode'], | |
} | |
log.error(prefix + json.dumps(shop, ensure_ascii=False)) | |
return shop | |
batch_size = 100 | |
batch_data_max_size = 5000 | |
while id <= 999999999: | |
loop_start_at = time.time() | |
last_res_cnt = len(valid_shops) | |
results = list(filter(None, pool.map(_process_for_id, range(id, id + batch_size)))) | |
# results = filter(None, map(_process_for_id, range(id, id+batch_size))) | |
valid_shops += results | |
id += batch_size | |
log_current_status(batch_size, id, last_res_cnt, loop_start_at, start_at, start_id, valid_shops) | |
if last_res_cnt != len(valid_shops): | |
with open('valid_shops_start_at_{start_id}.json'.format(start_id=start_id), 'w', encoding='utf-8') as save: | |
log.error('导出数据ing...') | |
json.dump(valid_shops, save, ensure_ascii=False, indent=2) | |
log.error('导出数据成功') | |
else: | |
# log.error('没有新数据') | |
pass | |
with open('__start_id.json', 'w', encoding='utf-8') as ip_fp: | |
json.dump(id, ip_fp) | |
if len(valid_shops) >= batch_data_max_size: | |
log.critical('本批次有效商铺数已超过%d,开始下一批次以减少json的dump的影响' % batch_data_max_size) | |
start_id = id | |
ban_cnt = 0 | |
valid_shops = [] | |
start_at = time.time() | |
pass | |
log.error('完成抓取') | |
# 合并多次的结果,并添加地理信息 | |
merge_old_files_into_one_and_add_geo_info() | |
def get_start_id_from_file(): | |
try: | |
with open('__start_id.json', 'r', encoding='utf-8') as ip_fp: | |
start_id = json.load(ip_fp) | |
except Exception: | |
start_id = 0 | |
return start_id | |
def log_current_status(batch_size, id, last_res_cnt, loop_start_at, start_at, start_id, valid_shops): | |
total_items_cnt = len(valid_shops) | |
new_items_cnt = total_items_cnt - last_res_cnt | |
new_items_rate = 100 * new_items_cnt / batch_size | |
total_id = id - start_id | |
total_items_rate = 100 * total_items_cnt / total_id | |
loop_used = time.time() - loop_start_at | |
total_used = time.time() - start_at | |
req_uesd_avg = total_used / total_id | |
loop_used_avg = 100 * req_uesd_avg | |
ban_rate = 100 * ban_cnt / total_id | |
log.error( | |
'已连续抓取%d/%d(%.2f%%)个,导出目前已有数据%d/%d(%.2f%%)个,目前id为%d, 起始id为%d, 当前循环耗时%.2f(%.4f)秒,目前总计运行%.2f(%.4f)秒, 屏蔽次数%d/%d(%.2f%%)' % ( | |
new_items_cnt, batch_size, new_items_rate, total_items_cnt, total_id, total_items_rate, id, start_id, | |
loop_used, | |
loop_used_avg, total_used, req_uesd_avg, ban_cnt, total_id, ban_rate)) | |
def merge_old_files_into_one_and_add_geo_info(): | |
import json | |
import os | |
import shutil | |
from functools import reduce | |
shop_filenames = [filename for filename in os.listdir('.') if filename.startswith('valid_shops_start_at_')] | |
def get_records_in_file(file_name): | |
with open(file_name, encoding='utf-8') as shops: | |
return json.load(shops) | |
# 从各个文件中获取数据,并将其合并 | |
items = reduce(lambda l1, l2: l1 + l2, map(get_records_in_file, shop_filenames)) | |
# 过滤掉无效的数据 | |
def is_valid_shop(shop): | |
invalid = shop['name'] is None or shop['longitude'] is None or shop['address'] is None or shop[ | |
'latitude'] is None | |
return not invalid | |
items = filter(is_valid_shop, items) | |
items = list(items) | |
# 过滤掉重复的数据 | |
processed = set() | |
def is_duplicated(shop): | |
if shop['id'] in processed: | |
return False | |
else: | |
processed.add(shop['id']) | |
return True | |
unique_items = filter(is_duplicated, items) | |
unique_items = list(unique_items) | |
duplicated_cnt = len(list(items)) - len(unique_items) | |
print('重复ID个数: ', duplicated_cnt) | |
total_ids = len(unique_items) | |
# total_range = unique_items[-1].get('id', 0) | |
total_range = max(unique_items, key=lambda item: item.get('id', 0)).get('id', 0) | |
rate = total_ids / total_range | |
print('总计: ', total_ids) | |
print('遍历至: ', total_range) | |
print('有效ID比率: ', rate) | |
# 添加城市信息(city_id and formatted_address) | |
print('添加地理信息ing') | |
_session = requests.session() | |
from multiprocessing.dummy import Pool as ThreadPool | |
pool = ThreadPool(8) | |
processed_status = { | |
'ran_cnt': 0, | |
'skipped_cnt': 0, | |
'banned': False, | |
'banned_cnt': 0 | |
} | |
def fetch_cityid_and_formatted_address(item): | |
if 'city_code' in item: | |
processed_status['skipped_cnt'] += 1 | |
elif processed_status['banned']: | |
processed_status['banned_cnt'] += 1 | |
else: | |
api = 'http://api.map.baidu.com/geocoder/v2/' | |
res = _session.get(api, params={ | |
'output': 'json', | |
'ak': 'Eze6dPlb3bnUrihPNaaKljdUosb4G41B', | |
'location': '{latitude},{longitude}'.format(latitude=item['latitude'], longitude=item['longitude']), | |
}).json() | |
if res['status'] != 0: | |
processed_status['banned'] = True | |
processed_status['banned_cnt'] = 1 | |
return fetch_cityid_and_formatted_address(item) | |
pass | |
res = res['result'] | |
item['city_code'] = res['cityCode'] | |
item['formatted_address'] = res['formatted_address'] | |
processed_status['ran_cnt'] += 1 | |
ran_cnt, skipped_cnt, banned_cnt = processed_status['ran_cnt'], processed_status['skipped_cnt'], processed_status['banned_cnt'] | |
cnt = ran_cnt + skipped_cnt | |
print('\rran:%6d, skipped: %6d, banned: %6d/ total: %6d(%.2f%%)' % (ran_cnt, skipped_cnt, banned_cnt, total_ids, 100 * cnt / total_ids), end='', flush=True) | |
pool.map(fetch_cityid_and_formatted_address, unique_items) | |
print() | |
# backup old files | |
# check if backup directory exists | |
import time | |
backup_directory = '_old_files/{backuped_at}'.format(backuped_at=time.strftime('%Y-%m-%d_%H-%M-%S')) | |
if not os.path.exists(backup_directory): | |
os.makedirs(backup_directory) | |
print('将旧的文件移至备份区 @', backup_directory) | |
for old_file in shop_filenames: | |
shutil.move(old_file, os.path.join(backup_directory, old_file)) | |
# merge to one file | |
saved_as = 'valid_shops_start_at_0_to_{max_id}@total_{total}.json'.format(max_id=total_range, total=total_ids) | |
print('合并为一个文件 @ ', saved_as) | |
with open(saved_as, 'w', encoding='utf-8') as save: | |
json.dump(unique_items, save, ensure_ascii=False, indent=2) | |
def main(): | |
# TODO: | |
# 1. 移除Shop等类, 只采用dict | |
# 2. 将导出步骤分离出来 | |
# 3. 规范log | |
# city = input('城市名: ') | |
# brand = input('商家品牌名: ') | |
# ids = input('商家ids:') | |
city = '' | |
brand = '周大虾龙虾盖浇饭' | |
ids = '1468934, 1314141, 1214943, 1215005, 1314147, 1314153, 1314143, 1314146, 1215087, 1214907, 1314151, 1797366, 1357350, 1215053, 1447024, 1215012, 1215019' | |
# | |
crawler = MeituanCrawler() | |
# 将两种整合到一起 | |
wb, saved_file = crawler.run_eleme(city, brand, ids) | |
wb.save(saved_file) | |
if __name__ == '__main__': | |
# FIXME: 美团的./backend/meituan_crawer.py in parse_urls line:354 进行屏蔽(404)休眠后重试 | |
# TODO: 提取类(url_fetcher, page_parser, item_exporter),并设计各类职责,以及类的结构(继承与接口) | |
# TODO: 提取各种设置到单独的类中 | |
# TODO:添加前端接口 | |
# timer(main) | |
timer(get_eleme_ids) | |
# timer(merge_old_files_into_one_and_add_geo_info) | |
pass |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment