Skip to content

Instantly share code, notes, and snippets.

@xuexingdong
Created January 22, 2017 03:01
Show Gist options
  • Save xuexingdong/47baa6ed4b88ac4cf622905293cb9718 to your computer and use it in GitHub Desktop.
Save xuexingdong/47baa6ed4b88ac4cf622905293cb9718 to your computer and use it in GitHub Desktop.
Tmall spider
<?php
/**
* @class Tmall
* @description Tmall crawler
* @author XueXingdong(306947352@qq.com)
*
*/
include_once('Curl.class.php');
include_once('simple_html_dom.class.php');
class Tmall
{
    // URL requested once to obtain the session cookies.
    const URL_COOKIE = 'https://login.taobao.com/jump?target=https://list.tmall.com/search_product.htm';
    // Browser User-Agent; per the original author, requests without one are filtered out as crawler traffic.
    const UA = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0';
    // Keyword search endpoint.
    const URL_SEARCH = 'https://list.tmall.com/search_product.htm';
    // Endpoint that returns the item detail data as JSON (not the page source).
    const URL_DETAIL = 'https://mdskip.taobao.com/core/initItemDetail.htm';
    // Favourite-count endpoint; the item id is appended directly to this URL.
    const URL_COLLECTION = 'https://count.taobao.com/counter3?callback=callback&keys=SM_368_dsr-875041721,ICCP_1_';
    // Deal-record endpoint. The original note also listed this sample URL:
    // https://rate.tmall.com/listTryReport.htm?itemId=26293100988&pageSize=5&currentPage=1&_ksTS=1443832993932_1023&callback=jsonp1024
    const URL_RECORD = 'https://ext-mdskip.taobao.com/extension/dealRecords.htm';
    // Seller rating endpoint.
    const URL_RANK = 'https://ext-mdskip.taobao.com/extension/seller_info.htm';
    // Review list endpoint.
    const URL_COMMENT = 'https://rate.tmall.com/list_detail_rate.htm';
    // Item page URL; fetched for thumbnails and also sent as the Referer header.
    const URL_REFERER = 'https://detail.tmall.com/item.htm';

    // Cookie value sent with every request.
    private $cookie;

    /**
     * Initialise the crawler's cookie.
     *
     * @param mixed $cookie Cookie to reuse; when empty, URL_COOKIE is requested
     *                      and the cookies reported by the Curl helper are used.
     */
    public function __construct($cookie = '')
    {
        if (empty($cookie)) {
            $curl = new Curl();
            $curl->get(self::URL_COOKIE);
            $cookie = $curl->getCookies();
        }
        $this->cookie = $cookie;
    }

    /**
     * Return the cookie in use so the caller can persist it.
     *
     * @return mixed The cookie passed to, or fetched by, the constructor.
     */
    public function getCookie()
    {
        return $this->cookie;
    }

    /**
     * Get the number of result pages for a keyword.
     *
     * @param string $keyword Search keyword.
     * @return int Page count parsed from the result page, or 0 when it cannot be found.
     */
    public function getTotalPage($keyword)
    {
        $param = array(
            'q' => $keyword,
            // Turn off automatic brand selection.
            'search_condition' => 2,
        );
        $content = $this->createCurl()->get(self::URL_SEARCH . '?' . http_build_query($param));
        if (preg_match('/共(\d+)页/', (string) $content, $match)) {
            return (int) $match[1];
        }
        return 0;
    }

    /**
     * Search Tmall by keyword and parse one result page.
     *
     * @param string $keyword   Search keyword.
     * @param int    $page      Page to fetch (sent as "jumpto").
     * @param int    $totalPage Total page count (sent as "totalPage").
     * @return array List of products, each with the keys coverImg, coverPrice,
     *               title, storeName, url, itemId, skuId and sellerId. An id
     *               that cannot be parsed from the product URL is null.
     * @note The original author notes that Tmall shows 60 items per page by default.
     */
    public function search($keyword, $page = 1, $totalPage = 1)
    {
        $param = array(
            'q' => $keyword,
            // Turn off automatic brand selection.
            'search_condition' => 2,
            'jumpto' => $page,
            'totalPage' => $totalPage,
        );
        $content = $this->createCurl()->get(self::URL_SEARCH . '?' . http_build_query($param));

        $html = new simple_html_dom();
        $html->load($content);
        $result = array();
        foreach ($html->find('div[class=product]') as $p) {
            // Some entries are collections of items rather than a single product; skip them.
            if (!array_key_exists('data-id', $p->attr)) {
                continue;
            }
            $product = array();

            // Cover image: use src when set, otherwise the lazy-load attribute.
            $img = $p->find('div[class=productImg-wrap] a[class=productImg] img', 0);
            $product['coverImg'] = '';
            if ($img) {
                if ($img->src) {
                    $product['coverImg'] = 'https:' . $img->src;
                } elseif (isset($img->attr['data-ks-lazyload'])) {
                    $product['coverImg'] = 'https:' . $img->attr['data-ks-lazyload'];
                }
            }

            // Every lookup below may return null if the markup differs, so go through nodeValue().
            $priceNode = $p->find('p[class=productPrice] em', 0);
            $product['coverPrice'] = (float) self::nodeValue($priceNode, 'title');

            $titleNode = $p->find('p[class=productTitle] a', 0);
            $product['title'] = html_entity_decode((string) self::nodeValue($titleNode, 'title'), ENT_QUOTES);

            $shopNode = $p->find('a[class=productShop-name]', 0);
            $product['storeName'] = trim(strip_tags((string) self::nodeValue($shopNode, 'innertext')));

            // Detail page URL.
            $linkNode = $p->find('div[class=productImg-wrap] a', 0);
            $product['url'] = 'https:' . html_entity_decode((string) self::nodeValue($linkNode, 'href'), ENT_QUOTES);

            // Ids carried in the detail URL. The skuId is later used as a key
            // into the detail JSON to look up price and stock.
            $product['itemId'] = self::captureFirst('/id=(\d+)&?/', $product['url']);
            $product['skuId'] = self::captureFirst('/skuId=(\d+)&?/', $product['url']);
            $product['sellerId'] = self::captureFirst('/user_id=(\d+)&?/', $product['url']);

            $result[] = $product;
        }
        $html->clear();
        return $result;
    }

    /**
     * Collect detailed information about one product.
     *
     * Requests, in order: the item page (thumbnails), the favourite counter,
     * the detail JSON (price, stock), the seller rating page, then the
     * reviews and the deal records.
     *
     * @param string $itemId   Item id.
     * @param string $skuId    SKU id; key used for price/stock lookups in the detail JSON.
     * @param string $sellerId Seller id.
     * @return array Product information, or an empty array when the detail
     *               JSON is missing, malformed or not flagged as successful.
     */
    public function getProductInfo($itemId, $skuId, $sellerId)
    {
        $product = array(
            'itemId' => $itemId,
            'skuId' => $skuId,
            'sellerId' => $sellerId,
        );
        $url = $this->buildItemUrl($itemId, $skuId, $sellerId);
        $curl = $this->createCurl($url);

        // Item page source: thumbnails.
        $html = new simple_html_dom();
        $html->load($curl->get($url));
        $product['trumbs'] = array();
        foreach ($html->find('ul[id=J_UlThumb] a img') as $thumb) {
            $product['trumbs'][] = 'https:' . $thumb->src;
        }
        $html->clear();

        // Favourite count.
        $content = $curl->get(self::URL_COLLECTION . $itemId);
        if (preg_match('/\"ICCP_1_\d+\":(\d+),/', (string) $content, $match)) {
            $product['collection'] = (int) $match[1];
        }

        // Everything else comes from the JSON the detail page loads dynamically.
        $detail = json_decode((string) $curl->get(self::URL_DETAIL . '?itemId=' . $itemId), true);
        // json_decode() yields null for a non-JSON body; treat that like an unsuccessful response.
        if (!is_array($detail) || !array_key_exists('isSuccess', $detail) || $detail['isSuccess'] !== true) {
            return array();
        }
        $model = (isset($detail['defaultModel']) && is_array($detail['defaultModel']))
            ? $detail['defaultModel']
            : array();
        $skuKey = (string) $skuId;

        // Price: use the SKU-specific entry when there is one, else the "def" entry.
        $priceTable = isset($model['itemPriceResultDO']['priceInfo']) ? $model['itemPriceResultDO']['priceInfo'] : null;
        $priceInfo = null;
        if (is_array($priceTable)) {
            if (isset($priceTable[$skuKey])) {
                $priceInfo = $priceTable[$skuKey];
            } elseif (isset($priceTable['def'])) {
                $priceInfo = $priceTable['def'];
            }
        }
        if (is_array($priceInfo)) {
            if (isset($priceInfo['price'])) {
                // List price.
                $product['price'] = (float) $priceInfo['price'];
            }
            if (isset($priceInfo['promotionList'][0]['price'])) {
                // Promotional price (first promotion entry).
                $product['promotePrice'] = (float) $priceInfo['promotionList'][0]['price'];
            }
        }

        // Stock: per-SKU quantity when listed for this SKU, otherwise the item's total quantity.
        $inventory = (isset($model['inventoryDO']) && is_array($model['inventoryDO']))
            ? $model['inventoryDO']
            : array();
        if (isset($inventory['skuQuantity'][$skuKey]['quantity'])) {
            $product['stock'] = (int) $inventory['skuQuantity'][$skuKey]['quantity'];
        } else {
            $product['stock'] = isset($inventory['icTotalQuantity']) ? (int) $inventory['icTotalQuantity'] : 0;
        }

        // Seller ratings: the three rating blocks are read by position.
        $content = $curl->get(self::URL_RANK . '?' . http_build_query(array('user_num_id' => $sellerId)));
        $html->load($content);
        $lis = $html->find('li[class=J_RateInfoTrigger]');
        $product['rank'] = array();
        // match = item as described, service = service attitude, transform = logistics.
        foreach (array('match', 'service', 'transform') as $index => $name) {
            if (isset($lis[$index])) {
                $product['rank'][$name] = self::parseRank($lis[$index]);
            }
        }
        $html->clear();

        // Reviews.
        $product['comments'] = $this->getComment($itemId, $skuId, $sellerId);
        // Deal records.
        $product['records'] = $this->getRecord($itemId, $skuId, $sellerId);
        return $product;
    }

    /**
     * Fetch up to ten pages of reviews for an item.
     *
     * @param string $itemId   Item id.
     * @param string $skuId    SKU id (accepted for signature symmetry; not sent in the request).
     * @param string $sellerId Seller id.
     * @return array List of reviews; each has id, content, time and credit, plus
     *               pics and append (follow-up review) when present.
     */
    public function getComment($itemId, $skuId, $sellerId)
    {
        $param = array(
            'callback' => 'callback',
            // Only reviews that have text content.
            'content' => 1,
            // Order by time.
            'order' => 1,
            'itemId' => $itemId,
            'sellerId' => $sellerId,
        );
        $curl = $this->createCurl();
        $comments = array();
        for ($page = 1; $page <= 10; $page++) {
            $param['currentPage'] = $page;
            $content = $curl->get(self::URL_COMMENT . '?' . http_build_query($param));
            // The payload is JSONP: callback({...}). Stop when it is missing.
            if (!preg_match('/callback\(({.*})\)/', (string) $content, $match)) {
                break;
            }
            $commentsInfo = json_decode($match[1], true);
            // An empty (or undecodable) page means there are no more reviews.
            if (empty($commentsInfo['rateDetail']['rateList']) || !is_array($commentsInfo['rateDetail']['rateList'])) {
                break;
            }
            foreach ($commentsInfo['rateDetail']['rateList'] as $commentInfo) {
                $comments[] = self::formatComment($commentInfo);
            }
        }
        return $comments;
    }

    /**
     * Fetch up to ten pages (ten rows requested per page) of deal records.
     *
     * @param string $itemId   Item id.
     * @param string $skuId    SKU id; only used to build the Referer URL.
     * @param string $sellerId Seller id.
     * @return array List of records, each with min_credit, max_credit, num and time.
     */
    public function getRecord($itemId, $skuId, $sellerId)
    {
        $curl = $this->createCurl($this->buildItemUrl($itemId, $skuId, $sellerId));
        $param = array(
            // Per the original author: a callback (any string) must be set or no result is returned.
            'callback' => 'callback',
            // Page size; the original author notes the maximum is 15 per page.
            'pageSize' => 10,
            // Per the original author: without start/end times the service answers "system busy".
            'starts' => time(),
            'ends' => time(),
            'itemId' => $itemId,
            'seller_num_id' => $sellerId,
        );
        $records = array();
        for ($page = 1; $page <= 10; $page++) {
            $param['bidPage'] = $page;
            $content = $curl->get(self::URL_RECORD . '?' . http_build_query($param));
            // Un-escape the response, then pull out the embedded HTML fragment.
            if (!preg_match('/html:\"(.*)\",type/', stripcslashes((string) $content), $match)) {
                // No HTML payload on this page: nothing to parse, try the next page.
                continue;
            }
            $html = new simple_html_dom();
            $html->load($match[1]);
            $rows = $html->find('table[class=table-deal-record] tbody tr');
            $rowCount = count($rows);
            // Row 0 is skipped (assumed to be the table header), so data starts at index 1.
            for ($j = 1; $j < $rowCount; $j++) {
                $row = $rows[$j];
                $record = array('min_credit' => 0, 'max_credit' => 0);
                // The buyer credit range is only present when the rank image is.
                $rankImg = $row->find('td[class=cell-align-l] img[class=rank]', 0);
                if ($rankImg && preg_match('/(\d+)-(\d+)个买家信用积分/', (string) $rankImg->title, $credit)) {
                    $record['min_credit'] = (int) $credit[1];
                    $record['max_credit'] = (int) $credit[2];
                }
                $record['num'] = (int) self::nodeValue($row->find('td[class=quantity]', 0), 'plaintext');
                $record['time'] = strtotime((string) self::nodeValue($row->find('td[class=dealtime]', 0), 'plaintext'));
                $records[] = $record;
            }
            // Decide whether this is the last page before clearing, so the DOM is always released.
            $isLastPage = (bool) $html->find('span[class=page-end]');
            $html->clear();
            if ($isLastPage) {
                break;
            }
        }
        return $records;
    }

    /**
     * Create a Curl helper configured with the crawler's UA, cookie and optional Referer.
     */
    private function createCurl($referer = null)
    {
        $curl = new Curl();
        $curl->userAgent(self::UA);
        $curl->setCookies($this->cookie);
        if ($referer !== null) {
            $curl->referer($referer);
        }
        return $curl;
    }

    /**
     * Build the item page URL (also used as the Referer) for the given ids.
     */
    private function buildItemUrl($itemId, $skuId, $sellerId)
    {
        $query = array(
            'id' => $itemId,
            'skuId' => $skuId,
            'sellerId' => $sellerId,
        );
        return self::URL_REFERER . '?' . http_build_query($query);
    }

    /**
     * Parse one rating block into its total score and the 1..5 star head counts.
     */
    private static function parseRank($li)
    {
        $rank = array(
            'total' => (float) self::nodeValue($li->find('div[class=total] em', 0), 'title'),
        );
        for ($i = 1; $i <= 5; $i++) {
            $node = $li->find('div[class=count' . $i . '] span[class=people-no]', 0);
            $text = (string) self::nodeValue($node, 'plaintext');
            $rank[$i] = preg_match('/.*\((\d+)人\).*/', $text, $match) ? (int) $match[1] : 0;
        }
        return $rank;
    }

    /**
     * Convert one raw review entry from the review JSON into the crawler's output shape.
     */
    private static function formatComment($info)
    {
        $comment = array(
            'id' => self::pick($info, 'id'),
            'content' => html_entity_decode((string) self::pick($info, 'rateContent', ''), ENT_QUOTES),
            'time' => strtotime((string) self::pick($info, 'rateDate', '')),
            // Buyer credit score shown next to the review.
            'credit' => (int) self::pick($info, 'displayRateSum', 0),
        );
        $pics = self::prefixPics(self::pick($info, 'pics'));
        if ($pics) {
            $comment['pics'] = $pics;
        }
        // Follow-up review, if any.
        $append = self::pick($info, 'appendComment');
        if (!empty($append) && is_array($append)) {
            $comment['append'] = array(
                'commentId' => self::pick($append, 'commentId'),
                'content' => html_entity_decode((string) self::pick($append, 'content', ''), ENT_QUOTES),
                // Number of days after the original review.
                'days' => (int) self::pick($append, 'days', 0),
                'time' => strtotime((string) self::pick($append, 'commentTime', '')),
            );
            $appendPics = self::prefixPics(self::pick($append, 'pics'));
            if ($appendPics) {
                $comment['append']['pics'] = $appendPics;
            }
        }
        return $comment;
    }

    /**
     * Prefix each protocol-relative picture URL with "https:"; non-array or empty input gives an empty array.
     */
    private static function prefixPics($pics)
    {
        $urls = array();
        if (is_array($pics)) {
            foreach ($pics as $pic) {
                $urls[] = 'https:' . $pic;
            }
        }
        return $urls;
    }

    /**
     * Return $array[$key] when it is set, otherwise $default.
     */
    private static function pick($array, $key, $default = null)
    {
        return (is_array($array) && isset($array[$key])) ? $array[$key] : $default;
    }

    /**
     * Read a property of a DOM node, or return an empty string when the node was not found.
     */
    private static function nodeValue($node, $property)
    {
        return $node ? $node->$property : '';
    }

    /**
     * Return the first capture group of $pattern in $subject, or null when there is no match.
     */
    private static function captureFirst($pattern, $subject)
    {
        return preg_match($pattern, (string) $subject, $match) ? $match[1] : null;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment