Created
January 22, 2017 03:01
-
-
Save xuexingdong/47baa6ed4b88ac4cf622905293cb9718 to your computer and use it in GitHub Desktop.
Tmall spider
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Class Tmall
 * Tmall (天猫) web spider.
 * @author XueXingdong(306947352@qq.com)
 *
 */
include_once('Curl.class.php'); | |
include_once('simple_html_dom.class.php'); | |
/**
 * Spider for Tmall: keyword search, product details, ratings, reviews and
 * trade records.
 *
 * All HTTP traffic goes through the project's Curl wrapper and all HTML is
 * parsed with simple_html_dom; both are loaded by the including file.
 */
class Tmall {
    // URL requested once to obtain the session cookie
    const URL_COOKIE = 'https://login.taobao.com/jump?target=https://list.tmall.com/search_product.htm';
    // Browser User-Agent; per the original author, requests without one are filtered out as crawlers
    const UA = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0';
    // Keyword search endpoint
    const URL_SEARCH = 'https://list.tmall.com/search_product.htm';
    // Endpoint that supplies the product detail data as JSON (not the page source)
    const URL_DETAIL = 'https://mdskip.taobao.com/core/initItemDetail.htm';
    // Favorites counter endpoint; the item id is appended to the last key
    const URL_COLLECTION = 'https://count.taobao.com/counter3?callback=callback&keys=SM_368_dsr-875041721,ICCP_1_';
    // Trade record endpoint. Related example noted by the original author:
    // https://rate.tmall.com/listTryReport.htm?itemId=26293100988&pageSize=5&currentPage=1&_ksTS=1443832993932_1023&callback=jsonp1024
    const URL_RECORD = 'https://ext-mdskip.taobao.com/extension/dealRecords.htm';
    // Seller rating endpoint
    const URL_RANK = 'https://ext-mdskip.taobao.com/extension/seller_info.htm';
    // Review list endpoint
    const URL_COMMENT = 'https://rate.tmall.com/list_detail_rate.htm';
    // Product detail page, also sent as the Referer header
    const URL_REFERER = 'https://detail.tmall.com/item.htm';

    // Cookie sent with every request
    private $cookie;

    /**
     * Initializes the cookie.
     *
     * @param mixed $cookie Previously saved cookie; when empty, a fresh one is
     *                      fetched from URL_COOKIE.
     */
    public function __construct($cookie = '') {
        if (empty($cookie)) {
            $curl = new Curl();
            $curl->get(self::URL_COOKIE);
            $this->cookie = $curl->getCookies();
        }
        else {
            $this->cookie = $cookie;
        }
    }

    /**
     * Returns the cookie so the caller can persist it.
     *
     * @return mixed The cookie passed to, or fetched by, the constructor.
     */
    public function getCookie(){
        return $this->cookie;
    }

    /**
     * Gets the total number of result pages for a keyword.
     *
     * @param string $keyword Search keyword.
     * @return int Page count, or 0 when it cannot be found in the response.
     */
    public function getTotalPage($keyword) {
        $param = array();
        $param['q'] = $keyword;
        // Disable automatic brand selection
        $param['search_condition'] = 2;
        $curl = $this->createCurl();
        $content = $curl->get(self::URL_SEARCH.'?'.http_build_query($param));
        $total = $this->capture('/共(\d+)页/', $content);
        return $total === null ? 0 : (int)$total;
    }

    /**
     * Searches Tmall by keyword.
     *
     * @param string $keyword   Search keyword.
     * @param int    $page      Page number to fetch.
     * @param int    $totalPage Total page count (sent as the totalPage parameter).
     * @return array List of products; each has coverImg, coverPrice, title,
     *               storeName, url, itemId, skuId and sellerId. The three ids
     *               are null when they are absent from the product URL.
     * @note Per the original author, Tmall returns 60 items per page by default.
     */
    public function search($keyword, $page = 1,$totalPage = 1) {
        $param = array();
        $param['q'] = $keyword;
        // Disable automatic brand selection
        $param['search_condition'] = 2;
        // Page to jump to
        $param['jumpto'] = $page;
        $param['totalPage'] = $totalPage;
        $curl = $this->createCurl();
        $content = $curl->get(self::URL_SEARCH.'?'.http_build_query($param));
        $result = array();
        // A failed request yields nothing to parse
        if (!is_string($content) || $content === '') {
            return $result;
        }
        $html = new simple_html_dom();
        $html->load($content);
        foreach ($html->find('div[class=product]') as $p) {
            // Some entries are product collections rather than products; skip them
            if (!array_key_exists('data-id', $p->attr)) {
                continue;
            }
            $product = array();
            // Cover image: prefer src, fall back to the lazy-load attribute
            $img = $p->find('div[class=productImg-wrap] a[class=productImg] img',0);
            $src = '';
            if ($img) {
                if ($img->src) {
                    $src = $img->src;
                }
                elseif (isset($img->attr['data-ks-lazyload'])) {
                    $src = $img->attr['data-ks-lazyload'];
                }
            }
            $product['coverImg'] = $this->absoluteUrl($src);
            // Price shown on the listing
            $priceNode = $p->find('p[class=productPrice] em',0);
            $product['coverPrice'] = $priceNode ? (float)$priceNode->title : 0.0;
            // Title
            $titleNode = $p->find('p[class=productTitle] a',0);
            $product['title'] = html_entity_decode($titleNode ? (string)$titleNode->title : '', ENT_QUOTES);
            // Store name
            $shopNode = $p->find('a[class=productShop-name]',0);
            $product['storeName'] = $shopNode ? trim(strip_tags($shopNode->innertext)) : '';
            // Detail page URL
            $linkNode = $p->find('div[class=productImg-wrap] a',0);
            $href = $linkNode ? html_entity_decode((string)$linkNode->href, ENT_QUOTES) : '';
            $product['url'] = $this->absoluteUrl($href);
            // Ids are query parameters of the detail URL. The patterns are
            // anchored on ?/& so that "id=" cannot match inside "user_id=".
            $product['itemId'] = $this->capture('/[?&]id=(\d+)/', $product['url']);
            // skuId is also a key in the detail JSON used to look up the price
            $product['skuId'] = $this->capture('/[?&]skuId=(\d+)/', $product['url']);
            $product['sellerId'] = $this->capture('/[?&]user_id=(\d+)/', $product['url']);
            $result[] = $product;
        }
        $html->clear();
        return $result;
    }

    /**
     * Gets detailed information about one product.
     *
     * @param string $itemId   Product id.
     * @param string $skuId    SKU (product variant) id.
     * @param string $sellerId Seller id.
     * @return array Product information, or an empty array when the detail
     *               JSON cannot be fetched or reports failure. Keys: itemId,
     *               skuId, sellerId, trumbs, collection (only when found),
     *               price, promotePrice (only when a promotion price exists),
     *               stock, rank, comments, records.
     */
    public function getProductInfo($itemId, $skuId, $sellerId) {
        $product = array();
        $product['itemId'] = $itemId;
        $product['skuId'] = $skuId;
        $product['sellerId'] = $sellerId;
        // The detail page URL doubles as the Referer for every later request
        $url = $this->buildDetailUrl($itemId, $skuId, $sellerId);
        $curl = $this->createCurl($url);
        // Fetch the page source first for the gallery thumbnails
        $content = $curl->get($url);
        $product['trumbs'] = array();
        if (is_string($content) && $content !== '') {
            $html = new simple_html_dom();
            $html->load($content);
            foreach ($html->find('ul[id=J_UlThumb] a img') as $trumb) {
                $product['trumbs'][] = $this->absoluteUrl($trumb->src);
            }
            $html->clear();
        }
        // Favorites count. No trailing comma is required so the counter is
        // found even when it is the last key of the JSON object.
        $content = $curl->get(self::URL_COLLECTION.$itemId);
        $collection = $this->capture('/"ICCP_1_\d+":(\d+)/', $content);
        if ($collection !== null) {
            $product['collection'] = (int)$collection;
        }
        // Everything else comes from the JSON the detail page loads dynamically
        $content = $curl->get(self::URL_DETAIL.'?itemId='.$itemId);
        $productDetail = is_string($content) ? json_decode($content, true) : null;
        // json_decode yields null on invalid input, so check the type first
        if (!is_array($productDetail) || !isset($productDetail['isSuccess']) || $productDetail['isSuccess'] !== true) {
            return array();
        }
        $model = array();
        if (isset($productDetail['defaultModel']) && is_array($productDetail['defaultModel'])) {
            $model = $productDetail['defaultModel'];
        }
        $skuKey = (is_string($skuId) || is_int($skuId)) ? (string)$skuId : '';
        // Price
        $priceInfo = $this->selectPriceInfo($model, $skuKey);
        $product['price'] = isset($priceInfo['price']) ? (float)$priceInfo['price'] : 0.0;
        // Promotion price, if any
        if (isset($priceInfo['promotionList'][0]['price'])) {
            $product['promotePrice'] = (float)$priceInfo['promotionList'][0]['price'];
        }
        // Stock: an empty skuQuantity means there are no selectable variants,
        // so the total quantity is used directly
        $inventory = array();
        if (isset($model['inventoryDO']) && is_array($model['inventoryDO'])) {
            $inventory = $model['inventoryDO'];
        }
        if (empty($inventory['skuQuantity'])) {
            $product['stock'] = isset($inventory['icTotalQuantity']) ? (int)$inventory['icTotalQuantity'] : 0;
        }
        else {
            $product['stock'] = isset($inventory['skuQuantity'][$skuKey]['quantity'])
                ? (int)$inventory['skuQuantity'][$skuKey]['quantity']
                : 0;
        }
        // Seller ratings, looked up by seller id
        $content = $curl->get(self::URL_RANK.'?'.http_build_query(array('user_num_id' => $sellerId)));
        $lis = array();
        $rankHtml = null;
        if (is_string($content) && $content !== '') {
            $rankHtml = new simple_html_dom();
            $rankHtml->load($content);
            $lis = $rankHtml->find('li[class=J_RateInfoTrigger]');
        }
        // List items in order: matches description, service attitude, logistics.
        // A missing item produces an all-zero entry instead of a fatal error.
        $product['rank'] = array();
        foreach (array('match', 'service', 'transform') as $index => $name) {
            $product['rank'][$name] = $this->parseRank(isset($lis[$index]) ? $lis[$index] : null);
        }
        if ($rankHtml !== null) {
            $rankHtml->clear();
        }
        // Reviews
        $product['comments'] = $this->getComment($itemId, $skuId, $sellerId);
        // Trade records
        $product['records'] = $this->getRecord($itemId, $skuId, $sellerId);
        return $product;
    }

    /**
     * Gets the reviews of a product.
     *
     * @param string $itemId   Product id.
     * @param string $skuId    SKU id (not used by this request).
     * @param string $sellerId Seller id.
     * @param int    $maxPage  Maximum number of review pages to fetch.
     * @return array List of reviews; each has id, content, time, credit and
     *               optionally pics and append (the follow-up review).
     */
    public function getComment($itemId, $skuId, $sellerId, $maxPage = 10) {
        $comments = array();
        $param = array(
            'callback' => 'callback',
            // Only reviews that have content
            'content' => 1,
            // Ordered by time
            'order' => 1,
            'itemId' => $itemId,
            'sellerId' => $sellerId,
        );
        $curl = $this->createCurl();
        for ($page = 1; $page <= $maxPage; $page++) {
            $param['currentPage'] = $page;
            $content = $curl->get(self::URL_COMMENT.'?'.http_build_query($param));
            // Unwrap the JSONP payload
            $json = $this->capture('/callback\(({.*})\)/', $content);
            $commentsInfo = $json === null ? null : json_decode($json, true);
            // Stop at the first page that is unparseable or has no reviews
            if (!is_array($commentsInfo)
                || empty($commentsInfo['rateDetail']['rateList'])
                || !is_array($commentsInfo['rateDetail']['rateList'])) {
                break;
            }
            foreach ($commentsInfo['rateDetail']['rateList'] as $commentInfo) {
                if (!is_array($commentInfo)) {
                    continue;
                }
                $comments[] = $this->buildComment($commentInfo);
            }
        }
        return $comments;
    }

    /**
     * Gets the trade records of a product.
     *
     * @param string $itemId   Product id.
     * @param string $skuId    SKU id (only used to build the Referer).
     * @param string $sellerId Seller id.
     * @param int    $maxPage  Maximum number of record pages to fetch.
     * @return array List of records; each has min_credit, max_credit, num and time.
     */
    public function getRecord($itemId, $skuId, $sellerId, $maxPage = 10) {
        $curl = $this->createCurl($this->buildDetailUrl($itemId, $skuId, $sellerId));
        $now = time();
        $param = array(
            // Per the original author a callback must be set (any string
            // works), otherwise no result is returned
            'callback' => 'callback',
            // Page size
            'pageSize' => 10,
            // Per the original author, omitting the time range makes the
            // server answer "system busy", although the values have no effect
            'starts' => $now,
            'ends' => $now,
            'itemId' => $itemId,
            'seller_num_id' => $sellerId,
        );
        $records = array();
        for ($page = 1; $page <= $maxPage; $page++) {
            $param['bidPage'] = $page;
            $content = $curl->get(self::URL_RECORD.'?'.http_build_query($param));
            // The HTML fragment is embedded as an escaped string; unescape it first
            $fragment = is_string($content)
                ? $this->capture('/html:\"(.*)\",type/', stripcslashes($content))
                : null;
            if ($fragment === null) {
                // Nothing parseable on this page; try the next one
                continue;
            }
            $html = new simple_html_dom();
            $html->load($fragment);
            $trs = $html->find('table[class=table-deal-record] tbody tr');
            $count = count($trs);
            // Row 0 is skipped, as in the original code (presumably a header row)
            for ($j = 1; $j < $count; $j++) {
                $records[] = $this->parseRecordRow($trs[$j]);
            }
            // Decide before clearing so the DOM is released on the last page too
            $isLastPage = (bool)$html->find('span[class=page-end]');
            $html->clear();
            if ($isLastPage) {
                break;
            }
        }
        return $records;
    }

    /**
     * Creates a Curl instance with the User-Agent, cookie and optional Referer set.
     *
     * @param string $referer Referer URL; skipped when empty.
     * @return Curl
     */
    private function createCurl($referer = '') {
        $curl = new Curl();
        $curl->userAgent(self::UA);
        $curl->setCookies($this->cookie);
        if ($referer !== '') {
            $curl->referer($referer);
        }
        return $curl;
    }

    /**
     * Builds the product detail page URL, which is also used as the Referer.
     */
    private function buildDetailUrl($itemId, $skuId, $sellerId) {
        $query = array(
            'id' => $itemId,
            'skuId' => $skuId,
            'sellerId' => $sellerId,
        );
        return self::URL_REFERER.'?'.http_build_query($query);
    }

    /**
     * Returns the first capture group of $pattern in $subject, or null when
     * the subject is not a non-empty string or the pattern does not match.
     */
    private function capture($pattern, $subject) {
        if (!is_string($subject) || $subject === '') {
            return null;
        }
        if (preg_match($pattern, $subject, $match) && isset($match[1])) {
            return $match[1];
        }
        return null;
    }

    /**
     * Adds the https: scheme to a protocol-relative URL ("//host/path").
     * Any other value is returned unchanged (cast to string), so URLs that
     * already carry a scheme are not corrupted.
     */
    private function absoluteUrl($url) {
        $url = (string)$url;
        if (strncmp($url, '//', 2) === 0) {
            return 'https:'.$url;
        }
        return $url;
    }

    /**
     * Picks the price entry for a SKU from the detail model, falling back to
     * the "def" entry; returns an empty array when neither is present.
     */
    private function selectPriceInfo(array $model, $skuKey) {
        if (!isset($model['itemPriceResultDO']['priceInfo']) || !is_array($model['itemPriceResultDO']['priceInfo'])) {
            return array();
        }
        $all = $model['itemPriceResultDO']['priceInfo'];
        if ($skuKey !== '' && isset($all[$skuKey]) && is_array($all[$skuKey])) {
            return $all[$skuKey];
        }
        if (isset($all['def']) && is_array($all['def'])) {
            return $all['def'];
        }
        return array();
    }

    /**
     * Parses one rating list item into array('total' => float, 1..5 => int).
     * Values that cannot be found stay at zero.
     */
    private function parseRank($li) {
        $rank = array('total' => 0.0);
        for ($i = 1; $i <= 5; $i++) {
            $rank[$i] = 0;
        }
        if (!$li) {
            return $rank;
        }
        $total = $li->find('div[class=total] em',0);
        if ($total) {
            $rank['total'] = (float)$total->title;
        }
        for ($i = 1; $i <= 5; $i++) {
            $people = $li->find('div[class=count'.$i.'] span[class=people-no]',0);
            if (!$people) {
                continue;
            }
            $count = $this->capture('/.*\((\d+)人\).*/', $people->plaintext);
            if ($count !== null) {
                $rank[$i] = (int)$count;
            }
        }
        return $rank;
    }

    /**
     * Converts one decoded rateList entry into the review structure returned
     * by getComment().
     */
    private function buildComment(array $commentInfo) {
        $comment = array();
        $comment['id'] = isset($commentInfo['id']) ? $commentInfo['id'] : null;
        $comment['content'] = html_entity_decode(
            isset($commentInfo['rateContent']) ? (string)$commentInfo['rateContent'] : '',
            ENT_QUOTES
        );
        $comment['time'] = isset($commentInfo['rateDate']) ? strtotime((string)$commentInfo['rateDate']) : false;
        // Buyer credit score of the reviewer
        $comment['credit'] = isset($commentInfo['displayRateSum']) ? (int)$commentInfo['displayRateSum'] : 0;
        // Pictures attached to the review
        if (!empty($commentInfo['pics']) && is_array($commentInfo['pics'])) {
            $comment['pics'] = $this->absoluteUrls($commentInfo['pics']);
        }
        // Follow-up review
        if (!empty($commentInfo['appendComment']) && is_array($commentInfo['appendComment'])) {
            $append = $commentInfo['appendComment'];
            $comment['append'] = array();
            $comment['append']['commentId'] = isset($append['commentId']) ? $append['commentId'] : null;
            $comment['append']['content'] = html_entity_decode(
                isset($append['content']) ? (string)$append['content'] : '',
                ENT_QUOTES
            );
            // Number of days after the purchase the follow-up was written
            $comment['append']['days'] = isset($append['days']) ? (int)$append['days'] : 0;
            $comment['append']['time'] = isset($append['commentTime']) ? strtotime((string)$append['commentTime']) : false;
            // Pictures attached to the follow-up
            if (!empty($append['pics']) && is_array($append['pics'])) {
                $comment['append']['pics'] = $this->absoluteUrls($append['pics']);
            }
        }
        return $comment;
    }

    /**
     * Applies absoluteUrl() to every element of a list.
     */
    private function absoluteUrls(array $urls) {
        $result = array();
        foreach ($urls as $url) {
            $result[] = $this->absoluteUrl($url);
        }
        return $result;
    }

    /**
     * Parses one trade record table row.
     */
    private function parseRecordRow($tr) {
        // Buyers without a credit icon get a 0-0 range
        $record = array('min_credit' => 0, 'max_credit' => 0);
        $rankImg = $tr->find('td[class=cell-align-l] img[class=rank]',0);
        if ($rankImg && preg_match('/(\d+)-(\d+)个买家信用积分/', (string)$rankImg->title, $match)) {
            // Cast so both branches yield integers
            $record['min_credit'] = (int)$match[1];
            $record['max_credit'] = (int)$match[2];
        }
        $quantity = $tr->find('td[class=quantity]',0);
        $record['num'] = $quantity ? (int)$quantity->plaintext : 0;
        $dealTime = $tr->find('td[class=dealtime]',0);
        $record['time'] = $dealTime ? strtotime(trim($dealTime->plaintext)) : false;
        return $record;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment