Created
December 28, 2012 10:46
-
-
Save mimosz/4396767 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch one page of an item's purchase records.
#
# Configures +crawler+ for the buyer-list endpoint, restricts the query to
# yesterday (bounds are epoch seconds multiplied by 1000), executes the
# request, gunzips the response, transcodes it from GB18030 to UTF-8 and
# parses it with Nokogiri.
#
# @param crawler   object exposing +url=+, +params=+, +headers=+ and +request.execute+
# @param item_id   item identifier, sent as +item_id+ and embedded in the Referer URL
# @param seller_id seller identifier, sent as +seller_num_id+
# @param item_sbn  token sent as +sbn+
# @param page      page number, sent as +bid_page+
# @return the parsed document, or nil when the request yields no body
def get_trades(crawler, item_id, seller_id, item_sbn, page = 1)
  window_start = Date.yesterday.beginning_of_day.to_i * 1000
  window_end   = Date.yesterday.end_of_day.to_i * 1000

  # Buyer list endpoint
  crawler.url = 'http://tbskip.taobao.com/json/show_buyer_list.htm'
  query = {
    bid_page: page,
    ends: window_end,
    starts: window_start,
    item_id: item_id,
    seller_num_id: seller_id,
    sbn: item_sbn
  }
  crawler.params = query

  # The item detail page is sent as the Referer header.
  referer = "http://detail.tmall.com/auction/item_detail.htm?item_num_id=#{item_id}&show_review=1&tbpm=1"
  crawler.headers = { 'Host' => 'tbskip.taobao.com', 'Referer' => referer }

  raw = crawler.request.execute
  return unless raw

  unzipped = Zlib::GzipReader.new(StringIO.new(raw)).read
  html = unzipped.force_encoding('GB18030').encode('UTF-8')
  Nokogiri::HTML(html)
end
# Extract the item's SBN token, which is needed to request purchase records.
#
# Looks up the +button#J_listBuyerOnView+ node, reads its custom
# +detail:params+ attribute and captures the text between +sbn=+ and
# +,showBuyerList+.
#
# A warning is logged whenever the token cannot be extracted: node missing,
# attribute missing, or pattern not matched.
#
# @param page_dom document-like object responding to +at(selector)+
# @return [String, nil] the SBN token, or nil when it cannot be parsed
def parse_sbn(page_dom)
  button = page_dom.at('button#J_listBuyerOnView') # locate the node
  params = button && button['detail:params']       # Taobao custom attribute
  matched = params && params.match(/sbn=(.+)\,showBuyerList/)
  return matched[1] if matched

  logger.warn "无法解析,宝贝购买记录"
  nil
end
# Split a trade's property string into a hash.
#
# The input is a +;+-separated list of +name:value+ pairs. Only the first
# +:+ in each pair separates name from value, so values that themselves
# contain a colon are kept intact. Empty segments (e.g. from a trailing or
# doubled +;+) are skipped, and nil input yields an empty hash.
#
# @param str [String, nil] e.g. "name1:value1;name2:value2"
# @return [Hash{String => String, nil}] property name => value
#   (value is nil when a segment has no colon)
def props_split(str)
  str.to_s.split(';').each_with_object({}) do |segment, props|
    next if segment.empty?

    name, value = segment.split(':', 2)
    props[name] = value
  end
end
# Crawl a single item: load its detail page through the jump URL, decode the
# "itemDO" JSON embedded in the page script into +item+, then fetch the
# purchase records and print every completed trade.
url = "http://jump.taobao.com/jump"
crawler = Crawler.new(url)
crawler.params = { target: 'http://detail.tmall.com/auction/item_detail.htm?item_num_id=20485336962&show_review=1&tbpm=1' }
page_dom = crawler.get_dom
if page_dom
  detail_dom = page_dom.at('div#detail>script')
  if detail_dom
    # Strip all whitespace so the regex below can match across line breaks.
    detail_dom = detail_dom.text.gsub(/\s+/, '').strip
    detail_dom = detail_dom.match(/\"itemDO\"\:(.*)\,\"detail/)
    if detail_dom
      # The embedded object uses single quotes; convert them before decoding.
      json = detail_dom[1].gsub("'", "\"")
      item_info = ActiveSupport::JSON.decode(json)
      quantity = item_info['quantity'].to_i

      item = {}
      item[:id] = item_info['itemId']
      item[:type] = item_info['auctionType']
      # NOTE(review): the original tested isOnline twice, which made the
      # sold-out status unreachable. Sold-out is assumed here to mean
      # "online with zero quantity" - confirm against real page data.
      item[:status] = if item_info['isOnline'] == true
                        quantity > 0 ? '在售' : '售罄'
                      else
                        '下架'
                      end
      item[:category_id] = item_info['categoryId'].to_i
      item[:seller_id] = item_info['userId'].to_i
      item[:spu_id] = item_info['spuId'].to_i
      item[:tag_price] = item_info['reservePrice'].to_f
      item[:brand_id] = item_info['brandId'].to_i
      item[:brand] = item_info['brand']
      item[:quantity] = quantity
      item[:item_sbn] = parse_sbn(page_dom)

      trades_dom = get_trades(crawler, item[:id], item[:seller_id], item[:item_sbn])
      if trades_dom
        # TODO: only the first page is processed. The original checked for
        # 'span.page-end' and bumped a page counter but never requested
        # further pages.
        trades_dom.css('table>tr').each do |trade_dom|
          fields = trade_dom.text.split(' ')
          # Keep only rows whose last column marks a completed deal.
          next unless fields.last == '成交'

          trade = {
            buyer: fields[0],
            props: props_split(fields[2]),
            price: fields[3].to_f,
            num: fields[4].to_i,
            date: Time.parse("#{fields[5]} #{fields[6]}")
          }
          puts trade
        end
      else
        # get_trades returns nil when the request produced no body.
        logger.warn "无法解析,宝贝购买记录"
      end
    else
      logger.warn "无法解析,宝贝基础信息"
    end
  else
    logger.warn "无法解析,宝贝详情"
  end
else
  logger.error "没什么好说的了~~"
end
# Item SKU properties. Delisted items have no 'div.tb-sku' node, in which
# case +props+ stays empty.
# Built BEFORE the SKU map is decoded, because the SKU loop below looks each
# tag up in +props+.
props = {}
sku_box = page_dom.at('div.tb-sku')
if sku_box
  sku_box.css('dl.tb-prop').each do |prop_dom|
    type_node = prop_dom.at('dt')
    next unless type_node

    prop_type = type_node.text
    prop_dom.css('dd>ul>li').each do |el|
      label = el.at('a>span')
      next unless label

      # Keyed by the element's data-value attribute.
      props[el['data-value']] = { type: prop_type, name: label.text }
    end
  end
end

# Decode the "skuMap" JSON and print one record per SKU.
# NOTE(review): +str+ is not defined anywhere in the visible source;
# presumably it is the whitespace-stripped detail script text - confirm and
# define it before this point.
sku_match = str.match(/\"skuMap\"\:(.*)\}\,\"isSevenDaysRefundment/)
if sku_match
  # The embedded object uses single quotes; convert them before decoding.
  json = sku_match[1].gsub("'", "\"")
  item_sku = ActiveSupport::JSON.decode(json)
  item_sku.each do |tag, sku|
    # A tag is a ';'-separated list of property keys; drop empty pieces
    # produced by leading/trailing separators.
    props_tag = tag.split(';').reject(&:empty?)
    sku_props = {}
    props_tag.each do |prop_tag|
      prop = props[prop_tag]
      sku_props[prop[:type]] = prop[:name] if prop
    end
    s = { id: sku['skuId'], tag: tag, quantity: sku['stock'].to_i, props: sku_props }
    puts s
  end
end
# Parse the product attribute list; needed when creating an item.
#
# Each +li+ under +div#attributes>div.attributes-list>ul+ is read as text:
# * entries containing more than two spaces are treated as
#   "name: v1 v2 v3 ..." and stored as name => "v1,v2,v3,..."
# * all other entries are split at the first colon into name => value,
#   with spaces removed from the value; an entry without a colon maps to "".
#
# A warning is logged and an empty hash returned when no attribute nodes
# are found.
#
# @param page_dom document-like object responding to +css(selector)+
# @return [Hash{String => String}] attribute name => value(s)
def parse_attributes(page_dom)
  attribute_nodes = page_dom.css('div#attributes>div.attributes-list>ul>li')
  # css returns a collection, never nil in normal use, so emptiness is the
  # real "nothing found" signal.
  if attribute_nodes.nil? || attribute_nodes.empty?
    logger.warn "无法解析,产品属性"
    return {}
  end

  attribute_nodes.each_with_object({}) do |node, attributes|
    text = node.text.strip
    if text.count(" ") > 2
      # Multi-valued attribute: first token is the name, the rest are values.
      name, *values = text.split(' ')
      attributes[name.gsub(':', '')] = values.join(',')
    else
      name, value = text.split(':', 2)
      next if name.nil? # blank entry

      attributes[name] = value.to_s.gsub(" ", '')
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment