Skip to content

Instantly share code, notes, and snippets.

@zernel
Forked from mimosz/.gitignore
Created October 23, 2012 15:55
Show Gist options
  • Save zernel/3939624 to your computer and use it in GitHub Desktop.
Save zernel/3939624 to your computer and use it in GitHub Desktop.
京东商品价格解析
# -*- encoding: utf-8 -*-
require 'mini_magick'
require 'rtesseract'
require 'nokogiri'
require 'nestful'
require 'csv'
require 'pp'
# Scrapes product listings from 360buy.com search / brand pages, reads each
# product's price from its price image via OCR, attaches promotion labels and
# exports the collected items to a CSV file.
#
# Items are accumulated in @items, keyed by the integer SKU id:
#   { pic_url:, title:, price:, rates: { count:, rank: }, proms: [...] }
class Buy360
  # Labels for the numeric promotion flags handled by #parse_proms.
  PROMOTION_LABELS = {
    1 => '直降',
    2 => '赠品',
    3 => '返券',
    4 => '送积分'
  }.freeze

  # The underlying Nestful::Request that is reused for every page fetch.
  attr_reader :request

  # @param url [String] initial request URL (replaced by #search / #brand)
  def initialize(url='')
    @items = {}
    @request = Nestful::Request.new(url)
  end

  # Sets the URL of the shared request.
  def url=(value)
    request.url = value
  end

  # Sets the query parameters of the shared request.
  def params=(value)
    request.params = value
  end

  # Runs a keyword search and collects every item of every result page.
  #
  # @param keyword [String]
  # @return [Hash, nil] the accumulated items, or nil when the page could not be fetched
  def search(keyword)
    self.url = search_url
    self.params = { keyword: keyword, qrst: 'UNEXPAND', enc: 'utf-8' }
    dom = get_dom
    parse_item(dom) if dom
  end

  # Collects every item listed for a brand.
  #
  # @param brand_id [Integer, String]
  # @return [Hash, nil] the accumulated items, or nil when the page could not be fetched
  def brand(brand_id)
    self.url = brand_url
    self.params = { 'BrandId' => brand_id }
    dom = get_dom
    parse_item(dom) if dom
  end

  # Writes the collected items to "<today>.csv" (GB18030 encoded).
  # Does nothing when no items have been collected.
  def export
    return if @items.empty?

    header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数', '好评率']
    CSV.open("#{Date.today}.csv", "wb:GB18030", col_sep: ',') do |csv|
      csv << header_row
      @items.each do |sku_id, item|
        rates = item[:rates] || {}
        csv << [
          # Spreadsheet formula so the SKU cell links to the product page.
          "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
          item[:title],
          item[:pic_url],
          Array(item[:proms]).join(' '),
          item[:price],
          rates[:count],
          rates[:rank]
        ]
      end
    end
  end

  # Fetches the current request and parses the response body as HTML.
  # A Nestful::ForbiddenAccess response is retried up to three times.
  #
  # @param try_count [Integer] number of retries already performed
  # @return [Nokogiri::HTML::Document, nil] nil when no URL is set, the body
  #   is missing, or every retry was refused
  def get_dom(try_count=0)
    # An empty string is truthy in Ruby, so test for emptiness explicitly.
    if request.url.to_s.empty?
      puts "亲,干嘛?"
      return nil
    end

    debug
    response = request.connection.get(request.query_path)
    body = response.body
    parse_html(to_utf8(body)) if body
  rescue Nestful::ForbiddenAccess
    if try_count < 3 # retry three times
      puts "========================开始重试========================"
      get_dom(try_count + 1)
    else
      puts "========================很扯,三次都没搞定========================"
      nil
    end
  end

  private

  # Prints the URL and parameters of the request about to be sent.
  def debug
    puts request.url
    puts request.params
  end

  # Points the shared request at the given result page.
  def next_page(page)
    self.params = request.params.merge(page: page)
  end

  # Re-labels GBK bytes as such and transcodes them to UTF-8. Bytes that are
  # invalid or unmappable are replaced instead of aborting the whole scrape.
  def to_utf8(text)
    text.force_encoding("GBK").encode("UTF-8", invalid: :replace, undef: :replace)
  end

  # Collects the items of the first page and of every following page.
  #
  # @param dom [Nokogiri::HTML::Document] first result page
  # @return [Hash] all items accumulated so far
  def parse_item(dom)
    total = parse_total(dom)
    # A result set with exactly one item must be collected as well.
    return @items unless total > 0

    items = items_dom(dom)
    set_items(items)
    # The number of items on the first page is used as the page size.
    pages = pages_count(total, items.count)
    2.upto(pages) do |page|
      next_page(page)
      puts "第#{page}/#{pages}页"
      page_dom = get_dom
      set_items(items_dom(page_dom)) if page_dom
    end
    @items
  end

  # Converts a list of <li> item nodes into item hashes, attaches their
  # promotions and merges them into @items.
  def set_items(dom)
    return if dom.nil? || dom.empty?

    items = {}
    dom.each do |item|
      sku_id, attrs = build_item(item)
      items[sku_id] = attrs if sku_id
    end
    # Nothing parsed: skip the promotion request altogether.
    return if items.empty?

    @items.merge!(set_proms(items))
  end

  # Extracts one item from its <li> node.
  #
  # @return [Array(Integer, Hash), nil] SKU id and attributes, or nil when the
  #   node lacks the image/link markup or no SKU id can be determined
  def build_item(item)
    img_box = item.at('div.p-img')
    img_dom = img_box && img_box.at('img')
    name_box = item.at('div.p-name')
    link_dom = name_box && name_box.at('a')
    return nil unless img_dom && link_dom

    sku_id = item[:sku].to_s
    if sku_id.empty?
      # Fall back to the id embedded in the product link.
      matched = link_dom[:href].to_s.match(/product\/(.*)\.html$/)
      return nil unless matched
      sku_id = matched[1]
    end

    extra = item.at('div.extra')
    count_dom = extra && extra.at('a')
    rank_dom = extra && extra.at('span.reputation')
    # Strip the Han characters surrounding the review count.
    rates_count = count_dom && count_dom.text.gsub(/\p{Han}/, '')
    rates_rank = rank_dom && rank_dom.text
    if rates_rank
      digits = rates_rank.match(/\d+/)
      rates_rank = digits[0] if digits
    end

    pic_url = img_dom['data-lazyload'] || img_dom[:src]
    attrs = {
      pic_url: pic_url.to_s.strip,
      title: link_dom.text.strip,
      price: parse_price(sku_id),
      rates: { count: rates_count, rank: rates_rank }
    }
    [sku_id.to_i, attrs]
  end

  # Adds a :proms entry to every item for which promotions were returned.
  #
  # @param node [Hash] items keyed by integer SKU id
  # @return [Hash] the same hash
  def set_proms(node)
    get_proms(node.keys).each do |item_id, prom|
      node[item_id][:proms] = prom if node.key?(item_id)
    end
    node
  end

  # Parses an HTML string into a Nokogiri document.
  def parse_html(html)
    Nokogiri::HTML(html)
  end

  # Translates promotion flags into their labels. Flags without a known
  # label are dropped so they do not leave gaps in the exported column.
  #
  # @param proms [Array, nil] numeric flags (integers or numeric strings)
  # @return [Array<String>]
  def parse_proms(proms)
    Array(proms).map { |flag| PROMOTION_LABELS[flag.to_i] }.compact
  end

  # Returns the <li> item nodes of a result page, or an empty list when the
  # page has no product list.
  def items_dom(dom)
    list = dom.at('div#plist')
    rows = list && list.at('ul.list-h')
    rows ? rows.css('li') : []
  end

  # Reads the total number of results from the filter bar.
  #
  # @return [Integer] 0 when the counter cannot be found
  def parse_total(dom)
    filter_dom = dom.at('div#filter')
    return 0 unless filter_dom

    holder = filter_dom.at('div.total')
    unless holder
      # Alternative layout: the counter sits in the first <li> of ul.extra.
      extra = filter_dom.at('ul.extra')
      holder = extra && extra.at('li')
    end
    counter = holder && holder.at('strong')
    counter ? counter.text.to_i : 0
  end

  # Number of pages needed to show +total+ items at +size+ items per page.
  #
  # @return [Integer] 0 when +size+ is not positive
  def pages_count(total, size=24)
    return 0 unless size.to_i > 0

    (total.to_f / size).ceil
  end

  # Downloads the price image of a SKU and reads the price with OCR.
  # MiniMagick failures are retried up to three times, sleeping one more
  # second before each attempt.
  #
  # @return [Float, nil] the price, or nil when it could not be read or is
  #   not greater than 1
  def parse_price(sku_id, try_count=0)
    sleep try_count
    img = MiniMagick::Image.open(price_url(sku_id))
    begin
      img.colorspace("GRAY") # grayscale
      img.monochrome # black and white
      str = RTesseract.new(img.path).to_s.strip # OCR
    ensure
      # Always remove the temporary image, even when OCR fails.
      File.unlink(img.path) if img.path && File.exist?(img.path)
    end
    matched = str.match(/\d+\.\d+/)
    price = matched && matched[0].to_f
    puts "#{sku_id}:#{str}:#{price}"
    price if price && price > 1
  rescue MiniMagick::Error
    if try_count < 3 # retry three times
      puts "========================开始重试:#{sku_id}========================"
      parse_price(sku_id, try_count + 1)
    else
      puts "========================很扯,三次都没搞定:#{price_url(sku_id)}========================"
      nil
    end
  end

  # URL of the image that shows the price of a SKU.
  def price_url(sku_id)
    "http://jprice.360buyimg.com/price/gp#{sku_id}-1-1-3.png"
  end

  def brand_url
    'http://www.360buy.com/brandlist.aspx'
  end

  def search_url
    'http://search.360buy.com/search'
  end

  # URL of the product page of a SKU.
  def item_url(sku_id)
    "http://www.360buy.com/product/#{sku_id}.html"
  end

  def store_url(store_id, page=1)
    "http://mall.360buy.com/shopWare-#{store_id}----#{page}.html"
  end

  # Fetches the promotion flags for a batch of SKUs.
  #
  # @param sku_ids [Array<Integer>]
  # @return [Hash{Integer => Array<String>}] promotion labels per SKU id;
  #   empty when the response carries no parenthesised payload
  def get_proms(sku_ids)
    prom_url = 'http://price.360buy.com/PromotionFlag.aspx'
    params = { pid: sku_ids.join(',') }
    html = to_utf8(Nestful.get(prom_url, params: params))
    # The JSON payload is wrapped in a callback: callback({...})
    wrapped = html.match(/\((.*)\)/)
    return {} unless wrapped

    json = ActiveSupport::JSON.decode(wrapped[1])
    item_proms = {}
    Array(json['data']).each do |prom|
      # Items are keyed by integer SKU id, so normalise the id the same way.
      item_proms[prom['Pid'].to_i] = parse_proms(prom['PF'])
    end
    item_proms
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment