Skip to content

Instantly share code, notes, and snippets.

@mimosz
Last active October 14, 2022 04:28
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 12 You must be signed in to fork a gist
  • Save mimosz/3899790 to your computer and use it in GitHub Desktop.
Save mimosz/3899790 to your computer and use it in GitHub Desktop.
京东商品价格解析(JD.com / 360buy product price parsing via OCR of price images)
rvm use jruby
# -*- encoding: utf-8 -*-
require 'mini_magick'
require 'rtesseract'
require 'crawler'
# Crawls JD.com (360buy) search results for a brand: scrapes the item
# list, OCRs the price images with Tesseract, looks up promotion flags,
# and exports one CSV per results page.
class Buy360 < Crawler
  # band_name: brand keyword to search for
  # page:      page to start from (1-based)
  # pages:     total page count; 0 means "detect from the first page"
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @max_attempts = 3
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/360buy"
    mkdirs(@path)
    @request = Nestful::Request.new('http://search.360buy.com/search')
  end

  # Merge the per-page CSVs into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetch, parse and export the current page; detect the page total from
  # the first page, then walk the remaining pages.
  def process
    self.params = { keyword: @band_name, qrst: 'UNEXPAND', enc: 'utf-8', page: @page }
    page_dom = get_page_dom('GBK')
    items_dom = get_items_dom(page_dom)
    page_count = export_items(items_dom)
    if @pages < 1
      total = parse_total(page_dom)
      @pages = pages_count(total, page_count)
    end
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advance to the next page, or finish when the last page is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Write the current page's items to "<path>/<page>.csv" (GB18030 for
  # Excel compatibility); returns the number of items written.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:proms] ? item[:proms].join(' ') : '',
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    return items.count
  end

  # Extract sku/picture/title/price/review-count from each product node,
  # attach promotion flags, then export. Returns the exported item count.
  def export_items(items_dom)
    items = {}
    unless items_dom.empty?
      items_dom.each do |item|
        link_dom = item.at('div.p-name>a')
        if link_dom
          img_dom = item.at('div.p-img>a>img')
          sku_id = item[:sku] || link_dom[:href].match(/product\/(.*)\.html$/)[1]
          # lazy-loaded images keep the real URL in data-lazyload
          pic_url = img_dom['data-lazyload'] || img_dom[:src]
          name = link_dom.text
          rates_count = item.at('div.extra').at('a').text
          rates_count = rates_count.gsub(/\p{Han}/, '') unless rates_count.blank?
          items[sku_id.to_i] = { pic_url: pic_url.strip, title: name.strip, price: parse_price(sku_id), rates: { count: rates_count } }
        end
      end
      items = set_proms(items) unless items.empty?
    end
    export(items)
  end

  # Attach the promotion labels fetched from the promotion API to the
  # matching items; returns the (mutated) items hash.
  def set_proms(node)
    proms = get_proms(node.keys)
    unless proms.empty?
      proms.each do |item_id, prom|
        node[item_id][:proms] = prom if node.has_key?(item_id)
      end
    end
    return node
  end

  # Map numeric promotion flag codes to their display labels.
  def parse_proms(proms)
    item_proms = []
    proms.each do |flag|
      item_proms << case flag.to_i
      when 1
        '直降'
      when 2
        '赠品'
      when 3
        '返券'
      when 4
        '送积分'
      end
    end
    return item_proms
  end

  # Product <li> nodes, or [] when the list container is missing.
  def get_items_dom(page_dom)
    list = page_dom.at('div#plist>ul.list-h')
    if list
      return list.css('li')
    else
      return []
    end
  end

  # Total result count from the filter bar; 0 when not present.
  def parse_total(page_dom)
    filter_dom = page_dom.at('div#filter')
    return 0 if filter_dom.nil?
    total_dom = filter_dom.at('div.total')
    if total_dom
      total_dom.at('strong').text.to_i
    else
      total_dom = filter_dom.at('ul.extra')
      total_dom.at('li').at('strong').text.to_i if total_dom
    end
  end

  # OCR the price image for a sku. Returns the price as a Float, falling
  # back to try_parse_price when OCR finds nothing, or 0 when the image
  # cannot be fetched after @max_attempts tries.
  def parse_price(sku_id, type=3)
    attempts = 0
    img_url = price_url(sku_id, type)
    begin
      img = MiniMagick::Image.open(img_url)
      img.colorspace("GRAY") # grayscale
      img.monochrome # drop color
    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
      attempts = attempts + 1
      puts "错误: #{error}"
      puts img_url
      # BUG FIX: the original `return 0 if attempts < @max_attempts` gave up
      # on the FIRST error and, once attempts ran out, fell through to OCR a
      # nil `img` (NoMethodError). Retry while attempts remain, then return 0
      # (cf. Lefeng#parse_price, which already retries this way).
      retry if attempts < @max_attempts
      return 0
    end
    str = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip # OCR
    File.unlink(img.path) # remove temp file
    price = str.match(/\d+\.\d+/)
    return price ? price[0].to_f : try_parse_price(sku_id)
  end

  # Second OCR attempt against the alternative (type=2) price image,
  # upscaled for better recognition. Returns 0.0 when nothing matches.
  def try_parse_price(sku_id, type=2)
    img_url = price_url(sku_id, type)
    img = MiniMagick::Image.open(img_url)
    img.resize '200x100' # upscale
    str = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip # OCR
    File.unlink(img.path) # remove temp file
    price = str.match(/\d+\.\d+/)
    price = price[0] if price
    return price.to_f
  end

  def price_url(sku_id, type=3)
    "http://jprice.360buyimg.com/price/gp#{sku_id}-1-1-#{type}.png"
  end

  def item_url(sku_id)
    "http://www.360buy.com/product/#{sku_id}.html"
  end

  # Fetch promotion flags for a batch of sku ids from the promotion API.
  # Returns { sku_id => [labels] }; {} on server error or repeated bad
  # answers.
  def get_proms(sku_ids, try_count=0)
    item_proms = {}
    json = nil
    begin
      prom_url = 'http://price.360buy.com/PromotionFlag.aspx'
      params = { pid: sku_ids.join(',')}
      html = Nestful.get prom_url, params: params
    rescue Nestful::ServerError => error
      return item_proms
    end
    html = html.force_encoding("GBK").encode("UTF-8")
    # A good answer is JSONP: ({...}); on errors JD redirects to the home page.
    # BUG FIX: the original compared one-character String#first/#last
    # (ActiveSupport, limit defaults to 1) against two-character literals —
    # always false — and discarded the result of the recursive retry.
    # Check the real prefix/suffix and return the retry's result.
    if html.start_with?('({') && html.end_with?('})')
      json = html.match(/\((.*)\)/)
    elsif try_count < @max_attempts # retry up to 3 times
      return get_proms(sku_ids, try_count + 1)
    end
    if json
      json = json[1]
      json = ActiveSupport::JSON.decode(json)
      proms = json['data']
      unless proms.empty?
        item_proms = {}
        proms.each do |prom|
          item_proms[prom['Pid']] = parse_proms(prom['PF'])
        end
      end
    end
    return item_proms
  end
end
# -*- encoding: utf-8 -*-
require 'crawler'
# Crawls Amazon.cn search results for a brand and exports one CSV per
# results page. Relies on helpers inherited from Crawler (get_page_dom,
# mkdirs, merge_csv_files, pages_count).
class Amazon < Crawler
  # band_name: brand keyword to search for
  # page:      page to start from (1-based)
  # pages:     total page count; 0 means "detect from the first page"
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/amazon"
    mkdirs(@path)
    @request = Nestful::Request.new('http://www.amazon.cn/s')
    @request.headers = { 'Host' => 'www.amazon.cn' }
  end

  # Merge the per-page CSVs into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetch, parse and export the current result page, then continue with
  # the next page or finish.
  def process
    self.params = { ie: 'UTF8', keywords: @band_name, page: @page, rh: "i:aps,k:#{@band_name}" }
    page_dom = get_page_dom('UTF-8')
    page_dom = page_dom.at('div#main>div#searchTemplate')
    # "below the fold" results live in a separate container
    extra_dom = page_dom.at('div#centerBelow>div#btfResults')
    page_dom = page_dom.at('div#center')
    items_dom = get_items_dom(page_dom, extra_dom)
    page_count = export_items(items_dom)
    if @pages < 1
      @pages = parse_total(page_dom)
    end
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advance to the next page, or finish when the last page is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Write the current page's items to "<path>/<page>.csv" (GB18030 for
  # Excel compatibility); returns the number of items written.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:tag_price],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    return items.count
  end

  # Extract sku/picture/title/prices/review-count from each product node,
  # then export them. Returns the exported item count.
  def export_items(items_dom)
    items = {}
    unless items_dom.empty?
      items_dom.each do |item|
        sku_id = item[:name]
        pic_url = item.at('div.productImage>a>img')[:src]
        item = item.at('div.productData')
        name = item.at('div.productTitle').text
        price = nil
        tag_price = nil
        price_dom = item.at('div.newPrice')
        if price_dom
          price = price_dom.at('span')
          price = parse_price(price.text) if price
          # <strike> holds the original (list) price when discounted
          tag_price = price_dom.at('strike')
          tag_price = parse_price(tag_price.text) if tag_price
        end
        rates_count = item.at('div.starsAndPrime')
        rates_count = rates_count.css('a').last if rates_count
        rates_count = rates_count ? rates_count.text.gsub(',', '').to_i : 0
        if sku_id
          items[sku_id] = { pic_url: pic_url.strip, title: name.strip, price: price, rates: { count: rates_count }, tag_price: tag_price }
        end
      end
    end
    export(items)
  end

  # Collect product nodes from the main ("above the fold") results plus
  # the optional below-the-fold container.
  def get_items_dom(page_dom, extra_dom=nil)
    items_dom = []
    page_dom = page_dom.at('div#atfResults')
    if page_dom
      items_dom = page_dom.css('div.product')
    end
    if extra_dom
      items_dom = items_dom + extra_dom.css('div.product')
    end
    return items_dom
  end

  # Read a count out of the "x-y条,共z条" results banner; 0 when absent.
  # NOTE(review): the greedy capture between '-' and the LAST '条,' takes
  # only the leading digits via to_i, and the value is used directly as
  # @pages — confirm the banner format this was written against.
  def parse_total(page_dom)
    total_dom = page_dom.at('div#resultCount')
    if total_dom
      total_dom.text.match(/\-(.*)条,/)[1].to_i
    else
      0
    end
  end

  # "¥ 1,234.56" -> 1234.56
  def parse_price(str)
    str = str.gsub('¥ ', '')
    str = str.gsub(',', '')
    str.to_f
  end

  def item_url(sku_id)
    "http://www.amazon.cn/dp/#{sku_id}"
  end
end
# -*- encoding: utf-8 -*-
require 'fileutils'
require 'nokogiri'
require 'nestful'
require 'csv'
# Base class for the per-site crawlers. Wraps a Nestful request, fetches
# result pages (following redirects) into Nokogiri documents, and merges
# the per-page CSV exports into a single CSV file.
class Crawler
  def initialize
    @max_attempts = 3
  end

  # The underlying Nestful::Request (set up by subclass initializers).
  def request
    @request
  end

  def url=(value)
    request.url = value
  end

  def headers=(value)
    request.headers = value
  end

  def params=(value)
    request.params = value
  end

  # GET the current request and return the page as a Nokogiri document
  # (nil when the site redirects to a "no results" page). The body is
  # transcoded from `charset` to UTF-8. On any other redirect the cookie
  # and new location are adopted and the request is retried.
  def get_page_dom(charset)
    html = nil
    begin
      response = request.connection.get(request.query_path)
      html = response.body.force_encoding(charset).encode("UTF-8")
    rescue Nestful::Redirection => error
      location = error.response['Location']
      cookie = error.response['Set-Cookie']
      if location.include?('no_results')
        return html
      else
        self.headers = { 'Cookie' => cookie, 'Referer' => request.url }
        self.url = location
        retry
      end
    end
    return Nokogiri::HTML(html) unless html.nil?
  end

  def mkdirs(path)
    FileUtils.mkdir_p(path)
  end

  # Concatenate the "<path>/<n>.csv" page files into "<path>.csv"
  # (header row written once), then remove the temporary directory.
  def merge_csv_files(path, files_count)
    # Dir[] returns entries in an unspecified order (the original also
    # appended a pointless + ''); sort numerically by page number so the
    # merged file is always in page order.
    csv_files = Dir["#{path}/*.csv"].sort_by { |f| File.basename(f, '.csv').to_i }
    puts "文件不够,呵呵~" if csv_files.count != files_count
    CSV.open("#{path}.csv", "w:binary", col_sep: ',') do |csv|
      has_header = false
      csv_files.each do |csv_file|
        data = CSV.read(csv_file, 'r:binary', headers: true, col_sep: ',')
        unless has_header
          csv << data.headers
          has_header = true
        end
        data.each do |line|
          csv << line
        end
      end
    end
    FileUtils.rm_r(path) # remove the temporary per-page directory
  end

  # Number of pages needed to show `total` items at `size` per page;
  # 0 when either argument is 0.
  def pages_count(total, size)
    return 0 if total == 0 || size == 0
    # exact integer ceiling division (replaces the float divide +
    # remainder check, which is equivalent but roundoff-prone)
    return (total + size - 1) / size
  end
end
# -*- encoding: utf-8 -*-
require 'crawler'
# Crawls Jumei (jumei.com) search results for a brand and exports one
# CSV per results page (Jumei lists 40 items per page).
class Jumei < Crawler
  # band_name: brand keyword to search for
  # page:      page to start from (1-based)
  # pages:     total page count; 0 means "detect from the first page"
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/jumei"
    mkdirs(@path)
    @request = Nestful::Request.new('http://search.jumei.com')
  end

  # Merge the per-page CSVs into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetch, parse and export the current result page, then continue with
  # the next page or finish. Prints a notice when the search has no hits.
  def process
    self.params = { filter: "0-0-0-0-31-#{@page}", search: @band_name }
    page_dom = get_page_dom('UTF-8')
    if page_dom
      page_dom = page_dom.at('div#search_result_wrap')
      items_dom = get_items_dom(page_dom)
      export_items(items_dom)
      if @pages < 1
        total = parse_total(page_dom)
        # 40 items per results page
        @pages = pages_count(total, 40)
      end
      if @pages > 1
        next_page
      else
        finishing
      end
    else
      puts "聚美优品中搜索 #{@band_name},无结果"
    end
  end

  # Advance to the next page, or finish when the last page is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Write the current page's items to "<path>/<page>.csv" (GB18030 for
  # Excel compatibility); returns the number of items written.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '价格', '购买数', '折扣']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:price],
            item[:rates][:count],
            item[:proms].join(' '),
          ]
        end
      end
    end
    return items.count
  end

  # Extract id/title/picture/price/buyer-count/discount labels from each
  # product node, then export them. Returns the exported item count.
  def export_items(items_dom)
    items = {}
    unless items_dom.empty?
      items_dom.each do |item|
        proms = []
        sku_id = item['pid'].to_i
        name_dom = item.at('div>div.num_warp_list_name')
        proms_dom = name_dom.at('span')
        proms << proms_dom.text.gsub('/', '') if proms_dom
        name = name_dom.at('a').text
        pic_url = item.at('div>div.num_warp_list_pic_top').at('img')[:src]
        # drop the countdown node so it doesn't pollute the buyer count below
        countdown_dom = item.at('div>div.num_warp_list_warp_word.time_countdown')
        countdown_dom.remove if countdown_dom
        rates_count = item.at('div>div.num_warp_list_warp_word').css('span').last.text.gsub(/\p{Han}/, '').to_i
        price_dom = item.at('div>div.num_warp_list_view_bg') || item.at('div>div.num_warp_list_name_mall')
        price = price_dom.css('span').last.text.gsub('¥', '').to_f
        # a parenthesised discount label may follow the price text
        proms_dom = price_dom.text.match(/\((.*)\)/)
        proms << proms_dom[1] if proms_dom
        # NOTE(review): to_i never yields nil/false, so this guard is
        # always true (and the inner .to_i is redundant)
        if sku_id
          items[sku_id.to_i] = { pic_url: pic_url.strip, title: name.strip, price: price, rates: { count: rates_count }, proms: proms }
        end
      end
    end
    export(items)
  end

  # Product <li> nodes, or [] when the list container is missing.
  def get_items_dom(page_dom)
    list = page_dom.at('div#search_list_wrap>div.products>ul')
    if list
      return list.css('li.item')
    else
      return []
    end
  end

  # Total result count from the search-info banner; 0 when not found.
  def parse_total(page_dom)
    total_dom = page_dom.at('div.search_info>div>div.content').css('label.red')
    if total_dom.count == 2
      total_dom[1].text.to_i
    else
      0
    end
  end

  def item_url(sku_id)
    "http://mall.jumei.com/product_#{sku_id}.html"
  end
end
# -*- encoding: utf-8 -*-
require 'mini_magick'
require 'rtesseract'
require 'crawler'
# Crawls Lefeng (lefeng.com) search results. Prices are served as images,
# so each one is composited onto a background canvas, upscaled, and OCRed
# with Tesseract.
class Lefeng < Crawler
  # band_name: brand keyword to search for
  # page:      page to start from (1-based)
  # pages:     total page count; 0 means "detect from the first page"
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/lefeng"
    mkdirs(@path)
    # background canvas the price images are composited onto before OCR
    @canvas = MiniMagick::Image.open('./assets/canvas.jpg')
    @request = Nestful::Request.new('http://search.lefeng.com/search/search')
  end

  # Merge the per-page CSVs into one file and drop the canvas temp file.
  def finishing
    merge_csv_files(@path, @pages)
    FileUtils.rm(@canvas.path) # remove temp file
  end

  # Fetch, parse and export the current page; detect the page total from
  # the first page, then walk the remaining pages.
  def process
    self.params = { key: @band_name, pageNo: @page }
    page_dom = get_page_dom('UTF-8')
    items_dom = get_items_dom(page_dom)
    page_count = export_items(items_dom)
    if @pages < 1
      total = parse_total(page_dom)
      @pages = pages_count(total, page_count)
    end
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advance to the next page, or finish when the last page is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Write the current page's items to "<path>/<page>.csv" (GB18030 for
  # Excel compatibility); returns the number of items written.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:proms],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    return items.count
  end

  # Extract sku/picture/title/promotion/review-count from each product
  # node, OCRing the price image. Returns the exported item count.
  def export_items(items_dom)
    items = {}
    unless items_dom.empty?
      items_dom.each do |item|
        link_dom = item.at('dt>a')
        # 'src2' holds the lazy-loaded image URL
        pic_url = link_dom.at('img')['src2']
        name = link_dom[:title]
        proms = item.at('dd.nam>a>i')
        proms = proms.text if proms
        rates_count = item.at('dd.mess>a').text.match(/\d+/)[0]
        sku_id = link_dom[:href].match(/product\/(.*)\.html$/)[1]
        price_url = item.at('dd.pri>img')[:src]
        if sku_id
          items[sku_id.to_i] = { pic_url: pic_url.strip, title: name.strip, price: parse_price(price_url), rates: { count: rates_count }, proms: proms }
        end
      end
    end
    export(items)
  end

  # Product nodes, or [] when the list container is missing.
  def get_items_dom(page_dom)
    list = page_dom.at('div.list>div.smPruArea>div.makeup')
    if list
      return list.css('div.makeupdl')
    else
      return []
    end
  end

  # Total result count; 0 when the counter element is absent.
  def parse_total(page_dom)
    total_dom = page_dom.at('b#searchernum')
    if total_dom
      total_dom.text.to_i
    else
      0
    end
  end

  # Download the price image, composite it centered onto the canvas,
  # grayscale it and OCR it. Returns the price as a Float, or the raw
  # image URL when OCR finds no price (note the inconsistent return type).
  def parse_price(price_url)
    attempts = 0
    begin
      img_price = MiniMagick::Image.open(price_url)
      img_price.resize '125%' # upscale for better OCR accuracy
      img = @canvas.composite(img_price) do |c|
        c.gravity 'center'
      end
      img.colorspace("GRAY") # grayscale
    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
      attempts = attempts + 1
      puts "错误: #{error}"
      # NOTE(review): if all attempts fail, `img`/`img_price` are nil below
      # and the OCR call will raise — consider an early return here.
      retry if(attempts < @max_attempts)
    end
    str = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip # OCR
    FileUtils.rm( [img_price.path, img.path] ) # remove temp files
    price = str.match(/\d+\.\d+/)
    return price ? price[0].to_f : price_url
  end

  def item_url(sku_id)
    "http://product.lefeng.com/product/#{sku_id}.html"
  end
end
# -*- encoding: utf-8 -*-
# Put ./lib on the load path so the per-site crawler classes can be
# required by short name inside the tasks below.
$:.unshift File.expand_path('./lib')

namespace :cai do
  # Required parameter check: every crawl task needs the band_name env
  # variable; print usage and exit when it is missing.
  task :required do
    if ENV['band_name'].nil?
      puts '缺少参数,请参照以下命令:'
      puts 'rake cai band_name=品牌名称'
      puts 'rake cai:amazon band_name=品牌名称'
      puts 'rake cai:buy360 band_name=品牌名称'
      puts 'rake cai:lefeng band_name=品牌名称'
      puts 'rake cai:jumei band_name=品牌名称'
      puts 'rake cai:yihaodian band_name=品牌名称'
      exit
    end
  end

  desc "采集 京东 数据"
  task :buy360, [:page, :pages] => :required do |t, args|
    args.with_defaults(page: 1, pages: 0)
    require '360buy'
    buy360 = Buy360.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
    buy360.process
  end

  desc "采集 乐峰 数据"
  task :lefeng, [:page, :pages] => :required do |t, args|
    args.with_defaults(page: 1, pages: 0)
    require 'lefeng'
    lefeng = Lefeng.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
    lefeng.process
  end

  desc "采集 聚美 数据"
  task :jumei, [:page, :pages] => :required do |t, args|
    args.with_defaults(page: 1, pages: 0)
    require 'jumei'
    jumei = Jumei.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
    jumei.process
  end

  desc "采集 一号店 数据"
  task :yihaodian, [:page, :pages] => :required do |t, args|
    args.with_defaults(page: 1, pages: 0)
    require 'yihaodian'
    yihaodian = Yihaodian.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
    yihaodian.process
  end

  desc "采集 亚马逊 数据"
  task :amazon, [:page, :pages] => :required do |t, args|
    args.with_defaults(page: 1, pages: 0)
    require 'amazon'
    amazon = Amazon.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
    amazon.process
  end
end

# Umbrella task: run every site crawler in sequence.
desc "采集 全平台 数据"
task :cai => ['cai:jumei', 'cai:lefeng', 'cai:yihaodian', 'cai:amazon', 'cai:buy360' ] do
  puts "全平台目前仅支持:京东、乐峰、聚美、亚马逊、一号店。"
end
# -*- encoding: utf-8 -*-
require 'crawler'
# Crawls Yihaodian (yihaodian.com) search results. The search endpoint
# answers JSON whose 'value' field carries the result-list HTML fragment.
class Yihaodian < Crawler
  # band_name: brand keyword to search for
  # page:      page to start from (1-based)
  # pages:     total page count; 0 means "detect from the first page"
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/yihaodian"
    mkdirs(@path)
    # the real URL is assigned per page via #search_url in #process
    @request = Nestful::Request.new('')
  end

  # GET the search endpoint, unwrap the HTML fragment from the JSON
  # 'value' field and parse it with Nokogiri. Retries up to 3 times on a
  # 403 response; returns nil when all retries are exhausted.
  def get_json_dom(charset, try_count=0)
    response = request.connection.get(request.query_path)
    html = response.body.force_encoding(charset).encode("UTF-8")
    html = ActiveSupport::JSON.decode(html)['value'] if html
    return Nokogiri::HTML(html)
  rescue Nestful::ForbiddenAccess => error
    if try_count < 3 # retry up to 3 times
      puts "========================开始重试========================"
      # BUG FIX: the original recursed into get_page_dom(charset, try_count + 1),
      # but Crawler#get_page_dom takes a single argument, so every retry
      # raised ArgumentError. Retry this method instead.
      get_json_dom(charset, try_count + 1)
    else
      puts "========================很扯,三次都没搞定========================"
    end
  end

  # Merge the per-page CSVs into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Search URL for the current page (brand keyword is URI-escaped).
  def search_url
    "http://search.yihaodian.com/searchPage/c0-0/b/a-s1-v0-p#{@page}-price-d0-f0-m1-rt0-pid-k#{URI::encode(@band_name)}"
  end

  # Fetch, parse and export the current page; detect the page total from
  # the first page, then walk the remaining pages.
  def process
    self.url = search_url
    page_dom = get_json_dom('UTF-8')
    items_dom = get_items_dom(page_dom)
    page_count = export_items(items_dom)
    if @pages < 1
      @pages = parse_total(page_dom)
    end
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advance to the next page, or finish when the last page is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Write the current page's items to "<path>/<page>.csv" (GB18030 for
  # Excel compatibility); sku ids are zero-padded to 10 digits. Returns
  # the number of items written.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(item[:item_id])}\",\"#{sprintf("%010d", sku_id)}\")",
            item[:title],
            item[:pic_url],
            item[:tag_price],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    return items.count
  end

  # Extract ids/picture/title/prices/review-count from each product node,
  # then export them. Returns the exported item count.
  def export_items(items_dom)
    items = {}
    unless items_dom.empty?
      items_dom.each do |item|
        product_id = item[:id].match(/\d+/)[0].to_i
        link_dom = item.at("a#pdlink1_#{product_id}")
        item_id = link_dom['pmid'].to_i
        sku_id = item.at("div.buyInfo>button#buyButton_#{product_id}")
        sku_id = sku_id['productcode'].to_i if sku_id
        link_dom = link_dom.at('img')
        # lazy-loaded images keep the real URL in the 'original' attribute
        pic_url = link_dom['original'] || link_dom[:src]
        name = link_dom[:title]
        link_dom = item.at('p.price')
        rates_count = link_dom.at('a')
        rates_count = rates_count ? rates_count.text.match(/\d+/)[0].to_i : 0
        price = link_dom.at('strong').text
        tag_price = link_dom.at('del') # struck-through list price, if any
        tag_price = parse_price(tag_price.text) if tag_price
        if sku_id
          items[sku_id.to_i] = { item_id: item_id, pic_url: pic_url.strip, title: name.strip, price: parse_price(price), rates: { count: rates_count }, tag_price: tag_price }
        end
      end
    end
    export(items)
  end

  # Product <li> nodes, or [] when the list container is missing.
  def get_items_dom(page_dom)
    list = page_dom.at('div#plist>div#search_table>div.itemSearchResult.clearfix>ul.itemSearchList')
    if list
      return list.css('li')
    else
      return []
    end
  end

  # Page count taken from the "current/total" pager text; 0 when absent.
  def parse_total(page_dom)
    total_dom = page_dom.at('ul.page.clearfix>li.pageNum')
    if total_dom
      total_dom.text.match(/\/(.*)$/)[1].to_i
    else
      0
    end
  end

  # "¥123.40" -> 123.4
  def parse_price(str)
    str.gsub('¥', '').to_f
  end

  def item_url(item_id)
    "http://www.yihaodian.com/item/#{item_id}_1"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment