Skip to content

Instantly share code, notes, and snippets.

@mimosa
Last active August 29, 2015 14:00
Show Gist options
  • Save mimosa/11153744 to your computer and use it in GitHub Desktop.
Save mimosa/11153744 to your computer and use it in GitHub Desktop.
新京东爬虫
# -*- encoding: utf-8 -*-
require './jd'

# JD.com search-results URL to crawl; the query string carries the keyword
# and the search filters.
url = 'http://search.jd.com/search?keyword=%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&enc=utf-8&qr=&qrst=UNEXPAND&et=&rt=1&bs=no&stop=1&area=15&wtype=1&ev=&uc=0#select'

# Build the scraper and run the crawl in one expression; `items` holds
# whatever JD#parse returns.
items = JD.new(url).parse
# -*- encoding: utf-8 -*-
require 'addressable/uri' unless defined?(::Addressable::URI)
require 'multi_json' unless defined?(::MultiJson)
require 'nokogiri' unless defined?(::Nokogiri)
require 'faraday' unless defined?(::Faraday)
require 'excon' unless defined?(::Excon)
require 'awesome_print'
# Scraper for JD.com search-result listings.
#
# Given a search URL, it requests each result page, records every product's
# SKU, title and lazy-loaded photo URL, and then fills in prices from a
# separate price endpoint. Progress and diagnostics are printed to stdout.
class JD
  # Horizontal rule printed around every status message.
  RULE = ('_' * 88).freeze
  # Endpoint queried by #prices; it is given a comma-separated `skuids` list.
  PRICE_URL = 'http://p.3.cn/prices/mgets'.freeze

  # @param url [String] full search URL; its query string is reused for
  #   every page request.
  def initialize(url)
    @items = {}
    uri = Addressable::URI.parse(url)
    @url = "#{uri.scheme}://#{uri.host}#{uri.path}"
    # query_values is nil when the URL has no query string; default to an
    # empty Hash so #parse can always merge the page number in.
    @params = uri.query_values || {}
    @pages = 1
  end

  # Lazily built, memoized Faraday connection using the excon adapter.
  def conn
    @conn ||= Faraday.new(ssl: false) do |c|
      c.adapter :excon
    end
  end

  # Fetches result page +page+ and every following page, accumulating
  # products into @items.
  #
  # @param page [Integer] page number to start from
  # @return [Hash, String, nil] the collected items keyed by SKU once the
  #   last page has been processed; the string '没有记录' when the page
  #   reports a total of zero; nil when a request does not return HTTP 200
  #   or a page contains no list entries.
  def parse(page = 1)
    resp = conn.get(@url, @params.merge('page' => page))
    unless resp.status == 200
      puts RULE
      puts "网络中断在第#{page}页"
      return nil
    end

    body = Nokogiri::HTML.parse(resp.body).css('body').first
    # Total shown in the filter bar; a missing node counts as 0.
    total = text_at(body, 'div#filter>div>div.total>span>strong').to_i
    return '没有记录' if total == 0

    # Product list
    container = body.at('div#plist>ul.list-h')
    list = container && container.css('li')
    if list.nil? || list.empty?
      puts RULE
      puts list.to_html if list
      puts RULE
      return nil
    end

    per_page = list.count # entries on this page
    puts RULE
    puts "第#{page}页,#{per_page}条记录"
    puts RULE

    # Pagination: the page count is computed once, from the first page that
    # shows more results than fit on a single page.
    @pages = num_pages(total, per_page) if @pages == 1 && total > per_page

    prices(collect_items(list)) # prices

    # Next page: return the recursive result so the caller receives the
    # accumulated items instead of nil.
    return parse(page + 1) if @pages > page

    puts RULE
    puts "#{@items.count}条记录"
    puts RULE
    @items
  end

  # Number of pages needed to show +total+ entries at +limit+ per page
  # (integer division, rounded up).
  #
  # @param total [Integer]
  # @param limit [Integer]
  # @return [Integer]
  def num_pages(total, limit)
    full, rest = total.divmod(limit)
    rest > 0 ? full + 1 : full
  end

  # Looks up prices for the given ids and stores them on the matching
  # entries of @items as :price (response field 'p') and :tag_price
  # (response field 'm').
  #
  # @param ids [Array<String>] ids in the "J_<sku>" form built by #parse
  # @return [nil]
  def prices(ids)
    # Nothing to look up; skip the request.
    return nil if ids.empty?

    resp = conn.get(PRICE_URL, skuids: ids.join(','))
    unless resp.status == 200
      puts RULE
      puts "没有获取到"
      return nil
    end

    ::MultiJson.load(resp.body).each do |item|
      item_id = item['id'].gsub('J_', '')
      if @items.key?(item_id)
        @items[item_id][:price] = item['p'].to_f
        @items[item_id][:tag_price] = item['m'].to_f
      else
        puts RULE
        puts "#{item_id},遗失啦?"
      end
    end
    nil
  end

  private

  # Records every list entry that has a `sku` attribute in @items and
  # returns the matching "J_<sku>" ids for the price lookup. Entries
  # without a `sku` attribute are skipped.
  def collect_items(list)
    list.each_with_object([]) do |li, ids|
      item_id = li['sku']
      next if item_id.nil?

      img = li.at('div.p-img>a>img')
      title = text_at(li, 'div.p-name>a')
      @items[item_id] = {
        photo_url: img && img['data-lazyload'],
        title: title && title.strip,
      }
      ids << "J_#{item_id}"
    end
  end

  # Text of the first node under +node+ matching +selector+, or nil when
  # either +node+ or the match is missing.
  def text_at(node, selector)
    found = node && node.at(selector)
    found && found.text
  end
end
# -*- encoding: utf-8 -*-
require 'addressable/uri' unless defined?(::Addressable::URI)
require 'multi_json' unless defined?(::MultiJson)
require 'nokogiri' unless defined?(::Nokogiri)
require 'faraday' unless defined?(::Faraday)
require 'awesome_print'
# Scratch script: fetches the first page of a yhd.com search and extracts
# the result count and the product list nodes.
conn = Faraday.new(ssl: false) do |c|
  c.adapter :patron
end
url = 'http://www.yhd.com/ctg/s2/c0-0/b/a-s1-v0-p1-price-d0-f06-m1-rt0-pid-mid0-k%E6%AC%A7%E8%8E%B1%E9%9B%85/'
uri = Addressable::URI.parse(url)
@url = "#{uri.scheme}://#{uri.host}#{uri.path}"
@params = uri.query_values || {}
resp = conn.get(@url)
html = Nokogiri::HTML.parse(resp.body)
body = html.css('div#bodyRight').first

# Result count: the node text with its first and last characters dropped
# (presumably surrounding brackets -- TODO confirm); 0 when the node is missing.
count_node = body && body.at('div.mod_search_crumb>small.result_count')
total = count_node ? count_node.text[1..-2].to_i : 0

# Product entries; an empty list when the container is missing.
list_node = body && body.at('div#plist>div#search_table>div>ul#itemSearchList')
list = list_node ? list_node.css('li') : []

# Pagination: use url.gsub("-p#{page-1}-", "-p#{page}-") if page > 1
# Fetching the hidden part of the product list:
#   http://www.yhd.com/ctg/searchPage/c0-0/b/a-s1-v0-p2-price-d0-f06-m1-rt0-pid-mid0-k%E6%AC%A7%E8%8E%B1%E9%9B%85/?isGetMoreProducts=1&moreProductsDefaultTemplate=0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment