Skip to content

Instantly share code, notes, and snippets.

@paranoidxc
Created June 27, 2013 08:57
Show Gist options
  • Save paranoidxc/5875048 to your computer and use it in GitHub Desktop.
Save paranoidxc/5875048 to your computer and use it in GitHub Desktop.
爬 百度个人空间 相册 , 未完
#encoding: UTF-8
# Author: XiaochuanHuang
# Email : emohuang [at] gmail [dot] com
require 'open-uri'
require "net/http"
require "uri"
require 'hpricot'
# Downloads the photos of an online album into an `images` directory that
# lives next to this source file.
#
# Constructing an instance immediately fetches the album page and downloads
# every photo found on it; photos that already exist locally are skipped.
#
# NOTE: only the first album page is processed. @page is never advanced and
# @sleep_album_page is not used yet (pagination is unfinished).
class Db_Album
  # Browser-like User-Agent sent with every page request.
  USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:18.0) Gecko/20100101 Firefox/18.0'

  # Prefix that turns a scraped `picSign` token into a photo detail page URL.
  DETAIL_URL_PREFIX = 'http://xiangce.baidu.com/picture/detail/'

  # Prepares the output directory and starts the download right away.
  #
  # url - String URL of the album list page.
  def initialize(url="http://www.douban.com/photos/album/44503288/")
    @dir = File.join(File.dirname(__FILE__), 'images/')
    # Dir.exists? was removed in Ruby 3.2; Dir.exist? works on every version.
    Dir.mkdir(@dir, 0777) unless Dir.exist?(@dir)
    @page = 0
    @offset = 18            # photos per album page
    @url = url
    @sleep_photo = 0.2      # seconds to wait between two photo downloads
    @sleep_album_page = 4   # reserved for pagination, currently unused
    album_page
    puts ' ablum download done :) '
  end

  # Performs an HTTP GET with browser-like headers.
  #
  # url - String URL to fetch.
  #
  # Returns the response body as a String.
  def url_get_content(url)
    uri = URI.parse(url)
    # request_uri yields "/" for an empty path and only appends "?query" when
    # a query is present, unlike hand-built path + '?' + query.
    req = Net::HTTP::Get.new(uri.request_uri)
    req['X-Forwarded-For'] = '0.0.0.0'
    # Assign instead of add_field: add_field appends to the default
    # "User-Agent: Ruby" value that Net::HTTP sets on new requests.
    req['User-Agent'] = USER_AGENT
    res = Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
      http.request(req)
    end
    res.body
  end

  # Debug helper: dumps the body fetched from +url+ into `album.txt` in the
  # current working directory.
  def url_to_file(url)
    body = url_get_content(url)
    File.open('album.txt', 'wb') { |file| file.write(body) }
  end

  # Fetches the current album page, extracts every `picSign` token from it and
  # downloads the matching photo, pausing @sleep_photo seconds between photos.
  def album_page
    start = @page * @offset
    url = start.zero? ? @url : "#{@url}?start=#{start}"
    puts 'start album page '+ url
    content = url_get_content(url)
    content.scan(/picSign:\ \'(.+?)',/) do |detail|
      album_photo_dl(DETAIL_URL_PREFIX + detail.first)
      sleep @sleep_photo
    end
  end

  # Downloads the photo shown on a photo detail page into the images
  # directory, unless a file with the same name is already there.
  #
  # url - String URL of the photo detail page.
  #
  # Large-size photo detection is not implemented; the default-size image
  # found on the detail page is always used.
  def album_photo_dl(url)
    print 'Photo Url '+ url + ' '
    photo = Hpricot(url_get_content(url))
    img = photo.search('.img-auto-item-wrapper img').first
    src = img && img.attributes['src']
    # A page without a matching <img> (or without a src) has nothing to save.
    if src.nil? || src.to_s.empty?
      puts ' No photo found Skip '
      return
    end

    # Resolve against the page URL so that relative src values work too.
    u = URI.join(url, src.to_s)
    file_name = File.basename(u.path.to_s)
    if file_name.empty?
      puts ' No photo found Skip '
      return
    end

    save_file_path = File.join(@dir, file_name)
    if File.file?(save_file_path)
      puts ' Photo File exist Skip '
      return
    end

    print ' Start Download... !'
    # Fetch first, write second: a failed download must not leave an empty
    # file behind, because it would be skipped as "existing" on the next run.
    # URI#open (open-uri) replaces Kernel#open, which no longer accepts URLs
    # on Ruby 3.0+.
    data = u.open { |remote| remote.read }
    File.open(save_file_path, 'wb') { |file| file.write(data) }
    puts ' done !'
  end
end
# Usage notes:
# - Photos are saved to an `images` directory located next to this source file.
# - The `images` directory is created if it does not exist yet.
# - A photo is downloaded only if no file with the same name is already in `images`.
# - Run with: $ ruby Db_Album.rb
# - Constructing the object starts the download immediately: Db_Album.new(album_url)
#
# NOTE(review): the class name and the default URL refer to Douban, but the
# scraping code builds Baidu (xiangce.baidu.com) photo detail URLs, so the
# Douban examples kept below probably no longer work - confirm before using them.
# Db_Album.new("http://www.douban.com/photos/album/87115884/") # large photo
#
# Album list page downloaded when this script is run:
#http://xiangce.baidu.com/picture/album/list/7f618a1db5da8b5ccccc9ca0d9e6f974902bafec
Db_Album.new("http://xiangce.baidu.com/picture/album/list/7f618a1db5da8b5ccccc9ca0d9e6f974902bafec")
#Db_Album.new("http://www.douban.com/photos/album/46907826/") # default size photo
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment