Created
June 27, 2013 08:57
-
-
Save paranoidxc/5875048 to your computer and use it in GitHub Desktop.
爬 百度个人空间 相册 , 未完
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: UTF-8 | |
# Author: XiaochuanHuang | |
# Email : emohuang [at] gmail [dot] com | |
require 'open-uri' | |
require "net/http" | |
require "uri" | |
require 'hpricot' | |
# Crawls an online photo album page and saves every photo it references
# into an `images` directory located next to this source file.
#
# Constructing an instance immediately performs the crawl of the first
# album page (pagination via @page/@offset is prepared but not looped yet).
class Db_Album
  # Browser-like User-Agent sent with every page request.
  USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:18.0) Gecko/20100101 Firefox/18.0'

  # Base URL of a single photo detail page; the photo signature is appended.
  PHOTO_DETAIL_URL = 'http://xiangce.baidu.com/picture/detail/'

  # Fetches +url+ with a GET request and returns the response body.
  #
  # @param url [String] absolute http(s) URL
  # @return [String] the raw response body
  def url_get_content(url)
    uri = URI.parse(url)
    # request_uri yields "/" for an empty path and only adds "?query" when a
    # query is present, unlike manual path + '?' + query concatenation.
    req = Net::HTTP::Get.new(uri.request_uri)
    # []= replaces the header; add_field would append to the default
    # "User-Agent: Ruby" value that Net::HTTP sets on new requests.
    req['X-Forwarded-For'] = '0.0.0.0'
    req['User-Agent'] = USER_AGENT
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == 'https')
    http.start { |conn| conn.request(req) }.body
  end

  # Downloads +url+ and stores the body in `album.txt` (debugging aid).
  #
  # @param url [String] absolute http(s) URL
  # @return [Integer] number of bytes written
  def url_to_file(url)
    # Fetch first so a failed request does not truncate an existing file.
    content = url_get_content(url)
    File.open('album.txt', 'wb') { |file| file.write(content) }
  end

  # Prepares the target directory and crawl settings, then starts the crawl.
  #
  # @param url [String] URL of the album list page
  def initialize(url = "http://www.douban.com/photos/album/44503288/")
    @dir = File.join(File.dirname(__FILE__), 'images/')
    # File.directory? works on every Ruby version (Dir.exists? was removed
    # in Ruby 3.2).
    Dir.mkdir(@dir, 0777) unless File.directory?(@dir)
    @page = 0
    @offset = 18
    @url = url
    @sleep_photo = 0.2
    @sleep_album_page = 4
    album_page
    puts ' ablum download done :) '
  end

  # Fetches the current album page, extracts every `picSign` value from it
  # and downloads the corresponding photo, pausing @sleep_photo seconds
  # between photos.
  #
  # @return [String] the fetched page content
  def album_page
    start = @page * @offset
    url = start.zero? ? @url : "#{@url}?start=#{start}"
    puts 'start album page ' + url
    content = url_get_content(url)
    content.scan(/picSign:\ \'(.+?)',/) do |(sign)|
      album_photo_dl("#{PHOTO_DETAIL_URL}#{sign}")
      sleep @sleep_photo
    end
  end

  # Downloads the photo shown on the detail page +url+ into @dir unless a
  # file with the same name already exists. Pages without a usable image
  # are skipped instead of raising.
  #
  # @param url [String] URL of the photo detail page
  # @return [nil]
  def album_photo_dl(url)
    print 'Photo Url ' + url + ' '
    page = Hpricot(url_get_content(url))
    img = page.search('.img-auto-item-wrapper img').first
    src = img && img.attributes['src'].to_s
    if src.nil? || src.empty?
      puts ' No photo found Skip '
      return
    end

    # Resolve relative image paths against the page URL; an absolute src is
    # returned unchanged by URI.join.
    photo_uri = URI.join(url, src)
    file_name = File.basename(photo_uri.path.to_s)
    if file_name.empty? || file_name == '/'
      puts ' No photo file name Skip '
      return
    end

    save_file_path = File.join(@dir, file_name)
    if File.file?(save_file_path)
      puts ' Photo File exist Skip '
      return
    end

    print ' Start Download... !'
    # Download before opening the target so a failed request does not leave
    # an empty file behind that later runs would treat as already fetched.
    data = fetch_photo(photo_uri)
    File.open(save_file_path, 'wb') { |file| file.write(data) }
    puts ' done !'
  end

  private

  # Reads the remote resource at +uri+ through open-uri.
  #
  # The URI object is opened directly instead of handing the scraped string
  # to Kernel#open, which would execute a command for input starting with "|"
  # and read local files for plain paths.
  def fetch_photo(uri)
    raise ArgumentError, "unsupported photo URL: #{uri}" unless uri.respond_to?(:read)

    uri.read
  end
end
# Tips: | |
# The album photos are saved to the `images` directory next to this Db_Album.rb file.
# The `images` directory is created if it does not exist.
# A photo is downloaded only if it is not already present in the `images` directory.
# Usage: $ ruby Db_Album.rb
# Db_Album.new( douban_album_url ) | |
# Db_Album.new("http://www.douban.com/photos/album/87115884/") # large photo | |
# | |
#http://xiangce.baidu.com/picture/album/list/7f618a1db5da8b5ccccc9ca0d9e6f974902bafec | |
# Start the crawl only when this file is executed directly, so that loading
# it from another script (require/load) does not trigger network downloads.
if __FILE__ == $PROGRAM_NAME
  Db_Album.new("http://xiangce.baidu.com/picture/album/list/7f618a1db5da8b5ccccc9ca0d9e6f974902bafec")
end
#Db_Album.new("http://www.douban.com/photos/album/46907826/") # default size photo |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment