Last active
December 14, 2015 03:19
-
-
Save paranoidxc/5019771 to your computer and use it in GitHub Desktop.
下载豆瓣相册中的图片，如果有原始图片则下载原始图片，没有则下载普通大小图片
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: UTF-8 | |
# Author: XiaochuanHuang | |
# Email : emohuang [at] gmail [dot] com | |
require 'open-uri' | |
require "net/http" | |
require "uri" | |
require 'hpricot' | |
# Downloads every photo of a Douban album into an `images/` directory that
# sits next to this file.
#
# Constructing an instance starts the download immediately:
#
#   Db_Album.new("http://www.douban.com/photos/album/44503288/")
#
# For each photo page the "large" variant is preferred when the page links to
# one; otherwise the default-size image is saved. Files that already exist in
# the target directory are not downloaded again.
class Db_Album
  # User-Agent header sent with every page request.
  USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:18.0) Gecko/20100101 Firefox/18.0'.freeze

  # CSS selector for the per-photo links on an album listing page.
  PHOTO_LINK_SELECTOR = '.photolst>.photo_wrap>a.photolst_photo'.freeze

  # Fetches +url+ with a single GET request and returns the response body.
  #
  # @param url [String] absolute http or https URL
  # @return [String] the response body (redirects are not followed)
  def url_get_content(url)
    uri = URI.parse(url)
    # request_uri yields "/" for an empty path and appends "?query" only when
    # a query is actually present.
    request = Net::HTTP::Get.new(uri.request_uri)
    request.add_field('X-Forwarded-For', '0.0.0.0')
    request.add_field('User-Agent', USER_AGENT)
    # Enable TLS for https URLs; a plain connection cannot talk to port 443.
    response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
      http.request(request)
    end
    response.body
  end

  # Saves the body of +url+ to `album.txt` in the current working directory.
  #
  # @param url [String] absolute http or https URL
  # @return [Integer] number of bytes written
  def url_to_file(url)
    # Fetch first so a failed request does not truncate an existing file.
    content = url_get_content(url)
    File.open('album.txt', 'wb') { |file| file.write(content) }
  end

  # Prepares the `images/` directory and downloads the whole album.
  #
  # @param url [String] URL of the album's first page
  def initialize(url="http://www.douban.com/photos/album/44503288/")
    @dir = File.join(File.dirname(__FILE__), 'images/')
    # Dir.exists? was removed in Ruby 3.2; Dir.exist? works everywhere.
    Dir.mkdir(@dir, 0777) unless Dir.exist?(@dir)
    @page = 0
    @offset = 18            # step added to `start` for each further page
    @url = url
    @sleep_photo = 0.2      # seconds to pause after each photo
    @sleep_album_page = 4   # seconds to pause after each album page
    album_page
    puts ' ablum download done :) '
  end

  # Walks the album listing starting at the current @page, downloading every
  # linked photo, and stops at the first page that lists no photos.
  #
  # Implemented as a loop rather than recursion so the call stack does not
  # grow with the number of pages.
  #
  # @return [nil]
  def album_page
    loop do
      start = @page * @offset
      url = start.zero? ? @url : "#{@url}?start=#{start}"
      puts "start album page #{url}"

      photos = Hpricot(url_get_content(url)).search(PHOTO_LINK_SELECTOR)
      break if photos.empty?

      photos.each do |photo|
        album_photo_dl(photo.attributes['href'])
        sleep @sleep_photo
      end
      sleep @sleep_album_page
      @page += 1
    end
  end

  # Downloads the image shown on the photo page +url+ into @dir.
  #
  # The file name is the last path segment of the image URL. Nothing is
  # downloaded when the page contains no usable image or when the file is
  # already present.
  #
  # @param url [String] URL of a single photo page
  # @return [nil]
  def album_photo_dl(url)
    print 'Photo Url ' + url + ' '
    page = Hpricot(url_get_content(url))
    img = find_photo_img(page)
    src = img && img.attributes['src'].to_s

    # 0 is truthy in Ruby, so testing `length` alone never detects an empty
    # result; check for a missing element / missing src explicitly.
    if src.nil? || src.empty?
      puts ' No image found Skip '
      return
    end

    image_uri = URI.parse(src)
    save_file_path = File.join(@dir, File.basename(image_uri.path))

    if File.file?(save_file_path)
      puts ' Photo File exist Skip '
    else
      print ' Start Download... !'
      # Download before creating the file so a failed request cannot leave an
      # empty file behind that would be skipped as "existing" on the next run.
      data = image_uri.read
      File.open(save_file_path, 'wb') { |file| file.write(data) }
      puts ' done !'
    end
    nil
  end

  private

  # Picks the <img> element to download from a parsed photo page.
  #
  # Every `.report-link a` link is followed and searched for
  # `#pic-viewer a img`; the last link that yields an image wins. When none
  # does, the page's own `.mainphoto img` is used.
  #
  # @param page [Object] parsed photo page as returned by Hpricot()
  # @return [Object, nil] the image element, or nil when none was found
  def find_photo_img(page)
    large = nil
    page.search('.report-link a').each do |link|
      large_page = Hpricot(url_get_content(link.attributes['href']))
      candidates = large_page.search('#pic-viewer a img')
      large = candidates.first unless candidates.empty?
    end
    large || page.search('.mainphoto img').first
  end
end
# Tips:
# Album photos are saved to the `images` directory next to this Db_Album.rb file.
# The `images` directory is created if it does not exist.
# A photo is downloaded only if it is not already present in the `images` directory.
# How to use: $ ruby Db_Album.rb
# Db_Album.new( douban_album_url )
# Db_Album.new("http://www.douban.com/photos/album/87115884/") # large photo
# Start the download only when this file is executed directly
# (`ruby Db_Album.rb`), so that loading it with `require` has no side effects.
if __FILE__ == $0
  Db_Album.new("http://www.douban.com/photos/album/46907826/") # default size photo
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment