Skip to content

Instantly share code, notes, and snippets.

@lightory
Created June 28, 2016 14:05
Show Gist options
  • Save lightory/0901aa2caa56d4c5334dddd6ad36a14d to your computer and use it in GitHub Desktop.
Save lightory/0901aa2caa56d4c5334dddd6ad36a14d to your computer and use it in GitHub Desktop.
抓取又拍网特定用户的所有照片
require 'open-uri'
require 'Nokogiri'
require 'digest/sha1'
# Crawls every album listed on a Yupoo user's album index page.
#
# Fetches http://www.yupoo.com/photos/<username>/albums/, takes the first link
# inside each "div#albums_list div.set-case" element and passes its href to
# crawl_album.
#
# @param username [String] Yupoo account whose albums are crawled; defaults to
#   the account that used to be hard-coded here
# @return [void]
def main(username = "lightory")
  base_url = "http://www.yupoo.com/photos/#{username}/albums/"
  doc = Nokogiri::HTML(safe_open(base_url))
  doc.css("div#albums_list div.set-case").each do |album_element|
    link = album_element.css("a").first
    # An album tile without a usable link cannot be crawled; skip it instead of
    # raising on nil and abandoning the remaining albums.
    next if link.nil? || link["href"].nil?

    crawl_album(link["href"])
  end
end
# Alternative entry point: crawls one fixed album instead of a whole account.
# Nothing in the visible script calls it.
#
# @return [void]
def main2
  crawl_album("http://www.yupoo.com/photos/lightory/albums/1373449/")
end
# Downloads every photo of one album, page by page.
#
# Pages are requested as <base_album_url>page1/, <base_album_url>page2/, ...
# until a page contains no "div.album-photos a.img" links. Photos are stored in
# a directory named after the album title (relative to the current working
# directory). Once an empty page is reached, an empty marker file called
# "finished" is written into that directory; an album whose marker already
# exists is skipped.
#
# @param base_album_url [String] album URL; "page<N>/" is appended directly, so
#   it is expected to end with "/"
# @return [void]
def crawl_album(base_album_url)
  page = 1
  loop do
    album_url = "#{base_album_url}page#{page}/"
    album_doc = Nokogiri::HTML(safe_open(album_url))

    title_element = album_doc.css("span#albumtitle").first
    if title_element.nil?
      # Without a title there is no directory name to work with (the page
      # failed to download or its markup is different). Stop this album
      # instead of raising NoMethodError on nil.
      puts "Skip #{album_url}: album title not found."
      puts ""
      break
    end

    album_name = title_element.content
    finished_marker = "#{album_name}/finished"

    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    Dir.mkdir(album_name) unless File.exist?(album_name)

    puts "Start Crawling #{album_name}..." if page == 1
    puts "Start Crawling #{album_name} Page #{page}..."

    if File.exist?(finished_marker)
      puts "Already Cralwed #{album_name}."
      puts ""
      break
    end

    photo_elements = album_doc.css("div.album-photos a.img")
    if photo_elements.length.zero?
      # An empty page means the previous page was the last one.
      puts "Finished Crawling #{album_name}."
      puts ""
      File.write(finished_marker, "")
      break
    end

    photo_elements.each do |photo_element|
      photo_page_url = photo_element["href"]
      # A link without href gives nothing to fetch and nothing to hash.
      next if photo_page_url.nil?

      puts "Crawl Photo: #{photo_page_url}"
      # The file name is the SHA1 of the photo page path, so it is stable
      # across runs and independent of the photo title.
      file_name = "#{album_name}/#{Digest::SHA1.hexdigest(photo_page_url)}.jpg"
      crawl_photo(photo_page_url, file_name)
    end

    puts "Finished Crawling #{album_name} Page #{page}."
    page += 1
  end
end
# Downloads the image shown on a single photo page and saves it locally.
#
# @param page_url [String] site-relative path of the photo page; it is appended
#   to "http://www.yupoo.com"
# @param file_name [String] local path the image bytes are written to
# @return [void]
def crawl_photo(page_url, file_name)
  page_url = "http://www.yupoo.com" + page_url
  doc = Nokogiri::HTML(safe_open(page_url))

  image = doc.css("img#photo_img").first
  photo_url = image && image["src"]
  if photo_url.nil?
    puts "Skip #{page_url}: photo image not found."
    return
  end

  # Download before touching the destination: opening the file first would
  # leave a zero-byte .jpg (or truncate an earlier good copy) whenever the
  # download fails.
  download = safe_open(photo_url)
  return if download.nil?

  begin
    File.open(file_name, "wb") { |file| file << download.read }
  ensure
    # Release the handle returned by safe_open.
    download.close
  end
end
# Fetches a URL through open-uri, waiting one second before every attempt and
# retrying up to three times after a failure.
#
# @param url [String] absolute URL of a scheme open-uri can fetch
# @return [#read, nil] the readable object returned by open-uri, or nil when
#   the first attempt and all three retries failed
def safe_open(url)
  max_retries = 3
  retries = 0
  begin
    # Throttle every request, including retries.
    sleep(1)
    # Parse first and call #open on the URI object. Bare Kernel#open stopped
    # fetching URLs in Ruby 3.0, and it treats a string that starts with "|" as
    # a command to run -- unsafe for addresses scraped from a web page.
    URI.parse(url).open
  rescue StandardError
    return nil if retries >= max_retries

    puts "Retry #{url}"
    retries += 1
    retry
  end
end
# Run the crawler: main walks every album of the user configured inside it.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment