Skip to content

Instantly share code, notes, and snippets.

@hotchpotch
Created March 5, 2012 22:12
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hotchpotch/1981518 to your computer and use it in GitHub Desktop.
Save hotchpotch/1981518 to your computer and use it in GitHub Desktop.
Photozou Scraper
#!/usr/bin/env ruby
require 'digest/sha1'
require 'pathname'
require 'open-uri'
require 'uri'
require 'nokogiri'
# Scrapes Photozou photo pages: each page's HTML is cached on disk, the
# "_org" variant of the page's main image is downloaded, and the page's
# rel="next" link is followed until a page has none.
module PhotozouScraper
  PHOTOZOU_URL = 'http://photozou.jp'

  extend self

  @cache_path = 'cache'
  @download_path = 'download'

  # cache_path:    directory holding cached page HTML (one file per page path).
  # download_path: directory the downloaded images are written to.
  attr_accessor :cache_path, :download_path

  # Scrapes the page at +url+ and keeps following "next" links until a page
  # has none. Only the path component of +url+ is used; the host is always
  # PHOTOZOU_URL.
  def all_scrape(url)
    path = URI.parse(url).path
    loop do
      path = scrape(path)
      break unless path
    end
  end

  # Returns the HTML for +path+ (a path on PHOTOZOU_URL).
  #
  # The cache file is named after the SHA1 hex digest of +path+ inside
  # cache_path. On a cache miss the page is fetched and stored.
  def get_html(path)
    cache = Pathname.new(cache_path + '/' + Digest::SHA1.hexdigest(path))
    cache.parent.mkpath
    if cache.exist?
      puts "cache exist: #{path} (#{cache})"
      cache.read
    else
      puts "cache not found: #{path} (#{cache})"
      # URI.open rather than Kernel#open: the latter no longer opens URLs on
      # Ruby 3.0+. The block form closes the underlying IO once it is read.
      source = URI.open(PHOTOZOU_URL + path, &:read)
      # write, not puts: puts would append a newline, so a later cache hit
      # would return different content than this first fetch.
      cache.open('w') { |f| f.write(source) }
      source
    end
  end

  # Processes one page: downloads its image, then returns the href of the
  # page's rel="next" link, or nil when there is none.
  def scrape(path)
    puts "scrape start: #{path}"
    nokogiri = Nokogiri(get_html(path))
    download(nokogiri)
    next_link(nokogiri)
  end

  # Downloads the original-size image referenced by the parsed page
  # +nokogiri+ into download_path as "<digits>_<alt>.jpg", where <digits> is
  # the number following "photo/" in the image URL and <alt> is the image's
  # alt text with path separators replaced.
  #
  # Returns nil without downloading when the page has no matching image.
  def download(nokogiri)
    img = nokogiri.at('#indivi_media > a > img')
    image = img && img['src']
    unless image
      puts 'no image found on page; skipping download'
      return nil
    end

    photo_number = image.scan(/photo\/(\d+)/).flatten.last
    alt = safe_filename(img['alt'])
    orig = original_image(image)
    puts "downloading... #{orig}"
    jpg = URI.open(orig, :read_timeout => nil, &:read)
    path = Pathname.new("#{download_path}/#{photo_number}_#{alt}.jpg")
    path.parent.mkpath
    puts "save download file: #{path}"
    # write, not puts: puts appends a trailing newline to the binary payload.
    path.open('wb') { |f| f.write(jpg) }
  end

  # Rewrites an image URL ".../photo/<digits>.<ext>" to
  # ".../photo/<digits>_org.<ext>". URLs that do not match are returned
  # unchanged.
  def original_image(image)
    image.sub(%r{(/photo/\d+)\.}, "\\1_org.")
  end

  # Returns the href of the page's <link rel="next"> element, or nil when the
  # element (or its href) is missing.
  def next_link(nokogiri)
    link = nokogiri.at('//link[@rel="next"]')
    link && link['href']
  end

  private

  # Makes +name+ usable as a single filename component: path separators and
  # NUL bytes become "_" so the text cannot point outside download_path.
  # nil is treated as an empty string.
  def safe_filename(name)
    name.to_s.gsub(%r{[/\\\x00]}, '_')
  end
end
# Script entry point: the first command-line argument is the URL of the
# first photo page to scrape. Abort with a usage message when it is missing
# instead of failing inside URI.parse(nil).
start_url = ARGV.first
abort "usage: #{$PROGRAM_NAME} http://photozou.jp/photo/show/user_id/xxxxxxxxx" if start_url.nil?
PhotozouScraper.all_scrape(start_url)
# ./photozou_scrape.rb http://photozou.jp/photo/show/user_id/xxxxxxxxx
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment