Skip to content

Instantly share code, notes, and snippets.

@zudochkin
Created August 30, 2012 21:22
Show Gist options
  • Save zudochkin/3541416 to your computer and use it in GitHub Desktop.
Save zudochkin/3541416 to your computer and use it in GitHub Desktop.
b000.ru parser
require 'cgi'
namespace :b000 do
  # Walks b000.ru "view" pages by numeric id and, for every id that is not
  # stored yet, saves the page's image together with its title and categories.
  #
  # Environment variables:
  #   MAX_IMAGE_ID - last page id to visit (defaults to 2344)
  #   MIN_IMAGE_ID - first page id to visit (defaults to the largest
  #                  external_id already stored; 0 when nothing is stored)
  desc 'b000.ru images parser'
  task :parser => :environment do
    max_image_id = (ENV['MAX_IMAGE_ID'] || 2344).to_i
    # Start from the highest stored id so that the same pictures are not
    # fetched and saved over and over again. nil.to_i is 0 on an empty table.
    min_image_id = (ENV['MIN_IMAGE_ID'] || Image.maximum(:external_id)).to_i

    min_image_id.upto(max_image_id) do |id|
      # Skip ids that have already been imported.
      next if Image.exists?(:external_id => id)

      url = "http://b000.ru/view/#{ id }/"
      begin
        # Block form closes the downloaded page handle once it is parsed.
        doc = open(url) { |page| Nokogiri::HTML(page) }

        # '//h1' searches the whole document; a bare 'h1' is resolved against
        # the document node and only matches its direct children, so it never
        # finds a heading nested inside <html>/<body>.
        header = doc.at_xpath('//h1')
        image_name = header ? header.text : ''

        categories = doc.css('div#image_list li a').map { |link| link.text.strip }

        # A page without the expected <img> (or without a src) must not abort
        # the whole run, so it is reported and skipped.
        image_node = doc.at_css('div#image_list a img')
        image_url = image_node && image_node[:src]
        if image_url.nil? || image_url.empty?
          puts "Page with id #{ id } has no image, skipped"
          next
        end

        # Hack kept from the original: the URL is split on '//' and only the
        # last piece is CGI-escaped before the pieces are glued back together.
        # NOTE(review): presumably the file name follows a second '//' in the
        # site's image URLs - confirm against real pages.
        parts = image_url.split('//')
        parts[-1] = CGI::escape(parts[-1])

        image = Image.create(:external_id => id,
                             :name => image_name,
                             :picture => open(parts.join('//')))
        categories.each do |category|
          image.categories << Category.find_or_create_by_name(category)
        end
        puts "Image #{ image.external_id } saved with categories #{ categories.join ' ' }"
      rescue OpenURI::HTTPError
        # Raised by either the page download or the picture download.
        puts "Page with id #{ id } not found (404)"
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment