Skip to content

Instantly share code, notes, and snippets.

@danielgracia
Last active January 4, 2016 19:17
Show Gist options
  • Save danielgracia/e791d7af1e678a9845c9 to your computer and use it in GitHub Desktop.
Save danielgracia/e791d7af1e678a9845c9 to your computer and use it in GitHub Desktop.
Fooling with JRuby and pretty much any gem with a cool name that I feel like using.
require 'date'
require 'rubygems'
require 'mechanize'
require 'concurrent'
require 'daybreak'
require 'thread'
require 'pry'
# Short alias so the rest of the script can write `Promise` instead of `Concurrent::Promise`.
Promise = Concurrent::Promise
# Patch Mechanize
# Reopens Mechanize's HTTP agent and overrides `use_tempfile?` so that it
# answers false whatever it is asked.
# NOTE(review): presumably Mechanize consults this predicate to decide whether
# a response body is spooled to a Tempfile; with this patch it would always be
# kept in memory — confirm against the Mechanize version in use.
class Mechanize::HTTP::Agent
  def use_tempfile?(*_ignored)
    false
  end
end
# Set thread pool
# One global pool, handed to every Scrapper below as the executor for its promises.
$pool = Concurrent::CachedThreadPool.new
# Scrapper class
# Fetches Dilbert strips. Every instance owns its own Mechanize agent and runs
# its downloads on the executor it was constructed with.
class Scrapper
  # pool: executor passed to each Promise created by #scrap.
  def initialize(pool:)
    @agent = Mechanize.new
    @pool = pool
  end

  # Starts an asynchronous download of the strip for `date` (interpolated
  # as-is into the URL and the file name).
  # Returns the Promise, already handed to the executor via #execute.
  def scrap(date)
    download = Promise.new(executor: @pool) { save_strip(date) }
    download.execute
  end

  private

  # Opens the strip page for `date`, looks up the comic image by its CSS
  # classes and saves it as dilbert/<date>.gif.
  def save_strip(date)
    @agent.get("http://dilbert.com/strip/#{date}") do |page|
      comic = page.image_with(dom_class: 'img-responsive img-comic')
      comic.fetch.save!("dilbert/#{date}.gif")
    end
  end
end
# Workers
# A queue of 8 idle scrappers, all sharing the global pool. The main loop pops
# one per download, so the queue size caps how many downloads run at once.
$queue = Queue.new
8.times { $queue << Scrapper.new(pool: $pool) }
# Comic book-keeping
# The checkpoint DB maps each strip date to a status (:no when first seeded;
# the processing loop later overwrites it with :ok or :fail).
Dir.mkdir("dilbert") unless Dir.exist?("dilbert")
cdb = Daybreak::DB.new("dilbert/checkpoint.db")

# Seed every date from the first strip up to today that has no entry yet.
# Doing this on every run (instead of only when 'init' is unset) means strips
# published after the first run are picked up too; dates that already have a
# status are left untouched.
start = Date.new(1989, 4, 16)
unseen = (start..Date.today).reject { |day| cdb.has_key?(day) }

unless unseen.empty? && cdb['init']
  unseen.each { |day| cdb[day] = :no }
  # The 'init' marker is still written so existing checkpoint files and any
  # other reader of this flag keep working.
  cdb['init'] = true
  cdb.flush
end
# Processing
# State shared between the main loop and the promise callbacks:
#   end_lock             - read/write lock used around exit_reason / date_being_processed
#   exit_reason          - error that should abort the run (nil while all is well)
#   date_being_processed - date whose download produced exit_reason
#   current_promises     - promises the main loop is still tracking
end_lock = Concurrent::ReadWriteLock.new
exit_reason = nil
date_being_processed = nil
current_promises = []

# On Ctrl-C: flush and close the checkpoint DB, then leave immediately via
# Process.exit!, all inside cdb.synchronize.
# NOTE(review): if cdb.synchronize takes a Mutex, MRI refuses to lock it in
# trap context (ThreadError); the script appears to target JRuby — confirm.
Signal.trap('INT') do
  cdb.synchronize do
    cdb.flush
    cdb.close
    Process.exit!
  end
end
# Main loop: give every date still marked :no to an idle scrapper and record
# the outcome in the checkpoint DB. `throw :abort` leaves the loop early once a
# fatal error has been recorded in exit_reason.
catch(:abort) do
  cdb.select { |_, v| v == :no }.each do |date, _|
    # Blocks until a scrapper is idle.
    scrapper = $queue.pop

    download = scrapper.scrap(Date.parse(date)).then do
      cdb.synchronize { cdb[date] = :ok }
      puts "#{date} processed."
      $queue.push scrapper
    end

    current_promises << download.rescue do |reason|
      begin
        # Only errors that carry an HTTP status can be a "500"; anything else
        # (including errors without #response_code) is treated as fatal
        # instead of blowing up inside this handler.
        if reason.respond_to?(:response_code) && reason.response_code.to_i == 500
          cdb.synchronize { cdb[date] = :fail }
          puts "Internal server error for #{date}."
        else
          end_lock.with_write_lock do
            exit_reason = reason
            date_being_processed = date
          end
        end
      ensure
        # Always return the worker. This handler only runs when the success
        # callback above did not complete, so the scrapper has not been pushed
        # back yet; without this the main thread can block forever on
        # $queue.pop once every worker has failed.
        $queue.push scrapper
      end
    end

    abort_requested = end_lock.with_read_lock { !exit_reason.nil? }
    throw :abort, :now if abort_requested

    # Keep only the promises that have not completed yet.
    current_promises.select! { |promise| promise.state == :pending }
  end
end.tap do |_loop_result|
  # Wait until finish
  Promise.zip(*current_promises).value

  # Persist the checkpoint on both outcomes, not only on failure.
  cdb.synchronize { cdb.flush; cdb.close }

  # Decide from exit_reason itself rather than from the catch value: a fatal
  # error in one of the last in-flight downloads is recorded after the loop
  # has already ended normally and must not be reported as success.
  failure, failed_date = end_lock.with_read_lock { [exit_reason, date_being_processed] }
  if failure
    puts "Error on date #{failed_date}"
    raise failure
  else
    puts "Success!"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment