Last active
January 4, 2016 19:17
-
-
Save danielgracia/e791d7af1e678a9845c9 to your computer and use it in GitHub Desktop.
# Fooling with JRuby and pretty much any gem with a cool name that I feel like using.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'date'
require 'rubygems'
require 'mechanize'
require 'concurrent'
require 'daybreak'
require 'thread'
require 'pry'

# Shorthand for the concurrent-ruby promise class used by the rest of the script.
Promise = Concurrent::Promise
# Patch Mechanize: make the HTTP agent's tempfile check answer false no matter
# what argument it receives.
# NOTE(review): presumably this keeps downloaded bodies in memory instead of
# spilling them to temporary files — confirm against the Mechanize version in use.
class Mechanize::HTTP::Agent
  def use_tempfile?(*)
    false
  end
end
# Set thread pool: the executor handed to every Scrapper, on which the
# download promises are scheduled.
$pool = Concurrent::CachedThreadPool.new
# Scrapper class
#
# Fetches the Dilbert strip page for a given date and saves the comic image
# under the local "dilbert/" directory. Each download is wrapped in a promise
# scheduled on the executor supplied at construction time.
class Scrapper
  # @param pool [Object] executor passed to each promise as `executor:`
  # @param agent [Object] HTTP agent responding to `get(url) { |page| ... }`;
  #   defaults to a fresh Mechanize instance (only built when none is given)
  def initialize(pool:, agent: Mechanize.new)
    @pool = pool
    @agent = agent
  end

  # Schedules the download of the strip published on `date`.
  #
  # @param date [#to_s] date interpolated into the strip URL and the file name
  # @return [Promise] the promise, already submitted via `execute`
  # @raise [RuntimeError] (inside the promise) when the page has no comic image
  def scrap(date)
    Promise.new(executor: @pool) do
      @agent.get("http://dilbert.com/strip/#{date}") do |page|
        image = page.image_with(dom_class: 'img-responsive img-comic')
        # Fail with a message naming the date rather than a NoMethodError on nil.
        raise "No comic image found for #{date}" unless image

        image.fetch.save!("dilbert/#{date}.gif")
      end
    end.execute
  end
end
# Workers: eight scrappers sharing the global pool. The processing loop pops
# one before each download and pushes it back when the download finishes, so
# the queue size bounds how many downloads are in flight.
$queue = Queue.new
8.times { $queue.push(Scrapper.new(pool: $pool)) }
# Comic book-keeping
#
# The checkpoint database maps each date to a status symbol. On the first run
# (no 'init' marker yet) every date from 1989-04-16 through today is recorded
# as :no, i.e. not downloaded yet.
Dir.mkdir("dilbert") unless Dir.exist?("dilbert")
cdb = Daybreak::DB.new("dilbert/checkpoint.db")

unless cdb['init']
  start = Date.new(1989, 4, 16)
  (start..Date.today).each { |day| cdb[day] = :no }
  cdb['init'] = true
  cdb.flush
end
# Processing
#
# Schedules a download for every date still marked :no in the checkpoint
# database. A successful download marks the date :ok; an HTTP 500 marks it
# :fail and carries on; any other failure is recorded as the exit reason and
# aborts the loop. Once all in-flight promises have settled the database is
# flushed and closed, and the fatal error (if any) is re-raised.
end_lock = Concurrent::ReadWriteLock.new
exit_reason = date_being_processed = nil
current_promises = []

# On Ctrl-C, persist the checkpoint and exit immediately.
# The work runs on a helper thread because MRI raises ThreadError when a Mutex
# is locked inside a trap handler.
# NOTE(review): assumes cdb.synchronize is Mutex-based — confirm in Daybreak.
Signal.trap('INT') do
  Thread.new do
    cdb.synchronize do
      cdb.flush
      cdb.close
      Process.exit!
    end
  end.join
end

outcome = catch(:abort) do
  cdb.select { |_, v| v == :no }.each do |date, _|
    scrapper = $queue.pop

    promise = scrapper.scrap(Date.parse(date)).then do
      cdb.synchronize { cdb[date] = :ok }
      puts "#{date} processed."
      $queue.push scrapper
    end.rescue do |reason|
      begin
        # Only HTTP errors expose a response code; anything else (socket
        # errors, a missing image, ...) must not blow up inside this handler.
        server_error = reason.respond_to?(:response_code) &&
                       reason.response_code.to_i == 500

        if server_error
          cdb.synchronize { cdb[date] = :fail }
          puts "Internal server error for #{date}."
        else
          end_lock.with_write_lock do
            exit_reason = reason
            date_being_processed = date
          end
        end
      ensure
        # Always hand the worker back: otherwise eight fatal failures in a row
        # would leave the main thread blocked on $queue.pop forever.
        $queue.push scrapper
      end
    end
    current_promises << promise

    end_lock.with_read_lock do
      throw :abort, :now if exit_reason
    end

    # Keep only the promises that are still pending.
    current_promises.select! { |pending| pending.state == :pending }
  end
end

# Wait until finish
Promise.zip(*current_promises).value

# Persist the checkpoint on both the success and the failure path.
cdb.synchronize { cdb.flush; cdb.close }

if outcome == :now
  puts "Error on date #{date_being_processed}"
  raise exit_reason
else
  puts "Success!"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment