Skip to content

Instantly share code, notes, and snippets.

@hannahwhy
Created December 5, 2011 05:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hannahwhy/1432483 to your computer and use it in GitHub Desktop.
Save hannahwhy/1432483 to your computer and use it in GitHub Desktop.
require 'connection_pool'
require 'girl_friday'
require 'mechanize'
require 'redis'
require 'thread'
require File.expand_path('../support', __FILE__)
# Site-relative paths of the top-level listing pages used to seed the crawl:
# one per media category, plus the matching crossover listing for each.
# Frozen so the seed list cannot be modified by accident at runtime.
ROOTS = %w[
  /anime/
  /book/
  /cartoon/
  /comic/
  /game/
  /misc/
  /movie/
  /play/
  /tv/
  /crossovers/anime/
  /crossovers/book/
  /crossovers/cartoon/
  /crossovers/comic/
  /crossovers/game/
  /crossovers/misc/
  /crossovers/movie/
  /crossovers/play/
  /crossovers/tv/
].freeze
# Pool of four Mechanize agents shared by the crawl workers. Each agent has
# its page history capped at zero entries.
crawler_pool = ConnectionPool.new(size: 4) do
  agent = Mechanize.new
  agent.max_history = 0
  agent
end

# Pool of ten Redis connections, used both by the work-queue store and by
# the crawl workers themselves.
redis_pool = ConnectionPool.new(size: 10) do
  Redis.new
end
# Work queue that processes one listing path (+root+) per job. Each job
# checks out a Redis connection and a Mechanize agent, scrapes the page,
# records every story it found, and enqueues every category it found as a
# new job.
#
# Only StandardError is rescued: rescuing Exception would also swallow
# SystemExit, Interrupt and NoMemoryError and requeue the job instead of
# letting the process stop.
#
# NOTE(review): a failed root is requeued unconditionally, so a path that
# fails every time is retried forever — consider bounding the retries.
discovery = GirlFriday::WorkQueue.new(:discovery, :size => 4, :store => GirlFriday::Store::Redis, :store_config => { :pool => redis_pool }) do |root|
  begin
    redis_pool.with_connection do |redis|
      stories, categories = crawler_pool.with_connection do |agent|
        stories_and_categories_of(root, agent, redis)
      end

      stories.each { |s| save_story(s, redis) }
      categories.each { |c| discovery << c }
    end
  rescue StandardError => e
    $LOG.error("Exception #{e.class} (#{e.message}) raised while scraping #{root}; requeuing.")
    discovery << root
  end
end
# Signal handlers run in a restricted context: on Ruby 2.0 and later a Mutex
# cannot be locked inside one, which Logger needs in order to write (and
# which the queue shutdown may need as well). The handler therefore only
# records that SIGINT arrived; the main thread does the actual shutdown.
interrupted = false
trap('INT') { interrupted = true }

# Remove anything already stored under this run's "seen" key.
redis_pool.with_connection { |r| r.del SEEN_KEY }

# Seed the queue with the top-level listings.
ROOTS.each { |r| discovery << r }

# Keep the main thread alive while the workers run, polling for SIGINT.
sleep 1 until interrupted

$LOG.info("SIGINT received, terminating")
discovery.shutdown
exit 1
# Fetch gems over HTTPS rather than plain HTTP so downloads cannot be
# tampered with in transit.
source "https://rubygems.org"

gem 'connection_pool'
gem 'girl_friday'
gem 'mechanize'
gem 'redis'
require 'logger'
# Process-wide logger; all output goes to standard error.
$LOG = Logger.new($stderr)

# Turns a site-relative path into an absolute fanfiction.net URL.
WRAP = ->(path) { "http://www.fanfiction.net#{path}" }

# Name of the Redis set that records which paths have been handled, suffixed
# with the Unix time at which this file was loaded.
SEEN_KEY = "seen_#{Time.now.to_i}"
# Reports whether the "<root>_cache_control" marker key is present in Redis,
# i.e. whether the stored result for +root+ may be reused without refetching.
#
# root  - site-relative listing path (String).
# redis - Redis client; must respond to +exists?+ or +exists+.
#
# Returns true or false.
def within_cache_threshold?(root, redis)
  key = "#{root}_cache_control"

  # Newer redis-rb clients provide a boolean-returning predicate.
  return redis.exists?(key) if redis.respond_to?(:exists?)

  # Plain +exists+ returns true/false on old clients and a key count on
  # newer ones. A count of 0 is truthy in Ruby, so rule it out explicitly.
  found = redis.exists(key)
  found && found != 0
end
# Returns the story IDs and category paths linked from the listing page at
# +root+, as a two-element array: [stories, categories].
#
# A "<root>_working" key is set in Redis as a claim so that only one worker
# handles a given root at a time. The claim is released in an +ensure+, so
# an exception from the fetch, the parsing or Redis no longer leaves the
# claim behind (which made every later attempt skip the root as "already
# claimed").
#
# root  - site-relative listing path (String), e.g. "/anime/".
# agent - HTTP agent responding to +get+ (a Mechanize agent in this script).
# redis - Redis client.
#
# Returns [[], []] when the root is claimed by someone else, was already
# handled during this run, or the response status is neither 200 nor 304.
# Errors raised by +agent+ or +redis+ propagate to the caller.
def stories_and_categories_of(root, agent, redis)
  working_key = "#{root}_working"

  unless redis.setnx(working_key, 1)
    # someone's already working on it
    $LOG.debug("#{root} already claimed; not fetching it again")
    return [[], []]
  end

  begin
    return [[], []] if redis.sismember(SEEN_KEY, root)

    if within_cache_threshold?(root, redis)
      return cached_listing(root, redis, 'Using cached result')
    end

    page = fetch_listing_page(root, agent, redis)

    case page.code.to_i
    when 304
      cached_listing(root, redis, 'Received 304, using cached result')
    when 200
      stories, categories = extract_listing(page, root)
      store_listing(root, page, stories, categories, redis)
      $LOG.info("Found #{categories.length} categories, #{stories.length} stories from #{root}")
      # wait a bit to be less of an ass
      sleep rand(3)
      [stories, categories]
    else
      $LOG.warn("GET #{root} returned status #{page.code}; returning empty sets for now.")
      [[], []]
    end
  ensure
    # Runs on every exit path once the claim was acquired, including errors.
    # By this point the root is already in SEEN_KEY on the success paths, so
    # releasing the claim last cannot cause a duplicate fetch.
    redis.del working_key
  end
end

# Reads the stored stories and categories for +root+, logs the hit with
# +reason+ as the message prefix, and marks the root as seen.
def cached_listing(root, redis, reason)
  stories = redis.smembers "#{root}_stories"
  categories = redis.smembers "#{root}_categories"
  $LOG.info("#{reason} for #{root}: #{stories.length} stories, #{categories.length} categories")
  redis.sadd SEEN_KEY, root
  [stories, categories]
end

# Fetches +root+, sending If-Modified-Since when a Last-Modified value was
# stored for it by an earlier fetch.
def fetch_listing_page(root, agent, redis)
  last_seen = redis.hget 'last_modified', root
  return agent.get(WRAP[root]) unless last_seen

  agent.get WRAP[root], {}, nil, { 'If-Modified-Since' => last_seen }
end

# Splits the links on +page+ into [story IDs, category paths].
def extract_listing(page, root)
  # Category links show up under #list_output,
  # story links under #myform.
  links = (page/'#list_output a') + (page/'#myform a')

  hrefs = links.map do |link|
    href = link.attribute('href')
    if href.nil?
      $LOG.warn("Found link without href on #{root}: #{link.inspect}; ignoring that link.")
      next
    end
    href.text
  end.compact

  story_links, categories = hrefs.partition { |h| h =~ %r{/s/.+} }

  # Remove profile and review links.
  categories.reject! { |c| c =~ %r{/r/.+} || c =~ %r{/u/.+} }

  # Filter chapter designations out of story links, keeping only the
  # numeric ID. Links under /s/ without a numeric ID are dropped rather
  # than being passed on as nil.
  stories = story_links.map { |s| s[%r{/s/(\d+)}, 1] }.compact

  [stories, categories]
end

# Replaces the stored result for +root+ with +stories+ and +categories+ and
# records the response metadata, all inside one MULTI transaction.
def store_listing(root, page, stories, categories, redis)
  # NOTE(review): commands are issued on +redis+ itself inside the block, as
  # before; newer redis-rb releases expect the yielded transaction object —
  # confirm against the installed client version.
  redis.multi do
    redis.hset 'last_modified', root, page.response['last-modified']
    redis.del "#{root}_stories"
    redis.del "#{root}_categories"
    stories.each { |s| redis.sadd "#{root}_stories", s }
    categories.each { |c| redis.sadd "#{root}_categories", c }
    # TODO: actually read the Cache-Control header
    # for now, though, set it so that we don't ravage the servers
    redis.set "#{root}_cache_control", "s"
    redis.expire "#{root}_cache_control", 259200
    redis.sadd SEEN_KEY, root
  end
end
# Adds +story_link+ to the 'stories' Redis set and returns whatever the
# client's +sadd+ returns.
#
# story_link - value to record (the crawler passes story IDs).
# redis      - Redis client.
def save_story(story_link, redis)
  redis.sadd('stories', story_link)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment