Created
December 5, 2011 05:53
-
-
Save hannahwhy/1432483 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Third-party gems used by the crawler, plus the stdlib threading primitives.
require 'connection_pool'
require 'girl_friday'
require 'mechanize'
require 'redis'
require 'thread'

# Sibling "support" file, resolved relative to this script; presumably the
# file that defines $LOG, SEEN_KEY and the scraping helpers used below.
require File.expand_path('../support', __FILE__)
# Site-relative paths the crawl is seeded with: one listing per media type,
# followed by the crossover listing for the same media type.
ROOTS = %w[
  /anime/
  /book/
  /cartoon/
  /comic/
  /game/
  /misc/
  /movie/
  /play/
  /tv/
  /crossovers/anime/
  /crossovers/book/
  /crossovers/cartoon/
  /crossovers/comic/
  /crossovers/game/
  /crossovers/misc/
  /crossovers/movie/
  /crossovers/play/
  /crossovers/tv/
].freeze
# Pool of four Mechanize agents. Each agent is created with max_history set
# to 0 so that it does not accumulate previously visited pages.
crawler_pool = ConnectionPool.new(:size => 4) do
  Mechanize.new.tap do |agent|
    agent.max_history = 0
  end
end
# Pool of ten Redis clients (default connection settings); also handed to the
# work queue below as its backing store.
redis_pool = ConnectionPool.new(:size => 10) { Redis.new }
# Work queue with four workers, persisted through the Redis pool. Each job is
# a site-relative path (+root+): the worker scrapes it, records every story
# found, and enqueues every category found so the crawl fans out.
discovery = GirlFriday::WorkQueue.new(:discovery, :size => 4, :store => GirlFriday::Store::Redis, :store_config => { :pool => redis_pool }) do |root|
  begin
    redis_pool.with_connection do |redis|
      stories, categories = crawler_pool.with_connection do |agent|
        stories_and_categories_of(root, agent, redis)
      end

      stories.each { |story| save_story(story, redis) }
      categories.each { |category| discovery << category }
    end
  rescue StandardError => e
    # Only ordinary errors are retried. Rescuing Exception here would also
    # swallow NoMemoryError, SystemExit and friends inside the worker.
    $LOG.error("Exception #{e.class} (#{e.message}) raised while scraping #{root}; requeuing.")
    discovery << root
  end
end
# On Ctrl-C: log, shut the work queue down, and exit with status 1.
trap 'INT' do
  # Ruby 2.0+ forbids locking a Mutex inside a trap handler, and Logger locks
  # one on every write (the queue shutdown may as well). Do the work on a
  # short-lived thread and wait for it before exiting.
  Thread.new do
    $LOG.info("SIGINT received, terminating")
    discovery.shutdown
  end.join
  exit 1
end
# Clear this run's "seen" set before any work is queued.
redis_pool.with_connection do |redis|
  redis.del SEEN_KEY
end

# Seed the queue with every top-level listing.
ROOTS.each { |root| discovery << root }

# Keep the main thread alive; the process ends through the INT handler.
# (Presumably the queue does its work on background threads.)
loop { sleep 5 }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gem dependencies of the crawler. Fetch them over TLS rather than plain HTTP.
# NOTE(review): versions are unpinned; the crawler calls
# ConnectionPool#with_connection, which newer connection_pool releases may no
# longer provide -- confirm and pin versions accordingly.
source "https://rubygems.org"

gem 'connection_pool'
gem 'girl_friday'
gem 'mechanize'
gem 'redis'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'logger'

# Process-wide logger; all output goes to standard error.
$LOG = Logger.new($stderr)

# Turns a site-relative path into an absolute fanfiction.net URL.
WRAP = lambda { |p| "http://www.fanfiction.net#{p}" }

# Name of the Redis set that records which roots have been handled. The
# timestamp taken at load time makes the name differ between process starts
# (at one-second resolution).
now = Time.now.to_i
SEEN_KEY = "seen_#{now}".freeze
# Reports whether the "#{root}_cache_control" marker key currently exists in
# Redis, i.e. whether a previously stored result for +root+ may be reused.
#
# root  - site-relative path whose marker is checked.
# redis - client responding to #exists(key).
#
# Returns true or false. Depending on the redis-rb version, #exists answers
# with a boolean or with an Integer count; 0 is truthy in Ruby, so the answer
# is normalised here instead of being returned as-is.
def within_cache_threshold?(root, redis)
  found = redis.exists("#{root}_cache_control")
  found.is_a?(Integer) ? found > 0 : !!found
end
# Resolves +root+ (a site-relative path such as "/anime/") to the story ids
# and category paths linked from that page.
#
# root  - site-relative path of the listing page.
# agent - HTTP client responding to #get the way Mechanize does.
# redis - Redis client used for the per-root lock, the cached link sets, the
#         stored Last-Modified values and the SEEN_KEY set.
#
# Returns a two-element array [stories, categories]. Both are empty when
# another worker holds the "#{root}_working" lock, when +root+ is already a
# member of SEEN_KEY, or when the response status is neither 200 nor 304.
#
# Errors raised by the HTTP client or by Redis propagate to the caller. The
# lock is released on every exit path, including those errors; otherwise a
# requeued root would be reported as "already claimed" forever.
def stories_and_categories_of(root, agent, redis)
  working_key = "#{root}_working"

  unless redis.setnx(working_key, 1)
    # someone's already working on it
    $LOG.debug("#{root} already claimed; not fetching it again")
    return [[], []]
  end

  begin
    return [[], []] if redis.sismember(SEEN_KEY, root)

    if within_cache_threshold?(root, redis)
      stories, categories = cached_links_for(root, redis)
      $LOG.info("Using cached result for #{root}: #{stories.length} stories, #{categories.length} categories")
      redis.sadd SEEN_KEY, root
      return [stories, categories]
    end

    # Conditional GET when a usable Last-Modified value was stored earlier.
    last_seen = redis.hget 'last_modified', root
    page = if last_seen && !last_seen.empty?
             agent.get WRAP[root], {}, nil, { 'If-Modified-Since' => last_seen }
           else
             agent.get WRAP[root]
           end

    case page.code.to_i
    when 304
      stories, categories = cached_links_for(root, redis)
      $LOG.info("Received 304, using cached result for #{root}: #{stories.length} stories, #{categories.length} categories")
      redis.sadd SEEN_KEY, root
      [stories, categories]
    when 200
      stories, categories = story_ids_and_category_paths(page, root)
      store_crawl_result(root, page, stories, categories, redis)
      $LOG.info("Found #{categories.length} categories, #{stories.length} stories from #{root}")
      # wait a bit to be less of an ass
      sleep rand(3)
      [stories, categories]
    else
      $LOG.warn("GET #{root} returned status #{page.code}; returning empty sets for now.")
      [[], []]
    end
  ensure
    redis.del working_key
  end
end

# Reads the story ids and category paths previously stored for +root+.
def cached_links_for(root, redis)
  [redis.smembers("#{root}_stories"), redis.smembers("#{root}_categories")]
end

# Extracts [story ids, category paths] from a fetched listing page.
def story_ids_and_category_paths(page, root)
  # Category links show up under #list_output,
  # story links under #myform.
  links = (page/'#list_output a') + (page/'#myform a')

  hrefs = links.map do |link|
    href = link.attribute('href')
    if href.nil?
      $LOG.warn("Found link without href on #{root}: #{link.inspect}; ignoring that link.")
      nil
    else
      href.text
    end
  end.compact

  story_links, categories = hrefs.partition { |h| h =~ %r{/s/.+} }

  # Remove profile and review links.
  categories = categories.reject { |c| c =~ %r{/r/.+} || c =~ %r{/u/.+} }

  # Keep only the numeric story id, dropping chapter designations. Links
  # without a numeric id are discarded rather than turned into nil, and the
  # several chapter links of one story collapse into a single id.
  stories = story_links.map { |s| s[%r{/s/(\d+)}, 1] }.compact.uniq

  [stories, categories.uniq]
end

# Replaces the stored link sets for +root+, records the response's
# Last-Modified header, arms the cache marker and marks +root+ as seen, all
# inside one MULTI block.
def store_crawl_result(root, page, stories, categories, redis)
  last_modified = page.response['last-modified']

  redis.multi do |txn|
    # Some redis-rb versions yield a transaction object that must receive the
    # commands; when nothing is yielded, fall back to the client itself.
    txn ||= redis

    # A response without the header must not store nil as the timestamp.
    txn.hset 'last_modified', root, last_modified if last_modified
    txn.del "#{root}_stories"
    txn.del "#{root}_categories"
    stories.each { |s| txn.sadd "#{root}_stories", s }
    categories.each { |c| txn.sadd "#{root}_categories", c }

    # TODO: actually read the Cache-Control header
    # for now, though, set it so that we don't ravage the servers
    txn.set "#{root}_cache_control", "s"
    txn.expire "#{root}_cache_control", 259200 # three days, in seconds
    txn.sadd SEEN_KEY, root
  end
end
# Records a discovered story by adding +story_link+ to the Redis set named
# "stories".
#
# story_link - value to store (the callers in this file pass story ids).
# redis      - client responding to #sadd(key, member).
#
# Returns whatever the client's #sadd returns.
def save_story(story_link, redis)
  redis.sadd('stories', story_link)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment