Skip to content

Instantly share code, notes, and snippets.

@rweald
Created June 16, 2011 21:16
Show Gist options
  • Save rweald/1030307 to your computer and use it in GitHub Desktop.
Save rweald/1030307 to your computer and use it in GitHub Desktop.
require 'stringio'
require 'zlib'

require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/em-jack'
require 'nokogiri'
module MendeleyScraper
  # Master scraper: reads the Mendeley article sitemap index, downloads the
  # gzipped article-list sitemaps it points to, and puts the article URLs
  # found in them onto the "urls_to_scrape" beanstalk tube.
  #
  # extract_sitemap and extract_article_locations rely on em-synchrony and
  # must therefore be called from inside a running reactor and a fiber;
  # run sets both up itself.
  class MasterScraper
    # Fetches the sitemap at +sitemap_url+ and returns the text of every
    # <loc> element it contains.
    #
    # The request reads as a synchronous call; with em-synchrony the calling
    # fiber is suspended while the request is in flight.
    #
    # @param sitemap_url [String] URL of an uncompressed XML sitemap
    # @return [Array<String>] contents of the <loc> elements
    def self.extract_sitemap(sitemap_url)
      puts "extracting locs at #{sitemap_url}"
      http = EventMachine::HttpRequest.new(sitemap_url).get
      puts 'parsing the file'
      loc_contents(http.response)
    end

    # Extracts the article URLs from one gzipped article-list sitemap and
    # puts each of them on the beanstalk job queue.
    #
    # Suspends the calling fiber until every put has completed (or failed),
    # so callers can rely on the jobs having been handed to beanstalk once
    # this method returns.
    #
    # @param http [String] gzip-compressed XML body of the sitemap (despite
    #   the parameter name this is the response body, not a request object)
    # @raise [Zlib::GzipFile::Error] if +http+ is not valid gzip data
    def self.extract_article_locations(http)
      # wrap closes the reader once the block has run.
      xml = Zlib::GzipReader.wrap(StringIO.new(http)) { |gz| gz.read }
      loc_list = loc_contents(xml)

      puts "connecting to our beanstalk server"
      # A local, not shared class-level state: several invocations run
      # concurrently and each must keep using the connection it opened.
      jack = EMJack::Connection.new
      jack.use("urls_to_scrape")

      # NOTE(review): only the first 11 URLs are queued (0..10 is inclusive),
      # with up to 10 puts in flight -- looks like a development cap; confirm.
      EM::Synchrony::Iterator.new(loc_list[0..10], 10).each do |url, iter|
        Fiber.new do
          begin
            puts url
            EM::Synchrony.sync jack.put(url)
          ensure
            # Always release the iterator slot, otherwise a failing put
            # would leave the iterator (and the caller) waiting forever.
            iter.next
          end
        end.resume
      end
    end

    # Entry point: starts the reactor, walks the sitemap index and queues the
    # article URLs of each article-list sitemap, then stops the reactor.
    #
    # The index is fetched with a fiber-synchronised request; the article
    # lists are fetched with asynchronous requests, two at a time.
    def self.run
      EM.synchrony do
        article_lists = extract_sitemap("http://www.mendeley.com/sitemap-index-articles.xml")

        # NOTE(review): only the first two article lists are processed --
        # looks like a development cap; confirm before a full crawl.
        EM::Synchrony::Iterator.new(article_lists[0..1], 2).each do |url, iter|
          puts "extracting the locations of articles from #{url}"
          http = EventMachine::HttpRequest.new(url).aget
          http.callback do
            Fiber.new do
              begin
                extract_article_locations(http.response)
              rescue StandardError => e
                # One unreadable article list must not take the reactor down;
                # report it the same way a failed download is reported.
                puts "failed to extract articles list from #{url}: #{e.class}: #{e.message}"
              ensure
                # Signal completion only once the URLs have been queued.
                # Calling iter.next right after Fiber#resume would let the
                # iterator finish -- and EM.stop run -- while puts are
                # still in flight.
                iter.next
              end
            end.resume
          end
          http.errback do
            puts "failed to extract articles list from #{url}"
            iter.next
          end
        end

        EM.stop
      end
    end

    # Parses +xml+ and returns the text content of each <loc> element.
    def self.loc_contents(xml)
      Nokogiri::XML(xml).css("loc").map { |node| node.content }
    end
    private_class_method :loc_contents
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment