Skip to content

Instantly share code, notes, and snippets.

@sstephenson
Created May 6, 2009 00:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sstephenson/107296 to your computer and use it in GitHub Desktop.
Save sstephenson/107296 to your computer and use it in GitHub Desktop.
# A library for downloading issues of Harpers from the archives.
#
# To use, log in to the archives from your web browser. Then set the
# HARPERS_ARCHIVE_COOKIE environment variable to the value of the .harpers.org
# "archive" cookie, which you can find by visiting the following URL while on
# the Harpers website:
#
# javascript:prompt("HARPERS_ARCHIVE_COOKIE",document.cookie.match(/(?:;|^)\s*archive=(.+?)(?:;|$)/)[1])
#
# Example:
# % HARPERS_ARCHIVE_COOKIE=6q626...PFHOF irb -r harpers
# >> issue = Harpers::Issue.new(2001, 1)
# => #<Harpers::Issue http://harpers.org/archive/2001/01>
# >> issue.download!
#
# Requires Hpricot (gem install hpricot).
#
require "net/http"
require "hpricot"
require "fileutils"
module Harpers
HARPERS_BASE_URL = "http://harpers.org/"
ARCHIVE_COOKIE = ENV["HARPERS_ARCHIVE_COOKIE"]
SAFARI_USER_AGENT = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/528.1 (KHTML, like Gecko) Version/4.0 Safari/528.1"
class Record
class << self
# Sleep for a random number of seconds below the current maximum
# delay, to avoid hammering the archive server. Does nothing unless
# a maximum has been set via with_random_delay.
def perform_random_delay
return unless @max_delay
pause = rand(@max_delay)
puts "pausing for #{pause} seconds"
sleep pause
end
# Run the block with throttling enabled: every fetch performed inside
# it first pauses for up to max_delay seconds. The previous setting is
# restored afterwards, even if the block raises.
def with_random_delay(max_delay)
previous_max_delay = @max_delay
@max_delay = max_delay
yield
ensure
@max_delay = previous_max_delay
end
end
# Absolute URI under the Harper's site built from the given path
# segments.
def url(*path)
URI.parse(File.join(HARPERS_BASE_URL, *path))
end
def inspect
"#<#{self.class.name} #{url}>"
end
protected
# Fetch the resource body over HTTP and memoize it, pausing first when
# throttling is active (see with_random_delay).
def contents
@contents ||= begin
Record.perform_random_delay
Net::HTTP.start(url.host, url.port) do |http|
http.request_get(url.path, headers).body
end
end
end
# Request headers: present a Safari user agent plus the archive
# session cookie taken from the environment.
def headers
{
"User-Agent" => SAFARI_USER_AGENT,
"Cookie" => "archive=#{ARCHIVE_COOKIE}"
}
end
end
class Issue < Record
attr_reader :year, :month
# year: e.g. 2001; month: 1-12 (stored as a zero-padded string, e.g.
# "01", to match the archive's URL scheme).
def initialize(year, month)
@year = year.to_s
@month = format("%02d", month)
end
# Archive index page for this issue.
def url
super("archive", year, month)
end
# One Page per thumbnail link on the issue's archive page. The page
# number is the trailing numeric segment of each link's href.
def pages
return @pages if @pages
document = Hpricot(contents)
@pages = document.search("div#thumbnails a").map do |anchor|
page_number = anchor.attributes["href"][/\/(\d+)$/, 1]
Page.new(self, page_number, anchor.attributes["title"])
end
end
# Download every page's PDF, pausing for up to a minute between
# requests (see Record.with_random_delay).
def download!
Record.with_random_delay(60) do
pages.each_with_index do |page, index|
page.download!
puts "Downloaded page #{index + 1} of #{pages.length}"
end
end
end
end
class Page < Record
attr_reader :issue, :number, :title
# issue: the parent Issue; number: this page's number within the
# issue (stored zero-padded to four digits, matching the archive's
# PDF filenames); title: the title text from the thumbnail link.
def initialize(issue, number, title)
@issue = issue
# "%04d" zero-pads the number; the original "%04s" pads with SPACES
# in Ruby's sprintf, producing broken names like "  12.pdf".
@number = "%04d" % number.to_i
@title = title
end
# URL of this page's PDF in the archive.
def url
super("media", "pages", issue.year, issue.month, "pdf", "#{number}.pdf")
end
# Local destination: archives/YYYY/MM/NNNN.pdf next to this file.
def path
File.join(File.dirname(__FILE__), "archives", issue.year, issue.month, "#{number}.pdf")
end
# File.exist? — File.exists? is deprecated and removed in Ruby 3.2.
def downloaded?
File.exist?(path)
end
# Fetch the PDF and write it to #path, creating directories as needed.
# Skips pages that are already on disk.
def download!
return if downloaded?
FileUtils.mkdir_p(File.dirname(path))
contents = self.contents
# Binary mode: PDF bytes must not go through newline/encoding
# conversion.
File.open(path, "wb") { |file| file << contents }
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment