Skip to content

Instantly share code, notes, and snippets.

@sstephenson
Created May 6, 2009 00:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sstephenson/107296 to your computer and use it in GitHub Desktop.
Save sstephenson/107296 to your computer and use it in GitHub Desktop.
# A library for downloading issues of Harpers from the archives.
#
# To use, log in to the archives from your web browser. Then set the
# HARPERS_ARCHIVE_COOKIE environment variable to the value of the .harpers.org
# "archive" cookie, which you can find by visiting the following URL while on
# the Harpers website:
#
# javascript:prompt("HARPERS_ARCHIVE_COOKIE",document.cookie.match(/(?:;|^)\s*archive=(.+?)(?:;|$)/)[1])
#
# Example:
# % HARPERS_ARCHIVE_COOKIE=6q626...PFHOF irb -r harpers
# >> issue = Harpers::Issue.new(2001, 1)
# => #<Harpers::Issue http://harpers.org/archive/2001/01>
# >> issue.download!
#
# Requires Hpricot (gem install hpricot).
#
require "net/http"
require "hpricot"
require "fileutils"
module Harpers
HARPERS_BASE_URL = "http://harpers.org/"
ARCHIVE_COOKIE = ENV["HARPERS_ARCHIVE_COOKIE"]
SAFARI_USER_AGENT = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/528.1 (KHTML, like Gecko) Version/4.0 Safari/528.1"
class Record
class << self
# Sleep for a random number of seconds below the current maximum
# delay, to avoid hammering the archive server. Does nothing unless
# a maximum has been set via with_random_delay.
def perform_random_delay
return unless @max_delay
pause = rand(@max_delay)
puts "pausing for #{pause} seconds"
sleep pause
end
# Run the block with throttling enabled: every fetch performed inside
# it first pauses for up to max_delay seconds. The previous setting is
# restored afterwards, even if the block raises.
def with_random_delay(max_delay)
previous_max_delay = @max_delay
@max_delay = max_delay
yield
ensure
@max_delay = previous_max_delay
end
end
# Absolute URI under the Harper's site built from the given path
# segments.
def url(*path)
URI.parse(File.join(HARPERS_BASE_URL, *path))
end
def inspect
"#<#{self.class.name} #{url}>"
end
protected
# Fetch the resource body over HTTP and memoize it, pausing first when
# throttling is active (see with_random_delay).
def contents
@contents ||= begin
Record.perform_random_delay
Net::HTTP.start(url.host, url.port) do |http|
http.request_get(url.path, headers).body
end
end
end
# Request headers: present a Safari user agent plus the archive
# session cookie taken from the environment.
def headers
{
"User-Agent" => SAFARI_USER_AGENT,
"Cookie" => "archive=#{ARCHIVE_COOKIE}"
}
end
end
class Issue < Record
attr_reader :year, :month
# year: e.g. 2001; month: 1-12 (stored as a zero-padded string, e.g.
# "01", to match the archive's URL scheme).
def initialize(year, month)
@year = year.to_s
@month = format("%02d", month)
end
# Archive index page for this issue.
def url
super("archive", year, month)
end
# One Page per thumbnail link on the issue's archive page. The page
# number is the trailing numeric segment of each link's href.
def pages
return @pages if @pages
document = Hpricot(contents)
@pages = document.search("div#thumbnails a").map do |anchor|
page_number = anchor.attributes["href"][/\/(\d+)$/, 1]
Page.new(self, page_number, anchor.attributes["title"])
end
end
# Download every page's PDF, pausing for up to a minute between
# requests (see Record.with_random_delay).
def download!
Record.with_random_delay(60) do
pages.each_with_index do |page, index|
page.download!
puts "Downloaded page #{index + 1} of #{pages.length}"
end
end
end
end
class Page < Record
attr_reader :issue, :number, :title
# issue: the parent Issue; number: this page's number within the
# issue (stored zero-padded to four digits, matching the archive's
# PDF filenames); title: the title text from the thumbnail link.
def initialize(issue, number, title)
@issue = issue
# "%04d" zero-pads the number; the original "%04s" pads with SPACES
# in Ruby's sprintf, producing broken names like "  12.pdf".
@number = "%04d" % number.to_i
@title = title
end
# URL of this page's PDF in the archive.
def url
super("media", "pages", issue.year, issue.month, "pdf", "#{number}.pdf")
end
# Local destination: archives/YYYY/MM/NNNN.pdf next to this file.
def path
File.join(File.dirname(__FILE__), "archives", issue.year, issue.month, "#{number}.pdf")
end
# File.exist? — File.exists? is deprecated and removed in Ruby 3.2.
def downloaded?
File.exist?(path)
end
# Fetch the PDF and write it to #path, creating directories as needed.
# Skips pages that are already on disk.
def download!
return if downloaded?
FileUtils.mkdir_p(File.dirname(path))
contents = self.contents
# Binary mode: PDF bytes must not go through newline/encoding
# conversion.
File.open(path, "wb") { |file| file << contents }
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment