Skip to content

Instantly share code, notes, and snippets.

@lasombra
Created August 22, 2014 12:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save lasombra/a489f715985715663595 to your computer and use it in GitHub Desktop.
Save lasombra/a489f715985715663595 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# We require...
require "httpclient"
require "json"
# Startup variables
# Advanced-search endpoint on archive.org; the query below asks for the
# "identifier" field of items in the "harvardclassicsbound" collection,
# as JSON, page 1 with 60 rows.
url = "http://archive.org/advancedsearch.php"
query = {
  "q" => 'collection:"harvardclassicsbound" AND (collection:harvardclassicsbound)',
  "output" => "json",
  "rows" => "60",
  "fl[]" => "identifier",
  "save" => "yes",
  "page" => "1"
}

http = HTTPClient.new
# NOTE(review): this turns off TLS certificate verification for every request
# made through this client, including the https downloads issued later in the
# script. Kept for backward compatibility; consider removing it once the local
# CA bundle is confirmed to validate archive.org.
http.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE

# get_content (rather than get) follows redirects and raises
# HTTPClient::BadResponseError on a non-success status, so an error page is
# reported as an HTTP failure instead of surfacing as a JSON parse error.
search_body = http.get_content(url, query)
json_response = JSON.parse(search_body)

# fetch raises a KeyError naming the missing key if the payload does not have
# the expected {"response" => {"docs" => [...]}} shape.
doc_list = json_response.fetch("response").fetch("docs")
# Formats to try for each item, in order of preference:
# [file extension, label used in the progress messages].
formats = [%w[epub EPUB], %w[pdf PDF], %w[txt TXT], %w[djvu DjVu]]

# Download every item of doc_list into the current directory, saving it as
# "<identifier>.<extension>" in the first format that can be fetched.
doc_list.each do |doc|
  doc_id = doc["identifier"]
  # Download URL format: https://archive.org/download/identifier/identifier.<extension>
  puts "Downloading https://archive.org/download/#{doc_id}/#{doc_id}.epub"

  formats.each_with_index do |(extension, label), index|
    file_name = "#{doc_id}.#{extension}"
    begin
      # Fetch the whole body before touching the disk, so a failed download
      # never creates an empty placeholder file that has to be deleted again.
      content = http.get_content("https://archive.org/download/#{doc_id}/#{file_name}")
      # binwrite opens, writes and closes the file in one call, so no file
      # handle is left open between iterations.
      File.binwrite(file_name, content)
      break
    rescue StandardError => e
      _, next_label = formats[index + 1]
      if next_label
        puts "Failed to download #{label}. Trying the #{next_label} version..."
      else
        # Every format failed: report it and carry on with the next item
        # instead of aborting the remaining downloads.
        warn "Failed to download #{label}. No format available for #{doc_id}, skipping (#{e.message})"
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment