Skip to content

Instantly share code, notes, and snippets.

@sheepeeh
Last active August 29, 2015 13:58
Show Gist options
  • Save sheepeeh/10282373 to your computer and use it in GitHub Desktop.
Save sheepeeh/10282373 to your computer and use it in GitHub Desktop.
For a given TXT file of URLs, download PDFs from archive.org
require 'mechanize'
require 'open-uri'
# Usage: download_from_ia Login Password
# Expects files to be named [name]_urls.txt. Change the File.basename call in get_pdf for a different naming scheme.
# Downloads the PDF behind every archive.org URL listed in +from_file+.
#
# Logs in to archive.org with the credentials given on the command line
# (ARGV[0] and ARGV[1]), then for each non-blank line of +from_file+ follows
# the page's "HTTPS" link and saves the first link whose text contains ".pdf"
# into a directory named after +from_file+ (its basename without the
# "_urls.txt" suffix). A failure for a single URL is printed, appended to
# ia_pdf_downloads.log, and the run continues with the next URL.
#
# Aborts the process with a usage message unless exactly two command-line
# arguments were given.
#
# @param from_file [String] path to a text file holding one URL per line
# @return [nil]
def get_pdf(from_file)
  abort "#{$0} email password" if ARGV.size != 2

  # Create download directory
  target_dir = File.basename(from_file, "_urls.txt")
  # Dir.exists? was removed in Ruby 3.2; Dir.exist? is the supported spelling.
  Dir.mkdir("./#{target_dir}") unless Dir.exist?(target_dir)

  # Create Mechanize agent, refresh after login
  agent = Mechanize.new { |m| m.follow_meta_refresh = true }

  # Block form closes the log even when an unexpected error escapes the loop.
  File.open("ia_pdf_downloads.log", "a") do |log|
    # Single place that reports a per-record failure to the console and the log.
    report = lambda do |url, problem|
      puts "Mechanize encountered an error: #{problem} \nMoving on to next record."
      log.puts "#{Time.now}: Mechanize encountered an error for #{url}: #{problem}"
    end

    puts "Logging in..."
    home_page = agent.get('https://archive.org/')
    # Click login link
    signin_page = agent.click(home_page.link_with(:href => /login\.php/))
    # Submit login form
    signin_page.form_with(:class => 'iaform') do |form|
      form.username = ARGV[0]
      form.password = ARGV[1]
    end.submit
    puts "Logged in."

    # Set the fallback parser to Download once, so the PDF responses fetched
    # below come back as saveable Mechanize::Download objects.
    agent.pluggable_parser.default = Mechanize::Download

    # Open file and download PDFs from listed URLs
    File.foreach(from_file) do |line|
      # Drop the trailing newline and skip blank lines instead of requesting them.
      url = line.strip
      next if url.empty?

      begin
        https_link = agent.get(url).link_with(:text => 'HTTPS')
        if https_link.nil?
          report.call(url, "no 'HTTPS' link found on the page")
          next
        end

        listing = agent.click(https_link)
        puts "Downloading from #{url}"

        pdf_link = listing.link_with(:text => /\.pdf/)
        if pdf_link.nil?
          report.call(url, "no PDF link found in the HTTPS listing")
          next
        end

        # click already returns the fetched file; save it directly rather
        # than issuing a second request for the same document.
        pdf_file = agent.click(pdf_link)
        Dir.chdir(target_dir) { pdf_file.save }
      rescue Mechanize::ResponseCodeError,
             Net::HTTP::Persistent::Error,
             OpenSSL::SSL::SSLError => e
        # Log errors
        report.call(url, e)
      end
    end
    puts "Done."
  end
end
# Ask for text file at command prompt
# Reads from $stdin explicitly because ARGV holds the login and password.
loop do
  puts "Please enter a TXT file or 'quit' to exit."
  answer = $stdin.gets
  # gets returns nil at end of input (Ctrl-D or an exhausted pipe); stop
  # instead of crashing on nil.chomp.
  break if answer.nil?

  answer = answer.chomp
  if answer == 'quit'
    puts "Quitting."
    break
  elsif File.extname(answer) == '.txt' && File.exist?(answer)
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    get_pdf(answer)
  else
    puts "Please enter a valid TXT file or quit to exit."
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment