Created
July 20, 2020 16:08
-
-
Save CvX/c4d939a300e2edbd3a7fbbea84abc49c to your computer and use it in GitHub Desktop.
Soup.io blog backup script — archives a blog's pages and media to local disk.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# frozen_string_literal: true | |
require "nokogiri" | |
require "httparty" | |
require "pathname" | |
require "fileutils" | |
# Raised when an asset download hits a transient 5xx error and should be
# retried after a short back-off (see Downloader#download_file).
class RetryFileError < StandardError; end

# Number of parallel download worker threads.
THREAD_COUNT = 6
# Archives a soup.io blog to disk: walks the paginated HTML and saves every
# referenced image, stylesheet, video, and audio file under "<name>/<kind>/".
class Downloader
  attr_accessor :base_uri, :name

  # base_url_or_name: either a full URL ("https://foo.soup.io") or a bare
  # blog name ("foo"). For a bare name, previously saved html snapshots on
  # disk are used to resume from the last downloaded page.
  def initialize(base_url_or_name)
    if base_url_or_name.include?(".")
      self.base_uri = URI(base_url_or_name)
      self.name = base_uri.host.split(".")[0]
    else
      uri = "https://#{base_url_or_name}.soup.io"
      html_files = Dir.glob("#{base_url_or_name}/html/*").sort
      if html_files.count > 1
        # Resume: saved page filenames embed a "_since_<timestamp>"
        # pagination marker; pick up where the previous run stopped.
        timestamp = html_files[1].match(/_since_(\d+)/)[1]
        uri += "/since/#{timestamp}"
      end
      self.base_uri = URI(uri)
      self.name = base_url_or_name
    end
  end

  # Downloads one asset into "<name>/<path_prefix>/". Skips files that are
  # already present, reuses an identical file found under another blog's
  # directory, and retries transient failures up to 4 times.
  def download_file(path_prefix, url, retries = 0)
    if retries >= 4
      puts "file not available, skipping"
      return
    end
    uri = URI(url)
    if uri.relative?
      uri.scheme = base_uri.scheme
      uri.host = base_uri.host
    end
    # Flatten the URL into a filesystem-safe name.
    partial_path = "/" + path_prefix + "/" + uri.to_s.gsub(/[\/:]/, "_")
    path = name + partial_path
    if File.exist?(path)
      # puts "skipping #{uri}"
      return
    end
    # The same asset may already have been fetched for another blog; copy it
    # instead of re-downloading.
    existing_copy = Dir.glob("**" + partial_path)[0]
    if existing_copy
      puts "copying existing file #{existing_copy}"
      begin
        FileUtils.cp(existing_copy, path)
      rescue ArgumentError => e
        raise unless e.message.match("same file")
      end
      return
    end
    print "downloading #{uri} "
    unavailable = false
    File.open(path, "wb") do |file|
      HTTParty.get(uri, stream_body: true) do |fragment|
        case fragment.code
        when 404
          print "file is missing 🧨"
          unavailable = true
        when 403
          print "access forbidden 💀"
          unavailable = true
        when 500..504
          print "#{fragment.code} error"
          raise RetryFileError
        when 301, 302
          print "redirect"
        when 200
          print "."
          file.write(fragment)
        else
          raise "file streaming failed #{fragment.code}"
        end
      end
      puts "*"
    end
    # BUGFIX: a 404/403 used to leave a zero-byte file behind, which made
    # every subsequent run skip the asset as "already downloaded".
    remove_partial(path) if unavailable
  rescue RetryFileError, EOFError, Net::OpenTimeout, Net::ReadTimeout
    remove_partial(path)
    puts "retrying #{uri}"
    sleep 5
    download_file(path_prefix, url, retries + 1)
  rescue SocketError => e
    # Only DNS resolution failures are expected here; anything else is a bug.
    raise unless e.message.match("getaddrinfo")
    remove_partial(path)
    puts "site does not exist"
  rescue Errno::ECONNREFUSED
    remove_partial(path)
    puts "connection refused"
  rescue HTTParty::RedirectionTooDeep
    remove_partial(path)
    puts "too many redirects"
  rescue Errno::ENAMETOOLONG
    puts "file name too long"
  end

  # Drains [prefix, url] work items through a small pool of worker threads
  # (soup.io tolerates a handful of parallel connections).
  def download_files(infos)
    puts "#{infos.count} files to download:"
    queue = Queue.new
    infos.each { |info| queue << info }
    workers = THREAD_COUNT.times.map do
      Thread.new do
        loop do
          # BUGFIX: the old `queue.pop(true) rescue nil` modifier swallowed
          # every StandardError, not just the expected empty-queue signal.
          # Non-blocking pop; ThreadError means another worker drained the
          # queue first.
          prefix, url = begin
            queue.pop(true)
          rescue ThreadError
            break
          end
          download_file(prefix, url) if url
        end
      end
    end
    workers.each(&:join)
  end

  # Fetches one listing page, archives its HTML, queues all referenced
  # assets for download, and returns the URI of the next page (nil when the
  # last page has been reached). Retries non-200 responses up to 10 times
  # with a linear back-off.
  def download_page(uri, retries = 0)
    if retries >= 10
      puts "Cannot access #{uri}"
      return
    end
    # Follow redirects manually so the canonical page URL is the one saved.
    response = HTTParty.get(uri, follow_redirects: false)
    if response.code >= 300 && response.code < 400
      redirect_url = response.headers['location']
      # puts "redirected to: #{redirect_url}"
      return download_page(URI(redirect_url), retries)
    elsif response.code != 200
      seconds = retries * 3 + 4
      puts "retry in #{seconds}s..."
      sleep(seconds)
      return download_page(uri, retries + 1)
    end
    simple_uri = uri.dup
    simple_uri.query = nil
    puts "\n🔥 Downloaded #{simple_uri}"
    body = response.body
    # Save the page itself under a filesystem-safe name.
    File.write("#{name}/html/#{simple_uri.to_s.gsub(/[\/:]/, "_")}", body)
    html = Nokogiri::HTML(body)
    # Collect [subdirectory, url] pairs for every embedded asset.
    infos = []
    html.xpath('//img[@src]').each { |image| infos << ["images", image[:src]] }
    # Lightbox anchors point at the full-size originals of inline images.
    html.xpath('//a[@class="lightbox" and @href]').each { |lightbox| infos << ["images", lightbox[:href]] }
    html.xpath('//link[@rel="stylesheet"]').each { |link| infos << ["css", link[:href]] }
    html.xpath('//video[@src]').each { |video| infos << ["videos", video[:src]] }
    html.xpath('//audio[@src]').each { |audio| infos << ["audio", audio[:src]] }
    download_files(infos)
    # The "more" anchor is soup.io's pagination link to the next page.
    next_link = html.xpath('//a[@name="more"]').first
    if next_link
      puts "page done, next page: #{next_link[:href]}"
      next_uri = URI(next_link[:href])
      if next_uri.relative?
        next_uri.scheme = base_uri.scheme
        next_uri.host = base_uri.host
      end
      next_uri
    end
  rescue Net::OpenTimeout, Net::ReadTimeout
    puts "timeout error"
    seconds = retries * 4 + 1
    puts "retry in #{seconds}s..."
    sleep(seconds)
    download_page(uri, retries + 1)
  end

  # Entry point: prepares the output directory tree, then walks the blog
  # page by page until no "more" link remains.
  def run
    puts "Downloading #{name}"
    %w[css images videos audio html].each do |subdir|
      FileUtils.mkdir_p(File.join(name, subdir))
    end
    uri = base_uri
    loop do
      uri = download_page(uri)
      unless uri
        puts "Done\a" # \a rings the terminal bell when the backup finishes
        break
      end
    end
  end

  private

  # Best-effort removal of a partially written file so a retry starts clean.
  def remove_partial(path)
    FileUtils.rm(path)
  rescue Errno::ENOENT
    # already gone — nothing to clean up
  end
end
Downloader.new(ARGV[0]).run |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment