soup backup
#!/usr/bin/env ruby
# frozen_string_literal: true
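
# Usage (a sketch of the two accepted invocations; "myblog" is a
# hypothetical blog name):
#   ruby soup_backup.rb myblog                 # fetches https://myblog.soup.io
#   ruby soup_backup.rb https://myblog.soup.io # a full base URL also works
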
require "nokogiri"
require "httparty"
require "pathname"
require "fileutils"
class RetryFileError < StandardError; end;
THREAD_COUNT = 6
class Downloader
  attr_accessor :base_uri, :name

  def initialize(base_url_or_name)
    if base_url_or_name.include?(".")
      # A full base URL was given, e.g. https://myblog.soup.io
      self.base_uri = URI(base_url_or_name)
      self.name = self.base_uri.host.split(".")[0]
    else
      # Only the blog name was given; if pages were already saved,
      # resume from a previously saved "since" timestamp.
      uri = "https://#{base_url_or_name}.soup.io"
      html_files = Dir.glob("#{base_url_or_name}/html/*").sort
      if html_files.count > 1
        timestamp = html_files[1].match(/_since_(\d+)/)[1]
        uri += "/since/#{timestamp}"
      end
      self.base_uri = URI(uri)
      self.name = base_url_or_name
    end
  end
  def download_file(path_prefix, url, retries = 0)
    if retries >= 4
      puts "file not available, skipping"
      return
    end

    uri = URI(url)
    if uri.relative?
      uri.scheme = self.base_uri.scheme
      uri.host = self.base_uri.host
    end

    # Flatten the URL into a single file name ("/" and ":" become "_").
    partial_path = "/" + path_prefix + "/" + uri.to_s.gsub(/[\/:]/, "_")
    path = self.name + partial_path

    if File.exist?(path)
      # puts "skipping #{uri}"
      return
    end

    # Reuse a copy already downloaded into another backup directory, if any.
    existing_copy = Dir.glob("**" + partial_path)[0]
    if existing_copy
      puts "copying existing file #{existing_copy}"
      begin
        FileUtils.cp(existing_copy, path)
      rescue ArgumentError => e
        raise unless e.message.match("same file")
      end
      return
    end

    print "downloading #{uri} "
    File.open(path, "wb") do |file|
      HTTParty.get(uri, stream_body: true) do |fragment|
        if fragment.code == 404
          print "file is missing 🧨"
        elsif fragment.code == 403
          print "access forbidden 💀"
        elsif fragment.code >= 500 && fragment.code <= 504
          print "#{fragment.code} error"
          raise RetryFileError
        elsif [301, 302].include?(fragment.code)
          print "redirect"
        elsif fragment.code == 200
          print "."
          file.write(fragment)
        else
          raise "file streaming failed #{fragment.code}"
        end
      end
      puts "*"
    end
  rescue RetryFileError, EOFError, Net::OpenTimeout, Net::ReadTimeout
    # Remove the partial file (if any) before trying again.
    FileUtils.rm_f(path)
    puts "retrying #{uri}"
    sleep 5
    download_file(path_prefix, url, retries + 1)
  rescue SocketError => e
    raise unless e.message.match("getaddrinfo")
    FileUtils.rm_f(path)
    puts "site does not exist"
  rescue Errno::ECONNREFUSED
    FileUtils.rm_f(path)
    puts "connection refused"
  rescue HTTParty::RedirectionTooDeep
    FileUtils.rm_f(path)
    puts "too many redirects"
  rescue Errno::ENAMETOOLONG
    puts "file name too long"
  end
  def download_files(infos)
    puts "#{infos.count} files to download:"
    queue = Queue.new
    infos.each { |info| queue << info }

    # Drain the queue from THREAD_COUNT worker threads.
    threads = []
    THREAD_COUNT.times do
      threads << Thread.new do
        until queue.empty?
          # Non-blocking pop: another worker may empty the queue between
          # the check above and this call, so a ThreadError becomes nil.
          prefix, url = queue.pop(true) rescue nil
          download_file(prefix, url) if url
        end
      end
    end
    threads.each(&:join)
  end
  def download_page(uri, retries = 0)
    if retries >= 10
      puts "Cannot access #{uri}"
      return
    end

    response = HTTParty.get(uri, follow_redirects: false)
    if response.code >= 300 && response.code < 400
      redirect_url = response.headers["location"]
      # puts "redirected to: #{redirect_url}"
      return download_page(URI(redirect_url), retries)
    elsif response.code != 200
      seconds = retries * 3 + 4
      puts "retry in #{seconds}s..."
      sleep(seconds)
      return download_page(uri, retries + 1)
    end

    simple_uri = uri.dup
    simple_uri.query = nil
    puts "\n🔥 Downloaded #{simple_uri}"
    body = response.body

    # Save the page
    File.write("#{self.name}/html/#{simple_uri.to_s.gsub(/[\/:]/, "_")}", body)

    html = Nokogiri::HTML(body)
    infos = []

    # Save images
    html.xpath('//img[@src]').each do |image|
      infos << ["images", image[:src]]
    end

    # Save large images
    html.xpath('//a[@class="lightbox" and @href]').each do |lightbox|
      infos << ["images", lightbox[:href]]
    end

    # Save CSS
    html.xpath('//link[@rel="stylesheet"]').each do |link|
      infos << ["css", link[:href]]
    end

    # Save videos
    html.xpath('//video[@src]').each do |video|
      infos << ["videos", video[:src]]
    end

    # Save audio
    html.xpath('//audio[@src]').each do |audio|
      infos << ["audio", audio[:src]]
    end

    download_files(infos)

    # Return the URI of the next page, or nil when there are no more pages.
    next_link = html.xpath('//a[@name="more"]').first
    if next_link
      puts "page done, next page: #{next_link[:href]}"
      next_uri = URI(next_link[:href])
      if next_uri.relative?
        next_uri.scheme = self.base_uri.scheme
        next_uri.host = self.base_uri.host
      end
      next_uri
    end
  rescue Net::OpenTimeout, Net::ReadTimeout
    puts "timeout error"
    seconds = retries * 4 + 1
    puts "retry in #{seconds}s..."
    sleep(seconds)
    return download_page(uri, retries + 1)
  end
  def run
    puts "Downloading #{self.name}"
    FileUtils.mkdir_p(self.name + "/css")
    FileUtils.mkdir_p(self.name + "/images")
    FileUtils.mkdir_p(self.name + "/videos")
    FileUtils.mkdir_p(self.name + "/audio")
    FileUtils.mkdir_p(self.name + "/html")

    # Follow "more" links page by page; "\a" rings the terminal bell when done.
    uri = self.base_uri
    loop do
      uri = download_page(uri)
      unless uri
        puts "Done\a"
        break
      end
    end
  end
end

abort "usage: #{$PROGRAM_NAME} <blog name or base URL>" if ARGV[0].nil?

Downloader.new(ARGV[0]).run
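
# Everything lands under ./<blog name>/ in html/, css/, images/, videos/ and
# audio/ subdirectories; re-running with the same name skips already-saved
# files and resumes paging from a saved "since" timestamp.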