To run this script:
- Add `crawl_broken_urls.rb` to your directory.
- At the same directory level, add a `broken_urls.yml` file containing the list of URLs to check.
- Then run `ruby crawl_broken_urls.rb`.
require 'yaml'
require 'pry'
require 'rb-readline'
require 'net/http'
require 'uri'
require 'timeout'

# Load the list of URLs to check. Expected to be a YAML sequence of
# URL strings in the same directory as this script.
broken_urls = YAML.load_file("./broken_urls.yml")
# Fetches +uri_str+ with a GET request, following HTTP redirects up to
# +limit+ hops.
#
# Returns the final Net::HTTPResponse on success (or on a redirect that
# is deliberately not followed), the string 'HTTPError' for any other
# status (including a timed-out request, which yields a nil response),
# or the string "TimeoutError" if inspecting the response raised.
#
# Raises ArgumentError when the redirect chain exceeds the limit.
def fetch(uri_str, limit = 10)
  default_error = 'HTTPError'
  raise ArgumentError, 'HTTP redirect too deep' if limit == 0

  url = URI.parse(uri_str)
  # request_uri preserves "/" for bare hosts and keeps query strings;
  # url.path alone is "" for e.g. "http://example.com", which makes
  # Net::HTTP::Get.new raise.
  req = Net::HTTP::Get.new(url.request_uri)
  response = Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == 'https') do |http|
    begin
      # Cap each request at 3 seconds so one dead host can't stall the run.
      Timeout.timeout(3) { http.request(req) }
    rescue Timeout::Error
      puts 'That took too long, exiting...'
      nil # fall through to the else branch below -> default_error
    end
  end

  begin
    case response
    when Net::HTTPSuccess
      response
    when Net::HTTPRedirection
      # BUG FIX: the original wrote `when (Net::HTTPRedirection &&
      # response.code != '404')`, which collapses to a boolean; since
      # `true === response` never matches, redirects always fell through
      # to the else branch. Match on the class itself instead.
      # Follow the redirect unless it points back at the same URL or is
      # a 302 (302s are reported rather than followed, as before).
      if (uri_str != response['location']) && (response.code != '302')
        fetch(response['location'], limit - 1)
      else
        response
      end
    else
      default_error
    end
  rescue
    "TimeoutError"
  end
end
# Report the fetch result for every URL listed in broken_urls.yml.
broken_urls.each do |url|
  # Debugging alternatives kept from development:
  # puts "#{url}"
  # puts `curl -I #{url}`
  puts "#{url}: #{fetch(url)}"
end