Skip to content

Instantly share code, notes, and snippets.

@mackenziestarr
Created April 15, 2015 17:31
Show Gist options
  • Save mackenziestarr/d5b613d368685fa9a4bf to your computer and use it in GitHub Desktop.
Save mackenziestarr/d5b613d368685fa9a4bf to your computer and use it in GitHub Desktop.
Check an XML sitemap for 404s
require 'httparty'
require 'rexml/document'
require 'logger'
# HTTP client class built on the HTTParty mixin; the script calls
# Sitemap.get(url) to request each URL read from the sitemap file.
class Sitemap
  include HTTParty

  # Class-level HTTParty settings.
  base_uri('http://example.com')
  format(:xml)
end
# Tally of HTTP status codes: a Hash mapping each status code to the number
# of times it has been recorded through #push.
class Codes < Hash
  # Creates an empty tally (no default value, so unseen codes read as nil).
  def initialize
    super
  end

  # Records one occurrence of +status_code+.
  #
  # @param status_code [Object] the status code to count (used as a hash key)
  # @return [Integer] the updated count for that code
  def push(status_code)
    self[status_code] = fetch(status_code, 0) + 1
  end

  # Builds a one-line summary of every recorded code and its count, in
  # insertion order, e.g. "HTTP Request Results : 200 [2] 404 [1] ".
  #
  # @return [String] the summary line (each entry is followed by a space)
  def get_codes
    reduce('HTTP Request Results : ') do |summary, (code, count)|
      summary + "#{code} [#{count}] "
    end
  end
end
# Command-line entry point: requests every URL found in an XML sitemap and
# reports the HTTP status code of each one.
#
# Usage: ruby lint-sitemap.rb <XML file>
#
# The sitemap path is resolved relative to this script's directory. Every
# result is printed to stdout; non-200 responses and failed requests are
# written to logs/<sitemap name>.log next to this script, followed by a
# summary line of how often each status code was seen.
usage = 'usage: ruby lint-sitemap.rb <XML file>'
if ARGV.length != 1 then puts usage; exit; end

sitemap_arg = ARGV[0]
# Use File helpers instead of splitting on "." and "/": the split only worked
# for paths with exactly one directory component (e.g. "maps/site.xml") and
# rejected "site.xml", "./site.xml" and deeper paths.
if File.extname(sitemap_arg).downcase != '.xml'
  puts "#{sitemap_arg} is not a valid XML file"
  puts usage
  exit
end

# Build the log path explicitly rather than changing the working directory,
# and create the logs directory on first use so Logger.new cannot fail on it.
log_dir = File.join(__dir__, 'logs')
Dir.mkdir(log_dir) unless Dir.exist?(log_dir)
$log = Logger.new(File.join(log_dir, "#{File.basename(sitemap_arg, '.*')}.log"))

# parse (block form closes the file handle once the document is built)
sitemap_path = File.expand_path(sitemap_arg, __dir__)
doc = File.open(sitemap_path) { |xml| REXML::Document.new(xml) }
status_codes = Codes.new

# select all leaf elements (elements that have no child elements)
doc.elements.each('//*[not(*)]') do |node|
  # node.text is nil for empty elements; surrounding whitespace is common in
  # pretty-printed sitemaps and must not hide a URL.
  url = node.text.to_s.strip
  next unless url.start_with?('http')

  begin
    status = Sitemap.get(url).code
    printf("%-10s %10s\n", status, url)
  rescue URI::InvalidURIError
    $log.error "#{url} from #{ARGV[0]} is a bad URI"
    next
  rescue StandardError => e
    # A single unreachable host (DNS failure, refused connection, timeout)
    # should be recorded, not abort the check of the remaining URLs.
    $log.error "#{url} from #{ARGV[0]} could not be fetched: #{e.class}: #{e.message}"
    next
  end

  $log.warn "#{status} #{url}" if status != 200
  status_codes.push(status)
end

summary = status_codes.get_codes
puts summary
$log.info summary
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment