Skip to content

Instantly share code, notes, and snippets.

@gaborbata
Created September 22, 2017 09:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gaborbata/9d962536db88a19758c3621283ad7f03 to your computer and use it in GitHub Desktop.
Save gaborbata/9d962536db88a19758c3621283ad7f03 to your computer and use it in GitHub Desktop.
Sitemap URL Checker
#!/usr/bin/env ruby
# Sitemap URL Checker
#
# MIT License
#
# Copyright (c) 2017 Gabor Bata
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
require 'net/http'
require 'open-uri'
require 'thread'
require 'zlib'
# Number of threads for processing the URLs
THREAD_NUMBER = 2
# Response code (a String, as compared against Net::HTTPResponse#code) that counts as a success.
# Frozen so the shared constant cannot be mutated by accident.
STATUS_OK = '200'.freeze
# Lock taken by check_url while it updates the statistics and prints progress
@mutex = Mutex.new
# Maps a group name to the Result object accumulating that group's statistics
@result_group = {}
# Work queue of SitemapUrl entries waiting to be checked
@queue = Queue.new
# Running number printed in front of every checked URL
@counter = 0
# A single URL read from a sitemap, tagged with the name of the group
# under which its check result is reported.
class SitemapUrl
  attr_reader :url
  attr_reader :group_name

  # url        - the address to check
  # group_name - the statistics group the URL belongs to
  def initialize(url, group_name)
    @url, @group_name = url, group_name
  end
end
# Success/error tally for one group of URLs, with the errors additionally
# counted per response code.
class Result
  # group_name - label printed in the first column of the report row
  def initialize(group_name)
    @group_name = group_name
    @success_count = 0
    @error_count = 0
    @error_counts = {}
  end

  # Adds the given amounts to the running totals. When response_code is not
  # STATUS_OK, error_count is also added to the per-code tally.
  def update(success_count, error_count, response_code)
    @success_count += success_count
    @error_count += error_count
    return if response_code == STATUS_OK

    @error_counts[response_code] = @error_counts.fetch(response_code, 0) + error_count
  end

  # Renders the tally as one '|'-delimited table row:
  # group | successes | errors (code:count, ...) | error rate in percent.
  def to_s
    breakdown = ''
    unless @error_counts.empty?
      pairs = @error_counts.map { |code, count| "#{code}:#{count}" }
      breakdown = " (#{pairs.join(', ')})".gsub(', )', ')')
    end

    rate =
      if @success_count == 0 && @error_count == 0
        0.0 # nothing was checked at all
      elsif @success_count == 0
        100.0
      else
        @error_count.to_f / (@success_count.to_f + @error_count.to_f) * 100.0
      end

    columns = [
      @group_name.to_s.ljust(70),
      @success_count.to_s.rjust(5),
      @error_count.to_s.rjust(5) + breakdown.to_s.ljust(35),
      "#{format('%.2f', rate)}%"
    ]
    "|#{columns.join('|')}|"
  end
end
# Fetches the URL of the given entry and records the outcome in the
# statistics of the entry's group.
#
# url_entry - object responding to #url and #group_name (a SitemapUrl)
#
# A request that cannot be performed at all (malformed URL, DNS failure,
# timeout, ...) is reported on the console and counted as an error with
# the pseudo response code '???'. Every other response is counted as a
# success when its code equals STATUS_OK and as an error otherwise.
# All updates of the shared state and all console output for a response
# happen while holding @mutex, because this method runs on several threads.
def check_url(url_entry)
  response = nil
  begin
    # URI() is inside the begin block so that a malformed <loc> value is
    # counted as a failed check instead of terminating the worker thread.
    response = Net::HTTP.get_response(URI(url_entry.url))
    # retry getting the response
    #3.times do
    # if ['500', '502'].include?(response.code)
    # sleep(2)
    # response = Net::HTTP.get_response(uri)
    # end
    #end
  rescue StandardError => e
    # StandardError (not Exception) so that Ctrl-C and exit still work.
    @mutex.synchronize do
      puts "ERROR: Cannot check URL [#{url_entry.url}] due to the following error: #{e.message}"
      result = (@result_group[url_entry.group_name] ||= Result.new(url_entry.group_name))
      result.update(0, 1, '???')
    end
  end
  return unless response

  # generate/write statistics
  @mutex.synchronize do
    @counter += 1
    puts "#{@counter.to_s.rjust(5)} [#{response.code}] #{url_entry.url}"
    ok = response.code == STATUS_OK
    result = (@result_group[url_entry.group_name] ||= Result.new(url_entry.group_name))
    result.update(ok ? 1 : 0, ok ? 0 : 1, response.code)
    unless ok
      # URI.encode was removed in Ruby 3.0; the RFC 2396 parser provides the
      # same escaping. The Location header is empty unless this is a redirect.
      location = URI::RFC2396_Parser.new.escape(response['location'].to_s)
      puts "#{url_entry.url} [#{response.code}] #{location}"
    end
  end
end
# Reads a sitemap file line by line and pushes one SitemapUrl onto the
# queue for every <loc>...</loc> element found.
#
# file_path - path of the sitemap; a name ending in '.gz' is read through gzip
# queue     - object responding to #push that receives the SitemapUrl entries
#
# The group name of the entries is the file's base name with everything from
# the first '_<digits>' on removed (e.g. 'dir/products_12.xml.gz' => 'products').
# Returns nil.
def push_urls_to_queue(file_path, queue)
  # Derive the group from the base name only, so that digits in a directory
  # name (e.g. 'maps_2017/products_1.xml') cannot truncate the group name.
  group = File.basename(file_path).sub(/_\d+.*/, '')

  # Block form guarantees that the file is closed even when reading fails.
  File.open(file_path, 'rb') do |file|
    gzip = file_path.end_with?('.gz') ? Zlib::GzipReader.new(file) : nil
    begin
      (gzip || file).each_line do |line|
        line.scan(/<loc>(.+?)<\/loc>/) do |match|
          queue.push(SitemapUrl.new(match.first.to_s, group)) # if rand(500) == 0 # add urls randomly
        end
      end
    ensure
      # finish releases the gzip stream without closing the underlying file,
      # which File.open closes itself.
      gzip.finish if gzip
    end
  end
  nil
end
# Loads the URLs of every sitemap file matching the glob given as the first
# command line argument (default: all *.xml and *.xml.gz files in the current
# directory), checks them on THREAD_NUMBER worker threads and finally prints
# one summary row per group.
def main
  pattern = ARGV[0] || '*.{xml,xml.gz}'
  Dir.glob(pattern).each do |sitemap|
    puts "Reading URLs from sitemap XML [#{sitemap}]"
    push_urls_to_queue(sitemap, @queue)
  end
  puts "Number of urls to check: [#{@queue.size}]"

  workers = Array.new(THREAD_NUMBER) do
    Thread.new do
      until @queue.empty?
        # Non-blocking pop: another worker may have emptied the queue after
        # the check above, in which case pop raises and the entry is nil.
        entry = begin
          @queue.pop(true)
        rescue StandardError
          nil
        end
        check_url(entry) if entry
      end
    end
  end
  workers.each(&:join)

  puts
  puts "|#{'Group'.ljust(70)}|#{'OK'.center(5)}|#{'Error'.center(40)}|#{'Error rate'}|"
  @result_group.each_value { |result| puts result.to_s }
end
# Script entry point: runs the checker as soon as this file is executed.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment