Skip to content

Instantly share code, notes, and snippets.

@timh
Last active December 20, 2015 04:49
Show Gist options
  • Save timh/6073979 to your computer and use it in GitHub Desktop.
Save timh/6073979 to your computer and use it in GitHub Desktop.
Use child processes to walk a list of URLs
#!/usr/bin/env ruby
require 'uri'
require 'net/https'
# Number of child processes forked by the script; the input lines are split
# between them by line number modulo this value.
NUM_PROCESSES = 10
# Upper bound on the number of redirects visit_url follows for a single URL
# before it reports "maximum redirects exceeded".
MAX_REDIRECTS = 5
# Issues a GET for +url+, following redirects up to MAX_REDIRECTS times, and
# prints a one-line report to stdout prefixed with the URL originally given.
#
# * A final 200 prints the code, plus the redirect count and final URL when
#   any redirects were followed.
# * Any other final code prints the code in red followed by the response body.
# * Reaching MAX_REDIRECTS prints a red "maximum redirects exceeded" line.
# * A redirect response without a Location header prints a red error line.
#
# Network and URI parsing errors are not rescued here; they propagate to the
# caller.
#
# @param url [String] absolute http or https URL to check
# @return [nil]
def visit_url(url)
  original_url = url
  num_redirects = 0
  redirect_codes = [301, 302, 303, 307, 308]

  while num_redirects < MAX_REDIRECTS
    uri = URI(url)
    # TLS has to be selected before the session starts: Net::HTTP#use_ssl=
    # raises IOError when the value is changed on a started session. Port 443
    # is treated as TLS even when the scheme is not https.
    use_ssl = uri.scheme == "https" || uri.port == 443
    response = Net::HTTP.start(uri.host, uri.port, :use_ssl => use_ssl) do |http|
      http.request(Net::HTTP::Get.new(uri.request_uri))
    end
    # net/http returns the status code as a string.
    code = response.code.to_i

    if redirect_codes.include?(code)
      location = response['Location']
      if location.nil? || location.empty?
        puts "#{original_url}: \033[1;31m#{response.code} redirect without Location header\033[0m"
        return
      end
      num_redirects += 1
      # Location may be relative; resolve it against the URL just requested.
      url = URI.join(url, location).to_s
      if num_redirects == MAX_REDIRECTS
        puts "#{original_url}: \033[1;31mmaximum redirects exceeded: last one was #{url}\033[0m"
      end
    else
      redirects_str = (num_redirects > 0) ? " (#{num_redirects} redirects -> #{url})" : ""
      if code != 200
        puts "#{original_url}: \033[1;31m#{response.code}\033[0m#{redirects_str}"
        puts response.body
      else
        puts "#{original_url}: #{response.code}#{redirects_str}"
      end
      return
    end
  end
end
# Visits this worker's share of the URL list: every line whose zero-based
# position modulo NUM_PROCESSES equals +process_index+.
#
# Blank lines are skipped (they still count towards line positions, so the
# split between workers does not shift). A failure while visiting one URL is
# reported on stdout and does not stop the remaining URLs from being visited.
#
# @param process_index [Integer] index of this worker, 0...NUM_PROCESSES
# @param lines [Array<String>] all input lines, one URL per line
# @return [Array<String>] +lines+, unchanged
def process_file(process_index, lines)
  lines.each_with_index do |line, line_no|
    next unless line_no % NUM_PROCESSES == process_index

    url = line.strip
    next if url.empty?

    begin
      visit_url(url)
    rescue StandardError => e
      # Keep going: one bad URL must not drop the rest of this worker's share.
      puts "#{url}: \033[1;31merror: #{e.class}: #{e.message}\033[0m"
    end
  end
end
# Script entry point. Reads the URL list (one per line) from the file named by
# the first command-line argument, or from standard input when no argument is
# given, then forks NUM_PROCESSES children that each visit their share of the
# lines and waits for all of them to exit.
#
# IO#lines no longer exists in Ruby 3.0+, so the input is read with readlines;
# File.readlines also closes the file once it has been read.
lines = if ARGV.empty?
  STDIN.readlines
else
  File.readlines(ARGV[0])
end

# Write each report as soon as it is produced instead of holding it in the
# child's output buffer until the child exits.
$stdout.sync = true

# The block form of fork runs the block in the child, which then exits; the
# parent receives the child's pid.
child_pids = Array.new(NUM_PROCESSES) do |index|
  fork { process_file(index, lines) }
end

child_pids.each { |pid| Process.wait(pid) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment