|
# Author: Mislav Marohnić
# License: MIT http://mislav.mit-license.org
|
|
|
require 'uri' |
|
require 'net/https' |
|
|
|
# Public: Service that resolves URLs to their final destination.
#
# Examples
#
#   res = UrlResolver::resolve(url)
#
#   if res.dead?
#     abort "dead link"
#   elsif res.failed?
#     abort res.failed_reason
#   elsif res.changed?
#     puts "-> #{res.final_url} (#{res.num_redirects} redirects)"
#   else
#     warn "URL is direct"
#     p res.response_code
#     p res.response_headers
#   end
|
class UrlResolver
  # Internal: HTTP status codes that are followed as redirects.
  REDIRECT_CODES = [301, 302, 303, 307, 308].freeze

  # Public: Resolve a URL
  #
  # url          - String or URI
  # http_adapter - a HTTP adapter to make requests with
  #                (default: HttpAdapter.new)
  #
  # Returns a Resolution.
  def self.resolve url, http_adapter = HttpAdapter.new
    new(url, http_adapter).resolve
  end

  attr_reader :url, :http_adapter

  # Public: Create a resolver for a single URL.
  #
  # url          - String or URI-like object (anything responding to `host`)
  # http_adapter - object responding to `get_connection`, `create_request`
  #                and `perform_request` (see HttpAdapter)
  def initialize url, http_adapter
    @url = normalize_url(url)
    @http_adapter = http_adapter
  end

  # Public: Perform URL resolution
  #
  # limit - Integer representing the maximum number of redirects
  #         (default: 5)
  #
  # Every StandardError raised while resolving is caught and made available
  # as Resolution#error.
  #
  # Returns a Resolution.
  def resolve limit = 5
    resolution = Resolution.new url
    begin
      resolve_url(url, limit) do |new_url, response|
        resolution.final_url = new_url
        resolution.response = response
        resolution.num_requests += 1
      end
    rescue => error
      # Keep the last recorded response only when the error itself carries
      # one (HttpError, TooManyRedirects); for any other failure the recorded
      # response belongs to an earlier hop and would be misleading.
      resolution.response = nil unless error.respond_to? :response
      resolution.error = error
    end
    resolution
  end

  # Public: The result of a URL resolution.
  class Resolution
    attr_reader :original_url
    attr_accessor :final_url, :num_requests, :response, :error

    # url - the URL the resolution started from; also the initial final_url.
    def initialize url
      @final_url = @original_url = url
      @num_requests = 0
      @response = @error = nil
    end

    # Public: Number of redirects that were followed.
    #
    # Never negative: when the very first request raised before producing a
    # response, no request was recorded and the result is 0.
    def num_redirects
      [num_requests - 1, 0].max
    end

    # Public: Integer status code of the last response, or 500 when there is
    # no response.
    def response_code
      response ? response.code.to_i : 500
    end

    # Public: Headers of the last response as a Hash; empty when there is no
    # response.
    def response_headers
      response ? response.to_hash : {}
    end

    # Public: true if resolution ended with an error, false otherwise.
    def failed?
      !error.nil?
    end

    # Public: The error message, or nil when resolution did not fail.
    def failed_reason
      error && error.message
    end

    # Public: true if the final URL differs from the original one.
    def changed?
      original_url != final_url
    end

    # Public: true if the error reports the target as not found
    # (see HttpError#not_found?), false otherwise.
    def dead?
      error.respond_to?(:not_found?) && error.not_found?
    end
  end

  # Public: Raised when a redirect is received after the redirect limit has
  # been used up. Carries the last redirect response.
  class TooManyRedirects < StandardError
    attr_reader :response

    def initialize(msg, response)
      super(msg)
      @response = response
    end
  end

  # Public: Raised for 4xx/5xx responses and for redirect responses that
  # lack a Location header. Carries the requested URL and the response.
  class HttpError < StandardError
    attr_reader :request_url, :response

    def initialize(msg, request_url, response)
      super(msg)
      @request_url, @response = request_url, response
    end

    # Public: true when the response status is 404 or 410.
    def not_found?
      response_code == 404 || response_code == 410
    end

    # Public: Integer status code of the response.
    def response_code
      response.code.to_i
    end
  end

  # Internal: Coerce a value to a URI-like object.
  #
  # url - object responding to `host` (returned as is) or anything whose
  #       `to_s` can be parsed by Kernel#URI
  #
  # Returns the URI-like object.
  def normalize_url url
    url.respond_to?(:host) ? url : URI(url.to_s)
  end

  # Internal: Request a URL and follow redirects recursively.
  #
  # url     - URI-like object to request
  # limit   - Integer number of redirects that may still be followed
  # referer - URL that redirected here, sent as the Referer header
  #           (default: nil)
  # block   - optional; called with (url, response) after every request,
  #           before the response status is inspected
  #
  # Returns the final URL.
  # Raises HttpError for 4xx/5xx responses or a redirect without Location.
  # Raises TooManyRedirects when a redirect arrives and limit is below 1.
  def resolve_url url, limit, referer = nil, &block
    response = request url, referer
    yield url, response if block_given?

    case response.code.to_i
    when 400...600
      raise HttpError.new(
        "server returned #{response.code} #{response.message}",
        url, response)
    when *REDIRECT_CODES
      raise TooManyRedirects.new("redirect limit exceeded", response) if limit < 1
      resolve_url(redirect_target(url, response), limit - 1, url, &block)
    else
      url
    end
  end

  # Internal: Perform a single request through the HTTP adapter.
  #
  # url     - URI-like object to request
  # referer - value for the Referer header; omitted when nil (default: nil)
  #
  # Returns whatever the adapter's `perform_request` returns.
  def request url, referer = nil
    connection = http_adapter.get_connection(url)
    headers = referer ? {'referer' => referer.to_s} : {}
    request = http_adapter.create_request(url, headers)
    http_adapter.perform_request(connection, request)
  end

  # Internal: Work out where a redirect response points to.
  #
  # The Location value is resolved against the URL that was just requested,
  # so relative values such as "/path" produce a complete URL.
  #
  # Returns a URI.
  # Raises HttpError when the Location header is missing or blank.
  def redirect_target url, response
    location = response['location'].to_s.strip
    if location.empty?
      raise HttpError.new(
        "server returned #{response.code} without a Location header",
        url, response)
    end
    URI.join(url.to_s, location)
  end
  private :redirect_target

  # Internal: HTTP adapter for Net::HTTP to use for URL resolution.
  class HttpAdapter
    # Build an unopened Net::HTTP connection for the URL's host and port.
    # Peer verification against the default certificate store is enabled
    # when the scheme is "https".
    def get_connection url
      http = Net::HTTP.new url.host, url.port
      http.use_ssl = url.scheme == 'https'
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = cert_store
      end
      http.open_timeout = 1.5
      http.read_timeout = 2
      http
    end

    # Returns a new OpenSSL::X509::Store loaded with the default paths.
    def cert_store
      store = OpenSSL::X509::Store.new
      store.set_default_paths
      store
    end

    # Build a HEAD request for the URL's request URI with the given headers.
    def create_request url, headers = {}
      Net::HTTP::Head.new url.request_uri, headers
    end

    # Start the connection, send the request and return the response.
    def perform_request connection, request
      connection.start do |http|
        http.request request
      end
    end
  end
end
|
|
|
### END implementation; begin tests ### |
|
|
|
if $0 == __FILE__
  require 'test/unit'

  # Tests for UrlResolver. They run only when this file is executed directly
  # and never touch the network: requests are answered by TestHttpAdapter.
  class UrlResolverTest < Test::Unit::TestCase
    def setup
      @http = TestHttpAdapter.new
    end

    # Resolve a URL through the test adapter.
    def resolve url
      UrlResolver.new(url, @http).resolve
    end

    # HTTP adapter whose perform_request is answered by queued handlers
    # instead of a real connection.
    class TestHttpAdapter < UrlResolver::HttpAdapter
      def initialize
        super
        @expectations = []
      end

      # Queue a handler for an upcoming request.
      #
      # expectation - callable taking (connection, request) and returning the
      #               response (default: the given block)
      #
      # The block is captured explicitly; a bare `Proc.new` no longer picks
      # up the method's block on Ruby 3.0 and later.
      def expect_request expectation = nil, &block
        handler = expectation || block
        raise ArgumentError, "expect_request needs a callable or a block" unless handler
        @expectations << handler
      end

      # Answer the request with the oldest queued handler. The last handler
      # is never removed, so it keeps answering any further requests.
      def perform_request connection, request
        raise "unexpected request: no expectation registered" if @expectations.empty?
        response = @expectations.first.call(connection, request)
        @expectations.shift if @expectations.size > 1
        response
      end
    end

    # Minimal stand-in for a response: string code, message and a headers
    # Hash that is also returned by to_hash.
    MockResponse = Struct.new(:code, :message, :headers) do
      def [](name) headers[name] end
      alias to_hash headers
    end

    # Build a MockResponse; 4xx codes get the message 'Not Found', all
    # others 'OK'.
    def mock_response code, headers = {}
      message = (400...500).cover?(code) ? 'Not Found' : 'OK'
      MockResponse.new(code.to_s, message, headers)
    end

    def test_direct_url
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert !http.use_ssl?
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://disney.com/pluto'
      assert !resolution.failed?, "expected not to have failed"
      assert !resolution.changed?, "expected not to have changed"
      assert resolution.final_url.respond_to?(:host)
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 0, resolution.num_redirects
    end

    def test_failed_resolve
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com'
      assert resolution.failed?
      assert_equal "boom!", resolution.failed_reason
    end

    def test_endless_redirect
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert_equal "redirect limit exceeded", resolution.failed_reason
      assert_equal 5, resolution.num_redirects
      assert_equal 'http://disney.com', resolution.final_url.to_s
    end

    def test_normal_redirect
      @http.expect_request do |http, request|
        assert_equal 't.co', http.address
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_redirect_to_dead_url
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        mock_response 404
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert resolution.dead?, "expected to be dead"
      assert_equal "server returned 404 Not Found", resolution.failed_reason
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_multiple_redirects
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal '/pluto', request.path
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        assert_equal '/', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com', resolution.final_url.to_s
      assert_equal 2, resolution.num_redirects
    end

    def test_ssl
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'https://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert http.use_ssl?, "expected to use SSL"
        assert_equal 443, http.port
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'https://disney.com/pluto', resolution.final_url.to_s
    end

    def test_referer
      @http.expect_request do |http, request|
        assert_nil request['referer'], "expected blank referer"
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'http://t.co/short', request['referer']
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
    end

    def test_response
      @http.expect_request do |http, request|
        mock_response 200, 'ETag' => 'hi!'
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '200', resolution.response.code
      assert_equal 200, resolution.response_code
      assert_equal({'ETag' => 'hi!'}, resolution.response_headers)
    end

    def test_failed_response
      @http.expect_request do |http, request|
        mock_response 503
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '503', resolution.response.code
      assert_equal 503, resolution.response_code
    end

    def test_exception_response
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal 500, resolution.response_code
      assert_equal({}, resolution.response_headers)
      assert_nil resolution.response
    end
  end
end
# [This comment has been minimized.]
# Reader comment: Should the resolver follow only redirects with status code
# 301, or also 302, 303, 307, and 308? Many misconfigured web servers send
# the wrong redirect status code.