Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
RESOLVE SHORT URLS before storing. Short URLs are for microblogging; you should never actually keep them around.
require 'net/http'
# WARNING: do not use this; it works but is very limited.
#
# Naive resolver: issues a single HTTP GET for the URL and, when the server
# answers with status 301, hands back the value of its Location header.
#
# url - String or URI to look up
#
# Returns the Location header value when the response code is '301',
# otherwise the given url converted with to_s.
def resolve url
  response = Net::HTTP.get_response(URI(url))
  return response['location'] if response.code == '301'

  url.to_s
end
# Why the above method sucks:
# - doesn't handle multiple redirects
# - uses HTTP GET instead of HEAD (slower, wasted bandwidth)
# - no HTTP error and Ruby exception handling
# - no HTTPS support
# - no strict timeouts (lookups can block for too long)
# Author: Mislav Marohnić
# License: MIT http://mislav.mit-license.org
require 'uri'
require 'net/https'
# Public: Service that resolves URLs to their final destination by issuing
# HEAD requests and following HTTP redirects (301, 302, 303, 307 and 308).
#
# Examples
#
#   res = UrlResolver.resolve(url)
#
#   if res.dead?
#     abort "dead link"
#   elsif res.failed?
#     abort res.failed_reason
#   elsif res.changed?
#     puts "-> #{res.final_url} (#{res.num_redirects} redirects)"
#   else
#     warn "URL is direct"
#     p res.response_code
#     p res.response_headers
#   end
class UrlResolver
  # Internal: HTTP status codes that are followed as redirects. Other 3xx
  # codes (e.g. 300, 304) are treated as a final response.
  REDIRECT_CODES = [301, 302, 303, 307, 308].freeze

  # Public: Resolve a URL
  #
  # url          - String or URI
  # http_adapter - a HTTP adapter to make requests with
  #                (default: HttpAdapter.new)
  #
  # Returns a Resolution.
  def self.resolve(url, http_adapter = HttpAdapter.new)
    new(url, http_adapter).resolve
  end

  attr_reader :url, :http_adapter

  # Public: Create a resolver for one URL.
  #
  # url          - String or URI; Strings are parsed with URI(), which may
  #                raise here (outside of #resolve) for malformed input.
  # http_adapter - object responding to get_connection, create_request and
  #                perform_request (see HttpAdapter).
  def initialize(url, http_adapter)
    @url = normalize_url(url)
    @http_adapter = http_adapter
  end

  # Public: Perform URL resolution
  #
  # limit - Integer representing the maximum number of redirects
  #         (default: 5)
  #
  # All exceptions are caught and available as Resolution#error.
  #
  # Returns a Resolution.
  def resolve(limit = 5)
    resolution = Resolution.new(url)
    begin
      resolve_url(url, limit) do |new_url, response|
        resolution.final_url = new_url
        resolution.response = response
        resolution.num_requests += 1
      end
    rescue => error
      # Keep the last response only when the error itself carries one
      # (HttpError, TooManyRedirects); otherwise the stored response belongs
      # to an earlier hop and would be misleading.
      resolution.response = nil unless error.respond_to?(:response)
      resolution.error = error
    end
    resolution
  end

  # Public: The result of a URL resolution.
  class Resolution
    attr_reader :original_url
    attr_accessor :final_url, :num_requests, :response, :error

    # url - the (normalized) URL the resolution started from.
    def initialize(url)
      @final_url = @original_url = url
      @num_requests = 0
      @response = @error = nil
    end

    # Public: Number of redirects followed. Never negative, even when the
    # very first request failed before any response was recorded.
    def num_redirects
      [num_requests - 1, 0].max
    end

    # Public: Integer status code of the last response, or 500 when there is
    # no response (e.g. the request raised).
    def response_code
      response ? response.code.to_i : 500
    end

    # Public: Headers of the last response as a Hash; empty without one.
    def response_headers
      response ? response.to_hash : {}
    end

    # Public: true if an error was recorded during resolution.
    def failed?
      !error.nil?
    end

    # Public: Message of the recorded error, or nil if resolution succeeded.
    def failed_reason
      error && error.message
    end

    # Public: true if the final URL differs from the original one.
    def changed?
      original_url != final_url
    end

    # Public: true if resolution ended in an error that reports not_found?
    # (HttpError does so for 404 and 410).
    def dead?
      error.respond_to?(:not_found?) && error.not_found?
    end
  end

  # Raised when a redirect is received after the redirect limit is used up.
  class TooManyRedirects < StandardError
    attr_reader :response

    def initialize(msg, response)
      super(msg)
      @response = response
    end
  end

  # Raised for 4xx/5xx responses and for redirects lacking a Location header.
  class HttpError < StandardError
    attr_reader :request_url, :response

    def initialize(msg, request_url, response)
      super(msg)
      @request_url, @response = request_url, response
    end

    def not_found?
      response_code == 404 || response_code == 410
    end

    def response_code
      response.code.to_i
    end
  end

  # Internal: Return url unchanged if it already looks like a URI object
  # (responds to host); otherwise parse its String form with URI().
  def normalize_url(url)
    url.respond_to?(:host) ? url : URI(url.to_s)
  end

  # Internal: Request url and follow redirects recursively.
  #
  # url     - URI to request
  # limit   - Integer number of redirects that may still be followed
  # referer - URI of the previous hop, sent as the Referer header (optional)
  # block   - called with (url, response) after every request, before the
  #           response is inspected
  #
  # Returns the final URI.
  # Raises HttpError on a 4xx/5xx response or a redirect without Location.
  # Raises TooManyRedirects if a redirect arrives when limit is below 1.
  def resolve_url(url, limit, referer = nil, &block)
    response = request(url, referer)
    yield url, response if block_given?

    code = response.code.to_i
    if REDIRECT_CODES.include?(code)
      raise TooManyRedirects.new("redirect limit exceeded", response) if limit < 1
      resolve_url(redirect_target(url, response), limit - 1, url, &block)
    elsif code >= 400 && code < 600
      raise HttpError.new(
        "server returned #{response.code} #{response.message}",
        url, response)
    else
      url
    end
  end

  # Internal: Work out where a redirect response points to.
  #
  # url      - URI that was requested
  # response - the redirect response
  #
  # Returns a URI. A relative Location is resolved against url, an absolute
  # one replaces it.
  # Raises HttpError if the Location header is missing or blank.
  def redirect_target(url, response)
    location = response['location'].to_s.strip
    if location.empty?
      raise HttpError.new(
        "server returned #{response.code} #{response.message} without a Location header",
        url, response)
    end
    url + location
  end

  # Internal: Perform a single request for url through the HTTP adapter.
  #
  # url     - URI to request
  # referer - URI to send as the Referer header (optional)
  #
  # Returns whatever the adapter's perform_request returns (the response).
  def request(url, referer = nil)
    connection = http_adapter.get_connection(url)
    headers = referer ? {'referer' => referer.to_s} : {}
    request = http_adapter.create_request(url, headers)
    http_adapter.perform_request(connection, request)
  end

  # Internal: HTTP adapter for Net::HTTP to use for URL resolution.
  class HttpAdapter
    # Seconds to wait for the connection to open / for data to be read.
    OPEN_TIMEOUT = 1.5
    READ_TIMEOUT = 2

    # Build an unopened Net::HTTP connection for url's host and port, with
    # peer-verified SSL for https URLs and strict timeouts.
    def get_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      http.use_ssl = (url.scheme == 'https')
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = cert_store
      end
      http.open_timeout = OPEN_TIMEOUT
      http.read_timeout = READ_TIMEOUT
      http
    end

    # Certificate store loaded from the system default paths.
    def cert_store
      store = OpenSSL::X509::Store.new
      store.set_default_paths
      store
    end

    # Build a HEAD request for url's path and query with the given headers.
    def create_request(url, headers = {})
      Net::HTTP::Head.new(url.request_uri, headers)
    end

    # Open the connection, send the request and return the response.
    def perform_request(connection, request)
      connection.start do |http|
        http.request(request)
      end
    end
  end
end
### END implementation; begin tests ###
# Self-tests: executed only when this file is run directly. No network access
# takes place because TestHttpAdapter replaces the request step with canned
# responses.
if $0 == __FILE__
  require 'test/unit'

  class UrlResolverTest < Test::Unit::TestCase
    def setup
      @http = TestHttpAdapter.new
    end

    # Resolve url through the stubbed adapter.
    def resolve url
      UrlResolver.new(url, @http).resolve
    end

    # HTTP adapter whose perform_request answers from a queue of
    # expectations instead of touching the network.
    class TestHttpAdapter < UrlResolver::HttpAdapter
      def initialize
        super
        @expectations = []
      end

      # Queue a handler for an upcoming request.
      #
      # expectation - callable taking (connection, request) and returning a
      #               response; may be given as a block instead.
      #
      # The block is captured explicitly: calling Proc.new without a block
      # to grab the caller's block is an error on Ruby 3.0 and later.
      #
      # Raises ArgumentError if neither a callable nor a block is given.
      def expect_request expectation = nil, &block
        handler = expectation || block
        raise ArgumentError, "expectation or block required" unless handler
        @expectations << handler
      end

      # Answer with the oldest queued expectation. The last remaining
      # expectation is kept and reused for all further requests.
      def perform_request connection, request
        response = @expectations.first.call(connection, request)
        @expectations.shift if @expectations.size > 1
        response
      end
    end

    # Minimal stand-in for a Net::HTTP response.
    class MockResponse < Struct.new(:code, :message, :headers)
      def [](name) headers[name] end
      alias to_hash headers
    end

    # Build a MockResponse; 4xx codes get the message 'Not Found', all
    # others 'OK'.
    def mock_response code, headers = {}
      message = (400...500) === code ? 'Not Found' : 'OK'
      MockResponse.new(code.to_s, message, headers)
    end

    def test_direct_url
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert !http.use_ssl?
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://disney.com/pluto'
      assert !resolution.failed?, "expected not to have failed"
      assert !resolution.changed?, "expected not to have changed"
      assert resolution.final_url.respond_to?(:host)
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 0, resolution.num_redirects
    end

    def test_failed_resolve
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com'
      assert resolution.failed?
      assert_equal "boom!", resolution.failed_reason
    end

    def test_endless_redirect
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert_equal "redirect limit exceeded", resolution.failed_reason
      assert_equal 5, resolution.num_redirects
      assert_equal 'http://disney.com', resolution.final_url.to_s
    end

    def test_normal_redirect
      @http.expect_request do |http, request|
        assert_equal 't.co', http.address
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_redirect_to_dead_url
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        mock_response 404
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert resolution.dead?, "expected to be dead"
      assert_equal "server returned 404 Not Found", resolution.failed_reason
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_multiple_redirects
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal '/pluto', request.path
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        assert_equal '/', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com', resolution.final_url.to_s
      assert_equal 2, resolution.num_redirects
    end

    def test_ssl
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'https://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert http.use_ssl?, "expected to use SSL"
        assert_equal 443, http.port
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'https://disney.com/pluto', resolution.final_url.to_s
    end

    def test_referer
      @http.expect_request do |http, request|
        assert_nil request['referer'], "expected blank referer"
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'http://t.co/short', request['referer']
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
    end

    def test_response
      @http.expect_request do |http, request|
        mock_response 200, 'ETag' => 'hi!'
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '200', resolution.response.code
      assert_equal 200, resolution.response_code
      assert_equal({'ETag' => 'hi!'}, resolution.response_headers)
    end

    def test_failed_response
      @http.expect_request do |http, request|
        mock_response 503
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '503', resolution.response.code
      assert_equal 503, resolution.response_code
    end

    def test_exception_response
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal 500, resolution.response_code
      assert_equal({}, resolution.response_headers)
      assert_nil resolution.response
    end
  end
end
@datenimperator

Should it just track redirects w/ code 301, or maybe 302, 303, 307, 308 also? There are so many misconfigured web servers sending false redirect codes…

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.