public
Last active

RESOLVE SHORT URLS before storing. Short URLs are for microblogging; you should never actually keep them around.

  • Download Gist
easy_way.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
require 'net/http'
 
# WARNING do not use this; it works but is very limited
# Resolve a URL by following at most one permanent redirect.
#
# url - String or URI to look up.
#
# Performs a single HTTP GET. When the response status is 301 the value of
# its "location" header is returned; for any other status the original URL
# is returned as a String.
def resolve(url)
  response = Net::HTTP.get_response(URI(url))
  return url.to_s unless response.code == '301'

  response['location']
end
 
# Why the above method sucks:
# - doesn't handle multiple redirects
# - uses HTTP GET instead of HEAD (slower, wasted bandwidth)
# - no HTTP error and Ruby exception handling
# - no HTTPS support
# - no strict timeouts (lookups can block for too long)
proper_way.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
# Author: Mislav Marohnić
# License: MIT http://mislav.mit-license.org
 
require 'uri'
require 'net/https'
 
# Public: Service that resolves URLs to their final destination.
#
# Examples
#
# res = UrlResolver::resolve(url)
#
# if res.dead?
# abort "dead link"
# elsif res.failed?
# abort res.failed_reason
# elsif res.changed?
# puts "-> #{res.final_url} (#{res.num_redirects} redirects)"
# else
# warn "URL is direct"
# p res.response_code
# p res.response_headers
# end
class UrlResolver
  # Internal: Maximum number of redirects followed when no limit is given.
  DEFAULT_REDIRECT_LIMIT = 5

  # Internal: HTTP status codes that are followed as redirects. URL
  # shorteners commonly answer with 302/307 as well as 301, so all
  # Location-bearing redirect statuses are followed.
  REDIRECT_CODES = [301, 302, 303, 307, 308].freeze

  # Public: Resolve a URL
  #
  # url          - String or URI
  # http_adapter - an HTTP adapter to make requests with
  #                (default: HttpAdapter.new)
  #
  # Returns a Resolution.
  def self.resolve(url, http_adapter = HttpAdapter.new)
    new(url, http_adapter).resolve
  end

  attr_reader :url, :http_adapter

  # Public: Set up a resolver for a single URL.
  #
  # url          - String or URI (anything responding to #host is used as-is)
  # http_adapter - object responding to get_connection, create_request and
  #                perform_request
  def initialize(url, http_adapter)
    @url = normalize_url(url)
    @http_adapter = http_adapter
  end

  # Public: Perform URL resolution
  #
  # limit - Integer representing the maximum number of redirects
  #         (default: DEFAULT_REDIRECT_LIMIT)
  #
  # All StandardError exceptions raised while requesting are caught and
  # available as Resolution#error.
  #
  # Returns a Resolution.
  def resolve(limit = DEFAULT_REDIRECT_LIMIT)
    resolution = Resolution.new(url)
    begin
      resolve_url(url, limit) do |new_url, response|
        resolution.final_url = new_url
        resolution.response = response
        resolution.num_requests += 1
      end
    rescue => error
      # An error that carries no response (e.g. a timeout) means the last
      # recorded response does not belong to the URL that failed.
      resolution.response = nil unless error.respond_to?(:response)
      resolution.error = error
    end
    resolution
  end

  # Public: The result of a URL resolution.
  class Resolution
    attr_reader :original_url
    attr_accessor :final_url, :num_requests, :response, :error

    def initialize(url)
      @final_url = @original_url = url
      @num_requests = 0
      @response = @error = nil
    end

    # Public: Number of redirects followed. Never negative, even when the
    # very first request raised before a response was recorded.
    def num_redirects
      [num_requests - 1, 0].max
    end

    # Public: Integer status code of the last response, or 500 when there
    # is no response.
    def response_code
      response ? response.code.to_i : 500
    end

    # Public: Hash of headers of the last response, or an empty Hash when
    # there is no response.
    def response_headers
      response ? response.to_hash : {}
    end

    # Public: true if resolution ended with an error, false otherwise.
    def failed?
      !error.nil?
    end

    # Public: The error message, or nil if resolution did not fail.
    def failed_reason
      error && error.message
    end

    # Public: true if the final URL differs from the original one.
    def changed?
      original_url != final_url
    end

    # Public: true if the error reports the target as not found
    # (see HttpError#not_found?).
    def dead?
      error.respond_to?(:not_found?) && error.not_found?
    end
  end

  # Public: Raised when the redirect limit is exhausted. Carries the last
  # redirect response.
  class TooManyRedirects < StandardError
    attr_reader :response

    def initialize(msg, response)
      super(msg)
      @response = response
    end
  end

  # Public: Raised for 4xx/5xx responses and for redirects that lack a
  # Location header.
  class HttpError < StandardError
    attr_reader :request_url, :response

    def initialize(msg, request_url, response)
      super(msg)
      @request_url = request_url
      @response = response
    end

    # Public: true for 404 Not Found and 410 Gone.
    def not_found?
      response_code == 404 || response_code == 410
    end

    def response_code
      response.code.to_i
    end
  end

  # Internal: Return url unchanged if it responds to #host, otherwise parse
  # its String form into a URI.
  def normalize_url(url)
    url.respond_to?(:host) ? url : URI(url.to_s)
  end

  # Internal: Request url and keep following redirects.
  #
  # url     - URI to request
  # limit   - Integer number of redirects that may still be followed
  # referer - URI sent as the Referer header of the first request
  #           (default: nil)
  #
  # Yields each requested URL together with its response, before the
  # response status is inspected.
  #
  # Returns the final URI.
  # Raises HttpError on a 4xx/5xx response or a redirect without Location.
  # Raises TooManyRedirects when a redirect arrives and limit is below 1.
  def resolve_url(url, limit, referer = nil)
    loop do
      response = request(url, referer)
      yield url, response if block_given?

      code = response.code.to_i
      if (400...600).cover?(code)
        raise HttpError.new(
          "server returned #{response.code} #{response.message}",
          url, response)
      end
      return url unless REDIRECT_CODES.include?(code)

      raise TooManyRedirects.new("redirect limit exceeded", response) if limit < 1
      # The URL just requested becomes the referer of the next hop.
      referer, url = url, redirect_target(url, response)
      limit -= 1
    end
  end

  # Internal: Work out the absolute URI a redirect response points to.
  #
  # base     - URI that was requested
  # response - redirect response; its 'location' header is read
  #
  # Relative and scheme-relative Location values are resolved against base.
  #
  # Returns a URI.
  # Raises HttpError when the Location header is missing or blank.
  def redirect_target(base, response)
    location = response['location'].to_s.strip
    if location.empty?
      raise HttpError.new(
        "server returned #{response.code} without a Location header",
        base, response)
    end
    URI.join(base.to_s, location)
  end

  # Internal: Perform a single request for url through the HTTP adapter.
  #
  # url     - URI to request
  # referer - URI to send as the Referer header, if any (default: nil)
  #
  # Returns whatever the adapter's perform_request returns.
  def request(url, referer = nil)
    connection = http_adapter.get_connection(url)
    headers = referer ? {'referer' => referer.to_s} : {}
    http_request = http_adapter.create_request(url, headers)
    http_adapter.perform_request(connection, http_request)
  end

  # Internal: HTTP adapter for Net::HTTP to use for URL resolution.
  class HttpAdapter
    # Seconds to wait for the connection to open.
    OPEN_TIMEOUT = 1.5
    # Seconds to wait for data to be read.
    READ_TIMEOUT = 2

    # Internal: Build an unopened Net::HTTP connection for url with strict
    # timeouts; peer-verified SSL is enabled for https URLs.
    def get_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = cert_store
      end
      http.open_timeout = OPEN_TIMEOUT
      http.read_timeout = READ_TIMEOUT
      http
    end

    # Internal: Certificate store loaded with OpenSSL's default paths.
    def cert_store
      store = OpenSSL::X509::Store.new
      store.set_default_paths
      store
    end

    # Internal: Build a HEAD request for the path and query of url.
    def create_request(url, headers = {})
      Net::HTTP::Head.new(url.request_uri, headers)
    end

    # Internal: Open the connection, send the request and return the
    # response.
    def perform_request(connection, request)
      connection.start do |http|
        http.request(request)
      end
    end
  end
end
 
### END implementation; begin tests ###
 
# Self-test: runs only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'

  # Exercises UrlResolver against a scripted HTTP adapter, so no network
  # access is made.
  class UrlResolverTest < Test::Unit::TestCase
    def setup
      @http = TestHttpAdapter.new
    end

    # Resolve url through the scripted adapter.
    def resolve(url)
      UrlResolver.new(url, @http).resolve
    end

    # HTTP adapter whose perform_request answers from a queue of callables
    # instead of talking to a server.
    class TestHttpAdapter < UrlResolver::HttpAdapter
      def initialize
        super
        @expectations = []
      end

      # Queue a handler for an upcoming request.
      #
      # expectation - callable taking (connection, request); optional when
      #               a block is given
      #
      # The block is captured explicitly: calling Proc.new without a block
      # raises ArgumentError on Ruby 3.0 and later.
      def expect_request(expectation = nil, &block)
        handler = expectation || block
        raise ArgumentError, 'expect_request needs a callable or a block' unless handler
        @expectations << handler
      end

      # Answer with the first queued handler. The last handler is never
      # removed, so it keeps answering any further requests.
      def perform_request(connection, request)
        response = @expectations.first.call(connection, request)
        @expectations.shift if @expectations.size > 1
        response
      end
    end

    # Minimal stand-in for a Net::HTTP response.
    MockResponse = Struct.new(:code, :message, :headers) do
      def [](name)
        headers[name]
      end

      def to_hash
        headers
      end
    end

    # Build a MockResponse; 4xx codes get the message 'Not Found', every
    # other code gets 'OK'.
    def mock_response(code, headers = {})
      message = (400...500) === code ? 'Not Found' : 'OK'
      MockResponse.new(code.to_s, message, headers)
    end

    def test_direct_url
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert !http.use_ssl?
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://disney.com/pluto'
      assert !resolution.failed?, "expected not to have failed"
      assert !resolution.changed?, "expected not to have changed"
      assert resolution.final_url.respond_to?(:host)
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 0, resolution.num_redirects
    end

    def test_failed_resolve
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com'
      assert resolution.failed?
      assert_equal "boom!", resolution.failed_reason
    end

    def test_endless_redirect
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert_equal "redirect limit exceeded", resolution.failed_reason
      assert_equal 5, resolution.num_redirects
      assert_equal 'http://disney.com', resolution.final_url.to_s
    end

    def test_normal_redirect
      @http.expect_request do |http, request|
        assert_equal 't.co', http.address
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_redirect_to_dead_url
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        mock_response 404
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert resolution.dead?, "expected to be dead"
      assert_equal "server returned 404 Not Found", resolution.failed_reason
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_multiple_redirects
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal '/pluto', request.path
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        assert_equal '/', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com', resolution.final_url.to_s
      assert_equal 2, resolution.num_redirects
    end

    def test_ssl
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'https://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert http.use_ssl?, "expected to use SSL"
        assert_equal 443, http.port
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'https://disney.com/pluto', resolution.final_url.to_s
    end

    def test_referer
      @http.expect_request do |http, request|
        assert_nil request['referer'], "expected blank referer"
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'http://t.co/short', request['referer']
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
    end

    def test_response
      @http.expect_request do |http, request|
        mock_response 200, 'ETag' => 'hi!'
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '200', resolution.response.code
      assert_equal 200, resolution.response_code
      assert_equal({'ETag' => 'hi!'}, resolution.response_headers)
    end

    def test_failed_response
      @http.expect_request do |http, request|
        mock_response 503
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '503', resolution.response.code
      assert_equal 503, resolution.response_code
    end

    def test_exception_response
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal 500, resolution.response_code
      assert_equal({}, resolution.response_headers)
      assert_nil resolution.response
    end
  end
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.