Skip to content

Instantly share code, notes, and snippets.

@bfirsh
Created March 8, 2012 21:24
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bfirsh/2003517 to your computer and use it in GitHub Desktop.
Save bfirsh/2003517 to your computer and use it in GitHub Desktop.
Efficiently expand short URLs
import requests
from urlparse import urlparse, urljoin
# Hostnames of known URL-shortening services. `resolve_short_url` only follows
# redirects for hosts in this list (or hosts shorter than 7 characters) unless
# it is called with `force=True`. Entries are lower-case and unique.
URL_SHORTENERS = [
    'bit.ly',
    'ow.ly',
    'dlvr.it',
    'fb.me',
    '4sq.com',
    'is.gd',
    'j.mp',
    'tmblr.co',
    'ibm.co',
    'on.fb.me',
    'vsb.li',
    'shar.es',
    'ht.ly',
    'wp.me',
    'dld.bz',
    'n.pr',
    'tiny.cc',
    'ar.gy',
    'adf.ly',
    'lnkd.in',
    'slidesha.re',
    'bbc.in',
    'flic.kr',
    'bitly.com',
    'wapo.st',
    'aol.it',
    'monk.ly',
    'intel.ly',
    'go.ign.com',
    'twb.io',
    'short.to',
    's.co',
    'youtu.be',
    'huff.to',
    'on.mash.to',
    'goo.gl',
    'tinyurl.com',
    'git.io',
    'su.pr',
    'rww.to',
    'appd.it',
    'on.cnn.com',
    'trap.it',
    'zite.to',
    'itsh.bo',
]
def resolve_short_url(url, force=False, depth=1, timeout=10):
    """
    Given a short URL, returns its real URL.

    By default only URLs whose host is short (fewer than 7 characters) or is
    listed in `URL_SHORTENERS` are resolved; any other URL is returned
    unchanged without making a network request. This can be overridden with
    the `force` argument, which resolves any URL.

    `depth` is the maximum number of redirects to follow; one HEAD request is
    issued per hop and resolution stops early at the first response without a
    `Location` header.

    `timeout` is the number of seconds to wait on each request before
    `requests` raises a timeout error. Pass `None` to wait indefinitely.
    """
    # Use the hostname rather than the raw netloc: it is lower-cased and has
    # any port or credentials stripped, so 'BITLY.COM' and 'bitly.com:80'
    # still match the whitelist.
    domain = urlparse(url).hostname or ''
    # If the domain is long and not a known url shortener, don't bother resolving
    if not force and len(domain) >= 7 and domain not in URL_SHORTENERS:
        return url
    hops = 0
    while hops < depth:
        hops += 1
        # HEAD is enough: only the redirect target is needed, not the body.
        r = requests.head(url, allow_redirects=False, timeout=timeout)
        location = r.headers.get('location')
        if location is None:
            break
        # urljoin leaves absolute URLs alone and resolves both scheme-relative
        # ('//host/path', RFC 1808 section 4) and non-RFC2616-compliant
        # relative ('/path/to/resource') Location values against the URL that
        # was requested.
        url = urljoin(r.url, location)
    return url
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment