Created
March 8, 2012 21:24
-
-
Save bfirsh/2003517 to your computer and use it in GitHub Desktop.
Efficiently expand short URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from urlparse import urlparse, urljoin | |
# Hostnames of known URL-shortening services. Entries are lower-case bare
# hostnames (no scheme, port or path), each listed exactly once.
URL_SHORTENERS = [
    'bit.ly',
    'ow.ly',
    'dlvr.it',
    'fb.me',
    '4sq.com',
    'is.gd',
    'j.mp',
    'tmblr.co',
    'ibm.co',
    'on.fb.me',
    'vsb.li',
    'shar.es',
    'ht.ly',
    'wp.me',
    'dld.bz',
    'n.pr',
    'tiny.cc',
    'ar.gy',
    'adf.ly',
    'lnkd.in',
    'slidesha.re',
    'bbc.in',
    'flic.kr',
    'bitly.com',
    'wapo.st',
    'aol.it',
    'monk.ly',
    'intel.ly',
    'go.ign.com',
    'twb.io',
    'short.to',
    's.co',
    'youtu.be',
    'huff.to',
    'on.mash.to',
    'goo.gl',
    'tinyurl.com',
    'git.io',
    'su.pr',
    'rww.to',
    'appd.it',
    'on.cnn.com',
    'trap.it',
    'zite.to',
    'itsh.bo',
]
def resolve_short_url(url, force=False, depth=1, timeout=10):
    """
    Given a short URL, returns its real URL. By default, it will only resolve a
    whitelist of known URL shorteners, but this can be overridden with the `force`
    argument.

    `depth` is the maximum number of redirects to follow; the default of 1
    follows a single hop. `timeout` is passed straight to `requests.head` so
    that an unresponsive host cannot block the caller forever.

    Returns `url` unchanged when resolution is skipped or when the server
    sends no `Location` header; otherwise returns the last redirect target
    reached. Exceptions raised by `requests` are not caught here.
    """
    # `hostname` (unlike `netloc`) is lower-cased and excludes any port or
    # credentials, so 'http://Bit.ly:80/x' still matches the whitelist.
    domain = urlparse(url).hostname or ''

    # If the domain is long and not a known url shortener, don't bother resolving
    if not force and len(domain) >= 7 and domain not in URL_SHORTENERS:
        return url

    # A counting `while` (rather than `range`) keeps non-integer depths such
    # as float('inf') working.
    count = 0
    while count < depth:
        count += 1
        # HEAD is enough: only the redirect target is needed, not the body.
        r = requests.head(url, allow_redirects=False, timeout=timeout)
        if 'location' not in r.headers:
            break
        url = r.headers['location']

        # (Pinched from requests)
        # Handle redirection without scheme (see: RFC 1808 Section 4)
        if url.startswith('//'):
            parsed_rurl = urlparse(r.url)
            url = '%s:%s' % (parsed_rurl.scheme, url)

        # Facilitate non-RFC2616-compliant 'location' headers
        # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
        if not urlparse(url).netloc:
            url = urljoin(r.url, url)

    return url
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment