bfirsh/short_urls.py

## short_urls.py
import requests
from urlparse import urlparse, urljoin

# Domains of known URL-shortening services. `resolve_short_url` checks a URL's
# domain against this list to decide whether it is worth resolving.
# Entries are lowercase; each domain should appear exactly once.
URL_SHORTENERS = [
    'bit.ly',
    'ow.ly',
    'dlvr.it',
    'fb.me',
    '4sq.com',
    'is.gd',
    'j.mp',
    'tmblr.co',
    'ibm.co',
    'on.fb.me',
    'vsb.li',
    'shar.es',
    'ht.ly',
    'wp.me',
    'dld.bz',
    'n.pr',
    'tiny.cc',
    'ar.gy',
    'adf.ly',
    'lnkd.in',
    'slidesha.re',
    'bbc.in',
    'flic.kr',
    'bitly.com',
    'wapo.st',
    'aol.it',
    'monk.ly',
    'intel.ly',
    'go.ign.com',
    'twb.io',
    'short.to',
    's.co',
    'youtu.be',
    'huff.to',
    'on.mash.to',
    'goo.gl',
    'tinyurl.com',
    'git.io',
    'su.pr',
    'rww.to',
    'appd.it',
    'on.cnn.com',
    'trap.it',
    'zite.to',
    'itsh.bo',
]

def resolve_short_url(url, force=False, depth=1, timeout=10):
    """
    Given a short URL, returns its real URL. By default, it will only resolve a
    whitelist of known URL shorteners, but this can be overridden with the `force`
    argument.

    A URL is resolved when `force` is true, when its domain is shorter than 7
    characters, or when its domain is listed in URL_SHORTENERS. Any other URL
    is returned unchanged without making a request.

    Arguments:
      url     -- the URL to resolve.
      force   -- if true, skip the whitelist/length check and always resolve.
      depth   -- maximum number of redirects to follow; one HEAD request is
                 made per hop.
      timeout -- seconds to wait for each HEAD request, passed straight to
                 `requests.head`. Pass None to wait indefinitely.

    Returns the URL named by the last `Location` header that was followed, or
    the URL unchanged if the first response carried no `Location` header.

    Exceptions raised by `requests.head` (including timeouts) are not caught
    here and propagate to the caller.
    """
    # Host names are case-insensitive and URL_SHORTENERS holds lowercase
    # entries, so normalise before comparing.
    domain = urlparse(url).netloc.lower()

    # If the domain is long and not a known url shortener, don't bother resolving
    if not force and len(domain) >= 7 and domain not in URL_SHORTENERS:
        return url

    count = 0

    while count < depth:
        count += 1

        # Redirects are followed by hand so that `depth` bounds the number of
        # hops. The timeout keeps an unresponsive server from blocking forever.
        r = requests.head(url, allow_redirects=False, timeout=timeout)

        if 'location' not in r.headers:
            break

        url = r.headers['location']

        # (Pinched from requests)
        # Handle redirection without scheme (see: RFC 1808 Section 4)
        if url.startswith('//'):
            parsed_rurl = urlparse(r.url)
            url = '%s:%s' % (parsed_rurl.scheme, url)

        # Facilitate non-RFC2616-compliant 'location' headers
        # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
        if not urlparse(url).netloc:
            url = urljoin(r.url, url)

    return url
	import requests
	from urlparse import urlparse, urljoin

	# Domains of known URL-shortening services. `resolve_short_url` checks a
	# URL's domain against this list to decide whether it is worth resolving.
	# Entries are lowercase; each domain should appear exactly once.
	URL_SHORTENERS = [
	    'bit.ly',
	    'ow.ly',
	    'dlvr.it',
	    'fb.me',
	    '4sq.com',
	    'is.gd',
	    'j.mp',
	    'tmblr.co',
	    'ibm.co',
	    'on.fb.me',
	    'vsb.li',
	    'shar.es',
	    'ht.ly',
	    'wp.me',
	    'dld.bz',
	    'n.pr',
	    'tiny.cc',
	    'ar.gy',
	    'adf.ly',
	    'lnkd.in',
	    'slidesha.re',
	    'bbc.in',
	    'flic.kr',
	    'bitly.com',
	    'wapo.st',
	    'aol.it',
	    'monk.ly',
	    'intel.ly',
	    'go.ign.com',
	    'twb.io',
	    'short.to',
	    's.co',
	    'youtu.be',
	    'huff.to',
	    'on.mash.to',
	    'goo.gl',
	    'tinyurl.com',
	    'git.io',
	    'su.pr',
	    'rww.to',
	    'appd.it',
	    'on.cnn.com',
	    'trap.it',
	    'zite.to',
	    'itsh.bo',
	]

	def resolve_short_url(url, force=False, depth=1, timeout=10):
	    """
	    Given a short URL, returns its real URL. By default, it will only resolve a
	    whitelist of known URL shorteners, but this can be overridden with the `force`
	    argument.

	    A URL is resolved when `force` is true, when its domain is shorter than 7
	    characters, or when its domain is listed in URL_SHORTENERS. Any other URL
	    is returned unchanged without making a request.

	    Arguments:
	      url     -- the URL to resolve.
	      force   -- if true, skip the whitelist/length check and always resolve.
	      depth   -- maximum number of redirects to follow; one HEAD request is
	                 made per hop.
	      timeout -- seconds to wait for each HEAD request, passed straight to
	                 `requests.head`. Pass None to wait indefinitely.

	    Returns the URL named by the last `Location` header that was followed, or
	    the URL unchanged if the first response carried no `Location` header.

	    Exceptions raised by `requests.head` (including timeouts) are not caught
	    here and propagate to the caller.
	    """
	    # Host names are case-insensitive and URL_SHORTENERS holds lowercase
	    # entries, so normalise before comparing.
	    domain = urlparse(url).netloc.lower()

	    # If the domain is long and not a known url shortener, don't bother resolving
	    if not force and len(domain) >= 7 and domain not in URL_SHORTENERS:
	        return url

	    count = 0

	    while count < depth:
	        count += 1

	        # Redirects are followed by hand so that `depth` bounds the number of
	        # hops. The timeout keeps an unresponsive server from blocking forever.
	        r = requests.head(url, allow_redirects=False, timeout=timeout)

	        if 'location' not in r.headers:
	            break

	        url = r.headers['location']

	        # (Pinched from requests)
	        # Handle redirection without scheme (see: RFC 1808 Section 4)
	        if url.startswith('//'):
	            parsed_rurl = urlparse(r.url)
	            url = '%s:%s' % (parsed_rurl.scheme, url)

	        # Facilitate non-RFC2616-compliant 'location' headers
	        # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
	        if not urlparse(url).netloc:
	            url = urljoin(r.url, url)

	    return url