Skip to content

Instantly share code, notes, and snippets.

@graingerkid
Last active August 29, 2015 14:20
Show Gist options
  • Save graingerkid/c43e63798c2e3a6f74ce to your computer and use it in GitHub Desktop.
Save graingerkid/c43e63798c2e3a6f74ce to your computer and use it in GitHub Desktop.
Simple helper function bringing together two different libraries for extracting a URL's components. Input URLs are lightly validated; this simply guards against a user entering example.com instead of http://example.com, since the former causes both parsers to return incorrect results. The results are then returned in a named tuple for easy extract…
# -*- coding: utf-8 -*-
# pip install tldextract
import tldextract
# standard lib
from urlparse import urlparse
from collections import namedtuple
# Result type for url_split(); built once at import time instead of on every call.
_UrlSplitResult = namedtuple(
    'urlsplit',
    ['protocol', 'subdomain', 'domain', 'tld', 'path', 'full', 'domain_and_tld']
)


def url_split(url):
    '''
    Split a URL into its components using tldextract and urlparse.

    tldextract supplies the subdomain / domain / suffix split, while the
    standard-library urlparse supplies the scheme, path and query string.

    If the URL does not start with http:// or https:// (compared
    case-insensitively), 'http://' is prepended before parsing, so that
    'example.com' is treated like 'http://example.com'.

    Returns a named tuple ('urlsplit') with the fields:
        protocol       -- scheme followed by '://', e.g. 'http://'
        subdomain      -- subdomain with a trailing '.', or '' if there is none
        domain         -- domain name with a trailing '.'; the dot is omitted
                          when tldextract reports no suffix
        tld            -- the suffix reported by tldextract, e.g. 'com'
        path           -- path, plus '?' and the query string when one is present
        full           -- the five parts above joined back together
        domain_and_tld -- domain followed by tld, e.g. 'example.com'

    Only scheme, path and query are read from urlparse; other parts it reports
    (such as the fragment) are not included in the result.
    '''
    # Require the full scheme prefix: a bare 'http' test would also accept
    # hosts such as 'httpbin.org' and leave them without a scheme.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    extracted = tldextract.extract(url)  # sub / domain / suffix split
    parsed = urlparse(url)  # python standard lib parser

    protocol = parsed.scheme + '://'
    subdomain = extracted.subdomain + '.' if extracted.subdomain else ''
    suffix = extracted.suffix
    # When tldextract finds no suffix, a trailing dot would leave a stray '.'
    # in 'full' and 'domain_and_tld', so only add the separator when needed.
    domain = extracted.domain + '.' if suffix else extracted.domain

    path = parsed.path
    if parsed.query:
        path += '?' + parsed.query

    full = ''.join([protocol, subdomain, domain, suffix, path])
    domain_and_tld = domain + suffix

    return _UrlSplitResult(
        protocol=protocol,
        subdomain=subdomain,
        domain=domain,
        tld=suffix,
        path=path,
        full=full,
        domain_and_tld=domain_and_tld
    )
# Demo: split a sample URL and print the resulting named tuple.
the_urls = url_split(
    'http://www.example.com/widget?colour=blue&size=10'
)
print(the_urls)
# output
# urlsplit(protocol='http://', subdomain='www.', domain='example.', tld='com', path='/widget?colour=blue&size=10', full='http://www.example.com/widget?colour=blue&size=10', domain_and_tld='example.com')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment