Last active
August 29, 2015 14:20
-
-
Save graingerkid/c43e63798c2e3a6f74ce to your computer and use it in GitHub Desktop.
Simple helper function bringing together two different libraries for extracting a URL's components. Input URLs are lightly validated; this simply guards against users entering example.com instead of http://example.com, since the former causes both parsers to return incorrect results. The results are then returned in a named tuple for easy extraction.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# pip install tldextract | |
import tldextract | |
# standard lib | |
from urlparse import urlparse | |
from collections import namedtuple | |
# Result type for url_split, built once so every call returns the same class.
# The type name 'urlsplit' is kept so the printed repr stays unchanged.
_UrlSplitResult = namedtuple(
    'urlsplit',
    ['protocol', 'subdomain', 'domain', 'tld', 'path', 'full', 'domain_and_tld']
)


def url_split(url):
    '''
    Split a URL into its components using tldextract and urlparse.

    tldextract supplies the subdomain / domain / suffix split, and the
    standard library urlparse supplies the scheme, path and query string.

    If the URL carries no scheme (e.g. ``example.com`` rather than
    ``http://example.com``), ``http://`` is assumed, because both parsers
    give wrong results for scheme-less input.

    Returns a named tuple with the fields:

    protocol        -- scheme followed by ``://`` (e.g. ``'http://'``)
    subdomain       -- subdomain with a trailing dot, or ``''`` if absent
    domain          -- registered domain name, with a trailing dot when a
                       suffix follows it (e.g. ``'example.'``)
    tld             -- the suffix reported by tldextract (e.g. ``'com'``)
    path            -- path, plus ``?query`` when a query string is present
    full            -- the pieces above joined back together
    domain_and_tld  -- domain and suffix joined (e.g. ``'example.com'``)

    Only the path and query string are taken from urlparse; a fragment
    (``#...``) is not included in ``path`` or ``full``.
    '''
    # Function-scope import keeps this block self-contained.
    import re

    # Decide whether a scheme is present by looking for "<scheme>://" at the
    # very start. A plain startswith('http') check would wrongly accept hosts
    # such as "httpbin.org" and wrongly reject "HTTP://..." or "ftp://...".
    if url.startswith('//'):
        # Protocol-relative URL: only the scheme name is missing.
        url = 'http:' + url
    elif not re.match(r'^[A-Za-z][A-Za-z0-9+.\-]*://', url):
        url = 'http://' + url

    extracted = tldextract.extract(url)  # subdomain / domain / suffix split
    parsed = urlparse(url)               # scheme, path and query

    protocol = parsed.scheme + '://'
    subdomain = extracted.subdomain + '.' if extracted.subdomain else ''
    tld = extracted.suffix
    # Add the separating dot only when a suffix follows; otherwise a host for
    # which tldextract reports no suffix would come out as e.g. "localhost.".
    domain = extracted.domain + '.' if tld else extracted.domain

    if parsed.query:
        path = parsed.path + '?' + parsed.query
    else:
        path = parsed.path

    full = '{}{}{}{}{}'.format(protocol, subdomain, domain, tld, path)
    domain_and_tld = '{}{}'.format(domain, tld)

    return _UrlSplitResult(
        protocol=protocol,
        subdomain=subdomain,
        domain=domain,
        tld=tld,
        path=path,
        full=full,
        domain_and_tld=domain_and_tld
    )
if __name__ == '__main__':
    # Example usage: runs only when the file is executed as a script, so
    # importing url_split from another module prints nothing.
    the_urls = url_split('http://www.example.com/widget?colour=blue&size=10')
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(the_urls)
    # output
    # urlsplit(protocol='http://', subdomain='www.', domain='example.',
    #          tld='com', path='/widget?colour=blue&size=10',
    #          full='http://www.example.com/widget?colour=blue&size=10',
    #          domain_and_tld='example.com')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment