# graingerkid/url_splitter.py

## url_splitter.py
# -*- coding: utf-8 -*-

# pip install tldextract
import tldextract

# standard lib
from urlparse import urlparse
from collections import namedtuple


# Result container, built once at import time so that every call returns the
# same tuple type. The type name 'urlsplit' is what appears in the repr.
_UrlSplitResult = namedtuple(
    'urlsplit',
    ['protocol', 'subdomain', 'domain', 'tld', 'path', 'full', 'domain_and_tld']
)


def url_split(url):
    '''
    Split a URL into its protocol, subdomain, domain, TLD and path parts.

    Combines two parsers: ``tldextract`` for the subdomain / domain / suffix
    split of the host, and the standard library ``urlparse`` for the scheme,
    path and query string.

    If the URL does not already begin with ``http://`` or ``https://``
    (compared case-insensitively), ``http://`` is prepended first, because
    both parsers give wrong results for scheme-less input such as
    ``example.com``. A protocol-relative URL (``//example.com``) only gets
    the missing ``http:`` added.

    :param url: the URL to split, with or without an http(s) scheme.
    :returns: a ``urlsplit`` named tuple with the string fields

        * ``protocol``       -- scheme followed by ``://``
        * ``subdomain``      -- subdomain followed by ``.``, or ``''`` if none
        * ``domain``         -- registered domain name, followed by ``.`` when
          a suffix is present
        * ``tld``            -- the public suffix reported by tldextract
        * ``path``           -- path plus ``?query`` when a query is present
        * ``full``           -- the five fields above joined together
        * ``domain_and_tld`` -- ``domain`` and ``tld`` joined together

    Note that ``full`` is rebuilt only from the fields above, so any port,
    credentials, ``;params`` or ``#fragment`` in the input are not included.
    '''
    # Require the complete '<scheme>://' prefix: a bare startswith('http')
    # would also match host names that merely begin with those letters
    # (e.g. 'httpbin.org') and leave them without a scheme.
    if url.startswith('//'):
        url = 'http:' + url
    elif not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    host_parts = tldextract.extract(url)  # sub / domain / suffix of the host
    parsed = urlparse(url)  # scheme, path and query

    protocol = parsed.scheme + '://'
    subdomain = host_parts.subdomain + '.' if host_parts.subdomain else ''
    suffix = host_parts.suffix
    # tldextract reports an empty suffix for hosts such as 'localhost' or IP
    # addresses; appending the separator there would yield e.g. 'localhost.'.
    domain = host_parts.domain + '.' if suffix else host_parts.domain

    if parsed.query:
        path = parsed.path + '?' + parsed.query
    else:
        path = parsed.path

    return _UrlSplitResult(
        protocol=protocol,
        subdomain=subdomain,
        domain=domain,
        tld=suffix,
        path=path,
        full='{}{}{}{}{}'.format(protocol, subdomain, domain, suffix, path),
        domain_and_tld='{}{}'.format(domain, suffix)
    )


# Demo: split a sample URL and print the resulting named tuple.
# NOTE: this is module-level code, so it also runs when the file is imported.
the_urls = url_split('http://www.example.com/widget?colour=blue&size=10')

# The parenthesised single-argument form prints the same text on Python 2
# and is also valid syntax on Python 3.
print(the_urls)

# output
# urlsplit(protocol='http://', subdomain='www.', domain='example.', tld='com', path='/widget?colour=blue&size=10', full='http://www.example.com/widget?colour=blue&size=10', domain_and_tld='example.com')
	# -*- coding: utf-8 -*-

	# pip install tldextract
	import tldextract

	# standard lib
	from urlparse import urlparse
	from collections import namedtuple


	# Result container, built once so that every call returns the same tuple
	# type. The type name 'urlsplit' is what appears in the repr.
	_UrlSplitResult = namedtuple(
	    'urlsplit',
	    ['protocol', 'subdomain', 'domain', 'tld', 'path', 'full', 'domain_and_tld']
	)


	def url_split(url):
	    '''
	    Split a URL into its protocol, subdomain, domain, TLD and path parts.

	    Combines two parsers: ``tldextract`` for the subdomain / domain /
	    suffix split of the host, and the standard library ``urlparse`` for
	    the scheme, path and query string.

	    If the URL does not already begin with ``http://`` or ``https://``
	    (compared case-insensitively), ``http://`` is prepended first, because
	    both parsers give wrong results for scheme-less input such as
	    ``example.com``. A protocol-relative URL (``//example.com``) only gets
	    the missing ``http:`` added.

	    :param url: the URL to split, with or without an http(s) scheme.
	    :returns: a ``urlsplit`` named tuple with the string fields
	        ``protocol`` (scheme plus ``://``), ``subdomain`` (with trailing
	        ``.``, or ``''``), ``domain`` (with trailing ``.`` when a suffix
	        is present), ``tld`` (the suffix reported by tldextract), ``path``
	        (path plus ``?query`` when present), ``full`` (those five joined)
	        and ``domain_and_tld`` (``domain`` and ``tld`` joined).

	    Note that ``full`` is rebuilt only from the fields above, so any port,
	    credentials, ``;params`` or ``#fragment`` in the input are not included.
	    '''
	    # Require the complete '<scheme>://' prefix: a bare startswith('http')
	    # would also match host names that merely begin with those letters
	    # (e.g. 'httpbin.org') and leave them without a scheme.
	    if url.startswith('//'):
	        url = 'http:' + url
	    elif not url.lower().startswith(('http://', 'https://')):
	        url = 'http://' + url

	    host_parts = tldextract.extract(url)  # sub / domain / suffix of the host
	    parsed = urlparse(url)  # scheme, path and query

	    protocol = parsed.scheme + '://'
	    subdomain = host_parts.subdomain + '.' if host_parts.subdomain else ''
	    suffix = host_parts.suffix
	    # tldextract reports an empty suffix for hosts such as 'localhost' or
	    # IP addresses; appending the separator there would yield 'localhost.'.
	    domain = host_parts.domain + '.' if suffix else host_parts.domain

	    if parsed.query:
	        path = parsed.path + '?' + parsed.query
	    else:
	        path = parsed.path

	    return _UrlSplitResult(
	        protocol=protocol,
	        subdomain=subdomain,
	        domain=domain,
	        tld=suffix,
	        path=path,
	        full='{}{}{}{}{}'.format(protocol, subdomain, domain, suffix, path),
	        domain_and_tld='{}{}'.format(domain, suffix)
	    )


	# Demo: split a sample URL and print the resulting named tuple.
	the_urls = url_split('http://www.example.com/widget?colour=blue&size=10')

	# The parenthesised single-argument form prints the same text on Python 2
	# and is also valid syntax on Python 3.
	print(the_urls)

	# output
	# urlsplit(protocol='http://', subdomain='www.', domain='example.', tld='com', path='/widget?colour=blue&size=10', full='http://www.example.com/widget?colour=blue&size=10', domain_and_tld='example.com')