Skip to content

Instantly share code, notes, and snippets.

@xtream1101
Last active May 6, 2016 14:08
Show Gist options
  • Save xtream1101/764b70b6241997ee01d692ab961d246e to your computer and use it in GitHub Desktop.
Save xtream1101/764b70b6241997ee01d692ab961d246e to your computer and use it in GitHub Desktop.
URL Regex
import re
from pprint import pprint
"""
TODO:
Support port numbers in the URL
Support IPs as domains. An IP address cannot have a subdomain or a TLD
"""
"""
What it does:
Splits a valid URL into its base parts (with a few exceptions, e.g. the edge cases in the tests)
- protocol
- subdomain(s)
- domain
- tld
- endpoint
What it does NOT do:
- Check if a url is valid or not
"""
# Verbose-mode pattern that splits a URL into the named groups protocol,
# subdomain, domain, tld and endpoint.  A raw string is used so that regex
# escapes such as \w and \. reach the regex engine untouched instead of being
# interpreted (and warned about) as invalid Python string escapes.
p = re.compile(r"""
    # Optional protocol: the word characters in front of ://
    (?:(?P<protocol>\w+)://)?
    # All subdomains together; the lazy quantifiers keep this group as
    # small as possible so that domain and tld are tried first
    (?P<subdomain>(?:\w+\.??)+?)??\.?
    # Domain label (word characters and hyphens) followed by a dot
    (?P<domain>[-\w]+)\.
    # TLD: a single label of 2+ characters, or a two-part suffix made of
    # a 1-2 character label, a dot and a 2 character label (e.g. co.uk)
    (?P<tld>(?:[-\w]{2,}|[-\w]{1,2}\.[-\w]{2}))
    # Optional endpoint: everything from the first slash to the end
    (?P<endpoint>/.*)?$
""", re.VERBOSE)
def _case(url, protocol=None, subdomain=None, domain=None, tld=None,
          endpoint=None):
    """Build one test entry.

    Returns a dict with the 'url' to parse and a 'check' dict holding the
    expected value of every named group; parts that are not passed are
    expected to be None.
    """
    return {
        'url': url,
        'check': {
            'protocol': protocol,
            'subdomain': subdomain,
            'domain': domain,
            'tld': tld,
            'endpoint': endpoint,
        },
    }


# Each entry pairs a URL with the parts it is expected to split into.
tests = [
    _case("hilton.com", domain='hilton', tld='com'),
    _case("hil3ton.com", domain='hil3ton', tld='com'),
    _case("fa4l.hil3ton.com", subdomain='fa4l', domain='hil3ton', tld='com'),
    _case("x-ex.io", domain='x-ex', tld='io'),
    _case("www.hilton.com", subdomain='www', domain='hilton', tld='com'),
    _case("www3.hilton.com", subdomain='www3', domain='hilton', tld='com'),
    _case("ftp://test.example.com/", protocol='ftp', subdomain='test',
          domain='example', tld='com', endpoint='/'),
    _case("http://example.com/index.html", protocol='http',
          domain='example', tld='com', endpoint='/index.html'),
    _case("jw.marriott.com/", subdomain='jw', domain='marriott', tld='com',
          endpoint='/'),
    _case("www.jw.marriott.co/index.html", subdomain='www.jw',
          domain='marriott', tld='co', endpoint='/index.html'),
    _case("whotels.com/foo/bar.html", domain='whotels', tld='com',
          endpoint='/foo/bar.html'),
    _case("store.nike.io", subdomain='store', domain='nike', tld='io'),
    _case("nike.melbourne", domain='nike', tld='melbourne'),
    _case("store.nike.melbourne", subdomain='store', domain='nike',
          tld='melbourne'),
    _case("store.nike.co.uk", subdomain='store', domain='nike', tld='co.uk'),
    _case("www.o2.co.uk", subdomain='www', domain='o2', tld='co.uk'),
    _case("https://www.o2.co.uk", protocol='https', subdomain='www',
          domain='o2', tld='co.uk'),
    _case("www.cnn.co", subdomain='www', domain='cnn', tld='co'),
    _case("cnn.com", domain='cnn', tld='com'),
    _case("nike.co.uk", domain='nike', tld='co.uk'),
    _case("http://nike.co.uk", protocol='http', domain='nike', tld='co.uk'),
    _case("http://t.co", protocol='http', domain='t', tld='co'),
    _case("t.co", domain='t', tld='co'),
    _case("http://xn--74h.com/", protocol='http', domain='xn--74h',
          tld='com', endpoint='/'),
    _case("www.o2.com", subdomain='www', domain='o2', tld='com'),
    ##
    # Edge cases
    ##
    # If:
    #   root domain is <= 2 chars
    #   AND has a subdomain
    #   AND tld is <= 2 chars
    # These are ambiguous for a pattern that also accepts two-part tlds of
    # the form "<1-2 chars>.<2 chars>" (such as co.uk).  The expected values
    # below describe the correct split of each URL.
    _case("http://foo.t.co", protocol='http', subdomain='foo', domain='t',
          tld='co'),
    _case("www.o2.io", subdomain='www', domain='o2', tld='io'),
    _case("www.t.co", subdomain='www', domain='t', tld='co'),
    # The protocol expectations here were previously None even though the
    # URLs carry a protocol.
    _case("http://www.o2.co", protocol='http', subdomain='www', domain='o2',
          tld='co'),
    _case("https://www.t.co", protocol='https', subdomain='www', domain='t',
          tld='co'),
]
# Run every test case through the compiled pattern, report each outcome,
# and keep a running tally of passes and failures.
num_passed = 0
num_failed = 0
for case in tests:
    target = case['url']
    found = p.match(target)

    # The pattern did not match the URL at all.
    if found is None:
        print("No Match {}".format(target))
        num_failed += 1
        continue

    # Compare every named group against the expected parts.
    groups = found.groupdict()
    if groups == case['check']:
        print("Passed: {}".format(target))
        num_passed += 1
        continue

    # Matched, but split differently than expected: show what was parsed.
    print("Failed: {}".format(target))
    pprint(groups)
    num_failed += 1

# Summary of the whole run.
print("\n")
print("Passed: {}".format(num_passed))
print("Failed: {}".format(num_failed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment