Last active
May 6, 2016 14:08
-
-
Save xtream1101/764b70b6241997ee01d692ab961d246e to your computer and use it in GitHub Desktop.
URL Regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from pprint import pprint | |
""" | |
TODO:
    Support port numbers in the URL
    Support IP addresses as domains. An IP address cannot have a subdomain or a TLD
""" | |
""" | |
What it does:
    Splits a valid URL into its base parts (with a few exceptions, e.g. the edge cases in the tests)
        - protocol
        - subdomain(s)
        - domain
        - tld
        - endpoint
What it does NOT do:
    - Check whether a URL is valid
""" | |
# Compiled pattern that splits a URL-like string into five named groups:
# protocol, subdomain, domain, tld and endpoint. Groups that are absent from
# the input are reported as None by ``groupdict()``.
#
# Notes:
#   * The pattern is a raw string so that regex escapes such as ``\w`` are not
#     treated as (invalid) string escapes by the Python compiler.
#   * Subdomain labels accept hyphens, the same as the domain label, so
#     "my-site.example.com" is parsed instead of failing to match.
#   * Host labels are always delimited by a literal dot. This keeps matching
#     fast on long inputs that contain no dot at all, and means an empty label
#     (two consecutive dots in the host) never matches.
#   * Known limitation: a one- or two-character label followed by a
#     two-character label is always read as a two-part TLD, so "nike.co.uk"
#     gives tld "co.uk" but "www.t.co" gives domain "www" and tld "t.co".
#   * Port numbers and IP-address hosts are not supported.
p = re.compile(r"""
    # Optional protocol, e.g. the "http" in "http://"
    (?:(?P<protocol>\w+)://)?
    # Optional subdomain(s): dot-separated labels followed by a dot. Both the
    # label repetition and the whole group are lazy, so the shortest prefix
    # that still lets the rest match is captured. A bare leading dot is
    # tolerated and captures nothing.
    (?:(?P<subdomain>[-\w]+(?:\.[-\w]+)*?)\.|\.)??
    # Domain: the single label directly before the TLD
    (?P<domain>[-\w]+)\.
    # TLD: one label of two or more characters, or a short two-part form
    # such as "co.uk"
    (?P<tld>[-\w]{2,}|[-\w]{1,2}\.[-\w]{2})
    # Optional endpoint: everything from the first slash to the end
    (?P<endpoint>/.*)?$
""", re.VERBOSE)
def _case(url, protocol=None, subdomain=None, domain=None, tld=None,
          endpoint=None):
    """Build one test entry.

    Returns a dict with the input under 'url' and, under 'check', the
    dictionary that the match's groupdict() is expected to equal. Parts that
    are not given are expected to be None.
    """
    return {
        'url': url,
        'check': {
            'protocol': protocol,
            'subdomain': subdomain,
            'domain': domain,
            'tld': tld,
            'endpoint': endpoint,
        },
    }


# Table of inputs and the parts each one is expected to split into.
tests = [
    _case("hilton.com", domain='hilton', tld='com'),
    _case("hil3ton.com", domain='hil3ton', tld='com'),
    _case("fa4l.hil3ton.com", subdomain='fa4l', domain='hil3ton', tld='com'),
    _case("x-ex.io", domain='x-ex', tld='io'),
    _case("www.hilton.com", subdomain='www', domain='hilton', tld='com'),
    _case("www3.hilton.com", subdomain='www3', domain='hilton', tld='com'),
    _case("ftp://test.example.com/", protocol='ftp', subdomain='test',
          domain='example', tld='com', endpoint='/'),
    _case("http://example.com/index.html", protocol='http',
          domain='example', tld='com', endpoint='/index.html'),
    _case("jw.marriott.com/", subdomain='jw', domain='marriott', tld='com',
          endpoint='/'),
    _case("www.jw.marriott.co/index.html", subdomain='www.jw',
          domain='marriott', tld='co', endpoint='/index.html'),
    _case("whotels.com/foo/bar.html", domain='whotels', tld='com',
          endpoint='/foo/bar.html'),
    _case("store.nike.io", subdomain='store', domain='nike', tld='io'),
    _case("nike.melbourne", domain='nike', tld='melbourne'),
    _case("store.nike.melbourne", subdomain='store', domain='nike',
          tld='melbourne'),
    _case("store.nike.co.uk", subdomain='store', domain='nike', tld='co.uk'),
    _case("www.o2.co.uk", subdomain='www', domain='o2', tld='co.uk'),
    _case("www.o2.co.uk", subdomain='www', domain='o2', tld='co.uk'),
    _case("https://www.o2.co.uk", protocol='https', subdomain='www',
          domain='o2', tld='co.uk'),
    _case("www.cnn.co", subdomain='www', domain='cnn', tld='co'),
    _case("cnn.com", domain='cnn', tld='com'),
    _case("nike.co.uk", domain='nike', tld='co.uk'),
    _case("http://nike.co.uk", protocol='http', domain='nike', tld='co.uk'),
    _case("http://t.co", protocol='http', domain='t', tld='co'),
    _case("t.co", domain='t', tld='co'),
    _case("http://xn--74h.com/", protocol='http', domain='xn--74h',
          tld='com', endpoint='/'),
    _case("www.o2.com", subdomain='www', domain='o2', tld='com'),
    ##
    # Edge cases
    ##
    # If:
    #   root domain is <= 2 chars
    #   AND has a subdomain
    #   AND tld is <= 2 chars
    # These are expected to fail for now: the short domain plus the
    # two-character TLD is read as a two-part TLD (e.g. "t.co"), which shifts
    # the subdomain into the domain slot.
    _case("http://foo.t.co", protocol='http', subdomain='foo', domain='t',
          tld='co'),
    _case("www.o2.io", subdomain='www', domain='o2', tld='io'),
    _case("www.t.co", subdomain='www', domain='t', tld='co'),
    # The two entries below previously expected protocol None even though the
    # URL carries one; the expectation now states the correct protocol.
    _case("http://www.o2.co", protocol='http', subdomain='www', domain='o2',
          tld='co'),
    _case("https://www.t.co", protocol='https', subdomain='www', domain='t',
          tld='co'),
]
# Run every entry in `tests` through the compiled pattern `p`, report the
# outcome per URL, then print the totals.
num_passed = 0
num_failed = 0

for case in tests:
    target = case['url']
    expected = case['check']
    hit = p.match(target)

    # The pattern did not match the URL at all.
    if hit is None:
        print("No Match {}".format(target))
        num_failed += 1
        continue

    parts = hit.groupdict()

    # Matched, but at least one captured part differs from the expectation;
    # dump what was actually captured to make the difference visible.
    if parts != expected:
        print("Failed: {}".format(target))
        pprint(parts)
        num_failed += 1
        continue

    print("Passed: {}".format(target))
    num_passed += 1

# Summary
print("\n")
print("Passed: {}".format(num_passed))
print("Failed: {}".format(num_failed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment