Skip to content

Instantly share code, notes, and snippets.

@heath
Last active August 29, 2015 14:02
Show Gist options
  • Save heath/b3633a2b67731abce756 to your computer and use it in GitHub Desktop.
Save heath/b3633a2b67731abce756 to your computer and use it in GitHub Desktop.
"""
Provide a module which parses URLs into useful features (protocol, hostname, path, etc).
"""
import re
class url:
    """Match or extract single features (protocol, hostname, path) of a URL string."""

    def parse(self, feature, url):
        """
        Check or extract one feature of ``url``.

        ``feature`` is a mapping with one of the keys "protocol",
        "hostname" or "path", and looks like this:

            {"protocol": "ftp"}
            {"hostname": "www.google.com"}
            {"path": "news/google"}

        The hostname is the fully qualified domain name as written in the
        URL (every dot-separated label, e.g. ``www.google.com``).

        An example call to this method:

            url().parse({"protocol": "http"}, "http://www.google.com")

        Returns:
            * a ``NameError`` instance (returned, not raised) when ``feature``
              holds none of the recognised keys;
            * for "protocol": ``(True, [scheme], url)`` when the URL's scheme
              equals the requested one (given with or without the trailing
              "://"), otherwise ``(False, feature, found)`` where ``found``
              is the list of schemes detected (possibly empty). When the
              "protocol" key is present no other key is examined;
            * for "hostname": ``(True, [host])`` when the URL's host equals
              ``feature["hostname"]``;
            * for "path": ``(True, [path])`` with the path (leading slash
              stripped, query/fragment excluded) of a URL that has a scheme.
              The value stored under "path" is not compared;
            * ``None`` when nothing above applies.

        No TLD validation is performed here; use ``is_fqdn`` for that.
        """
        # check for valid feature
        recognised = [name for name in feature
                      if name in ("protocol", "hostname", "path")]
        if not recognised:
            return NameError("Please use 'protocol', 'hostname', or 'path' when listing the feature you want to parse")

        protocol = re.findall(r'^(\w+://)', url)

        if protocol:
            # Everything between "://" and the first character that cannot be
            # part of a host name (slash, colon, "?", "#", ...).
            hostname = re.findall(r'^\w+://([\w.-]+)', url)
        else:
            # Without a scheme, only accept a leading token that contains at
            # least one dot, so plain words are not mistaken for hosts.
            hostname = re.findall(r'^([\w-]+(?:\.[\w-]+)+)', url)

        # The host part is skipped non-greedily-by-construction ([^/?#]*), so
        # the group really captures the path; it is '' when the URL has none.
        path = re.findall(r'^\w+://[^/?#]*/?([^?#]*)', url)

        if "protocol" in feature:
            expected = feature["protocol"]
            if protocol and protocol[0] in (expected, expected + "://"):
                return (True, protocol, url)
            return (False, feature, protocol)

        # findall() yields a list, so compare its single element, not the list.
        if "hostname" in feature and hostname and hostname[0] == feature["hostname"]:
            return (True, hostname)

        if "path" in feature and path:
            return (True, path)

        return None

    def is_fqdn(self, has_protocol, url):
        """
        Report whether a URL that carries a scheme lacks a top-level domain.

        ``has_protocol`` tells whether ``url`` starts with "<scheme>://";
        when it is falsy no check is made.

        Returns a ``NameError`` instance (returned, not raised) when the host
        following the scheme has no dot-separated TLD, otherwise ``None``.
        """
        # check for lack of tld; labels may contain hyphens (my-site.com)
        if has_protocol and not re.match(r'^\w+://[\w-]+(?:\.[\w-]+)+', url):
            return NameError("url doesn't include a tld.")
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment