Skip to content

Instantly share code, notes, and snippets.

@emmettbutler
Last active August 29, 2015 13:57
Show Gist options
  • Save emmettbutler/9394992 to your computer and use it in GitHub Desktop.
Save emmettbutler/9394992 to your computer and use it in GitHub Desktop.
Example solution to introductory python exercise
"""urltools.py - parse and format web URLs.
HINT:
>>> "http://google.com".split("://")
['http', 'google.com']
>>> "google.com/hangout/parsely.com/am".split("/")
['google.com', 'hangout', 'parsely.com', 'am']
>>> "google.com/hangout/parsely.com/am".split("/", 1)
['google.com', 'hangout/parsely.com/am']
This is basically all you need to implement the parser.
For formatting / rejoining:
>>> "{host}/{path}".format(host="google.com", path="plus")
'google.com/plus'
You can do the whole thing without a single import!
"""
class URLParseError(Exception):
    """Error raised when a URL string cannot be broken into its parts.

    The bare reason is kept on ``message``; converting the exception to a
    string prefixes that reason with ``"URL parse error: "``.
    """

    _TEMPLATE = "URL parse error: {}"

    def __init__(self, message):
        # Store the reason on its own so callers can inspect it directly.
        self.message = message

    def __str__(self):
        return self._TEMPLATE.format(self.message)
def url_parse(*args):
    """Take a string URL and return a dictionary of its various parts.

    The URL is expected to look like
    ``scheme://[userinfo@]host[:port]/path[?query][#fragment]``.

    Args:
        *args: The URL string is the first positional argument. Any further
            arguments are ignored.

    Returns:
        A dict with the keys ``scheme``, ``host``, ``path``, ``port``,
        ``fragment``, ``query`` and ``userinfo``. Components absent from the
        URL are ``None``. ``scheme`` is lower-cased, ``path`` always starts
        with ``"/"`` and ``port`` is an ``int`` (defaulting to 80 for http
        and 443 for https when the URL does not name one). When called with
        no arguments, every value is ``None``.

    Raises:
        URLParseError: If the URL has no ``"://"``, has no ``"/"`` after the
            scheme separator, or has a port that is not made of digits.
    """
    parts = {
        "scheme": None,
        "host": None,
        "path": None,
        "port": None,
        "fragment": None,
        "query": None,
        "userinfo": None,
    }
    if not args:
        return parts
    url = args[0]

    # Split on the FIRST "://" only: a later one (e.g. inside a query value
    # such as "?next=http://...") belongs to the rest of the URL.
    scheme, scheme_sep, rest = url.partition("://")
    if not scheme_sep:
        raise URLParseError("Missing scheme")
    scheme = scheme.lower()
    parts["scheme"] = scheme
    parts["port"] = {"http": 80, "https": 443}.get(scheme)

    # A "/" is required after the scheme separator to delimit the authority.
    if "/" not in rest:
        raise URLParseError("Missing authority")
    authority, _, tail = rest.partition("/")
    path_query_fragment = "/" + tail

    # Everything before the last "@" is userinfo; partitioning (rather than
    # str.replace) guarantees only that prefix is removed from the host part.
    userinfo, at_sep, host_port = authority.rpartition("@")
    parts["userinfo"] = userinfo if at_sep else None

    host, colon, port = host_port.partition(":")
    parts["host"] = host
    if colon:
        if not port.isdigit():
            raise URLParseError("Invalid port: {}".format(port))
        parts["port"] = int(port)

    # Peel the fragment off first: a "?" after the "#" is part of the
    # fragment, not the start of a query. Partitioning on the first
    # separator also keeps any later "#" or "?" inside its component.
    before_fragment, hash_sep, fragment = path_query_fragment.partition("#")
    path, query_sep, query = before_fragment.partition("?")
    parts["path"] = path
    parts["query"] = query if query_sep else None
    parts["fragment"] = fragment if hash_sep else None
    return parts
def url_join(*args):
    """Take a dictionary of URL parts and return a valid URL.

    Args:
        *args: The parts dictionary is the first positional argument. It must
            contain ``scheme``, ``host`` and ``path``; ``userinfo``, ``port``,
            ``query`` and ``fragment`` are optional and are skipped when
            missing or falsy. Any further arguments are ignored.

    Returns:
        The assembled URL string, or ``""`` when no dictionary (or an empty
        one) is given. The port is written out unless it is the default for
        the scheme (80 for http, 443 for https).

    Raises:
        KeyError: If ``scheme``, ``host`` or ``path`` is missing from a
            non-empty dictionary.
    """
    in_dict = args[0] if args else None
    if not in_dict:
        return ""

    default_ports = {"http": 80, "https": 443}
    scheme = in_dict["scheme"]

    _userinfo = in_dict.get("userinfo")
    userinfo = "{}@".format(_userinfo) if _userinfo else ""
    _query = in_dict.get("query")
    query = "?{}".format(_query) if _query else ""
    _fragment = in_dict.get("fragment")
    fragment = "#{}".format(_fragment) if _fragment else ""

    # Emit the port for every scheme; only the scheme's own default is
    # omitted. Schemes without a known default (e.g. ftp) always keep an
    # explicit port so that parse -> join round-trips.
    _port = in_dict.get("port")
    port = ""
    if _port and _port != default_ports.get(scheme):
        port = ":{}".format(_port)

    return "{scheme}://{userinfo}{host}{port}{path}{query}{fragment}".format(
        scheme=scheme,
        userinfo=userinfo,
        host=in_dict["host"],
        port=port,
        # A path of None must not be rendered as the text "None".
        path=in_dict["path"] or "",
        query=query,
        fragment=fragment,
    )
def test_basic_url():
    """Check that a plain http URL parses into scheme, host, path and port."""
    parsed = url_parse("http://www.linkedin.com/in/andrewmontalenti")
    expected = (
        ("scheme", "http"),
        ("host", "www.linkedin.com"),
        ("path", "/in/andrewmontalenti"),
        ("port", 80),  # default port implied by the http scheme
    )
    for key, value in expected:
        assert parsed[key] == value
    # Components not present in the URL are reported as None.
    for key in ("fragment", "query"):
        assert parsed[key] is None
def test_advanced_url():
    """Check parsing of a URL carrying userinfo, port, query and fragment."""
    parsed = url_parse(
        "http://cogtree@www.linkedin.com:1234/profile/view"
        "?id=13836198&trk=ppro_viewmore#more-123"
    )
    expected = (
        ("fragment", "more-123"),
        ("query", "id=13836198&trk=ppro_viewmore"),
        ("userinfo", "cogtree"),
        ("port", 1234),
    )
    for key, value in expected:
        assert parsed[key] == value
def test_joining_url():
    """Check url_join on hand-built parts and on parse -> join round trips."""
    url_parts = {
        "scheme": "http",
        "host": "www.linkedin.com",
        "path": "/profile/view",
        "fragment": "more-123",
        "query": "id=13836198&trk=ppro_viewmore",
        "port": 80,
    }
    # The default http port is left out of the joined URL.
    url = "http://www.linkedin.com/profile/view?id=13836198&trk=ppro_viewmore#more-123"
    assert url_join(url_parts) == url

    # A non-default port is written out.
    url_parts["port"] = 8080
    url = "http://www.linkedin.com:8080/profile/view?id=13836198&trk=ppro_viewmore#more-123"
    assert url_join(url_parts) == url

    # The default https port is left out as well.
    url_parts["scheme"] = "https"
    url_parts["port"] = 443
    url = "https://www.linkedin.com/profile/view?id=13836198&trk=ppro_viewmore#more-123"
    assert url_join(url_parts) == url

    # Parsing and re-joining must reproduce the original URL.
    round_trips = (
        "http://emmett@www.linkedin.com:1234/profile/view#more-123",
        "ftp://emmett:butler@linkedin.com/profile/view?haha=what#more-123",
    )
    for url in round_trips:
        assert url_join(url_parse(url)) == url

    # A URL without "://" has no scheme and cannot be parsed. url_parse
    # raises for it, so assert the error instead of attempting a round trip
    # (which previously aborted the whole test run).
    try:
        url_parse("ftp//linkedin.com/profile/view?haha=what#more-123")
    except URLParseError:
        pass
    else:
        raise AssertionError("expected URLParseError for a URL without '://'")
def main():
    """Run every test in this module, in order; the first failure raises."""
    for check in (test_basic_url, test_advanced_url, test_joining_url):
        check()


# Run the tests only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment