Skip to content

Instantly share code, notes, and snippets.

@FelisDiligens
Created July 11, 2019 15:15
Show Gist options
  • Save FelisDiligens/17acbb2d2a193412a7a768dc08e35cf5 to your computer and use it in GitHub Desktop.
Save FelisDiligens/17acbb2d2a193412a7a768dc08e35cf5 to your computer and use it in GitHub Desktop.
URL class for Python 3 - Parse and manipulate every piece of the url individually.
import re
import urllib.parse
"""
TODO:
* Documentation
* Adding comments
* Write own concatenation method, instead of relying on urllib
"""
class URL:
    """
    Holds the parts of a parsed URL for analyzing, manipulating and resolving.

    Instantiate it by using the class-level function URL.from_url(u).

    Supported operators:
        + or /  - Join a (relative) URL onto this one; returns a new URL object.
        ==      - Converts both operands to str and compares them.

    Methods:
        parse(u)          - Parses u and applies the result to this object.
        resolve()         - Resolves the URL to str.
        resolve_to_path() - Resolves to a cache path.
                            ("https://www.google.com" -> "./cache/com/google/www/")
        concat(s)         - Joins s onto this object in place. Use the
                            + operator if you prefer not to change the object.
        get_parent()      - Copy of this URL without its last path segment.
        get_root()        - Copy of this URL without any path.
        get_filename()    - Last path segment, or "index.html" if there is none.
        copy()            - Returns a copy of the object.
    """

    def __init__(self):
        # scheme:[//authority]path[?query][#fragment]
        # authority = [userinfo@]host[:port]
        self.original_url = ""
        self.scheme = ""
        # self.userinfo = ""  # Not implemented.
        self.host = ""
        self.port = ""
        self.path = []        # Path segments, without separators.
        self.arguments = {}   # Query arguments as a key -> value mapping.
        self.fragment = ""
        # Important for some web servers and concatenation.
        self.endswith_separator = False

    @classmethod
    def from_url(cls, u):
        """
        Class-level constructor. Returns a URL object parsed from u.

        Raises ValueError if the URL is invalid.
        """
        return _parse_url(cls(), u)

    def parse(self, u):
        """
        Parse u into this object; changes are applied in place.

        Raises ValueError if the URL is invalid.
        """
        _parse_url(self, u)

    def __str__(self):
        return self.resolve()

    def resolve(self):
        """Return the URL as a string."""
        return _resolve_to_url(self)

    def resolve_to_path(self):
        """
        Resolve the URL to a path for caching.

        For instance
            "http://www.example.com/search?q=foo+bar"
        will be resolved to
            "./cache/com/example/www/search/query171143778"
        """
        return _resolve_to_path(self)

    def _concat(self, s):
        """
        In development; not used by the public API.

        Currently only absolute URLs are handled (they are parsed into a new
        URL object). Every other input yields None.
        """
        # If the url is absolute, parse it instead.
        if _is_valid_url(s):
            return URL.from_url(s)
        # TODO: handle paths that start with "/" and relative paths.
        return None

    def concat(self, s):
        """
        Concatenate a path to the URL.

        The URL object itself is changed and returned. If you want to create
        a new object instead, use the '+' operator.

        Raises ValueError if the joined result is not a valid URL.
        """
        if _is_valid_url(s):
            # An absolute URL replaces the current one entirely.
            self.parse(s)
        else:
            self.parse(urllib.parse.urljoin(self.resolve(), s))
        # Return self on both paths; parse() itself returns None.
        return self

    def __add__(self, other):
        """Return a new URL made by joining other (str or URL) onto this one."""
        if isinstance(other, URL):
            other = other.resolve()
        return URL.from_url(urllib.parse.urljoin(self.resolve(), other))

    __truediv__ = __add__

    def __eq__(self, other):
        """Compare the string forms, so a URL can equal a plain str."""
        return str(self) == str(other)

    # Defining __eq__ already makes instances unhashable; spelled out here
    # because URL objects are mutable and must not be used as dict/set keys.
    __hash__ = None

    def get_parent(self):
        """
        Get the parent of the URL as a new object.

        e.g. URL.from_url("http://www.example.com/sub/page.php").get_parent().resolve()
             # => "http://www.example.com/sub/"

        A URL without path segments is returned as an unchanged copy.
        """
        parent = self.copy()
        if parent.path:
            parent.path.pop()
            parent.endswith_separator = True
        return parent

    def get_root(self):
        """
        Get the root of the URL as a new object.

        e.g. URL.from_url("http://www.example.com/sub/page.php").get_root().resolve()
             # => "http://www.example.com"

        A URL without path segments is returned as an unchanged copy.
        """
        root = self.copy()
        if root.path:
            root.path = []
            root.endswith_separator = False
        return root

    def get_filename(self):
        """Return the last path segment, or "index.html" if the path is empty."""
        if not self.path:
            return "index.html"
        return self.path[-1]

    def copy(self):
        """Return a copy whose path list and arguments dict are independent."""
        u = URL()
        u.original_url = self.original_url
        u.scheme = self.scheme
        # userinfo is not set by __init__ (not implemented); copy it only
        # when a caller attached it, instead of raising AttributeError.
        if hasattr(self, "userinfo"):
            u.userinfo = self.userinfo
        u.host = self.host
        u.port = self.port
        u.path = self.path.copy()
        u.arguments = self.arguments.copy()
        u.fragment = self.fragment
        u.endswith_separator = self.endswith_separator
        return u
# https://mathiasbynens.be/demo/url-regex
#URL_REGEX = re.compile(r"^(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$")
URL_REGEX = re.compile(r"(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,☺]+[\w/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)])")
def _is_valid_url(s):
return bool(URL_REGEX.match(str(s)))
def _parse_url_arguments(s):
args = {}
for arg in s.strip().lstrip("?").split("&"):
key, value = arg.split("=")
args[key] = urllib.parse.unquote_plus(value)
return args
def _parse_url(url, s):
    """
    Parse the URL string s into the URL object url and return that object.

    Expected layout: scheme://host[:port][/path][?query][#fragment]

    Every component of url is reset before parsing, so re-using an existing
    object never keeps the port, arguments or fragment of a previous URL.
    The resolved string is additionally stored in url.resolved_url.

    Raises ValueError if s is rejected by _is_valid_url.
    """
    # Don't parse invalid URLs:
    if not _is_valid_url(s):
        raise ValueError("The given url \"%s\" is invalid." % (s,))

    url.original_url = str(s)

    # Start from a clean slate; the steps below only assign what they find.
    url.scheme = ""
    url.host = ""
    url.port = ""
    url.path = []
    url.arguments = {}
    url.fragment = ""

    # Surrounding whitespace must not leak into the scheme or the host.
    rest = url.original_url.strip()

    # Get the scheme (e.g. "http")
    scheme, found, remainder = rest.partition("://")
    if found:
        url.scheme = scheme
        rest = remainder

    # Get the fragment (e.g. "#anRandomFragment").
    # It has to be cut off before the query: the fragment is the last
    # component of a URL, so anything after "#" (even a "?") belongs to it.
    rest, found, fragment = rest.partition("#")
    if found:
        url.fragment = fragment

    # Get the arguments (e.g. "?q=foo+bar")
    rest, found, query = rest.partition("?")
    if found and query.strip():
        url.arguments = _parse_url_arguments(query)

    # Get the host (e.g. "www.example.com")
    url.endswith_separator = rest.rstrip().endswith("/")
    host, found, raw_path = rest.partition("/")
    url.host = host
    if found:
        # Get the path (e.g. "/subpage/index.html").
        # A bare "/" yields no segments at all rather than [""], which
        # would otherwise resolve to a double slash.
        trimmed = raw_path.strip("/ ")
        url.path = trimmed.split("/") if trimmed else []

    # Get the port (e.g. ":8080")
    host, found, port = url.host.partition(":")
    if found:
        url.host = host
        url.port = port

    url.resolved_url = _resolve_to_url(url)
    return url
def _resolve_to_url(url):
resolved_url = ""
# Add scheme, if available (e.g. "http://")
if url.scheme:
resolved_url += url.scheme + "://"
# Add host (e.g. "www.example.com")
resolved_url += url.host
# Add port (e.g. ":8080")
if url.port:
resolved_url += ":" + url.port
# Add path, if available (e.g. "/subpage/index.html")
if len(url.path) > 0:
resolved_url += "/" + "/".join(url.path)
# Ends with an separator?
if url.endswith_separator:
resolved_url += "/"
# Add fragment, if available (e.g. "#anRandomFragment")
if url.fragment:
resolved_url += "#" + url.fragment
# Add arguments, if available (e.g. "?q=foo+bar")
if url.arguments:
resolved_url += "?" + urllib.parse.urlencode(url.arguments)
return resolved_url
def _resolve_to_path(url):
    """
    Turn a URL object into a relative cache path.

    The host labels are reversed and become directories below "./cache/",
    followed by the path segments. The combined string is passed through
    _remove_unsafe_characters. If the URL has arguments, a "query<hash>"
    component derived from the url-encoded arguments is appended.

    Example:
        "http://www.example.com/search?q=foo+bar"
    becomes
        "./cache/com/example/www/search/query171143778"
    """
    host_dirs = "/".join(reversed(url.host.split(".")))
    path_dirs = "/".join(url.path)
    cache_path = _remove_unsafe_characters("./cache/" + host_dirs + "/" + path_dirs)

    # No arguments: the sanitized path is the final result.
    if not url.arguments:
        return cache_path

    # Arguments present: hash their encoded form and append it to the path.
    encoded_query = urllib.parse.urlencode(url.arguments)
    return cache_path + "/query%.0f" % (_java_string_hashcode(encoded_query))
# https://gist.github.com/hanleybrand/5224673
def _java_string_hashcode(s):
h = 0
for c in s:
h = (31 * h + ord(c)) & 0xFFFFFFFF
return ((h + 0x80000000) & 0xFFFFFFFF) - 0x80000000
def _remove_unsafe_characters(path):
result = []
for chunk in path.split("/"):
result.append(
re.sub(r'[^a-zA-Z0-9_\-\.]', "", chunk)
)
return "/".join(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment