Created
July 11, 2019 15:15
-
-
Save FelisDiligens/17acbb2d2a193412a7a768dc08e35cf5 to your computer and use it in GitHub Desktop.
URL class for Python 3 - Parse and manipulate every piece of the url individually.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import urllib.parse | |
""" | |
TODO: | |
* Documentation | |
* Adding comments | |
* Write own concatenation method, instead of relying on urllib | |
""" | |
class URL:
    """
    Holds the parts of a parsed URL for analyzing, manipulating and resolving.

    Instantiate it by using the static factory URL.from_url(u).

    Supported operators:
        + or /  - Returns a new URL object with the operand joined onto this one.
        ==      - Converts both operands to str and compares them.

    Methods:
        parse(u)          - Parses and applies changes directly.
        resolve()         - Resolves URL to str.
        resolve_to_path() - Resolves to path. ("https://www.google.com" -> "./cache/com/google/www/")
        concat(s)         - Concats to url object directly. Use + operator, if you prefer not to change the object itself.
        get_parent()      - Returns a copy without the last path segment.
        get_root()        - Returns a copy without any path.
        get_filename()    - Returns the last path segment, or "index.html" if there is no path.
        copy()            - Returns a copy of the object.
    """

    def __init__(self):
        # scheme:[//authority]path[?query][#fragment]
        # authority = [userinfo@]host[:port]
        self.original_url = ""
        self.scheme = ""
        # Always defined (empty by default) so that copy() can read it.
        self.userinfo = ""
        self.host = ""
        self.port = ""
        self.path = []
        self.arguments = {}
        self.fragment = ""
        self.endswith_separator = False  # Important for some web servers and concatenation.

    @staticmethod
    def from_url(u):
        """
        Factory function. Returns a new URL object parsed from u.
        Will raise a ValueError, if the URL is invalid.
        """
        return _parse_url(URL(), u)

    def parse(self, u):
        """
        Object-level method. Changes will be applied directly to this object.
        Will raise a ValueError, if the URL is invalid.
        """
        _parse_url(self, u)

    def __str__(self):
        return self.resolve()

    def resolve(self):
        """Resolve the stored parts back into a URL string."""
        return _resolve_to_url(self)

    def resolve_to_path(self):
        """
        Resolves the url to a path for caching.
        For instance
        "http://www.example.com/search?q=foo+bar"
        will be resolved to
        "./cache/com/example/www/search/query171143778"
        """
        return _resolve_to_path(self)

    def _concat(self, s):
        """
        In development: non-mutating counterpart of concat().
        Returns a new URL object and leaves this one untouched.
        """
        # If the url is absolute, parse it instead of joining.
        if _is_valid_url(s):
            return URL.from_url(s)
        # Relative references (including "/starts/with/slash") are joined
        # through urllib until a hand-written join exists.
        return self + s

    def concat(self, s):
        """
        Concatenate a path to the url.
        The URL object itself will be changed and is returned, so that calls
        can be chained.
        If you want to create a new object instead, then use the '+'-operator.
        """
        if _is_valid_url(s):
            # An absolute url replaces the current one entirely.
            self.parse(s)
        else:
            self.parse(urllib.parse.urljoin(self.resolve(), str(s)))
        # Both branches return self; the absolute branch used to return None.
        return self

    def __add__(self, other):
        # str() lets another URL object be used as the right-hand operand.
        return URL.from_url(urllib.parse.urljoin(self.resolve(), str(other)))

    __truediv__ = __add__

    # NOTE: defining __eq__ without __hash__ leaves instances unhashable,
    # which fits a mutable object whose string form can change.
    def __eq__(self, other):
        return str(self) == str(other)

    def get_parent(self):
        """
        Get the parent of the url.
        e.g. URL.from_url("http://www.example.com/sub/page.php").get_parent().resolve() # => "http://www.example.com/sub/"
        """
        parent = self.copy()
        if parent.path:
            parent.path.pop()
            parent.endswith_separator = True
        return parent

    def get_root(self):
        """
        Get the root of the url.
        e.g. URL.from_url("http://www.example.com/sub/page.php").get_root().resolve() # => "http://www.example.com"
        """
        root = self.copy()
        if root.path:
            root.path = []
            root.endswith_separator = False
        return root

    def get_filename(self):
        """Return the last path segment, or "index.html" when the path is empty."""
        if not self.path:
            return "index.html"
        return self.path[-1]

    def copy(self):
        """Return a new URL object with the same parts; path and arguments are shallow-copied."""
        u = URL()
        u.original_url = self.original_url
        u.scheme = self.scheme
        u.userinfo = self.userinfo
        u.host = self.host
        u.port = self.port
        u.path = self.path.copy()
        u.arguments = self.arguments.copy()
        u.fragment = self.fragment
        u.endswith_separator = self.endswith_separator
        return u
# https://mathiasbynens.be/demo/url-regex
#URL_REGEX = re.compile(r"^(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$")
# Group 1 is an optional delimiter in front of the URL, group 2 is the
# http(s) URL itself; the trailing lookahead allows text after the URL.
URL_REGEX = re.compile(r"(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,☺]+[\w/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)])")


def _is_valid_url(s):
    """
    Return True if str(s) starts with an absolute http(s) URL.

    The pattern may consume one leading delimiter character (".", "(", "<",
    whitespace, ...) before the URL. Such input is rejected here, because the
    parser would otherwise treat the delimiter as part of the scheme.
    """
    match = URL_REGEX.match(str(s))
    # Only accept a match whose URL group begins at the first character.
    return match is not None and match.start(2) == 0
def _parse_url_arguments(s):
    """
    Parse a query string (with or without the leading "?") into a dict.

    Keys and values are decoded with urllib.parse.unquote_plus. Empty
    segments (as in "", "?" or "a=1&&b=2") are skipped, a key without "="
    maps to an empty string, and only the first "=" separates key from
    value. If a key occurs more than once, the last occurrence wins.
    """
    args = {}
    for pair in s.strip().lstrip("?").split("&"):
        if not pair:
            continue
        # partition never raises, unlike unpacking the result of split("=").
        key, _, value = pair.partition("=")
        args[urllib.parse.unquote_plus(key)] = urllib.parse.unquote_plus(value)
    return args
def _parse_url(url, s):
    """
    Parse the URL string s into the fields of the given url object.

    Layout assumed: scheme://[userinfo@]host[:port][/path][?query][#fragment]

    Every parsed field is reset first, so re-parsing into an existing object
    does not keep a stale port, query or fragment from the previous URL.

    Returns the same url object. Raises ValueError if _is_valid_url rejects
    the (whitespace-stripped) input.
    """
    text = str(s).strip()
    # Don't parse invalid URLs:
    if not _is_valid_url(text):
        raise ValueError("The given url \"%s\" is invalid." % (s,))

    url.original_url = str(s)
    url.scheme = ""
    url.userinfo = ""
    url.host = ""
    url.port = ""
    url.path = []
    url.arguments = {}
    url.fragment = ""

    rest = text

    # Get the scheme (e.g. "http")
    scheme, separator, remainder = rest.partition("://")
    if separator:
        url.scheme = scheme
        rest = remainder

    # Get the fragment (e.g. "#anRandomFragment"). It is the last component
    # of a URL, so it must be split off before the query; otherwise it ends
    # up inside the last query value.
    rest, _, url.fragment = rest.partition("#")

    # Get the arguments (e.g. "?q=foo+bar")
    rest, _, query = rest.partition("?")
    if query:
        url.arguments = _parse_url_arguments(query)

    # Split authority from path (e.g. "www.example.com" / "subpage/index.html")
    url.endswith_separator = rest.rstrip().endswith("/")
    authority, slash, raw_path = rest.partition("/")
    if slash:
        trimmed = raw_path.strip("/ ")
        # "http://host/" has no path segments; splitting "" would give [""].
        url.path = trimmed.split("/") if trimmed else []

    # Get the userinfo (e.g. "user:password@"); empty when there is no "@".
    url.userinfo, _, hostport = authority.rpartition("@")

    # Get the port (e.g. ":8080"). Searching after any "]" keeps the colons
    # of a bracketed IPv6 literal inside the host.
    colon = hostport.find(":", hostport.find("]") + 1)
    if colon >= 0:
        url.host = hostport[:colon]
        url.port = hostport[colon + 1:]
    else:
        url.host = hostport

    url.resolved_url = _resolve_to_url(url)
    return url


def _resolve_to_url(url):
    """
    Build the URL string from the fields of the given url object.

    Components are emitted in the order
    scheme://[userinfo@]host[:port][/path][/][?query][#fragment].
    """
    parts = []

    # Add scheme, if available (e.g. "http://")
    if url.scheme:
        parts.append(url.scheme + "://")

    # Add userinfo, if available; objects may not carry the attribute at all.
    userinfo = getattr(url, "userinfo", "")
    if userinfo:
        parts.append(userinfo + "@")

    # Add host (e.g. "www.example.com")
    parts.append(url.host)

    # Add port (e.g. ":8080")
    if url.port:
        parts.append(":" + str(url.port))

    # Add path, if available (e.g. "/subpage/index.html"). A path made only
    # of an empty segment adds nothing, which avoids a doubled "//".
    joined_path = "/".join(url.path)
    if joined_path:
        parts.append("/" + joined_path)

    # Ends with a separator?
    if url.endswith_separator:
        parts.append("/")

    # Add arguments, if available (e.g. "?q=foo+bar"); the query goes
    # before the fragment.
    if url.arguments:
        parts.append("?" + urllib.parse.urlencode(url.arguments))

    # Add fragment, if available (e.g. "#anRandomFragment")
    if url.fragment:
        parts.append("#" + url.fragment)

    return "".join(parts)
def _resolve_to_path(url):
    """
    Map a url object onto a relative cache path.

    The host labels are reversed and used as directories below "./cache/",
    followed by the path segments of the url. When the url has arguments,
    a "query<hash>" component derived from the url-encoded arguments is
    appended.

    Example:
    "http://www.example.com/search?q=foo+bar"
    becomes
    "./cache/com/example/www/search/query171143778"
    """
    host_dirs = "/".join(reversed(url.host.split(".")))
    url_dirs = "/".join(url.path)
    cache_path = _remove_unsafe_characters("./cache/" + host_dirs + "/" + url_dirs)

    # No arguments: the sanitized directory path is the final result.
    if not url.arguments:
        return cache_path

    # Arguments present: append a hash of the encoded query string.
    encoded_query = urllib.parse.urlencode(url.arguments)
    return cache_path + "/query%.0f" % (_java_string_hashcode(encoded_query))
# https://gist.github.com/hanleybrand/5224673
def _java_string_hashcode(s):
    """
    Return a signed 32-bit polynomial hash (multiplier 31) of the string s.

    Each character contributes its ord() value; the running value is kept
    within 32 bits and finally reinterpreted as a signed integer.
    """
    acc = 0
    for code_point in map(ord, s):
        acc = (acc * 31 + code_point) & 0xFFFFFFFF
    # Values with the top bit set represent negative numbers.
    if acc >= 0x80000000:
        return acc - 0x100000000
    return acc
def _remove_unsafe_characters(path):
    """
    Sanitize every "/"-separated chunk of path.

    Only ASCII letters, digits, "_", "-" and "." are kept in each chunk.
    A chunk that is exactly ".." after filtering is emptied as well, so that
    a path built from a url cannot climb out of its base directory. A single
    "." chunk (as in a leading "./") is left untouched.
    """
    cleaned_chunks = []
    for chunk in path.split("/"):
        cleaned = re.sub(r'[^a-zA-Z0-9_\-\.]', "", chunk)
        # Checked after filtering: "..%" or "..;" also collapse to "..".
        if cleaned == "..":
            cleaned = ""
        cleaned_chunks.append(cleaned)
    return "/".join(cleaned_chunks)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment