FelisDiligens/url.py

## url.py
import re
import urllib.parse

"""
    TODO:
    * Documentation
    * Adding comments
    * Write own concatenation method, instead of relying on urllib
"""

class URL:
    """
    Holds the parts of a parsed URL for analyzing, manipulating and resolving.

    Instantiate it by using the class-level function URL.from_url(u).

    Supported operators:
        + or /    -    Join a relative or absolute URL onto this one (returns a new URL object).
        ==        -    Converts both operands to str and compares them.

    Methods:
        parse(u)            - Parses u and applies the result to this object.
        resolve()           - Resolves the URL to str.
        resolve_to_path()   - Resolves to a cache path. ("https://www.google.com" -> "./cache/com/google/www/")
        concat(s)           - Joins s onto this object in place. Use the + operator, if you prefer not to change the object itself.
        get_parent()        - Returns a copy without the last path segment.
        get_root()          - Returns a copy without any path.
        get_filename()      - Returns the last path segment ("index.html" if there is no path).
        copy()              - Returns a copy of the object.
    """

    def __init__(self):
        # scheme:[//authority]path[?query][#fragment]
        # authority = [userinfo@]host[:port]
        self.original_url = ""
        self.scheme = ""
        # Parsing of userinfo is not implemented; the attribute still has to
        # exist because copy() reads it.
        self.userinfo = ""
        self.host = ""
        self.port = ""
        self.path = []
        self.arguments = {}
        self.fragment = ""
        self.endswith_separator = False # Important for some web servers and concatenation.

    @classmethod
    def from_url(cls, u):
        """
        Alternate constructor: parse u and return a new URL object.

        Will raise a ValueError, if the URL is invalid.
        """
        return _parse_url(cls(), u)

    def parse(self, u):
        """
        Parse u into this object. Changes are applied directly; returns nothing.

        Will raise a ValueError, if the URL is invalid.
        """
        _parse_url(self, u)

    def __str__(self):
        return self.resolve()

    def resolve(self):
        """Return the URL as a string, rebuilt from the stored parts."""
        return _resolve_to_url(self)

    def resolve_to_path(self):
        """
        Resolves the url to a path for caching.

        For instance
        "http://www.example.com/search?q=foo+bar"
        will be resolved to
        "./cache/com/example/www/search/query171143778"
        """
        return _resolve_to_path(self)

    def _concat(self, s):
        """In development..."""
        # If the url is absolute...
        if _is_valid_url(s):
            return URL.from_url(s) # parse it instead.

        # /starts/with/slash
        if s.lstrip().startswith("/"):
            pass

    def concat(self, s):
        """
        Concatenate a path to the url.

        The URL object itself will be changed and is returned, so calls can
        be chained. If you want to create a new object instead, then use the
        '+'-operator.

        Will raise a ValueError, if the resulting URL is invalid.
        """
        if _is_valid_url(s):
            # s is an absolute url: it replaces the current one entirely.
            self.parse(s)
        else:
            self.parse(urllib.parse.urljoin(self.resolve(), s))
        # Always hand back the object itself (parse() returns nothing).
        return self

    def __add__(self, other):
        """Return a new URL with other joined onto this one; self is untouched."""
        if isinstance(other, URL):
            other = other.resolve()
        return URL.from_url(urllib.parse.urljoin(self.resolve(), other))

    __truediv__ = __add__

    def __eq__(self, other):
        # Compares string forms, so an URL equals the plain string it resolves to.
        return str(self) == str(other)

    def get_parent(self):
        """
        Get the parent of the url (a new object; self is left unchanged).

        e.g. URL.from_url("http://www.example.com/sub/page.php").get_parent().resolve() # => "http://www.example.com/sub/"
        """
        parent = self.copy()
        if parent.path:
            parent.path.pop()
            parent.endswith_separator = True
        return parent

    def get_root(self):
        """
        Get the root of the url (a new object; self is left unchanged).

        e.g. URL.from_url("http://www.example.com/sub/page.php").get_root().resolve() # => "http://www.example.com"
        """
        root = self.copy()
        if root.path:
            root.path = []
            root.endswith_separator = False
        return root

    def get_filename(self):
        """Return the last path segment, or "index.html" if the path is empty."""
        if not self.path:
            return "index.html"
        return self.path[-1]

    def copy(self):
        """Return a new URL object with the same parts; path and arguments are copied, not shared."""
        u = URL()
        u.original_url = self.original_url
        u.scheme = self.scheme
        u.userinfo = self.userinfo
        u.host = self.host
        u.port = self.port
        u.path = self.path.copy()
        u.arguments = self.arguments.copy()
        u.fragment = self.fragment
        u.endswith_separator = self.endswith_separator
        return u


# https://mathiasbynens.be/demo/url-regex
#URL_REGEX = re.compile(r"^(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$")
URL_REGEX = re.compile(
    # Start of the text, or a single leading delimiter character.
    r"(^|[\s.:;?\-\]<\(])"
    # The http/https url itself; it has to end on a word character, "/" or "#".
    r"(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,☺]+[\w/#](\(\))?)"
    # Must be followed by the end of the text or a delimiter character.
    r"(?=$|[\s',\|\(\).:;?\-\[\]>\)])"
)

def _is_valid_url(s):
    """
    Tell whether str(s) begins with an absolute http/https url.

    The check uses URL_REGEX.match(), so it is anchored at the start of the
    text only; whatever follows the matched url is not inspected.
    """
    text = str(s)
    return URL_REGEX.match(text) is not None

def _parse_url_arguments(s):
    """
    Parse a query string such as "?q=foo+bar&page=2" into a dict.

    Surrounding whitespace and leading "?" characters are ignored. Each
    "&"-separated pair is split at its first "=", so values may contain "="
    themselves. A pair without "=" gets an empty string as value, empty pairs
    are skipped, and a repeated key keeps its last value. Keys and values are
    both percent-/plus-decoded.

    Args:
        s: The query string, with or without the leading "?".

    Returns:
        dict mapping decoded keys to decoded values.
    """
    args = {}
    query = s.strip().lstrip("?")
    for pair in query.split("&"):
        if not pair:
            # Produced by an empty query or by "a=1&&b=2".
            continue
        # partition() never fails, unlike unpacking the result of split("=").
        key, _, value = pair.partition("=")
        args[urllib.parse.unquote_plus(key)] = urllib.parse.unquote_plus(value)
    return args

def _parse_url(url, s):
    """
    Parse the url string s and store its parts on the object url.

    Expected layout: scheme://host[:port][/path][?query][#fragment]

    Every part is overwritten, so parsing into an object that already holds
    an url leaves nothing behind from the previous one.

    Args:
        url: The object to fill (normally an URL instance).
        s: The url; it is converted with str() first.

    Returns:
        The same url object.

    Raises:
        ValueError: If _is_valid_url() rejects s.
    """
    # Don't parse invalid URLs:
    if not _is_valid_url(s):
        raise ValueError("The given url \"%s\" is invalid." % (s,))

    url.original_url = str(s)
    temp = url.original_url.strip()

    # Forget whatever a previous parse stored in the optional parts.
    url.scheme = ""
    url.port = ""
    url.arguments = {}
    url.fragment = ""

    # Get the scheme (e.g. "http")
    i = temp.find("://")
    if i >= 0:
        url.scheme = temp[:i]
        temp = temp[i+3:]

    # Get the fragment (e.g. "#anRandomFragment"). It is the last part of an
    # url, so it has to be cut off before looking for the query.
    temp, _, url.fragment = temp.partition("#")

    # Get the arguments (e.g. "?q=foo+bar")
    temp, _, query = temp.partition("?")
    if query:
        url.arguments = _parse_url_arguments(query)

    # Get the host (e.g. "www.example.com")
    url.endswith_separator = temp.rstrip().endswith("/")
    url.host, _, raw_path = temp.partition("/")

    # Get the path (e.g. "/subpage/index.html"). A bare "/" means "no path";
    # splitting the empty string would give [""] and a doubled slash later.
    raw_path = raw_path.strip("/ ")
    url.path = raw_path.split("/") if raw_path else []

    # Get the port (e.g. ":8080")
    i = url.host.find(":")
    if i >= 0:
        url.port = url.host[i+1:]
        url.host = url.host[:i]

    url.resolved_url = _resolve_to_url(url)
    return url

def _resolve_to_url(url):
    """
    Build the url string from the parts stored on url.

    The parts are emitted in the order
    scheme://host[:port][/path][/][?query][#fragment]; empty parts are left
    out. The query is produced with urllib.parse.urlencode().

    Args:
        url: Object providing scheme, host, port, path, endswith_separator,
            arguments and fragment.

    Returns:
        The resolved url as str.
    """
    pieces = []

    # Add scheme, if available (e.g. "http://")
    if url.scheme:
        pieces.append(url.scheme + "://")

    # Add host (e.g. "www.example.com")
    pieces.append(url.host)

    # Add port (e.g. ":8080")
    if url.port:
        pieces.append(":" + url.port)

    # Add path, if available (e.g. "/subpage/index.html")
    if url.path:
        pieces.append("/" + "/".join(url.path))

    # Ends with a separator?
    if url.endswith_separator:
        pieces.append("/")

    # Add arguments, if available (e.g. "?q=foo+bar"). They go before the
    # fragment; anything after "#" would be read as part of the fragment.
    if url.arguments:
        pieces.append("?" + urllib.parse.urlencode(url.arguments))

    # Add fragment, if available (e.g. "#anRandomFragment")
    if url.fragment:
        pieces.append("#" + url.fragment)

    return "".join(pieces)

def _resolve_to_path(url):
    """
    Turn an url into a relative file system path below "./cache/".

    The host labels are written in reverse order, followed by the url path;
    the combined string is passed through _remove_unsafe_characters(). If the
    url has arguments, a "query<hash>" segment is appended, where the hash is
    _java_string_hashcode() of the url-encoded arguments.

    For instance
    "http://www.example.com/search?q=foo+bar"
    will be resolved to
    "./cache/com/example/www/search/query171143778"
    """
    host_dirs = "/".join(url.host.split(".")[::-1])
    sub_path = "/".join(url.path)
    cache_path = _remove_unsafe_characters("./cache/" + host_dirs + "/" + sub_path)

    # No arguments? Then the path is already complete.
    if not url.arguments:
        return cache_path

    # Otherwise tell different queries apart by a hash of their encoded form.
    encoded_query = urllib.parse.urlencode(url.arguments)
    return cache_path + "/query%.0f" % (_java_string_hashcode(encoded_query))

# https://gist.github.com/hanleybrand/5224673
def _java_string_hashcode(s):
    """
    Compute a 31-based polynomial hash of s, wrapped to a signed 32-bit int.

    Starting from 0, every character updates the hash as
    hash = 31 * hash + ord(character), truncated to 32 bits.

    Returns:
        An int in the range -2**31 .. 2**31 - 1.
    """
    acc = 0
    for char in s:
        acc = (acc * 31 + ord(char)) & 0xFFFFFFFF
    # Values with the top bit set stand for negative numbers.
    if acc >= 0x80000000:
        return acc - 0x100000000
    return acc

def _remove_unsafe_characters(path):
    """
    Sanitize a "/"-separated path so it can be used as a cache file path.

    In every segment only ASCII letters, digits, "_", "-" and "." are kept.
    A segment that afterwards consists of two or more dots (e.g. "..") is
    emptied, because it would otherwise let the path climb out of its base
    directory. A single "." segment is kept.

    Args:
        path: The path, using "/" as separator.

    Returns:
        The sanitized path; the number of "/" separators is unchanged.
    """
    cleaned = []
    for chunk in path.split("/"):
        chunk = re.sub(r'[^a-zA-Z0-9_\-\.]', "", chunk)
        # Checked after the substitution: ".%." only becomes ".." once the
        # "%" has been removed.
        if len(chunk) > 1 and not chunk.strip("."):
            chunk = ""
        cleaned.append(chunk)
    return "/".join(cleaned)
	import re
	import urllib.parse

	"""
	TODO:
	* Documentation
	* Adding comments
	* Write own concatenation method, instead of relying on urllib
	"""

	class URL:
		"""
		Holds the parts of a parsed URL for analyzing, manipulating and resolving.

		Instantiate it by using the class-level function URL.from_url(u).

		Supported operators:
			+ or /    -    Join a relative or absolute URL onto this one (returns a new URL object).
			==        -    Converts both operands to str and compares them.

		Methods:
			parse(u)            - Parses u and applies the result to this object.
			resolve()           - Resolves the URL to str.
			resolve_to_path()   - Resolves to a cache path. ("https://www.google.com" -> "./cache/com/google/www/")
			concat(s)           - Joins s onto this object in place. Use the + operator, if you prefer not to change the object itself.
			get_parent()        - Returns a copy without the last path segment.
			get_root()          - Returns a copy without any path.
			get_filename()      - Returns the last path segment ("index.html" if there is no path).
			copy()              - Returns a copy of the object.
		"""

		def __init__(self):
			# scheme:[//authority]path[?query][#fragment]
			# authority = [userinfo@]host[:port]
			self.original_url = ""
			self.scheme = ""
			# Parsing of userinfo is not implemented; the attribute still has to
			# exist because copy() reads it.
			self.userinfo = ""
			self.host = ""
			self.port = ""
			self.path = []
			self.arguments = {}
			self.fragment = ""
			self.endswith_separator = False # Important for some web servers and concatenation.

		@classmethod
		def from_url(cls, u):
			"""
			Alternate constructor: parse u and return a new URL object.

			Will raise a ValueError, if the URL is invalid.
			"""
			return _parse_url(cls(), u)

		def parse(self, u):
			"""
			Parse u into this object. Changes are applied directly; returns nothing.

			Will raise a ValueError, if the URL is invalid.
			"""
			_parse_url(self, u)

		def __str__(self):
			return self.resolve()

		def resolve(self):
			"""Return the URL as a string, rebuilt from the stored parts."""
			return _resolve_to_url(self)

		def resolve_to_path(self):
			"""
			Resolves the url to a path for caching.

			For instance
			"http://www.example.com/search?q=foo+bar"
			will be resolved to
			"./cache/com/example/www/search/query171143778"
			"""
			return _resolve_to_path(self)

		def _concat(self, s):
			"""In development..."""
			# If the url is absolute...
			if _is_valid_url(s):
				return URL.from_url(s) # parse it instead.

			# /starts/with/slash
			if s.lstrip().startswith("/"):
				pass

		def concat(self, s):
			"""
			Concatenate a path to the url.

			The URL object itself will be changed and is returned, so calls can
			be chained. If you want to create a new object instead, then use the
			'+'-operator.

			Will raise a ValueError, if the resulting URL is invalid.
			"""
			if _is_valid_url(s):
				# s is an absolute url: it replaces the current one entirely.
				self.parse(s)
			else:
				self.parse(urllib.parse.urljoin(self.resolve(), s))
			# Always hand back the object itself (parse() returns nothing).
			return self

		def __add__(self, other):
			"""Return a new URL with other joined onto this one; self is untouched."""
			if isinstance(other, URL):
				other = other.resolve()
			return URL.from_url(urllib.parse.urljoin(self.resolve(), other))

		__truediv__ = __add__

		def __eq__(self, other):
			# Compares string forms, so an URL equals the plain string it resolves to.
			return str(self) == str(other)

		def get_parent(self):
			"""
			Get the parent of the url (a new object; self is left unchanged).

			e.g. URL.from_url("http://www.example.com/sub/page.php").get_parent().resolve() # => "http://www.example.com/sub/"
			"""
			parent = self.copy()
			if parent.path:
				parent.path.pop()
				parent.endswith_separator = True
			return parent

		def get_root(self):
			"""
			Get the root of the url (a new object; self is left unchanged).

			e.g. URL.from_url("http://www.example.com/sub/page.php").get_root().resolve() # => "http://www.example.com"
			"""
			root = self.copy()
			if root.path:
				root.path = []
				root.endswith_separator = False
			return root

		def get_filename(self):
			"""Return the last path segment, or "index.html" if the path is empty."""
			if not self.path:
				return "index.html"
			return self.path[-1]

		def copy(self):
			"""Return a new URL object with the same parts; path and arguments are copied, not shared."""
			u = URL()
			u.original_url = self.original_url
			u.scheme = self.scheme
			u.userinfo = self.userinfo
			u.host = self.host
			u.port = self.port
			u.path = self.path.copy()
			u.arguments = self.arguments.copy()
			u.fragment = self.fragment
			u.endswith_separator = self.endswith_separator
			return u




	# https://mathiasbynens.be/demo/url-regex
	#URL_REGEX = re.compile(r"^(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$")
	# NOTE: the "|" alternations must stay unescaped; written as "\|" they only
	# match a literal pipe character and no ordinary url is accepted any more.
	URL_REGEX = re.compile(r"(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,☺]+[\w/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)])")

	def _is_valid_url(s):
		"""
		Tell whether str(s) begins with an absolute http/https url.

		The check uses URL_REGEX.match(), so it is anchored at the start of the
		text only; whatever follows the matched url is not inspected.
		"""
		return bool(URL_REGEX.match(str(s)))

	def _parse_url_arguments(s):
		"""
		Parse a query string such as "?q=foo+bar&page=2" into a dict.

		Surrounding whitespace and leading "?" characters are ignored. Each
		"&"-separated pair is split at its first "=", so values may contain "="
		themselves. A pair without "=" gets an empty string as value, empty pairs
		are skipped, and a repeated key keeps its last value. Keys and values are
		both percent-/plus-decoded.

		Args:
			s: The query string, with or without the leading "?".

		Returns:
			dict mapping decoded keys to decoded values.
		"""
		args = {}
		query = s.strip().lstrip("?")
		for pair in query.split("&"):
			if not pair:
				# Produced by an empty query or by "a=1&&b=2".
				continue
			# partition() never fails, unlike unpacking the result of split("=").
			key, _, value = pair.partition("=")
			args[urllib.parse.unquote_plus(key)] = urllib.parse.unquote_plus(value)
		return args

	def _parse_url(url, s):
		"""
		Parse the url string s and store its parts on the object url.

		Expected layout: scheme://host[:port][/path][?query][#fragment]

		Every part is overwritten, so parsing into an object that already holds
		an url leaves nothing behind from the previous one.

		Args:
			url: The object to fill (normally an URL instance).
			s: The url; it is converted with str() first.

		Returns:
			The same url object.

		Raises:
			ValueError: If _is_valid_url() rejects s.
		"""
		# Don't parse invalid URLs:
		if not _is_valid_url(s):
			raise ValueError("The given url \"%s\" is invalid." % (s,))

		url.original_url = str(s)
		temp = url.original_url.strip()

		# Forget whatever a previous parse stored in the optional parts.
		url.scheme = ""
		url.port = ""
		url.arguments = {}
		url.fragment = ""

		# Get the scheme (e.g. "http")
		i = temp.find("://")
		if i >= 0:
			url.scheme = temp[:i]
			temp = temp[i+3:]

		# Get the fragment (e.g. "#anRandomFragment"). It is the last part of an
		# url, so it has to be cut off before looking for the query.
		temp, _, url.fragment = temp.partition("#")

		# Get the arguments (e.g. "?q=foo+bar")
		temp, _, query = temp.partition("?")
		if query:
			url.arguments = _parse_url_arguments(query)

		# Get the host (e.g. "www.example.com")
		url.endswith_separator = temp.rstrip().endswith("/")
		url.host, _, raw_path = temp.partition("/")

		# Get the path (e.g. "/subpage/index.html"). A bare "/" means "no path";
		# splitting the empty string would give [""] and a doubled slash later.
		raw_path = raw_path.strip("/ ")
		url.path = raw_path.split("/") if raw_path else []

		# Get the port (e.g. ":8080")
		i = url.host.find(":")
		if i >= 0:
			url.port = url.host[i+1:]
			url.host = url.host[:i]

		url.resolved_url = _resolve_to_url(url)
		return url

	def _resolve_to_url(url):
		"""
		Build the url string from the parts stored on url.

		The parts are emitted in the order
		scheme://host[:port][/path][/][?query][#fragment]; empty parts are left
		out. The query is produced with urllib.parse.urlencode().

		Args:
			url: Object providing scheme, host, port, path, endswith_separator,
				arguments and fragment.

		Returns:
			The resolved url as str.
		"""
		pieces = []

		# Add scheme, if available (e.g. "http://")
		if url.scheme:
			pieces.append(url.scheme + "://")

		# Add host (e.g. "www.example.com")
		pieces.append(url.host)

		# Add port (e.g. ":8080")
		if url.port:
			pieces.append(":" + url.port)

		# Add path, if available (e.g. "/subpage/index.html")
		if url.path:
			pieces.append("/" + "/".join(url.path))

		# Ends with a separator?
		if url.endswith_separator:
			pieces.append("/")

		# Add arguments, if available (e.g. "?q=foo+bar"). They go before the
		# fragment; anything after "#" would be read as part of the fragment.
		if url.arguments:
			pieces.append("?" + urllib.parse.urlencode(url.arguments))

		# Add fragment, if available (e.g. "#anRandomFragment")
		if url.fragment:
			pieces.append("#" + url.fragment)

		return "".join(pieces)

	def _resolve_to_path(url):
		"""
		Turn an url into a relative file system path below "./cache/".

		The host labels are written in reverse order, followed by the url path;
		the combined string is passed through _remove_unsafe_characters(). If the
		url has arguments, a "query<hash>" segment is appended, where the hash is
		_java_string_hashcode() of the url-encoded arguments.

		For instance
		"http://www.example.com/search?q=foo+bar"
		will be resolved to
		"./cache/com/example/www/search/query171143778"
		"""
		result = _remove_unsafe_characters("./cache/" + "/".join(reversed(url.host.split("."))) + "/" + "/".join(url.path))

		# Has arguments? Then calculate and append a hash to the path.
		if url.arguments:
			query_hash = _java_string_hashcode(urllib.parse.urlencode(url.arguments))
			result += "/query%.0f" % (query_hash)

		return result

	# https://gist.github.com/hanleybrand/5224673
	def _java_string_hashcode(s):
		"""
		Compute a 31-based polynomial hash of s, wrapped to a signed 32-bit int.

		Starting from 0, every character updates the hash as
		hash = 31 * hash + ord(character), truncated to 32 bits.

		Returns:
			An int in the range -2**31 .. 2**31 - 1.
		"""
		h = 0
		for c in s:
			h = (31 * h + ord(c)) & 0xFFFFFFFF
		# Shift, mask and shift back to re-interpret the value as signed.
		return ((h + 0x80000000) & 0xFFFFFFFF) - 0x80000000

	def _remove_unsafe_characters(path):
		"""
		Sanitize a "/"-separated path so it can be used as a cache file path.

		In every segment only ASCII letters, digits, "_", "-" and "." are kept.
		A segment that afterwards consists of two or more dots (e.g. "..") is
		emptied, because it would otherwise let the path climb out of its base
		directory. A single "." segment is kept.

		Args:
			path: The path, using "/" as separator.

		Returns:
			The sanitized path; the number of "/" separators is unchanged.
		"""
		cleaned = []
		for chunk in path.split("/"):
			chunk = re.sub(r'[^a-zA-Z0-9_\-\.]', "", chunk)
			# Checked after the substitution: ".%." only becomes ".." once the
			# "%" has been removed.
			if len(chunk) > 1 and not chunk.strip("."):
				chunk = ""
			cleaned.append(chunk)
		return "/".join(cleaned)