Skip to content

Instantly share code, notes, and snippets.

@openandclose
Forked from geryogam/uri_normalizer.py
Last active April 12, 2020 14:55
Show Gist options
  • Save openandclose/4ca42b9366c2b1eda33ac7d69b4f85d4 to your computer and use it in GitHub Desktop.
Save openandclose/4ca42b9366c2b1eda33ac7d69b4f85d4 to your computer and use it in GitHub Desktop.
URI Normalizer
"""
==============
URI Normalizer
==============
An URI normalization library providing syntax-based normalization (case
normalization, percent-encoding normalization, path segment normalization) and
scheme-based normalization to URIs according to RFC 3986
(https://tools.ietf.org/html/rfc3986).
Functions
=========
- normalizes: normalize an URI
- normalize: normalize URI components
- remove_dot_segments: remove the dot-segments in a URI path component
Classes
=======
- TestSuite: test suite for RFC 3986
Changes
=======
1.0.0 @ Géry Ogam
- update the code to Python 3
- update the code to comply with RFC 3986 (which obsoletes RFC 1808)
- update the module docstring
- add comments
- add type hints
- update the ``test`` function to a ``TestSuite`` class implementing the
unittest framework
- rename the ``norms`` and ``norm`` functions to ``normalizes`` and
``normalize`` respectively
- remove the trailing dot truncation in the host component
- add quoting of the path component
- add a ``remove_dot_segments`` function replacing the ``_collapse``
regex that incorrectly collapsed consecutive "/" delimiters in the
path component
- update the ``_authority`` regex to allow an empty port component with
its ":" delimiter
- update tests for syntax-based normalization (case normalization,
percent-encoding normalization, path segment normalization) and
scheme-based normalization
0.92 @ Mark Nottingham
- unknown schemes now pass the port through silently
0.91 @ Mark Nottingham
- general cleanup
- changed dictionaries to lists where appropriate
- more fine-grained authority parsing and normalization
"""
__license__ = """Copyright (c) 1999-2002 Mark Nottingham <mnot@pobox.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
__version__ = "1.0.0"
import re
import typing
import unittest
import urllib.parse
_relative_schemes = [
"http",
"https",
"news",
"snews",
"nntp",
"snntp",
"ftp",
"file",
""
]
_default_port = {
"http": "80",
"https": "443",
"gopher": "70",
"news": "119",
"snews": "563",
"nntp": "119",
"snntp": "563",
"ftp": "21",
"telnet": "23",
"prospero": "191"
}
_authority = re.compile(r"^(?:([^\@]+)\@)?([^\:]+)(?:\:(.*))?$")
def normalizes(uri: str) -> str:
"""Normalize an URI."""
if not isinstance(uri, str):
raise TypeError(
f"normalizes() argument must be a str, not a {type(uri).__name__}"
)
uri_components = urllib.parse.urlsplit(uri)
uri_components = normalize(uri_components)
return urllib.parse.urlunsplit(uri_components)
def normalize(
uri_components: typing.Tuple[str, str, str, str, str]
) -> typing.Tuple[str, str, str, str, str]:
"""Normalize URI components."""
if not isinstance(uri_components, tuple):
raise TypeError(
f"normalize() argument must be a tuple, not a "
f"{type(uri_components).__name__}"
)
if len(uri_components) != 5:
raise ValueError(
f"normalize() argument must be of length 6, not "
f"{len(uri_components).__name__}"
)
(scheme, authority, path, query, fragment) = uri_components
scheme = scheme.lower()
if authority:
(userinfo, host, port) = _authority.match(authority).groups()
authority = host.lower()
if userinfo:
authority = "%s@%s" % (userinfo, authority)
if port and port != _default_port.get(scheme):
authority = "%s:%s" % (authority, port)
if scheme in _relative_schemes:
path = remove_dot_segments(path)
if authority and not path:
path = "/"
path = urllib.parse.unquote(path)
path = urllib.parse.quote(path)
return (scheme, authority, path, query, fragment)
def remove_dot_segments(path: str) -> str:
"""Remove the dot-segments in a URI path component."""
if not isinstance(path, str):
raise TypeError(
f"remove_dot_segments() argument must be a str, not a "
f"{type(remove_dot_segments).__name__}"
)
in_buffer = path
out_buffer = ""
while in_buffer:
# A. If the input buffer begins with a prefix of "../" or "./", then
# remove that prefix from the input buffer; otherwise,
if in_buffer.startswith("../"):
in_buffer = in_buffer[3:]
elif in_buffer.startswith("./"):
in_buffer = in_buffer[2:]
# B. if the input buffer begins with a prefix of "/./" or "/.", where
# "." is a complete path segment, then replace that prefix with "/" in
# the input buffer; otherwise,
elif in_buffer.startswith("/./"):
in_buffer = "/" + in_buffer[3:]
elif in_buffer == "/.":
in_buffer = "/" + in_buffer[2:]
# C. if the input buffer begins with a prefix of "/../" or "/..", where
# ".." is a complete path segment, then replace that prefix with "/" in
# the input buffer and remove the last segment and its preceding "/"
# (if any) from the output buffer; otherwise,
elif in_buffer.startswith("/../"):
in_buffer = "/" + in_buffer[4:]
index = out_buffer.rfind("/")
if index == -1:
out_buffer = ""
else:
out_buffer = out_buffer[:index]
elif in_buffer == "/..":
in_buffer = "/" + in_buffer[3:]
index = out_buffer.rfind("/")
if index == -1:
out_buffer = ""
else:
out_buffer = out_buffer[:index]
# D. if the input buffer consists only of "." or "..", then remove that
# from the input buffer; otherwise,
elif in_buffer in [".", ".."]:
in_buffer = ""
# E. move the first path segment in the input buffer to the end of the
# output buffer, including the initial "/" character (if any) and any
# subsequent characters up to, but not including, the next "/"
# character or the end of the input buffer.
else:
index = in_buffer.find("/")
if index == -1:
out_buffer = out_buffer + in_buffer
in_buffer = ""
elif index == 0:
index = in_buffer.find("/", index + 1)
if index == -1:
out_buffer = out_buffer + in_buffer
in_buffer = ""
else:
out_buffer = out_buffer + in_buffer[:index]
in_buffer = in_buffer[index:]
else:
out_buffer = out_buffer + in_buffer[:index]
in_buffer = in_buffer[index:]
return out_buffer
class TestSuite(unittest.TestCase):
"""Test suite for RFC 3986."""
# Case Normalization
# (https://tools.ietf.org/html/rfc3986?#section-6.2.2.1).
def test_case_normalization(self) -> None:
tests = {
"http://example.com/foo%2a": "http://example.com/foo%2A",
"HTTP://example.com/Foo": "http://example.com/Foo",
"http://User@Example.COM/Foo": "http://User@example.com/Foo"
}
for (input, expected) in tests.items():
output = normalizes(input)
with self.subTest(input=input):
self.assertEqual(output, expected)
# Percent-Encoding Normalization
# (https://tools.ietf.org/html/rfc3986?#section-6.2.2.2).
def test_percent_encoding_normalization(self) -> None:
tests = {
"http://example.com/fo%6F": "http://example.com/foo"
}
for (input, expected) in tests.items():
output = normalizes(input)
with self.subTest(input=input):
self.assertEqual(output, expected)
# Path Segment Normalization
# (https://tools.ietf.org/html/rfc3986?#section-6.2.2.3).
def test_path_segment_normalization(self) -> None:
tests = {
".": "",
"..": "",
"./foo": "foo",
"../foo": "foo",
"/./foo": "/foo",
"/../foo": "/foo",
"/foo/.": "/foo/",
"/foo/..": "/",
"/foo/./": "/foo/",
"/foo/../": "/",
"/./foo/.": "/foo/",
"/../foo/..": "/",
"/foo/./bar": "/foo/bar",
"/foo/../bar": "/bar",
"/.foo": "/.foo",
"/..foo": "/..foo",
"/foo.": "/foo.",
"/foo..": "/foo..",
"/.foo.": "/.foo.",
"/..foo..": "/..foo..",
"/foo/./.": "/foo/",
"/foo/../..": "/",
"/foo/./..": "/",
"/foo/../.": "/"
}
for (input, expected) in tests.items():
output = normalizes(input)
with self.subTest(input=input):
self.assertEqual(output, expected)
# Scheme-Based Normalization
# (https://tools.ietf.org/html/rfc3986?#section-6.2.3).
def test_scheme_based_normalization(self) -> None:
tests = {
"http://@example.com/": "http://@example.com/",
"http://example.com": "http://example.com/",
"http://example.com:/": "http://example.com/",
"http://example.com:80/": "http://example.com/",
"http://example.com:8000/": "http://example.com:8000/",
# The following two tests are commented for the moment as the
# urlsplit and urlunsplit functions of the urllib.parse module
# incorrectly drop the '?' delimiter of an empty query component
# and the '#' delimiter of an empty fragment component in a URI
# (cf. https://github.com/python/cpython/pull/15642).
# "http://example.com/?": "http://example.com/?",
# "http://example.com/#": "http://example.com/#"
}
for (input, expected) in tests.items():
output = normalizes(input)
with self.subTest(input=input):
self.assertEqual(output, expected)
if __name__ == "__main__":
unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment