Julian-Nash/clean_urls.py

## clean_urls.py
import re
from urllib.parse import urlsplit

# Sample hrefs used to exercise the clean-up logic below.
links = [
    # eligible links
    "google.com/home",
    "http://google.com/home",
    "https://google.com/home",

    "www.google.com/home",
    "http://www.google.com/home",
    "https://www.google.com/home",

    "cdn.google.com/home",
    "http://cdn.google.com/home",
    "https://cdn.google.com/home",

    "www.cdn.google.com/home",
    "http://www.cdn.google.com/home",
    "https://www.cdn.google.com/home",

    "/",
    "/home",
    "//cdn.google.com/home",  # url with unspecified protocol

    # ineligible links
    "#",
    "#home",
    "mailto:hi@hi.com",
    "tel:12345678900",
    "javascript:alert('hello')",
    "next",
    "file://dosomething",
    "chrome://dosomething",
    "microsoft-edge://dosomething",
    "ghttp://dosomething",
    "firefox://dosomething",
    "iexplore://dosomething",
    "opera://dosomething",
]

root_domain = "https://www.google.com"
# Split the root once and reuse both parts.
root_parts = urlsplit(root_domain)
root_scheme = root_parts.scheme
root_netloc = root_parts.netloc

# Matches links to skip. Meant to be used with ``re.match`` so it only looks
# at the START of the link: a link that merely contains "#", "next" or "tel:"
# somewhere in its path (e.g. "/home#top", "/nextpage") is not rejected.
regex = (
    r" *(?:"
    r"#"  # fragment-only link
    r"|(?:mailto|tel|javascript) *:"
    r"|(?:file|chrome|microsoft-edge|ghttp|firefox|iexplore|opera) *://"
    r"|next *$"  # the bare word "next", not a host or path starting with it
    r")"
)


def is_eligible(link: str) -> bool:
    """Return True unless *link* starts with one of the excluded forms in ``regex``."""
    return re.match(regex, link) is None


def clean_link(link: str) -> str:
    """Return *link* as an absolute URL built from ``root_scheme``/``root_netloc``.

    - ``http://`` / ``https://`` links are returned unchanged.
    - ``//host/path`` only gets the root scheme prepended.
    - ``/path`` (any root-relative path, including ``/`` and ``/?q=1``) is
      attached to the root scheme and host.
    - anything else is treated as ``host/path`` and gets the root scheme.
    """
    if link.startswith(("http://", "https://")):
        return link
    # Check "//" before "/" so scheme-less URLs are not mistaken for paths.
    if link.startswith("//"):
        return f"{root_scheme}:{link}"
    if link.startswith("/"):
        return f"{root_scheme}://{root_netloc}{link}"
    return f"{root_scheme}://{link}"


# Filter first so the running number and the final total always agree,
# regardless of where ineligible links sit in the list.
eligible_links = [link for link in links if is_eligible(link)]

for n, link in enumerate(eligible_links, start=1):
    print(f"{n}: {clean_link(link)} <- {link}")

print(f"of {len(eligible_links)}")
	import re
	from urllib.parse import urlsplit

	# Sample hrefs used to exercise the clean-up logic below.
	links = [
	    # eligible links
	    "google.com/home",
	    "http://google.com/home",
	    "https://google.com/home",

	    "www.google.com/home",
	    "http://www.google.com/home",
	    "https://www.google.com/home",

	    "cdn.google.com/home",
	    "http://cdn.google.com/home",
	    "https://cdn.google.com/home",

	    "www.cdn.google.com/home",
	    "http://www.cdn.google.com/home",
	    "https://www.cdn.google.com/home",

	    "/",
	    "/home",
	    "//cdn.google.com/home",  # url with unspecified protocol

	    # ineligible links
	    "#",
	    "#home",
	    "mailto:hi@hi.com",
	    "tel:12345678900",
	    "javascript:alert('hello')",
	    "next",
	    "file://dosomething",
	    "chrome://dosomething",
	    "microsoft-edge://dosomething",
	    "ghttp://dosomething",
	    "firefox://dosomething",
	    "iexplore://dosomething",
	    "opera://dosomething",
	]

	root_domain = "https://www.google.com"
	# Split the root once and reuse both parts.
	root_parts = urlsplit(root_domain)
	root_scheme = root_parts.scheme
	root_netloc = root_parts.netloc

	# Matches links to skip. Uses real alternation ("|", not an escaped "\|")
	# and is meant to be used with ``re.match`` so it only looks at the START
	# of the link: a link that merely contains "#", "next" or "tel:" somewhere
	# in its path (e.g. "/home#top", "/nextpage") is not rejected.
	regex = (
	    r" *(?:"
	    r"#"  # fragment-only link
	    r"|(?:mailto|tel|javascript) *:"
	    r"|(?:file|chrome|microsoft-edge|ghttp|firefox|iexplore|opera) *://"
	    r"|next *$"  # the bare word "next", not a host or path starting with it
	    r")"
	)


	def is_eligible(link: str) -> bool:
	    """Return True unless *link* starts with one of the excluded forms in ``regex``."""
	    return re.match(regex, link) is None


	def clean_link(link: str) -> str:
	    """Return *link* as an absolute URL built from ``root_scheme``/``root_netloc``.

	    - ``http://`` / ``https://`` links are returned unchanged.
	    - ``//host/path`` only gets the root scheme prepended.
	    - ``/path`` (any root-relative path, including ``/`` and ``/?q=1``) is
	      attached to the root scheme and host.
	    - anything else is treated as ``host/path`` and gets the root scheme.
	    """
	    if link.startswith(("http://", "https://")):
	        return link
	    # Check "//" before "/" so scheme-less URLs are not mistaken for paths.
	    if link.startswith("//"):
	        return f"{root_scheme}:{link}"
	    if link.startswith("/"):
	        return f"{root_scheme}://{root_netloc}{link}"
	    return f"{root_scheme}://{link}"


	# Filter first so the running number and the final total always agree,
	# regardless of where ineligible links sit in the list.
	eligible_links = [link for link in links if is_eligible(link)]

	for n, link in enumerate(eligible_links, start=1):
	    print(f"{n}: {clean_link(link)} <- {link}")

	print(f"of {len(eligible_links)}")