Skip to content

Instantly share code, notes, and snippets.

@Julian-Nash
Created October 12, 2018 00:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Julian-Nash/716eac6446977ed470f56df2e2247d72 to your computer and use it in GitHub Desktop.
Save Julian-Nash/716eac6446977ed470f56df2e2247d72 to your computer and use it in GitHub Desktop.
Clean & assemble URLs. Quick and dirty solution - needs refactoring.
import re
from urllib.parse import urlsplit
# Sample hrefs used to exercise the filter/assembler below.
links = [
    # eligible links
    "google.com/home",
    "http://google.com/home",
    "https://google.com/home",
    "www.google.com/home",
    "http://www.google.com/home",
    "https://www.google.com/home",
    "cdn.google.com/home",
    "http://cdn.google.com/home",
    "https://cdn.google.com/home",
    "www.cdn.google.com/home",
    "http://www.cdn.google.com/home",
    "https://www.cdn.google.com/home",
    "/",
    "/home",
    "//cdn.google.com/home",  # url with unspecified protocol
    # ineligible links
    "#",
    "#home",
    "mailto:hi@hi.com",
    "tel:12345678900",
    "javascript:alert('hello')",
    "next",
    "file://dosomething",
    "chrome://dosomething",
    "microsoft-edge://dosomething",
    "ghttp://dosomething",
    "firefox://dosomething",
    "iexplore://dosomething",
    "opera://dosomething",
]

# Site that relative links are resolved against; split once and reuse.
root_domain = "https://www.google.com"
_root_parts = urlsplit(root_domain)
root_scheme = _root_parts.scheme
root_netloc = _root_parts.netloc

# Links that cannot be assembled into an http(s) URL.  The pattern is anchored
# at the start of the link (optional leading spaces are tolerated) so that a
# marker appearing *inside* an otherwise valid URL -- "/next-page",
# "/home#top", "?to=mailto:x" -- no longer causes the whole link to be dropped.
# "next" is only rejected when it is the entire link.
regex = (
    r"^ *(?:"
    r"#"
    r"|(?:mailto|tel|javascript) *:"
    r"|next *$"
    r"|(?:file|chrome|microsoft-edge|ghttp|firefox|iexplore|opera) *://"
    r")"
)
_INELIGIBLE = re.compile(regex)


def is_eligible(link):
    """Return True if *link* can be assembled into an http(s) URL.

    Empty or whitespace-only links are rejected, as are links that start
    with one of the prefixes described by ``regex`` (pure fragments,
    ``mailto:``/``tel:``/``javascript:``, the bare word ``next`` and a few
    browser/file schemes).
    """
    if not link.strip():
        # Would otherwise be assembled into the meaningless "https://".
        return False
    return _INELIGIBLE.search(link) is None


def assemble_url(link):
    """Return *link* as an absolute URL, resolved against ``root_domain``.

    * ``http://`` / ``https://`` links are returned unchanged.
    * ``//host/path`` (protocol-relative) links get the root scheme.
    * ``/path`` (root-relative) links get the root scheme and host.
    * anything else is assumed to start with a host name and only gets the
      root scheme prepended.

    The caller is expected to have filtered the link with ``is_eligible``.
    """
    if link.startswith(("http://", "https://")):
        return link
    # Must be tested before the single-slash case below.
    if link.startswith("//"):
        return f"{root_scheme}:{link}"
    # Any root-relative path, including "/", "/?q=1" and "/-/x"; the previous
    # r"^\/{1}\w+" test missed paths whose first character is not a word char.
    if link.startswith("/"):
        return f"{root_scheme}://{root_netloc}{link}"
    return f"{root_scheme}://{link}"


# Driver: print "<position>: <assembled url> <- <original link>" for every
# eligible link, then the number of eligible links.  The position is the
# link's 1-based index in ``links``; the count is accumulated in the same pass.
eligible_count = 0
for n, link in enumerate(links, start=1):
    if not is_eligible(link):
        continue
    eligible_count += 1
    print(f"{n}: {assemble_url(link)} <- {link}")
print(f"of {eligible_count}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment