Skip to content

Instantly share code, notes, and snippets.

@cisene
Created March 16, 2021 23:28
Show Gist options
  • Save cisene/7a9c6f56481b1d9794f2066d57db6c6e to your computer and use it in GitHub Desktop.
Save cisene/7a9c6f56481b1d9794f2066d57db6c6e to your computer and use it in GitHub Desktop.
def extractDomain(data):
    """Reduce a URL or host name to its base domain, with labels reversed.

    The input is coerced to ``str``; a leading ``http://`` / ``https://``
    scheme, everything from the first ``/`` onwards, and a trailing
    ``:port`` are removed, and any character other than letters, digits,
    ``-`` and ``.`` is dropped.

    * A dotted-quad IPv4 address is returned unchanged.
    * Otherwise the host is lower-cased and cut down to its last two
      labels, or its last three when the TLD is one of the listed country
      codes and the label before it is one of the listed second-level
      names (``co``, ``com``, ``org``, ``gov``, ...).

    NOTE(review): the kept labels are joined in *reversed* order
    (TLD first), e.g. ``"www.example.com"`` -> ``"com.example"``. This is
    what the original code returned and it is preserved here because
    callers may rely on it -- confirm whether natural order was intended.

    Args:
        data: URL or host name; any object is accepted and passed
            through ``str()``.

    Returns:
        The IPv4 address as given, or the reversed base domain as a
        lower-case string. Inputs with fewer labels than requested are
        returned whole (still reversed).

    >>> extractDomain("https://www.example.com/path")
    'com.example'
    >>> extractDomain("http://www.bbc.co.uk:8080/news")
    'uk.co.bbc'
    >>> extractDomain("192.168.1.1:80")
    '192.168.1.1'
    >>> extractDomain("uk")
    'uk'
    """
    # Remove protocol part of url (scheme match is case-insensitive)
    host = re.sub(r"^http(s)?\x3a\x2f\x2f", "", str(data), flags=re.IGNORECASE)
    # Remove part after FQDN (first slash and everything behind it)
    host = re.sub(r"\x2f(.*)$", "", host)
    # Remove port
    host = re.sub(r"\x3a\d{1,}$", "", host)
    # Remove everything not a-z, 0-9, dash and dot
    host = re.sub(r"[^a-z0-9\x2d\x2e]", "", host, flags=re.IGNORECASE)

    # If IP, bail out early
    if re.search(r"^(\d{1,3})\x2e(\d{1,3})\x2e(\d{1,3})\x2e(\d{1,3})$", host):
        return host

    # Lower-case, split on dots and reverse so the TLD is labels[0]
    labels = host.lower().split("\x2e")
    labels.reverse()

    # Third-level registration: country-code TLD + generic second level.
    # The length guard prevents an IndexError for a bare label that happens
    # to match a country code (e.g. "uk"), which has no labels[1].
    is_third_level = (
        len(labels) >= 2
        and re.search(
            r"^(ae|ar|at|au|bo|br|co|cn|cr|do|ec|es|gg|gh|gr|gt|hk|id|il|in|ir|it|jp|ke|kr|kw|ky|lk|my|mx|na|ng|np|nz|pe|pk|ph|pl|pt|py|ro|ru|sg|sv|th|tr|tt|tw|ua|uk|uy|ve|za|zw)$",
            labels[0],
        ) is not None
        and re.search(
            r"^(ac|asn|biz|co(m)?|csiro|edu|go(v|b)?|gv|id|int|leg|mi(l)?|ne(t)?|or(g)?|pri(v)?)$",
            labels[1],
        ) is not None
    )

    # Slicing past the end simply yields every label, so short hosts are
    # returned whole without a separate length check.
    keep = 3 if is_third_level else 2
    return ".".join(labels[:keep])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment