Created
March 16, 2021 23:28
-
-
Save cisene/7a9c6f56481b1d9794f2066d57db6c6e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Country-code TLDs under which registrations commonly happen at the third
# level (e.g. "example.co.uk"), so three labels must be kept instead of two.
_CC_TLDS = frozenset(
    "ae ar at au bo br co cn cr do ec es gg gh gr gt hk id il in ir it jp ke "
    "kr kw ky lk my mx na ng np nz pe pk ph pl pt py ro ru sg sv th tr tt tw "
    "ua uk uy ve za zw".split()
)

# Generic second-level labels used beneath the country-code TLDs above
# (e.g. the "co" in "co.uk" or the "com" in "com.au").
_SECOND_LEVEL_LABELS = frozenset(
    "ac asn biz co com csiro edu go gov gob gv id int leg mi mil ne net or "
    "org pri priv".split()
)


def extractDomain(data):
    """Return the registrable domain of a URL or host name.

    The scheme (http/https), everything from the first "/" onwards, and a
    trailing ":port" are removed; any character other than letters, digits,
    "-" and "." is then dropped.  A dotted-quad IPv4 address is returned
    as-is.  Otherwise the host is lower-cased and trimmed to its last two
    labels, or its last three when it ends in a known
    "<second-level>.<country-code>" pair such as "co.uk".

    Args:
        data: URL or host name; it is converted with ``str()`` first.

    Returns:
        The domain in normal left-to-right order, e.g. "example.co.uk".
        Inputs with fewer labels than would be kept are returned whole.
    """
    # Local import keeps the function usable on its own; no module-level
    # import of ``re`` is visible alongside this definition.
    import re

    host = str(data)

    # Remove protocol part of url
    host = re.sub(r"^https?://", "", host, flags=re.IGNORECASE)
    # Remove part after FQDN
    host = re.sub(r"/.*$", "", host)
    # Remove port
    host = re.sub(r":\d+$", "", host)
    # Remove everything not a-z, 0-9, dash and dot
    host = re.sub(r"[^a-z0-9.-]", "", host, flags=re.IGNORECASE)

    # If IP, bail out early: the label trimming below would mangle it.
    if re.search(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", host):
        return host

    labels = host.lower().split(".")

    # The length guard prevents an IndexError for a single-label input that
    # happens to be a country code (e.g. "uk").
    has_cc_second_level = (
        len(labels) >= 2
        and labels[-1] in _CC_TLDS
        and labels[-2] in _SECOND_LEVEL_LABELS
    )
    keep = 3 if has_cc_second_level else 2

    # Slice from the right so the labels stay in their natural order; a
    # slice longer than the list simply yields every label.
    return ".".join(labels[-keep:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment