Last active
June 28, 2018 16:17
-
-
Save allancaffee/9d8f21495931d96e7b7f18603a4aa3ab to your computer and use it in GitHub Desktop.
TLD grouping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
domains = """ | |
am.example.com | |
ap.example.com | |
eu.example.com | |
itd.sel.example.com | |
jp.example.com | |
ncope.account.example.com | |
bizbat.example.com | |
scesh.example.com | |
smss.example.com | |
soe.example.com | |
example.com | |
spe.example.com | |
""".splitlines() | |
# Split the domains, dropping any empty lines. | |
split_domains = [tuple(d.split('.')) for d in domains if d] | |
# Sort them all by length so that we find the shortest ones first. | |
split_domains.sort(key=len) | |
def _is_suffix(potential_suffix, full_tuple): | |
return full_tuple[-len(potential_suffix):] == potential_suffix | |
# Instead of hard-coding the Top Level Domain to be 2-tuple lets aggregate with | |
# the shortest domain we have for each domain. Effectively finding the shortest | |
# common suffix. This is O(N^2). It would be more efficient to build out a | |
# suffix tree but this is simpler to implement and understand. | |
domains_to_subdomains = {} | |
for domain_parts in split_domains: | |
# Check for a previous suffix match (that is a parent domain match). | |
for seen_domain in domains_to_subdomains: | |
if _is_suffix(seen_domain, domain_parts): | |
domains_to_subdomains[seen_domain].append(domain_parts) | |
break | |
else: | |
# If the loop exited normally then we didn't find one and we should | |
# register this domain as the TLD. | |
domains_to_subdomains[domain_parts] = [] | |
print domains_to_subdomains |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment