allancaffee/tld-grouping.py

## tld-grouping.py
domains = """
am.example.com
ap.example.com
eu.example.com
itd.sel.example.com
jp.example.com
ncope.account.example.com
bizbat.example.com
scesh.example.com
smss.example.com
soe.example.com
example.com
spe.example.com
""".splitlines()

# Split the domains, dropping any empty lines.
split_domains = [tuple(d.split('.')) for d in domains if d]

# Sort them all by length so that we find the shortest ones first.
split_domains.sort(key=len)


def _is_suffix(potential_suffix, full_tuple):
    return full_tuple[-len(potential_suffix):] == potential_suffix

# Instead of hard-coding the Top Level Domain to be 2-tuple lets aggregate with
# the shortest domain we have for each domain. Effectively finding the shortest
# common suffix. This is O(N^2). It would be more efficient to build out a
# suffix tree but this is simpler to implement and understand.
domains_to_subdomains = {}
for domain_parts in split_domains:
    # Check for a previous suffix match (that is a parent domain match).
    for seen_domain in domains_to_subdomains:
        if _is_suffix(seen_domain, domain_parts):
            domains_to_subdomains[seen_domain].append(domain_parts)
            break

    else:
        # If the loop exited normally then we didn't find one and we should
        # register this domain as the TLD.
        domains_to_subdomains[domain_parts] = []

print domains_to_subdomains
	domains = """
	am.example.com
	ap.example.com
	eu.example.com
	itd.sel.example.com
	jp.example.com
	ncope.account.example.com
	bizbat.example.com
	scesh.example.com
	smss.example.com
	soe.example.com
	example.com
	spe.example.com
	""".splitlines()

	# Split the domains, dropping any empty lines.
	split_domains = [tuple(d.split('.')) for d in domains if d]

	# Sort them all by length so that we find the shortest ones first.
	split_domains.sort(key=len)


	def _is_suffix(potential_suffix, full_tuple):
	return full_tuple[-len(potential_suffix):] == potential_suffix

	# Instead of hard-coding the Top Level Domain to be 2-tuple lets aggregate with
	# the shortest domain we have for each domain. Effectively finding the shortest
	# common suffix. This is O(N^2). It would be more efficient to build out a
	# suffix tree but this is simpler to implement and understand.
	domains_to_subdomains = {}
	for domain_parts in split_domains:
	# Check for a previous suffix match (that is a parent domain match).
	for seen_domain in domains_to_subdomains:
	if _is_suffix(seen_domain, domain_parts):
	domains_to_subdomains[seen_domain].append(domain_parts)
	break

	else:
	# If the loop exited normally then we didn't find one and we should
	# register this domain as the TLD.
	domains_to_subdomains[domain_parts] = []

	print domains_to_subdomains