axiak/tlds.py

## tlds.py
#!/usr/bin/env python
import re
from collections import defaultdict
from urllib2 import urlopen


def main():
    """Print a regular expression built from the IANA top-level-domain list.

    The list is downloaded, folded into a prefix tree with ``build_tree``
    and rendered with ``build_regex``; the pattern goes to standard output.
    """
    domains = set()
    # Alternative source (Public Suffix List), currently disabled.
    #domains |= get_domains_from_url('https://publicsuffix.org/list/effective_tld_names.dat')
    domains |= get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')

    tree = build_tree(domains)

    # NOTE(review): the raw prefix emits two backslashes before the dot while
    # escape() emits single backslashes -- confirm which form is intended.
    # Parenthesised so the line is valid both as a Python 2 print statement
    # and as a Python 3 function call.
    print(r'.+\\.' + build_regex(tree) + '$')


# Sentinel characters wrapped around every domain before it is inserted into
# the prefix tree: START prefixes each key, STOP marks the end of a domain.
START, STOP = ';', ':'


def build_regex(tree, start=START):
    """Return the regex fragment for everything reachable from *start*.

    Args:
        tree: Mapping from a prefix (beginning with ``START``) to the set of
            characters that may follow it, as produced by ``build_tree``.
            ``STOP`` in a set means a domain ends at that prefix.
        start: Prefix whose continuations are rendered.

    Returns:
        A pattern string. A single continuation is emitted inline; any other
        number is wrapped in a non-capturing group of alternatives.
    """
    items = tree[start]
    if len(items) == 1:
        # The builtin next() works on Python 2.6+ and 3, unlike the
        # iterator's Python-2-only .next() method.
        only = next(iter(items))
        if only == STOP:
            return ''
        return escape(only) + build_regex(tree, start + only)

    # Sets have no stable order; sort so repeated runs print the same
    # pattern, and put the "domain ends here" (empty) branch last.
    ordered = sorted(items, key=lambda item: (item == STOP, item))
    pattern = [
        '' if item == STOP else escape(item) + build_regex(tree, start + item)
        for item in ordered
    ]
    return '(?:' + '|'.join(pattern) + ')'


def build_tree(domains):
    """Build the prefix tree consumed by ``build_regex``.

    Every domain is wrapped in ``START`` / ``STOP`` and, for each proper
    prefix of the wrapped string, the character following that prefix is
    recorded.

    Args:
        domains: Iterable of domain strings.

    Returns:
        A ``defaultdict(set)`` mapping prefix -> set of following characters.
    """
    successors = defaultdict(set)
    for name in domains:
        framed = START + name + STOP
        for head, symbol in prefixes(framed):
            successors[head].add(symbol)
    return successors


def is_ascii(s):
    """Return True when every character of *s* has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True


def prefixes(input):
    """Yield ``(prefix, following_item)`` for each non-empty proper prefix.

    For ``"abc"`` this yields ``("a", "b")`` and then ``("ab", "c")``.
    Inputs shorter than two items yield nothing.
    """
    cut = 1
    total = len(input)
    while cut < total:
        yield input[:cut], input[cut]
        cut += 1


def escape(item):
    """Return *item* escaped with ``re.escape`` for literal use in a pattern."""
    escaped = re.escape(item)
    return escaped


def get_domains_from_url(url):
    """Download a domain list from *url* and return its entries as a set.

    Blank lines and lines starting with ``//``, ``#`` or ``!`` are skipped.
    Each remaining line is stripped, lower-cased, decoded as UTF-8,
    IDNA-encoded and has leading ``*`` / ``.`` characters removed.

    Args:
        url: Address of a text resource with one domain per line.

    Returns:
        A set of normalised domain strings.
    """
    f = urlopen(url)
    try:
        lines = f.readlines()
    finally:
        # Release the connection even when the read fails part-way.
        f.close()

    domains = set()
    for line in lines:
        entry = line.strip()  # strip once instead of once per check
        if not entry or entry.startswith(('//', '#', '!')):
            continue
        domains.add(entry.lower().decode('utf8').encode('idna').lstrip('*.'))
    return domains


# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	import re
	from collections import defaultdict
	from urllib2 import urlopen


	def main():
	    """Print a regular expression built from the IANA top-level-domain list.

	    The list is downloaded, folded into a prefix tree with ``build_tree``
	    and rendered with ``build_regex``; the pattern goes to standard output.
	    """
	    domains = set()
	    # Alternative source (Public Suffix List), currently disabled.
	    #domains |= get_domains_from_url('https://publicsuffix.org/list/effective_tld_names.dat')
	    domains |= get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')

	    tree = build_tree(domains)

	    # NOTE(review): the raw prefix emits two backslashes before the dot while
	    # escape() emits single backslashes -- confirm which form is intended.
	    # Parenthesised so the line is valid both as a Python 2 print statement
	    # and as a Python 3 function call.
	    print(r'.+\\.' + build_regex(tree) + '$')


	# Sentinel characters wrapped around every domain before it is inserted into
	# the prefix tree: START prefixes each key, STOP marks the end of a domain.
	START, STOP = ';', ':'


	def build_regex(tree, start=START):
	    """Return the regex fragment for everything reachable from *start*.

	    Args:
	        tree: Mapping from a prefix (beginning with ``START``) to the set of
	            characters that may follow it, as produced by ``build_tree``.
	            ``STOP`` in a set means a domain ends at that prefix.
	        start: Prefix whose continuations are rendered.

	    Returns:
	        A pattern string. A single continuation is emitted inline; any other
	        number is wrapped in a non-capturing group of alternatives.
	    """
	    items = tree[start]
	    if len(items) == 1:
	        # The builtin next() works on Python 2.6+ and 3, unlike the
	        # iterator's Python-2-only .next() method.
	        only = next(iter(items))
	        if only == STOP:
	            return ''
	        return escape(only) + build_regex(tree, start + only)

	    # Sets have no stable order; sort so repeated runs print the same
	    # pattern, and put the "domain ends here" (empty) branch last.
	    ordered = sorted(items, key=lambda item: (item == STOP, item))
	    pattern = [
	        '' if item == STOP else escape(item) + build_regex(tree, start + item)
	        for item in ordered
	    ]
	    # A bare pipe separates alternatives; the escaped form found here
	    # would have produced literal pipe characters in the pattern.
	    return '(?:' + '|'.join(pattern) + ')'


	def build_tree(domains):
	    """Build the prefix tree consumed by ``build_regex``.

	    Every domain is wrapped in ``START`` / ``STOP`` and, for each proper
	    prefix of the wrapped string, the character following that prefix is
	    recorded.

	    Args:
	        domains: Iterable of domain strings.

	    Returns:
	        A ``defaultdict(set)`` mapping prefix -> set of following characters.
	    """
	    tree = defaultdict(set)
	    for domain in domains:
	        for prefix, following in prefixes(START + domain + STOP):
	            tree[prefix].add(following)

	    return tree


	def is_ascii(s):
	    """Return True when every character of *s* has a code point below 128."""
	    return all(ord(c) < 128 for c in s)


	def prefixes(input):
	    """Yield ``(prefix, following_item)`` for each non-empty proper prefix.

	    For ``"abc"`` this yields ``("a", "b")`` and then ``("ab", "c")``.
	    Inputs shorter than two items yield nothing.
	    """
	    for i in range(1, len(input)):
	        yield input[:i], input[i]


	def escape(item):
	    """Return *item* escaped with ``re.escape`` for literal use in a pattern."""
	    return re.escape(item)


	def get_domains_from_url(url):
	    """Download a domain list from *url* and return its entries as a set.

	    Blank lines and lines starting with ``//``, ``#`` or ``!`` are skipped.
	    Each remaining line is stripped, lower-cased, decoded as UTF-8,
	    IDNA-encoded and has leading ``*`` / ``.`` characters removed.

	    Args:
	        url: Address of a text resource with one domain per line.

	    Returns:
	        A set of normalised domain strings.
	    """
	    f = urlopen(url)
	    try:
	        lines = f.readlines()
	    finally:
	        # Release the connection even when the read fails part-way.
	        f.close()

	    domains = set()
	    for line in lines:
	        entry = line.strip()  # strip once instead of once per check
	        if not entry or entry.startswith(('//', '#', '!')):
	            continue
	        domains.add(entry.lower().decode('utf8').encode('idna').lstrip('*.'))
	    return domains


	# Run the generator only when this file is executed as a script.
	if __name__ == '__main__':
	    main()