Skip to content

Instantly share code, notes, and snippets.

@axiak
Created November 5, 2014 02:25
Show Gist options
  • Save axiak/8e7a920f53fa45a253a4 to your computer and use it in GitHub Desktop.
Save axiak/8e7a920f53fa45a253a4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import re
from collections import defaultdict
from urllib2 import urlopen
def main():
    """Print a regex that matches any host name ending in a known TLD.

    Fetches the IANA top-level-domain list, folds the names into a prefix
    tree and writes the resulting pattern to standard output.
    """
    domains = set()
    # The Public Suffix List can be merged in as an additional source:
    # domains |= get_domains_from_url('https://publicsuffix.org/list/effective_tld_names.dat')
    domains |= get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    tree = build_tree(domains)
    # One backslash only: inside a raw string r'\\.' would emit two
    # backslashes, i.e. "a literal backslash followed by any character",
    # which never matches the dot in front of a TLD.
    print(r'.+\.' + build_regex(tree) + '$')
# Sentinels wrapped around every domain before it is indexed: START is the
# root key of the prefix tree, STOP marks "a domain ends here".
# NOTE(review): presumably chosen because neither character can occur in a
# domain name -- confirm before feeding other kinds of input.
START, STOP = ';', ':'
def build_regex(tree, start=START):
    """Render the part of the prefix tree below ``start`` as a regex fragment.

    ``tree`` maps a prefix (always beginning with ``START``) to the set of
    characters that may follow it; ``STOP`` in that set means a domain ends
    at this point.  A prefix with a single continuation is emitted inline,
    several continuations become a non-capturing alternation, and ``STOP``
    contributes an empty alternative.

    Returns the fragment as a string.  A prefix with no recorded
    continuations yields the empty group ``(?:)``.
    """
    # Sorted so the generated pattern is identical from run to run instead
    # of following set iteration order.
    items = sorted(tree[start])
    if len(items) == 1:
        only = items[0]
        if only == STOP:
            return ''
        return escape(only) + build_regex(tree, start + only)
    alternatives = [
        '' if item == STOP else escape(item) + build_regex(tree, start + item)
        for item in items
    ]
    return '(?:' + '|'.join(alternatives) + ')'
def build_tree(domains):
    """Index ``domains`` by prefix.

    Each domain is framed as ``START + domain + STOP``; the returned
    ``defaultdict(set)`` maps every proper prefix of that framed string to
    the set of characters seen immediately after it.
    """
    followers = defaultdict(set)
    for name in domains:
        framed = START + name + STOP
        for head, char in prefixes(framed):
            followers[head].add(char)
    return followers
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` is considered ASCII.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True
def prefixes(input):
    """Yield ``(prefix, next_item)`` for every split point of ``input``.

    The prefix is never empty and ``next_item`` is the element that directly
    follows it, so a sequence of length ``n`` produces ``n - 1`` pairs.
    """
    for cut, following in enumerate(input[1:], 1):
        yield input[:cut], following
def escape(item):
    """Quote ``item`` so that it is matched literally inside a regex."""
    quoted = re.escape(item)
    return quoted
def get_domains_from_url(url):
    """Download a domain list from ``url`` and return its entries as a set.

    Each line is stripped and lower-cased.  Blank lines and lines starting
    with ``//``, ``#`` or ``!`` are skipped.  The remaining entries lose any
    leading ``*`` / ``.`` characters, are decoded as UTF-8 and re-encoded to
    their IDNA form.

    Raises whatever ``urlopen`` raises on network failure, and
    ``UnicodeError`` if an entry is not valid UTF-8 or cannot be IDNA-encoded.
    """
    response = urlopen(url)
    try:
        lines = response.readlines()
    finally:
        # Release the connection even when reading fails part-way.
        response.close()

    domains = set()
    for line in lines:
        entry = line.strip().lower()
        # Byte-string prefixes: the response lines are raw bytes.
        if not entry or entry.startswith((b'//', b'#', b'!')):
            continue
        # Drop the wildcard prefix before IDNA-encoding so '*' is never
        # handed to the codec as a label.
        name = entry.lstrip(b'*.')
        domains.add(name.decode('utf8').encode('idna'))
    return domains
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment