Skip to content

Instantly share code, notes, and snippets.

@ninowalker
Created January 6, 2014 23:45
Show Gist options
  • Save ninowalker/8292135 to your computer and use it in GitHub Desktop.
Save ninowalker/8292135 to your computer and use it in GitHub Desktop.
Load/invert the DB from http://urlblacklist.com
import collections
import glob
import os
import re
import sys
def load(root="./"):
    """Load every category's domain list found under *root*.

    Each immediate subdirectory of *root* that contains a ``domains`` file
    is treated as one category, named after that subdirectory.

    Args:
        root: Directory holding the category subdirectories. A trailing
            path separator is optional.

    Returns:
        A dict mapping category name to the set of whitespace-stripped,
        non-empty lines read from that category's ``domains`` file.
    """
    categories = {}
    # os.path.join copes with a root given without a trailing separator,
    # where plain concatenation would glob "<root>*/domains" instead.
    for path in glob.glob(os.path.join(root, "*", "domains")):
        # The category is the directory that directly contains the file.
        category = os.path.basename(os.path.dirname(path))
        with open(path) as handle:
            stripped = (line.strip() for line in handle)
            # Blank lines would otherwise become an empty-string entry.
            categories[category] = {entry for entry in stripped if entry}
    return categories
def invert(db):
    """Turn a tag -> entries mapping into an entry -> tags mapping.

    Args:
        db: Mapping of tag to an iterable of entries.

    Returns:
        A plain dict mapping each entry to the list of tags whose entry
        collection contains it. A plain dict is returned (not the
        defaultdict used while building) so that looking up an unknown
        entry does not silently create it.
    """
    tags_by_entry = collections.defaultdict(list)
    # items() exists on both Python 2 and Python 3; iteritems() is
    # Python 2 only.
    for tag, entries in db.items():
        for entry in entries:
            tags_by_entry[entry].append(tag)
    return dict(tags_by_entry)
# Matches a host name with at least three labels and an alphabetic final
# label; group 1 is the same name with its leftmost label removed.
_PARENT_DOMAIN_RE = re.compile(
    r"^[A-Za-z0-9-]+\.((?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,})$"
)


def lookup(domain, db):
    """Find *domain*, or the closest parent domain, in *db*.

    The exact name is tried first; then the leftmost label is stripped
    repeatedly (``a.b.example.com`` -> ``b.example.com`` ->
    ``example.com``) until a candidate is found in *db* or only two
    labels remain. Names that do not look like a multi-label host name
    with an alphabetic final label (for example IP addresses) are only
    tried as given.

    Args:
        domain: Host name to look up.
        db: Mapping supporting ``in`` and item access, keyed by domain.

    Returns:
        A ``(matched, value)`` tuple. On a hit, ``matched`` is the
        candidate found and ``value`` is ``db[matched]``. On a miss,
        ``matched`` is the last candidate tried and ``value`` is ``None``.
    """
    candidate = domain
    while True:
        if candidate in db:
            return candidate, db[candidate]
        parent = _PARENT_DOMAIN_RE.match(candidate)
        if parent is None:
            # Nothing left to strip; report the last candidate tried.
            return candidate, None
        candidate = parent.group(1)
# Load the category database from the current directory and build the
# domain -> categories reverse index used by lookup().
DB = load()
INVERSE = invert(DB)

if __name__ == "__main__":
    # Domains to check are taken from the command line; the original
    # referenced a `user_urls` name that was never defined.
    user_urls = sys.argv[1:]
    for u in user_urls:
        # Formatted so the output is the same on Python 2 and Python 3.
        print("%s %s" % (u, lookup(u, INVERSE)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment