Skip to content

Instantly share code, notes, and snippets.

@whiler
Created December 4, 2020 20:59
Show Gist options
  • Save whiler/0499baab5db78eb70286fcc4200f4b1a to your computer and use it in GitHub Desktop.
Save whiler/0499baab5db78eb70286fcc4200f4b1a to your computer and use it in GitHub Desktop.
Get domains from Adblock rules in Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
import ipaddress
import re
# Patterns are written as raw strings so that regex escapes such as ``\|``,
# ``\w`` and ``\.`` reach the regex engine untouched instead of being treated
# as (invalid) Python string escapes, which newer interpreters warn about.

# Leading rule anchor (``|`` or ``||``) and/or an ``http://`` / ``https://``
# scheme at the very start of a rule.
HEAD = re.compile(r'^(\|\|?)?(https?://)?')

# Everything from the first ``/`` or URL-encoded slash (``%2f`` / ``%2F``)
# through the end of the rule.
TAIL = re.compile(r'(/.*$)|(%2f.*$)|(%2F.*$)')

# A dotted name anywhere in the text: labels made of word characters and
# hyphens, each starting with a word character, with at least one dot.
DOMAIN = re.compile(r'(\w[-\w]*(\.\w[-\w]*)+)')

# Anchored at the start of the rule: an optional leading ``.`` (optionally
# preceded by a label containing ``*``, e.g. ``*.`` or ``ad*.``), then the
# dotted name captured as group 4, then an optional trailing ``*``.
WILDCARD = re.compile(r'^((\w*\*[-\w]*)?(\.))?(\w[-\w]*(\.\w[-\w]*)+)(\*)?')
def isip(s):
    """Return True when *s* is accepted by ``ipaddress.ip_address``.

    Anything that makes ``ipaddress.ip_address`` raise ``ValueError``
    (for example a host name) yields False.
    """
    try:
        ipaddress.ip_address(s)
    except ValueError:
        parsed = False
    else:
        parsed = True
    return parsed
def adblockfilter(line):
    """Decide whether a rule line should be kept for further processing.

    Surrounding whitespace is ignored. A line is rejected (False) when it is
    empty, when it begins with ``!``, ``[`` or ``@``, or when it both starts
    and ends with ``/``. Every other line is accepted (True).
    """
    rule = line.strip()
    if not rule:
        return False
    if rule.startswith(('!', '[', '@')):
        return False
    # A rule wrapped in slashes on both ends is rejected as well.
    return not (rule.startswith('/') and rule.endswith('/'))
def adblock2domains(content):
    """Collect the domain names found at the start of adblock rules.

    Each line of *content* is checked with ``adblockfilter``; accepted lines
    are stripped of surrounding whitespace, then of the prefix matched by
    ``HEAD`` and the suffix matched by ``TAIL``. The domain is group 4 of the
    ``WILDCARD`` match. Domains that ``isip`` recognises as IP addresses are
    dropped.

    Args:
        content: Text of a rule list, one rule per line.

    Returns:
        A ``set`` of domain strings.
    """
    domains = set()
    for line in content.splitlines():
        if not adblockfilter(line):
            continue
        # Strip first so that the ``^``-anchored HEAD and WILDCARD patterns
        # see the rule itself rather than leading indentation.
        rule = TAIL.sub('', HEAD.sub('', line.strip()))
        found = WILDCARD.search(rule)
        if found is None:
            # A dotted name may appear later in the rule (e.g.
            # ``*example.com``) without WILDCARD matching at the start;
            # skip such rules instead of subscripting None.
            continue
        domain = found[4]
        if not isip(domain):
            domains.add(domain)
    return domains
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment