Skip to content

Instantly share code, notes, and snippets.

@englehardt
Created April 18, 2019 22:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save englehardt/53b7b585ef38067540b4a547363b81df to your computer and use it in GitHub Desktop.
Save englehardt/53b7b585ef38067540b4a547363b81df to your computer and use it in GitHub Desktop.
Generate a list of safebrowsing hashes from the raw Disconnect list
import base64
import hashlib
import json
import re

try:
    import urllib2  # Python 2: exposes quote() / unquote()
except ImportError:
    # Python 3 removed urllib2; urllib.parse provides the quote() and
    # unquote() functions this script uses, so alias it under the old name.
    import urllib.parse as urllib2

from trackingprotection_tools import DisconnectParser
# Categories passed to DisconnectParser.get_domains_with_category() in main().
TRACKER_CATEGORIES = [
    'Advertising',
    'Analytics',
    'Social',
    'Content',
    'Disconnect',
]

# Raw-file URLs handed to DisconnectParser in main().
BLOCKLIST = (
    'https://raw.githubusercontent.com/mozilla-services/'
    'shavar-prod-lists/master/disconnect-blacklist.json'
)
MAPPING = (
    'https://raw.githubusercontent.com/mozilla-services/'
    'shavar-list-creation/master/disconnect_mapping.json'
)

# Known domain -> expected hash pairs; main() checks any of these domains
# that appear in the parsed list against the hash it computes.
TEST_SET = {
    'google.ee': 'BIjZakI3ChrLy6S7OTUampyXO+WUJFp4dYEfvXDQTtc=',
    'adservice.google.com': 'ORbzeOvsHcz/DEuwyZXenmorqa79AoIdfjEXXKIxKIY=',
    'doubleclick.net': 'uXNT1PzjAVau8b402OMAIGDejKbiXfQX5iXvPASfO/s=',
    'youtube.com': 'LvOZqM9U3cK9V1r05/4lr38ecDvgztKSGdyzL4bvE8c=',
}
# From https://github.com/mozilla-services/shavar-list-creation/blob/93924188fee1c2b708217154524c90ed40d568f3/lists2safebrowsing.py#L154 # noqa
# The URL-splitting pattern is kept character-for-character identical to the
# one in that upstream script so both produce the same canonical form.
# It reads (scheme://)(username(:password)@)hostname(:port)(/...):
# group 1 is the hostname, group 2 the path including its leading slash.
_URL_COMPONENTS_RE = re.compile(
    r"^(?:[a-z]+\:\/\/)?(?:[a-z]+(?:\:[a-z0-9]+)?@)?([^\/^\?^\:]+)(?:\:[0-9]+)?(\/(.*)|$)"  # noqa
)


def canonicalize(d):
    """Return the ``host/path`` canonical form of a URL or bare domain.

    The scheme, credentials, port and fragment are dropped, percent-escapes
    are decoded repeatedly until the string stops changing, the host is
    lowercased with stray dots removed, and the path is re-escaped.

    Args:
        d: URL or domain string. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        ``host + "/" + path`` with no scheme, or ``d`` itself when falsy.

    Raises:
        ValueError: if no hostname can be extracted from ``d`` (for example
            when it starts with ``/`` or has a ``?`` directly after the
            host).
    """
    if not d:
        return d

    # remove tab (0x09), CR (0x0d), LF (0x0a)
    d = re.sub("[\t\r\n]", "", d)

    # remove any URL fragment
    d = d.split("#", 1)[0]

    # repeatedly unescape until no more hex encodings
    while True:
        unescaped = urllib2.unquote(d)
        if unescaped == d:
            break
        d = unescaped

    url_components = _URL_COMPONENTS_RE.match(d)
    if url_components is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("cannot extract a hostname from %r" % (d,))
    host = url_components.group(1)
    # group(2) is None when the alternation's `$` branch matched at a
    # position where the group could not participate; treat that as no path.
    path = (url_components.group(2) or "").lstrip("/")

    # remove leading and trailing dots
    host = re.sub(r"^\.+|\.+$", "", host)
    # replace consecutive dots with a single dot
    host = re.sub(r"\.+", ".", host)
    # lowercase the whole thing
    host = host.lower()

    # percent-escape any characters <= ASCII 32, >= 127, or '#' or '%'
    escaped_path = "".join(
        urllib2.quote(ch)
        if (ord(ch) <= 32 or ord(ch) >= 127 or ch in "#%")
        else ch
        for ch in path
    )

    # Note: we do NOT append the scheme
    # because safebrowsing lookups ignore it
    return host + "/" + escaped_path
def get_safebrowsing_hash(domain):
    """Return the base64-encoded SHA-256 digest of the canonicalized domain.

    Args:
        domain: domain or URL string; it is passed through canonicalize()
            before hashing.

    Returns:
        The base64 text of the digest. It is decoded to a text string so it
        can be compared with the TEST_SET literals and serialized by json
        on Python 3 as well as Python 2.

    Raises:
        TypeError: if canonicalize() returns None (i.e. ``domain`` is None).
    """
    canonical = canonicalize(domain)
    if canonical is None:
        raise TypeError("domain must be a string, not None")
    # hashlib only accepts bytes on Python 3; on Python 2 a plain str is
    # already bytes and is hashed as-is.
    if not isinstance(canonical, bytes):
        canonical = canonical.encode("utf-8")
    digest = hashlib.sha256(canonical).digest()
    # b64encode returns bytes on Python 3; base64 output is pure ASCII.
    return base64.b64encode(digest).decode("ascii")
def main():
    """Hash every listed tracker domain and write the result to output.json.

    Builds a DisconnectParser from the BLOCKLIST and MAPPING URLs, asks it
    for the domains in TRACKER_CATEGORIES, and maps each domain to its
    get_safebrowsing_hash() value. The mapping is written as JSON to
    ``output.json`` in the current working directory.

    Raises:
        AssertionError: if a domain that appears in TEST_SET hashes to a
            value other than the expected one.
    """
    dc = DisconnectParser(
        blocklist_url=BLOCKLIST,
        disconnect_mapping_url=MAPPING
    )
    output = {}
    for domain in dc.get_domains_with_category(TRACKER_CATEGORIES):
        digest = get_safebrowsing_hash(domain)
        output[domain] = digest
        # Explicit check rather than an `assert` statement, so the known
        # vectors are still verified when Python runs with -O.
        expected = TEST_SET.get(domain)
        if expected is not None and digest != expected:
            raise AssertionError(
                "safebrowsing hash mismatch for %r: expected %r, got %r"
                % (domain, expected, digest)
            )
    with open('output.json', 'w') as f:
        json.dump(output, f)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment