# englehardt/generate_hash_list.py

## generate_hash_list.py
import base64
import hashlib
import json
import re
import urllib2

from trackingprotection_tools import DisconnectParser

# Categories handed to DisconnectParser.get_domains_with_category() in main().
TRACKER_CATEGORIES = [
    'Advertising',
    'Analytics',
    'Social',
    'Content',
    'Disconnect',
]

# URLs given to DisconnectParser as blocklist_url / disconnect_mapping_url.
BLOCKLIST = 'https://raw.githubusercontent.com/mozilla-services/shavar-prod-lists/master/disconnect-blacklist.json'  # noqa
MAPPING = 'https://raw.githubusercontent.com/mozilla-services/shavar-list-creation/master/disconnect_mapping.json'  # noqa

# Known-good domain -> hash pairs; main() compares its computed hashes
# against these whenever one of the domains shows up in the list.
TEST_SET = {
    'google.ee': 'BIjZakI3ChrLy6S7OTUampyXO+WUJFp4dYEfvXDQTtc=',
    'adservice.google.com': 'ORbzeOvsHcz/DEuwyZXenmorqa79AoIdfjEXXKIxKIY=',
    'doubleclick.net': 'uXNT1PzjAVau8b402OMAIGDejKbiXfQX5iXvPASfO/s=',
    'youtube.com': 'LvOZqM9U3cK9V1r05/4lr38ecDvgztKSGdyzL4bvE8c=',
}


# From https://github.com/mozilla-services/shavar-list-creation/blob/93924188fee1c2b708217154524c90ed40d568f3/lists2safebrowsing.py#L154  # noqa
# Splits (scheme://)(username(:password)@)hostname(:port)(/...):
# group 1 is the hostname, group 2 the path including its leading slash.
_URL_COMPONENTS_RE = re.compile(
    r"^(?:[a-z]+\:\/\/)?(?:[a-z]+(?:\:[a-z0-9]+)?@)?([^\/^\?^\:]+)(?:\:[0-9]+)?(\/(.*)|$)"  # noqa
)


def canonicalize(d):
    """Return the canonical ``host/path`` form of a URL or bare domain.

    Steps, in order: strip tab/CR/LF characters, drop the ``#fragment``,
    percent-decode repeatedly until the string stops changing, split off
    scheme, credentials and port, normalise the host (trim leading and
    trailing dots, collapse runs of dots, lowercase) and percent-escape
    selected path characters.

    Args:
        d: URL or domain string. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        ``host + "/" + path``. The scheme is deliberately left out because
        safebrowsing lookups ignore it.

    Raises:
        ValueError: if no hostname can be extracted from ``d`` (for
            example ``"example.com?x=1"``, where the host is not followed
            by a port, a slash or the end of the string).
    """
    if not d:
        return d

    # remove tab (0x09), CR (0x0d), LF (0x0a)
    d = re.sub(r"[\t\r\n]", "", d)

    # remove any URL fragment
    d = d.split("#", 1)[0]

    # repeatedly unescape until no more hex encodings
    while True:
        unescaped = urllib2.unquote(d)
        # if decoding had no effect, stop
        if unescaped == d:
            break
        d = unescaped

    url_components = _URL_COMPONENTS_RE.match(d)
    if url_components is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("cannot extract a hostname from %r" % (d,))
    host = url_components.group(1)
    # group(2) is None-or-empty for a bare host; drop all leading slashes
    path = (url_components.group(2) or "").lstrip("/")

    # remove leading and trailing dots
    host = re.sub(r"^\.+|\.+$", "", host)
    # replace consecutive dots with a single dot
    host = re.sub(r"\.+", ".", host)
    # lowercase the whole thing
    host = host.lower()

    # percent-escape any characters <= ASCII 32, >= 127, or '#' or '%'
    _path = "".join(
        urllib2.quote(ch)
        if (ord(ch) <= 32 or ord(ch) >= 127 or ch in "#%")
        else ch
        for ch in path
    )

    # Note: we do NOT append the scheme
    # because safebrowsing lookups ignore it
    return host + "/" + _path


def get_safebrowsing_hash(domain):
    """Return the base64-encoded SHA-256 digest of ``canonicalize(domain)``."""
    canonical = canonicalize(domain)
    sha256_digest = hashlib.sha256(canonical).digest()
    return base64.b64encode(sha256_digest)


def main():
    """Write a domain -> safebrowsing hash mapping to ``output.json``.

    The domains are whatever ``DisconnectParser.get_domains_with_category``
    returns for ``TRACKER_CATEGORIES``. Every domain that also appears in
    ``TEST_SET`` has its computed hash compared with the known-good value;
    a mismatch aborts the run before the output file is written.

    Raises:
        AssertionError: if a computed hash differs from its ``TEST_SET``
            entry.
    """
    dc = DisconnectParser(
        blocklist_url=BLOCKLIST,
        disconnect_mapping_url=MAPPING
    )

    output = {}
    for domain in dc.get_domains_with_category(TRACKER_CATEGORIES):
        digest = get_safebrowsing_hash(domain)
        output[domain] = digest
        # Raised explicitly instead of via ``assert`` so the self-check
        # still runs when Python is started with -O.
        if domain in TEST_SET and digest != TEST_SET[domain]:
            raise AssertionError(
                "hash mismatch for %s: got %s, expected %s"
                % (domain, digest, TEST_SET[domain])
            )

    with open('output.json', 'w') as f:
        json.dump(output, f)


# Generate output.json only when run as a script, not when imported.
if __name__ == '__main__':
    main()
	import base64
	import hashlib
	import json
	import re
	import urllib2

	from trackingprotection_tools import DisconnectParser

	# Categories handed to DisconnectParser.get_domains_with_category() in main().
	TRACKER_CATEGORIES = [
	    'Advertising',
	    'Analytics',
	    'Social',
	    'Content',
	    'Disconnect',
	]

	# URLs given to DisconnectParser as blocklist_url / disconnect_mapping_url.
	BLOCKLIST = 'https://raw.githubusercontent.com/mozilla-services/shavar-prod-lists/master/disconnect-blacklist.json' # noqa
	MAPPING = 'https://raw.githubusercontent.com/mozilla-services/shavar-list-creation/master/disconnect_mapping.json' # noqa

	# Known-good domain -> hash pairs; main() compares its computed hashes
	# against these whenever one of the domains shows up in the list.
	TEST_SET = {
	    'google.ee': 'BIjZakI3ChrLy6S7OTUampyXO+WUJFp4dYEfvXDQTtc=',
	    'adservice.google.com': 'ORbzeOvsHcz/DEuwyZXenmorqa79AoIdfjEXXKIxKIY=',
	    'doubleclick.net': 'uXNT1PzjAVau8b402OMAIGDejKbiXfQX5iXvPASfO/s=',
	    'youtube.com': 'LvOZqM9U3cK9V1r05/4lr38ecDvgztKSGdyzL4bvE8c=',
	}


	# From https://github.com/mozilla-services/shavar-list-creation/blob/93924188fee1c2b708217154524c90ed40d568f3/lists2safebrowsing.py#L154 # noqa
	# Splits (scheme://)(username(:password)@)hostname(:port)(/...):
	# group 1 is the hostname, group 2 the path including its leading slash.
	_URL_COMPONENTS_RE = re.compile(
	    r"^(?:[a-z]+\:\/\/)?(?:[a-z]+(?:\:[a-z0-9]+)?@)?([^\/^\?^\:]+)(?:\:[0-9]+)?(\/(.*)|$)"  # noqa
	)


	def canonicalize(d):
	    """Return the canonical ``host/path`` form of a URL or bare domain.

	    Steps, in order: strip tab/CR/LF characters, drop the ``#fragment``,
	    percent-decode repeatedly until the string stops changing, split off
	    scheme, credentials and port, normalise the host (trim leading and
	    trailing dots, collapse runs of dots, lowercase) and percent-escape
	    selected path characters.

	    Args:
	        d: URL or domain string. Falsy values (``None``, ``""``) are
	            returned unchanged.

	    Returns:
	        ``host + "/" + path``. The scheme is deliberately left out because
	        safebrowsing lookups ignore it.

	    Raises:
	        ValueError: if no hostname can be extracted from ``d`` (for
	            example ``"example.com?x=1"``, where the host is not followed
	            by a port, a slash or the end of the string).
	    """
	    if not d:
	        return d

	    # remove tab (0x09), CR (0x0d), LF (0x0a)
	    # (the alternation must not be escaped: "\|" matches a literal pipe)
	    d = re.sub(r"[\t\r\n]", "", d)

	    # remove any URL fragment
	    d = d.split("#", 1)[0]

	    # repeatedly unescape until no more hex encodings
	    while True:
	        unescaped = urllib2.unquote(d)
	        # if decoding had no effect, stop
	        if unescaped == d:
	            break
	        d = unescaped

	    url_components = _URL_COMPONENTS_RE.match(d)
	    if url_components is None:
	        # Fail with a clear message instead of an AttributeError on None.
	        raise ValueError("cannot extract a hostname from %r" % (d,))
	    host = url_components.group(1)
	    # group(2) is None-or-empty for a bare host; drop all leading slashes
	    path = (url_components.group(2) or "").lstrip("/")

	    # remove leading and trailing dots
	    host = re.sub(r"^\.+|\.+$", "", host)
	    # replace consecutive dots with a single dot
	    host = re.sub(r"\.+", ".", host)
	    # lowercase the whole thing
	    host = host.lower()

	    # percent-escape any characters <= ASCII 32, >= 127, or '#' or '%'
	    _path = "".join(
	        urllib2.quote(ch)
	        if (ord(ch) <= 32 or ord(ch) >= 127 or ch in "#%")
	        else ch
	        for ch in path
	    )

	    # Note: we do NOT append the scheme
	    # because safebrowsing lookups ignore it
	    return host + "/" + _path


	def get_safebrowsing_hash(domain):
	    """Return the base64-encoded SHA-256 digest of ``canonicalize(domain)``."""
	    canonical = canonicalize(domain)
	    sha256_digest = hashlib.sha256(canonical).digest()
	    return base64.b64encode(sha256_digest)


	def main():
	    """Write a domain -> safebrowsing hash mapping to ``output.json``.

	    The domains are whatever ``DisconnectParser.get_domains_with_category``
	    returns for ``TRACKER_CATEGORIES``. Every domain that also appears in
	    ``TEST_SET`` has its computed hash compared with the known-good value;
	    a mismatch aborts the run before the output file is written.

	    Raises:
	        AssertionError: if a computed hash differs from its ``TEST_SET``
	            entry.
	    """
	    dc = DisconnectParser(
	        blocklist_url=BLOCKLIST,
	        disconnect_mapping_url=MAPPING
	    )

	    output = {}
	    for domain in dc.get_domains_with_category(TRACKER_CATEGORIES):
	        digest = get_safebrowsing_hash(domain)
	        output[domain] = digest
	        # Raised explicitly instead of via ``assert`` so the self-check
	        # still runs when Python is started with -O.
	        if domain in TEST_SET and digest != TEST_SET[domain]:
	            raise AssertionError(
	                "hash mismatch for %s: got %s, expected %s"
	                % (domain, digest, TEST_SET[domain])
	            )

	    with open('output.json', 'w') as f:
	        json.dump(output, f)


	# Generate output.json only when run as a script, not when imported.
	if __name__ == '__main__':
	    main()