Last active
September 11, 2019 07:29
-
-
Save ablanathtanalba/bada7a50943e28022b55a79ace40344f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import math | |
# python version of estimate max entropy function | |
# manually tested, produces same results as the js version | |
def estimate_max_entropy(s):
    """Estimate the maximum entropy, in bits, of the string *s*.

    Python port of Privacy Badger's JS estimateMaxEntropy(): guess the
    smallest character class the string could have been drawn from, then
    return log2(class size) * length. Strings longer than the cutoff just
    return their length.

    Bug fixes vs. the original gist:
    - the class-membership loop tested `char in str` (always true) and
      toggled the flag every iteration, so the class detection never
      actually checked membership in `group`;
    - the fallback seeded max_char_code with char_codes[1], raising
      IndexError for single-character strings.
    """
    MAX_LS_LEN_FOR_ENTROPY_EST = 256
    SEPS = "._-"
    BIN = "01"
    DEC = "0123456789"
    HEX = "abcdef" + DEC
    ALPHA = "abcdefghijklmnopqrstuvwxyz"
    ALPHANUM = ALPHA + DEC
    B64 = ALPHANUM + ALPHA.upper() + "/+"
    URL = ALPHANUM + ALPHA.upper() + "~%"

    # Very long values: skip per-character analysis, use length directly.
    if len(s) > MAX_LS_LEN_FOR_ENTROPY_EST:
        return len(s)

    max_symbols = None

    # Single-case strings are normalized so the lowercase classes match.
    same_case = s.lower() == s or s.upper() == s
    if same_case:
        s = s.lower()

    # Classes ordered smallest-first; first full match wins.
    chr_classes = [BIN, DEC, HEX, ALPHA, ALPHANUM, B64, URL]
    # this is the code block to comment out on the second run
    for chr_class in chr_classes:
        group = chr_class + SEPS
        # FIX: check every character against the class (plus separators).
        if all(char in group for char in s):
            max_symbols = len(chr_class)
            break

    if not max_symbols:
        # No class matched: fall back to the spread of character codes.
        # FIX: min()/max() handle any non-empty string, including length 1.
        char_codes = [ord(char) for char in s]
        max_symbols = max(char_codes) - min(char_codes) + 1

    # bits per symbol * number of symbols
    max_bits = math.log2(max_symbols) * len(s)
    return max_bits
#### now the tricky stuff, import the test results | |
#### filter them by type, find the pixel ones, save the substrings | |
#### run those substrings through the estimate max entropy function | |
#### save those results, | |
#### then rinse and repeat but with the block of code in question commented out | |
#### compare the results, see what's good | |
# big fat json blob of test results | |
# Load the raw JSON blob of test results.
with open('./results-10k-cookieshare.json') as data_file:
    data = json.load(data_file)

# Map each pixel-tracking match substring to its estimated max entropy.
filtered_results = {
    item["details"]["substring"]: estimate_max_entropy(item["details"]["substring"])
    for item in data["snitch_map"][""]
    if item["type"] == "pixel"
}

# will probs rewrite this to write to a new file and save it there
# if so, will append the same results when code block in question is commented out
print(filtered_results)
buggy, but it's late, coming back to this
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is run on my local machine, so the directory paths assume as much. I downloaded the raw JSON results from Bennett's Jupyter notebook.