Last active
September 11, 2019 07:29
-
-
Save ablanathtanalba/bada7a50943e28022b55a79ace40344f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import math | |
# python version of estimate max entropy function | |
# manually tested, produces same results as the js version | |
def estimate_max_entropy(s):
    """Estimate the maximum entropy, in bits, of the string *s*.

    Python port of Privacy Badger's JS estimateMaxEntropy(): guess the
    smallest character class the string could have been drawn from, then
    return log2(class size) * length. Strings longer than the cutoff just
    return their length.

    Bug fixes vs. the original gist:
    - the class-membership loop tested `char in str` (always true) and
      toggled the flag every iteration, so the class detection never
      actually checked membership in `group`;
    - the fallback seeded max_char_code with char_codes[1], raising
      IndexError for single-character strings.
    """
    MAX_LS_LEN_FOR_ENTROPY_EST = 256
    SEPS = "._-"
    BIN = "01"
    DEC = "0123456789"
    HEX = "abcdef" + DEC
    ALPHA = "abcdefghijklmnopqrstuvwxyz"
    ALPHANUM = ALPHA + DEC
    B64 = ALPHANUM + ALPHA.upper() + "/+"
    URL = ALPHANUM + ALPHA.upper() + "~%"

    # Very long values: skip per-character analysis, use length directly.
    if len(s) > MAX_LS_LEN_FOR_ENTROPY_EST:
        return len(s)

    max_symbols = None

    # Single-case strings are normalized so the lowercase classes match.
    same_case = s.lower() == s or s.upper() == s
    if same_case:
        s = s.lower()

    # Classes ordered smallest-first; first full match wins.
    chr_classes = [BIN, DEC, HEX, ALPHA, ALPHANUM, B64, URL]
    # this is the code block to comment out on the second run
    for chr_class in chr_classes:
        group = chr_class + SEPS
        # FIX: check every character against the class (plus separators).
        if all(char in group for char in s):
            max_symbols = len(chr_class)
            break

    if not max_symbols:
        # No class matched: fall back to the spread of character codes.
        # FIX: min()/max() handle any non-empty string, including length 1.
        char_codes = [ord(char) for char in s]
        max_symbols = max(char_codes) - min(char_codes) + 1

    # bits per symbol * number of symbols
    max_bits = math.log2(max_symbols) * len(s)
    return max_bits
#### now the tricky stuff, import the test results | |
#### filter them by type, find the pixel ones, save the substrings | |
#### run those substrings through the estimate max entropy function | |
#### save those results, | |
#### then rinse and repeat but with the block of code in question commented out | |
#### compare the results, see what's good | |
# big fat json blob of test results | |
# Load the raw JSON blob of test results.
with open('./results-10k-cookieshare.json') as data_file:
    data = json.load(data_file)

# Map each pixel-tracking match substring to its estimated max entropy.
filtered_results = {
    item["details"]["substring"]: estimate_max_entropy(item["details"]["substring"])
    for item in data["snitch_map"][""]
    if item["type"] == "pixel"
}

# will probs rewrite this to write to a new file and save it there
# if so, will append the same results when code block in question is commented out
print(filtered_results)
buggy, but it's late, coming back to this
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is run on my local machine, so the directory paths assume as much. I downloaded the raw JSON results from Bennett's Jupyter notebook.