Created
November 8, 2019 00:23
-
-
Save aviat/12f678e383a581285a4735fcc0d722e2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Python 3 script to anonymize a JSON file, both keys and data. It preserves the
structure of the document and the shape of URLs found in strings, anonymizing
host, path, params, query and fragment.

Run without arguments to run the tests:

$ python3 anonymize_json.py
Usage: anonymize_json.py path/to/json
running tests...
TestResults(failed=0, attempted=18)

Run with a JSON file as argument to print the anonymized data to stdout:

$ python3 anonymize_json.py b.json
{'jccjjxfn': [{'mvszkvun': [{'pdajmknzgid': [{'fljcffiqfviuwjowkp':
[{'flrwyvxlc': [{'zhsdkaaauramvg': ...
""" | |
import json | |
import random | |
import re | |
import string | |
import sys | |
import urllib.parse | |
def anonymize_str(s):
    """
    Return a random lowercase ASCII string with the same length as ``s``.

    One letter is drawn with ``random.choice`` per character of ``s``, so the
    result depends on the current state of the ``random`` module.

    >>> anonymize_str("q")
    'p'
    >>> anonymize_str("qweqwe")
    'yopumz'
    """
    alphabet = string.ascii_lowercase
    length = len(s)
    # Only the length of the input is used; its characters are never read.
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)
def anonymze_url(url, anonymizer=anonymize_str):
    """
    Anonymize every non-empty component of ``url`` except its scheme.

    ``url`` is split with ``urllib.parse.urlparse``; the netloc, path, params,
    query and fragment are each passed through ``anonymizer`` (in that order)
    when they are non-empty, and the URL is reassembled with ``urlunparse``.
    If ``urlparse`` raises ``ValueError``, the whole input is passed to
    ``anonymizer`` instead.

    >>> def f(s): return "abc"
    >>> f("zzz")
    'abc'
    >>> anonymze_url("zzz", anonymizer=f)
    'abc'
    >>> anonymze_url("https://www.google.com", anonymizer=f)
    'https://abc'
    """
    try:
        parts = urllib.parse.urlparse(url)
    except ValueError:
        # Unparseable input is treated as one opaque string.
        return anonymizer(url)

    # The scheme is deliberately absent from this list: it is kept verbatim.
    for field in ("netloc", "path", "params", "query", "fragment"):
        value = getattr(parts, field)
        if value:
            parts = parts._replace(**{field: anonymizer(value)})
    return urllib.parse.urlunparse(parts)
def anonymze_str_with_urls(s, anonymizer=anonymize_str):
    """
    Anonymize a string that may contain one or more URLs.

    The string is split on every ``<lowercase letters>://`` scheme marker,
    each resulting chunk is anonymized with ``anonymze_url``, and the chunks
    are glued back together with ``" https://"``. The original schemes are
    therefore not kept, and a string that starts with a URL yields a result
    starting with a space (see the second example below).

    >>> def f(s): return "abc"
    >>> f("zzz")
    'abc'
    >>> anonymze_str_with_urls("qweqwe", anonymizer=f)
    'abc'
    >>> anonymze_str_with_urls("http://www.google.com/?q=123#zzz", anonymizer=f)
    ' https://abc?abc#abc'
    >>> anonymze_str_with_urls("zzzz http://www.google.com/?q=123#rrr", anonymizer=f)
    'abc https://abc?abc#abc'
    >>> anonymze_str_with_urls("zzzz http://www.google.com/?q=123#abc qqqqqw https://www.google.fr/?z=eee&aaaa z;z;z;z", anonymizer=f)
    'abc https://abc?abc#abc https://abc?abc'
    """
    chunks = re.split("[a-z]+://", s)
    anonymized_chunks = (
        anonymze_url(chunk, anonymizer=anonymizer) for chunk in chunks
    )
    return " https://".join(anonymized_chunks)
def anonymize(obj):
    """
    Recursively anonymize a value decoded from JSON, keeping its structure.

    Handling per type:

    * ``dict``  -- keys and values are both anonymized.
    * ``list``  -- every item is anonymized.
    * ``str``   -- strings containing ``"://"`` go through
      ``anonymze_str_with_urls``, all others through ``anonymize_str``.
    * ``bool``  -- replaced by a random boolean.
    * ``int``   -- replaced by a random integer between 0 and 100 inclusive.
    * ``float`` -- replaced by a random float between 0 and 100.
    * ``None``  -- returned unchanged (JSON ``null`` carries no data).

    NOTE: two different keys of one dict can be anonymized to the same value,
    in which case the later entry overwrites the earlier one.

    Raises:
        TypeError: if ``obj`` is of any other type.

    >>> anonymize({})
    {}
    >>> anonymize("")
    ''
    >>> anonymize(123)
    17
    >>> anonymize([])
    []
    >>> anonymize(True)
    True
    >>> anonymize({'z': {42: [3, 'z']}})
    {'d': {32: [97, 'c']}}
    """
    if obj is None:
        # json.load() decodes ``null`` to None; there is nothing to hide.
        return None
    if isinstance(obj, dict):
        return {anonymize(k): anonymize(v) for (k, v) in obj.items()}
    if isinstance(obj, list):
        return [anonymize(item) for item in obj]
    if isinstance(obj, str):
        if "://" in obj:
            return anonymze_str_with_urls(obj)
        return anonymize_str(obj)
    # bool is a subclass of int, so it must be tested before int.
    if isinstance(obj, bool):
        # NOTE(review): randint is inclusive, so this yields True for two of
        # the three possible draws. Kept as-is so that seeded output (which
        # the doctests rely on) does not change.
        return bool(random.randint(0, 2))
    if isinstance(obj, int):
        return random.randint(0, 100)
    if isinstance(obj, float):
        # json.load() decodes numbers with a fraction or exponent to float.
        return random.uniform(0, 100)
    raise TypeError("Cannot anonymize type %s" % type(obj))
# Fixed seed: the same input always produces the same anonymized output, and
# the expected values written in the doctests stay valid.
random.seed(1)

if len(sys.argv) == 2:
    # Normal mode: load the JSON file named on the command line and print the
    # anonymized structure.
    with open(sys.argv[1], "r") as f:
        res = json.load(f)
    print(anonymize(res))
else:
    # Wrong number of arguments: show usage, run the doctests, and exit with
    # a non-zero status.
    print("Usage: %s path/to/json" % sys.argv[0])
    print("running tests...")
    import doctest
    print(doctest.testmod())
    sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment