Skip to content

Instantly share code, notes, and snippets.

@aviat
Created November 8, 2019 00:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aviat/12f678e383a581285a4735fcc0d722e2 to your computer and use it in GitHub Desktop.
Save aviat/12f678e383a581285a4735fcc0d722e2 to your computer and use it in GitHub Desktop.
"""
Python 3 script that anonymizes a JSON file, both keys and values, while
preserving its structure. URLs found inside strings keep their overall shape:
the host, path, params, query and fragment are anonymized.
Run without argument to test:
$ python3 anonymize_json.py
Usage: anonymize_json.py path/to/json
running tests...
TestResults(failed=0, attempted=18)
Run with a JSON file as argument to display the anonymized JSON to stdout:
$ python3 anonymize_json.py b.json
{'jccjjxfn': [{'mvszkvun': [{'pdajmknzgid': [{'fljcffiqfviuwjowkp':
[{'flrwyvxlc': [{'zhsdkaaauramvg': ...
"""
import json
import random
import re
import string
import sys
import urllib.parse
def anonymize_str(s):
    """
    Return a random lowercase ASCII string of the same length as ``s``.

    One letter is drawn with ``random.choice`` for every character of ``s``,
    so the result depends on the current state of the ``random`` module.

    >>> anonymize_str("q")
    'p'
    >>> anonymize_str("qweqwe")
    'yopumz'
    """
    alphabet = string.ascii_lowercase
    # Only the number of characters matters, not the characters themselves.
    drawn = [random.choice(alphabet) for _ in s]
    return ''.join(drawn)
def anonymze_url(url, anonymizer=anonymize_str):
    """
    Anonymize the components of ``url`` while keeping its scheme.

    ``url`` is parsed with ``urllib.parse.urlparse``. Each non-empty
    component among netloc, path, params, query and fragment is replaced by
    ``anonymizer(component)``, in that order, and the URL is reassembled with
    ``urllib.parse.urlunparse``. If parsing raises ``ValueError``, the whole
    string is passed to ``anonymizer`` instead.

    >>> def f(s): return "abc"
    >>> f("zzz")
    'abc'
    >>> anonymze_url("zzz", anonymizer=f)
    'abc'
    >>> anonymze_url("https://www.google.com", anonymizer=f)
    'https://abc'
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # Not parseable as a URL: anonymize it as an opaque string.
        return anonymizer(url)

    replacements = {}
    for field in ("netloc", "path", "params", "query", "fragment"):
        component = getattr(parsed, field)
        if component:
            replacements[field] = anonymizer(component)
    return urllib.parse.urlunparse(parsed._replace(**replacements))
# Matches the "scheme://" prefix that marks the start of a URL inside a string.
# The capturing group makes split() return the matched prefixes as well.
_URL_SCHEME_RE = re.compile(r"([a-z]+://)", re.IGNORECASE)


def anonymze_str_with_urls(s, anonymizer=anonymize_str):
    """
    Anonymize a string that may contain one or more URLs.

    The string is cut at every ``scheme://`` prefix. Text before the first
    URL is passed to ``anonymizer`` directly. Each URL, from its scheme up to
    the next scheme or the end of the string, is passed to ``anonymze_url``
    together with its scheme, so the original scheme is kept and the host is
    parsed as a host.

    Parsing the remainder without its scheme would make ``urlparse`` read
    ``host:8080/x`` or ``user:pw@host`` as ``scheme:path``; ``anonymze_url``
    never replaces the scheme, so that leading text would be emitted in clear.

    The anonymized pieces are joined with a single space so that every URL
    stays delimited from the text that precedes it.

    >>> def f(s): return "abc"
    >>> f("zzz")
    'abc'
    >>> anonymze_str_with_urls("qweqwe", anonymizer=f)
    'abc'
    >>> anonymze_str_with_urls("http://www.google.com/?q=123#zzz", anonymizer=f)
    'http://abc/abc?abc#abc'
    >>> anonymze_str_with_urls("zzzz http://www.google.com/?q=123#rrr", anonymizer=f)
    'abc http://abc/abc?abc#abc'
    >>> anonymze_str_with_urls("zzzz http://www.google.com/?q=123#abc qqqqqw https://www.google.fr/?z=eee&aaaa z;z;z;z", anonymizer=f)
    'abc http://abc/abc?abc#abc https://abc/abc?abc'
    """
    # With one capturing group, split() returns
    # [leading_text, scheme, remainder, scheme, remainder, ...].
    leading_text, *url_parts = _URL_SCHEME_RE.split(s)

    anonymized = []
    if leading_text:
        anonymized.append(anonymizer(leading_text))
    for scheme, remainder in zip(url_parts[0::2], url_parts[1::2]):
        anonymized.append(anonymze_url(scheme + remainder, anonymizer=anonymizer))
    return " ".join(anonymized)
def anonymize(obj):
    """
    Recursively anonymize a JSON-like value while keeping its structure.

    - ``None`` (JSON ``null``) is returned unchanged.
    - ``bool`` becomes a random ``bool``.
    - ``int`` becomes a random integer between 0 and 100 inclusive.
    - ``float`` becomes a random float drawn with ``random.uniform(0, 100)``.
    - ``str`` becomes random letters; strings containing ``"://"`` go through
      ``anonymze_str_with_urls``, the others through ``anonymize_str``.
    - ``list`` and ``dict`` are rebuilt with every item, key and value
      anonymized. Keys are drawn at random, so two keys may end up identical,
      in which case the later entry replaces the earlier one.

    Raises ``TypeError`` (a subclass of ``Exception``) for any other type.

    >>> anonymize({})
    {}
    >>> anonymize("")
    ''
    >>> anonymize(None) is None
    True
    >>> anonymize(123)
    17
    >>> anonymize([])
    []
    >>> anonymize(True)
    True
    >>> anonymize({'z': {42: [3, 'z']}})
    {'d': {32: [97, 'c']}}
    """
    if obj is None:
        return None
    # bool is a subclass of int, so it has to be tested before int.
    if isinstance(obj, bool):
        # NOTE: randint is inclusive on both ends, so True is drawn two times
        # out of three. Kept as-is because the exact-value doctests in this
        # file rely on the random stream produced after random.seed(1).
        return bool(random.randint(0, 2))
    if isinstance(obj, int):
        return random.randint(0, 100)
    if isinstance(obj, float):
        return random.uniform(0, 100)
    if isinstance(obj, str):
        if "://" in obj:
            return anonymze_str_with_urls(obj)
        return anonymize_str(obj)
    if isinstance(obj, dict):
        return {anonymize(k): anonymize(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [anonymize(item) for item in obj]
    raise TypeError("Cannot anonymize type %s" % type(obj))
# Fixed seed: the random output is the same on every run.
random.seed(1)

if len(sys.argv) == 2:
    # Exactly one argument: the path of the JSON file to anonymize.
    with open(sys.argv[1], "r") as json_file:
        res = json.load(json_file)
    print(anonymize(res))
else:
    # Any other argument count: show the usage, run the doctests, exit with 1.
    print("Usage: %s path/to/json" % sys.argv[0])
    print("running tests...")
    import doctest
    print(doctest.testmod())
    sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment