@tkaemming
Last active April 21, 2016 01:25
# bindings module: encodes nested JSON-like structures into a flat table of
# bindings keyed by identifier, so repeated values are stored once and
# referenced by id. (The splitter script below imports this as ``bindings``.)
import itertools
import json
import sys
import zlib

from collections import (
    defaultdict,
    Mapping,
    Sequence,
    Set,
)


unset = object()

TYPE_REFERENCE = 0
TYPE_VALUE = 1
TYPE_MAPPING = 2
TYPE_SEQUENCE = 3
class Encoder(object):
    def __init__(self):
        sequence = itertools.count()

        def make_value():
            identifier = str(next(sequence))
            return identifier, 0

        # Maps a value signature to an (identifier, reference count) pair,
        # assigning a fresh identifier the first time a signature is seen.
        self.__identifiers = defaultdict(make_value)

    def bindings(self):
        def rewrite_value((flag, type, name, annotations)):
            return (flag, type, name, tuple(annotations))

        return {identifier: rewrite_value(value) for value, (identifier, _) in self.__identifiers.iteritems()}

    def statistics(self):
        return {identifier: count for (identifier, count) in self.__identifiers.values()}

    def encode(self, value, preprocess=None, postprocess=None):
        if preprocess is None:
            preprocess = lambda value, key=None: (value, set())

        if postprocess is None:
            postprocess = lambda value, key=None: (value, set())

        def process(value, key=None):
            t = type(value).__name__

            annotations = frozenset()

            value, updates = preprocess(value, key)
            annotations = annotations | updates

            if isinstance(value, Mapping):
                flag = TYPE_MAPPING
                value = tuple((process(k), process(v, k)) for k, v in value.iteritems())
            elif isinstance(value, (Sequence, Set)) and not isinstance(value, basestring):
                flag = TYPE_SEQUENCE
                value = tuple(process(v) for v in value)
            else:
                flag = TYPE_VALUE
                value = str(value)

            value, updates = postprocess(value, key)
            annotations = annotations | updates

            signature = (flag, t, value, annotations)
            identifier, count = self.__identifiers[signature]
            self.__identifiers[signature] = (identifier, count + 1)

            return (TYPE_REFERENCE, identifier)

        return process(value)
def decode((bindings, value)):
    def process(value):
        value = list(value)
        flag = value.pop(0)
        if flag == TYPE_REFERENCE:
            identifier, = value
            return process(bindings[identifier])
        elif flag == TYPE_VALUE:
            type, value, annotations = value
            return value
        elif flag == TYPE_MAPPING:
            type, value, annotations = value
            return {process(k): process(v) for (k, v) in value}
        elif flag == TYPE_SEQUENCE:
            type, value, annotations = value
            return [process(i) for i in value]
        else:
            raise AssertionError('unexpected flag')

    return process(value)
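

# A minimal round-trip sketch (identifiers are illustrative; they depend on
# the order in which values are first seen):
#
#   encoder = Encoder()
#   encoded = encoder.encode({'a': 1, 'b': 1})
#   # encoded is a reference tuple such as (TYPE_REFERENCE, '3'); both 1s
#   # share a single binding because they produce the same signature.
#   decode((encoder.bindings(), encoded))
#   # => {'a': '1', 'b': '1'}  (scalars are stringified by encode)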
def rewrite(encoder, payload):
    def rewrite_frame(frame):
        frame['vars'] = encoder.encode(frame.get('vars', {}))
        return frame

    def rewrite_stacktrace(stacktrace):
        stacktrace['frames'] = map(rewrite_frame, stacktrace['frames'])
        return stacktrace

    def rewrite_exception(exception):
        exception['stacktrace'] = rewrite_stacktrace(exception['stacktrace'])
        return exception

    # Encode the high-cardinality parts of the payload (extra data and frame
    # locals), leaving the rest of the event untouched.
    payload['extra'] = encoder.encode(payload['extra'])
    payload['sentry.interfaces.Exception']['values'] = map(
        rewrite_exception,
        payload['sentry.interfaces.Exception']['values'],
    )

    return inline(encoder, {
        'data': payload,
        'bindings': encoder.bindings(),
    })
def inline(encoder, data):
    statistics = encoder.statistics()

    remove = set()

    def inline(value):
        value = list(value)
        flag = value.pop(0)
        if flag == TYPE_REFERENCE:
            identifier, = value
            if statistics[identifier] == 1:
                remove.add(identifier)
                return bindings[identifier]
        elif flag == TYPE_SEQUENCE:
            type, value, annotations = value
            return flag, type, tuple(map(inline, value)), annotations
        elif flag == TYPE_MAPPING:
            type, value, annotations = value
            value = tuple((key, inline(value)) for key, value in value)
            return flag, type, value, annotations
        return (flag,) + tuple(value)

    bindings = data['bindings']
    for key, value in bindings.iteritems():
        bindings[key] = inline(value)

    for identifier in remove:
        bindings.pop(identifier)

    return data
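

# Sketch of what inline() does, with made-up identifiers: given bindings
#
#   {'0': (1, 'str', 'a', ()),
#    '1': (1, 'int', '1', ()),
#    '2': (2, 'dict', (((0, '0'), (0, '1')),), ())}
#
# where '1' is referenced exactly once, the reference to it inside '2' is
# replaced by its binding and '1' is dropped from the table (mapping keys are
# left as references; only values are folded in):
#
#   {'0': (1, 'str', 'a', ()),
#    '2': (2, 'dict', (((0, '0'), (1, 'int', '1', ())),), ())}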
def dump(name):
    encoder = Encoder()
    file = open(name)
    sys.stdout.write(
        json.dumps(
            rewrite(
                encoder,
                json.load(file),
            ),
            indent=2,
        )
    )
    sys.stdout.write('\n')


def stats(*names):
    for name in names:
        file = open(name)
        sys.stdout.write('{}\t'.format(name))
        original = json.dumps(json.load(file))
        file.seek(0)
        encoder = Encoder()
        processed = json.dumps(rewrite(encoder, json.load(file)))
        sys.stdout.write('{}\t{}\t{:.2%}\t{}\t{}\t{:.2%}\n'.format(
            len(original),
            len(processed),
            float(len(processed)) / len(original),
            len(zlib.compress(original)),
            len(zlib.compress(processed)),
            float(len(zlib.compress(processed))) / len(zlib.compress(original)),
        ))


if __name__ == '__main__':
    command = {
        'dump': dump,
        'stats': stats,
    }[sys.argv[1]]
    command(*sys.argv[2:])
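

# Usage sketch (the file and event names are assumptions; the script expects
# paths to JSON-serialized Sentry event payloads):
#
#   python bindings.py dump event.json       # print the rewritten payload
#   python bindings.py stats a.json b.json   # raw vs. encoded sizes, plain and zlib-compressed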
# Splitter script: splits a Sentry event payload into the part that is shared
# across similar events and the unique values, encoding the unique values with
# the Encoder defined in the bindings module above.
import json
import sys
import zlib

from bindings import Encoder, inline


unset = object()
class MappingSplitter(object):
    """Splits a mapping by delegating each configured key to its own splitter."""
    def __init__(self, splitters):
        self.splitters = splitters

    def split(self, value):
        left = value.copy()
        right = {}
        for key, splitter in self.splitters.iteritems():
            item = left.pop(key, unset)
            if item is not unset:
                result = splitter.split(item)
                if result[0] is not unset:
                    left[key] = result[0]
                right[key] = result[1]
        return left, right

    def combine(self, left, right):
        result = left.copy()
        for key, value in right.iteritems():
            result[key] = self.splitters[key].combine(left.get(key), value)
        return result
class SequenceSplitter(object):
    """Applies the same splitter to every item of a sequence."""
    def __init__(self, splitter):
        self.splitter = splitter

    def split(self, value):
        left = []
        right = []
        for item in value:
            a, b = self.splitter.split(item)
            left.append(a)
            right.append(b)
        return left, right

    def combine(self, left, right):
        assert len(left) == len(right)
        result = []
        for l, r in zip(left, right):
            result.append(self.splitter.combine(l, r))
        return result
class ValueExtractor(object):
    def __init__(self, encoder=lambda i: i):
        self.encoder = encoder

    def split(self, value):
        return unset, self.encoder(value)

    def combine(self, left, right):
        return right
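

# A small sketch of how the splitters compose (keys and values are made up):
#
#   splitter = MappingSplitter({
#       'tags': ValueExtractor(),
#       'exceptions': SequenceSplitter(MappingSplitter({'vars': ValueExtractor()})),
#   })
#   shared, unique = splitter.split({
#       'message': 'boom',
#       'tags': {'env': 'prod'},
#       'exceptions': [{'type': 'ValueError', 'vars': {'x': 1}}],
#   })
#   # shared == {'message': 'boom', 'exceptions': [{'type': 'ValueError'}]}
#   # unique == {'tags': {'env': 'prod'}, 'exceptions': [{'vars': {'x': 1}}]}
#   # splitter.combine(shared, unique) reassembles the original mapping.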
def rewrite(data):
    encoder = Encoder()
    encode = encoder.encode
    # encode = lambda value: value
    splitter = MappingSplitter({
        'id': ValueExtractor(),
        'project': ValueExtractor(),
        'release': ValueExtractor(),
        'message': ValueExtractor(),
        'datetime': ValueExtractor(),
        'tags': ValueExtractor(encode),
        'extra': ValueExtractor(encode),
        'received': ValueExtractor(),
        'modules': ValueExtractor(),
        'sentry.interfaces.Message': MappingSplitter({
            'params': ValueExtractor(encode),
        }),
        'sentry.interfaces.Exception': MappingSplitter({
            'values': SequenceSplitter(
                MappingSplitter({
                    'stacktrace': MappingSplitter({
                        'frames': SequenceSplitter(
                            MappingSplitter({
                                'vars': ValueExtractor(encode),
                            }),
                        ),
                    }),
                })
            ),
        }),
        'sentry.interfaces.http.Http': ValueExtractor(encode),
        'sentry.interfaces.user.User': ValueExtractor(encode),
    })

    shared, unique = splitter.split(data)

    return shared, inline(encoder, {
        'bindings': encoder.bindings(),
        'data': unique,
    })
def dump(name='/dev/stdin'):
    file = open(name)
    shared, unique = rewrite(json.load(file))
    for data in (shared, unique):
        sys.stdout.write(json.dumps(data, indent=2))
        sys.stdout.write('\n')
def stats(*names):
    for name in names:
        file = open(name)
        sys.stdout.write('{}\t'.format(name))
        data = json.load(file)
        original = json.dumps(data)
        shared, unique = map(json.dumps, rewrite(data))

        def zlen(value):
            return len(zlib.compress(value))

        sys.stdout.write(
            (' '.join(['{}\t{}\t{:.2%}\t{}\t{:.2%}'] * 2) + '\n').format(
                len(original),
                len(shared),
                float(len(shared)) / len(original),
                len(unique),
                float(len(unique)) / len(original),
                zlen(original),
                zlen(shared),
                float(zlen(shared)) / zlen(original),
                zlen(unique),
                float(zlen(unique)) / zlen(original),
            ),
        )
if __name__ == '__main__':
    command = {
        'dump': dump,
        'stats': stats,
    }[sys.argv[1]]
    command(*sys.argv[2:])
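

# Usage sketch (the file name split.py and the event paths are assumptions):
#
#   python split.py dump event.json         # print the shared and unique halves
#   python split.py stats a.json b.json     # raw vs. split sizes, plain and zlib-compressed
#   python split.py dump < event.json       # dump() defaults to reading /dev/stdin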