Skip to content

Instantly share code, notes, and snippets.

@dvdbng
Created July 1, 2016 09:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dvdbng/16bd04f66fbf6593276aec2ee3dd2b96 to your computer and use it in GitHub Desktop.
Save dvdbng/16bd04f66fbf6593276aec2ee3dd2b96 to your computer and use it in GitHub Desktop.
from collections import Counter
import heapq
class SummaryDict(Counter):
"""
A counting dict that aggregates uncommon values in a "Others" key.
The full set of keys is kept in a separate counter, in case they need to
be added back to the main counter.
>>> a = SummaryDict(threshold=0.1)
>>> for key, amount in dict(a=5, b=15, c=30, d=5, e=3, f=2, g=40).items():
... a.increment(key, amount)
>>> import json
>>> json.dumps(a, sort_keys=True)
'{"Other values": 15, "b": 15, "c": 30, "g": 40}'
"""
def __init__(self, threshold=0.001, others_key="Other values"):
self.threshold = threshold
self.others_key = others_key
self.total_count = 0
self.full_counter = Counter()
self.heap = []
super(SummaryDict, self).__init__()
def increment(self, key, amount=1):
"""
Increment given key by the set amount (has to be positive)
"""
assert amount >= 0
heap = self.heap
others_key = self.others_key
full_counter = self.full_counter
new_value = full_counter[key] + amount
self.total_count += amount
threshold = self.threshold * self.total_count
# Remove small keys from main dict
while len(heap) and heap[0][0] < threshold:
oldkey = heapq.heappop(heap)[1]
oldval = full_counter[oldkey]
if oldval > threshold:
heapq.heappush(heap, (oldval, oldkey))
else:
self[others_key] += self.pop(oldkey, 0)
if new_value > threshold:
if key not in self:
self[others_key] -= new_value - amount
self[key] = new_value
heapq.heappush(heap, (new_value, key))
else:
self[others_key] += amount
full_counter[key] = new_value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment