Created
September 25, 2023 20:42
-
-
Save alexforster/3cd1c51f3d1988814e365dab9d92cb57 to your computer and use it in GitHub Desktop.
Streaming Heavy Hitters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import random | |
from typing import * | |
from dataclasses import dataclass | |
from ipaddress import IPv4Address | |
@dataclass | |
class HeavyHittersTableEntry: | |
item: Any | |
count: int | |
class HeavyHittersTable: | |
def __init__(self, capacity: int): | |
self.capacity = capacity # type: int | |
self.entries = list() # type: List[HeavyHittersTableEntry] | |
def sample(self, item: Any): | |
# try to find the item in an existing table entry | |
for entry in self.entries: | |
if entry.item == item: | |
# we found the item, so increment its count and stop searching | |
entry.count += 1 | |
break | |
else: | |
# we didn't find the item | |
if len(self.entries) < self.capacity: | |
# we still have unused table capacity, so append a new table entry to the end of the table | |
self.entries.append(HeavyHittersTableEntry(item, 1)) | |
else: | |
# the table is full, so replace the last entry in the table with this entry | |
self.entries.pop() | |
self.entries.append(HeavyHittersTableEntry(item, 1)) | |
# re-sort the table | |
self.entries.sort(key=lambda entry: entry.count, reverse=True) | |
def __iter__(self): | |
return iter(self.entries) | |
def main() -> int: | |
# create a heavy hitters table and simulate three top IPs | |
hh = HeavyHittersTable(capacity=25) | |
top_ips = [IPv4Address(random.randbytes(4)), IPv4Address(random.randbytes(4)), IPv4Address(random.randbytes(4))] | |
for i in range(10000): | |
if random.random() < 0.2: | |
# 20% chance this is one of the three "top IPs" | |
random_ip = random.choice(top_ips) | |
else: | |
# 80% chance this is a random IP | |
random_ip = IPv4Address(random.randbytes(4)) | |
# sample the IP in the heavy hitters table | |
hh.sample(random_ip) | |
# print the heavy hitters table | |
for entry in hh: | |
print(entry.item, entry.count) | |
return 0 | |
if __name__ == '__main__': | |
sys.exit(main()) | |
# Example output: | |
# | |
# ~# ./hh.py | |
# 103.124.248.231 679 | |
# 7.206.8.42 678 | |
# 122.92.108.109 657 | |
# 134.197.126.208 1 | |
# 26.90.72.67 1 | |
# 221.143.101.0 1 | |
# 190.234.230.152 1 | |
# 69.62.101.189 1 | |
# 230.112.137.45 1 | |
# 73.175.51.45 1 | |
# 189.13.248.134 1 | |
# 87.94.143.18 1 | |
# 145.13.55.254 1 | |
# 116.87.65.104 1 | |
# 202.22.116.138 1 | |
# 48.252.17.176 1 | |
# 167.92.52.213 1 | |
# 203.67.81.165 1 | |
# 65.121.29.182 1 | |
# 241.143.141.107 1 | |
# 2.12.49.64 1 | |
# 195.198.98.127 1 | |
# 182.239.36.119 1 | |
# 248.226.103.26 1 | |
# 136.9.121.18 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment