@btimby
Created August 2, 2017 14:21
Simulates various cache configurations over an nginx log.
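
This script replays a parsed nginx access log stored in MySQL (the `access` schema appears in the docstring below) against one simulated cache per TTL passed on the command line, then writes per-URL hit/miss counts to a `cache_<ttl>s` table for each TTL. A typical invocation, with TTLs given in seconds (the filename cache_sim.py is hypothetical):

    python cache_sim.py 300 3600 86400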
import sys

from datetime import timedelta
from collections import deque, Counter

from peewee import *

DB = MySQLDatabase('static', user='static', password='http')


class BaseModel(Model):
    class Meta:
        database = DB


class Access(BaseModel):
    """
    Access log for nginx.

    CREATE TABLE `access` (
        `remote_addr` varchar(30) DEFAULT NULL,
        `dash` char(1) DEFAULT NULL,
        `remote_user` varchar(30) DEFAULT NULL,
        `url` varchar(1024) DEFAULT NULL,
        `status` int(11) DEFAULT NULL,
        `sent` bigint(20) DEFAULT NULL,
        `referrer` text,
        `user_agent` text,
        `client_addr` varchar(30) DEFAULT NULL,
        `ts` datetime DEFAULT NULL,
        `method` varchar(8) DEFAULT NULL
    )
    """
    class Meta:
        db_table = 'access'
        database = DB
        primary_key = False

    remote_addr = CharField(max_length=30)
    dash = CharField(max_length=1)
    remote_user = CharField(max_length=30)
    url = CharField(max_length=1024)
    status = IntegerField()
    sent = BigIntegerField()
    referrer = TextField()
    user_agent = TextField()
    client_addr = CharField(max_length=30)
    ts = DateTimeField()
    method = CharField(max_length=8)


class CacheStat(BaseModel):
    """
    Hypothetical caching stats; one table is written per simulated TTL.
    """
    class Meta:
        # Placeholder; replaced at runtime with 'cache_<ttl>s'.
        db_table = 'cache_'
        database = DB
        primary_key = False

    url = CharField(max_length=1024)
    hits = IntegerField()
    misses = IntegerField()


class CacheItem(object):
    """
    Hit/miss stats for a single cached URL. __iter__ and __getitem__
    implement just enough of the mapping protocol for
    CacheStat.insert_many().
    """
    def __init__(self, url, expires, hits=0, misses=1):
        self.url = url
        self.expires = expires
        self.hits = hits
        # The insertion itself counts as the first miss.
        self.misses = misses

    def __iter__(self):
        return iter(['url', 'hits', 'misses'])

    def __getitem__(self, key):
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(key)


class Cache(object):
    # TODO: this class can also compute the stats that I am currently using
    # SQL for, saving several manual steps.

    def __init__(self, ttl, cache_after=None):
        self.ttl = ttl
        self.cache_after = cache_after
        # Maps each URL to the CacheItem holding its expiration, hits and
        # misses.
        self.stats = {}
        # A deque of (expiration, size) tuples, sorted by expiration. It
        # remains sorted because we progress through our log in sequential
        # order. As an item enters the cache, we add its size to the cache
        # total size, then pop out any items that have expired, subtracting
        # their sizes. The total size is then always correct.
        self.items = deque()
        # We store the number of accesses per URL, so that we can do things
        # like cache an item on its 3rd access.
        self.hits = Counter()
        # _size tracks the current total; size and count record the
        # high-water marks, because those are all we are really interested
        # in.
        self._size = 0
        self.size = 0
        self.count = 0

    def __len__(self):
        return len(self.stats)

    def add(self, url, timestamp, size):
        self.hits[url] += 1
        cache_item = self.stats.get(url)
        add_item = False
        if cache_item:
            if cache_item.expires > timestamp:
                cache_item.hits += 1
            else:
                # The cached copy expired; count a miss and re-cache it with
                # a fresh expiration so the size accounting stays correct.
                cache_item.misses += 1
                cache_item.expires = timestamp + timedelta(seconds=self.ttl)
                self.items.append((cache_item.expires, size))
                self._size += size
        elif self.cache_after is None or self.hits[url] == self.cache_after:
            # Cache on first access, or once the URL has been requested
            # cache_after times.
            add_item = True
        if add_item:
            expires = timestamp + timedelta(seconds=self.ttl)
            self.stats[url] = CacheItem(url, expires)
            self.items.append((expires, size))
            self._size += size
        # Purge expired items and record the high-water marks.
        while self.items and self.items[0][0] <= timestamp:
            _, expired_size = self.items.popleft()
            self._size -= expired_size
        self.size = max(self.size, self._size)
        self.count = max(self.count, len(self.items))

    def values(self):
        return self.stats.values()
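
# A quick illustration of the simulation semantics (hypothetical values, not
# part of the original gist; datetime would also need to be imported to run
# it). With ttl=60 and cache_after=2, the second request for a URL inserts it
# into the cache as a miss, requests within the next 60 seconds are hits, and
# a request after expiry is a miss that re-caches the item:
#
#   c = Cache(60, cache_after=2)
#   t0 = datetime(2017, 8, 1, 12, 0, 0)
#   c.add('/abcdefghijk/', t0, 1024)                          # 1st access: not cached yet
#   c.add('/abcdefghijk/', t0 + timedelta(seconds=1), 1024)   # 2nd access: cached, counts as a miss
#   c.add('/abcdefghijk/', t0 + timedelta(seconds=30), 1024)  # within TTL: hit
#   c.add('/abcdefghijk/', t0 + timedelta(seconds=90), 1024)  # expired: miss, re-cached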


def main(*ttls):
    # Only consider URLs that start with an 11-character identifier.
    records = Access.select() \
                    .where(SQL(r"url REGEXP '^/[a-zA-Z0-9_\-]\{11\}/'")) \
                    .order_by(Access.ts)
    caches = [Cache(ttl, cache_after=2) for ttl in ttls]
    print('Simulating %s caching scheme(s)' % len(caches))
    for r in records:
        for cache in caches:
            cache.add(r.url, r.ts, r.sent)
    for cache in caches:
        # The simulation is complete; store the data set. The table name is
        # based upon the cache TTL.
        table_name = 'cache_%ss' % cache.ttl
        CacheStat._meta.db_table = table_name
        DB.create_tables([CacheStat], safe=True)
        DB.execute_sql('TRUNCATE TABLE `%s`' % table_name)
        print('Maximum cache size with ttl %s: %s items / %s bytes' % (
            cache.ttl, cache.count, cache.size))
        # Save all the CacheItem instances we created, in chunks.
        objs = list(cache.values())
        with DB.atomic():
            for i in range(0, len(objs), 1000):
                CacheStat.insert_many(objs[i:i + 1000]).execute()


if __name__ == '__main__':
    ttls = list(map(int, sys.argv[1:]))
    if not ttls:
        sys.exit('Please provide at least one TTL (in seconds)')
    main(*ttls)
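
Once the script has run, the simulated schemes can be compared in plain SQL. The gist does not include the author's queries (the TODO in the Cache class alludes to them), but a hit-ratio query for, say, the one-hour TTL might look like:

    SELECT SUM(hits) / (SUM(hits) + SUM(misses)) AS hit_ratio
    FROM cache_3600s;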