Skip to content

Instantly share code, notes, and snippets.

@devdazed
Created November 24, 2015 22:04
Show Gist options
  • Save devdazed/69fcd98b6655a35c092b to your computer and use it in GitHub Desktop.
Save devdazed/69fcd98b6655a35c092b to your computer and use it in GitHub Desktop.
Count tombstones in a Cassandra Table
#!/usr/bin/env python
"""
Counts the number of tombstones in a keyspace.table and reports the top N highest counts
tombstone_count.py
[-h] This help screen
[--data-dir DATA_DIR] The C* data directory (/var/lib/cassandra/data)
[--top-k TOP_K] The top number of keys with highest tombstone counts to display.
keyspace The keyspace that contains the table
table The table to count tombstones
"""
from collections import Counter
import argparse
import glob
import json
import operator
import subprocess
def sizeof_fmt(num, suffix='B'):
    """Render a quantity using binary (1024-based) unit prefixes.

    :param num: the quantity to format (e.g. a byte count).
    :param suffix: unit text appended after the prefix, ``'B'`` by default.
    :return: a string such as ``'1.5KiB'``; values that are still >= 1024
        after the ``Zi`` step are reported with the ``Yi`` prefix.
    """
    scaled = num
    for prefix in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        # Written as a negated "<" so the comparison is exactly the one the
        # value must pass to be printed with this prefix.
        if not abs(scaled) < 1024.0:
            scaled /= 1024.0
            continue
        return "%3.1f%s%s" % (scaled, prefix, suffix)
    # Ran out of smaller prefixes: everything larger is expressed in Yi.
    return "%.1f%s%s" % (scaled, 'Yi', suffix)
class SSTableReader(object):
    """Iterator over the row objects that ``sstable2json`` prints for an SSTable.

    The reader launches ``sstable2json <sstable>`` lazily on the first call to
    ``next()`` and consumes its stdout one character at a time. The output is
    treated as a JSON array holding one row object per line: the opening
    ``[\\n`` is skipped, each ``{...}`` is decoded with ``json.loads``, and the
    two separator characters after every object are discarded.

    ``num_bytes`` holds the number of characters consumed from the stream so
    far (equal to bytes for ASCII output).
    """

    def __init__(self, sstable):
        """
        :param sstable: path of the ``*-Data.db`` file handed to sstable2json.
        """
        self._sstable = sstable
        self._proc = None
        self._opened = False
        self.num_bytes = 0

    def __iter__(self):
        return self

    def _read_n(self, n=1):
        """Discard up to ``n`` characters, counting only those actually read."""
        skipped = self._proc.stdout.read(n)
        self.num_bytes += len(skipped)

    def _next_object(self):
        """Decode and return the next row object, or ``None`` at end of input.

        :raises ValueError: if the first non-newline character of what should
            be a row is neither ``{`` nor the array-closing ``]``.
        """
        buf = []
        for char in self.read():
            if char == '\n':
                continue
            if not buf:
                if char == ']':
                    # Closing bracket where a row should start: the array
                    # (e.g. an empty dump `[\n]`) is finished.
                    return None
                if char != '{':
                    raise ValueError('Invalid JSON Object Start Char: {0} ({1})'.format(char, ord(char)))
            buf.append(char)
            # The object ends with a `}`, so on each one we try to decode what
            # has been collected; if decoding works, the object is complete.
            if char == '}':
                try:
                    row = json.loads(''.join(buf))
                except ValueError:
                    # Nested or quoted `}`: keep reading.
                    continue
                self._read_n(2)  # skip past the next two chars `,\n`
                return row
        return None

    def open(self):
        """Start the ``sstable2json`` subprocess with a buffered stdout pipe."""
        # universal_newlines=True makes the pipe yield text on Python 3, so the
        # character comparisons in this class work on both major versions.
        self._proc = subprocess.Popen(['sstable2json', self._sstable], stdout=subprocess.PIPE,
                                      bufsize=1048576, universal_newlines=True)

    def close(self):
        """Close the pipe and reap the subprocess; safe to call repeatedly."""
        proc = self._proc
        if proc is None:
            return
        self._proc = None
        proc.stdout.close()
        proc.wait()

    def read(self):
        """Yield the subprocess output one character at a time until EOF."""
        stdout = self._proc.stdout
        while True:
            char = stdout.read(1)
            if not char:  # '' or b'' both signal end of stream
                return
            # Count before yielding: the consumer abandons this generator as
            # soon as a row is complete, so code placed after the yield would
            # never run for the last character of each row.
            self.num_bytes += 1
            yield char

    def next(self):
        """Return the next decoded row.

        :raises StopIteration: once the stream is exhausted (and on every
            subsequent call).
        :raises ValueError: on malformed output; the subprocess is released
            before the error propagates.
        """
        if not self._opened:
            self.open()
            self._opened = True
            self._read_n(2)  # skip past the first two chars `[\n`
        elif self._proc is None:
            # Already exhausted and closed.
            raise StopIteration()
        try:
            next_object = self._next_object()
        except ValueError:
            self.close()
            raise
        if next_object is None:
            self.close()
            raise StopIteration()
        return next_object

    # Python 3 iterator protocol.
    __next__ = next
class TombstoneCounter(object):
    """Tally tombstoned cells per row key across the SSTables of one table.

    SSTables are located with the glob ``<data_dir>/<keyspace>/<table>/*-Data.db``,
    each one is streamed through ``SSTableReader``, and every cell whose fourth
    element is ``'t'`` is counted against its row key.
    """

    def __init__(self, keyspace, table, data_dir):
        """
        :param keyspace: keyspace directory name under ``data_dir``.
        :param table: table directory name under the keyspace directory.
        :param data_dir: root of the Cassandra data directory.
        """
        self._data_dir = data_dir
        self._keyspace = keyspace
        self._table = table
        self._sstable_count = 0
        self._total_bytes = 0
        self._tombstones = Counter()

    @staticmethod
    def read_sstable_json(sstable):
        """Announce the SSTable being read and return a reader for it."""
        print('Reading {0}'.format(sstable))
        return SSTableReader(sstable)

    def sstable_files(self):
        """Return the ``*-Data.db`` paths for the table and print how many exist."""
        pattern = '{0}/{1}/{2}/*-Data.db'.format(self._data_dir, self._keyspace, self._table)
        tables = glob.glob(pattern)
        self._sstable_count = len(tables)
        print('Found {0} sstables'.format(self._sstable_count))
        return tables

    def count_tombstones(self):
        """Count tombstones in every SSTable found for the table."""
        for sstable in self.sstable_files():
            self.count_tombstones_in_sstable(sstable)

    def count_tombstones_in_row(self, row):
        """Add the tombstoned cells of one decoded row to the tally.

        :param row: mapping with a ``'cells'`` list and a ``'key'``; a cell is
            counted when it has more than three elements and the fourth is ``'t'``.
        """
        deleted = sum(1 for cell in row['cells'] if len(cell) > 3 and cell[3] == 't')
        if deleted:
            # Only touch the counter for rows that actually have tombstones so
            # tombstone-free keys never appear in the report.
            self._tombstones[row['key']] += deleted

    def count_tombstones_in_sstable(self, sstable):
        """Stream one SSTable, tally its tombstones and add its size to the total."""
        reader = self.read_sstable_json(sstable)
        for row in reader:
            self.count_tombstones_in_row(row)
        self._total_bytes += reader.num_bytes

    def top_keys(self, top):
        """Return up to ``top`` ``(key, count)`` pairs, highest count first.

        :param top: number of keys wanted; coerced with ``int()`` because a
            value given on the command line may arrive as a string.
        """
        return self._tombstones.most_common(int(top))

    def report(self, top):
        """Print a summary followed by the ``top`` keys with the most tombstones.

        :param top: number of keys to list (int, or a string holding an int).
        """
        # The key figure is the number of distinct keys that have at least one
        # tombstone, since only those are ever stored in the counter.
        print('Read {0} keys and {1} of data'.format(len(self._tombstones), sizeof_fmt(self._total_bytes)))
        print('Top {0} keys with highest number tombstones'.format(top))
        for rank, (key, count) in enumerate(self.top_keys(top), 1):
            print("{0:3} {1} => {2}".format(str(rank) + '.', key, count))
def main(argv=None):
    """Parse the command line, count tombstones and print the report.

    :param argv: optional list of argument strings; when ``None`` argparse
        falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument('keyspace')
    parser.add_argument('table')
    parser.add_argument('--data-dir', default='/var/lib/cassandra/data')
    # type=int: without it a user-supplied value stays a string and cannot be
    # used as a slice bound / count when the report is produced.
    parser.add_argument('--top-k', type=int, default=25)
    args = parser.parse_args(argv)

    counter = TombstoneCounter(args.keyspace, args.table, args.data_dir)
    counter.count_tombstones()
    counter.report(args.top_k)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment