Skip to content

Instantly share code, notes, and snippets.

@meyarivan
Last active August 29, 2015 14:04
Show Gist options
  • Save meyarivan/05b93150fc3e97589147 to your computer and use it in GitHub Desktop.
Save meyarivan/05b93150fc3e97589147 to your computer and use it in GitHub Desktop.
Compute sizes of raw_data:* from a silly sample of live data
#!/usr/bin/env python
import os, sys
import math
import happybase
import time
import simplejson as json
import struct
from datetime import datetime, timedelta
def show_usage_and_quit():
    """Write the command-line usage line to stderr and exit with status 2."""
    # sys.stderr.write is used instead of the Python 2-only ``print >>``
    # chevron form; it emits the same text and parses on Python 2 and 3.
    sys.stderr.write("Usage: %s hostname/VIP\n" % (sys.argv[0],))
    sys.exit(2)
def get_table(thrift_server, tbl_name):
    """Connect to *thrift_server* via HappyBase and return table *tbl_name*.

    A new ``happybase.Connection`` is created on every call; the table
    handle obtained from that connection is what gets returned.
    """
    return happybase.Connection(thrift_server).table(tbl_name)
def decode_row_size(cells):
    """Return the summed size encoded in one scanned row.

    *cells* is the column -> value mapping yielded by ``table.scan`` for a
    single row.  Every value is unpacked as a big-endian signed 32-bit
    integer and the integers are added together.  (The scan is issued with
    ``KeyOnlyFilter(true)``, which per the HBase docs substitutes each
    value with its length -- NOTE(review): confirm against the cluster's
    HBase version.)
    """
    return sum(struct.unpack(">i", raw)[0] for raw in cells.values())


def summarize_sizes(sizes, pcnt):
    """Return ``(mean, count, smallest, largest, percentile)`` for *sizes*.

    *sizes* is a non-empty iterable of integers and *pcnt* is a fraction
    such as 0.99.  The mean uses floor division so the result is an integer
    on both Python 2 and Python 3.  The percentile is the element at rank
    ``ceil(pcnt * count)`` (1-based) of the sorted values.

    Raises ValueError when *sizes* is empty.
    """
    ordered = sorted(sizes)
    count = len(ordered)
    if not count:
        raise ValueError("sizes must not be empty")
    # Clamp to rank 1 so a zero percentile cannot wrap around to index -1.
    rank = max(int(math.ceil(pcnt * count)), 1)
    return (sum(ordered) // count, count, ordered[0], ordered[-1],
            ordered[rank - 1])


def main(argv):
    """Print per-day size statistics for a sample of ``crash_reports`` rows.

    *argv* is the full argument vector; ``argv[1]`` is the Thrift host/VIP.
    For each day from 190 days ago through today, rows whose key lies in
    ``['a' + YYMMDD, 'a' + YYMMDD + 'a01')`` are scanned (at most 4096 per
    day, ``raw_data`` family only) and one line is printed:
    ``YYMMDD mean count min max p99``.  Days with no rows print nothing.
    """
    if len(argv) != 2:
        show_usage_and_quit()

    table = get_table(argv[1], 'crash_reports')

    end = datetime.today()
    curr = end - timedelta(days=190)
    pcnt = 0.99

    while curr <= end:
        query_key = 'a' + curr.strftime("%y%m%d")
        scanner = table.scan(
            row_start=query_key,
            row_stop=query_key + 'a01',
            columns=('raw_data',),
            batch_size=256,
            filter=b'KeyOnlyFilter(true)',
            limit=4096,
        )
        sizes = [decode_row_size(val) for _key, val in scanner]

        if sizes:
            fields = (query_key[1:],) + summarize_sizes(sizes, pcnt)
            # A single pre-joined string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(" ".join(str(field) for field in fields))

        curr += timedelta(days=1)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment