Skip to content

Instantly share code, notes, and snippets.

@phrawzty
Forked from rhelmer/gist:7d3507bd0921fd9939f4
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save phrawzty/02b9d680e76c6e6f371d to your computer and use it in GitHub Desktop.
Save phrawzty/02b9d680e76c6e6f371d to your computer and use it in GitHub Desktop.
Crash reports HBase / S3 comparison test
#!/usr/bin/env python
import hashlib
import logging
import boto
import config
import happybase
logger = logging.getLogger(__name__)
class HBase(object):
    """Thin wrapper around the happybase table that stores crash data.

    The Thrift address and table name are read from the ``config`` module
    (``config.THRIFT_ADDR`` and ``config.HBASE_TABLE``).
    """

    def __init__(self):
        # Create the connection before logging, so the "connected" message
        # is not emitted when happybase.Connection() raises.
        conn = happybase.Connection(config.THRIFT_ADDR)
        logger.info('connected to %s', config.THRIFT_ADDR)
        self.table = conn.table(config.HBASE_TABLE)

    def get_crashes(self, limit=10, batch_size=10):
        """Yield ``(key, data)`` pairs from a scan of the crash table.

        This is rudimentary but functional. Optimally our sampling
        technique would be more rigorous.

        Only the ``raw_data`` column family is requested from the scan.

        :param limit: value passed to the scan as ``limit``; defaults to 10,
            the value that used to be hard-coded.
        :param batch_size: value passed to the scan as ``batch_size``;
            defaults to 10, the value that used to be hard-coded.
        """
        logger.info('start table scan')
        rows = self.table.scan(
            columns=['raw_data'],
            batch_size=batch_size,
            limit=limit,
        )
        for key, data in rows:
            yield (key, data)
def main():
    """Compare MD5 checksums of crashes in HBase against their S3 copies.

    Connects to the S3 bucket named by ``config.BUCKET_NAME`` and, for each
    ``(key, data)`` pair yielded by ``HBase.get_crashes()``, compares the MD5
    hex digest of ``data['raw_data:dump']`` with the ETag of the S3 object at
    ``<config.PREFIX>/<crash_id>``. Mismatches, and crashes that have no S3
    object at all, are logged as warnings. The number of rows examined is
    logged once the scan is finished.
    """
    conn = boto.connect_s3(config.AWS_KEY, config.AWS_SECRET_KEY)
    bucket = conn.get_bucket(config.BUCKET_NAME)
    logger.info('connected to bucket %s', config.BUCKET_NAME)
    hbase = HBase()

    # Bound up front so the summary line below still works when the scan
    # yields no rows (the loop variable would otherwise never be assigned).
    counter = 0
    for counter, (key, data) in enumerate(hbase.get_crashes(), 1):
        # The crash ID is the row key minus its first 7 characters.
        # NOTE(review): presumably a fixed-width salt/date prefix on the
        # HBase row key - confirm against the table's key schema.
        crash_id = key[7:]
        path = '%s/%s' % (config.PREFIX, crash_id)
        logger.debug('fetching crash from S3: %s', path)

        # NOTE - Amazon sets ETag to an md5sum, on objects under 5GB
        # See http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html
        # Also note that bucket.lookup() is HEAD only, so it costs less than a GET.
        s3_key = bucket.lookup(path)
        if s3_key is None:
            # lookup() returns None when the key does not exist; report it
            # and move on instead of failing on ``None.etag``.
            logger.warning('crash missing from S3: %s', path)
            continue
        # The ETag value is wrapped in double quotes; strip them so it can
        # be compared with a plain hex digest.
        s3_checksum = s3_key.etag.strip('"')
        logger.debug('S3 checksum for %s: %s', crash_id, s3_checksum)

        hb_checksum = hashlib.md5(data['raw_data:dump']).hexdigest()
        logger.debug('HB checksum for %s: %s', crash_id, hb_checksum)

        if s3_checksum != hb_checksum:
            logger.warning('MD5 checksum mismatch: %s', key)
        else:
            logger.debug('MD5 checksums match: %s', key)

    logger.info('Compared %s crashes', counter)
if __name__ == '__main__':
    # Script entry point. Logging is configured at INFO, so the per-crash
    # DEBUG messages emitted by main() are hidden unless the level is lowered.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)s:%(levelname)s: %(message)s',
    )
    main()
# Placeholder settings; fill these in before running the comparison script.
# NOTE(review): presumably this is the body of the separate ``config`` module
# that the script imports - confirm it is not meant to live in the script.

# Address passed to happybase.Connection().
THRIFT_ADDR = ''
# Name of the HBase table that is scanned for crashes.
HBASE_TABLE = ''
# Credentials passed to boto.connect_s3().
AWS_KEY = ''
AWS_SECRET_KEY = ''
# Name of the S3 bucket holding the crash objects.
BUCKET_NAME = ''
# Prefix joined with '/' and the crash ID to build each S3 object path.
PREFIX = ''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment