Skip to content

Instantly share code, notes, and snippets.

@rhelmer
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rhelmer/7d3507bd0921fd9939f4 to your computer and use it in GitHub Desktop.
Save rhelmer/7d3507bd0921fd9939f4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import hashlib
import logging
import boto
import config
import happybase
# Module-level logger named after this module. Handlers and level are not set
# here; the __main__ guard at the bottom of the file calls logging.basicConfig.
logger = logging.getLogger(__name__)
class HBase(object):
    """Small wrapper around the happybase table that stores crash rows.

    The Thrift address and table name are read from the ``config`` module
    (``config.THRIFT_ADDR`` and ``config.HBASE_TABLE``).
    """

    def __init__(self):
        """Create the happybase connection and bind ``self.table``.

        Any exception raised by happybase while connecting propagates to
        the caller.
        """
        conn = happybase.Connection(config.THRIFT_ADDR)
        # Log only after Connection() has returned without raising, so the
        # message is never emitted for a connection that failed.
        logger.info('connected to %s', config.THRIFT_ADDR)
        self.table = conn.table(config.HBASE_TABLE)

    def get_crashes(self, limit=10, batch_size=10):
        """Lazily scan the table and yield ``(row_key, data)`` tuples.

        Only the ``raw_data`` column family is requested.

        :param limit: maximum number of rows to return; passed straight to
            ``Table.scan``. Defaults to 10, the value that used to be
            hard-coded here.
        :param batch_size: number of rows fetched per round trip; passed
            straight to ``Table.scan``. Defaults to 10.
        :returns: a generator of ``(row_key, data)`` pairs exactly as
            produced by ``Table.scan``.
        """
        # This is a generator function: the log line and the scan both
        # start on the first iteration, not when get_crashes() is called.
        logger.info('start table scan')
        rows = self.table.scan(
            columns=['raw_data'],
            batch_size=batch_size,
            limit=limit,
        )
        for key, data in rows:
            yield (key, data)
def main():
    """Compare the MD5 of each HBase crash dump with its S3 object's ETag.

    Connects to the S3 bucket named by ``config.BUCKET_NAME``, scans crashes
    from HBase, and for every row compares ``md5(data['raw_data:dump'])``
    with the ETag of the S3 object at ``<config.PREFIX>/<crash_id>``.
    Mismatches and objects missing from S3 are logged as warnings; a summary
    count is logged at the end.

    :raises KeyError: if a scanned row has no ``raw_data:dump`` column.
    """
    conn = boto.connect_s3(config.AWS_KEY, config.AWS_SECRET_KEY)
    bucket = conn.get_bucket(config.BUCKET_NAME)
    logger.info('connected to bucket %s', config.BUCKET_NAME)

    hbase = HBase()

    # Initialised before the loop so the summary line below still works when
    # the scan yields no rows at all.
    count = 0
    for count, (key, data) in enumerate(hbase.get_crashes(), 1):
        # The first 7 characters of the row key are dropped to get the crash
        # id -- presumably a fixed-width row-key prefix; TODO confirm against
        # the table's key schema.
        crash_id = key[7:]
        path = '%s/%s' % (config.PREFIX, crash_id)
        logger.debug('fetching crash from S3: %s', path)

        # A single request is enough: get_key() returns None when the object
        # does not exist, which must be handled before touching .etag.
        s3_key = bucket.get_key(path)
        if s3_key is None:
            logger.warning('crash missing from S3: %s', path)
            continue

        # NOTE - Amazon sets ETag to the object's MD5 only for objects that
        # were uploaded in a single PUT; multipart uploads get an ETag that
        # is not an MD5 digest and will always show up here as a mismatch.
        # See http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html
        s3_checksum = s3_key.etag.strip('"')
        logger.debug('S3 checksum for %s: %s', crash_id, s3_checksum)

        hb_checksum = hashlib.md5(data['raw_data:dump']).hexdigest()
        logger.debug('HB checksum for %s: %s', crash_id, hb_checksum)

        if s3_checksum != hb_checksum:
            logger.warning('MD5 checksum mismatch: %s', key)
        else:
            logger.debug('MD5 checksums match: %s', key)

    # The count covers every scanned row, including ones missing from S3.
    logger.info('Compared %s crashes', count)
if __name__ == '__main__':
    # Script entry point: set up timestamped INFO-level logging, then run
    # the HBase/S3 checksum comparison.
    log_format = '%(asctime)s %(name)s:%(levelname)s: %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment