@DailyDreaming
Last active February 20, 2019 18:32
Script to test speed of AWS checkout.
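Running it presumably requires Python 3.6+ (for the f-strings), the hca and boto3 packages, and AWS credentials allowed to delete objects from the integration checkout bucket; each run appends its timing results to stats.log.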
#!/usr/bin/env python3
import time
import os
import boto3
from hca import HCAConfig
from hca.dss import DSSClient
"""
This script tests https://github.com/HumanCellAtlas/data-store and its checkout step.
The data-store repo relies on two buckets per cloud: a main bucket and a checkout bucket.
When an outside user requests data, a file is first copied from the main bucket to the
checkout bucket (if it is not already there), and then downloaded from the checkout
bucket to the user's computer.
This script compares the speed of fetching files already present in the checkout bucket
against the indirect path of copying them from the main bucket first.
The purpose is to demonstrate that files kept/cached in the checkout bucket
can be accessed much more quickly.
"""
hca_config = HCAConfig()
hca_config['DSSClient'].swagger_url = 'https://dss.integration.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)
# user_path = '/home/quokka/Desktop/delete'
user_path = '/home/ubuntu'
def get_metadata_multihash(m):
    # Build the checkout bucket key for a file: 'blobs/<sha256>.<sha1>.<s3_etag>.<crc32c>'
    return 'blobs/' + '.'.join([m['sha256'], m['sha1'], m['s3_etag'], m['crc32c']])
def delete_from_checkout(metadata_multihashes):
    # Remove the given keys from the checkout bucket so the next download starts uncached.
    s3 = boto3.client('s3')
    bucket = 'org-humancellatlas-dss-checkout-integration'
    for mhash in metadata_multihashes:
        if mhash:
            r = s3.delete_object(Bucket=bucket, Key=mhash)
            print(f'Successfully deleted: s3://{bucket}/{mhash}\n'
                  f'==============================================\n'
                  f'{str(r)}\n\n')
def get_bundles(num_bundles=100):
    # Collect up to num_bundles bundles that can be fetched from the 'aws' replica,
    # along with their file sizes and checkout bucket keys.
    search_results = dss.post_search(replica='aws', es_query={})
    bundle_ids = []
    for r in search_results['results']:
        bundle_ids.append(r['bundle_fqid'].split('.')[0])
    good_bundles = []
    bad_bundles = []  # bad bundles have been removed but are still present in the index (& need a reindex)
    good_bundle_stats = []  # list of file sizes
    metadata_hashes = []  # list of bucket keys
    i = 0
    for uuid in bundle_ids:
        print(f'[{i}/{len(bundle_ids)}] Getting data for: {uuid}')
        if len(good_bundles) == num_bundles:
            break
        i += 1
        try:
            bundle_results = dss.get_bundle(uuid=uuid, replica='aws')
            for b in bundle_results['bundle']['files']:
                metadata_hashes.append(get_metadata_multihash(b))
                good_bundle_stats.append(b['size'])
            good_bundles.append(uuid)
        except Exception:
            bad_bundles.append(uuid)
    return good_bundles, good_bundle_stats, metadata_hashes
def download_metadata(bundles, bundle_stats):
    # Download the metadata files of each bundle into its own directory and time the whole run.
    start = time.time()
    i = 0
    for bundle in bundles:
        new_dir = f'{user_path}/new_dir_{i}'
        os.makedirs(new_dir, exist_ok=True)
        i += 1
        print(f'[{i}/{len(bundles)}] Now downloading {bundle}.')
        dss.download(bundle_uuid=bundle,
                     dest_name=new_dir,
                     replica='aws',
                     data_files='',
                     metadata_files='*')
    end = time.time()
    print(f'Download complete!\n'
          f'Time taken: {end - start} seconds.\n'
          f'# of Files: {len(bundle_stats)}.\n'
          f'Total size: {sum(bundle_stats)}.')
    return str(end - start)
def del_local_folder_contents(folder=user_path):
    # Helper (not called from main) to clean downloaded files out of a local directory.
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
def main():
    # query & return all good bundle names, file size stats, and names of file keys (which are named as hashes)
    bundles, bundle_stats, metadata_hashes = get_bundles()
    # first empty the checkout bucket of those file keys so Phase 1 starts uncached
    delete_from_checkout(metadata_hashes)

    print('Phase 1: Download w/o files in the checkout buckets (uncached).')
    time_taken = download_metadata(bundles, bundle_stats)
    with open('stats.log', 'a+') as f:
        f.write(f'Phase 1: Download w/o files in the checkout buckets (uncached): {time_taken} seconds.\n')
        f.write(f'# of Files: {len(bundle_stats)}.\n')
        f.write(f'Total size: {sum(bundle_stats)}.\n\n')

    print('Phase 2: Download w/ files in the checkout buckets (cached).')
    time_taken = download_metadata(bundles, bundle_stats)
    with open('stats.log', 'a+') as f:
        f.write(f'Phase 2: Download w/ files in the checkout buckets (cached): {time_taken} seconds.\n')
        f.write(f'# of Files: {len(bundle_stats)}.\n')
        f.write(f'Total size: {sum(bundle_stats)}.\n\n')


if __name__ == '__main__':
    main()
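
Below is a minimal sketch, not part of the original script, of how one might confirm that Phase 1 actually left the expected keys in the checkout bucket. It assumes the same bucket name and multihash keys used above; the helper name in_checkout is hypothetical.

import boto3
from botocore.exceptions import ClientError

def in_checkout(keys, bucket='org-humancellatlas-dss-checkout-integration'):
    # Return the subset of the given keys that currently exist in the checkout bucket.
    s3 = boto3.client('s3')
    present = []
    for key in keys:
        try:
            s3.head_object(Bucket=bucket, Key=key)  # raises ClientError (404) if the key is absent
            present.append(key)
        except ClientError:
            pass
    return present

# Example: after Phase 1, every key returned by get_bundles() should be present, e.g.
# assert set(in_checkout(metadata_hashes)) == set(metadata_hashes)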