Last active
February 20, 2019 18:32
-
-
Save DailyDreaming/f7ada6f1a68309ba98ba060a3a0b1130 to your computer and use it in GitHub Desktop.
Script to test speed of AWS checkout.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import time | |
import os | |
import boto3 | |
from hca import HCAConfig | |
from hca.dss import DSSClient | |
""" | |
This script tests https://github.com/HumanCellAtlas/data-store and its checkout. | |
The data-store repo relies on 2 buckets per cloud, a main bucket and a checkout bucket. | |
When an outside user requests data from one of these buckets, it is transferred | |
from the main bucket to the checkout bucket if it does not already exist there, | |
before being downloaded a second time to the user's computer. | |
This tests the speed of fetching the file directly from a bucket vs. the | |
indirect transfer of data from main to the checkout bucket to the user. | |
The purpose of this is to demonstrate that files kept/cached in the checkout bucket | |
will be able to be accessed much more quickly. | |
""" | |
# Point the DSS client at the integration deployment's Swagger definition.
# (Plain string literal: the original used an f-string with no placeholders.)
hca_config = HCAConfig()
hca_config['DSSClient'].swagger_url = 'https://dss.integration.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)

# Local directory that bundle downloads are written into (and cleaned out of).
user_path = '/home/ubuntu'
def get_metadata_multihash(m):
    """Build the checkout-bucket blob key for a file's metadata entry.

    Blob keys are 'blobs/' followed by the file's four content hashes
    joined with dots, in the order sha256.sha1.s3_etag.crc32c.
    """
    hashes = (m['sha256'], m['sha1'], m['s3_etag'], m['crc32c'])
    return 'blobs/' + '.'.join(hashes)
def delete_from_checkout(metadata_multihashes):
    """Delete the given blob keys from the integration checkout bucket.

    Falsy entries (e.g. None or '') in metadata_multihashes are skipped.
    Prints the S3 response for each successful deletion.
    """
    bucket = 'org-humancellatlas-dss-checkout-integration'
    client = boto3.client('s3')
    for key in filter(None, metadata_multihashes):
        response = client.delete_object(Bucket=bucket, Key=key)
        print(f'Successfully deleted: s3://{bucket}/{key}\n'
              f'==============================================\n'
              f'{str(response)}\n\n')
def get_bundles(num_bundles=100):
    """Search the DSS and collect up to num_bundles fetchable bundles.

    Returns a tuple (good_bundles, good_bundle_stats, metadata_hashes):
      - good_bundles: UUIDs of bundles that could be fetched from the 'aws' replica
      - good_bundle_stats: sizes of every file in those bundles
      - metadata_hashes: checkout-bucket blob keys for every file in those bundles
    """
    search_results = dss.post_search(replica='aws', es_query={})
    bundle_ids = [r['bundle_fqid'].split('.')[0] for r in search_results['results']]
    good_bundles = []
    bad_bundles = []  # bad bundles have been removed but are still present in the index (& need a reindex)
    good_bundle_stats = []  # list of file sizes
    metadata_hashes = []  # list of bucket keys
    for i, uuid in enumerate(bundle_ids):
        if len(good_bundles) == num_bundles:
            break
        print(f'[{str(i)}/{str(len(bundle_ids))}] Getting data for : {uuid}')
        # Narrow except: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit. Only the fetch is guarded, so a
        # bundle counted as bad never leaves partial entries in the stats.
        try:
            files = dss.get_bundle(uuid=uuid, replica='aws')['bundle']['files']
        except Exception:
            bad_bundles.append(uuid)
            continue
        for b in files:
            metadata_hashes.append(get_metadata_multihash(b))
            good_bundle_stats.append(b['size'])
        good_bundles.append(uuid)
    return good_bundles, good_bundle_stats, metadata_hashes
def download_metadata(bundles, bundle_stats):
    """Download each bundle's metadata files and time the whole batch.

    bundles: list of bundle UUIDs to download (metadata files only).
    bundle_stats: list of file sizes, used only for the summary print.
    Returns the elapsed wall-clock time in seconds, as a string.
    """
    start = time.time()
    for i, bundle in enumerate(bundles):
        # BUG FIX: the original created './new_dir_{i}' but downloaded every
        # bundle into the fixed '{user_path}/new_dir', so bundles overwrote
        # each other and the per-bundle directories were never used. Create
        # and download into one per-bundle directory under user_path.
        new_dir = os.path.join(user_path, f'new_dir_{str(i)}')
        os.makedirs(new_dir, exist_ok=True)
        print(f'[{str(i + 1)}/{str(len(bundles))}] Now downloading {bundle}.')
        dss.download(bundle_uuid=bundle,
                     dest_name=new_dir,
                     replica='aws',
                     data_files='',
                     metadata_files='*')
    end = time.time()
    print(f'Download complete!\n'
          f'Time taken: {str(end - start)} seconds.\n'
          f'# of Files: {str(len(bundle_stats))}.\n'
          f'Total size: {str(sum(bundle_stats))}.')
    return str(end - start)
def del_local_folder_contents(folder=None):
    """Delete every top-level regular file in *folder* (defaults to user_path).

    Subdirectories and their contents are left untouched. The default is
    resolved lazily (the original evaluated the needless f-string
    f'{user_path}' at definition time).
    """
    if folder is None:
        folder = user_path
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
def main():
    """Time an uncached then a cached download of the same bundle set."""
    # Query & return all good bundle names, file size stats, and names of
    # file keys (which are named as hashes).
    bundles, bundle_stats, metadata_hashes = get_bundles()
    # First empty the checkout bucket of file keys (which are named as
    # hashes) so that Phase 1 measures the uncached path.
    delete_from_checkout(metadata_hashes)

    phases = (
        'Phase 1: Download w/o files in the checkout buckets (uncached)',
        'Phase 2: Download w/ files in the checkout buckets (cached)',
    )
    for phase in phases:
        print(f'{phase}.')
        elapsed = download_metadata(bundles, bundle_stats)
        with open('stats.log', 'a+') as log:
            log.write(f'{phase}: {elapsed} seconds.\n')
            log.write(f'# of Files: {str(len(bundle_stats))}.\n')
            log.write(f'Total size: {str(sum(bundle_stats))}.\n\n')
# Script entry point: run the full checkout-speed experiment.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment