@DailyDreaming
Last active February 20, 2019 18:32
Script to test speed of AWS checkout.
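Running it presumably requires Python 3.6+ (for the f-strings), the hca and boto3 packages, and AWS credentials allowed to delete objects from the integration checkout bucket; each run appends its timing results to stats.log.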
#!/usr/bin/env python3
import time
import os
import boto3
from hca import HCAConfig
from hca.dss import DSSClient
"""
This script tests https://github.com/HumanCellAtlas/data-store and its checkout step.
The data-store repo relies on two buckets per cloud: a main bucket and a checkout bucket.
When an outside user requests data, a file is first copied from the main bucket to the
checkout bucket (if it is not already there), and then downloaded from the checkout
bucket to the user's computer.
This script compares the speed of fetching files already present in the checkout bucket
against the indirect path of copying them from the main bucket first.
The purpose is to demonstrate that files kept/cached in the checkout bucket
can be accessed much more quickly.
"""
hca_config = HCAConfig()
hca_config['DSSClient'].swagger_url = 'https://dss.integration.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)
# user_path = '/home/quokka/Desktop/delete'
user_path = '/home/ubuntu'
def get_metadata_multihash(m):
    # Build the checkout bucket key for a file: 'blobs/<sha256>.<sha1>.<s3_etag>.<crc32c>'
    return 'blobs/' + '.'.join([m['sha256'], m['sha1'], m['s3_etag'], m['crc32c']])
def delete_from_checkout(metadata_multihashes):
    # Remove the given keys from the checkout bucket so the next download starts uncached.
    s3 = boto3.client('s3')
    bucket = 'org-humancellatlas-dss-checkout-integration'
    for mhash in metadata_multihashes:
        if mhash:
            r = s3.delete_object(Bucket=bucket, Key=mhash)
            print(f'Successfully deleted: s3://{bucket}/{mhash}\n'
                  f'==============================================\n'
                  f'{str(r)}\n\n')
def get_bundles(num_bundles=100):
    # Collect up to num_bundles bundles that can be fetched from the 'aws' replica,
    # along with their file sizes and checkout bucket keys.
    search_results = dss.post_search(replica='aws', es_query={})
    bundle_ids = []
    for r in search_results['results']:
        bundle_ids.append(r['bundle_fqid'].split('.')[0])
    good_bundles = []
    bad_bundles = []  # bad bundles have been removed but are still present in the index (& need a reindex)
    good_bundle_stats = []  # list of file sizes
    metadata_hashes = []  # list of bucket keys
    i = 0
    for uuid in bundle_ids:
        print(f'[{i}/{len(bundle_ids)}] Getting data for: {uuid}')
        if len(good_bundles) == num_bundles:
            break
        i += 1
        try:
            bundle_results = dss.get_bundle(uuid=uuid, replica='aws')
            for b in bundle_results['bundle']['files']:
                metadata_hashes.append(get_metadata_multihash(b))
                good_bundle_stats.append(b['size'])
            good_bundles.append(uuid)
        except Exception:
            bad_bundles.append(uuid)
    return good_bundles, good_bundle_stats, metadata_hashes
def download_metadata(bundles, bundle_stats):
    # Download the metadata files of each bundle into its own directory and time the whole run.
    start = time.time()
    i = 0
    for bundle in bundles:
        new_dir = f'{user_path}/new_dir_{i}'
        os.makedirs(new_dir, exist_ok=True)
        i += 1
        print(f'[{i}/{len(bundles)}] Now downloading {bundle}.')
        dss.download(bundle_uuid=bundle,
                     dest_name=new_dir,
                     replica='aws',
                     data_files='',
                     metadata_files='*')
    end = time.time()
    print(f'Download complete!\n'
          f'Time taken: {end - start} seconds.\n'
          f'# of Files: {len(bundle_stats)}.\n'
          f'Total size: {sum(bundle_stats)}.')
    return str(end - start)
def del_local_folder_contents(folder=user_path):
    # Helper (not called from main) to clean downloaded files out of a local directory.
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
def main():
    # query & return all good bundle names, file size stats, and names of file keys (which are named as hashes)
    bundles, bundle_stats, metadata_hashes = get_bundles()
    # first empty the checkout bucket of those file keys so Phase 1 starts uncached
    delete_from_checkout(metadata_hashes)

    print('Phase 1: Download w/o files in the checkout buckets (uncached).')
    time_taken = download_metadata(bundles, bundle_stats)
    with open('stats.log', 'a+') as f:
        f.write(f'Phase 1: Download w/o files in the checkout buckets (uncached): {time_taken} seconds.\n')
        f.write(f'# of Files: {len(bundle_stats)}.\n')
        f.write(f'Total size: {sum(bundle_stats)}.\n\n')

    print('Phase 2: Download w/ files in the checkout buckets (cached).')
    time_taken = download_metadata(bundles, bundle_stats)
    with open('stats.log', 'a+') as f:
        f.write(f'Phase 2: Download w/ files in the checkout buckets (cached): {time_taken} seconds.\n')
        f.write(f'# of Files: {len(bundle_stats)}.\n')
        f.write(f'Total size: {sum(bundle_stats)}.\n\n')


if __name__ == '__main__':
    main()
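
Below is a minimal sketch, not part of the original script, of how one might confirm that Phase 1 actually left the expected keys in the checkout bucket. It assumes the same bucket name and multihash keys used above; the helper name in_checkout is hypothetical.

import boto3
from botocore.exceptions import ClientError

def in_checkout(keys, bucket='org-humancellatlas-dss-checkout-integration'):
    # Return the subset of the given keys that currently exist in the checkout bucket.
    s3 = boto3.client('s3')
    present = []
    for key in keys:
        try:
            s3.head_object(Bucket=bucket, Key=key)  # raises ClientError (404) if the key is absent
            present.append(key)
        except ClientError:
            pass
    return present

# Example: after Phase 1, every key returned by get_bundles() should be present, e.g.
# assert set(in_checkout(metadata_hashes)) == set(metadata_hashes)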