Skip to content

Instantly share code, notes, and snippets.

@larssono
Last active April 5, 2018 18:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save larssono/232f587b9233a5f5e60e to your computer and use it in GitHub Desktop.
Save larssono/232f587b9233a5f5e60e to your computer and use it in GitHub Desktop.
# Stdlib / third-party imports.
import argparse
import synapseclient, synapseutils
import pandas as pd
# multiprocessing.dummy provides a thread-based Pool with the same API as
# the process Pool — appropriate here because the work is network-bound.
from multiprocessing.dummy import Pool
from synapseclient import File, Schema
from synapseclient.utils import humanizeBytes
from synapseutils.monitor import with_progress_bar

# Module-level Synapse client shared by all helpers below.
# login() with no arguments uses cached credentials / config file.
syn=synapseclient.Synapse()
syn.login()
def getOlderEntityVersions(fileEntity):
    """Return a list of every stored version of *fileEntity* (metadata only).

    A file with a single version is returned as-is, which avoids one REST
    call; otherwise each version's entity is fetched without downloading
    the underlying file content.
    """
    if fileEntity.versionNumber > 1:
        version_info = syn._GET_paginated('/entity/%s/version' % fileEntity.id, offset=1)
        return [syn.get(fileEntity.id, version=info['versionNumber'], downloadFile=False)
                for info in version_info]
    # Single-version file: nothing older to look up.
    return [fileEntity]
import itertools

# Thread pool (multiprocessing.dummy) for concurrent REST calls.
mp = Pool(10)

parser = argparse.ArgumentParser(description='Get disk usage of a container in Synapse.')
parser.add_argument('synId', help='synapse Id of folder or project to determine disk usage')
args = parser.parse_args()

# Walk the container tree and fetch metadata for every entity found.
fileEnts = []
tableEnts = []  # collected but not yet summarized (tables are not tracked)
for path, dirs, files in synapseutils.walk(syn, args.synId):
    getFunc = with_progress_bar(lambda i: syn.get(i[1], downloadFile=False), len(files),
                                prefix='Getting files in: %32s (%s)' % path)
    newEnts = mp.map(getFunc, files)
    fileEnts.extend([e for e in newEnts if isinstance(e, File)])
    tableEnts.extend([e for e in newEnts if isinstance(e, Schema)])

# Expand each file entity into all of its stored versions.
getOld = with_progress_bar(getOlderEntityVersions, len(fileEnts), prefix='getting older versions ')
# chain.from_iterable flattens the per-file lists in O(n);
# the previous sum(lists, []) idiom was quadratic.
allEnts = list(itertools.chain.from_iterable(mp.map(getOld, fileEnts)))
mp.close()

# Single-argument print() behaves identically under Python 2 and 3.
print('Naive fileSize is: %s' % synapseclient.utils.humanizeBytes(
    sum([e.fileSize for e in allEnts if e.fileSize is not None])))

# De-duplicate: first identical file handles (same handle id), then
# distinct handles that point at the same storage object (same key),
# keeping rows with a null key since those cannot alias each other.
df = pd.DataFrame([e._file_handle for e in allEnts])
df = df.drop_duplicates('id')
df = df[(~df.duplicated('key')) | (df['key'].isnull())]

# Total content size per file-handle concrete type.
summary = df.pivot_table('contentSize', 'concreteType', aggfunc=sum, fill_value=0)
for i, value in summary.iterrows():
    print('%10s\t%s' % (humanizeBytes(value['contentSize']),
                        i.replace('org.sagebionetworks.repo.model.file.', '')))
@larssono
Copy link
Author

larssono commented Aug 26, 2016

Currently does not track data usage in tables but has support for versions in files.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment