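"""diskusage.py -- report the storage used by a Synapse project or folder.

Forked from larssono/diskusage.py.

Walks the container with synapseutils.walk, fetches metadata (not file contents)
for every file, expands files to all of their versions, deduplicates the
underlying file handles, and prints the total size per storage type.

Usage (the Synapse ID below is only an example):
    python diskusage.py syn1234567
"""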
import argparse
from multiprocessing.dummy import Pool

import pandas as pd

import synapseclient
import synapseutils
from synapseclient import File, Schema
from synapseclient.utils import humanizeBytes
from synapseutils.monitor import with_progress_bar

syn = synapseclient.Synapse()
syn.login()  # With no arguments, uses cached credentials or the Synapse config file.

def getOlderEntityVersions(fileEntity):
    """Return all versions of a file entity (including older ones, if they exist)."""
    synId, versionNumber = fileEntity.id, fileEntity.versionNumber
    if versionNumber > 1:  # Skip one REST call for single-version files.
        versions = syn._GET_paginated('/entity/%s/version' % synId, offset=1)
        return [syn.get(synId, version=v['versionNumber'], downloadFile=False) for v in versions]
    else:
        return [fileEntity]
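
# Thread pool (multiprocessing.dummy uses threads, not processes) for issuing
# Synapse REST calls in parallel.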
mp = Pool(10)
parser = argparse.ArgumentParser(description='Get disk usage of a container in Synapse.')
parser.add_argument('synId', help='Synapse ID of the folder or project whose disk usage to determine')
args = parser.parse_args()
fileEnts = []
tableEnts = []
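
# Walk the container tree and fetch entity metadata (downloadFile=False) for each
# file; collect File entities and table Schemas separately.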
for path, dirs, files in synapseutils.walk(syn, args.synId):
    getFunc = with_progress_bar(lambda i: syn.get(i[1], downloadFile=False), len(files),
                                prefix='Getting files in: %32s (%s)' % path)
    newEnts = mp.map(getFunc, files)
    fileEnts.extend([e for e in newEnts if isinstance(e, File)])
    tableEnts.extend([e for e in newEnts if isinstance(e, Schema)])
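
# Expand each file entity to every version it has, so storage used by older
# versions is counted as well.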
getOld = with_progress_bar(getOlderEntityVersions, len(fileEnts), prefix='Getting older versions ')
allEnts = sum(mp.map(getOld, fileEnts), [])  # Flatten the per-file lists of versions.
print('Naive fileSize (before deduplication) is: %s' %
      humanizeBytes(sum([e.fileSize for e in allEnts if e.fileSize is not None])))
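
# Build a table of the underlying file handles, drop duplicates (first by file
# handle id, then by storage key) so each stored object is counted once, and
# summarize total contentSize per storage type (concreteType).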
df = pd.DataFrame([e._file_handle for e in allEnts])
df = df.drop_duplicates('id')  # Drop duplicate file handles
df = df[(~df.duplicated('key')) | (df['key'].isnull())]  # Drop duplicate references to the same stored object
summary = df.pivot_table('contentSize', 'concreteType', aggfunc=sum, fill_value=0)
for concreteType, row in summary.iterrows():
    print('%10s\t%s' % (humanizeBytes(row['contentSize']),
                        concreteType.replace('org.sagebionetworks.repo.model.file.', '')))