Skip to content

Instantly share code, notes, and snippets.

@larssono
Created March 7, 2015 10:21
Show Gist options
  • Save larssono/93780dae4d15c67421af to your computer and use it in GitHub Desktop.
Save larssono/93780dae4d15c67421af to your computer and use it in GitHub Desktop.
Create a summary of the TCGA data in Synapse
import synapseclient
syn = synapseclient.login()
import pandas as pd
import synapseHelpers
from multiprocessing.dummy import Pool
# Synapse query selecting every file entity whose benefactor is syn2812961,
# excluding files whose fileType is 'clinicalMatrix' or 'maf'.
# Adjacent string literals are concatenated verbatim, so each fragment except
# the last ends with a space to keep the clauses separated.
QUERY = ("select * from file where benefactorId=='syn2812961' "
         "and fileType!='clinicalMatrix' "
         "and fileType!='maf'")
def countContent(input):
    """Build one metadata row per sample found in a single Synapse file.

    The file is fetched through the module-level ``syn`` client, read as a
    tab-separated table, and the file's annotations are repeated once for
    every sample it contains.

    Args:
        input: An ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``. The row must provide ``id``, ``name``
            and ``fileType``.

    Returns:
        A ``pandas.DataFrame`` with one row per sample. Each row carries the
        file's annotations plus two extra columns: ``nFeatures`` (number of
        data rows in the file, always 0 for ``bed5`` files) and ``samples``
        (the sample identifier).
    """
    i, fileMeta = input
    # Format the progress line as a single string so the same line is valid
    # (and prints identically) under both Python 2 and Python 3.
    print("%s %s %s" % (i, fileMeta.id, fileMeta['name']))
    path = syn.get(fileMeta.id).path
    if fileMeta.fileType == 'bed5':
        # bed5 files are read without a header row; column 3 is used as the
        # sample identifier.
        data = pd.read_csv(path, sep='\t', header=None)
        nFeatures = 0
        # unique() keeps order of first appearance, so the resulting rows are
        # reproducible between runs (a set has no defined order).
        samples = data[3].unique()
    else:  # All other fileTypes
        # Rows are counted as features and column labels are the samples.
        data = pd.read_csv(path, sep='\t', index_col=0)
        nFeatures = data.shape[0]
        samples = data.columns
    # Repeat the file's annotations once per sample, then attach the
    # per-file feature count and the per-row sample identifier.
    metadata = pd.DataFrame([fileMeta] * len(samples))
    metadata['nFeatures'] = nFeatures
    metadata['samples'] = samples
    return metadata
# Summarize every file returned by QUERY, five files at a time.
# ``syn`` is the client created by the login at the top of the file, so it is
# reused here instead of logging in a second time.
p = Pool(5)
try:
    files = synapseHelpers.query2df(syn.chunkedQuery(QUERY))
    dfs = p.map(countContent, files.iterrows())
finally:
    # Always release the worker threads, even if a download or parse fails.
    p.close()
    p.join()

# One row per (file, sample) across all files.
metadata = pd.concat(dfs)
# The first 12 characters of each sample name are taken as the patient
# barcode (presumably the TCGA participant prefix -- confirm).
metadata['patient_barcode'] = [x[:12] for x in metadata.samples]
metadata.drop(['projectId'], axis=1, inplace=True)
metadata['nFeatures'] = metadata['nFeatures'].astype('int')
metadata.to_csv('all_sample_info.tsv', sep='\t')
#Create table
# Adjustments applied to the column definitions inferred from the data frame:
# column name -> (field to set, value).
column_overrides = {
    'patient_barcode': ('maximumSize', 13),
    'id': ('columnType', 'ENTITYID'),
    'acronym': ('maximumSize', 10),
}
cols = synapseclient.as_table_columns(metadata)
for col in cols:
    override = column_overrides.get(col['name'])
    if override is not None:
        field, value = override
        col[field] = value
# Store the schema together with the collected rows under syn300013.
schema = synapseclient.Schema(
    name='All Sample Info',
    columns=cols,
    parent='syn300013',
)
table = syn.store(synapseclient.Table(schema, metadata))
################
#Using the table
#################
##Summarize the number of files per patient (and disease acronym) for each platform
# table = syn.tableQuery('SELECT * FROM syn3281840')
# df = table.asDataFrame()
# sample_counts = df.pivot_table('basename',
# rows=['patient_barcode', 'acronym'],
# cols=['platform'], aggfunc=len,
# fill_value='')
# filename= 'sample_level_data.tsv'
# sample_counts.to_csv(filename, sep='\t', float_format='%g', na_rep='')
# syn.store(File(filename, parentId='syn3242745'),
# used=['syn3281840'],
# executed=[synapseHelpers.thisCodeInSynapse(parentId='syn1774100')])
##Create a summary of the number of samples for each platform and disease
# x= df.pivot_table('patient_barcode', cols=['acronym'], rows=['platform'], aggfunc=lambda x:len(set(x)))
# x.to_csv('skit.csv', sep='\t', float_format='%g', na_rep=' ')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment