Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save larssono/ad6c400a9946ac3c3733 to your computer and use it in GitHub Desktop.
Save larssono/ad6c400a9946ac3c3733 to your computer and use it in GitHub Desktop.
import tarfile
from StringIO import StringIO
import requests
import synapseclient
import re
import pandas as pd
# Build a normal -> tumour aliquot barcode lookup from the TCGA archives that
# the Synapse files of one platform were derived from, then add it as a new
# column to the sample annotation table.

PLATFORM = 'IlluminaHiSeq_DNASeqC'

syn = synapseclient.Synapse(skip_checks=False)
syn.login(silent=True)

# Synapse ids of all files for the platform under the given benefactor.
query = ("select name from file where platform=='%s' "
         "and benefactorId=='syn2812961'" % PLATFORM)
ids = [row['file.id'] for row in syn.chunkedQuery(query)]

# Flatten the "used" provenance entries of every file into one list.
# A nested comprehension avoids the quadratic sum(list_of_lists, []).
used = [entry
        for file_id in ids
        for entry in syn.getProvenance(file_id)['used']]

# Only entries that carry a 'url' key can point at an archive; entries
# without one (presumably entity references -- confirm against the Synapse
# provenance model) are skipped instead of raising KeyError.
urls = [entry['url'] for entry in used if 'url' in entry]

#Filter out the mage tab and non-data urls
urls = [url for url in urls if 'mage' not in url and 'tcga-data' in url]

#Download all tar files and extract content filenames
file_names = []
for url in urls:
    response = requests.get(url)
    # Fail with the HTTP error rather than an obscure tarfile.ReadError
    # caused by trying to parse an error page as a gzip archive.
    response.raise_for_status()
    # Open the archive from memory and make sure it is closed again.
    with tarfile.open(mode="r:gz",
                      fileobj=StringIO(response.content)) as archive:
        file_names.extend(archive.getnames())
file_names = [f for f in file_names if f.endswith('tsv')]

# Each entry holds the aliquot barcodes found in one file's base name.
# NOTE(review): assumes the first barcode is the tumour aliquot and the
# second the matched normal -- confirm against the archive naming scheme.
barcode_pattern = re.compile(r"TCGA-..-....-...-...-....-..")
tumour_to_normal = [barcode_pattern.findall(s.split('/')[-1])
                    for s in file_names]
# Files whose name does not contain two barcodes cannot form a pair; skip
# them instead of failing with IndexError.
normal_to_tumour = {x[1]: x[0] for x in tumour_to_normal if len(x) >= 2}

##Add the tumour pair to the existing file
df = pd.read_csv('Sample_List_Freeze_v1_1.0_Annotations_Merged_5-18-15.csv', sep='\t')
# .copy() gives an independent frame, so adding a column below does not
# write into a slice of the original (SettingWithCopyWarning).
df = df[df.platform == PLATFORM].copy()
# Barcodes without a known tumour partner get an empty string.
df['tumor_aliquot_barcode'] = [normal_to_tumour.get(barcode, '')
                               for barcode in df.aliquot_barcode]
# NOTE(review): the input is read tab-separated, but the output is written
# with to_csv defaults (comma-separated, index column included) -- confirm
# this is the intended output format.
df.to_csv('IlluminaHiSeq_DNASeqC_List_Freeze_v1_1.1_Annotations_Merged_5-18-15.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment