Created
May 21, 2015 23:31
-
-
Save larssono/ad6c400a9946ac3c3733 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tarfile | |
from StringIO import StringIO | |
import requests | |
import synapseclient | |
import re | |
import pandas as pd | |
# Pair every normal aliquot barcode with its tumour aliquot barcode, using the
# file names found inside the TCGA archives referenced by Synapse provenance,
# and write the pairing into a copy of the sample annotation table.

# Connect to Synapse; no credentials are passed explicitly to login().
syn = synapseclient.Synapse(skip_checks=False)
syn.login(silent=True)

# Ids of the files matching the platform / benefactor query.
ids = [row['file.id'] for row in syn.chunkedQuery("select name from file where platform=='IlluminaHiSeq_DNASeqC' and benefactorId=='syn2812961'")]

# Flatten the "used" provenance entries of all files into one list. A nested
# comprehension is linear, whereas sum(lists, []) re-copies the accumulator on
# every addition; it also avoids shadowing the builtin `id`.
used = [entry for syn_id in ids for entry in syn.getProvenance(syn_id)['used']]
urls = [entry['url'] for entry in used]

# Filter out the mage tab and non-data urls
urls = [url for url in urls if 'mage' not in url and 'tcga-data' in url]

# Download all tar files and extract content filenames
file_names = []
for url in urls:
    response = requests.get(url)
    # Fail with a clear HTTP error instead of an opaque tarfile error when the
    # server returns an error page rather than an archive.
    response.raise_for_status()
    # The context manager closes each archive as soon as its names are read.
    with tarfile.open(mode="r:gz", fileobj=StringIO(response.content)) as archive:
        file_names.extend(archive.getnames())
file_names = [name for name in file_names if name.endswith('tsv')]

# Pull the aliquot barcodes out of each file's base name. The code treats the
# first barcode as the tumour and the second as the matched normal.
barcode_pattern = re.compile(r"TCGA-..-....-...-...-....-..")
tumour_to_normal = [barcode_pattern.findall(path.split('/')[-1]) for path in file_names]
# Skip names that do not carry two barcodes; indexing them would raise
# IndexError and abort the whole run.
normal_to_tumour = {pair[1]: pair[0] for pair in tumour_to_normal if len(pair) >= 2}

##Add the tumour pair to the existing file
df = pd.read_csv('Sample_List_Freeze_v1_1.0_Annotations_Merged_5-18-15.csv', sep='\t')
# Take an explicit copy so that adding a column below does not operate on a
# filtered view of the original frame (avoids pandas' SettingWithCopyWarning).
df = df[df.platform == 'IlluminaHiSeq_DNASeqC'].copy()
# Rows whose aliquot barcode has no known tumour partner get an empty string.
df['tumor_aliquot_barcode'] = [normal_to_tumour.get(barcode, '') for barcode in df.aliquot_barcode]
# NOTE(review): the input is read tab-separated but the output is written with
# to_csv defaults; confirm that the change of delimiter is intended.
df.to_csv('IlluminaHiSeq_DNASeqC_List_Freeze_v1_1.1_Annotations_Merged_5-18-15.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment