Skip to content

Instantly share code, notes, and snippets.

@jvwong
Last active September 1, 2016 20:51
Show Gist options
  • Save jvwong/1a46e9f6c967834c68f5ed99dd2fb77d to your computer and use it in GitHub Desktop.
Save jvwong/1a46e9f6c967834c68f5ed99dd2fb77d to your computer and use it in GitHub Desktop.
Merge TCGA data in separate files sourced from Genomic Data Commons
import os
import fnmatch
import json
import pandas as pd
def writeout(output_path, df):
"""
Write dataframe (df) to output path (output_path)
"""
directory = os.path.dirname(output_path)
if not os.path.exists(directory):
os.makedirs(directory)
df.to_csv(output_path, sep="\t", index=True, header=True)
### Extract and combined the RNA sequencing counts for each case
## Note 1
BASE_DIR = os.path.abspath('/Users/jeffreywong/Sync/bader_jvwong/Guide/datasets/get-data/data/GDC_TCGAOv_Counts/')
for file in os.listdir(BASE_DIR):
if fnmatch.fnmatch(file, 'metadata.cart*.json'):
metadata_file = os.path.join(BASE_DIR, file)
if fnmatch.fnmatch(file, 'gdc_manifest_*.txt'):
manifest_file = os.path.join(BASE_DIR, file)
downloads_dir = os.path.join(BASE_DIR, 'gdc_downloads')
output_dir = os.path.join(BASE_DIR, 'output')
output_file_data = os.path.join(output_dir, 'TCGAOv_counts.txt')
subtypes_file = os.path.join(output_dir, 'TCGAOv_subtypes.txt')
count_file_extension = '.htseq.counts.gz'
df_subtypes = pd.DataFrame()
df_combined = pd.DataFrame()
## Note 2
with open(subtypes_file, 'r') as sf:
df_subtypes = pd.read_table(sf, header=0, index_col=0)
with open(manifest_file, 'r') as mf:
df_manifest = pd.read_table(mf, header=0, index_col=1)
with open(metadata_file, 'r') as f:
metadatas = json.load(f)
for idx, metadata in enumerate(metadatas):
## Note 3
if "analysis" not in metadata or not metadata["analysis"]["submitter_id"]:
continue
submitter_id = metadata["analysis"]["submitter_id"]
count_file_id = submitter_id.split('_')[0]
## Note 4
if "associated_entities" not in metadata or not metadata["associated_entities"]:
continue
case_id = metadata["associated_entities"][0]["case_id"]
## Note 5
if case_id in df_combined.columns:
continue
if case_id not in df_subtypes['case_id'].values:
continue
## Note 6
count_file_name = count_file_id + count_file_extension
count_file_directory = df_manifest.ix[count_file_name]['id']
countfile_path = os.path.join(downloads_dir,
count_file_directory,
count_file_name)
if not os.path.isfile(countfile_path):
continue
df_case = pd.read_table(countfile_path,
compression='gzip',
header = None)
df_case.set_index(0, inplace=True)
df_case.columns=[case_id]
if idx == 0:
df_combined = df_case.copy(deep=True)
df_combined.index.name = 'gene_id'
df_combined.columns.name = 'case_id'
continue
## Note 7
df_combined = pd.merge(df_case,
df_combined,
how='outer',
left_index=True,
right_index=True)
## Note 8
df_combined.index = df_combined.index.map(lambda x: x.split('.')[0])
df_combined = df_combined[df_combined.index.map(lambda x: 'ENSG' in x)]
df_combined = df_combined.reindex_axis(sorted(df_combined.columns), axis=1)
writeout(output_file_data, df_combined)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment