Skip to content

Instantly share code, notes, and snippets.

@jvwong
Created August 9, 2016 19:19
Show Gist options
  • Save jvwong/3982dad461eb508d11fe60c7b9e59311 to your computer and use it in GitHub Desktop.
Save jvwong/3982dad461eb508d11fe60c7b9e59311 to your computer and use it in GitHub Desktop.
Data munging for the TCGAOv data set

GDC Data Munging Scripts

This is for use in the Pathway Commons Guide TCGA Ovarian Cancer data set description.

Setup

  1. Install conda

  2. Create a conda environment named tcgaov. You can use the supplied environments.yml to quickly get started.

$ conda env create -f environment.yml
  3. Activate the environment
$ source activate tcgaov
discarding /Users/username/anaconda/bin from PATH
prepending /Users/username/anaconda/envs/tcgaov/bin to PATH  
(tcgaov)$
  4. Modify the Python file's main function

You will need to modify the main function of format_gdc.py to match the location of your metadata and downloads.

  • format_gdc
    • BASE_DIR: This is where your downloads, metadata folders reside
    • downloads_dir: the name of the folder where your GDC downloads reside
    • metadata_file: GDC metadata file
    • subtypes_file: Table of TCGA participant barcode and subtypes
    • output_file_filtered_data: The filtered data output file name
  5. Run
(tcgaov)$ python format_gdc.py
name: tcgaov
dependencies:
- appnope=0.1.0=py27_0
- backports=1.0=py27_0
- decorator=4.0.10=py27_0
- ecdsa=0.13=py27_0
- fabric=1.11.1=py27_0
- get_terminal_size=1.0.0=py27_0
- ipython=5.0.0=py27_0
- ipython_genutils=0.1.0=py27_0
- mkl=11.3.3=0
- numpy=1.11.1=py27_0
- openssl=1.0.2h=1
- pandas=0.18.1=np111py27_0
- paramiko=1.16.0=py27_0
- path.py=8.2.1=py27_0
- pathlib2=2.1.0=py27_0
- pexpect=4.0.1=py27_0
- pickleshare=0.7.3=py27_0
- pip=8.1.2=py27_0
- prompt_toolkit=1.0.3=py27_0
- ptyprocess=0.5.1=py27_0
- pycrypto=2.6.1=py27_4
- pygments=2.1.3=py27_0
- python=2.7.12=1
- python-dateutil=2.5.3=py27_0
- python.app=1.2=py27_4
- pytz=2016.6.1=py27_0
- readline=6.2=2
- setuptools=23.0.0=py27_0
- simplegeneric=0.8.1=py27_1
- six=1.10.0=py27_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- traitlets=4.2.2=py27_0
- wcwidth=0.1.7=py27_0
- wheel=0.29.0=py27_0
- zlib=1.2.8=3
- pip:
- backports.shutil-get-terminal-size==1.0.0
- gprofiler-official==0.2.3
- ipython-genutils==0.1.0
- prompt-toolkit==1.0.3
- requests==2.11.0
from __future__ import with_statement
import os
import csv
import json
import gzip
import pandas as pd
from numpy import nan as NA
from gprofiler import GProfiler
gp = GProfiler("GDC/0.1")
def merge_data(metadata_file, downloads_dir):
    """
    Merge the per-case FPKM-UQ data files declared in the GDC metadata.

    Each metadata record names one gzipped, two-column (gene id, value)
    file located at downloads_dir/<file_id>/<file_name>.  Every file
    becomes one column (named by its case_id) of the result, outer-joined
    (union) on the ENSG gene-id index.  Trailing version suffixes on the
    ENSG ids (e.g. 'ENSG00000000003.13' -> 'ENSG00000000003') are
    stripped from the final index.

    Parameters
    ----------
    metadata_file : str
        Path to the GDC metadata JSON file (a list of file records).
    downloads_dir : str
        Directory containing the per-file-id download folders.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'gene_id', one column per case_id.
    """
    with open(metadata_file, 'r') as f:
        metadatas = json.load(f)

    df_combined = pd.DataFrame()
    # Loop over each file metadata record
    for metadata in metadatas:
        # Records without an associated entity cannot be attributed to a case
        if not metadata["associated_entities"]:
            continue
        case_id = metadata["associated_entities"][0]["case_id"]

        # Construct the path to the gzipped FPKM-UQ file for this record
        fkpmzip_path = os.path.join(downloads_dir, metadata["file_id"], metadata["file_name"])

        # Two tab-separated columns, no header: ENSG gene id, expression value
        df_case = pd.read_table(fkpmzip_path, compression='gzip', header=None)
        # Use the ENSG ID column (0) as the index; name the value column
        df_case.set_index(0, inplace=True)
        df_case.columns = [case_id]

        # BUG FIX: the original seeded df_combined on `idx == 0`, which broke
        # whenever the first record had no associated entities (it was
        # skipped, so the frame was never seeded and the first real file was
        # merged against an empty DataFrame).  Testing emptiness is robust.
        if df_combined.empty:
            df_combined = df_case.copy(deep=True)
        else:
            # Outer join on the shared ENSG index (union of gene ids)
            df_combined = pd.merge(df_case, df_combined, how='outer',
                                   left_index=True, right_index=True)

    # Remove any trailing version digits in the ENSG gene id index
    df_combined.index = df_combined.index.map(lambda x: x.split('.')[0])
    # pd.merge can drop axis names when the two inputs disagree; restore them
    df_combined.index.name = 'gene_id'
    df_combined.columns.name = 'case_id'
    return df_combined
def assign_subtype_caseid(metadata_file, subtype_filename):
    """
    Assign a GDC case_id to each TCGA patient barcode/subtype.

    Reads the subtype table (patient barcode index) and the GDC metadata
    JSON, reconstructs the patient-level barcode for every metadata
    record, and returns the inner join (intersection) of the two tables.

    Parameters
    ----------
    metadata_file : str
        Path to the GDC metadata JSON file (a list of file records).
    subtype_filename : str
        Tab-separated table of TCGA patient barcodes and subtypes.

    Returns
    -------
    pandas.DataFrame
        Subtype rows (barcode-indexed) with an added 'case_id' column,
        restricted to barcodes present in both inputs.
    """
    # Input file with TCGA patient barcodes (index) and subtypes
    df_subtypes = pd.read_table(subtype_filename, index_col=0, header=0)

    # One case_id per TCGA barcode (aka participant aka entity_submitter_id)
    df_gdc = pd.DataFrame(columns=['case_id'])
    with open(metadata_file, 'r') as f:
        metadatas = json.load(f)

    # Loop over each file metadata record
    for metadata in metadatas:
        # Skip records that cannot be attributed to a case
        if not metadata["associated_entities"]:
            continue
        entity = metadata["associated_entities"][0]
        case_id = entity["case_id"]
        entity_submitter_id = entity["entity_submitter_id"]
        # Reconstruct the patient-level barcode 'TCGA-<TSS>-<participant>'
        barcode = '-'.join(entity_submitter_id.split('-')[0:3])
        # BUG FIX: DataFrame.ix was removed in pandas 1.0; .loc is the
        # label-based replacement.
        df_gdc.loc[barcode] = case_id

    df_gdc.index.name = 'patient'
    # Inner merge: keep only barcodes present in both tables
    return pd.merge(df_subtypes, df_gdc, how='inner', left_index=True, right_index=True)
def writeout(output_path, df):
    """
    Write the dataframe (df) to output path (output_path) as a
    tab-separated file, creating the parent directory if needed.

    Parameters
    ----------
    output_path : str
        Destination file path; its parent directory is created on demand.
    df : pandas.DataFrame
        Data to write (index and header are both emitted).
    """
    directory = os.path.dirname(output_path)
    # BUG FIX: dirname is '' for a bare filename, and os.makedirs('')
    # raises.  exist_ok=True also removes the original's
    # check-then-create race between os.path.exists and os.makedirs.
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_csv(output_path, sep="\t", index=True, header=True)
def format_gdc(metadata_file, downloads_dir, subtypes_file, output_dir, output_file_data, output_file_subtype):
    """
    Munge the TCGA data downloaded from GDC.

    Extracts and combines the individual .gz data files, finds a case_id
    UUID for each TCGA barcode/subtype in subtypes_file, keeps only the
    data columns that have an associated subtype, and dumps both the
    subtype table (output_file_subtype) and the filtered data table
    (output_file_data) as tab-separated files.
    """
    # Combine all per-case FPKM-UQ files into a single table
    merged = merge_data(metadata_file, downloads_dir)

    # Intersect the subtype table with the cases present in the metadata
    subtyped_cases = assign_subtype_caseid(metadata_file, subtypes_file)
    writeout(output_file_subtype, subtyped_cases)

    # Keep only the data columns whose case_id appears in the subtype table
    filtered = merged.reindex(columns=subtyped_cases['case_id'])
    writeout(output_file_data, filtered)
def conversion_map(query, target='HGNC', keys='query'):
    """
    Convert query id list (possibly mixed) to target id format (human).

    See GProfiler.gconvert (http://biit.cs.ut.ee/gprofiler/gconvert.cgi)
    for available targets.  Rows whose mapping came back as None are
    dropped from the result.

    Parameters
    ----------
    query : list
        Gene identifiers to convert.
    target : str
        Target namespace (default 'HGNC').
    keys : str
        Unused; retained for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        g:Profiler result rows with a non-null 'converted_alias'.
    """
    converted = gp.gconvert(query, organism='hsapiens', target=target)
    df_mapped = pd.DataFrame(converted, columns=['g#', 'initial_alias', 'c#', 'converted_alias', 'name', 'description', 'namespace'])
    # BUG FIX: fillna returns a new frame; the original discarded the
    # result, leaving the no-op.  Assign it so None becomes np.nan.
    df_mapped = df_mapped.fillna(value=NA)
    # Drop rows where the target 'converted_alias' is NA (no mapping found)
    df_id_map_filtered = df_mapped.dropna(subset=['converted_alias'])
    return df_id_map_filtered
def filter_gdc(input_file_data, output_path):
    """
    Filter a (gene id x sample) data file to rows with valid HGNC symbols.

    Reads input_file_data, keeps only the gene-id rows that g:Profiler
    maps to an HGNC symbol, re-indexes the result by that symbol, and
    dumps it as a tab-separated file at output_path.
    """
    df_data = pd.read_table(input_file_data, header=0, index_col=0)

    # Map the data's gene ids to HGNC symbols via g:Profiler
    id_map = conversion_map(list(df_data.index.values), target='HGNC')

    # Inner merge keeps only rows with a matching 'initial_alias'
    matched = pd.merge(df_data, id_map, how='inner', left_index=True, right_on='initial_alias')

    # Re-index by HGNC symbol and drop the g:Profiler bookkeeping columns
    matched.index = matched['converted_alias']
    bookkeeping = ['g#', 'c#', 'converted_alias', 'initial_alias', 'name', 'description', 'namespace']
    matched = matched.drop(bookkeeping, axis=1)
    matched.index.name = 'HGNC'
    matched.columns.name = 'case_id'

    writeout(output_path, matched)
if __name__ == "__main__":
    # Input locations -- edit these to match your local download layout
    BASE_DIR = os.path.abspath('/Users/username/Downloads/TCGAOV_data/')
    downloads_dir = os.path.join(BASE_DIR, 'gdc_download_20160803')
    metadata_file = os.path.join(BASE_DIR, 'metadata.cart.2016-08-03T16-04-06.289487.json')
    subtypes_file = os.path.join(BASE_DIR, 'SupplementaryDataTable4.txt')

    # Output locations
    output_dir = os.path.join(BASE_DIR, 'output')
    output_file_data = os.path.join(output_dir, 'TCGAOv_data.txt')
    output_file_subtype = os.path.join(output_dir, 'TCGAOv_subtypes.txt')
    output_file_filtered_data = os.path.join(output_dir, 'TCGAOv_data_filtered.txt')

    # Munge the raw GDC download, then filter it to valid HGNC symbols
    format_gdc(metadata_file, downloads_dir, subtypes_file, output_dir,
               output_file_data, output_file_subtype)
    filter_gdc(output_file_data, output_file_filtered_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment