Skip to content

Instantly share code, notes, and snippets.

@jvwong
Created August 9, 2016 19:19
Show Gist options
  • Save jvwong/3982dad461eb508d11fe60c7b9e59311 to your computer and use it in GitHub Desktop.
Save jvwong/3982dad461eb508d11fe60c7b9e59311 to your computer and use it in GitHub Desktop.
Data munging for the TCGAOv data set

GDC Data Munging Scripts

This is for use in the Pathway Commons Guide TCGA Ovarian Cancer data set description.

Setup

  1. Install conda

  2. Create a conda environment named tcgaov. You can use the supplied environments.yml to quickly get started.

$ conda env create -f environment.yml
  3. Activate the environment
$ source activate tcgaov
discarding /Users/username/anaconda/bin from PATH
prepending /Users/username/anaconda/envs/tcgaov/bin to PATH  
(tcgaov)$
  4. Modify the Python file's main function

You will need to modify the main function of format_gdc.py to match the location of your metadata and downloads.

  • format_gdc
    • BASE_DIR: This is where your downloads, metadata folders reside
    • downloads_dir: the name of the folder where your GDC downloads reside
    • metadata_file: GDC metadata file
    • subtypes_file: Table of TCGA participant barcode and subtypes
    • output_file_filtered_data: The filtered data output file name
  5. Run
(tcgaov)$ python format_gdc.py
name: tcgaov
dependencies:
- appnope=0.1.0=py27_0
- backports=1.0=py27_0
- decorator=4.0.10=py27_0
- ecdsa=0.13=py27_0
- fabric=1.11.1=py27_0
- get_terminal_size=1.0.0=py27_0
- ipython=5.0.0=py27_0
- ipython_genutils=0.1.0=py27_0
- mkl=11.3.3=0
- numpy=1.11.1=py27_0
- openssl=1.0.2h=1
- pandas=0.18.1=np111py27_0
- paramiko=1.16.0=py27_0
- path.py=8.2.1=py27_0
- pathlib2=2.1.0=py27_0
- pexpect=4.0.1=py27_0
- pickleshare=0.7.3=py27_0
- pip=8.1.2=py27_0
- prompt_toolkit=1.0.3=py27_0
- ptyprocess=0.5.1=py27_0
- pycrypto=2.6.1=py27_4
- pygments=2.1.3=py27_0
- python=2.7.12=1
- python-dateutil=2.5.3=py27_0
- python.app=1.2=py27_4
- pytz=2016.6.1=py27_0
- readline=6.2=2
- setuptools=23.0.0=py27_0
- simplegeneric=0.8.1=py27_1
- six=1.10.0=py27_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- traitlets=4.2.2=py27_0
- wcwidth=0.1.7=py27_0
- wheel=0.29.0=py27_0
- zlib=1.2.8=3
- pip:
- backports.shutil-get-terminal-size==1.0.0
- gprofiler-official==0.2.3
- ipython-genutils==0.1.0
- prompt-toolkit==1.0.3
- requests==2.11.0
from __future__ import with_statement
import os
import csv
import json
import gzip
import pandas as pd
from numpy import nan as NA
from gprofiler import GProfiler
gp = GProfiler("GDC/0.1")
def merge_data(metadata_file, downloads_dir):
    """
    Merge the per-case FPKM-UQ data files declared in the GDC metadata.

    Each metadata record names one gzipped, two-column (gene id, value)
    file located at downloads_dir/<file_id>/<file_name>.  Every file
    becomes one column (named by its case_id) of the result, outer-joined
    (union) on the ENSG gene-id index.  Trailing version suffixes on the
    ENSG ids (e.g. 'ENSG00000000003.13' -> 'ENSG00000000003') are
    stripped from the final index.

    Parameters
    ----------
    metadata_file : str
        Path to the GDC metadata JSON file (a list of file records).
    downloads_dir : str
        Directory containing the per-file-id download folders.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'gene_id', one column per case_id.
    """
    with open(metadata_file, 'r') as f:
        metadatas = json.load(f)

    df_combined = pd.DataFrame()
    # Loop over each file metadata record
    for metadata in metadatas:
        # Records without an associated entity cannot be attributed to a case
        if not metadata["associated_entities"]:
            continue
        case_id = metadata["associated_entities"][0]["case_id"]

        # Construct the path to the gzipped FPKM-UQ file for this record
        fkpmzip_path = os.path.join(downloads_dir, metadata["file_id"], metadata["file_name"])

        # Two tab-separated columns, no header: ENSG gene id, expression value
        df_case = pd.read_table(fkpmzip_path, compression='gzip', header=None)
        # Use the ENSG ID column (0) as the index; name the value column
        df_case.set_index(0, inplace=True)
        df_case.columns = [case_id]

        # BUG FIX: the original seeded df_combined on `idx == 0`, which broke
        # whenever the first record had no associated entities (it was
        # skipped, so the frame was never seeded and the first real file was
        # merged against an empty DataFrame).  Testing emptiness is robust.
        if df_combined.empty:
            df_combined = df_case.copy(deep=True)
        else:
            # Outer join on the shared ENSG index (union of gene ids)
            df_combined = pd.merge(df_case, df_combined, how='outer',
                                   left_index=True, right_index=True)

    # Remove any trailing version digits in the ENSG gene id index
    df_combined.index = df_combined.index.map(lambda x: x.split('.')[0])
    # pd.merge can drop axis names when the two inputs disagree; restore them
    df_combined.index.name = 'gene_id'
    df_combined.columns.name = 'case_id'
    return df_combined
def assign_subtype_caseid(metadata_file, subtype_filename):
    """
    Assign a GDC case_id to each TCGA patient barcode/subtype.

    Reads the subtype table (patient barcode index) and the GDC metadata
    JSON, reconstructs the patient-level barcode for every metadata
    record, and returns the inner join (intersection) of the two tables.

    Parameters
    ----------
    metadata_file : str
        Path to the GDC metadata JSON file (a list of file records).
    subtype_filename : str
        Tab-separated table of TCGA patient barcodes and subtypes.

    Returns
    -------
    pandas.DataFrame
        Subtype rows (barcode-indexed) with an added 'case_id' column,
        restricted to barcodes present in both inputs.
    """
    # Input file with TCGA patient barcodes (index) and subtypes
    df_subtypes = pd.read_table(subtype_filename, index_col=0, header=0)

    # One case_id per TCGA barcode (aka participant aka entity_submitter_id)
    df_gdc = pd.DataFrame(columns=['case_id'])
    with open(metadata_file, 'r') as f:
        metadatas = json.load(f)

    # Loop over each file metadata record
    for metadata in metadatas:
        # Skip records that cannot be attributed to a case
        if not metadata["associated_entities"]:
            continue
        entity = metadata["associated_entities"][0]
        case_id = entity["case_id"]
        entity_submitter_id = entity["entity_submitter_id"]
        # Reconstruct the patient-level barcode 'TCGA-<TSS>-<participant>'
        barcode = '-'.join(entity_submitter_id.split('-')[0:3])
        # BUG FIX: DataFrame.ix was removed in pandas 1.0; .loc is the
        # label-based replacement.
        df_gdc.loc[barcode] = case_id

    df_gdc.index.name = 'patient'
    # Inner merge: keep only barcodes present in both tables
    return pd.merge(df_subtypes, df_gdc, how='inner', left_index=True, right_index=True)
def writeout(output_path, df):
    """
    Write the dataframe (df) to output path (output_path) as a
    tab-separated file, creating the parent directory if needed.

    Parameters
    ----------
    output_path : str
        Destination file path; its parent directory is created on demand.
    df : pandas.DataFrame
        Data to write (index and header are both emitted).
    """
    directory = os.path.dirname(output_path)
    # BUG FIX: dirname is '' for a bare filename, and os.makedirs('')
    # raises.  exist_ok=True also removes the original's
    # check-then-create race between os.path.exists and os.makedirs.
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_csv(output_path, sep="\t", index=True, header=True)
def format_gdc(metadata_file, downloads_dir, subtypes_file, output_dir, output_file_data, output_file_subtype):
    """
    Munge the TCGA data downloaded from GDC.

    Extracts and combines the individual .gz data files, finds a case_id
    UUID for each TCGA barcode/subtype in subtypes_file, keeps only the
    data columns that have an associated subtype, and dumps both the
    subtype table (output_file_subtype) and the filtered data table
    (output_file_data) as tab-separated files.
    """
    # Combine all per-case FPKM-UQ files into a single table
    merged = merge_data(metadata_file, downloads_dir)

    # Intersect the subtype table with the cases present in the metadata
    subtyped_cases = assign_subtype_caseid(metadata_file, subtypes_file)
    writeout(output_file_subtype, subtyped_cases)

    # Keep only the data columns whose case_id appears in the subtype table
    filtered = merged.reindex(columns=subtyped_cases['case_id'])
    writeout(output_file_data, filtered)
def conversion_map(query, target='HGNC', keys='query'):
    """
    Convert query id list (possibly mixed) to target id format (human).

    See GProfiler.gconvert (http://biit.cs.ut.ee/gprofiler/gconvert.cgi)
    for available targets.  Rows whose mapping came back as None are
    dropped from the result.

    Parameters
    ----------
    query : list
        Gene identifiers to convert.
    target : str
        Target namespace (default 'HGNC').
    keys : str
        Unused; retained for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        g:Profiler result rows with a non-null 'converted_alias'.
    """
    converted = gp.gconvert(query, organism='hsapiens', target=target)
    df_mapped = pd.DataFrame(converted, columns=['g#', 'initial_alias', 'c#', 'converted_alias', 'name', 'description', 'namespace'])
    # BUG FIX: fillna returns a new frame; the original discarded the
    # result, leaving the no-op.  Assign it so None becomes np.nan.
    df_mapped = df_mapped.fillna(value=NA)
    # Drop rows where the target 'converted_alias' is NA (no mapping found)
    df_id_map_filtered = df_mapped.dropna(subset=['converted_alias'])
    return df_id_map_filtered
def filter_gdc(input_file_data, output_path):
    """
    Filter a (gene id x sample) data file to rows with valid HGNC symbols.

    Reads input_file_data, keeps only the gene-id rows that g:Profiler
    maps to an HGNC symbol, re-indexes the result by that symbol, and
    dumps it as a tab-separated file at output_path.
    """
    df_data = pd.read_table(input_file_data, header=0, index_col=0)

    # Map the data's gene ids to HGNC symbols via g:Profiler
    id_map = conversion_map(list(df_data.index.values), target='HGNC')

    # Inner merge keeps only rows with a matching 'initial_alias'
    matched = pd.merge(df_data, id_map, how='inner', left_index=True, right_on='initial_alias')

    # Re-index by HGNC symbol and drop the g:Profiler bookkeeping columns
    matched.index = matched['converted_alias']
    bookkeeping = ['g#', 'c#', 'converted_alias', 'initial_alias', 'name', 'description', 'namespace']
    matched = matched.drop(bookkeeping, axis=1)
    matched.index.name = 'HGNC'
    matched.columns.name = 'case_id'

    writeout(output_path, matched)
if __name__ == "__main__":
    # Input locations -- edit these to match your local download layout
    BASE_DIR = os.path.abspath('/Users/username/Downloads/TCGAOV_data/')
    downloads_dir = os.path.join(BASE_DIR, 'gdc_download_20160803')
    metadata_file = os.path.join(BASE_DIR, 'metadata.cart.2016-08-03T16-04-06.289487.json')
    subtypes_file = os.path.join(BASE_DIR, 'SupplementaryDataTable4.txt')

    # Output locations
    output_dir = os.path.join(BASE_DIR, 'output')
    output_file_data = os.path.join(output_dir, 'TCGAOv_data.txt')
    output_file_subtype = os.path.join(output_dir, 'TCGAOv_subtypes.txt')
    output_file_filtered_data = os.path.join(output_dir, 'TCGAOv_data_filtered.txt')

    # Munge the raw GDC download, then filter it to valid HGNC symbols
    format_gdc(metadata_file, downloads_dir, subtypes_file, output_dir,
               output_file_data, output_file_subtype)
    filter_gdc(output_file_data, output_file_filtered_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment