|
from __future__ import with_statement |
|
import os |
|
import csv |
|
import json |
|
import gzip |
|
import pandas as pd |
|
from numpy import nan as NA |
|
from gprofiler import GProfiler |
|
# Module-level g:Profiler client shared by conversion_map().
# NOTE(review): the "GDC/0.1" argument presumably identifies this client
# to the g:Profiler service — confirm against the gprofiler package docs.
gp = GProfiler("GDC/0.1")
|
|
|
def merge_data(metadata_file, downloads_dir):
    """
    Merge the data files declared in the metadata.

    Cleans the ENSG gene-id index of version suffixes and returns a
    single dataframe.

    Parameters
    ----------
    metadata_file : str
        Path to the GDC metadata cart JSON: a list of file records, each
        with "file_id", "file_name" and an "associated_entities" list.
    downloads_dir : str
        Directory containing one sub-directory per file_id, each holding
        the gzipped, tab-separated FPKM-UQ file named in the record.

    Returns
    -------
    pandas.DataFrame
        One column per case_id (union of genes across files, outer
        merge), indexed by ENSG gene id with any trailing ".N" version
        suffix stripped.
    """
    with open(metadata_file, 'r') as f:
        metadatas = json.load(f)

    df_combined = pd.DataFrame()

    # Loop over each file metadata record
    for metadata in metadatas:
        # Records without an associated entity cannot be mapped to a
        # case id, so they are skipped.
        if not metadata["associated_entities"]:
            continue
        case_id = metadata["associated_entities"][0]["case_id"]

        # Construct the path to the FPKM gzipped (*FKPM-UQ.gz) file
        fkpmzip_path = os.path.join(downloads_dir, metadata["file_id"],
                                    metadata["file_name"])

        # unzip and read in as a two-column (gene id, value) dataframe
        df_case = pd.read_table(fkpmzip_path, compression='gzip', header=None)
        # set ENSG ID column (0) as index and name the single data
        # column after the case
        df_case.set_index(0, inplace=True)
        df_case.columns = [case_id]

        if df_combined.empty:
            # First usable record seeds the combined frame.
            # BUG FIX: the original tested the loop index (idx == 0), so
            # if record 0 had no associated entities the seed branch
            # never ran and the axis names were never set.  Testing
            # emptiness is robust to skipped leading records.
            df_combined = df_case.copy(deep=True)
            df_combined.index.name = 'gene_id'
            df_combined.columns.name = 'case_id'
            continue

        # merge on ENSG index as the common key; outer join = union
        df_combined = pd.merge(df_case, df_combined, how='outer',
                               left_index=True, right_index=True)

    ## Remove any trailing decimal digits in the ENSG gene id index,
    ## e.g. 'ENSG00000000003.13' -> 'ENSG00000000003'
    df_combined.index = df_combined.index.map(lambda x: x.split('.')[0])

    return df_combined
|
|
|
|
|
def assign_subtype_caseid(metadata_file, subtype_filename):
    """
    Assign to each TCGA patient barcode and subtype contained in the
    subtype_filename a GDC case_id.  Return the intersection.

    Parameters
    ----------
    metadata_file : str
        GDC metadata cart JSON; each record's first associated entity
        supplies a case_id and an entity_submitter_id (TCGA barcode).
    subtype_filename : str
        Tab-separated file whose first column is the TCGA patient
        barcode ('TCGA-<TSS>-<participant>') index.

    Returns
    -------
    pandas.DataFrame
        Subtype rows restricted (inner join) to barcodes that could be
        assigned a case_id; gains a 'case_id' column.
    """
    ## input file with TCGA patient barcodes and subtypes
    df_subtypes = pd.read_table(subtype_filename, index_col=0, header=0)

    ## DataFrame which will have a case id assigned for each
    ## TCGA barcode aka participant aka entity_submitter_id
    df_gdc = pd.DataFrame(columns=['case_id'])

    with open(metadata_file, 'r') as f:
        metadatas = json.load(f)

    # Loop over each file metadata record
    for metadata in metadatas:
        # Skip records with no associated entity to read ids from
        if not metadata["associated_entities"]:
            continue
        entity = metadata["associated_entities"][0]
        case_id = entity["case_id"]
        entity_submitter_id = entity["entity_submitter_id"]
        # Reconstruct the TCGA barcode 'TCGA-<TSS>-<participant>' from
        # the full submitter id (drop sample/portion/plate suffixes)
        barcode = '-'.join(entity_submitter_id.split('-')[0:3])

        # BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and
        # removed in 1.0; .loc performs the same label-based assignment.
        df_gdc.loc[barcode] = case_id

    df_gdc.index.name = 'patient'

    # merge the two: inner join keeps only barcodes present in both
    # (an intersection, not a union as the original comment claimed)
    return pd.merge(df_subtypes, df_gdc, how='inner',
                    left_index=True, right_index=True)
|
|
|
|
|
def writeout(output_path, df):
    """
    Write the dataframe (df) to output path (output_path) as a
    tab-separated file, creating the parent directory if needed.

    Parameters
    ----------
    output_path : str
        Destination file path; its directory is created if missing.
    df : pandas.DataFrame
        Frame to dump (index and header are both written).
    """
    directory = os.path.dirname(output_path)
    # BUG FIX: for a bare filename, dirname() returns '' and the
    # original unconditionally called os.makedirs(''), which raises.
    # Only create the directory when there is one and it is missing.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    df.to_csv(output_path, sep="\t", index=True, header=True)
|
|
|
|
|
def format_gdc(metadata_file, downloads_dir, subtypes_file, output_dir, output_file_data, output_file_subtype):
    """
    Munge the TCGA data downloaded from GDC.

    Extracts and combines the individual .gz data files, finds a case_id
    UUID for each TCGA barcode/subtype in subtypes_file, keeps only the
    data columns that have an associated subtype, and dumps both results
    as tab-separated files.
    """
    ### combine all per-case FPKM-UQ files into one table
    df_data = merge_data(metadata_file, downloads_dir)

    ### subtype rows restricted (intersection) to those with a case id,
    ### written out alongside the data
    df_subtypes = assign_subtype_caseid(metadata_file, subtypes_file)
    writeout(output_file_subtype, df_subtypes)

    ## keep only the data columns whose case_id carries a subtype
    ## (column reindex by the subtype table's case_id values)
    subtyped_cases = df_subtypes['case_id']
    df_data_subtyped = df_data.reindex(columns=subtyped_cases)
    writeout(output_file_data, df_data_subtyped)
|
|
|
def conversion_map(query, target='HGNC', keys='query'):
    """
    Convert query id list (possibly mixed) to target id format (human).

    See GProfiler.gconvert (http://biit.cs.ut.ee/gprofiler/gconvert.cgi)
    for targets.  Filters out results where the mapping is 'None'.

    Parameters
    ----------
    query : list
        Identifiers to convert (human / hsapiens).
    target : str
        Target namespace for the conversion (default 'HGNC').
    keys : str
        Unused; retained for backward compatibility with callers.

    Returns
    -------
    pandas.DataFrame
        One row per mapping, with rows whose 'converted_alias' could
        not be resolved dropped.
    """
    converted = gp.gconvert(query, organism='hsapiens', target=target)
    df_mapped = pd.DataFrame(converted,
                             columns=['g#', 'initial_alias', 'c#',
                                      'converted_alias', 'name',
                                      'description', 'namespace'])

    ## Replace None values with np.nan as NA.
    ## BUG FIX: fillna returns a new frame; the original discarded the
    ## result, making the call a no-op.  Bind the result instead.
    df_mapped = df_mapped.fillna(value=NA)

    ## Drop rows where the target 'converted_alias' is NA
    df_id_map_filtered = df_mapped.dropna(subset=['converted_alias'])

    return df_id_map_filtered
|
|
|
|
|
def filter_gdc(input_file_data, output_path):
    """
    Takes an input data file (gene ids, samples) and filters it for
    those gene id rows having a valid HGNC symbol as determined by
    g:Profiler.  Dumps a tab-separated file to output_path.
    """
    df_data = pd.read_table(input_file_data, header=0, index_col=0)

    ## Map the data's gene ids to HGNC ids via g:Profiler
    df_id_map = conversion_map(list(df_data.index.values), target='HGNC')

    ## Inner merge keeps only rows of df_data whose id matches a
    ## df_id_map['initial_alias'] value (intersection)
    df_combined = pd.merge(df_data, df_id_map, how='inner',
                           left_index=True, right_on='initial_alias')

    ## Clean up: re-index on the HGNC symbol, then drop the
    ## g:Profiler bookkeeping columns
    df_combined.index = df_combined['converted_alias']
    bookkeeping = ['g#', 'c#', 'converted_alias', 'initial_alias',
                   'name', 'description', 'namespace']
    df_combined = df_combined.drop(bookkeeping, axis=1)
    df_combined.index.name = 'HGNC'
    df_combined.columns.name = 'case_id'

    writeout(output_path, df_combined)
|
|
|
if __name__ == "__main__":
    ## Input locations: the GDC download cart, its metadata manifest,
    ## and the published subtype supplementary table
    BASE_DIR = os.path.abspath('/Users/username/Downloads/TCGAOV_data/')
    downloads_dir = os.path.join(BASE_DIR, 'gdc_download_20160803')
    metadata_file = os.path.join(
        BASE_DIR, 'metadata.cart.2016-08-03T16-04-06.289487.json')
    subtypes_file = os.path.join(BASE_DIR, 'SupplementaryDataTable4.txt')

    ## Output locations (all under BASE_DIR/output)
    output_dir = os.path.join(BASE_DIR, 'output')
    output_file_data = os.path.join(output_dir, 'TCGAOv_data.txt')
    output_file_subtype = os.path.join(output_dir, 'TCGAOv_subtypes.txt')
    output_file_filtered_data = os.path.join(
        output_dir, 'TCGAOv_data_filtered.txt')

    ## Munge: combine + subtype-filter, then keep only HGNC-mappable rows
    format_gdc(metadata_file, downloads_dir, subtypes_file, output_dir,
               output_file_data, output_file_subtype)
    filter_gdc(output_file_data, output_file_filtered_data)