Skip to content

Instantly share code, notes, and snippets.

@jvwong
Last active September 1, 2016 20:51
Show Gist options
  • Save jvwong/48f9195db3d73009e8b93ed1de94a52d to your computer and use it in GitHub Desktop.
Save jvwong/48f9195db3d73009e8b93ed1de94a52d to your computer and use it in GitHub Desktop.
Obtain subtypes
import os
import fnmatch
import json
import pandas as pd
def writeout(output_path, df):
    """
    Write dataframe (df) to output path (output_path)

    The file is written tab-separated, including both the index and the
    header row. Any missing parent directories of output_path are created
    before writing.

    Args:
        output_path: Destination file path. May be a bare filename, in which
            case the file is written to the current working directory.
        df: Object exposing a pandas-style ``to_csv`` method.
    """
    directory = os.path.dirname(output_path)
    # dirname() is '' for a bare filename; os.makedirs('') would raise, and
    # the current directory already exists, so only create real directories.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    df.to_csv(output_path, sep="\t", index=True, header=True)
### Assign a case UUID to each subtype
BASE_DIR = os.path.abspath('/Users/jeffreywong/Sync/bader_jvwong/Guide/datasets/get-data/data/GDC_TCGAOv_Counts')

# Locate the metadata JSON inside BASE_DIR. Fail with a clear message when it
# is missing instead of hitting a NameError on `metadata_file` further down.
metadata_matches = fnmatch.filter(os.listdir(BASE_DIR), 'metadata.cart*.json')
if not metadata_matches:
    raise IOError("No 'metadata.cart*.json' file found in " + BASE_DIR)
# If several files match, the last one in directory-listing order is used.
metadata_file = os.path.join(BASE_DIR, metadata_matches[-1])

subtypes_file = os.path.join(BASE_DIR, 'Verhaak_JCI_2013_tableS1.txt')
output_dir = os.path.join(BASE_DIR, 'output')
output_file_subtype = os.path.join(output_dir, 'TCGAOv_subtypes.txt')

# Subtype table: first column becomes the index, first row is the header.
df_subtypes = pd.read_table(subtypes_file, index_col=0, header=0)
# Only rows whose SUBTYPE is one of these are kept in the output.
subtypes = ['Mesenchymal', 'Immunoreactive']

with open(metadata_file, 'r') as f:
    metadatas = json.load(f)

# barcode -> case_id. Collected in a dict and turned into a DataFrame once,
# rather than enlarging a DataFrame one row at a time. A barcode seen more
# than once keeps the case_id of its last occurrence.
case_id_by_barcode = {}
for metadata in metadatas:
    # Skip records with a missing or empty "associated_entities" list.
    if "associated_entities" not in metadata or not metadata["associated_entities"]:
        continue

    ## debug notes - is this mapping right???
    # NOTE(review): only the first associated entity is consulted - confirm
    # that is sufficient for every record.
    first_entity = metadata["associated_entities"][0]
    case_id = first_entity["case_id"]
    entity_submitter_id = first_entity["entity_submitter_id"]

    ## Note 1
    # The barcode is the first three dash-separated fields of the submitter
    # id; it is matched against the index of the subtype table below.
    barcode = '-'.join(entity_submitter_id.split('-')[0:3])

    # Note 2
    # Drop barcodes that are in the subtype table but whose subtype is not of
    # interest. Barcodes absent from the table are kept here and are removed
    # later by the inner merge.
    if barcode in df_subtypes.index and df_subtypes.loc[barcode, 'SUBTYPE'] not in subtypes:
        continue

    case_id_by_barcode[barcode] = case_id

df_gdc = pd.DataFrame({'case_id': list(case_id_by_barcode.values())},
                      index=list(case_id_by_barcode.keys()),
                      columns=['case_id'])

## Note 3
# Inner join on the barcode index: keeps only barcodes present in both the
# subtype table and the metadata-derived mapping.
df_assigned = pd.merge(df_subtypes, df_gdc,
                       how='inner',
                       left_index=True,
                       right_index=True)
df_assigned.index.name = 'barcode'

## Note 4
df_assigned.sort_values('case_id', inplace=True)
writeout(output_file_subtype, df_assigned)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment