Last active
September 1, 2016 20:51
-
-
Save jvwong/48f9195db3d73009e8b93ed1de94a52d to your computer and use it in GitHub Desktop.
Obtain subtypes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import fnmatch | |
import json | |
import pandas as pd | |
def writeout(output_path, df):
    """
    Write dataframe (df) to output path (output_path) as tab-separated text.

    Both the index and the header row are written. The parent directory of
    output_path is created first when it does not exist yet.

    Args:
        output_path: Destination file path; may be a bare filename.
        df: Object providing a pandas-style ``to_csv`` method.
    """
    directory = os.path.dirname(output_path)
    # dirname() yields '' for a bare filename and os.makedirs('') raises,
    # so only create a directory when the path actually names one.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    df.to_csv(output_path, sep="\t", index=True, header=True)
### Assign a case UUID to each subtype
BASE_DIR = os.path.abspath('/Users/jeffreywong/Sync/bader_jvwong/Guide/datasets/get-data/data/GDC_TCGAOv_Counts')

# Locate the cart metadata file inside BASE_DIR. Sorting makes the choice
# deterministic when several files match (the last one in sorted order wins)
# instead of depending on the arbitrary order returned by os.listdir().
metadata_matches = sorted(fnmatch.filter(os.listdir(BASE_DIR), 'metadata.cart*.json'))
if not metadata_matches:
    # Fail with a clear message rather than a NameError further down.
    raise IOError("No 'metadata.cart*.json' file found in " + BASE_DIR)
metadata_file = os.path.join(BASE_DIR, metadata_matches[-1])

subtypes_file = os.path.join(BASE_DIR, 'Verhaak_JCI_2013_tableS1.txt')
output_dir = os.path.join(BASE_DIR, 'output')
output_file_subtype = os.path.join(output_dir, 'TCGAOv_subtypes.txt')

# Subtype table; its first column becomes the index and is matched against
# the barcodes derived from the metadata below.
df_subtypes = pd.read_table(subtypes_file, index_col=0, header=0)

# Only samples annotated with one of these SUBTYPE values are kept.
subtypes = ['Mesenchymal', 'Immunoreactive']

with open(metadata_file, 'r') as f:
    metadatas = json.load(f)

# barcode -> case_id; a repeated barcode keeps the case_id seen last.
case_id_by_barcode = {}
for metadata in metadatas:
    entities = metadata.get("associated_entities")
    if not entities:
        continue
    ## debug notes - is this mapping right???
    # Only the first associated entity of each metadata record is used.
    first_entity = entities[0]
    case_id = first_entity["case_id"]
    entity_submitter_id = first_entity["entity_submitter_id"]
    ## Note 1
    # Keep the first three dash-separated fields of the submitter id
    # (presumably the participant-level barcode -- confirm against the
    # identifiers used in the subtype table).
    barcode = '-'.join(entity_submitter_id.split('-')[0:3])
    # Note 2
    # Skip barcodes whose known subtype is not of interest. Barcodes absent
    # from the subtype table are kept here but dropped by the inner merge.
    if barcode in df_subtypes.index and df_subtypes.loc[barcode, 'SUBTYPE'] not in subtypes:
        continue
    case_id_by_barcode[barcode] = case_id

# Build the frame in one step instead of enlarging it row by row.
df_gdc = pd.DataFrame({'case_id': list(case_id_by_barcode.values())},
                      index=list(case_id_by_barcode.keys()))

## Note 3
# Inner join on the index: keep only barcodes present in both tables.
df_assigned = pd.merge(df_subtypes, df_gdc,
                       how='inner',
                       left_index=True,
                       right_index=True)
df_assigned.index.name = 'barcode'
## Note 4
df_assigned.sort_values('case_id', inplace=True)
writeout(output_file_subtype, df_assigned)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment