jvwong/get_counts.py

## get_counts.py
import os
import fnmatch
import json
import pandas as pd

def writeout(output_path, df):
    """
        Write dataframe (df) to output path (output_path)
    """
    directory = os.path.dirname(output_path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    df.to_csv(output_path, sep="\t", index=True, header=True)

### Extract and combined the RNA sequencing counts for each case
## Note 1
BASE_DIR = os.path.abspath('/Users/jeffreywong/Sync/bader_jvwong/Guide/datasets/get-data/data/GDC_TCGAOv_Counts/')
for file in os.listdir(BASE_DIR):
    if fnmatch.fnmatch(file, 'metadata.cart*.json'):
        metadata_file = os.path.join(BASE_DIR, file)
    if fnmatch.fnmatch(file, 'gdc_manifest_*.txt'):
        manifest_file = os.path.join(BASE_DIR, file)

downloads_dir = os.path.join(BASE_DIR, 'gdc_downloads')
output_dir = os.path.join(BASE_DIR, 'output')
output_file_data = os.path.join(output_dir, 'TCGAOv_counts.txt')
subtypes_file = os.path.join(output_dir, 'TCGAOv_subtypes.txt')

count_file_extension = '.htseq.counts.gz'
df_subtypes = pd.DataFrame()
df_combined = pd.DataFrame()

## Note 2
with open(subtypes_file, 'r') as sf:
    df_subtypes = pd.read_table(sf, header=0, index_col=0)
with open(manifest_file, 'r') as mf:
    df_manifest = pd.read_table(mf, header=0, index_col=1)
with open(metadata_file, 'r') as f:
    metadatas = json.load(f)

for idx, metadata in enumerate(metadatas):
    ## Note 3
    if "analysis" not in metadata or not metadata["analysis"]["submitter_id"]:
        continue
    submitter_id = metadata["analysis"]["submitter_id"]
    count_file_id = submitter_id.split('_')[0]

    ## Note 4
    if "associated_entities" not in metadata or not metadata["associated_entities"]:
        continue
    case_id = metadata["associated_entities"][0]["case_id"]

    ## Note 5
    if case_id in df_combined.columns:
        continue
    if case_id not in df_subtypes['case_id'].values:
        continue

    ## Note 6
    count_file_name = count_file_id + count_file_extension
    count_file_directory = df_manifest.ix[count_file_name]['id']
    countfile_path = os.path.join(downloads_dir,
                                  count_file_directory,
                                  count_file_name)

    if not os.path.isfile(countfile_path):
        continue
    df_case = pd.read_table(countfile_path,
                            compression='gzip',
                            header = None)
    df_case.set_index(0, inplace=True)
    df_case.columns=[case_id]

    if idx == 0:
        df_combined = df_case.copy(deep=True)
        df_combined.index.name = 'gene_id'
        df_combined.columns.name = 'case_id'
        continue
    ## Note 7
    df_combined = pd.merge(df_case,
                           df_combined,
                           how='outer',
                           left_index=True,
                           right_index=True)
## Note 8
df_combined.index = df_combined.index.map(lambda x: x.split('.')[0])
df_combined = df_combined[df_combined.index.map(lambda x: 'ENSG' in x)]
df_combined = df_combined.reindex_axis(sorted(df_combined.columns), axis=1)

writeout(output_file_data, df_combined)
	import os
	import fnmatch
	import json
	import pandas as pd

	def writeout(output_path, df):
	"""
	Write dataframe (df) to output path (output_path)
	"""
	directory = os.path.dirname(output_path)
	if not os.path.exists(directory):
	os.makedirs(directory)
	df.to_csv(output_path, sep="\t", index=True, header=True)

	### Extract and combined the RNA sequencing counts for each case
	## Note 1
	BASE_DIR = os.path.abspath('/Users/jeffreywong/Sync/bader_jvwong/Guide/datasets/get-data/data/GDC_TCGAOv_Counts/')
	for file in os.listdir(BASE_DIR):
	if fnmatch.fnmatch(file, 'metadata.cart*.json'):
	metadata_file = os.path.join(BASE_DIR, file)
	if fnmatch.fnmatch(file, 'gdc_manifest_*.txt'):
	manifest_file = os.path.join(BASE_DIR, file)

	downloads_dir = os.path.join(BASE_DIR, 'gdc_downloads')
	output_dir = os.path.join(BASE_DIR, 'output')
	output_file_data = os.path.join(output_dir, 'TCGAOv_counts.txt')
	subtypes_file = os.path.join(output_dir, 'TCGAOv_subtypes.txt')

	count_file_extension = '.htseq.counts.gz'
	df_subtypes = pd.DataFrame()
	df_combined = pd.DataFrame()

	## Note 2
	with open(subtypes_file, 'r') as sf:
	df_subtypes = pd.read_table(sf, header=0, index_col=0)
	with open(manifest_file, 'r') as mf:
	df_manifest = pd.read_table(mf, header=0, index_col=1)
	with open(metadata_file, 'r') as f:
	metadatas = json.load(f)

	for idx, metadata in enumerate(metadatas):
	## Note 3
	if "analysis" not in metadata or not metadata["analysis"]["submitter_id"]:
	continue
	submitter_id = metadata["analysis"]["submitter_id"]
	count_file_id = submitter_id.split('_')[0]

	## Note 4
	if "associated_entities" not in metadata or not metadata["associated_entities"]:
	continue
	case_id = metadata["associated_entities"][0]["case_id"]

	## Note 5
	if case_id in df_combined.columns:
	continue
	if case_id not in df_subtypes['case_id'].values:
	continue

	## Note 6
	count_file_name = count_file_id + count_file_extension
	count_file_directory = df_manifest.ix[count_file_name]['id']
	countfile_path = os.path.join(downloads_dir,
	count_file_directory,
	count_file_name)

	if not os.path.isfile(countfile_path):
	continue
	df_case = pd.read_table(countfile_path,
	compression='gzip',
	header = None)
	df_case.set_index(0, inplace=True)
	df_case.columns=[case_id]

	if idx == 0:
	df_combined = df_case.copy(deep=True)
	df_combined.index.name = 'gene_id'
	df_combined.columns.name = 'case_id'
	continue
	## Note 7
	df_combined = pd.merge(df_case,
	df_combined,
	how='outer',
	left_index=True,
	right_index=True)
	## Note 8
	df_combined.index = df_combined.index.map(lambda x: x.split('.')[0])
	df_combined = df_combined[df_combined.index.map(lambda x: 'ENSG' in x)]
	df_combined = df_combined.reindex_axis(sorted(df_combined.columns), axis=1)

	writeout(output_file_data, df_combined)