Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active April 20, 2023 14:14
Show Gist options
  • Save dinovski/d6b2f446bc88f2973e1ce6916a0f3a24 to your computer and use it in GitHub Desktop.
Save dinovski/d6b2f446bc88f2973e1ce6916a0f3a24 to your computer and use it in GitHub Desktop.
Calculate median transcript-level TPM by GTEx tissue
#!/usr/bin/env python
import pandas as pd
import numpy as np
import csv
import statistics
# mapping of sample ID to tissue:
#curl https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt -o GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
# TPM per sample per transcript:
#curl https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz -o GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz
#gunzip GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz
att_file = './GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'
tpm_file = './GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct'
out_file = './GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_median_tpm.gct'
# get sample IDs by tissue
att_tab = pd.read_csv(att_file, sep='\t')
att_tab = att_tab[['SAMPID', 'SMTS']]
all_tissues = att_tab.SMTS.unique()
#all_tissues = all_tissues[0:3] #testing
#print(all_tissues)
tpm_lines = sum(1 for line in open(tpm_file))
#print(tpm_lines)
# make empty data frame
col_names = np.insert(all_tissues, 0, 'transcript_id')
#df_tpm = pd.DataFrame(columns=col_names, index=range(1,tpm_lines))
df_tpm = pd.DataFrame(columns=col_names)
#print(df_tpm.head())
# write header to csv file
df_tpm.to_csv(out_file, encoding='utf-8', sep='\t', index=False, header=True, mode='w')
# store column names (ie. sample IDs)
tpm_tab_header = pd.read_csv(tpm_file, sep='\t', skiprows=2, nrows=0)
header_list = list(tpm_tab_header)
# for each line (transcript) map SAMPID to tissue and calculate median TPM
for line in range(1, tpm_lines):
# import single transcript
num_to_skip = line + 2
line += 1
tpm_tab = pd.read_csv(tpm_file, sep='\t', skiprows=num_to_skip, nrows=1, names=header_list)
#tpm_file = open(tpm_file, 'r')
#tpm_tab = tpm_file.readlines()
#print(tpm_tab.head())
transcript_id = tpm_tab.iloc[0]['transcript_id']
row_to_append = pd.DataFrame([{'transcript_id': transcript_id}])
print(transcript_id)
for i in range(len(all_tissues)):
tx_sel = all_tissues[i]
tx_samps = att_tab.loc[att_tab['SMTS'] == tx_sel,'SAMPID']
tx_tpm = tpm_tab[tpm_tab.columns[tpm_tab.columns.isin(tx_samps)]]
tx_tpm_med = float(tx_tpm.median(axis=1))
new_tx = pd.DataFrame([{tx_sel: tx_tpm_med}])
new_tx = new_tx.round(4)
row_to_append = pd.concat([row_to_append, new_tx], axis=1)
row_to_append.to_csv(out_file, encoding='utf-8', sep='\t', header=False, index=False, mode='a')
#df_tpm = pd.concat([df_tpm, row_to_append])
#df_tpm.head(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment