Last active
October 5, 2017 19:02
-
-
Save adiamb/4e2efbbb2fed405398c4287ca5278906 to your computer and use it in GitHub Desktop.
This will count num of CDR3s from Hi-SEQ TCR CDR3 Beta calls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import subprocess | |
from subprocess import PIPE | |
import pandas as pd | |
import numpy as np | |
import scipy.stats as stats | |
import glob | |
import scipy | |
############################ read in the DBID and TCR identifiers from ling's csv file | |
BETAS_ID ={} | |
#ALPHAS_ID ={} | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_DBID_HSIDS_SEP6.csv') as in_file: | |
line_number =0 | |
for line in in_file: | |
line_number += 1 | |
if line_number > 1: | |
line_parse = line.strip('\r\n').split(',') | |
DBID = str(line_parse[0]) | |
cell_type = str(line_parse[12]) | |
DX = str(line_parse[2]) | |
sampleID = str(line_parse[0]) + ','+str(line_parse[2])+','+str(line_parse[12]) ## sampleID | |
if line_parse[11] and line_parse[22]: | |
make_BETA_BC = str(line_parse[11])+','+str(line_parse[22]) | |
beta_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_BETA_BC) | |
#print beta_ID, sampleID+','+beta_ID | |
if beta_ID not in BETAS_ID: | |
BETAS_ID[beta_ID] = sampleID+','+beta_ID | |
else: | |
print 'fuck', beta_ID, line_number | |
#if line_parse[10] and line_parse[21]: | |
#make_ALPHA_BC = str(line_parse[10])+','+str(line_parse[21]) | |
#alpha_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_ALPHA_BC) | |
#print make_ALPHA_BC, make_BETA_BC | |
## clone count > 2 | |
############# this will read in the CDR3 and make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items | |
BETA_CDR3={} | |
filelist=glob.glob('*.rcl') | |
processed_rcl = 0 | |
for rcl in filelist: | |
processed_rcl += 1 | |
if rcl: | |
with open(rcl) as in_file_rcl: | |
processed_buf = 0 | |
processed_line = 0 | |
for line in in_file_rcl: | |
processed_line += 1 | |
if line: | |
if processed_line == 100000: | |
processed_buf += 100000 | |
processed_line = 0 | |
print ' PROCESSED ', processed_buf, ' lines from ', rcl | |
line_parse = line.strip('\r\n').split(',') | |
if line_parse[1] == 'b-b' and int(line_parse[9]) >= 2:# clone count > 2 | |
assert line_parse[1] == 'b-b' | |
get_BC_sub = str(line_parse[0])+','+str(line_parse[2]) | |
makefileBC = re.sub(r'(_v5B|_v5|_Fonly|_SEP1|_V5)', '', get_BC_sub) | |
if line_parse[3] and line_parse[5] and line_parse[8]: | |
makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8] | |
if makeCDR3 in BETA_CDR3: | |
get_exist = BETA_CDR3.get(makeCDR3) | |
if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same run | |
BETA_CDR3[makeCDR3] = get_exist+'%'+makefileBC | |
else: | |
BETA_CDR3[makeCDR3] = makefileBC | |
## take only clones that are shared in atleast 10 runs | |
BETA_CDR3={key: value for key, value in BETA_CDR3.items() if len(value.split('%')) >= 5} | |
### confirm if the dictionary comphrension worked or not | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt', 'w') as out: | |
for key, value in BETA_CDR3.iteritems(): | |
value_parse = value.split('%') | |
out.write(key+'$') | |
for item in value_parse: | |
if item: | |
sampleID=BETAS_ID.get(item) | |
if sampleID: | |
out.write(str(sampleID)+'$') | |
out.write('\n') | |
#print key, value | |
## yes n = 241026 | |
### read in the DQ list | |
DQ_DBID =[] | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f: | |
for line in dq_f: | |
if line.strip('\r\n'): | |
DQ_DBID.append(line.strip('\r\n')) | |
cells = ['CD4+', 'CD4+CD45RA-'] | |
P_DBID=[] ## total length of patients | |
C_DBID =[] ## total length of controls | |
calls={} | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt') as cdrfile: | |
processed_calls = 0 | |
processed_buf = 0 | |
for line in cdrfile: | |
processed_calls += 1 | |
if line: | |
if processed_calls == 1000: | |
processed_buf += 1000 | |
processed_calls = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS' | |
line_parse = line.strip('\r\n').split('$') | |
if len(line_parse) >= 6: | |
P_ = 0 | |
C_ = 0 | |
list_c = 0 | |
DBID = [] | |
for item in line_parse: | |
list_c += 1 | |
if list_c > 1: | |
if item: ## this will update per clone counts for patients and controls | |
dbid=str(item.split(',')[0]) | |
dx = str(item.split(',')[1]) | |
cell = str(item.split(',')[2]) | |
if dbid in DQ_DBID: | |
if cell in cells: | |
if dbid not in DBID: | |
DBID.append(dbid) | |
if dx == "P": | |
P_ += 1 | |
else: | |
C_ += 1 | |
else: | |
pass | |
if dx == "P" and dbid not in P_DBID: | |
P_DBID.append(dbid) | |
elif dx == "C" and dbid not in C_DBID: | |
C_DBID.append(dbid) | |
else: | |
pass | |
calls[line_parse[0]] = str(P_)+','+str(C_) ## this is dictionary update keyed with CDR3 itself | |
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True]) | |
### calculate chisquare## defin chisq | |
def stat_rt(i, j): | |
cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float) | |
chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab) | |
#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab) | |
odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1]) | |
SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64))) | |
L_CI = np.exp(np.log(odds_ratio) - 1.96*SE) | |
U_CI = np.exp(np.log(odds_ratio) + 1.96*SE) | |
return [p, chi2, odds_ratio, SE, L_CI, U_CI] | |
#float_formatter = lambda x: "%.2f" % x | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/STAT_BETA_CDR3_OCT5.csv', 'w') as out_f: | |
out_f.write('TRBV,TRBJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n') | |
processed_buf = 0 | |
processed_line = 0 | |
for key, value in calls.iteritems(): | |
value_parse = value.split(',') | |
if int(value_parse[0]) + int(value_parse[1]) >= 5: | |
processed_line += 1 | |
if processed_line == 1000: | |
processed_buf += 1000 | |
processed_line = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS' | |
to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1])) | |
to_out_2 = ','.join(map(str,to_out_1)) | |
out_f.write(key+','+value+','+to_out_2+'\n') | |
calls_F=pd.read_csv('STAT_BETA_CDR3_OCT5.csv') | |
calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv') | |
calls_F.sort_values(['PVAL'], inplace=True) | |
calls_F.reset_index(drop=True, inplace=True) | |
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL']) | |
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL']) | |
calls_F.ix[:20].to_csv('BETA_RESULTS/TOP_PVALUES_CDR3_BETA_OCT5.csv', index=False) | |
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/NARCOLEPSY_CDR3_BETA_OCT5.csv', index=False) | |
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/CONTROLS_CDR3_BETA_OCT5.csv', index=False) | |
calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'], | |
ascending=False)[1:20].to_csv('BETA_RESULTS/ENRICHED_CDR3_BETA_OCT5.csv', index=False) | |
## conisder only the CD4+ total and CD4 memory | |
######## debugging | |
processed_buf = 0 | |
processed_line = 0 | |
calls_F.loc[:, 'PVAL'] = 1 | |
calls_F.loc[:, 'CHI_TSTAT'] ='0' | |
for i in xrange(len(calls_F)): | |
if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0: | |
processed_line += 1 | |
if processed_line == 1000: | |
processed_buf += 1000 | |
processed_line = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS' | |
chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i]) | |
calls_F.loc[i, 'PVAL'] = chi_out[0] | |
calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1] | |
else: | |
pass | |
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False) | |
calls_DF = pd.Series(calls, name='P_C') | |
calls_DF_=calls_DF.reset_index() | |
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1) | |
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True) | |
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int) | |
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int) | |
'TRAV16,TRAJ42*01,CALSGGSQGNLIF' | |
#### debugging scripts ############# | |
testlist=['TRBV24-1*01,TRBJ2-1*01,CATSDFGGYNEQFF', | |
'15980,C,CD8+,HS76,8', | |
'14450,C,CD8+,HS43,4', | |
'13121,P,CD4+,HS29,6', | |
'13284,P,CD8+,HS51,7', | |
'14272,C,CD4+CD45RA-,HS32,7', | |
'14272,C,CD4+CD45RA-,HS78,17', | |
'14272,C,CD8+,HS40,8', | |
'21-001-01-13,C,CD8+,HS104,15',] | |
P_ = 0 | |
C_ = 0 | |
list_c =0 | |
DBID = [] | |
for item in testlist: | |
list_c += 1 | |
if list_c > 1: | |
if item: | |
dbid=str(item.split(',')[0]) | |
dx=item.split(',')[1] | |
if dbid not in DBID: | |
print 'yey', dx, dbid | |
DBID.append(dbid) | |
if dx == "P": | |
P_ += 1 | |
else: | |
C_ += 1 | |
else: | |
print 'already present counts not performed', dx, dbid | |
print P_, C_ | |
house = [ [43, 48-43 ], [71, 89-71] ] | |
chi2, p, ddof, expected = scipy.stats.chi2_contingency( house ) | |
msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n" | |
print( msg.format( chi2, p, ddof ) ) | |
print( expected ) | |
calls_DF = pd.Series(calls, name='P_C') | |
calls_DF_=calls_DF.reset_index() | |
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1) | |
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True) | |
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int) | |
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int) | |
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True]) | |
### calculate chisquare## defin chisq | |
def chisq_rt(i, j): | |
cont_tab = [[i, len(P_DBID)-i], [j, len(C_DBID)-j]] | |
chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab) | |
return [p, chi2] | |
processed_buf = 0 | |
processed_line = 0 | |
calls_F.loc[:, 'PVAL'] = 1 | |
calls_F.loc[:, 'CHI_TSTAT'] ='0' | |
for i in xrange(len(calls_F)): | |
if calls_F.P_Counts[i] > 0 and calls_F.C_Counts[i] > 0: | |
processed_line += 1 | |
if processed_line == 1000: | |
processed_buf += 1000 | |
processed_line = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS' | |
chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i]) | |
calls_F.loc[i, 'PVAL'] = chi_out[0] | |
calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1] | |
else: | |
pass | |
### write out the file | |
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment