Skip to content

Instantly share code, notes, and snippets.

@adiamb
Last active October 5, 2017 19:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adiamb/4e2efbbb2fed405398c4287ca5278906 to your computer and use it in GitHub Desktop.
Save adiamb/4e2efbbb2fed405398c4287ca5278906 to your computer and use it in GitHub Desktop.
This will count num of CDR3s from Hi-SEQ TCR CDR3 Beta calls
import re
import subprocess
from subprocess import PIPE
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
import scipy
############################ read in the DBID and TCR identifiers from ling's csv file
BETAS_ID ={}
#ALPHAS_ID ={}
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_DBID_HSIDS_SEP6.csv') as in_file:
line_number =0
for line in in_file:
line_number += 1
if line_number > 1:
line_parse = line.strip('\r\n').split(',')
DBID = str(line_parse[0])
cell_type = str(line_parse[12])
DX = str(line_parse[2])
sampleID = str(line_parse[0]) + ','+str(line_parse[2])+','+str(line_parse[12]) ## sampleID
if line_parse[11] and line_parse[22]:
make_BETA_BC = str(line_parse[11])+','+str(line_parse[22])
beta_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_BETA_BC)
#print beta_ID, sampleID+','+beta_ID
if beta_ID not in BETAS_ID:
BETAS_ID[beta_ID] = sampleID+','+beta_ID
else:
print 'fuck', beta_ID, line_number
#if line_parse[10] and line_parse[21]:
#make_ALPHA_BC = str(line_parse[10])+','+str(line_parse[21])
#alpha_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_ALPHA_BC)
#print make_ALPHA_BC, make_BETA_BC
## clone count > 2
############# this will read in the CDR3 and make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items
BETA_CDR3={}
filelist=glob.glob('*.rcl')
processed_rcl = 0
for rcl in filelist:
processed_rcl += 1
if rcl:
with open(rcl) as in_file_rcl:
processed_buf = 0
processed_line = 0
for line in in_file_rcl:
processed_line += 1
if line:
if processed_line == 100000:
processed_buf += 100000
processed_line = 0
print ' PROCESSED ', processed_buf, ' lines from ', rcl
line_parse = line.strip('\r\n').split(',')
if line_parse[1] == 'b-b' and int(line_parse[9]) >= 2:# clone count > 2
assert line_parse[1] == 'b-b'
get_BC_sub = str(line_parse[0])+','+str(line_parse[2])
makefileBC = re.sub(r'(_v5B|_v5|_Fonly|_SEP1|_V5)', '', get_BC_sub)
if line_parse[3] and line_parse[5] and line_parse[8]:
makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8]
if makeCDR3 in BETA_CDR3:
get_exist = BETA_CDR3.get(makeCDR3)
if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same run
BETA_CDR3[makeCDR3] = get_exist+'%'+makefileBC
else:
BETA_CDR3[makeCDR3] = makefileBC
## take only clones that are shared in atleast 10 runs
BETA_CDR3={key: value for key, value in BETA_CDR3.items() if len(value.split('%')) >= 5}
### confirm if the dictionary comphrension worked or not
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt', 'w') as out:
for key, value in BETA_CDR3.iteritems():
value_parse = value.split('%')
out.write(key+'$')
for item in value_parse:
if item:
sampleID=BETAS_ID.get(item)
if sampleID:
out.write(str(sampleID)+'$')
out.write('\n')
#print key, value
## yes n = 241026
### read in the DQ list
DQ_DBID =[]
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f:
for line in dq_f:
if line.strip('\r\n'):
DQ_DBID.append(line.strip('\r\n'))
cells = ['CD4+', 'CD4+CD45RA-']
P_DBID=[] ## total length of patients
C_DBID =[] ## total length of controls
calls={}
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt') as cdrfile:
processed_calls = 0
processed_buf = 0
for line in cdrfile:
processed_calls += 1
if line:
if processed_calls == 1000:
processed_buf += 1000
processed_calls = 0
print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
line_parse = line.strip('\r\n').split('$')
if len(line_parse) >= 6:
P_ = 0
C_ = 0
list_c = 0
DBID = []
for item in line_parse:
list_c += 1
if list_c > 1:
if item: ## this will update per clone counts for patients and controls
dbid=str(item.split(',')[0])
dx = str(item.split(',')[1])
cell = str(item.split(',')[2])
if dbid in DQ_DBID:
if cell in cells:
if dbid not in DBID:
DBID.append(dbid)
if dx == "P":
P_ += 1
else:
C_ += 1
else:
pass
if dx == "P" and dbid not in P_DBID:
P_DBID.append(dbid)
elif dx == "C" and dbid not in C_DBID:
C_DBID.append(dbid)
else:
pass
calls[line_parse[0]] = str(P_)+','+str(C_) ## this is dictionary update keyed with CDR3 itself
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
### calculate chisquare## defin chisq
def stat_rt(i, j):
cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float)
chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab)
odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1])
SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64)))
L_CI = np.exp(np.log(odds_ratio) - 1.96*SE)
U_CI = np.exp(np.log(odds_ratio) + 1.96*SE)
return [p, chi2, odds_ratio, SE, L_CI, U_CI]
#float_formatter = lambda x: "%.2f" % x
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/STAT_BETA_CDR3_OCT5.csv', 'w') as out_f:
out_f.write('TRBV,TRBJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n')
processed_buf = 0
processed_line = 0
for key, value in calls.iteritems():
value_parse = value.split(',')
if int(value_parse[0]) + int(value_parse[1]) >= 5:
processed_line += 1
if processed_line == 1000:
processed_buf += 1000
processed_line = 0
print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1]))
to_out_2 = ','.join(map(str,to_out_1))
out_f.write(key+','+value+','+to_out_2+'\n')
calls_F=pd.read_csv('STAT_BETA_CDR3_OCT5.csv')
calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv')
calls_F.sort_values(['PVAL'], inplace=True)
calls_F.reset_index(drop=True, inplace=True)
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])
calls_F.ix[:20].to_csv('BETA_RESULTS/TOP_PVALUES_CDR3_BETA_OCT5.csv', index=False)
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/NARCOLEPSY_CDR3_BETA_OCT5.csv', index=False)
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/CONTROLS_CDR3_BETA_OCT5.csv', index=False)
calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'],
ascending=False)[1:20].to_csv('BETA_RESULTS/ENRICHED_CDR3_BETA_OCT5.csv', index=False)
## conisder only the CD4+ total and CD4 memory
######## debugging
processed_buf = 0
processed_line = 0
calls_F.loc[:, 'PVAL'] = 1
calls_F.loc[:, 'CHI_TSTAT'] ='0'
for i in xrange(len(calls_F)):
if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0:
processed_line += 1
if processed_line == 1000:
processed_buf += 1000
processed_line = 0
print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
calls_F.loc[i, 'PVAL'] = chi_out[0]
calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
else:
pass
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
calls_DF = pd.Series(calls, name='P_C')
calls_DF_=calls_DF.reset_index()
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)
'TRAV16,TRAJ42*01,CALSGGSQGNLIF'
#### debugging scripts #############
testlist=['TRBV24-1*01,TRBJ2-1*01,CATSDFGGYNEQFF',
'15980,C,CD8+,HS76,8',
'14450,C,CD8+,HS43,4',
'13121,P,CD4+,HS29,6',
'13284,P,CD8+,HS51,7',
'14272,C,CD4+CD45RA-,HS32,7',
'14272,C,CD4+CD45RA-,HS78,17',
'14272,C,CD8+,HS40,8',
'21-001-01-13,C,CD8+,HS104,15',]
P_ = 0
C_ = 0
list_c =0
DBID = []
for item in testlist:
list_c += 1
if list_c > 1:
if item:
dbid=str(item.split(',')[0])
dx=item.split(',')[1]
if dbid not in DBID:
print 'yey', dx, dbid
DBID.append(dbid)
if dx == "P":
P_ += 1
else:
C_ += 1
else:
print 'already present counts not performed', dx, dbid
print P_, C_
house = [ [43, 48-43 ], [71, 89-71] ]
chi2, p, ddof, expected = scipy.stats.chi2_contingency( house )
msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n"
print( msg.format( chi2, p, ddof ) )
print( expected )
calls_DF = pd.Series(calls, name='P_C')
calls_DF_=calls_DF.reset_index()
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
### calculate chisquare## defin chisq
def chisq_rt(i, j):
cont_tab = [[i, len(P_DBID)-i], [j, len(C_DBID)-j]]
chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
return [p, chi2]
processed_buf = 0
processed_line = 0
calls_F.loc[:, 'PVAL'] = 1
calls_F.loc[:, 'CHI_TSTAT'] ='0'
for i in xrange(len(calls_F)):
if calls_F.P_Counts[i] > 0 and calls_F.C_Counts[i] > 0:
processed_line += 1
if processed_line == 1000:
processed_buf += 1000
processed_line = 0
print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
calls_F.loc[i, 'PVAL'] = chi_out[0]
calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
else:
pass
### write out the file
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment