@adiamb
Last active October 5, 2017 18:02
Computes the sharing of alpha-chain CDR3 sequences across sequencing runs and tests for patient vs. control enrichment.
import re
import subprocess
from subprocess import PIPE
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
import scipy
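## Overview (inferred from the code below): run under Python 2 from a directory containing the
## *RATIO_FT.rcl pair files. The script (1) builds a dictionary of alpha CDR3s shared across runs,
## (2) annotates each sharing run with the sample metadata in TRAV_CDR3_BC_IDS.txt, (3) restricts
## to DQ0602+ subjects and CD4+ total / CD4+ memory cells, and (4) writes per-CDR3 patient/control
## counts together with chi-square statistics and odds ratios.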
############# this will read in the CDR3 files and build a dictionary: CDR3 ALPHAS will be keys and the run/barcode IDs sharing them will be items
ALPHA_CDR3 = {}
filelist = glob.glob('*RATIO_FT.rcl')
processed_rcl = 0
for rcl in filelist:
    processed_rcl += 1
    if rcl:
        with open(rcl) as in_file_rcl:
            processed_buf = 0
            processed_line = 0
            for line in in_file_rcl:
                processed_line += 1
                if line:
                    if processed_line == 100000:
                        processed_buf += 100000
                        processed_line = 0
                        print ' PROCESSED ', processed_buf, ' lines from ', rcl
                    line_parse = line.strip('\r\n').split(',')
                    if line_parse[1] == 'a-a' and int(line_parse[9]) >= 2:
                        assert line_parse[1] == 'a-a'
                        get_BC_sub = str(line_parse[0])+','+str(line_parse[2])
                        makefileBC = re.sub(r'(_v5B|_v5|_Fonly|_SEP1|_V5)', '', get_BC_sub)
                        if line_parse[3] and line_parse[5] and line_parse[8]:
                            makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8]  ## change this if you want only the CDR3 as keys
                            if makeCDR3 in ALPHA_CDR3:
                                get_exist = ALPHA_CDR3.get(makeCDR3)
                                if makefileBC not in get_exist.split('%'):  ## take only one instance of the CDR3 per run/barcode combination
                                    ALPHA_CDR3[makeCDR3] = get_exist+'%'+makefileBC
                            else:
                                ALPHA_CDR3[makeCDR3] = makefileBC
## keep only clones that are shared in at least 5 runs
ALPHA_CDR3 = {key: value for key, value in ALPHA_CDR3.items() if len(value.split('%')) >= 5}
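## For illustration: after the filter, a key has the form 'TRAV16,TRAJ42*01,CALSGGSQGNLIF'
## (TRAV, TRAJ, CDR3) and a value is a '%'-joined list of run,barcode strings such as
## 'RUN1,BC01%RUN2,BC07'; the run/barcode names shown here are hypothetical placeholders.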
ALPHAS_ID = {}
DBID_dic = {}
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/TRAV_CDR3_BC_IDS.txt') as in_file:
    line_number = 0
    for line in in_file:
        line_number += 1
        if line_number > 1:
            line_parse = line.strip('\r\n').split(',')
            print line_parse
            if line_parse[0] and line_parse[1]:
                make_ALPHA_BC = str(line_parse[0])+','+str(line_parse[1])
                alpha_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_ALPHA_BC)
                print alpha_ID, line.strip('\n')
                if alpha_ID not in ALPHAS_ID:
                    ALPHAS_ID[alpha_ID] = str(line.strip('\n'))
                else:
                    print 'WARNING: duplicate run/barcode ID', alpha_ID
                if line_parse[3] in DBID_dic:
                    get_exist = DBID_dic.get(str(line_parse[3]))
                    DBID_dic[str(line_parse[3])] = int(get_exist)+1
                else:
                    DBID_dic[str(line_parse[3])] = 1
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CDR3_ALPHAS_SHARING_OCT2.txt', 'w') as out_f:
    for key, value in ALPHA_CDR3.iteritems():
        out_f.write(key+'$')
        value_parse = value.split('%')
        for item in value_parse:
            if item:
                out_dic = ALPHAS_ID.get(item)
                if out_dic:
                    out_f.write(out_dic+'$')
        out_f.write('\n')
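## Each line of the sharing file is '$'-delimited: the TRAV,TRAJ,CDR3 key comes first, followed by
## the full TRAV_CDR3_BC_IDS.txt record of every run sharing that CDR3. The record is assumed to
## end with ...,cell subset,DBID,diagnosis, which is how it is parsed with negative indices below.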
### read in the DQ0602+ DBID list
DQ_DBID = []
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f:
    for line in dq_f:
        if line.strip('\r\n'):
            DQ_DBID.append(line.strip('\r\n'))
cells = ['CD4+', 'CD4+CD45RA-']
P_DBID = []  ## all patient DBIDs seen; its length is the patient denominator
C_DBID = []  ## all control DBIDs seen; its length is the control denominator
calls = {}
with open('CDR3_ALPHAS_SHARING_OCT2.txt') as cdrfile:
    processed_calls = 0
    processed_buf = 0
    for line in cdrfile:
        processed_calls += 1
        if line:
            if processed_calls == 1000:
                processed_buf += 1000
                processed_calls = 0
                print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
            line_parse = line.strip('\r\n').split('$')
            if len(line_parse) > 6:
                P_ = 0
                C_ = 0
                list_c = 0
                DBID = []
                for item in line_parse:
                    list_c += 1
                    if list_c > 1:
                        if item:
                            dbid = str(item.split(',')[-2])
                            dx = str(item.split(',')[-1])
                            cell = str(item.split(',')[-3])
                            if dbid in DQ_DBID:
                                if cell in cells:
                                    if dbid not in DBID:
                                        DBID.append(dbid)
                                        if dx == 'P':
                                            P_ += 1
                                        else:
                                            C_ += 1
                                else:
                                    pass
                                if dx == "P" and dbid not in P_DBID:
                                    P_DBID.append(dbid)
                                elif dx == "C" and dbid not in C_DBID:
                                    C_DBID.append(dbid)
                                else:
                                    pass
                calls[line_parse[0]] = str(P_)+','+str(C_)
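## For illustration, an entry now looks like calls['TRAV16,TRAJ42*01,CALSGGSQGNLIF'] = '10,2',
## i.e. the number of DQ0602+ patients and controls sharing that CDR3; the counts 10 and 2 are
## hypothetical, not taken from this dataset.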
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
### calculate the chi-square statistic ## define the chi-square / odds-ratio helper
def stat_rt(i, j):
    cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float)
    chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
    #oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab)
    odds_ratio = (cont_tab[0, 0]/cont_tab[1, 0])/(cont_tab[0, 1]/cont_tab[1, 1])
    SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64)))
    L_CI = np.exp(np.log(odds_ratio) - 1.96*SE)
    U_CI = np.exp(np.log(odds_ratio) + 1.96*SE)
    return [p, chi2, odds_ratio, SE, L_CI, U_CI]
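## Worked example (hypothetical numbers, only to show the arithmetic): with 50 patients and
## 40 controls overall, a CDR3 seen in i = 10 patients and j = 2 controls gives the 2x2 table
## [[10, 40], [2, 38]], an odds ratio of (10/2)/(40/38) = 4.75, and a Woolf 95% CI of
## exp(ln(OR) +/- 1.96*SE) with SE = sqrt(1/10 + 1/40 + 1/2 + 1/38).
#print stat_rt(10, 2)  ## uncomment to sanity-check against the real P_DBID/C_DBID sizes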
### stats will be calculated by this call and the results written out
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/STAT_ALPHA_CDR3_OCT5.csv', 'w') as out_f:
    out_f.write('TRAV,TRAJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n')
    processed_buf = 0
    processed_line = 0
    for key, value in calls.iteritems():
        value_parse = value.split(',')
        if int(value_parse[0]) + int(value_parse[1]) >= 5:
            processed_line += 1
            if processed_line == 1000:
                processed_buf += 1000
                processed_line = 0
                print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
            to_out_1 = stat_rt(i=float(value_parse[0]), j=float(value_parse[1]))
            to_out_2 = ','.join(map(str, to_out_1))
            out_f.write(key+','+value+','+to_out_2+'\n')
### re-read the results with pandas and check that the statistics look sensible
calls_F = pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv')
calls_F.sort_values(['PVAL'], inplace=True)
calls_F.reset_index(drop=True, inplace=True)
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])
calls_F.loc[:20].to_csv('ALPHA_RESULTS/TOP_PVALUES_CDR3_ALPHA_OCT5.csv', index=False)
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/NARCOLEPSY_CDR3_ALPHA_OCT5.csv', index=False)
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/CONTROLS_CDR3_ALPHA_OCT5.csv', index=False)
calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'], ascending=False)[1:20].to_csv('ALPHA_RESULTS/ENRICHED_CDR3_ALPHA_OCT5.csv', index=False)
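## The four ALPHA_RESULTS files above hold, respectively: the top rows by p-value overall,
## CDR3s seen in more than 5 patients but no controls, CDR3s seen in more than 5 controls but
## no patients, and CDR3s shared by more than 30 subjects in each group; the file names and
## cutoffs are exactly as written in the calls above.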
## consider only the CD4+ total and CD4+ memory subsets
######## debugging: rebuild the counts table straight from the calls dictionary and recompute
######## the statistics row by row (the DataFrame must be built before the loop that uses it)
calls_DF = pd.Series(calls, name='P_C')
calls_DF_ = calls_DF.reset_index()
calls_F = pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
#calls_DF_.loc[:, ('P', 'C')] = calls_DF_.P_C.str.split(',', expand=True)
calls_F.loc[:, 'P_Counts'] = calls_F[0].astype(int)
calls_F.loc[:, 'C_Counts'] = calls_F[1].astype(int)
processed_buf = 0
processed_line = 0
calls_F.loc[:, 'PVAL'] = 1
calls_F.loc[:, 'CHI_TSTAT'] = '0'
for i in xrange(len(calls_F)):
    if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0:
        processed_line += 1
        if processed_line == 1000:
            processed_buf += 1000
            processed_line = 0
            print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
        chi_out = stat_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])  ## stat_rt returns [p, chi2, OR, SE, L_CI, U_CI]
        calls_F.loc[i, 'PVAL'] = chi_out[0]
        calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
    else:
        pass
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
## example key: 'TRAV16,TRAJ42*01,CALSGGSQGNLIF'