Last active
October 5, 2017 18:02
-
-
Save adiamb/f758621a81b675f5a9b02ee0a64b1e9e to your computer and use it in GitHub Desktop.
This gets the sharing for alpha CDR3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import subprocess | |
from subprocess import PIPE | |
import pandas as pd | |
import numpy as np | |
import scipy.stats as stats | |
import glob | |
import scipy | |
############# this will read in the CDR3 and make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items | |
ALPHA_CDR3={} | |
filelist=glob.glob('*RATIO_FT.rcl') | |
processed_rcl = 0 | |
for rcl in filelist: | |
processed_rcl += 1 | |
if rcl: | |
with open(rcl) as in_file_rcl: | |
processed_buf = 0 | |
processed_line = 0 | |
for line in in_file_rcl: | |
processed_line += 1 | |
if line: | |
if processed_line == 100000: | |
processed_buf += 100000 | |
processed_line = 0 | |
print ' PROCESSED ', processed_buf, ' lines from ', rcl | |
line_parse = line.strip('\r\n').split(',') | |
if line_parse[1] == 'a-a' and int(line_parse[9]) >= 2: | |
assert line_parse[1] == 'a-a' | |
get_BC_sub = str(line_parse[0])+','+str(line_parse[2]) | |
makefileBC = re.sub(r'(_v5B|_v5|_Fonly|_SEP1|_V5)', '', get_BC_sub) | |
if line_parse[3] and line_parse[5] and line_parse[8]: | |
makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8] ## change this if you want only CDR3 as keys | |
if makeCDR3 in ALPHA_CDR3: | |
get_exist = ALPHA_CDR3.get(makeCDR3) | |
if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same runBC comb | |
ALPHA_CDR3[makeCDR3] = get_exist+'%'+makefileBC | |
else: | |
ALPHA_CDR3[makeCDR3] = makefileBC | |
## take only clones that are shared in atleast 5 runs | |
ALPHA_CDR3={key: value for key, value in ALPHA_CDR3.items() if len(value.split('%')) >= 5} | |
ALPHAS_ID ={} | |
DBID_dic ={} | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/TRAV_CDR3_BC_IDS.txt') as in_file: | |
line_number =0 | |
for line in in_file: | |
line_number += 1 | |
if line_number > 1: | |
line_parse = line.strip('\r\n').split(',') | |
print line_parse | |
if line_parse[0] and line_parse[1]: | |
make_ALPHA_BC = str(line_parse[0])+','+str(line_parse[1]) | |
alpha_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_ALPHA_BC) | |
print alpha_ID, line.strip('\n') | |
if alpha_ID not in ALPHAS_ID: | |
ALPHAS_ID[alpha_ID] = str(line.strip('\n')) | |
else: | |
print 'fuck' | |
if line_parse[3] in DBID_dic: | |
get_exist = DBID_dic.get(str(line_parse[3])) | |
DBID_dic[str(line_parse[3])] = int(get_exist)+1 | |
else: | |
DBID_dic[str(line_parse[3])] = 1 | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CDR3_ALPHAS_SHARING_OCT2.txt', 'w') as out_f: | |
for key, value in ALPHA_CDR3.iteritems(): | |
out_f.write(key+'$') | |
value_parse = value.split('%') | |
for item in value_parse: | |
if item: | |
out_dic = ALPHAS_ID.get(item) | |
if out_dic: | |
out_f.write(out_dic+'$') | |
out_f.write('\n') | |
### read in the DQ list | |
DQ_DBID =[] | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f: | |
for line in dq_f: | |
if line.strip('\r\n'): | |
DQ_DBID.append(line.strip('\r\n')) | |
cells = ['CD4+','CD4+CD45RA-'] | |
P_DBID=[] ## total length of patients | |
C_DBID =[] ## total length of controls | |
calls={} | |
with open('CDR3_ALPHAS_SHARING_OCT2.txt') as cdrfile: | |
processed_calls = 0 | |
processed_buf = 0 | |
for line in cdrfile: | |
processed_calls += 1 | |
if line: | |
if processed_calls == 1000: | |
processed_buf += 1000 | |
processed_calls = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS' | |
line_parse = line.strip('\r\n').split('$') | |
if len(line_parse) > 6: | |
P_ = 0 | |
C_ = 0 | |
list_c = 0 | |
DBID = [] | |
for item in line_parse: | |
list_c += 1 | |
if list_c > 1: | |
if item: | |
dbid=str(item.split(',')[-2]) | |
dx=str(item.split(',')[-1]) | |
cell = str(item.split(',')[-3]) | |
if dbid in DQ_DBID: | |
if cell in cells: | |
if dbid not in DBID: | |
DBID.append(dbid) | |
if dx == 'P': | |
P_ += 1 | |
else: | |
C_ += 1 | |
else: | |
pass | |
if dx == "P" and dbid not in P_DBID: | |
P_DBID.append(dbid) | |
elif dx == "C" and dbid not in C_DBID: | |
C_DBID.append(dbid) | |
else: | |
pass | |
calls[line_parse[0]] = str(P_)+','+str(C_) | |
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True]) | |
### calculate chisquare## defin chisq | |
def stat_rt(i, j): | |
cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float) | |
chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab) | |
#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab) | |
odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1]) | |
SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64))) | |
L_CI = np.exp(np.log(odds_ratio) - 1.96*SE) | |
U_CI = np.exp(np.log(odds_ratio) + 1.96*SE) | |
return [p, chi2, odds_ratio, SE, L_CI, U_CI] | |
### stats will be calculated by this call and results written out | |
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/STAT_ALPHA_CDR3_OCT5.csv', 'w') as out_f: | |
out_f.write('TRAV,TRAJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n') | |
processed_buf = 0 | |
processed_line = 0 | |
for key, value in calls.iteritems(): | |
value_parse = value.split(',') | |
if int(value_parse[0]) + int(value_parse[1]) >= 5: | |
processed_line += 1 | |
if processed_line == 1000: | |
processed_buf += 1000 | |
processed_line = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS' | |
to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1])) | |
to_out_2 = ','.join(map(str,to_out_1)) | |
out_f.write(key+','+value+','+to_out_2+'\n') | |
### re read the results using pandas and see if stats look normal | |
calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv') | |
calls_F.sort_values(['PVAL'], inplace=True) | |
calls_F.reset_index(drop=True, inplace=True) | |
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL']) | |
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL']) | |
calls_F.ix[:20].to_csv('ALPHA_RESULTS/TOP_PVALUES_CDR3_ALPHA_OCT5.csv', index=False) | |
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/NARCOLEPSY_CDR3_ALPHA_OCT5.csv', index=False) | |
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/CONTROLS_CDR3_ALPHA_OCT5.csv', index=False) | |
calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'], ascending=False)[1:20].to_csv('ALPHA_RESULTS/ENRICHED_CDR3_ALPHA_OCT5.csv', index=False) | |
## conisder only the CD4+ total and CD4 memory | |
######## debugging | |
processed_buf = 0 | |
processed_line = 0 | |
calls_F.loc[:, 'PVAL'] = 1 | |
calls_F.loc[:, 'CHI_TSTAT'] ='0' | |
for i in xrange(len(calls_F)): | |
if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0: | |
processed_line += 1 | |
if processed_line == 1000: | |
processed_buf += 1000 | |
processed_line = 0 | |
print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS' | |
chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i]) | |
calls_F.loc[i, 'PVAL'] = chi_out[0] | |
calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1] | |
else: | |
pass | |
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False) | |
calls_DF = pd.Series(calls, name='P_C') | |
calls_DF_=calls_DF.reset_index() | |
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1) | |
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True) | |
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int) | |
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int) | |
'TRAV16,TRAJ42*01,CALSGGSQGNLIF' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment