adiamb/CDR3_COUNTS_ALPHAS_OCT2.py

## CDR3_COUNTS_ALPHAS_OCT2.py
import re
import subprocess
from subprocess import PIPE
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
import scipy


############# this will read in the CDR3 and  make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items
ALPHA_CDR3={}
filelist=glob.glob('*RATIO_FT.rcl')
processed_rcl = 0
for rcl in filelist:
	processed_rcl += 1
	if rcl:
		with open(rcl) as in_file_rcl:
			processed_buf = 0
			processed_line = 0
			for line in in_file_rcl:
				processed_line += 1
				if line:
					if processed_line == 100000:
						processed_buf += 100000
						processed_line = 0
						print ' PROCESSED ', processed_buf, ' lines from ', rcl
					line_parse = line.strip('\r\n').split(',')
					if line_parse[1] == 'a-a' and int(line_parse[9]) >= 2:
						assert line_parse[1] == 'a-a'
						get_BC_sub = str(line_parse[0])+','+str(line_parse[2])
						makefileBC =  re.sub(r'(_v5B|_v5|_Fonly|_SEP1|_V5)', '', get_BC_sub)
						if line_parse[3] and line_parse[5] and line_parse[8]:
							makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8] ## change this if you want only CDR3 as keys
						if makeCDR3 in ALPHA_CDR3:
							get_exist = ALPHA_CDR3.get(makeCDR3)
							if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same runBC comb
								ALPHA_CDR3[makeCDR3] = get_exist+'%'+makefileBC
						else:
							ALPHA_CDR3[makeCDR3] = makefileBC
## take only clones that are shared in atleast 5 runs
ALPHA_CDR3={key: value for key, value in ALPHA_CDR3.items() if len(value.split('%')) >= 5}

ALPHAS_ID ={}
DBID_dic ={}
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/TRAV_CDR3_BC_IDS.txt') as in_file:
	line_number =0
	for line in in_file:
		line_number += 1
		if line_number > 1:
			line_parse = line.strip('\r\n').split(',')
			print line_parse
			if line_parse[0] and line_parse[1]:
				make_ALPHA_BC = str(line_parse[0])+','+str(line_parse[1])
				alpha_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_ALPHA_BC)
				print alpha_ID, line.strip('\n')
				if alpha_ID not in ALPHAS_ID:
					ALPHAS_ID[alpha_ID] = str(line.strip('\n'))
				else:
					print 'fuck'
				if line_parse[3] in DBID_dic:
					get_exist = DBID_dic.get(str(line_parse[3]))
					DBID_dic[str(line_parse[3])] = int(get_exist)+1
				else:
					DBID_dic[str(line_parse[3])] = 1


with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CDR3_ALPHAS_SHARING_OCT2.txt', 'w') as out_f:
	for key, value in ALPHA_CDR3.iteritems():
		out_f.write(key+'$')
		value_parse = value.split('%')
		for item in value_parse:
			if item:
				out_dic = ALPHAS_ID.get(item)
				if out_dic:
					out_f.write(out_dic+'$')
		out_f.write('\n')


### read in the DQ list
DQ_DBID =[]
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f:
	for line in dq_f:
		if line.strip('\r\n'):
			DQ_DBID.append(line.strip('\r\n'))


cells = ['CD4+','CD4+CD45RA-']
P_DBID=[] ## total length of patients
C_DBID =[] ## total length of controls
calls={}
with open('CDR3_ALPHAS_SHARING_OCT2.txt') as cdrfile:
	processed_calls = 0
	processed_buf = 0
	for line in cdrfile:
		processed_calls += 1
		if line:
			if processed_calls == 1000:
				processed_buf += 1000
				processed_calls = 0
				print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
			line_parse = line.strip('\r\n').split('$')
			if len(line_parse) > 6:
				P_ = 0
				C_ = 0
				list_c = 0
				DBID = []
				for item in line_parse:
					list_c += 1
					if list_c > 1:
						if item:
							dbid=str(item.split(',')[-2])
							dx=str(item.split(',')[-1])
							cell = str(item.split(',')[-3])
							if dbid in DQ_DBID:
								if cell in cells:
									if dbid not in DBID:
										DBID.append(dbid)
										if dx == 'P':
											P_ += 1
										else:
											C_ += 1
									else:
										pass

									if dx == "P" and dbid not in P_DBID:
										P_DBID.append(dbid)
									elif dx == "C" and dbid not in C_DBID:
										C_DBID.append(dbid)
									else:
										pass
				calls[line_parse[0]] = str(P_)+','+str(C_)


#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
### calculate chisquare## defin chisq
def stat_rt(i, j):
	cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float)
	chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
	#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab)
	odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1])
	SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64)))
	L_CI = np.exp(np.log(odds_ratio) - 1.96*SE)
	U_CI = np.exp(np.log(odds_ratio) + 1.96*SE)
	return [p, chi2, odds_ratio, SE, L_CI, U_CI]


###  stats will be calculated by this call and results written out
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/STAT_ALPHA_CDR3_OCT5.csv', 'w') as out_f:
	out_f.write('TRAV,TRAJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n')
	processed_buf = 0
	processed_line = 0
	for key, value in calls.iteritems():
		value_parse = value.split(',')
		if int(value_parse[0]) + int(value_parse[1]) >= 5:
			processed_line += 1
			if processed_line == 1000:
				processed_buf += 1000
				processed_line = 0
				print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
			to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1]))
			to_out_2 = ','.join(map(str,to_out_1))
			out_f.write(key+','+value+','+to_out_2+'\n')


### re read the results using pandas and see if stats look normal
calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv')
calls_F.sort_values(['PVAL'], inplace=True)
calls_F.reset_index(drop=True, inplace=True)
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])
calls_F.ix[:20].to_csv('ALPHA_RESULTS/TOP_PVALUES_CDR3_ALPHA_OCT5.csv', index=False)

calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/NARCOLEPSY_CDR3_ALPHA_OCT5.csv', index=False)
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/CONTROLS_CDR3_ALPHA_OCT5.csv', index=False)


calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'], ascending=False)[1:20].to_csv('ALPHA_RESULTS/ENRICHED_CDR3_ALPHA_OCT5.csv', index=False)


## conisder only the CD4+ total and CD4 memory
######## debugging
processed_buf = 0
processed_line = 0
calls_F.loc[:, 'PVAL'] = 1
calls_F.loc[:, 'CHI_TSTAT'] ='0'
for i in xrange(len(calls_F)):
	if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0:
		processed_line += 1
		if processed_line == 1000:
			processed_buf += 1000
			processed_line = 0
			print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
		chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
		calls_F.loc[i, 'PVAL'] = chi_out[0]
		calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
	else:
		pass

calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
calls_DF = pd.Series(calls, name='P_C')
calls_DF_=calls_DF.reset_index()
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)

'TRAV16,TRAJ42*01,CALSGGSQGNLIF'
	import re
	import subprocess
	from subprocess import PIPE
	import pandas as pd
	import numpy as np
	import scipy.stats as stats
	import glob
	import scipy


	############# this will read in the CDR3 and make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items
	ALPHA_CDR3={}
	filelist=glob.glob('*RATIO_FT.rcl')
	processed_rcl = 0
	for rcl in filelist:
	processed_rcl += 1
	if rcl:
	with open(rcl) as in_file_rcl:
	processed_buf = 0
	processed_line = 0
	for line in in_file_rcl:
	processed_line += 1
	if line:
	if processed_line == 100000:
	processed_buf += 100000
	processed_line = 0
	print ' PROCESSED ', processed_buf, ' lines from ', rcl
	line_parse = line.strip('\r\n').split(',')
	if line_parse[1] == 'a-a' and int(line_parse[9]) >= 2:
	assert line_parse[1] == 'a-a'
	get_BC_sub = str(line_parse[0])+','+str(line_parse[2])
	makefileBC = re.sub(r'(_v5B\|_v5\|_Fonly\|_SEP1\|_V5)', '', get_BC_sub)
	if line_parse[3] and line_parse[5] and line_parse[8]:
	makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8] ## change this if you want only CDR3 as keys
	if makeCDR3 in ALPHA_CDR3:
	get_exist = ALPHA_CDR3.get(makeCDR3)
	if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same runBC comb
	ALPHA_CDR3[makeCDR3] = get_exist+'%'+makefileBC
	else:
	ALPHA_CDR3[makeCDR3] = makefileBC
	## take only clones that are shared in atleast 5 runs
	ALPHA_CDR3={key: value for key, value in ALPHA_CDR3.items() if len(value.split('%')) >= 5}

	ALPHAS_ID ={}
	DBID_dic ={}
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/TRAV_CDR3_BC_IDS.txt') as in_file:
	line_number =0
	for line in in_file:
	line_number += 1
	if line_number > 1:
	line_parse = line.strip('\r\n').split(',')
	print line_parse
	if line_parse[0] and line_parse[1]:
	make_ALPHA_BC = str(line_parse[0])+','+str(line_parse[1])
	alpha_ID = re.sub(r'(TCR-\|_v5B\|_v5\|_Fonly\|_SEP1\|_V5)', '', make_ALPHA_BC)
	print alpha_ID, line.strip('\n')
	if alpha_ID not in ALPHAS_ID:
	ALPHAS_ID[alpha_ID] = str(line.strip('\n'))
	else:
	print 'fuck'
	if line_parse[3] in DBID_dic:
	get_exist = DBID_dic.get(str(line_parse[3]))
	DBID_dic[str(line_parse[3])] = int(get_exist)+1
	else:
	DBID_dic[str(line_parse[3])] = 1


	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CDR3_ALPHAS_SHARING_OCT2.txt', 'w') as out_f:
	for key, value in ALPHA_CDR3.iteritems():
	out_f.write(key+'$')
	value_parse = value.split('%')
	for item in value_parse:
	if item:
	out_dic = ALPHAS_ID.get(item)
	if out_dic:
	out_f.write(out_dic+'$')
	out_f.write('\n')


	### read in the DQ list
	DQ_DBID =[]
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f:
	for line in dq_f:
	if line.strip('\r\n'):
	DQ_DBID.append(line.strip('\r\n'))


	cells = ['CD4+','CD4+CD45RA-']
	P_DBID=[] ## total length of patients
	C_DBID =[] ## total length of controls
	calls={}
	with open('CDR3_ALPHAS_SHARING_OCT2.txt') as cdrfile:
	processed_calls = 0
	processed_buf = 0
	for line in cdrfile:
	processed_calls += 1
	if line:
	if processed_calls == 1000:
	processed_buf += 1000
	processed_calls = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
	line_parse = line.strip('\r\n').split('$')
	if len(line_parse) > 6:
	P_ = 0
	C_ = 0
	list_c = 0
	DBID = []
	for item in line_parse:
	list_c += 1
	if list_c > 1:
	if item:
	dbid=str(item.split(',')[-2])
	dx=str(item.split(',')[-1])
	cell = str(item.split(',')[-3])
	if dbid in DQ_DBID:
	if cell in cells:
	if dbid not in DBID:
	DBID.append(dbid)
	if dx == 'P':
	P_ += 1
	else:
	C_ += 1
	else:
	pass

	if dx == "P" and dbid not in P_DBID:
	P_DBID.append(dbid)
	elif dx == "C" and dbid not in C_DBID:
	C_DBID.append(dbid)
	else:
	pass
	calls[line_parse[0]] = str(P_)+','+str(C_)





	#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
	### calculate chisquare## defin chisq
	def stat_rt(i, j):
	cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float)
	chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
	#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab)
	odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1])
	SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64)))
	L_CI = np.exp(np.log(odds_ratio) - 1.96*SE)
	U_CI = np.exp(np.log(odds_ratio) + 1.96*SE)
	return [p, chi2, odds_ratio, SE, L_CI, U_CI]


	### stats will be calculated by this call and results written out
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/STAT_ALPHA_CDR3_OCT5.csv', 'w') as out_f:
	out_f.write('TRAV,TRAJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n')
	processed_buf = 0
	processed_line = 0
	for key, value in calls.iteritems():
	value_parse = value.split(',')
	if int(value_parse[0]) + int(value_parse[1]) >= 5:
	processed_line += 1
	if processed_line == 1000:
	processed_buf += 1000
	processed_line = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
	to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1]))
	to_out_2 = ','.join(map(str,to_out_1))
	out_f.write(key+','+value+','+to_out_2+'\n')


	### re read the results using pandas and see if stats look normal
	calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv')
	calls_F.sort_values(['PVAL'], inplace=True)
	calls_F.reset_index(drop=True, inplace=True)
	calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])
	calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])
	calls_F.ix[:20].to_csv('ALPHA_RESULTS/TOP_PVALUES_CDR3_ALPHA_OCT5.csv', index=False)

	calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/NARCOLEPSY_CDR3_ALPHA_OCT5.csv', index=False)
	calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('ALPHA_RESULTS/CONTROLS_CDR3_ALPHA_OCT5.csv', index=False)


	calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'], ascending=False)[1:20].to_csv('ALPHA_RESULTS/ENRICHED_CDR3_ALPHA_OCT5.csv', index=False)










	## conisder only the CD4+ total and CD4 memory
	######## debugging
	processed_buf = 0
	processed_line = 0
	calls_F.loc[:, 'PVAL'] = 1
	calls_F.loc[:, 'CHI_TSTAT'] ='0'
	for i in xrange(len(calls_F)):
	if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0:
	processed_line += 1
	if processed_line == 1000:
	processed_buf += 1000
	processed_line = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
	chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
	calls_F.loc[i, 'PVAL'] = chi_out[0]
	calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
	else:
	pass

	calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
	calls_DF = pd.Series(calls, name='P_C')
	calls_DF_=calls_DF.reset_index()
	calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
	#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
	calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
	calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)

	'TRAV16,TRAJ42*01,CALSGGSQGNLIF'