adiamb/CDR3_COUNTS_BETA_SEP29.py

## CDR3_COUNTS_BETA_SEP29.py
import re
import subprocess
from subprocess import PIPE
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
import scipy
############################ read in the DBID and TCR identifiers from ling's csv file

BETAS_ID ={}
#ALPHAS_ID ={}
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_DBID_HSIDS_SEP6.csv') as in_file:
	line_number =0
	for line in in_file:
		line_number += 1
		if line_number > 1:
			line_parse = line.strip('\r\n').split(',')
			DBID = str(line_parse[0])
			cell_type = str(line_parse[12])
			DX = str(line_parse[2])
			sampleID = str(line_parse[0]) + ','+str(line_parse[2])+','+str(line_parse[12]) ## sampleID
			if line_parse[11] and line_parse[22]:
				make_BETA_BC = str(line_parse[11])+','+str(line_parse[22])
				beta_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_BETA_BC)
				#print beta_ID, sampleID+','+beta_ID
				if beta_ID not in BETAS_ID:
					BETAS_ID[beta_ID] = sampleID+','+beta_ID
				else:
					print 'fuck', beta_ID, line_number
			#if line_parse[10] and line_parse[21]:
				#make_ALPHA_BC = str(line_parse[10])+','+str(line_parse[21])
				#alpha_ID = re.sub(r'(TCR-|_v5B|_v5|_Fonly|_SEP1|_V5)', '', make_ALPHA_BC)
				#print make_ALPHA_BC, make_BETA_BC


## clone count > 2
############# this will read in the CDR3 and  make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items
BETA_CDR3={}
filelist=glob.glob('*.rcl')
processed_rcl = 0
for rcl in filelist:
	processed_rcl += 1
	if rcl:
		with open(rcl) as in_file_rcl:
			processed_buf = 0
			processed_line = 0
			for line in in_file_rcl:
				processed_line += 1
				if line:
					if processed_line == 100000:
						processed_buf += 100000
						processed_line = 0
						print ' PROCESSED ', processed_buf, ' lines from ', rcl
					line_parse = line.strip('\r\n').split(',')
					if line_parse[1] == 'b-b' and int(line_parse[9]) >= 2:# clone count > 2
						assert line_parse[1] == 'b-b'
						get_BC_sub = str(line_parse[0])+','+str(line_parse[2])
						makefileBC =  re.sub(r'(_v5B|_v5|_Fonly|_SEP1|_V5)', '', get_BC_sub)
						if line_parse[3] and line_parse[5] and line_parse[8]:
							makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8]
						if makeCDR3 in BETA_CDR3:
							get_exist = BETA_CDR3.get(makeCDR3)
							if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same run
								BETA_CDR3[makeCDR3] = get_exist+'%'+makefileBC
						else:
							BETA_CDR3[makeCDR3] = makefileBC
## take only clones that are shared in atleast 10 runs
BETA_CDR3={key: value for key, value in BETA_CDR3.items() if len(value.split('%')) >= 5}
### confirm if the dictionary comphrension worked or not
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt', 'w') as out:
	for key, value in BETA_CDR3.iteritems():
		value_parse = value.split('%')
		out.write(key+'$')
		for item in value_parse:
			if item:
				sampleID=BETAS_ID.get(item)
				if sampleID:
					out.write(str(sampleID)+'$')
		out.write('\n')
		#print key, value
## yes n =  241026

### read in the DQ list
DQ_DBID =[]
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f:
	for line in dq_f:
		if line.strip('\r\n'):
			DQ_DBID.append(line.strip('\r\n'))

cells = ['CD4+', 'CD4+CD45RA-']
P_DBID=[] ## total length of patients
C_DBID =[] ## total length of controls
calls={}
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt') as cdrfile:
	processed_calls = 0
	processed_buf = 0
	for line in cdrfile:
		processed_calls += 1
		if line:
			if processed_calls == 1000:
				processed_buf += 1000
				processed_calls = 0
				print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
			line_parse = line.strip('\r\n').split('$')
			if len(line_parse) >= 6:
				P_ = 0
				C_ = 0
				list_c = 0
				DBID = []
				for item in line_parse:
					list_c += 1
					if list_c > 1:
						if item: ## this will update per clone counts for patients and controls
							dbid=str(item.split(',')[0])
							dx = str(item.split(',')[1])
							cell = str(item.split(',')[2])
							if dbid in DQ_DBID:
								if cell in cells:
									if dbid not in DBID:
										DBID.append(dbid)
										if dx == "P":
											P_ += 1
										else:
											C_ += 1
									else:
										pass

									if dx == "P" and dbid not in P_DBID:
										P_DBID.append(dbid)
									elif dx == "C" and dbid not in C_DBID:
										C_DBID.append(dbid)
									else:
										pass
				calls[line_parse[0]] = str(P_)+','+str(C_) ## this is dictionary update keyed with CDR3 itself

#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
### calculate chisquare## defin chisq
def stat_rt(i, j):
	cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float)
	chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
	#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab)
	odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1])
	SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64)))
	L_CI = np.exp(np.log(odds_ratio) - 1.96*SE)
	U_CI = np.exp(np.log(odds_ratio) + 1.96*SE)
	return [p, chi2, odds_ratio, SE, L_CI, U_CI]


#float_formatter = lambda x: "%.2f" % x
with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/STAT_BETA_CDR3_OCT5.csv', 'w') as out_f:
	out_f.write('TRBV,TRBJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n')
	processed_buf = 0
	processed_line = 0
	for key, value in calls.iteritems():
		value_parse = value.split(',')
		if int(value_parse[0]) + int(value_parse[1]) >= 5:
			processed_line += 1
			if processed_line == 1000:
				processed_buf += 1000
				processed_line = 0
				print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
			to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1]))
			to_out_2 = ','.join(map(str,to_out_1))
			out_f.write(key+','+value+','+to_out_2+'\n')


calls_F=pd.read_csv('STAT_BETA_CDR3_OCT5.csv')
calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv')
calls_F.sort_values(['PVAL'], inplace=True)
calls_F.reset_index(drop=True, inplace=True)
calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])
calls_F.ix[:20].to_csv('BETA_RESULTS/TOP_PVALUES_CDR3_BETA_OCT5.csv', index=False)

calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/NARCOLEPSY_CDR3_BETA_OCT5.csv', index=False)
calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/CONTROLS_CDR3_BETA_OCT5.csv', index=False)

calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'],
	ascending=False)[1:20].to_csv('BETA_RESULTS/ENRICHED_CDR3_BETA_OCT5.csv', index=False)


## conisder only the CD4+ total and CD4 memory
######## debugging
processed_buf = 0
processed_line = 0
calls_F.loc[:, 'PVAL'] = 1
calls_F.loc[:, 'CHI_TSTAT'] ='0'
for i in xrange(len(calls_F)):
	if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0:
		processed_line += 1
		if processed_line == 1000:
			processed_buf += 1000
			processed_line = 0
			print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
		chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
		calls_F.loc[i, 'PVAL'] = chi_out[0]
		calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
	else:
		pass

calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
calls_DF = pd.Series(calls, name='P_C')
calls_DF_=calls_DF.reset_index()
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)

'TRAV16,TRAJ42*01,CALSGGSQGNLIF'


#### debugging scripts #############
testlist=['TRBV24-1*01,TRBJ2-1*01,CATSDFGGYNEQFF',
 '15980,C,CD8+,HS76,8',
 '14450,C,CD8+,HS43,4',
 '13121,P,CD4+,HS29,6',
 '13284,P,CD8+,HS51,7',
 '14272,C,CD4+CD45RA-,HS32,7',
 '14272,C,CD4+CD45RA-,HS78,17',
 '14272,C,CD8+,HS40,8',
 '21-001-01-13,C,CD8+,HS104,15',]
P_ = 0
C_ = 0
list_c =0
DBID = []
for item in testlist:
	list_c += 1
	if list_c > 1:
		if item:
			dbid=str(item.split(',')[0])
			dx=item.split(',')[1]
			if dbid not in DBID:
				print 'yey', dx, dbid
				DBID.append(dbid)
				if dx == "P":
					P_ += 1
				else:
					C_ += 1

			else:
				print 'already present counts not performed', dx, dbid
print P_, C_


house = [ [43, 48-43 ], [71, 89-71] ]
chi2, p, ddof, expected = scipy.stats.chi2_contingency( house )
msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n"
print( msg.format( chi2, p, ddof ) )
print( expected )


calls_DF = pd.Series(calls, name='P_C')
calls_DF_=calls_DF.reset_index()
calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)
#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
### calculate chisquare## defin chisq
def chisq_rt(i, j):
	cont_tab = [[i, len(P_DBID)-i], [j, len(C_DBID)-j]]
	chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
	return [p, chi2]

processed_buf = 0
processed_line = 0
calls_F.loc[:, 'PVAL'] = 1
calls_F.loc[:, 'CHI_TSTAT'] ='0'
for i in xrange(len(calls_F)):
	if calls_F.P_Counts[i] > 0 and calls_F.C_Counts[i] > 0:
		processed_line += 1
		if processed_line == 1000:
			processed_buf += 1000
			processed_line = 0
			print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
		chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
		calls_F.loc[i, 'PVAL'] = chi_out[0]
		calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
	else:
		pass
### write out the file
calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
	import re
	import subprocess
	from subprocess import PIPE
	import pandas as pd
	import numpy as np
	import scipy.stats as stats
	import glob
	import scipy
	############################ read in the DBID and TCR identifiers from ling's csv file

	BETAS_ID ={}
	#ALPHAS_ID ={}
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_DBID_HSIDS_SEP6.csv') as in_file:
	line_number =0
	for line in in_file:
	line_number += 1
	if line_number > 1:
	line_parse = line.strip('\r\n').split(',')
	DBID = str(line_parse[0])
	cell_type = str(line_parse[12])
	DX = str(line_parse[2])
	sampleID = str(line_parse[0]) + ','+str(line_parse[2])+','+str(line_parse[12]) ## sampleID
	if line_parse[11] and line_parse[22]:
	make_BETA_BC = str(line_parse[11])+','+str(line_parse[22])
	beta_ID = re.sub(r'(TCR-\|_v5B\|_v5\|_Fonly\|_SEP1\|_V5)', '', make_BETA_BC)
	#print beta_ID, sampleID+','+beta_ID
	if beta_ID not in BETAS_ID:
	BETAS_ID[beta_ID] = sampleID+','+beta_ID
	else:
	print 'fuck', beta_ID, line_number
	#if line_parse[10] and line_parse[21]:
	#make_ALPHA_BC = str(line_parse[10])+','+str(line_parse[21])
	#alpha_ID = re.sub(r'(TCR-\|_v5B\|_v5\|_Fonly\|_SEP1\|_V5)', '', make_ALPHA_BC)
	#print make_ALPHA_BC, make_BETA_BC


	## clone count > 2
	############# this will read in the CDR3 and make a dicitionary CDR3 BETAS will be keys and TCR_HS will be items
	BETA_CDR3={}
	filelist=glob.glob('*.rcl')
	processed_rcl = 0
	for rcl in filelist:
	processed_rcl += 1
	if rcl:
	with open(rcl) as in_file_rcl:
	processed_buf = 0
	processed_line = 0
	for line in in_file_rcl:
	processed_line += 1
	if line:
	if processed_line == 100000:
	processed_buf += 100000
	processed_line = 0
	print ' PROCESSED ', processed_buf, ' lines from ', rcl
	line_parse = line.strip('\r\n').split(',')
	if line_parse[1] == 'b-b' and int(line_parse[9]) >= 2:# clone count > 2
	assert line_parse[1] == 'b-b'
	get_BC_sub = str(line_parse[0])+','+str(line_parse[2])
	makefileBC = re.sub(r'(_v5B\|_v5\|_Fonly\|_SEP1\|_V5)', '', get_BC_sub)
	if line_parse[3] and line_parse[5] and line_parse[8]:
	makeCDR3 = line_parse[3]+','+line_parse[5]+','+line_parse[8]
	if makeCDR3 in BETA_CDR3:
	get_exist = BETA_CDR3.get(makeCDR3)
	if makefileBC not in get_exist.split('%'): ## take only one isntance of the CDR3 for same run
	BETA_CDR3[makeCDR3] = get_exist+'%'+makefileBC
	else:
	BETA_CDR3[makeCDR3] = makefileBC
	## take only clones that are shared in atleast 10 runs
	BETA_CDR3={key: value for key, value in BETA_CDR3.items() if len(value.split('%')) >= 5}
	### confirm if the dictionary comphrension worked or not
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt', 'w') as out:
	for key, value in BETA_CDR3.iteritems():
	value_parse = value.split('%')
	out.write(key+'$')
	for item in value_parse:
	if item:
	sampleID=BETAS_ID.get(item)
	if sampleID:
	out.write(str(sampleID)+'$')
	out.write('\n')
	#print key, value
	## yes n = 241026

	### read in the DQ list
	DQ_DBID =[]
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/DBID_DQ0602_only_OCT3.txt') as dq_f:
	for line in dq_f:
	if line.strip('\r\n'):
	DQ_DBID.append(line.strip('\r\n'))

	cells = ['CD4+', 'CD4+CD45RA-']
	P_DBID=[] ## total length of patients
	C_DBID =[] ## total length of controls
	calls={}
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/CDR3_BETAS_SHARING_OCT4.txt') as cdrfile:
	processed_calls = 0
	processed_buf = 0
	for line in cdrfile:
	processed_calls += 1
	if line:
	if processed_calls == 1000:
	processed_buf += 1000
	processed_calls = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
	line_parse = line.strip('\r\n').split('$')
	if len(line_parse) >= 6:
	P_ = 0
	C_ = 0
	list_c = 0
	DBID = []
	for item in line_parse:
	list_c += 1
	if list_c > 1:
	if item: ## this will update per clone counts for patients and controls
	dbid=str(item.split(',')[0])
	dx = str(item.split(',')[1])
	cell = str(item.split(',')[2])
	if dbid in DQ_DBID:
	if cell in cells:
	if dbid not in DBID:
	DBID.append(dbid)
	if dx == "P":
	P_ += 1
	else:
	C_ += 1
	else:
	pass

	if dx == "P" and dbid not in P_DBID:
	P_DBID.append(dbid)
	elif dx == "C" and dbid not in C_DBID:
	C_DBID.append(dbid)
	else:
	pass
	calls[line_parse[0]] = str(P_)+','+str(C_) ## this is dictionary update keyed with CDR3 itself

	#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
	### calculate chisquare## defin chisq
	def stat_rt(i, j):
	cont_tab = np.array([[i, len(P_DBID)-i], [j, len(C_DBID)-j]], dtype=float)
	chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
	#oddsratio, pvalue = scipy.stats.fisher_exact(cont_tab)
	odds_ratio = (cont_tab[0, 0]/cont_tab[1,0])/(cont_tab[0, 1]/cont_tab[1, 1])
	SE = np.sqrt(np.sum(1/cont_tab.astype(np.float64)))
	L_CI = np.exp(np.log(odds_ratio) - 1.96*SE)
	U_CI = np.exp(np.log(odds_ratio) + 1.96*SE)
	return [p, chi2, odds_ratio, SE, L_CI, U_CI]



	#float_formatter = lambda x: "%.2f" % x
	with open('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_BETAS_SEP7/STAT_BETA_CDR3_OCT5.csv', 'w') as out_f:
	out_f.write('TRBV,TRBJ,CDR3,P_FREQ,C_FREQ,PVAL,T_CHI,ODDS_RATIO,SE,L_CI,U_CI'+'\n')
	processed_buf = 0
	processed_line = 0
	for key, value in calls.iteritems():
	value_parse = value.split(',')
	if int(value_parse[0]) + int(value_parse[1]) >= 5:
	processed_line += 1
	if processed_line == 1000:
	processed_buf += 1000
	processed_line = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
	to_out_1=stat_rt(i=float(value_parse[0]), j=float(value_parse[1]))
	to_out_2 = ','.join(map(str,to_out_1))
	out_f.write(key+','+value+','+to_out_2+'\n')


	calls_F=pd.read_csv('STAT_BETA_CDR3_OCT5.csv')
	calls_F=pd.read_csv('STAT_ALPHA_CDR3_OCT5.csv')
	calls_F.sort_values(['PVAL'], inplace=True)
	calls_F.reset_index(drop=True, inplace=True)
	calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])
	calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])
	calls_F.ix[:20].to_csv('BETA_RESULTS/TOP_PVALUES_CDR3_BETA_OCT5.csv', index=False)

	calls_F.loc[(calls_F.P_FREQ > 5) & (calls_F.C_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/NARCOLEPSY_CDR3_BETA_OCT5.csv', index=False)
	calls_F.loc[(calls_F.C_FREQ > 5) & (calls_F.P_FREQ == 0)].sort_values(['PVAL'])[1:20].to_csv('BETA_RESULTS/CONTROLS_CDR3_BETA_OCT5.csv', index=False)

	calls_F.loc[(calls_F.C_FREQ > 30) & (calls_F.P_FREQ > 30)].sort_values(['PVAL'],
	ascending=False)[1:20].to_csv('BETA_RESULTS/ENRICHED_CDR3_BETA_OCT5.csv', index=False)










	## conisder only the CD4+ total and CD4 memory
	######## debugging
	processed_buf = 0
	processed_line = 0
	calls_F.loc[:, 'PVAL'] = 1
	calls_F.loc[:, 'CHI_TSTAT'] ='0'
	for i in xrange(len(calls_F)):
	if calls_F.P_Counts[i] > 0 or calls_F.C_Counts[i] > 0:
	processed_line += 1
	if processed_line == 1000:
	processed_buf += 1000
	processed_line = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 ALPHAS'
	chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
	calls_F.loc[i, 'PVAL'] = chi_out[0]
	calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
	else:
	pass

	calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)
	calls_DF = pd.Series(calls, name='P_C')
	calls_DF_=calls_DF.reset_index()
	calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
	#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
	calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
	calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)

	'TRAV16,TRAJ42*01,CALSGGSQGNLIF'





	#### debugging scripts #############
	testlist=['TRBV24-101,TRBJ2-101,CATSDFGGYNEQFF',
	'15980,C,CD8+,HS76,8',
	'14450,C,CD8+,HS43,4',
	'13121,P,CD4+,HS29,6',
	'13284,P,CD8+,HS51,7',
	'14272,C,CD4+CD45RA-,HS32,7',
	'14272,C,CD4+CD45RA-,HS78,17',
	'14272,C,CD8+,HS40,8',
	'21-001-01-13,C,CD8+,HS104,15',]
	P_ = 0
	C_ = 0
	list_c =0
	DBID = []
	for item in testlist:
	list_c += 1
	if list_c > 1:
	if item:
	dbid=str(item.split(',')[0])
	dx=item.split(',')[1]
	if dbid not in DBID:
	print 'yey', dx, dbid
	DBID.append(dbid)
	if dx == "P":
	P_ += 1
	else:
	C_ += 1

	else:
	print 'already present counts not performed', dx, dbid
	print P_, C_


	house = [ [43, 48-43 ], [71, 89-71] ]
	chi2, p, ddof, expected = scipy.stats.chi2_contingency( house )
	msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n"
	print( msg.format( chi2, p, ddof ) )
	print( expected )



	calls_DF = pd.Series(calls, name='P_C')
	calls_DF_=calls_DF.reset_index()
	calls_F=pd.concat([calls_DF_, calls_DF_.P_C.str.split(',', expand=True)], axis=1)
	#calls_DF_.loc[:, ('P', 'C')]= calls_DF_.P_C.str.split(',', expand=True)
	calls_F.loc[:, 'P_Counts']=calls_F[0].astype(int)
	calls_F.loc[:, 'C_Counts']=calls_F[1].astype(int)
	#calls_F.sort_values(['P_Counts', 'C_Counts'], ascending=[False, True])
	### calculate chisquare## defin chisq
	def chisq_rt(i, j):
	cont_tab = [[i, len(P_DBID)-i], [j, len(C_DBID)-j]]
	chi2, p, ddof, expected = scipy.stats.chi2_contingency(cont_tab)
	return [p, chi2]

	processed_buf = 0
	processed_line = 0
	calls_F.loc[:, 'PVAL'] = 1
	calls_F.loc[:, 'CHI_TSTAT'] ='0'
	for i in xrange(len(calls_F)):
	if calls_F.P_Counts[i] > 0 and calls_F.C_Counts[i] > 0:
	processed_line += 1
	if processed_line == 1000:
	processed_buf += 1000
	processed_line = 0
	print 'PROCESSED '+str(processed_buf)+' CDR3 BETAS'
	chi_out=chisq_rt(calls_F.P_Counts[i], calls_F.C_Counts[i])
	calls_F.loc[i, 'PVAL'] = chi_out[0]
	calls_F.loc[i, 'CHI_TSTAT'] = chi_out[1]
	else:
	pass
	### write out the file
	calls_F.to_csv('/media/labcomp/HDD2/RCL_FILES_AUG23_2017/BASELINE_CALLS_AUG31/CHI_SQ_ALPHA_CDR3.csv', index=False)