# adiamb/retreive_memory_TCells_Seq.py

## retreive_memory_TCells_Seq.py
import re

import numpy as np
import pandas as pd


# Read the two source workbooks; both are loaded from the sheet named
# 'total file'.
# NOTE(review): `sheetname` is the older pandas spelling of this keyword;
# recent pandas releases spell it `sheet_name` -- confirm against the pandas
# version this script is run with.
_SOURCE_SHEET = 'total file'
total_TCR = pd.read_excel('total done list 7-18-17.xlsx',
                          sheetname=_SOURCE_SHEET)
TCR_5merBC = pd.read_excel('early deep SEQ with 5 barcode.xlsx',
                           sheetname=_SOURCE_SHEET)

# Column subset of the 5-barcode workbook.
_FIVEMER_COLUMNS = ('DbID', 'cell type', 'Dx', '# to SEQ1', 'alpha',
                    '# to SEQ2', 'beta')
TCR_5mer_sub = TCR_5merBC.loc[:, _FIVEMER_COLUMNS]

# Row counts per group.  The results are not stored, so they are only visible
# when these lines are evaluated interactively.
TCR_5mer_sub.groupby(['cell type', 'Dx']).size()
total_TCR.groupby(['DbID', 'cell type', 'Dx', 'rs1483979']).size()

# Harmonise the 'cell type' labels of total_TCR in place so the selections
# further down can match on one spelling per population:
#   'CD4'                                      -> 'CD4+'
#   'CD4 memory' / 'CD4+memory' / 'CD4+Memory' -> 'CD4+CD45RA-'
#   'CD8'                                      -> 'CD8+'
# The memory spellings used to be rewritten to 'CD4+Memory' first and then to
# 'CD4+CD45RA-'; mapping them directly gives the same final labels.
# NOTE(review): the original repeated the 'CD4+memory' rule twice; if another
# spelling was intended there, add it to _CD4_MEMORY_ALIASES.
_CD4_MEMORY_ALIASES = ['CD4 memory', 'CD4+memory', 'CD4+Memory']

total_TCR.loc[total_TCR['cell type'] == 'CD4', 'cell type'] = 'CD4+'
total_TCR.loc[total_TCR['cell type'].isin(_CD4_MEMORY_ALIASES),
              'cell type'] = 'CD4+CD45RA-'
total_TCR.loc[total_TCR['cell type'] == 'CD8', 'cell type'] = 'CD8+'

# Columns carried over from total_TCR into every per-cell-type subset.
_SUBSET_COLUMNS = ['DbID', 'Dx', '# to SEQ1', 'alpha', '# to SEQ2', 'beta',
                   'note', 'rs1483979']


def _cell_type_subset(cell_type):
    """Return a labelled copy of the total_TCR rows for one cell type.

    Keeps the rows whose 'cell type' equals *cell_type* and whose rs1483979
    value contains 'CG', restricted to _SUBSET_COLUMNS, and adds a CELL_TYPE
    column holding *cell_type*.
    """
    # na=False: rows with a missing rs1483979 value are excluded instead of
    # putting NaN into the boolean mask.
    has_cg = total_TCR['rs1483979'].str.contains('CG', na=False)
    is_cell_type = total_TCR['cell type'] == cell_type
    # .copy() so the new column is added to an independent frame rather than
    # to a slice of total_TCR.
    subset = total_TCR.loc[is_cell_type & has_cg, _SUBSET_COLUMNS].copy()
    subset['CELL_TYPE'] = cell_type
    return subset


CD4_mem = _cell_type_subset('CD4+CD45RA-')
CD4_Nai = _cell_type_subset('CD4+CD45RA+')
CD4_total = _cell_type_subset('CD4+')
CD8_total = _cell_type_subset('CD8+')

# Stack the four subsets row-wise; the original row labels are kept here.
all_mem = pd.concat([CD4_mem, CD4_Nai, CD4_total, CD8_total], axis=0)

def _uniform_seq_id(raw_id):
    """Return *raw_id* as text with its prefix rewritten to 'TCR-HS'.

    Every occurrence of 'TCRHS' or 'TCR-HS-', and a leading 'HS', is replaced
    by 'TCR-HS'.  Empty/zero cells and missing (NaN) cells yield 'NA'.
    """
    # NaN is truthy, so a plain truthiness test would let missing cells
    # through as the text 'nan'; check for missing values explicitly.
    if pd.isnull(raw_id) or not raw_id:
        return 'NA'
    return re.sub(r'(TCRHS|TCR-HS-|^HS)', 'TCR-HS', str(raw_id))


all_mem['DbID'] = all_mem['DbID'].astype(object)

# Normalised copies of the two "# to SEQ" columns.
all_mem['SEQ1'] = [_uniform_seq_id(raw_id) for raw_id in all_mem['# to SEQ1']]
all_mem['SEQ2'] = [_uniform_seq_id(raw_id) for raw_id in all_mem['# to SEQ2']]

_ALL_MEM_CSV = '/home/labcomp/Desktop/ALL_MEMORY_NAIVE_SEQ_AUGUST14.csv'

# Values that stand for "no sequencing id" after normalisation: a bare
# 'TCR-HS' prefix, the text form of NaN, and the 'NA' filler.
_PLACEHOLDER_IDS = ('TCR-HS', 'nan', 'NA')


def _fivemer_ids(raw_ids):
    """Return the distinct normalised ids found in *raw_ids*.

    Each truthy value is converted to text and every 'TCRHS', 'TCR-HS-',
    leading 'HS' and '.' in it is replaced by 'TCR-HS'; falsy values become
    'NA'.  Placeholder values are dropped from the result so they can never
    match a row of all_mem.
    """
    normalised = [
        re.sub(r'(TCRHS|TCR-HS-|^HS|\.)', 'TCR-HS', str(i)) if i else 'NA'
        for i in raw_ids
    ]
    distinct = pd.Series(normalised).unique().tolist()
    # Filtering (rather than list.remove) does not raise when a placeholder
    # happens to be absent from the column.
    return [seq_id for seq_id in distinct if seq_id not in _PLACEHOLDER_IDS]


# Snapshot before the 5-mer flags are added; the same path is rewritten below.
all_mem.to_csv(_ALL_MEM_CSV, index=False)

SEQ1_5mer = _fivemer_ids(TCR_5merBC['# to SEQ1'])
SEQ2_5mer = _fivemer_ids(TCR_5merBC['# to SEQ2'])

all_mem.reset_index(drop=True, inplace=True)

# Flag columns: '' by default, 'YES' when the normalised id also appears in
# the 5-barcode workbook.
all_mem['SEQ1_5merBC?'] = ''
all_mem['SEQ2_5merBC?'] = ''
all_mem.loc[all_mem['SEQ1'].isin(SEQ1_5mer), 'SEQ1_5merBC?'] = 'YES'
all_mem.loc[all_mem['SEQ2'].isin(SEQ2_5mer), 'SEQ2_5merBC?'] = 'YES'

all_mem.to_csv(_ALL_MEM_CSV, index=False)

# SEQ1_FORMYUSE, SEQ2_FORMYUSE  -- stray note (presumably column names); commented out so the script parses
################### Ling returned the file by email on August 14 at 2:30 pm
# Load the returned workbook and the table of TCR-HS identifiers.
# NOTE(review): `sheetname` is the older pandas spelling of this keyword;
# recent pandas releases spell it `sheet_name` -- confirm against the pandas
# version this script is run with.
ling_file = pd.read_excel(
    'ALL_MEMORY_NAIVE_SEQ_AUGUST15.xlsx',
    sheetname='ALL_MEMORY_NAIVE_SEQ_AUGUST14',
)
_flagged_seq1 = ling_file['SEQ1_5merBC?'] == 'YES'
for_re_alpha = ling_file.loc[_flagged_seq1]
TCRHS_IDS_SLEEP = pd.read_csv('TCR_HS_IDENTIFIERS_August14_2017.csv')

## check how many runs have to be reanalysed
# Both expressions below are evaluated and discarded, so their output is only
# visible when run interactively.
_known_sample = for_re_alpha.SEQ1_UNIFORM.isin(TCRHS_IDS_SLEEP['Sample name'])
for_re_alpha.loc[_known_sample]
pd.pivot_table(ling_file, index=['DbID', 'Dx', 'rs1483979'],
               columns=['CELL_TYPE'])

#pd.crosstab(index=ling_file['DbID'], columns=[ling_file['CELL_TYPE']], values =[ling_file['SEQ1_UNIFORM']])

# One row per (DbID, Dx, rs1483979) combination together with its row count.
_group_sizes = ling_file.groupby(['DbID', 'Dx', 'rs1483979']).size()
aggregated_alpha = pd.DataFrame(_group_sizes.reset_index())

# Empty text columns, one per cell type plus a catch-all.
for _column in ('CD4+', 'CD4+CD45RA-', 'CD4+CD45RA+', 'CD8+', 'others'):
    aggregated_alpha.loc[:, _column] = ''

# Cell types that have their own column in aggregated_alpha; any other
# non-empty cell type is collected under 'others'.
_KNOWN_CELL_TYPES = ('CD8+', 'CD4+', 'CD4+CD45RA-', 'CD4+CD45RA+')


def _alpha_labels_by_column(sample_rows):
    """Group '<SEQ1_UNIFORM>_<alpha>' labels of *sample_rows* by target column.

    Returns a dict mapping a column name (one of _KNOWN_CELL_TYPES or
    'others') to the list of labels for that column, in row order.  Rows
    whose CELL_TYPE is falsy (for example an empty string) are skipped.
    """
    labels = {}
    rows = zip(sample_rows['CELL_TYPE'], sample_rows['SEQ1_UNIFORM'],
               sample_rows['alpha'])
    for cell_type, seq_id, alpha in rows:
        if not cell_type:
            continue
        column = cell_type if cell_type in _KNOWN_CELL_TYPES else 'others'
        labels.setdefault(column, []).append(str(seq_id) + '_' + str(alpha))
    return labels


# Fill the per-cell-type columns of aggregated_alpha: each cell receives the
# '/'-joined labels of every ling_file row that shares the row's DbID.
# NOTE(review): rows are matched on DbID only, although aggregated_alpha is
# keyed by (DbID, Dx, rs1483979) -- confirm that a DbID never spans several
# Dx/rs1483979 combinations.
for i in range(len(aggregated_alpha)):
    sample_rows = ling_file.loc[ling_file['DbID'] == aggregated_alpha.loc[i, 'DbID']]
    for column, labels in _alpha_labels_by_column(sample_rows).items():
        existing = aggregated_alpha.loc[i, column]
        # Append to whatever text the cell already holds.
        parts = ([str(existing)] if existing else []) + labels
        aggregated_alpha.loc[i, column] = '/'.join(parts)