Skip to content

Instantly share code, notes, and snippets.

@adiamb
Created August 15, 2017 01:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adiamb/69923564291abfdc6629c36bebfbb0dd to your computer and use it in GitHub Desktop.
Save adiamb/69923564291abfdc6629c36bebfbb0dd to your computer and use it in GitHub Desktop.
import re

import numpy as np
import pandas as pd
# Load the 'total file' sheet from both workbooks.  `sheet_name` is the
# keyword accepted by current pandas; the older `sheetname` spelling is
# rejected by recent releases.
total_TCR = pd.read_excel('total done list 7-18-17.xlsx', sheet_name='total file')
TCR_5merBC = pd.read_excel('early deep SEQ with 5 barcode.xlsx', sheet_name='total file')

# Keep only the columns of the 5-barcode sheet that are inspected below
# (a list, not a tuple, is the documented way to select several columns).
TCR_5mer_sub = TCR_5merBC.loc[
    :, ['DbID', 'cell type', 'Dx', '# to SEQ1', 'alpha', '# to SEQ2', 'beta']
]

# Row counts per group.  The results are not stored: when run as a script
# these two lines only verify that the grouping columns exist.
TCR_5mer_sub.groupby(['cell type', 'Dx']).size()
total_TCR.groupby(['DbID', 'cell type', 'Dx', 'rs1483979']).size()
# Harmonise the free-text 'cell type' labels in one pass.  Every spelling of
# the CD4 memory population ends up as 'CD4+CD45RA-'; bare 'CD4' / 'CD8' gain
# the trailing '+'.  Labels not listed here are left untouched.
total_TCR['cell type'] = total_TCR['cell type'].replace({
    'CD4': 'CD4+',
    'CD4 memory': 'CD4+CD45RA-',
    'CD4+memory': 'CD4+CD45RA-',
    'CD4+Memory': 'CD4+CD45RA-',
    'CD8': 'CD8+',
})
# Columns copied from the master list into every per-cell-type subset.
_SUBSET_COLUMNS = ['DbID', 'Dx', '# to SEQ1', 'alpha', '# to SEQ2', 'beta', 'note', 'rs1483979']


def _cg_subset(cell_type):
    """Return the rows of ``total_TCR`` for one cell type whose rs1483979 contains 'CG'.

    The result is an independent copy (so adding a column does not trigger
    pandas' SettingWithCopy warning) with an extra ``CELL_TYPE`` column set
    to *cell_type*.  Rows with a missing rs1483979 value are excluded.
    """
    is_cell_type = total_TCR['cell type'] == cell_type
    # na=False: a missing genotype counts as "does not contain 'CG'".
    has_cg = total_TCR['rs1483979'].str.contains('CG', na=False)
    subset = total_TCR.loc[is_cell_type & has_cg, _SUBSET_COLUMNS].copy()
    subset['CELL_TYPE'] = cell_type
    return subset


CD4_mem = _cg_subset('CD4+CD45RA-')
CD4_Nai = _cg_subset('CD4+CD45RA+')
CD4_total = _cg_subset('CD4+')
CD8_total = _cg_subset('CD8+')

# Stack the four subsets; the original row labels are kept at this point.
all_mem = pd.concat([CD4_mem, CD4_Nai, CD4_total, CD8_total], axis=0)
all_mem['DbID'] = all_mem['DbID'].astype(object)
# Prefix variants that are rewritten to the single spelling 'TCR-HS'.
_SEQ_ID_PATTERN = re.compile(r'(TCRHS|TCR-HS-|^HS)')


def _uniform_seq_id(raw):
    """Return *raw* as a string with its run prefix rewritten to 'TCR-HS'.

    Missing values (NaN/None) and other falsy cells yield 'NA'.  NaN has to
    be tested explicitly: it is truthy, so a plain ``if raw`` check would let
    it through and produce the string 'nan'.
    """
    if pd.isnull(raw) or not raw:
        return 'NA'
    return _SEQ_ID_PATTERN.sub('TCR-HS', str(raw))


all_mem['SEQ1'] = [_uniform_seq_id(raw) for raw in all_mem['# to SEQ1']]
all_mem['SEQ2'] = [_uniform_seq_id(raw) for raw in all_mem['# to SEQ2']]

# Intermediate export of the table with the normalised SEQ1/SEQ2 columns.
all_mem.to_csv('/home/labcomp/Desktop/ALL_MEMORY_NAIVE_SEQ_AUGUST14.csv', index=False)
# Same prefix rewrite as for the main table, except that a literal '.' is
# also replaced by 'TCR-HS' (so a cell holding only '.' becomes the bare
# placeholder 'TCR-HS', which is dropped below).
_FIVEMER_ID_PATTERN = re.compile(r'(TCRHS|TCR-HS-|^HS|\.)')

# Values that do not identify a run and must never be used for matching.
_FIVEMER_PLACEHOLDERS = ('TCR-HS', 'NA', 'nan')


def _unique_5mer_ids(column):
    """Return the distinct normalised run IDs in *column*, in first-seen order.

    Missing or falsy cells and bare placeholders are left out.  Filtering
    (rather than ``list.remove``) means an absent placeholder is not an
    error, and both SEQ columns are cleaned in exactly the same way.
    """
    normalised = pd.Series([
        'NA' if pd.isnull(raw) or not raw else _FIVEMER_ID_PATTERN.sub('TCR-HS', str(raw))
        for raw in column
    ])
    return [
        seq_id for seq_id in normalised.unique().tolist()
        if seq_id not in _FIVEMER_PLACEHOLDERS
    ]


SEQ1_5mer = _unique_5mer_ids(TCR_5merBC['# to SEQ1'])
SEQ2_5mer = _unique_5mer_ids(TCR_5merBC['# to SEQ2'])
# Renumber the rows 0..n-1 before flagging.
all_mem.reset_index(drop=True, inplace=True)

# For each SEQ column add a flag column: 'YES' when the normalised run ID
# appears in the corresponding 5-mer barcode list, '' otherwise.
for seq_column, flag_column, barcoded_ids in (
    ('SEQ1', 'SEQ1_5merBC?', SEQ1_5mer),
    ('SEQ2', 'SEQ2_5merBC?', SEQ2_5mer),
):
    all_mem[flag_column] = ''
    is_barcoded = all_mem[seq_column].isin(barcoded_ids)
    all_mem.loc[is_barcoded, flag_column] = 'YES'

# Write the flagged table (same path as the earlier export, which it replaces).
all_mem.to_csv('/home/labcomp/Desktop/ALL_MEMORY_NAIVE_SEQ_AUGUST14.csv', index=False)
# SEQ1_FORMYUSE SEQ2_FORMYUSE
# NOTE(review): the line above was a bare note in the script (a syntax
# error as code); presumably column names - confirm with the author.

################### LING returned file by email August 14 at 2:30pm
ling_file = pd.read_excel(
    'ALL_MEMORY_NAIVE_SEQ_AUGUST15.xlsx',
    sheet_name='ALL_MEMORY_NAIVE_SEQ_AUGUST14',
)

# Rows whose SEQ1 run was flagged as using the 5-mer barcode.
for_re_alpha = ling_file.loc[ling_file['SEQ1_5merBC?'] == 'YES']
TCRHS_IDS_SLEEP = pd.read_csv('TCR_HS_IDENTIFIERS_August14_2017.csv')

## check how many runs have to be reanalysed
# Flagged rows whose SEQ1_UNIFORM value is listed under 'Sample name'.
runs_to_reanalyse = for_re_alpha.loc[
    for_re_alpha.SEQ1_UNIFORM.isin(TCRHS_IDS_SLEEP['Sample name'])
]

# Exploratory pivot; the result is not stored.
# NOTE(review): uses the default aggregation, which may fail on non-numeric
# columns with recent pandas - confirm whether this line is still needed.
pd.pivot_table(ling_file, index=['DbID', 'Dx', 'rs1483979'], columns=['CELL_TYPE'])
#pd.crosstab(index=ling_file['DbID'], columns=[ling_file['CELL_TYPE']], values =[ling_file['SEQ1_UNIFORM']])
# One row per (DbID, Dx, rs1483979) combination, with the group size in an
# extra column produced by reset_index().
group_sizes = ling_file.groupby(['DbID', 'Dx', 'rs1483979']).size()
aggregated_alpha = pd.DataFrame(group_sizes.reset_index())

# Start every per-cell-type summary column (plus the catch-all 'others')
# as an empty string; the aggregation loop appends to these cells.
for summary_column in ('CD4+', 'CD4+CD45RA-', 'CD4+CD45RA+', 'CD8+', 'others'):
    aggregated_alpha[summary_column] = ''
# Cell types that have a dedicated column in aggregated_alpha; every other
# (non-empty) cell type is collected in the 'others' column.
# NOTE(review): the original indentation was lost; this assumes the final
# `else` belonged to the cell-type chain, not to the emptiness check.
_DEDICATED_CELL_TYPES = ('CD8+', 'CD4+', 'CD4+CD45RA-', 'CD4+CD45RA+')

# For every summary row, gather all runs of that subject and write them into
# the column of their cell type as '<SEQ1_UNIFORM>_<alpha>', joining several
# runs of the same cell type with '/'.
# NOTE(review): runs are matched on DbID only, although summary rows are
# keyed by (DbID, Dx, rs1483979) - confirm this is intended.
for i, db_id in aggregated_alpha['DbID'].items():
    subject_runs = ling_file.loc[ling_file.DbID == db_id]
    run_fields = zip(
        subject_runs['CELL_TYPE'],
        subject_runs['SEQ1_UNIFORM'],
        subject_runs['alpha'],
    )
    for cell_type, seq_id, alpha in run_fields:
        if not cell_type:
            # Empty cell type: nothing to file this run under.
            continue
        column = cell_type if cell_type in _DEDICATED_CELL_TYPES else 'others'
        entry = str(seq_id) + '_' + str(alpha)
        existing = aggregated_alpha.loc[i, column]
        if existing:
            entry = str(existing) + '/' + entry
        aggregated_alpha.loc[i, column] = entry
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment