Skip to content

Instantly share code, notes, and snippets.

@esford3

esford3/vzv.py Secret

Created July 30, 2023 16:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save esford3/0b49ea55adacb96ee29412573a178ab7 to your computer and use it in GitHub Desktop.
Save esford3/0b49ea55adacb96ee29412573a178ab7 to your computer and use it in GitHub Desktop.
adaptive file read+merge
import os
import pandas as pd
import platform
import re
import parmap
from tcrdist.swap_gene_name import adaptive_to_imgt
# Directory that holds the key file; the listed data files are read from it too.
path = ""
# Name of the tab-separated key file whose 'file' column lists the files to include.
key = ""
key_table = pd.read_csv(os.path.join(path, key), sep="\t")
# Work through the listed file names in sorted order.
files = sorted(key_table["file"])
print(files)
# Read every listed .tsv file, tag its rows with the file name, drop rows with a
# missing 'aminoAcid' or 'vMaxResolved' value, and stack everything into one frame.
count = 1  # running counter; only the optional debug cut-off below uses it
vzv_list = []
for file in files:
    # Key-file entries that are not .tsv files are ignored.
    if not file.endswith(".tsv"):
        continue
    df = pd.read_csv(os.path.join(path, file), sep="\t")
    df["file"] = file  # remember which file each row came from
    # One call drops rows missing either value, exactly as two sequential calls would.
    df = df.dropna(subset=["aminoAcid", "vMaxResolved"])
    print(df.head(2))
    vzv_list.append(df)
    count += 1
    # if count > 5: break
if not vzv_list:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise ValueError("no .tsv files were listed in the key file; nothing to merge")
vzv_tcrs = pd.concat(vzv_list, axis=0)
# Keep only rows whose sequenceStatus is 'In', then renumber the rows from zero.
vzv_tcrs = vzv_tcrs[vzv_tcrs["sequenceStatus"] == "In"].reset_index(drop=True)

# Input column name -> working column name; columns not listed here are dropped.
cols = {
    "nucleotide": "cdr3_b_nucseq",
    "aminoAcid": "cdr3_b_aa",
    "count (templates/reads)": "templates",
    "frequencyCount (%)": "freq",
    "vMaxResolved": "vMaxResolved",
    "jMaxResolved": "jMaxResolved",
    "file": "file",
}
# Select with a list of names: pandas 2.0+ raises TypeError for a dict indexer.
vzv_tcrs = vzv_tcrs[list(cols)].rename(columns=cols)

# Literal replacement of "gE_CD4" by "gE" -- presumably so the split on "_" below
# does not break this label in two (TODO confirm against the file naming scheme).
vzv_tcrs["file"] = vzv_tcrs["file"].str.replace("gE_CD4", "gE", regex=False)
# Split the file name at its first three underscores into four fields.
vzv_tcrs[["person", "cell", "type", "TCR"]] = vzv_tcrs["file"].str.split(
    "_", n=3, expand=True
)
vzv_tcrs["source"] = vzv_tcrs["cell"] + "_" + vzv_tcrs["type"]


def _lookup_imgt_human(gene):
    """Look up a gene name in tcrdist's ``adaptive_to_imgt['human']`` table.

    Text from the first "*" onward is removed before the lookup. Returns None
    when *gene* is not a string or has no entry in the table.
    """
    if not isinstance(gene, str):
        return None
    return adaptive_to_imgt["human"].get(gene.split("*")[0])


vzv_tcrs["v_b_gene"] = vzv_tcrs["vMaxResolved"].apply(_lookup_imgt_human)
vzv_tcrs["j_b_gene"] = vzv_tcrs["jMaxResolved"].apply(_lookup_imgt_human)
# Final column set: 'templates' is renamed to 'count', every other column keeps
# its name, and columns not listed here are dropped.
cols = {
    "cdr3_b_nucseq": "cdr3_b_nucseq",
    "cdr3_b_aa": "cdr3_b_aa",
    "templates": "count",
    "freq": "freq",
    "person": "person",
    "source": "source",
    "v_b_gene": "v_b_gene",
    "j_b_gene": "j_b_gene",
}
# Select with a list of names: pandas 2.0+ raises TypeError for a dict indexer.
vzv_tcrs = vzv_tcrs[list(cols)].rename(columns=cols)
## columns kept as row identifiers in both wide tables
row_keys = ["cdr3_b_nucseq", "cdr3_b_aa", "v_b_gene", "j_b_gene", "person"]

## frequency wide version: one column of 'freq' values per source
dfp = vzv_tcrs.pivot(index=row_keys, columns="source", values="freq").reset_index()

## counts wide version: same layout, filled with 'count' values
dft = vzv_tcrs.pivot(index=row_keys, columns="source", values="count").reset_index()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment