# JamieHeather/exploring_vdjdb.sh

## exploring_vdjdb.sh
# Build a local copy of VDJdb, then extract HIV-specific TCR CDR3 data from it.
#
# Files written (in the directory holding vdjdb_full.txt):
#   all_alpha_hiv_cdr3s.tsv    unique alpha CDR3s of TCRs recognising HIV
#   all_a2_beta_hiv_cdr3s.tsv  unique beta CDR3 / V / J / MHC / peptide rows
#                              of TCRs recognising HIV on HLA-A*02

# Download repo, build database then navigate to it.
# The whole setup is skipped when vdjdb_full.txt is already in the current
# directory, so the extraction steps below can be re-run without re-cloning.
if [ ! -f vdjdb_full.txt ]; then
  # OSX groovy installation, replace depending on your setup.
  # Only attempted when groovy is not already on the PATH.
  command -v groovy >/dev/null 2>&1 || brew install groovy

  if [ ! -d vdjdb-db ]; then
    git clone https://github.com/antigenomics/vdjdb-db.git || exit 1
  fi

  # Stop on failure: otherwise the later commands would run in the wrong
  # directory and silently produce empty output files.
  cd vdjdb-db/src/ || exit 1
  groovy -cp . BuildDatabase.groovy || exit 1
  cd ../database/ || exit 1
fi

# See fields
head -n 1 vdjdb_full.txt

# fields 1 to 3 are alpha CDR3, V and J, fields 4 to 7 are beta CDR3, V, D and J, field 9 is MHC, 12 is peptide and 14 is antigen source

# Get just the alpha CDR3 sequences for any TCRs recognising HIV.
# Awk checks that the CDR3 is present and that the antigen source is HIV;
# sort -u then leaves one sorted line per distinct CDR3.
awk -F '\t' 'length($1) > 1 && $14 == "HIV" { print $1 }' vdjdb_full.txt \
  | sort -u > all_alpha_hiv_cdr3s.tsv

# Get CDR3, V, J, MHC and recognised peptide of all beta chains recognising HIV on an HLA-A*02 background.
# OFS is set to a tab so awk emits tab-delimited rows itself; this avoids
# relying on '\t' in a sed replacement (not portable) and avoids turning
# spaces inside a field value into extra column breaks.
# grep is still used to catch the HLA type, as it's written variably
# (e.g. HLA-A*02/02:01/02:01:XX, depending on source); -F with a quoted
# pattern makes the '*' a literal character for both the shell and grep.
awk -F '\t' -v OFS='\t' \
  'length($4) > 1 && $14 == "HIV" { print $4, $5, $7, $9, $12 }' vdjdb_full.txt \
  | grep -F 'HLA-A*02' \
  | sort -u > all_a2_beta_hiv_cdr3s.tsv

# Note that I've only successfully tested this code in the terminal on a Mac - quick test on Linux proves that awk behaves differently on each
# Frankly in all but a few edge cases you're probably just OK grepping across the whole lines
	# NOTE(review): this indented section repeats the steps earlier in the file
	# and looks like an accidental duplicate - confirm whether it is needed.
	#
	# Download repo, build database then navigate to it.
	# The whole setup is skipped when vdjdb_full.txt is already in the current
	# directory, so the extraction steps below can be re-run without re-cloning.
	if [ ! -f vdjdb_full.txt ]; then
		# OSX groovy installation, replace depending on your setup.
		# Only attempted when groovy is not already on the PATH.
		command -v groovy >/dev/null 2>&1 || brew install groovy

		if [ ! -d vdjdb-db ]; then
			git clone https://github.com/antigenomics/vdjdb-db.git || exit 1
		fi

		# Stop on failure: otherwise the later commands would run in the wrong
		# directory and silently produce empty output files.
		cd vdjdb-db/src/ || exit 1
		groovy -cp . BuildDatabase.groovy || exit 1
		cd ../database/ || exit 1
	fi

	# See fields
	head -n 1 vdjdb_full.txt

	# fields 1 to 3 are alpha CDR3, V and J, fields 4 to 7 are beta CDR3, V, D and J, field 9 is MHC, 12 is peptide and 14 is antigen source

	# Get just the alpha CDR3 sequences for any TCRs recognising HIV.
	# Awk checks that the CDR3 is present and that the antigen source is HIV;
	# sort -u then leaves one sorted line per distinct CDR3.
	# The pipe must be a real, unescaped '|': written as '\|' the shell hands
	# a literal "|" and the following words to awk as extra file names.
	awk -F '\t' 'length($1) > 1 && $14 == "HIV" { print $1 }' vdjdb_full.txt |
		sort -u > all_alpha_hiv_cdr3s.tsv

	# Get CDR3, V, J, MHC and recognised peptide of all beta chains recognising HIV on an HLA-A*02 background.
	# OFS is set to a tab so awk emits tab-delimited rows itself; this avoids
	# relying on '\t' in a sed replacement (not portable) and avoids turning
	# spaces inside a field value into extra column breaks.
	# grep is still used to catch the HLA type, as it's written variably
	# (e.g. HLA-A*02/02:01/02:01:XX, depending on source); -F with a quoted
	# pattern makes the '*' a literal character for both the shell and grep.
	awk -F '\t' -v OFS='\t' \
		'length($4) > 1 && $14 == "HIV" { print $4, $5, $7, $9, $12 }' vdjdb_full.txt |
		grep -F 'HLA-A*02' |
		sort -u > all_a2_beta_hiv_cdr3s.tsv

	# Note that I've only successfully tested this code in the terminal on a Mac - quick test on Linux proves that awk behaves differently on each
	# Frankly in all but a few edge cases you're probably just OK grepping across the whole lines