sirselim/dbNSFP_pipeline_build.sh

## dbNSFP_pipeline_build.sh
#!/bin/bash
## small bash script to download and reformat dbNSFP for pipeline
## Miles Benton
## created: 2018-01-13
## modified: 2019-08-21

# Set to dbNSFP version to download and build
version="4.0a"
#TODO: add an option to 'scrape' this from the url to always return latest version
# define thread number for parallel processing where able
THREADS=6

# Download dbNSFP database
wget -O dbNSFP${version}.zip "ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip"

# Uncompress
unzip dbNSFP${version}.zip

# grab header
zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz

### this section will produce data for hg38 capable pipelines
## hg38 version

# Create a single file version
# NOTE: bgzip parameter -@ X represents number of threads
cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
#TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)

# add header back into file
cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz

# Create tabix index
tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz

# test annotation
# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
#TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
#TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build.

# clean up
#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
#/END hg38
###

### this section will produce data for hg19 capable pipelines
## hg19 version
# for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
# this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
# NOTE: bgzip parameter -@ X represents number of threads

# create a file with ordered chrosome names
echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt

# set awk params as variable for parallel
awkBody1='$8 != "."'
awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'

# parallel implementation of the hg19 format and sort
parallel -j 4 \
"zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
awk '${awkBody1}' | \
awk '${awkBody2}' | \
LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
#TODO: need to implement a defined approach to GNU parallel, can't use $THREADS

# cat all files back together (should retain sort order)
cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

# Create tabix index
tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

# test hg19 annotation
# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf

# clean up
#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
#/END hg38
###
	#!/bin/bash
	## small bash script to download and reformat dbNSFP for pipeline
	## Miles Benton
	## created: 2018-01-13
	## modified: 2019-08-21

	# Set to dbNSFP version to download and build
	version="4.0a"
	#TODO: add an option to 'scrape' this from the url to always return latest version
	# define thread number for parallel processing where able
	THREADS=6

	# Download dbNSFP database
	wget -O dbNSFP${version}.zip "ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip"

	# Uncompress
	unzip dbNSFP${version}.zip

	# grab header
	zcat dbNSFP${version}_variant.chr1.gz \| head -n 1 \| bgzip > header.gz

	### this section will produce data for hg38 capable pipelines
	## hg38 version

	# Create a single file version
	# NOTE: bgzip parameter -@ X represents number of threads
	cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz \| zgrep -v '#chr' \| bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
	#TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)

	# add header back into file
	cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz

	# Create tabix index
	tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz

	# test annotation
	# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
	#TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
	#TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build.

	# clean up
	#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
	#/END hg38
	###

	### this section will produce data for hg19 capable pipelines
	## hg19 version
	# for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
	# this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
	# NOTE: bgzip parameter -@ X represents number of threads

	# create a file with ordered chrosome names
	echo {1..22} X Y M \| tr ' ' '\n' > chromosomeList.txt

	# set awk params as variable for parallel
	awkBody1='$8 != "."'
	awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'

	# parallel implementation of the hg19 format and sort
	parallel -j 4 \
	"zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz \| \
	awk '${awkBody1}' \| \
	awk '${awkBody2}' \| \
	LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip \| \
	bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
	#TODO: need to implement a defined approach to GNU parallel, can't use $THREADS

	# cat all files back together (should retain sort order)
	cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

	# Create tabix index
	tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

	# test hg19 annotation
	# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf

	# clean up
	#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
	#/END hg38
	###