tkrahn/bwa_hg38_pipeline_release.sh

## bwa_hg38_pipeline_release.sh
#!/bin/bash

START=$(date +%s.%N)
clear
# setup parameters

YSEQID=${PWD##*/}
# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory)

NUM_THREADS=$(getconf _NPROCESSORS_ONLN)
echo "We can use ${NUM_THREADS} threads."

BWA='bwa'
if test -x "/usr/local/bin/bwa-mem2"; then
    echo "We can use bwa-mem2"
    BWA='bwa-mem2'
fi


UPS='upload.server.yseq.net'

# Create a random password
PASSWD=`date +%s | sha256sum | base64 | head -c 8`
echo "Random password created: ${PASSWD}"

# Generate summary file
cat /genomes/0/script-templates/x_result_summary.txt | sed "s/<kitnumber>/${YSEQID}/g" | sed "s/<passwd>/${PASSWD}/g" >${YSEQID}_result_summary.txt
echo "${YSEQID}_result_summary.txt created"

REF="/genomes/0/refseq/hg38/hg38.fa"

READS_1="fastq/${YSEQID}_R1.fastq.gz"
READS_2="fastq/${YSEQID}_R2.fastq.gz"
#SAMFILE="${YSEQID}_bwa-mem_hg38.sam" # no longer needed due to piping
BAMFILE="${YSEQID}_bwa-mem_hg38.bam"

BAMFILE_SORTED="${YSEQID}_bwa-mem_hg38_sorted.bam"
VCF_FILE="${YSEQID}_hg38.vcf"

cp /genomes/0/tree/trees/yfull/latest_YFull_YTree.json .
YFULLTREE="latest_YFull_YTree.json"

cp /genomes/0/script-templates/cladeFinder.py .
CLADEFINDER="cladeFinder.py"

#ANN_VCF_FILE="${YSEQID}_hg38_dbSNP150_ann.vcf"
#ANN_CLINVAR_VCF_FILE="${YSEQID}_hg38_clinvar_ann.vcf"

#DBSNP="/genomes/0/refseq/hg19/All_chr_20161121.vcf.gz"

# Read Group(s)
# READ_GROUPS="\"@RG\tID:1\tSM:${YSEQID}\tPL:ILLUMINA\tPU:UNIT1\tLB:${YSEQID}\""

if [[ $1 != "-resume" ]]
then


	# Pipeline commands:
	${BWA} mem -t $NUM_THREADS -M $REF $READS_1 $READS_2 | \
	samtools view -@ $NUM_THREADS -b -t $REF -o $BAMFILE -
	samtools sort -@ $NUM_THREADS -T /usr/local/geospiza/var/tmp/sorted -o $BAMFILE_SORTED $BAMFILE
	samtools index $BAMFILE_SORTED
	samtools idxstats $BAMFILE_SORTED > ${BAMFILE_SORTED}.idxstats.tsv
fi

# Update summary file with BAM filesize:
BAM_FILESIZE=`du -kh "${BAMFILE_SORTED}" | cut -f1`
echo "Size of ${BAMFILE_SORTED} = ${BAM_FILESIZE}"
cat ${YSEQID}_result_summary.txt | sed "s/<bamsize>/${BAM_FILESIZE}byte/g" >${YSEQID}_result_summary.txt.tmp
rm -f ${YSEQID}_result_summary.txt
mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

if [[ $1 != "-resume" ]]
then

	# Delete no longer needed large files
	#rm -f $SAMFILE
	rm -f $BAMFILE # keep $BAMFILE_SORTED
fi

# Easy tview file
echo "#!/bin/bash" > tview_${YSEQID}.sh
echo "samtools tview ${BAMFILE_SORTED} ${REF}" >> tview_${YSEQID}.sh
chmod a+x tview_${YSEQID}.sh


if [[ $1 != "-resume" ]]
then

	# Separate chrY & mtDNA BAM files
	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_hg38_chrY.bam" $BAMFILE_SORTED chrY &
	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_rCRS_chrM.bam" $BAMFILE_SORTED chrM &
	wait
	samtools index "${YSEQID}_bwa-mem_hg38_chrY.bam" &
	samtools index "${YSEQID}_bwa-mem_rCRS_chrM.bam" &
	wait


	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam ${UPS}:/genomes/${YSEQID}/
	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam.bai ${UPS}:/genomes/${YSEQID}/

	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam ${UPS}:/genomes/${YSEQID}/
	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam.bai ${UPS}:/genomes/${YSEQID}/

	# mtDNA allele calling $ FASTA file generation
	bcftools mpileup -r chrM -Ou -C 50 -f ${REF} ${BAMFILE_SORTED} | bcftools call --threads $NUM_THREADS -O z -v -m -P 0  > chrM_${VCF_FILE}.gz
	tabix chrM_${VCF_FILE}.gz
	samtools faidx $REF chrM | bcftools consensus chrM_${VCF_FILE}.gz -o ${YSEQID}_mtDNA.fasta


	# Annotate all known Y chromosome SNPs from Ybrowse
	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz
	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz.tbi

	bcftools mpileup -r chrY -C 50 --threads $NUM_THREADS -O z -f $REF $BAMFILE_SORTED > chrY_raw_${VCF_FILE}.gz
	tabix chrY_raw_${VCF_FILE}.gz

	bcftools merge -m both -O z snps_hg38.vcf.gz chrY_raw_${VCF_FILE}.gz > chrY_merged_${VCF_FILE}.gz
	tabix chrY_merged_${VCF_FILE}.gz
	echo "merging completed"

	bcftools call -O z -m -P 0 chrY_merged_${VCF_FILE}.gz > chrY_called_${VCF_FILE}.gz
	tabix chrY_called_${VCF_FILE}.gz
	echo "calling completed"

	bcftools filter -O z -i '%ID!="."' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_cleaned_${VCF_FILE}.gz
	tabix chrY_cleaned_${VCF_FILE}.gz
	echo "cleanup completed"

	bcftools filter -O z -i '(GT=="1/1" && AA==REF) || (GT=="0/0" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_derived_${VCF_FILE}.gz &
	bcftools filter -O z -i '(GT=="0/0" && AA==REF) || (GT=="1/1" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_ancestral_${VCF_FILE}.gz &
	wait
	tabix chrY_derived_${VCF_FILE}.gz &
	tabix chrY_ancestral_${VCF_FILE}.gz &
	wait
	echo "ancestral & derived filters completed"

	# Find the Y haplogroup
	bcftools query -f '%ID,' chrY_derived_${VCF_FILE}.gz > ${YSEQID}_positives.txt &
	bcftools query -f '%ID,' chrY_ancestral_${VCF_FILE}.gz > ${YSEQID}_negatives.txt &
	wait
fi


python ${CLADEFINDER} ${YFULLTREE} ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${YSEQID}_cladeFinderOutput.csv
echo "Y haplogroup checked"
echo "The 5 best hits are:"
head -n 5 ${YSEQID}_cladeFinderOutput.csv


YFULLHG="unknown"
YFULLPATH="unknown"
OLDIFS=$IFS
IFS=$'\t'
LINECOUNTER=$((0))

[ ! -f ${YSEQID}_cladeFinderOutput.csv ] && { echo "${YSEQID}_cladeFinderOutput.csv file not found"; }
while read -r haplogroup path purity depth score conflicting_negatives
do
	LINECOUNTER=$((LINECOUNTER + 1))
	if [ $LINECOUNTER -eq $((2)) ]
	then
		YFULLHG=$haplogroup
		YFULLPATH=$path
	fi

done < ${YSEQID}_cladeFinderOutput.csv
IFS=$OLDIFS


echo "YFULLHG: ${YFULLHG}"
echo "YFULLPATH: ${YFULLPATH}"

# We're reading the first 100k sequences from the first fastq file to determine the average read length
READLENGTH=$(samtools bam2fq -@ ${NUM_THREADS} ${BAMFILE_SORTED} | head -n 100000 | awk '{if(NR%4==2) {count++; bases += length} } END{print bases/count}')
echo $READLENGTH

# Calculating the coverage from the idxstats output
COVERAGE_HG38=$(awk -v rl="$READLENGTH" '{x+=$2;m+=$3}END{print m*rl/x "x"}' < ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv)
echo $COVERAGE_HG38

cp /genomes/0/script-templates/getEquivalentAndDownstreamSNPs.py .
PHYLOEQ_SNPS_FILE=${YSEQID}_PHYLOEQ_SNPS.tsv
DOWNSTR_SNPS_FILE=${YSEQID}_DOWNSTR_SNPS.tsv
python3 getEquivalentAndDownstreamSNPs.py ${YFULLTREE} "${YFULLHG}" chrY_cleaned_${VCF_FILE}.gz ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${PHYLOEQ_SNPS_FILE} ${DOWNSTR_SNPS_FILE} ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv

cp /genomes/0/script-templates/getMTDNADifferences.py .
MTDNA_SNPS_FILE=${YSEQID}_MTDNA_SNPS.tsv
HAPLOGREP_JAR=haplogrep-2.1.25.jar
cp /genomes/0/haplogrep/${HAPLOGREP_JAR} .

MTHAPLOGROUP=$(python getMTDNADifferences.py -process chrM_${VCF_FILE}.gz ${HAPLOGREP_JAR} ${MTDNA_SNPS_FILE})


# Update summary file with Y haplogroup:

# Note: Since the YFull tree uses forward slashes for identical SNPs with different names, we replace the sed separator "/" with "|"
# This keeps us from escaping every forward slash. But don't confuse it with the pipe character in the same command!

cat ${YSEQID}_result_summary.txt | \
    sed "s|<MTHAPLOGROUP>|${MTHAPLOGROUP}|g" | \
    sed "s|<YHAPLOGROUP>|${YFULLHG}|g" | \
    sed "s|<YFULLPATH>|${YFULLPATH}|g" | \
    sed "s|<COVERAGE_HG38>|${COVERAGE_HG38}|g" | \
    sed "s|<READLENGTH>|${READLENGTH}|g" | \
    sed "s|<TERMINALSNP>|${YFULLHG}|g" >${YSEQID}_result_summary.txt.tmp


mv ${YSEQID}_result_summary.txt ${YSEQID}_result_summary.txt.backup

mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

bcftools filter -O z -i '(%ID==".") && TYPE="snp" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_novel_SNPs_${VCF_FILE}.gz
tabix chrY_novel_SNPs_${VCF_FILE}.gz
zcat chrY_novel_SNPs_${VCF_FILE}.gz | grep -v "##" >chrY_novel_SNPs_${VCF_FILE}.tsv
echo "novel SNP calling completed"

# Create Novel SNP template after checking for unreliable ranges and identity within Y Chromosome
cp /genomes/0/script-templates/identityResolutionTemplateCreator.py .
python identityResolutionTemplateCreator.py -batch chrY_novel_SNPs_${VCF_FILE}.tsv chrY_novel_SNPs_${YSEQID}_hg38.tsv ${REF}

bcftools filter -O z -i '(%ID==".") && TYPE="indel" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_INDELs_${VCF_FILE}.gz
tabix chrY_INDELs_${VCF_FILE}.gz
echo "INDEL calling completed"


# Parallel SNP calling by chromosome
bcftools mpileup -r chr1 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr1_${VCF_FILE}.gz &
bcftools mpileup -r chr2 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr2_${VCF_FILE}.gz &
bcftools mpileup -r chr3 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr3_${VCF_FILE}.gz &
bcftools mpileup -r chr4 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr4_${VCF_FILE}.gz &
bcftools mpileup -r chr5 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr5_${VCF_FILE}.gz &
bcftools mpileup -r chr6 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr6_${VCF_FILE}.gz &
bcftools mpileup -r chr7 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr7_${VCF_FILE}.gz &
bcftools mpileup -r chr8 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr8_${VCF_FILE}.gz &
bcftools mpileup -r chr9 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr9_${VCF_FILE}.gz &
bcftools mpileup -r chr10 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr10_${VCF_FILE}.gz &
bcftools mpileup -r chr11 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr11_${VCF_FILE}.gz &
bcftools mpileup -r chr12 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr12_${VCF_FILE}.gz &
bcftools mpileup -r chr13 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr13_${VCF_FILE}.gz &
bcftools mpileup -r chr14 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr14_${VCF_FILE}.gz &
bcftools mpileup -r chr15 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr15_${VCF_FILE}.gz &
bcftools mpileup -r chr16 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr16_${VCF_FILE}.gz &
bcftools mpileup -r chr17 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr17_${VCF_FILE}.gz &
bcftools mpileup -r chr18 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr18_${VCF_FILE}.gz &
bcftools mpileup -r chr19 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr19_${VCF_FILE}.gz &
bcftools mpileup -r chr20 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr20_${VCF_FILE}.gz &
bcftools mpileup -r chr21 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr21_${VCF_FILE}.gz &
bcftools mpileup -r chr22 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr22_${VCF_FILE}.gz &
bcftools mpileup -r chrX -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0  > chrX_${VCF_FILE}.gz &
bcftools mpileup -r chrY -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0  > chrY_${VCF_FILE}.gz &
wait

# Concatenate all chromosome VCFs to one big file
bcftools concat -O z chr[1-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chr[M,X-Y]_${VCF_FILE}.gz > ${VCF_FILE}.gz
tabix ${VCF_FILE}.gz

# Delete no longer needed VCFs
rm -f chr[0-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chrX_${VCF_FILE}.gz


END=$(date +%s.%N)

DIFF=$(echo "$END - $START" | bc)
echo ${DIFF}
	#!/bin/bash

	START=$(date +%s.%N)
	clear
	# setup parameters

	YSEQID=${PWD##*/}
	# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory)

	NUM_THREADS=$(getconf _NPROCESSORS_ONLN)
	echo "We can use ${NUM_THREADS} threads."

	BWA='bwa'
	if test -x "/usr/local/bin/bwa-mem2"; then
	echo "We can use bwa-mem2"
	BWA='bwa-mem2'
	fi


	UPS='upload.server.yseq.net'

	# Create a random password
	PASSWD=`date +%s \| sha256sum \| base64 \| head -c 8`
	echo "Random password created: ${PASSWD}"

	# Generate summary file
	cat /genomes/0/script-templates/x_result_summary.txt \| sed "s/<kitnumber>/${YSEQID}/g" \| sed "s/<passwd>/${PASSWD}/g" >${YSEQID}_result_summary.txt
	echo "${YSEQID}_result_summary.txt created"

	REF="/genomes/0/refseq/hg38/hg38.fa"

	READS_1="fastq/${YSEQID}_R1.fastq.gz"
	READS_2="fastq/${YSEQID}_R2.fastq.gz"
	#SAMFILE="${YSEQID}_bwa-mem_hg38.sam" # no longer needed due to piping
	BAMFILE="${YSEQID}_bwa-mem_hg38.bam"

	BAMFILE_SORTED="${YSEQID}_bwa-mem_hg38_sorted.bam"
	VCF_FILE="${YSEQID}_hg38.vcf"

	cp /genomes/0/tree/trees/yfull/latest_YFull_YTree.json .
	YFULLTREE="latest_YFull_YTree.json"

	cp /genomes/0/script-templates/cladeFinder.py .
	CLADEFINDER="cladeFinder.py"

	#ANN_VCF_FILE="${YSEQID}_hg38_dbSNP150_ann.vcf"
	#ANN_CLINVAR_VCF_FILE="${YSEQID}_hg38_clinvar_ann.vcf"

	#DBSNP="/genomes/0/refseq/hg19/All_chr_20161121.vcf.gz"

	# Read Group(s)
	# READ_GROUPS="\"@RG\tID:1\tSM:${YSEQID}\tPL:ILLUMINA\tPU:UNIT1\tLB:${YSEQID}\""

	if [[ $1 != "-resume" ]]
	then


	# Pipeline commands:
	${BWA} mem -t $NUM_THREADS -M $REF $READS_1 $READS_2 \| \
	samtools view -@ $NUM_THREADS -b -t $REF -o $BAMFILE -
	samtools sort -@ $NUM_THREADS -T /usr/local/geospiza/var/tmp/sorted -o $BAMFILE_SORTED $BAMFILE
	samtools index $BAMFILE_SORTED
	samtools idxstats $BAMFILE_SORTED > ${BAMFILE_SORTED}.idxstats.tsv
	fi

	# Update summary file with BAM filesize:
	BAM_FILESIZE=`du -kh "${BAMFILE_SORTED}" \| cut -f1`
	echo "Size of ${BAMFILE_SORTED} = ${BAM_FILESIZE}"
	cat ${YSEQID}_result_summary.txt \| sed "s/<bamsize>/${BAM_FILESIZE}byte/g" >${YSEQID}_result_summary.txt.tmp
	rm -f ${YSEQID}_result_summary.txt
	mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

	if [[ $1 != "-resume" ]]
	then

	# Delete no longer needed large files
	#rm -f $SAMFILE
	rm -f $BAMFILE # keep $BAMFILE_SORTED
	fi

	# Easy tview file
	echo "#!/bin/bash" > tview_${YSEQID}.sh
	echo "samtools tview ${BAMFILE_SORTED} ${REF}" >> tview_${YSEQID}.sh
	chmod a+x tview_${YSEQID}.sh


	if [[ $1 != "-resume" ]]
	then

	# Separate chrY & mtDNA BAM files
	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_hg38_chrY.bam" $BAMFILE_SORTED chrY &
	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_rCRS_chrM.bam" $BAMFILE_SORTED chrM &
	wait
	samtools index "${YSEQID}_bwa-mem_hg38_chrY.bam" &
	samtools index "${YSEQID}_bwa-mem_rCRS_chrM.bam" &
	wait


	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam ${UPS}:/genomes/${YSEQID}/
	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam.bai ${UPS}:/genomes/${YSEQID}/

	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam ${UPS}:/genomes/${YSEQID}/
	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam.bai ${UPS}:/genomes/${YSEQID}/

	# mtDNA allele calling $ FASTA file generation
	bcftools mpileup -r chrM -Ou -C 50 -f ${REF} ${BAMFILE_SORTED} \| bcftools call --threads $NUM_THREADS -O z -v -m -P 0 > chrM_${VCF_FILE}.gz
	tabix chrM_${VCF_FILE}.gz
	samtools faidx $REF chrM \| bcftools consensus chrM_${VCF_FILE}.gz -o ${YSEQID}_mtDNA.fasta


	# Annotate all known Y chromosome SNPs from Ybrowse
	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz
	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz.tbi

	bcftools mpileup -r chrY -C 50 --threads $NUM_THREADS -O z -f $REF $BAMFILE_SORTED > chrY_raw_${VCF_FILE}.gz
	tabix chrY_raw_${VCF_FILE}.gz

	bcftools merge -m both -O z snps_hg38.vcf.gz chrY_raw_${VCF_FILE}.gz > chrY_merged_${VCF_FILE}.gz
	tabix chrY_merged_${VCF_FILE}.gz
	echo "merging completed"

	bcftools call -O z -m -P 0 chrY_merged_${VCF_FILE}.gz > chrY_called_${VCF_FILE}.gz
	tabix chrY_called_${VCF_FILE}.gz
	echo "calling completed"

	bcftools filter -O z -i '%ID!="."' chrY_called_${VCF_FILE}.gz \| bcftools view -O z -s ^HARRY,ALIEN >chrY_cleaned_${VCF_FILE}.gz
	tabix chrY_cleaned_${VCF_FILE}.gz
	echo "cleanup completed"

	bcftools filter -O z -i '(GT=="1/1" && AA==REF) \|\| (GT=="0/0" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_derived_${VCF_FILE}.gz &
	bcftools filter -O z -i '(GT=="0/0" && AA==REF) \|\| (GT=="1/1" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_ancestral_${VCF_FILE}.gz &
	wait
	tabix chrY_derived_${VCF_FILE}.gz &
	tabix chrY_ancestral_${VCF_FILE}.gz &
	wait
	echo "ancestral & derived filters completed"

	# Find the Y haplogroup
	bcftools query -f '%ID,' chrY_derived_${VCF_FILE}.gz > ${YSEQID}_positives.txt &
	bcftools query -f '%ID,' chrY_ancestral_${VCF_FILE}.gz > ${YSEQID}_negatives.txt &
	wait
	fi


	python ${CLADEFINDER} ${YFULLTREE} ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${YSEQID}_cladeFinderOutput.csv
	echo "Y haplogroup checked"
	echo "The 5 best hits are:"
	head -n 5 ${YSEQID}_cladeFinderOutput.csv


	YFULLHG="unknown"
	YFULLPATH="unknown"
	OLDIFS=$IFS
	IFS=$'\t'
	LINECOUNTER=$((0))

	[ ! -f ${YSEQID}_cladeFinderOutput.csv ] && { echo "${YSEQID}_cladeFinderOutput.csv file not found"; }
	while read -r haplogroup path purity depth score conflicting_negatives
	do
	LINECOUNTER=$((LINECOUNTER + 1))
	if [ $LINECOUNTER -eq $((2)) ]
	then
	YFULLHG=$haplogroup
	YFULLPATH=$path
	fi

	done < ${YSEQID}_cladeFinderOutput.csv
	IFS=$OLDIFS


	echo "YFULLHG: ${YFULLHG}"
	echo "YFULLPATH: ${YFULLPATH}"

	# We're reading the first 100k sequences from the first fastq file to determine the average read length
	READLENGTH=$(samtools bam2fq -@ ${NUM_THREADS} ${BAMFILE_SORTED} \| head -n 100000 \| awk '{if(NR%4==2) {count++; bases += length} } END{print bases/count}')
	echo $READLENGTH

	# Calculating the coverage from the idxstats output
	COVERAGE_HG38=$(awk -v rl="$READLENGTH" '{x+=$2;m+=$3}END{print m*rl/x "x"}' < ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv)
	echo $COVERAGE_HG38

	cp /genomes/0/script-templates/getEquivalentAndDownstreamSNPs.py .
	PHYLOEQ_SNPS_FILE=${YSEQID}_PHYLOEQ_SNPS.tsv
	DOWNSTR_SNPS_FILE=${YSEQID}_DOWNSTR_SNPS.tsv
	python3 getEquivalentAndDownstreamSNPs.py ${YFULLTREE} "${YFULLHG}" chrY_cleaned_${VCF_FILE}.gz ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${PHYLOEQ_SNPS_FILE} ${DOWNSTR_SNPS_FILE} ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv

	cp /genomes/0/script-templates/getMTDNADifferences.py .
	MTDNA_SNPS_FILE=${YSEQID}_MTDNA_SNPS.tsv
	HAPLOGREP_JAR=haplogrep-2.1.25.jar
	cp /genomes/0/haplogrep/${HAPLOGREP_JAR} .

	MTHAPLOGROUP=$(python getMTDNADifferences.py -process chrM_${VCF_FILE}.gz ${HAPLOGREP_JAR} ${MTDNA_SNPS_FILE})


	# Update summary file with Y haplogroup:

	# Note: Since the YFull tree uses forward slashes for identical SNPs with different names, we replace the sed separator "/" with "\|"
	# This keeps us from escaping every forward slash. But don't confuse it with the pipe character in the same command!

	cat ${YSEQID}_result_summary.txt \| \
	sed "s\|<MTHAPLOGROUP>\|${MTHAPLOGROUP}\|g" \| \
	sed "s\|<YHAPLOGROUP>\|${YFULLHG}\|g" \| \
	sed "s\|<YFULLPATH>\|${YFULLPATH}\|g" \| \
	sed "s\|<COVERAGE_HG38>\|${COVERAGE_HG38}\|g" \| \
	sed "s\|<READLENGTH>\|${READLENGTH}\|g" \| \
	sed "s\|<TERMINALSNP>\|${YFULLHG}\|g" >${YSEQID}_result_summary.txt.tmp


	mv ${YSEQID}_result_summary.txt ${YSEQID}_result_summary.txt.backup

	mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

	bcftools filter -O z -i '(%ID==".") && TYPE="snp" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz \| bcftools view -O z -s ^HARRY,ALIEN >chrY_novel_SNPs_${VCF_FILE}.gz
	tabix chrY_novel_SNPs_${VCF_FILE}.gz
	zcat chrY_novel_SNPs_${VCF_FILE}.gz \| grep -v "##" >chrY_novel_SNPs_${VCF_FILE}.tsv
	echo "novel SNP calling completed"

	# Create Novel SNP template after checking for unreliable ranges and identity within Y Chromosome
	cp /genomes/0/script-templates/identityResolutionTemplateCreator.py .
	python identityResolutionTemplateCreator.py -batch chrY_novel_SNPs_${VCF_FILE}.tsv chrY_novel_SNPs_${YSEQID}_hg38.tsv ${REF}

	bcftools filter -O z -i '(%ID==".") && TYPE="indel" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz \| bcftools view -O z -s ^HARRY,ALIEN >chrY_INDELs_${VCF_FILE}.gz
	tabix chrY_INDELs_${VCF_FILE}.gz
	echo "INDEL calling completed"



	# Parallel SNP calling by chromosome
	bcftools mpileup -r chr1 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr1_${VCF_FILE}.gz &
	bcftools mpileup -r chr2 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr2_${VCF_FILE}.gz &
	bcftools mpileup -r chr3 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr3_${VCF_FILE}.gz &
	bcftools mpileup -r chr4 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr4_${VCF_FILE}.gz &
	bcftools mpileup -r chr5 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr5_${VCF_FILE}.gz &
	bcftools mpileup -r chr6 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr6_${VCF_FILE}.gz &
	bcftools mpileup -r chr7 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr7_${VCF_FILE}.gz &
	bcftools mpileup -r chr8 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr8_${VCF_FILE}.gz &
	bcftools mpileup -r chr9 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr9_${VCF_FILE}.gz &
	bcftools mpileup -r chr10 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr10_${VCF_FILE}.gz &
	bcftools mpileup -r chr11 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr11_${VCF_FILE}.gz &
	bcftools mpileup -r chr12 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr12_${VCF_FILE}.gz &
	bcftools mpileup -r chr13 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr13_${VCF_FILE}.gz &
	bcftools mpileup -r chr14 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr14_${VCF_FILE}.gz &
	bcftools mpileup -r chr15 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr15_${VCF_FILE}.gz &
	bcftools mpileup -r chr16 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr16_${VCF_FILE}.gz &
	bcftools mpileup -r chr17 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr17_${VCF_FILE}.gz &
	bcftools mpileup -r chr18 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr18_${VCF_FILE}.gz &
	bcftools mpileup -r chr19 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr19_${VCF_FILE}.gz &
	bcftools mpileup -r chr20 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr20_${VCF_FILE}.gz &
	bcftools mpileup -r chr21 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr21_${VCF_FILE}.gz &
	bcftools mpileup -r chr22 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr22_${VCF_FILE}.gz &
	bcftools mpileup -r chrX -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chrX_${VCF_FILE}.gz &
	bcftools mpileup -r chrY -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chrY_${VCF_FILE}.gz &
	wait

	# Concatenate all chromosome VCFs to one big file
	bcftools concat -O z chr[1-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chr[M,X-Y]_${VCF_FILE}.gz > ${VCF_FILE}.gz
	tabix ${VCF_FILE}.gz

	# Delete no longer needed VCFs
	rm -f chr[0-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chrX_${VCF_FILE}.gz



	END=$(date +%s.%N)

	DIFF=$(echo "$END - $START" \| bc)
	echo ${DIFF}