Created
April 8, 2018 19:27
-
-
Save tkrahn/283462028c61cd213399ba7f6b773893 to your computer and use it in GitHub Desktop.
Script to annotate a BigY VCF file and identify the derived and novel SNPs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
START=$(date +%s.%N) | |
clear | |
# setup parameters | |
YSEQID=${PWD##*/} | |
# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory) | |
NUM_THREADS=80 | |
REF="/genomes/0/refseq/hg38/hg38.fa" | |
VCF_FILE="${YSEQID}_hg38.vcf" | |
REPLACEMENT_HEADER="#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t${YSEQID}" | |
cat variants.vcf | sed "s/^\#CHROM.*/${REPLACEMENT_HEADER}/" > unfiltered.vcf | |
# FTDNA still leaves the non-passed variants in the file. Need to remove them | |
awk '/^#/||$7=="PASS"' unfiltered.vcf > ${VCF_FILE} | |
cat $VCF_FILE | bgzip > chrY_raw_${VCF_FILE}.gz | |
tabix chrY_raw_${VCF_FILE}.gz | |
# Annotate all known Y chromosome SNPs from Ybrowse | |
wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz | |
wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz.tbi | |
bcftools merge -m both -O z snps_hg38.vcf.gz chrY_raw_${VCF_FILE}.gz > chrY_merged_${VCF_FILE}.gz | |
tabix chrY_merged_${VCF_FILE}.gz | |
echo "merging completed" | |
#bcftools call -O z -m -P 0 chrY_merged_${VCF_FILE}.gz > chrY_called_${VCF_FILE}.gz | |
# not needed. FTDNA has already done allele calling | |
cp chrY_merged_${VCF_FILE}.gz chrY_called_${VCF_FILE}.gz | |
tabix chrY_called_${VCF_FILE}.gz | |
echo "calling completed" | |
bcftools filter -O z -i '%ID!="."' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_cleaned_${VCF_FILE}.gz | |
tabix chrY_cleaned_${VCF_FILE}.gz | |
echo "cleanup completed" | |
bcftools filter -O z -i '(GT=="1/1" && AA==REF) || (GT=="0/0" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_derived_${VCF_FILE}.gz | |
tabix chrY_derived_${VCF_FILE}.gz | |
echo "derived filter completed" | |
bcftools filter -O z -i '(%ID==".") && TYPE="snp" && QUAL>=30 && GT=="1/1"' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_novel_SNPs_${VCF_FILE}.gz | |
tabix chrY_novel_SNPs_${VCF_FILE}.gz | |
echo "novel SNP calling completed" | |
bcftools filter -O z -i '(%ID==".") && TYPE="indel" && QUAL>=30 && GT=="1/1"' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_INDELs_${VCF_FILE}.gz | |
tabix chrY_INDELs_${VCF_FILE}.gz | |
echo "INDEL calling completed" | |
END=$(date +%s.%N) | |
DIFF=$(echo "$END - $START" | bc) | |
echo ${DIFF} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You need:
Linux or Unix based OS (or emulator)
htslib
samtools
bcftools
wget
sed
awk
bgzip
tabix
an internet connection to download from YBrowse
and obviously the VCF file from a BigY.