Skip to content

Instantly share code, notes, and snippets.

@david-a-parry
Last active June 19, 2018 12:51
Show Gist options
  • Save david-a-parry/e432bfae34ce74c46175c7ea32b31fe6 to your computer and use it in GitHub Desktop.
Save david-a-parry/e432bfae34ce74c46175c7ea32b31fe6 to your computer and use it in GitHub Desktop.
suggested edits
#!/bin/sh
#software
VCFHACKS=~/bin/vcfhacks/
# EXPRESSION=~/projects/bridge_study/vcfs/work/expression.py
CSQ_QUERY=~/projects/bridge_study/vcfs/work/csq_query.py
# dirs
WORK=~/projects/bridge_study/vcfs/work/
AITMAN=${WORK}/aitman/
CAMBRIDGE=${WORK}/cambridge/
# files
VCF_AITMAN=${AITMAN}/vep.var.eds_09-06-16.aitman.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
VCF_CAMBRIDGE=${CAMBRIDGE}/vep.var._16-06-16.cambridge.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
EDS_BED=~/projects/bridge_study/updated_reportable_genes_GRCh37.bed
HELICAL_BED=~/projects/bridge_study/helical_domains.bed
######################################################
# sort_index()
#
# Sorts and Indexes the parsed vcf files.
#
######################################################
sort_index(){
for vcf in $@; do
vcf-sort $vcf | bgzip -c > ${vcf}.gz
tabix -p vcf ${vcf}.gz
done
}
####################################################
# AIMS:
# A. get all known pathogenic variants
# A1: Variants in genes of interest.
# A2: Filter on GT.Depth, GT.GQ and QUAL fields
# A3. All known causitive variants
# A4. LoF variants
# A5. GLY-X-Y variants
# B. get all VUS variants
####################################################
for VCF in $VCF_AITMAN $VCF_CAMBRIDGE; do
# A1: Filter for variants in genes of interest only
getVariantsByLocation.pl -b $EDS_BED -i ${VCF}.vcf -o ${VCF}.eds_genes.vcf
# A2: Convert samples with GT Depth < 8 and allele balance >= 0.25
filterGts.pl -d 8 -b 0.25 -i ${VCF}.eds_genes.vcf -o ${VCF}.eds_genes.dp8.vcf
## I HAVE COMMENTED OUT THE BELOW - BECAUSE getFunctionalVariants.pl WILL ONLY KEEP ##
## VARIANTS WITH ITS DEFAULT SET OF 'FUNCTIONAL' CONSEQUENCES (MISSENSE, STOP, ##
## SPLICE ETC.). YOU DON'T WANT TO REMOVE VARIANTS WITH OTHER CONSEQUENCES BEFORE ##
## THE CLINVAR STEP AS SOME OF THE KNOWN PATHOGENIC VARIANTS MIGHT BE INTRONIC. IN ##
## FACT WE CAN DO THIS IN ONE STEP (see A3). ##
#getFunctionalVariants.pl \
# --input ${VCF}.eds_genes.gt20.vcf \
# --gq 20 \ #THIS HAS NO EFFECT UNLESS USING --samples OPTION
# --var_qual 20 \
# --clinvar disable \
# --output ${VCF}.eds_genes.gt20.gq20.dp20.vcf
# A3: ALL KNOWN CAUSATIVE VARIANTS
# PLUS LOF VARIANTS
# get Clinically Significant variants from ClinVarPathogenic and AS_CLNSIG fields
getFunctionalVariants.pl \
--input ${VCF}.eds_genes.dp8.vcf \
--clinvar all \
--classes splice_acceptor_variant splice_donor_variant frameshift_variant stop_gained \
--samples all \
--gq 20 \
--var_qual 20 \
--output ${VCF}.eds_genes.gt20.gq20.dp20.clin_sig.vcf
# A5: GLY-X-Y VARIANTS
# filter for variants in helical domains
# NOTE: getFunctionalVariants.pl --eval_filter won't work with "symbol eq 'COL1A2'"
$CSQ_QUERY \
--vcf ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
--helical_domains $HELICAL_BED \
--out ${VCF}.eds_genes.gt20.gq20.dp20.helical_domain.vcf
## THE ABOVE COULD BE DONE WITH ENSEMBL's filter_vep SCRIPT
# A6: merge all filtered VCF files together
# concatenate and sort all filtered VCF from A2-A4
DIRECTORY=`dirname $VCF`
vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf"`
sort_index $vcfs
compressed_vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf.gz" | xargs`
bcftools concat -O z -a --rm-dups -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf.gz $compressed_vcfs
#vcf-concat $compressed_vcfs > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf
#sortVcf.pl -i ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf
## remove duplicate variants
#awk -F '\t' '/^#/ {print;prev="";next;} {key=sprintf("%s\t%s\t%s",$1,$2,$3,$4,$5,$6);if(key==prev) next;print;prev=key;}' ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
# B: Identify all VUS variants
# NOT COMPLETE. Need to filter out variants present in the pathogenic vcfs using filterVcfOnVcf.pl
getFunctionalVariants.pl \
--input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
--clinvar disable \
--cadd_filter 10 \
--damaging 'sift=deleterious' \
--damaging 'polyphen=probably_damaging' \
--out ${VCF}.eds_genes.gt20.gq20.dp20.vus.vcf
done
# Merge the aitman and cambridge pathogenic vcfs
AITMAN_VCF_PATHOGENIC=${VCF_AITMAN}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
CAMBRIDGE_VCF_PATHOGENIC=${VCF_CAMBRIDGE}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
sort_index $AITMAN_VCF_PATHOGENIC $CAMBRIDGE_VCF_PATHOGENIC
vcf-merge ${AITMAN_VCF_PATHOGENIC}.gz ${CAMBRIDGE_VCF_PATHOGENIC}.gz > ${WORK}/eds_bridge.vep.var.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.combined.vcf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment