Last active
June 19, 2018 12:51
-
-
Save david-a-parry/e432bfae34ce74c46175c7ea32b31fe6 to your computer and use it in GitHub Desktop.
suggested edits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
#software | |
VCFHACKS=~/bin/vcfhacks/ | |
# EXPRESSION=~/projects/bridge_study/vcfs/work/expression.py | |
CSQ_QUERY=~/projects/bridge_study/vcfs/work/csq_query.py | |
# dirs | |
WORK=~/projects/bridge_study/vcfs/work/ | |
AITMAN=${WORK}/aitman/ | |
CAMBRIDGE=${WORK}/cambridge/ | |
# files | |
VCF_AITMAN=${AITMAN}/vep.var.eds_09-06-16.aitman.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored | |
VCF_CAMBRIDGE=${CAMBRIDGE}/vep.var._16-06-16.cambridge.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored | |
EDS_BED=~/projects/bridge_study/updated_reportable_genes_GRCh37.bed | |
HELICAL_BED=~/projects/bridge_study/helical_domains.bed | |
###################################################### | |
# sort_index() | |
# | |
# Sorts and Indexes the parsed vcf files. | |
# | |
###################################################### | |
sort_index(){ | |
for vcf in $@; do | |
vcf-sort $vcf | bgzip -c > ${vcf}.gz | |
tabix -p vcf ${vcf}.gz | |
done | |
} | |
#################################################### | |
# AIMS: | |
# A. get all known pathogenic variants | |
# A1: Variants in genes of interest. | |
# A2: Filter on GT.Depth, GT.GQ and QUAL fields | |
# A3. All known causitive variants | |
# A4. LoF variants | |
# A5. GLY-X-Y variants | |
# B. get all VUS variants | |
#################################################### | |
for VCF in $VCF_AITMAN $VCF_CAMBRIDGE; do | |
# A1: Filter for variants in genes of interest only | |
getVariantsByLocation.pl -b $EDS_BED -i ${VCF}.vcf -o ${VCF}.eds_genes.vcf | |
# A2: Convert samples with GT Depth < 8 and allele balance >= 0.25 | |
filterGts.pl -d 8 -b 0.25 -i ${VCF}.eds_genes.vcf -o ${VCF}.eds_genes.dp8.vcf | |
## I HAVE COMMENTED OUT THE BELOW - BECAUSE getFunctionalVariants.pl WILL ONLY KEEP ## | |
## VARIANTS WITH ITS DEFAULT SET OF 'FUNCTIONAL' CONSEQUENCES (MISSENSE, STOP, ## | |
## SPLICE ETC.). YOU DON'T WANT TO REMOVE VARIANTS WITH OTHER CONSEQUENCES BEFORE ## | |
## THE CLINVAR STEP AS SOME OF THE KNOWN PATHOGENIC VARIANTS MIGHT BE INTRONIC. IN ## | |
## FACT WE CAN DO THIS IN ONE STEP (see A3). ## | |
#getFunctionalVariants.pl \ | |
# --input ${VCF}.eds_genes.gt20.vcf \ | |
# --gq 20 \ #THIS HAS NO EFFECT UNLESS USING --samples OPTION | |
# --var_qual 20 \ | |
# --clinvar disable \ | |
# --output ${VCF}.eds_genes.gt20.gq20.dp20.vcf | |
# A3: ALL KNOWN CAUSATIVE VARIANTS | |
# PLUS LOF VARIANTS | |
# get Clinically Significant variants from ClinVarPathogenic and AS_CLNSIG fields | |
getFunctionalVariants.pl \ | |
--input ${VCF}.eds_genes.dp8.vcf \ | |
--clinvar all \ | |
--classes splice_acceptor_variant splice_donor_variant frameshift_variant stop_gained \ | |
--samples all \ | |
--gq 20 \ | |
--var_qual 20 \ | |
--output ${VCF}.eds_genes.gt20.gq20.dp20.clin_sig.vcf | |
# A5: GLY-X-Y VARIANTS | |
# filter for variants in helical domains | |
# NOTE: getFunctionalVariants.pl --eval_filter won't work with "symbol eq 'COL1A2'" | |
$CSQ_QUERY \ | |
--vcf ${VCF}.eds_genes.gt20.gq20.dp20.vcf \ | |
--helical_domains $HELICAL_BED \ | |
--out ${VCF}.eds_genes.gt20.gq20.dp20.helical_domain.vcf | |
## THE ABOVE COULD BE DONE WITH ENSEMBL's filter_vep SCRIPT | |
# A6: merge all filtered VCF files together | |
# concatenate and sort all filtered VCF from A2-A4 | |
DIRECTORY=`dirname $VCF` | |
vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf"` | |
sort_index $vcfs | |
compressed_vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf.gz" | xargs` | |
bcftools concat -O z -a --rm-dups -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf.gz $compressed_vcfs | |
#vcf-concat $compressed_vcfs > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf | |
#sortVcf.pl -i ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf | |
## remove duplicate variants | |
#awk -F '\t' '/^#/ {print;prev="";next;} {key=sprintf("%s\t%s\t%s",$1,$2,$3,$4,$5,$6);if(key==prev) next;print;prev=key;}' ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf | |
# B: Identify all VUS variants | |
# NOT COMPLETE. Need to filter out variants present in the pathogenic vcfs using filterVcfOnVcf.pl | |
getFunctionalVariants.pl \ | |
--input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \ | |
--clinvar disable \ | |
--cadd_filter 10 \ | |
--damaging 'sift=deleterious' \ | |
--damaging 'polyphen=probably_damaging' \ | |
--out ${VCF}.eds_genes.gt20.gq20.dp20.vus.vcf | |
done | |
# Merge the aitman and cambridge pathogenic vcfs | |
AITMAN_VCF_PATHOGENIC=${VCF_AITMAN}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf | |
CAMBRIDGE_VCF_PATHOGENIC=${VCF_CAMBRIDGE}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf | |
sort_index $AITMAN_VCF_PATHOGENIC $CAMBRIDGE_VCF_PATHOGENIC | |
vcf-merge ${AITMAN_VCF_PATHOGENIC}.gz ${CAMBRIDGE_VCF_PATHOGENIC}.gz > ${WORK}/eds_bridge.vep.var.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.combined.vcf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment