Created
May 2, 2016 16:03
-
-
Save ckandoth/355d3297ff6417f42528b8983f4b8a94 to your computer and use it in GitHub Desktop.
Fetch VCFs generated by TCGA PancanAtlas MC3 unified variant calling effort and curate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# GOAL: Fetch VCFs generated by TCGA PancanAtlas MC3 unified variant calling effort and curate

##########
## DATA ##
##########

# Data from DNANexus/UCSC came in as 3 batches. Batches 1 and 2 had GATK cocleaned bams available,
# while batch 3 needed realignment. Batch 1 had RNA (to run radia), batch 2 didn't. Kyle Elrott
# is the point person for these batches. Available on GChat.
# Batches were later merged by Singer Ma at DNANexus, into a single folder called full_v1.
# And some Pindel/BRCA-related patches and whitelisting was done, into a folder called full_v2:
# https://tcga-data-secure.nci.nih.gov/tcgafiles/tcgajamboree/tcgajamboree/mc3/full_v2/

# Data from Broad Institute, are VCFs from MuTect and Indelocator, which should be at the TCGA DCC
# in the usual per-tumor protected folders. Gordon Saksena at BI is the point person on this:
# https://tcga-data-secure.nci.nih.gov/tcgafiles/tcgajamboree/tcgajamboree/broad_slides/

# Download and store these manifest files in subfolder "bin":
#   bin/MC3_DNAnexus_second_joblist.filter.update.manifest
#   bin/MC3_third_joblist_v2.filter.update.manifest
#   bin/mc3_joblist.filter.manifest

# Create an env variable with the username that has access to the protected TCGA Jamboree:
export DCCID=SusieQ

# Fetch the targeted region BED file that Broad limited all their calls to:
wget --user "$DCCID" --ask-password --directory-prefix bin https://tcga-data-secure.nci.nih.gov/tcgafiles/tcgajamboree/tcgajamboree/mc3/broad/gaf_20111020+broad_wex_1.1_hg19.bed

# Download VCF tarballs from the TCGA Jamboree (the brace expansion in the second
# command fetches all three Broad tarballs in one invocation):
wget --user "$DCCID" --ask-password --recursive --no-parent --no-host-directories --cut-dirs 4 --accept "*.tar.gz" --directory-prefix vcfs https://tcga-data-secure.nci.nih.gov/tcgafiles/tcgajamboree/tcgajamboree/mc3/full_v2/
wget --user "$DCCID" --ask-password --directory-prefix vcfs/broad https://tcga-data-secure.nci.nih.gov/tcgafiles/tcgajamboree/tcgajamboree/broad_slides/{broad_mc3_vcfs_b2,broad_mc3_vcfs_b2a,mc3_b2c}.tar.gz
# Untar all the DNANexus tarballs with LSF, and delete their MuTect VCFs, because we'll be
# overriding them with VCFs from Broad. Note: 'find -name' patterns are quoted so the shell
# doesn't glob-expand them against the current directory before find sees them:
ls vcfs/full_v2/*.tar.gz | perl -ne 'chomp; print "bsub -oo /dev/null -R rusage[iounits=2] -We 0:59 tar -C vcfs -zxf $_\n"' | bash
find vcfs -name '*mutect.tcga_filtered.reheadered.vcf' | xargs rm -f

# Untar the Broad tarballs:
ls vcfs/broad/*.tar.gz | perl -ne 'chomp; print "bsub -oo /dev/null -R rusage[iounits=2] -We 0:59 tar -C vcfs -zxf $_\n"' | bash

# Download a mapping of TCGA barcodes to live UUIDs on CGHub:
curl -L https://cghub.ucsc.edu/reports/SUMMARY_STATS/LATEST_MANIFEST.tsv | cut -f2,3,7-9,17-19,24-26 | grep -E "^barcode|^TCGA" | grep -wE "state$|Live$" > bin/tcga_barcodes_and_cghub_ids.tsv

# For each MuTect + Indelocator VCF from Broad, find the appropriate DNAnexus subfolder to move it into, and move it down there:
find vcfs/{mc3_b2,mc3_b2a,mc3_b2c} -type f -name '*.vcf' | perl -ne 'chomp; $if="bin/tcga_barcodes_and_cghub_ids.tsv"; ($p,$ti,$ni)=m/(TCGA-\w\w-\w{4})_(\w{8}-\w{4}-\w{4}-\w{4}-\w{12})_(\w{8}-\w{4}-\w{4}-\w{4}-\w{12})/; ($tb,$pi)=map{chomp; split("\t")}`grep -w $ti $if | cut -f1,8`; ($nb)=map{chomp; $_}`grep -w $ni $if | cut -f1`; $tb=~s/TCGA-\w\w-\w{4}-(\d\d).*/$1/; $nb=~s/TCGA-\w\w-\w{4}-(\d\d).*/$1/; ($dn)=glob("vcfs/*/$pi\_$tb\_$nb"); print "mv $_ $dn\n" if(-d $dn)' | bash
# ::TODO:: Duplicate UUIDs per barcode create a few duplicate MuTect/Indelocator VCFs per TN-pair. Probably best to just merge+deduplicate those calls to take advantage of the extra coverage.

# Rename each Radia VCF with the same unique prefix used by the Muse VCF adjacent to it:
find vcfs -name filtered.radia.tcga_filtered.reheadered.vcf | perl -ne 'chomp; ($d)=m/(.*)\//; ($p)=map{chomp; s/.*\/([^\/]+).muse.*/$1/; $_}glob("$d/*muse*.vcf"); print "mv $_ $d/$p.radia.tcga_filtered.reheadered.vcf\n"' | bash

# Pull out the paired sample IDs, and TCGA barcodes, from the headers of each VCF, so we can use it later with vcf2maf/vcf2vcf:
find vcfs -name '*.vcf' | perl -ne 'chomp; %tn=map{m/ID=([^,]+).*SampleTCGABarcode=([^,]+),/}`grep ^##SAMPLE $_`; @s=grep{!/NORMAL/}keys %tn; print join("\t",map{($_,$tn{$_})}(@s,"NORMAL"),$_)."\n"' > bin/tn_pairs.tsv
##########
## TEST ##
##########

# Here we'll test a random bunch of VCFs for problems/caveats before we apply it to everything.

# Make copies of VCFs from 100 random TN-pairs into a subfolder named vcfs_tmp:
mkdir -p vcfs_tmp
find vcfs -mindepth 2 -type d | shuf | head -n100 | perl -ne 'chomp; print "cp $_/*.vcf vcfs_tmp\n"' | bash

# Rename them to something a bit more intuitive (caller name becomes the filename suffix):
ls vcfs_tmp/*.vcf | perl -ne 'chomp; $f=$_; s/([^\/]+).(SomaticSniper|mutect|varscan.snp|varscan.indel|pindel|muse|radia|oxoG.snp|indel.capture).*$/$1\_$2.vcf/; s/SomaticSniper/somatic.sniper/; s/oxoG.snp/mutect/; s/indel.capture/indelocator/; print "mv $f $_\n"' | bash

# Pull out the TN-paired TCGA barcodes for each VCF (pattern quoted to avoid shell globbing):
find vcfs_tmp -name '*.vcf' | perl -ne 'chomp; %tn=map{m/ID=([^,]+).*SampleTCGABarcode=([^,]+),/}`grep ^##SAMPLE $_`; @s=grep{!/NORMAL/}keys %tn; print join("\t",map{($_,$tn{$_})}(@s,"NORMAL"),$_)."\n"' > tmp_tn_pairs.tsv

# Make the following fixes to the VCFs or else they break the htslib VCF parser:
# - Remove whitespace between comma-delimited values in vcfProcessLog header lines
# - Set Number=G for the PL tag in Pindel FORMAT header lines
find vcfs_tmp -name '*.vcf' | xargs perl -i -pe 's/>,\s+/>,/g if(m/^##vcfProcessLog/); s/^(##FORMAT=<ID=PL,Number)=3/$1=G/'

# For Muse calls, set FILTER=PASS for Tiers 1 through 4, per instructions from the author:
find vcfs_tmp -name '*muse.vcf' | xargs perl -i -pe 's/\tTier[1234]\t/\tPASS\t/ unless(m/^#/)'
# A ton of Radia calls have a 100% VAF, with ExAC tagging most of them as germline. These calls have SS=1 (i.e. germline) but FILTER=PASS
# On closer look, it's limited to a handful of TN-pairs that did not have matched RNA-seq. Here are some examples:
#   TCGA-17-Z025-01A-01W-0746-08 TCGA-17-Z025-11A-01W-0746-08 vcfs/LUAD/abb86d09-d4ff-41f4-ba22-185dc48347c9_01_11/filtered.radia.tcga_filtered.reheadered.vcf
#   TCGA-AB-2816-03B-01W-0728-08 TCGA-AB-2816-11B-01W-0728-08 vcfs/LAML/61db06eb-299a-48b3-8c71-4bea78169b4c_03_11/filtered.radia.tcga_filtered.reheadered.vcf
#   TCGA-AB-2871-03A-01W-0732-08 TCGA-AB-2871-11A-01W-0732-08 vcfs/LAML/b4bc4666-13ba-4864-be9b-89ff8a0a1bd8_03_11/filtered.radia.tcga_filtered.reheadered.vcf

# For Radia VCFs, remove lines with INFO/SS=1:
find vcfs_tmp -name '*radia.vcf' | xargs perl -i -a -F'\t' -pe '$_="" if(!m/^#/ and $F[7]=~m/SS=1/)'

# Also noted that some Radia VCFs don't have INFO/SS defined in the header, which breaks htslib. Easiest to just delete that tag from each row:
find vcfs_tmp -name '*radia.vcf' | xargs perl -i -a -F'\t' -pe '$F[7]=~s/SS=\d;?// if(!m/^#/); $_=join("\t",@F)'

# Fix chrom names in the MuTect VCFs to match names in the GRCh37 fasta:
find vcfs_tmp -name '*mutect.vcf' | xargs perl -i -pe 'unless(m/^#/){ s/^<(\S+)>/$1/; s/^M\t/MT\t/ }'

# Bgzip and tabix index the VCFs:
ls vcfs_tmp/*.vcf | perl -ne 'chomp; print "bgzip $_; tabix -p vcf $_.gz\n"' | bash

# Remove all non-PASSed calls, wipe clean the INFO columns, and normalize indels:
ls vcfs_tmp/*.vcf.gz | perl -ne 'chomp; s/.gz$//; print "bcftools view --apply-filters PASS $_.gz | bcftools annotate --remove INFO | bcftools norm --check-ref w --fasta-ref /opt/common/CentOS_6-dev/vep/v84/homo_sapiens/84_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz --multiallelics +any --output $_\n"' | bash

# Remove the old VCFs, and bgzip/tabix the new ones:
ls vcfs_tmp/*.vcf | perl -ne 'chomp; print "rm -f $_.gz $_.gz.tbi; bgzip $_; tabix -p vcf $_.gz\n"' | bash

# Add FILTERs, minor allele counts, and population freqs from the ExAC VCF:
ls vcfs_tmp/*.vcf.gz | perl -ne 'chomp; s/.gz$//; print "bsub -oo /dev/null -We 0:59 bcftools annotate --annotations /ifs/res/pwg/data/exac/ExAC.r0.3.sites.minus_somatic.vcf.gz --columns FILTER,INFO/AC,INFO/AN,INFO/AF --output $_ $_.gz\n"' | bash

# Remove the old VCFs:
ls vcfs_tmp/*.vcf.gz | perl -ne 'chomp; print "rm -f $_ $_.tbi\n"' | bash

# Run vcf2maf on the remaining calls (tmp_tn_pairs.tsv columns: vcf-tumor-id, tumor barcode, vcf-normal-id, normal barcode, vcf path):
cat tmp_tn_pairs.tsv | perl -ne 'chomp; ($vt,$t,$vn,$n,$f)=split("\t"); $f=~s/.vcf$//; print "bsub -n2 -oo $f.vep.log -We 0:59 /opt/common/CentOS_6-dev/perl/perl-5.22.0/bin/perl /opt/common/CentOS_6-dev/vcf2maf/develop/vcf2maf.pl --input-vcf $f.vcf --output-maf $f.vep.maf --tumor-id $t --normal-id $n --vcf-tumor-id $vt --vcf-normal-id $vn --vep-path /opt/common/CentOS_6-dev/vep/v84 --vep-data /opt/common/CentOS_6-dev/vep/v84 --ref-fasta /opt/common/CentOS_6-dev/vep/v84/homo_sapiens/84_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz\n"' | bash
##########
## PLOT ##
##########

# Make a table that we can load into R for some pwetty plots (header row first, then data rows):
grep ^Hugo vcfs_tmp/*.maf | perl -a -F'\t' -ne '$F[0]=~s/.*_([^_]+).vep.maf:(\S*)/Caller\t$2/; print join("\t",@F[0,4,5,8,9,15,36,39..44,50,56,63,85,99,108])' | head -1 > plot_me.tsv
grep -vE "^#|^Hugo" vcfs_tmp/*.maf | perl -a -F'\t' -ne '$F[0]=~s/.*_([^_]+).vep.maf:(\S*)/$1\t$2/; print join("\t",@F[0,4,5,8,9,15,36,39..44,50,56,63,85,99,108])' >> plot_me.tsv

# Remove calls in Flanks, Introns, Untranslated Regions, and Intergenic Regions:
perl -i -a -F'\t' -pe '$_="" if($F[4]=~m/(Flank|Intron|UTR|IGR)/)' plot_me.tsv

# Remove Pindel calls with t_alt_count < 3:
perl -i -a -F'\t' -pe '$_="" if($F[0] eq "pindel" and $F[10] < 3 )' plot_me.tsv

# Remove VarScan/Sniper calls with t_alt_count < 3 or n_alt_count > 2 or t_depth < 14 or n_depth < 8:
perl -i -a -F'\t' -pe '$_="" if($F[0] =~ /varscan|sniper/ and ( $F[10] < 3 or $F[13] > 2 or $F[8] < 14 or $F[11] < 8 ))' plot_me.tsv

# Generalize the FILTER reasons:
perl -i -a -F'\t' -pe 'chomp($F[-1]); $F[19]="ArtifactPerExac" if($F[19]!~m/^(FILTER|PASS|common_variant)$/); $F[19]=~s/^common_variant$/GermlinePerExac/; $_=join("\t",@F)."\n"' plot_me.tsv

# Fill empty cells with NAs:
perl -i -a -F'\t' -pe 'chomp($F[-1]); @F=map{s/^(\s*|\.)$/NA/; $_} @F; $_=join("\t",@F)."\n"' plot_me.tsv
# In R, do the following to make a VAF histogram per caller:
library( ggplot2 );
library( cowplot );
library( data.table );
maf = fread( "plot_me.tsv", showProgress = F );
# Tumor VAF = fraction of tumor reads supporting the variant allele:
maf$t_vaf = maf$t_alt_count / maf$t_depth;
# One histogram per caller, colored by FILTER; SNV callers share a y-limit of 1800, indel callers 1000:
plot.mut = ggplot( maf[Caller=="mutect",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="MuTect", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1800)) + scale_x_continuous(limits = c(0,1));
plot.mus = ggplot( maf[Caller=="muse",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="Muse", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1800)) + scale_x_continuous(limits = c(0,1));
plot.rad = ggplot( maf[Caller=="radia",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="Radia", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1800)) + scale_x_continuous(limits = c(0,1));
plot.ssn = ggplot( maf[Caller=="somatic.sniper",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="SomaticSniper", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1800)) + scale_x_continuous(limits = c(0,1));
plot.vss = ggplot( maf[Caller=="varscan.snp",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="VarScan SNVs", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1800)) + scale_x_continuous(limits = c(0,1));
plot.vsi = ggplot( maf[Caller=="varscan.indel",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="VarScan Indels", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1000)) + scale_x_continuous(limits = c(0,1));
plot.ind = ggplot( maf[Caller=="indelocator",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="Indelocator", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1000)) + scale_x_continuous(limits = c(0,1));
plot.pin = ggplot( maf[Caller=="pindel",], aes( t_vaf, fill = FILTER )) + geom_histogram( binwidth = 0.01 ) + labs( title="Pindel", x="Variant Allele Fraction" ) + scale_y_continuous(limits = c(0,1000)) + scale_x_continuous(limits = c(0,1));
plot.all = plot_grid( plot.mut, plot.mus, plot.rad, plot.ssn, plot.vss, plot.vsi, plot.ind, plot.pin, ncol = 2, nrow = 4);
save_plot("tcga_mc3_vafs.pdf", plot.all, ncol = 2, nrow = 4, base_aspect_ratio = 1.5 );
# Count the number of tossed calls per caller if we apply various filters:
for( caller in unique(maf$Caller)) {
    filtered = nrow( maf[Caller==caller & (t_alt_count<3 | t_depth<14 | n_depth<8 | n_alt_count>2),] );
    total = nrow( maf[Caller==caller,] );
    cat( caller, " ", filtered, "/", total, " (", formatC( filtered/total*100, digits=1, format="f" ), "%)", "\n", sep="" );
}
###########
## FIXES ##
###########

# Based on findings from the tests/plots above, make some fixes to all the VCFs:
# ::NOTE:: Wrote vcf2vcf to parse these nutty VCF formats and standardize GT:AD:DP in each. It will
# also reduce multiallelic sites to just the 1 ALT allele specified in the GT field. It also fixes
# common problems with TCGA VCFs, so they play nice with VT and bcftools.

# Use vcf2vcf and --add-filters appropriate for each caller.
# Add these tags to both VarScan and SomaticSniper VCFs. These are already implemented in "vcf2vcf --add-filter":
#   LowTotalDepth - Less than 14 total reads in tumor, or less than 8 reads in normal
#   LowTumorSupport - Less than 3 allele supporting read(s) in tumor
#   HighNormalSupport - More than 2 allele supporting read(s) in the normal
# For VarScan VCFs, create these additional tags that work well in combination with LowTotalDepth above:
#   VarScanLowConf - Fisher's exact test p-value > 0.01
#   VarScanLowVAF - Tumor VAF <8%

# ::TODO::
# Decompose multiallelic sites into separate lines - http://genome.sph.umich.edu/wiki/Vt#Decompose
# Normalize variants into their left-aligned minimalist forms - http://genome.sph.umich.edu/wiki/Vt#Normalization
# Decompose MNPs and complex substitutions into separate events, while retaining the original calls in INFO for later consolidation - http://genome.sph.umich.edu/wiki/Vt#Decompose_biallelic_block_substitutions
# For each TN-pair, merge VCFs from different callers, while retaining callers in Genotype columns - https://samtools.github.io/bcftools/bcftools.html#merge
# Deduplicate variants with the same POS/REF/ALT - http://genome.sph.umich.edu/wiki/Vt#Drop_duplicate_variants
# For each MNP/complex substitution that was split up in step 2, find all overlapping events and remove all but the largest one - Needs some custom code
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment