Skip to content

Instantly share code, notes, and snippets.

@zhujack
Created December 19, 2016 22:39
Show Gist options
  • Save zhujack/849b75f5a8305edaeca1001dfb9c3fe9 to your computer and use it in GitHub Desktop.
Save zhujack/849b75f5a8305edaeca1001dfb9c3fe9 to your computer and use it in GitHub Desktop.
Parse/convert a vcf file to tabular format data table (data frame) plus filtering
## Functions to expand VariantAnnotation CollapsedVCF object and convert to a data frame
## Author: Sean Davis and Jack Zhu@Meltzerlab, 12/18/2016
## Function to parse ANN column in to a dataframe
.anncols = function(anncol,headerstring) {
anncols = strsplit(sub("Functional annotations: '",'',
headerstring),' \\| ')[[1]]
dfannempty = data.frame(matrix(vector(), 0, length(anncols),
dimnames=list(c(), anncols)),
stringsAsFactors=F)
dd <- lapply(lapply(anncol,`[`,1),function(x){strsplit(x,'\\|')[[1]]})
ncls <- max(unlist(lapply(dd, length)))
yy = data.frame(suppressWarnings(do.call(rbind,
c(dfannempty[1:ncls], dd))),
stringsAsFactors=FALSE)
yy = data.frame(lapply(yy,type.convert))
colnames(yy) = paste("ANN",anncols[1:ncls],sep="..")
return(yy)
}
## Function to convert VariantAnnotation CollapsedVCF/ExpandedVCF objects to a data frame
v2df = function(x, ...) {
df = as.data.frame(rowRanges(x))
df = cbind(df, as.data.frame(info(x)))
if ( any(c('ANN', 'EFF') %in% names(info(x))) ) {
ann = c('ANN', 'EFF')[ c('ANN', 'EFF') %in% names(info(x)) ][1]
dfann = .anncols(df$ANN, info(header(x))[ann, ]$Description)
df = df[, colnames(df) != ann]
df = cbind(df, dfann)
}
n = names(geno(x))
tmp = lapply(n, function(col) {
return(as.data.frame(geno(x)[[col]]))
})
ncols = sapply(tmp, ncol)
tmp = do.call(cbind, tmp)
colnames(tmp) = paste(rep(n, times = ncols), colnames(tmp), sep =
"_")
df = cbind(df, tmp)
df[sapply(df, is.list)] <- apply(df[sapply(df, is.list)], 2,
function(x) {
unlist(lapply(x, paste,
sep=",", collapse=";"))
} )
print(dim(df))
# print(names(df))
return(df)
}
## Function to expand VariantAnnotation CollapsedVCF object and convert to a data frame
vcf2df = function(v, expand = TRUE) {
if (expand) {
message('Expanding VCF first, so number of rows may increase')
return(v2df(expand(v)))
} else {
message('No VCF Expanding')
return(v2df(v))
}
}
#!/usr/bin/env Rscript
## Author: Jack Zhu@Meltzerlab, 12/18/2016
## This script is to convert vcf to tabular table and format results
## Options:
## -f: a vcf file name, required
## -g: a text file containing gene names, optional
## -b: a bed track file containing regions of interest, optional
## -o: name of output directory
## Usage: vcf2table.R -f variant/hpc/SE0013_Saliva.hpc.annotated.vcf -g ~/bin/gene_kk4_563.txt -b /data/CCRBioinfo/public/serpentine_resources/design/Agilent_SureSelect_Killian_Version4.design.hg19.merged.bed
#####################################
suppressPackageStartupMessages(library("optparse"))
option_list <- list(
make_option(c("-f", "--vcfFile"), help="Required. "),
make_option(c("-g", "--geneListFile"), default='', help="A text file containing gene names. [default: %default]"),
make_option(c("-o", "--outDir"), default='', help="name of output directory. [default: %default]"),
make_option(c("-b", "--bedTrack"), default='', help="a bed track file containing regions of interest. [default: %default]")
)
# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
opt <- parse_args(OptionParser(option_list=option_list))
if( ! is.element('vcfFile', names(opt)) || !file.exists(opt$vcfFile) ) stop("-f is required. ")
#####################################
require(VariantAnnotation)
source('vcf2df.R')
getALT <- function (ALT_vector, n, emptyReplacement = '') {
if( length(ALT_vector) >= n & ! is.na(ALT_vector[n]) ){
return(ALT_vector[n])
} else {
return(emptyReplacement)
}
}
## some vairable
vcfFile = opt$vcfFile
if( opt$outDir != '') {
outBase <- file.path( opt$outDir, sub('.vcf', '', basename(vcfFile)) )
} else {
outBase <- sub('.vcf', '', vcfFile)
}
## read a vcf file and convert to a data frame
print(vcfFile)
v = readVcf(vcfFile, "hg19")
df = vcf2df(v)
#####################################
## format output
## replace sample names in fields
fBase <- basename( vcfFile)
nBase <- sub('\\..*$', '', fBase)
nUnderscore <- sapply(regmatches(nBase, gregexpr("_", nBase)), length)
if( nUnderscore < 2 ) {
## germline name
names(df) <- sub( paste('_', nBase, sep=''), "_Sample", names(df) )
} else {
## somatic names
nameT <- sub('\\_.*\\_', '_', nBase)
nameT1 <- sub('^.*\\_', '', nameT)
nameN <- sub( paste('\\_', nameT1, sep=''), '', nBase)
names(df) <- sub( nameN, "NORMAL", names(df) )
names(df) <- sub( nameT, "TUMOR", names(df) )
}
df$SampleName = nBase
df$SampleName1 = df$SampleName
df$SampleName2 = df$SampleName
## rename fields
old_fds <- c("seqnames", "start", "end", "width", "strand", "Cosmic..CosmicID", "dbSNP..RS")
old_fds_c <- old_fds[ old_fds %in% names(df) ]
new_fds <- c("Chr", "Start", "End", "Length", "Strand", "Cosmic_ID", "dbSNP_ID")
new_fds_c <- new_fds[ old_fds %in% names(df) ]
names(df)[ names(df) %in% old_fds_c ] <- new_fds_c
names(df) <- sub( 'ANN..', '', names(df) )
## CustomGeneList
if( is.element('Gene_Name', names(df)) && is.element('geneListFile', names(opt)) && file.exists(opt$geneListFile) ) {
geneList <- read.delim(opt$geneListFile, header=FALSE, sep='\t', as.is=TRUE )$V1
df$CustomGeneList <- sub('\\..*$', '', df$Gene_Name, perl=TRUE) %in% geneList
df$CustomGeneList <- sub('FALSE', '', df$CustomGeneList)
} else {
df$CustomGeneList = ''
}
## baitRegion
if( all(c("Chr", "Start") %in% names(df)) && is.element('bedTrack', names(opt)) && file.exists(opt$bedTrack) ) {
library(GenomicRanges)
library(rtracklayer)
baitGR <- import("/data/CCRBioinfo/public/serpentine_resources/design/Agilent_SureSelect_Killian_Version4.design.hg19.merged.bed")
vGR = GRanges(seqnames=df$Chr,ranges=IRanges(start=as.integer(df$Start), width=1))
df$baitRegion <- vGR %over% baitGR
df$baitRegion <- sub('FALSE', '', df$baitRegion)
}
## re-define values
if( is.element("dbSNP_ID", names(df)) ) {
df$dbSNP_ID[ !is.na(df$dbSNP_ID) ] = paste('rs', df$dbSNP_ID[ !is.na(df$dbSNP_ID) ], sep='')
}
if( is.element("dbNSFP_SIFT_score", names(df)) ) {
df$dbNSFP_SIFT_score <- as.numeric(df$dbNSFP_SIFT_score)
df$dbNSFP_SIFT_score[ is.na(df$dbNSFP_SIFT_score) ] = -2
}
if( is.element("dbNSFP_1000Gp1_AF", names(df)) ) {
df$dbNSFP_1000Gp1_AF <- as.numeric(df$dbNSFP_1000Gp1_AF)
df$dbNSFP_1000Gp1_AF[ is.na(df$dbNSFP_1000Gp1_AF) ] = -2
}
if( is.element("dbSNP..COMMON", names(df)) ) {
df$dbSNP..COMMON[ is.na(df$dbSNP..COMMON) ] = -2
}
if( is.element("ExAC..AF", names(df)) ) {
df$ExAC..AF <- as.numeric(df$ExAC..AF)
df$ExAC..AF[ is.na(df$ExAC..AF) ] = -2
}
if( is.element("ExAC..ESP_AF_GLOBAL", names(df)) ) {
df$ExAC..ESP_AF_GLOBAL <- as.numeric(df$ExAC..ESP_AF_GLOBAL)
df$ExAC..ESP_AF_GLOBAL[ is.na(df$ExAC..ESP_AF_GLOBAL) ] = -2
}
if( is.element("ExAC..KG_AF_GLOBAL", names(df)) ) {
df$ExAC..KG_AF_GLOBAL <- as.numeric(df$ExAC..KG_AF_GLOBAL)
df$ExAC..KG_AF_GLOBAL[ is.na(df$ExAC..KG_AF_GLOBAL) ] = -2
}
if( is.element("ESP6500..MAF", names(df)) ) {
maf_ls <- strsplit(df$ESP6500..MAF, ";")
df$ESP6500..MAF_EA <- as.numeric(lapply(maf_ls, getALT, n=1, emptyReplacement=NA))/100
df$ESP6500..MAF_AA <- as.numeric(lapply(maf_ls, getALT, n=2, emptyReplacement=NA))/100
df$ESP6500..MAF_All <- as.numeric(lapply(maf_ls, getALT, n=3, emptyReplacement=NA))/100
df$ESP6500..MAF_EA[ is.na(df$ESP6500..MAF_EA) ] <- -2
df$ESP6500..MAF_AA[ is.na(df$ESP6500..MAF_AA) ] <- -2
df$ESP6500..MAF_All[ is.na(df$ESP6500..MAF_All) ] <- -2
}
## test
# df[, c("ESP6500..MAF_EA", "ExAC..ESP_AF_GLOBAL", "ExAC..AF","dbSNP..COMMON","dbNSFP_1000Gp1_AF","dbNSFP_SIFT_score")]
if( grepl('(unifiedgenotyper|ugt)', vcfFile) ){
df$DP_ALT = df$AD_Sample.2
df$ALT_fraction <- round( df$DP_ALT/df$DP, 3)
cols_front <- c("DP", "DP_ALT", "ALT_fraction")
} else if( grepl('platypus', vcfFile) ) {
df$TR <- as.integer(sub( ',.*$', '', df$TR))
df$TC <- as.integer(sub( ',.*$', '', df$TC))
df$ALT_fraction <- round( df$TR/df$TC, 3)
cols_front <- c( "TC", "TR", "ALT_fraction")
} else if ( grepl('dellyGermline', vcfFile) ){
df$DP = df$DR_Sample + df$DV_Sample
df$ALT_fraction = df$DV_Sample/df$DP
cols_front <- c("DP", "DV_Sample", "ALT_fraction", "CIEND", "CIPOS", "PE", "MAPQ", "SR", "SRQ","CONSENSUS", "CE", "CT", "IMPRECISE", "PRECISE", "SVTYPE","SVMETHOD", "INSLEN", "HOMLEN", "RDRATIO", "AC", "AN")
} else if ( grepl('snvs', vcfFile )) {
DP_NORMAL_ALT = NULL
DP_TUMOR_ALT = NULL
for (i in 1:nrow(df)) {
DP_NORMAL_ALT = c(DP_NORMAL_ALT, df[i, paste(df$ALT[i], 'U_NORMAL.1', sep='')])
DP_TUMOR_ALT = c(DP_TUMOR_ALT, df[i, paste(df$ALT[i], 'U_TUMOR.1', sep='')])
}
df$DP_NORMAL_ALT = DP_NORMAL_ALT
df$DP_TUMOR_ALT = DP_TUMOR_ALT
df$NORMAL_ALT_fraction = df$DP_NORMAL_ALT/df$DP_NORMAL
df$TUMOR_ALT_fraction = df$DP_TUMOR_ALT/df$DP_TUMOR
cols_front <- c("DP_NORMAL", "DP_NORMAL_ALT", "NORMAL_ALT_fraction","DP_TUMOR", "DP_TUMOR_ALT", "TUMOR_ALT_fraction")
} else if ( grepl('indels', vcfFile) ) {
df$TIR_NORMAL.1_fraction <- round(df$TIR_NORMAL.1/as.integer(df$DP_NORMAL),3)
df$TIR_TUMOR.1_fraction <- round(df$TIR_TUMOR.1/as.integer(df$DP_TUMOR),3)
cols_front <- c("DP_NORMAL", "TIR_NORMAL.1", "TIR_NORMAL.1_fraction", "DP_TUMOR", "TIR_TUMOR.1", "TIR_TUMOR.1_fraction")
} else if ( grepl('mutect', vcfFile) ) {
df$DP_NORMAL = df$AD_NORMAL.1 + df$AD_NORMAL.2
df$DP_TUMOR = df$AD_TUMOR.1 + df$AD_TUMOR.2
cols_front <- c("DP_NORMAL", "AD_NORMAL.2", "AF_NORMAL","DP_TUMOR", "AD_TUMOR.2", "AF_TUMOR")
} else if ( grepl('dellySomatic', vcfFile) ){
df$DP_NORMAL = df$DR_NORMAL + df$DV_NORMAL
df$NORMAL_ALT_fraction = df$DV_NORMAL/df$DP_NORMAL
df$DP_TUMOR = df$DR_TUMOR + df$DV_TUMOR
df$TUMOR_ALT_fraction = df$DV_TUMOR/df$DP_TUMOR
cols_front <- c("DP_NORMAL", "DV_NORMAL", "NORMAL_ALT_fraction", "DP_TUMOR", "DV_TUMOR", "TUMOR_ALT_fraction", "CIEND", "CIPOS", "PE", "MAPQ", "SR", "SRQ","CONSENSUS", "CE", "CT", "IMPRECISE", "PRECISE", "SVTYPE","SVMETHOD", "INSLEN", "HOMLEN", "RDRATIO")
} else if( grepl('hpc', vcfFile) ){
df$ALT_fraction <- round( df$AD_Sample.2/df$DP_Sample, 3)
cols_front <- c("DP_Sample","AD_Sample.2", "ALT_fraction")
} else if( grepl('scalpel', vcfFile) ){
df$NORMAL_ALT_fraction <- round( df$AD_NORMAL.2/df$DP_NORMAL, 3)
df$TUMOR_ALT_fraction <- round( df$AD_TUMOR.2/df$DP_TUMOR, 3)
cols_front <- c("DP_NORMAL","AD_NORMAL.2", "NORMAL_ALT_fraction","DP_TUMOR","AD_TUMOR.2", "TUMOR_ALT_fraction")
} else {
cols_front <- c()
}
## re-order columns
cols_front_all <- c("SampleName", "Chr", "Start", "End", "CHR2", "END", "REF", "ALT", "Length", cols_front, "Gene_Name", "CustomGeneList", "baitRegion", "dbSNP_ID", "dbSNP..COMMON", "Cosmic_ID", "QUAL", "MQ", "MQ0", "QSS", "QSI", "FILTER", "ExAC..AF", "ExAC..ESP_AF_GLOBAL", "ESP6500..MAF_All", "dbNSFP_SIFT_score", "ESP6500..HGVS_CDNA_VAR", "Annotation", "Annotation_Impact", "Feature_Type", "Feature_ID", "Transcript_BioType", "Rank", "HGVS.c", "HGVS.p", "cDNA.pos / cDNA.length", "CDS.pos / CDS.length", "AA.pos / AA.length", "Distance", "dbNSFP_Polyphen2_HVAR_pred", "Cosmic..CNT", "CLNSIG", "SampleName1")
cols_front_common <- cols_front_all[cols_front_all %in% names(df)]
cols_rest <- names(df)[ ! names(df) %in% cols_front_common ]
##only "PASS"
df_ordered <-df[(df$FILTER == '.' | df$FILTER == 'PASS'), c(cols_front_common, cols_rest) ]
rm(df)
print(dim(df_ordered))
write.table(df_ordered, file=paste(outBase, "_formated.tab", sep=''), quote=FALSE, sep='\t',na='',row.names=FALSE)
##################################
## filtered by design genes and regions
if( all(c("CustomGeneList", "baitRegion") %in% names(df_ordered)) ) {
df_ordered_selected <- df_ordered[ df_ordered$CustomGeneList == "TRUE" | df_ordered$baitRegion == "TRUE", cols_front_common];
} else {
df_ordered_selected <- df_ordered[, cols_front_common];
}
# #### for testing
# df_ordered_selected = df_ordered
rm(df_ordered)
## filtered by differenct criteria
library(dplyr)
if( grepl('indels', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>%
filter(TIR_NORMAL.1 <=2 | TIR_NORMAL.1_fraction < 0.01) %>%
filter(TIR_TUMOR.1 >= 2 & TIR_TUMOR.1_fraction >= 0.05)
} else if (grepl('snvs', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>%
filter(DP_NORMAL_ALT <=2 | NORMAL_ALT_fraction < 0.01) %>%
filter(DP_TUMOR_ALT >= 2 & TUMOR_ALT_fraction >= 0.05)
} else if (grepl('mutect', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>%
filter(AD_NORMAL.2 <=2 | AF_NORMAL < 0.01) %>%
filter(AD_TUMOR.2 >= 2 & AF_Fumor >= 0.05)
} else if (grepl('(scalpel)', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>%
filter(AD_NORMAL.2 <=2 | NORMAL_ALT_fraction < 0.01) %>%
filter(AD_TUMOR.2 >= 2 & TUMOR_ALT_fraction >= 0.05)
} else if (grepl('dellySomatic', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>%
filter(DV_NORMAL <=2 | NORMAL_ALT_fraction < 0.01) %>%
filter(DV_TUMOR >= 2 & TUMOR_ALT_fraction >= 0.05)
} else if (grepl('(unifiedgenotyper|ugt)', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP >5 & DP_ALT >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE'))
} else if (grepl('(hpc)', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP_Sample >5 & AD_Sample.2 >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE'))
} else if (grepl('platypus', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(TC >5 & TR >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE'))
} else if (grepl('dellyGermline', vcfFile) ) {
df_ordered_filtered = df_ordered_selected %>%
filter(DP >5 & DV_Sample >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE'))
} else {
if( is.element("Annotation_Impact", names(df_ordered_selected)) ) {
df_ordered_filtered <- df_ordered_selected[ which(df_ordered_selected$Annotation_Impact %in% c('HIGH','MODERATE','LOW')),]
} else {
df_ordered_filtered <- df_ordered_selected
}
}
rm(df_ordered_selected)
## filtered file name
filtered_file <- paste(outBase, "_formated_filtered.tab", sep='')
## touch an enpty file
if( nrow(df_ordered_filtered) == 0 ) {
system(paste("touch ", filtered_file, sep=""))
q(save="no", status=1, runLast=FALSE)
}
write.table(df_ordered_filtered, file=filtered_file, quote=FALSE, sep='\t',na='',row.names=FALSE)
print(dim(df_ordered_filtered))
## generate filtered vcf files
if( nrow(df_ordered_filtered) > 0 ) {
filtered_for_bed <- df_ordered_filtered[,c('SampleName','Chr','Start','Start')]
vcf_filtered <- paste(outBase, "_filtered.vcf", sep='')
bedName <- paste(outBase, "_filtered.bed", sep='')
df_bed <- filtered_for_bed[, 2:4 ]
write.table(unique(df_bed), file=bedName, append = FALSE, row.names = FALSE, col.names = FALSE, quote = FALSE, na = "", sep='\t')
cmd_int <- paste("intersectBed -header -a ", vcfFile, " -b ", bedName, " > ", vcf_filtered, sep='' )
system(cmd_int)
cmd_idx <- paste("module load igvtools; igvtools index ", vcf_filtered, sep='' )
system(cmd_idx)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment