Created
December 19, 2016 22:39
-
-
Save zhujack/849b75f5a8305edaeca1001dfb9c3fe9 to your computer and use it in GitHub Desktop.
Parse/convert a vcf file to tabular format data table (data frame) plus filtering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Functions to expand VariantAnnotation CollapsedVCF object and convert to a data frame | |
## Author: Sean Davis and Jack Zhu@Meltzerlab, 12/18/2016 | |
## Function to parse ANN column in to a dataframe | |
.anncols = function(anncol,headerstring) { | |
anncols = strsplit(sub("Functional annotations: '",'', | |
headerstring),' \\| ')[[1]] | |
dfannempty = data.frame(matrix(vector(), 0, length(anncols), | |
dimnames=list(c(), anncols)), | |
stringsAsFactors=F) | |
dd <- lapply(lapply(anncol,`[`,1),function(x){strsplit(x,'\\|')[[1]]}) | |
ncls <- max(unlist(lapply(dd, length))) | |
yy = data.frame(suppressWarnings(do.call(rbind, | |
c(dfannempty[1:ncls], dd))), | |
stringsAsFactors=FALSE) | |
yy = data.frame(lapply(yy,type.convert)) | |
colnames(yy) = paste("ANN",anncols[1:ncls],sep="..") | |
return(yy) | |
} | |
## Function to convert VariantAnnotation CollapsedVCF/ExpandedVCF objects to a data frame | |
v2df = function(x, ...) { | |
df = as.data.frame(rowRanges(x)) | |
df = cbind(df, as.data.frame(info(x))) | |
if ( any(c('ANN', 'EFF') %in% names(info(x))) ) { | |
ann = c('ANN', 'EFF')[ c('ANN', 'EFF') %in% names(info(x)) ][1] | |
dfann = .anncols(df$ANN, info(header(x))[ann, ]$Description) | |
df = df[, colnames(df) != ann] | |
df = cbind(df, dfann) | |
} | |
n = names(geno(x)) | |
tmp = lapply(n, function(col) { | |
return(as.data.frame(geno(x)[[col]])) | |
}) | |
ncols = sapply(tmp, ncol) | |
tmp = do.call(cbind, tmp) | |
colnames(tmp) = paste(rep(n, times = ncols), colnames(tmp), sep = | |
"_") | |
df = cbind(df, tmp) | |
df[sapply(df, is.list)] <- apply(df[sapply(df, is.list)], 2, | |
function(x) { | |
unlist(lapply(x, paste, | |
sep=",", collapse=";")) | |
} ) | |
print(dim(df)) | |
# print(names(df)) | |
return(df) | |
} | |
## Function to expand VariantAnnotation CollapsedVCF object and convert to a data frame | |
vcf2df = function(v, expand = TRUE) { | |
if (expand) { | |
message('Expanding VCF first, so number of rows may increase') | |
return(v2df(expand(v))) | |
} else { | |
message('No VCF Expanding') | |
return(v2df(v)) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
## Author: Jack Zhu@Meltzerlab, 12/18/2016 | |
## This script is to convert vcf to tabular table and format results | |
## Options: | |
## -f: a vcf file name, required | |
## -g: a text file containing gene names, optional | |
## -b: a bed track file containing regions of interest, optional | |
## -o: name of output directory | |
## Usage: vcf2table.R -f variant/hpc/SE0013_Saliva.hpc.annotated.vcf -g ~/bin/gene_kk4_563.txt -b /data/CCRBioinfo/public/serpentine_resources/design/Agilent_SureSelect_Killian_Version4.design.hg19.merged.bed | |
##################################### | |
suppressPackageStartupMessages(library("optparse")) | |
option_list <- list( | |
make_option(c("-f", "--vcfFile"), help="Required. "), | |
make_option(c("-g", "--geneListFile"), default='', help="A text file containing gene names. [default: %default]"), | |
make_option(c("-o", "--outDir"), default='', help="name of output directory. [default: %default]"), | |
make_option(c("-b", "--bedTrack"), default='', help="a bed track file containing regions of interest. [default: %default]") | |
) | |
# get command line options, if help option encountered print help and exit, | |
# otherwise if options not found on command line then set defaults, | |
opt <- parse_args(OptionParser(option_list=option_list)) | |
if( ! is.element('vcfFile', names(opt)) || !file.exists(opt$vcfFile) ) stop("-f is required. ") | |
##################################### | |
require(VariantAnnotation) | |
source('vcf2df.R') | |
getALT <- function (ALT_vector, n, emptyReplacement = '') { | |
if( length(ALT_vector) >= n & ! is.na(ALT_vector[n]) ){ | |
return(ALT_vector[n]) | |
} else { | |
return(emptyReplacement) | |
} | |
} | |
## some vairable | |
vcfFile = opt$vcfFile | |
if( opt$outDir != '') { | |
outBase <- file.path( opt$outDir, sub('.vcf', '', basename(vcfFile)) ) | |
} else { | |
outBase <- sub('.vcf', '', vcfFile) | |
} | |
## read a vcf file and convert to a data frame | |
print(vcfFile) | |
v = readVcf(vcfFile, "hg19") | |
df = vcf2df(v) | |
##################################### | |
## format output | |
## replace sample names in fields | |
fBase <- basename( vcfFile) | |
nBase <- sub('\\..*$', '', fBase) | |
nUnderscore <- sapply(regmatches(nBase, gregexpr("_", nBase)), length) | |
if( nUnderscore < 2 ) { | |
## germline name | |
names(df) <- sub( paste('_', nBase, sep=''), "_Sample", names(df) ) | |
} else { | |
## somatic names | |
nameT <- sub('\\_.*\\_', '_', nBase) | |
nameT1 <- sub('^.*\\_', '', nameT) | |
nameN <- sub( paste('\\_', nameT1, sep=''), '', nBase) | |
names(df) <- sub( nameN, "NORMAL", names(df) ) | |
names(df) <- sub( nameT, "TUMOR", names(df) ) | |
} | |
df$SampleName = nBase | |
df$SampleName1 = df$SampleName | |
df$SampleName2 = df$SampleName | |
## rename fields | |
old_fds <- c("seqnames", "start", "end", "width", "strand", "Cosmic..CosmicID", "dbSNP..RS") | |
old_fds_c <- old_fds[ old_fds %in% names(df) ] | |
new_fds <- c("Chr", "Start", "End", "Length", "Strand", "Cosmic_ID", "dbSNP_ID") | |
new_fds_c <- new_fds[ old_fds %in% names(df) ] | |
names(df)[ names(df) %in% old_fds_c ] <- new_fds_c | |
names(df) <- sub( 'ANN..', '', names(df) ) | |
## CustomGeneList | |
if( is.element('Gene_Name', names(df)) && is.element('geneListFile', names(opt)) && file.exists(opt$geneListFile) ) { | |
geneList <- read.delim(opt$geneListFile, header=FALSE, sep='\t', as.is=TRUE )$V1 | |
df$CustomGeneList <- sub('\\..*$', '', df$Gene_Name, perl=TRUE) %in% geneList | |
df$CustomGeneList <- sub('FALSE', '', df$CustomGeneList) | |
} else { | |
df$CustomGeneList = '' | |
} | |
## baitRegion | |
if( all(c("Chr", "Start") %in% names(df)) && is.element('bedTrack', names(opt)) && file.exists(opt$bedTrack) ) { | |
library(GenomicRanges) | |
library(rtracklayer) | |
baitGR <- import("/data/CCRBioinfo/public/serpentine_resources/design/Agilent_SureSelect_Killian_Version4.design.hg19.merged.bed") | |
vGR = GRanges(seqnames=df$Chr,ranges=IRanges(start=as.integer(df$Start), width=1)) | |
df$baitRegion <- vGR %over% baitGR | |
df$baitRegion <- sub('FALSE', '', df$baitRegion) | |
} | |
## re-define values | |
if( is.element("dbSNP_ID", names(df)) ) { | |
df$dbSNP_ID[ !is.na(df$dbSNP_ID) ] = paste('rs', df$dbSNP_ID[ !is.na(df$dbSNP_ID) ], sep='') | |
} | |
if( is.element("dbNSFP_SIFT_score", names(df)) ) { | |
df$dbNSFP_SIFT_score <- as.numeric(df$dbNSFP_SIFT_score) | |
df$dbNSFP_SIFT_score[ is.na(df$dbNSFP_SIFT_score) ] = -2 | |
} | |
if( is.element("dbNSFP_1000Gp1_AF", names(df)) ) { | |
df$dbNSFP_1000Gp1_AF <- as.numeric(df$dbNSFP_1000Gp1_AF) | |
df$dbNSFP_1000Gp1_AF[ is.na(df$dbNSFP_1000Gp1_AF) ] = -2 | |
} | |
if( is.element("dbSNP..COMMON", names(df)) ) { | |
df$dbSNP..COMMON[ is.na(df$dbSNP..COMMON) ] = -2 | |
} | |
if( is.element("ExAC..AF", names(df)) ) { | |
df$ExAC..AF <- as.numeric(df$ExAC..AF) | |
df$ExAC..AF[ is.na(df$ExAC..AF) ] = -2 | |
} | |
if( is.element("ExAC..ESP_AF_GLOBAL", names(df)) ) { | |
df$ExAC..ESP_AF_GLOBAL <- as.numeric(df$ExAC..ESP_AF_GLOBAL) | |
df$ExAC..ESP_AF_GLOBAL[ is.na(df$ExAC..ESP_AF_GLOBAL) ] = -2 | |
} | |
if( is.element("ExAC..KG_AF_GLOBAL", names(df)) ) { | |
df$ExAC..KG_AF_GLOBAL <- as.numeric(df$ExAC..KG_AF_GLOBAL) | |
df$ExAC..KG_AF_GLOBAL[ is.na(df$ExAC..KG_AF_GLOBAL) ] = -2 | |
} | |
if( is.element("ESP6500..MAF", names(df)) ) { | |
maf_ls <- strsplit(df$ESP6500..MAF, ";") | |
df$ESP6500..MAF_EA <- as.numeric(lapply(maf_ls, getALT, n=1, emptyReplacement=NA))/100 | |
df$ESP6500..MAF_AA <- as.numeric(lapply(maf_ls, getALT, n=2, emptyReplacement=NA))/100 | |
df$ESP6500..MAF_All <- as.numeric(lapply(maf_ls, getALT, n=3, emptyReplacement=NA))/100 | |
df$ESP6500..MAF_EA[ is.na(df$ESP6500..MAF_EA) ] <- -2 | |
df$ESP6500..MAF_AA[ is.na(df$ESP6500..MAF_AA) ] <- -2 | |
df$ESP6500..MAF_All[ is.na(df$ESP6500..MAF_All) ] <- -2 | |
} | |
## test | |
# df[, c("ESP6500..MAF_EA", "ExAC..ESP_AF_GLOBAL", "ExAC..AF","dbSNP..COMMON","dbNSFP_1000Gp1_AF","dbNSFP_SIFT_score")] | |
if( grepl('(unifiedgenotyper|ugt)', vcfFile) ){ | |
df$DP_ALT = df$AD_Sample.2 | |
df$ALT_fraction <- round( df$DP_ALT/df$DP, 3) | |
cols_front <- c("DP", "DP_ALT", "ALT_fraction") | |
} else if( grepl('platypus', vcfFile) ) { | |
df$TR <- as.integer(sub( ',.*$', '', df$TR)) | |
df$TC <- as.integer(sub( ',.*$', '', df$TC)) | |
df$ALT_fraction <- round( df$TR/df$TC, 3) | |
cols_front <- c( "TC", "TR", "ALT_fraction") | |
} else if ( grepl('dellyGermline', vcfFile) ){ | |
df$DP = df$DR_Sample + df$DV_Sample | |
df$ALT_fraction = df$DV_Sample/df$DP | |
cols_front <- c("DP", "DV_Sample", "ALT_fraction", "CIEND", "CIPOS", "PE", "MAPQ", "SR", "SRQ","CONSENSUS", "CE", "CT", "IMPRECISE", "PRECISE", "SVTYPE","SVMETHOD", "INSLEN", "HOMLEN", "RDRATIO", "AC", "AN") | |
} else if ( grepl('snvs', vcfFile )) { | |
DP_NORMAL_ALT = NULL | |
DP_TUMOR_ALT = NULL | |
for (i in 1:nrow(df)) { | |
DP_NORMAL_ALT = c(DP_NORMAL_ALT, df[i, paste(df$ALT[i], 'U_NORMAL.1', sep='')]) | |
DP_TUMOR_ALT = c(DP_TUMOR_ALT, df[i, paste(df$ALT[i], 'U_TUMOR.1', sep='')]) | |
} | |
df$DP_NORMAL_ALT = DP_NORMAL_ALT | |
df$DP_TUMOR_ALT = DP_TUMOR_ALT | |
df$NORMAL_ALT_fraction = df$DP_NORMAL_ALT/df$DP_NORMAL | |
df$TUMOR_ALT_fraction = df$DP_TUMOR_ALT/df$DP_TUMOR | |
cols_front <- c("DP_NORMAL", "DP_NORMAL_ALT", "NORMAL_ALT_fraction","DP_TUMOR", "DP_TUMOR_ALT", "TUMOR_ALT_fraction") | |
} else if ( grepl('indels', vcfFile) ) { | |
df$TIR_NORMAL.1_fraction <- round(df$TIR_NORMAL.1/as.integer(df$DP_NORMAL),3) | |
df$TIR_TUMOR.1_fraction <- round(df$TIR_TUMOR.1/as.integer(df$DP_TUMOR),3) | |
cols_front <- c("DP_NORMAL", "TIR_NORMAL.1", "TIR_NORMAL.1_fraction", "DP_TUMOR", "TIR_TUMOR.1", "TIR_TUMOR.1_fraction") | |
} else if ( grepl('mutect', vcfFile) ) { | |
df$DP_NORMAL = df$AD_NORMAL.1 + df$AD_NORMAL.2 | |
df$DP_TUMOR = df$AD_TUMOR.1 + df$AD_TUMOR.2 | |
cols_front <- c("DP_NORMAL", "AD_NORMAL.2", "AF_NORMAL","DP_TUMOR", "AD_TUMOR.2", "AF_TUMOR") | |
} else if ( grepl('dellySomatic', vcfFile) ){ | |
df$DP_NORMAL = df$DR_NORMAL + df$DV_NORMAL | |
df$NORMAL_ALT_fraction = df$DV_NORMAL/df$DP_NORMAL | |
df$DP_TUMOR = df$DR_TUMOR + df$DV_TUMOR | |
df$TUMOR_ALT_fraction = df$DV_TUMOR/df$DP_TUMOR | |
cols_front <- c("DP_NORMAL", "DV_NORMAL", "NORMAL_ALT_fraction", "DP_TUMOR", "DV_TUMOR", "TUMOR_ALT_fraction", "CIEND", "CIPOS", "PE", "MAPQ", "SR", "SRQ","CONSENSUS", "CE", "CT", "IMPRECISE", "PRECISE", "SVTYPE","SVMETHOD", "INSLEN", "HOMLEN", "RDRATIO") | |
} else if( grepl('hpc', vcfFile) ){ | |
df$ALT_fraction <- round( df$AD_Sample.2/df$DP_Sample, 3) | |
cols_front <- c("DP_Sample","AD_Sample.2", "ALT_fraction") | |
} else if( grepl('scalpel', vcfFile) ){ | |
df$NORMAL_ALT_fraction <- round( df$AD_NORMAL.2/df$DP_NORMAL, 3) | |
df$TUMOR_ALT_fraction <- round( df$AD_TUMOR.2/df$DP_TUMOR, 3) | |
cols_front <- c("DP_NORMAL","AD_NORMAL.2", "NORMAL_ALT_fraction","DP_TUMOR","AD_TUMOR.2", "TUMOR_ALT_fraction") | |
} else { | |
cols_front <- c() | |
} | |
## re-order columns | |
cols_front_all <- c("SampleName", "Chr", "Start", "End", "CHR2", "END", "REF", "ALT", "Length", cols_front, "Gene_Name", "CustomGeneList", "baitRegion", "dbSNP_ID", "dbSNP..COMMON", "Cosmic_ID", "QUAL", "MQ", "MQ0", "QSS", "QSI", "FILTER", "ExAC..AF", "ExAC..ESP_AF_GLOBAL", "ESP6500..MAF_All", "dbNSFP_SIFT_score", "ESP6500..HGVS_CDNA_VAR", "Annotation", "Annotation_Impact", "Feature_Type", "Feature_ID", "Transcript_BioType", "Rank", "HGVS.c", "HGVS.p", "cDNA.pos / cDNA.length", "CDS.pos / CDS.length", "AA.pos / AA.length", "Distance", "dbNSFP_Polyphen2_HVAR_pred", "Cosmic..CNT", "CLNSIG", "SampleName1") | |
cols_front_common <- cols_front_all[cols_front_all %in% names(df)] | |
cols_rest <- names(df)[ ! names(df) %in% cols_front_common ] | |
##only "PASS" | |
df_ordered <-df[(df$FILTER == '.' | df$FILTER == 'PASS'), c(cols_front_common, cols_rest) ] | |
rm(df) | |
print(dim(df_ordered)) | |
write.table(df_ordered, file=paste(outBase, "_formated.tab", sep=''), quote=FALSE, sep='\t',na='',row.names=FALSE) | |
################################## | |
## filtered by design genes and regions | |
if( all(c("CustomGeneList", "baitRegion") %in% names(df_ordered)) ) { | |
df_ordered_selected <- df_ordered[ df_ordered$CustomGeneList == "TRUE" | df_ordered$baitRegion == "TRUE", cols_front_common]; | |
} else { | |
df_ordered_selected <- df_ordered[, cols_front_common]; | |
} | |
# #### for testing | |
# df_ordered_selected = df_ordered | |
rm(df_ordered) | |
## filtered by differenct criteria | |
library(dplyr) | |
if( grepl('indels', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>% | |
filter(TIR_NORMAL.1 <=2 | TIR_NORMAL.1_fraction < 0.01) %>% | |
filter(TIR_TUMOR.1 >= 2 & TIR_TUMOR.1_fraction >= 0.05) | |
} else if (grepl('snvs', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>% | |
filter(DP_NORMAL_ALT <=2 | NORMAL_ALT_fraction < 0.01) %>% | |
filter(DP_TUMOR_ALT >= 2 & TUMOR_ALT_fraction >= 0.05) | |
} else if (grepl('mutect', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>% | |
filter(AD_NORMAL.2 <=2 | AF_NORMAL < 0.01) %>% | |
filter(AD_TUMOR.2 >= 2 & AF_Fumor >= 0.05) | |
} else if (grepl('(scalpel)', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>% | |
filter(AD_NORMAL.2 <=2 | NORMAL_ALT_fraction < 0.01) %>% | |
filter(AD_TUMOR.2 >= 2 & TUMOR_ALT_fraction >= 0.05) | |
} else if (grepl('dellySomatic', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP_NORMAL >5 & DP_TUMOR >3 & Annotation_Impact %in% c('HIGH','MODERATE')) %>% | |
filter(DV_NORMAL <=2 | NORMAL_ALT_fraction < 0.01) %>% | |
filter(DV_TUMOR >= 2 & TUMOR_ALT_fraction >= 0.05) | |
} else if (grepl('(unifiedgenotyper|ugt)', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP >5 & DP_ALT >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE')) | |
} else if (grepl('(hpc)', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP_Sample >5 & AD_Sample.2 >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE')) | |
} else if (grepl('platypus', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(TC >5 & TR >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE')) | |
} else if (grepl('dellyGermline', vcfFile) ) { | |
df_ordered_filtered = df_ordered_selected %>% | |
filter(DP >5 & DV_Sample >3 & ALT_fraction >= 0.05 & Annotation_Impact %in% c('HIGH','MODERATE')) | |
} else { | |
if( is.element("Annotation_Impact", names(df_ordered_selected)) ) { | |
df_ordered_filtered <- df_ordered_selected[ which(df_ordered_selected$Annotation_Impact %in% c('HIGH','MODERATE','LOW')),] | |
} else { | |
df_ordered_filtered <- df_ordered_selected | |
} | |
} | |
rm(df_ordered_selected) | |
## filtered file name | |
filtered_file <- paste(outBase, "_formated_filtered.tab", sep='') | |
## touch an enpty file | |
if( nrow(df_ordered_filtered) == 0 ) { | |
system(paste("touch ", filtered_file, sep="")) | |
q(save="no", status=1, runLast=FALSE) | |
} | |
write.table(df_ordered_filtered, file=filtered_file, quote=FALSE, sep='\t',na='',row.names=FALSE) | |
print(dim(df_ordered_filtered)) | |
## generate filtered vcf files | |
if( nrow(df_ordered_filtered) > 0 ) { | |
filtered_for_bed <- df_ordered_filtered[,c('SampleName','Chr','Start','Start')] | |
vcf_filtered <- paste(outBase, "_filtered.vcf", sep='') | |
bedName <- paste(outBase, "_filtered.bed", sep='') | |
df_bed <- filtered_for_bed[, 2:4 ] | |
write.table(unique(df_bed), file=bedName, append = FALSE, row.names = FALSE, col.names = FALSE, quote = FALSE, na = "", sep='\t') | |
cmd_int <- paste("intersectBed -header -a ", vcfFile, " -b ", bedName, " > ", vcf_filtered, sep='' ) | |
system(cmd_int) | |
cmd_idx <- paste("module load igvtools; igvtools index ", vcf_filtered, sep='' ) | |
system(cmd_idx) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment