Last active
June 6, 2023 21:05
-
-
Save acvill/f8de2a2039102478ccf666796d0d141f to your computer and use it in GitHub Desktop.
Given vcf and gff, plot position and allele freq. of SNPs and indels over genomic ranges
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given vcf and gff, plot position and allele freq. of SNPs and indels over ranges | |
# tested with VCFv4.1 and gff version 3 | |
# expects certain vcf and gff attributes, may not work with all files | |
plot_vcf <- function(vcf, gff, ranges) { | |
require(tidyverse) # v2.0.0 | |
require(gggenes) # v0.5.0 | |
require(patchwork) # v1.1.2 | |
barcols <- c("SNP" = "red", | |
"insertion" = "purple", | |
"deletion" = "blue") | |
genecols <- c("CDS" = "#8dd3c7", | |
"ncRNA" = "#ffffb3", | |
"rRNA" = "#bebada", | |
"tRNA" = "#fb8072") | |
anno <- read_tsv(gff, | |
comment = "#", | |
col_names = c("seqname","source","feature","start","end", | |
"score","strand","frame","attribute"), | |
col_types = list(seqname = "c", | |
source = "c", | |
feature = "c", | |
start = "d", | |
end = "d", | |
score = "d", | |
strand = "c", | |
frame = "c", | |
attribute = "c")) |> | |
filter(feature %in% c("CDS","ncRNA","rRNA","tRNA")) |> | |
separate_wider_delim(cols = attribute, | |
delim = ";", | |
names = c("alias","id","name","note", "trans_table"), | |
too_few = "align_start") |> | |
select(seqname, feature, start, end, strand, name) |> | |
mutate(name = gsub(pattern = "Name=", replacement = "", x = name)) |> | |
mutate(strand = ifelse(strand == "+", "forward", "reverse")) |> | |
mutate(orientation = ifelse(strand == "forward", 1, 0)) |> | |
mutate(center = (end + start) / 2) | |
varx <- read_tsv(vcf, | |
comment = "#", | |
col_names = c("chromosome","position","id","ref", | |
"alt","quality_score","filter","info")) |> | |
mutate(info = gsub(pattern = "[A-Z]{2}=", replacement = "", x = info)) |> | |
separate_wider_delim(cols = info, | |
delim = ";", | |
names = c("allele_freq","allele_depth","depth_of_coverage"), | |
cols_remove = TRUE) |> | |
type_convert() |> | |
mutate(type = case_when(nchar(ref) == nchar(alt) ~ "SNP", | |
nchar(alt) > nchar(ref) ~ "insertion", | |
nchar(ref) < nchar(alt) ~ "deletion")) |> | |
select(position, allele_freq, type) |> | |
group_by(position, type) |> | |
summarize(allele_freq = sum(allele_freq)) |> | |
ungroup() | |
apply(X = ranges, MARGIN = 1, FUN = function(x) { | |
barwidth <- (x[2] - x[1]) / 400 | |
annorng <- anno |> filter(start > x[1], end < x[2]) |> | |
ggplot() + | |
geom_gene_arrow(mapping = aes(xmin = start, | |
xmax = end, | |
y = seqname, | |
fill = feature, | |
forward = orientation), | |
show.legend = TRUE) + | |
geom_text(mapping = aes(x = center, | |
y = seqname, | |
label = name), | |
angle = 0, | |
vjust = -1, | |
size = 3, | |
color = "black") + | |
xlab("genomic coordinate") + | |
theme_genes() + | |
scale_x_continuous(limits = c(x[1],x[2]), | |
expand = c(0,0)) + | |
scale_fill_manual(values = genecols) + | |
theme(legend.position = "right", | |
axis.text.y = element_blank(), | |
axis.text.x = element_text(color = "black", size = 9), | |
axis.ticks = element_blank(), | |
axis.title = element_blank()) | |
pdat <- filter(varx, position %in% c(x[1]:x[2])) | |
ylim <- 2 * max(pdat$allele_freq) | |
if (ylim > 1) { | |
ylim <- 1 | |
} | |
prng <- ggplot(data = pdat) + | |
geom_bar(mapping = aes(x = position, | |
y = allele_freq, | |
fill = type, | |
color = type), | |
position = "stack", | |
stat = "identity", | |
width = barwidth) + | |
scale_fill_manual(values = barcols) + | |
scale_color_manual(values = barcols) + | |
xlab("") + | |
ylab("allele frequency") + | |
theme_bw() + | |
scale_y_continuous(expand = c(0,0), | |
breaks = round(seq(0,ylim,ylim/4),2), | |
limits = c(0,ylim)) + | |
scale_x_continuous(limits = c(x[1],x[2]), | |
expand = c(0,0)) + | |
theme(panel.grid = element_blank(), | |
axis.text.y = element_text(color = "black"), | |
axis.text.x = element_blank(), | |
axis.ticks.x = element_blank()) | |
prng / annorng + plot_layout(heights = c(8,1)) | |
}) | |
} | |
library(tidyverse) | |
test_ranges <- tibble(start = c(24000, 381000, 3807000, 260000), | |
end = c(26000, 384000, 3809000, 273000)) | |
plot_vcf(vcf = "input.vcf", | |
gff = "reference.gff3", | |
ranges = test_ranges) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment