Skip to content

Instantly share code, notes, and snippets.

@acvill
Last active June 6, 2023 21:05
Show Gist options
  • Save acvill/f8de2a2039102478ccf666796d0d141f to your computer and use it in GitHub Desktop.
Save acvill/f8de2a2039102478ccf666796d0d141f to your computer and use it in GitHub Desktop.
Given vcf and gff, plot position and allele freq. of SNPs and indels over genomic ranges
# Given vcf and gff, plot position and allele freq. of SNPs and indels over ranges
# tested with VCFv4.1 and gff version 3
# expects certain vcf and gff attributes, may not work with all files
plot_vcf <- function(vcf, gff, ranges) {
require(tidyverse) # v2.0.0
require(gggenes) # v0.5.0
require(patchwork) # v1.1.2
barcols <- c("SNP" = "red",
"insertion" = "purple",
"deletion" = "blue")
genecols <- c("CDS" = "#8dd3c7",
"ncRNA" = "#ffffb3",
"rRNA" = "#bebada",
"tRNA" = "#fb8072")
anno <- read_tsv(gff,
comment = "#",
col_names = c("seqname","source","feature","start","end",
"score","strand","frame","attribute"),
col_types = list(seqname = "c",
source = "c",
feature = "c",
start = "d",
end = "d",
score = "d",
strand = "c",
frame = "c",
attribute = "c")) |>
filter(feature %in% c("CDS","ncRNA","rRNA","tRNA")) |>
separate_wider_delim(cols = attribute,
delim = ";",
names = c("alias","id","name","note", "trans_table"),
too_few = "align_start") |>
select(seqname, feature, start, end, strand, name) |>
mutate(name = gsub(pattern = "Name=", replacement = "", x = name)) |>
mutate(strand = ifelse(strand == "+", "forward", "reverse")) |>
mutate(orientation = ifelse(strand == "forward", 1, 0)) |>
mutate(center = (end + start) / 2)
varx <- read_tsv(vcf,
comment = "#",
col_names = c("chromosome","position","id","ref",
"alt","quality_score","filter","info")) |>
mutate(info = gsub(pattern = "[A-Z]{2}=", replacement = "", x = info)) |>
separate_wider_delim(cols = info,
delim = ";",
names = c("allele_freq","allele_depth","depth_of_coverage"),
cols_remove = TRUE) |>
type_convert() |>
mutate(type = case_when(nchar(ref) == nchar(alt) ~ "SNP",
nchar(alt) > nchar(ref) ~ "insertion",
nchar(ref) < nchar(alt) ~ "deletion")) |>
select(position, allele_freq, type) |>
group_by(position, type) |>
summarize(allele_freq = sum(allele_freq)) |>
ungroup()
apply(X = ranges, MARGIN = 1, FUN = function(x) {
barwidth <- (x[2] - x[1]) / 400
annorng <- anno |> filter(start > x[1], end < x[2]) |>
ggplot() +
geom_gene_arrow(mapping = aes(xmin = start,
xmax = end,
y = seqname,
fill = feature,
forward = orientation),
show.legend = TRUE) +
geom_text(mapping = aes(x = center,
y = seqname,
label = name),
angle = 0,
vjust = -1,
size = 3,
color = "black") +
xlab("genomic coordinate") +
theme_genes() +
scale_x_continuous(limits = c(x[1],x[2]),
expand = c(0,0)) +
scale_fill_manual(values = genecols) +
theme(legend.position = "right",
axis.text.y = element_blank(),
axis.text.x = element_text(color = "black", size = 9),
axis.ticks = element_blank(),
axis.title = element_blank())
pdat <- filter(varx, position %in% c(x[1]:x[2]))
ylim <- 2 * max(pdat$allele_freq)
if (ylim > 1) {
ylim <- 1
}
prng <- ggplot(data = pdat) +
geom_bar(mapping = aes(x = position,
y = allele_freq,
fill = type,
color = type),
position = "stack",
stat = "identity",
width = barwidth) +
scale_fill_manual(values = barcols) +
scale_color_manual(values = barcols) +
xlab("") +
ylab("allele frequency") +
theme_bw() +
scale_y_continuous(expand = c(0,0),
breaks = round(seq(0,ylim,ylim/4),2),
limits = c(0,ylim)) +
scale_x_continuous(limits = c(x[1],x[2]),
expand = c(0,0)) +
theme(panel.grid = element_blank(),
axis.text.y = element_text(color = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
prng / annorng + plot_layout(heights = c(8,1))
})
}
library(tidyverse)
test_ranges <- tibble(start = c(24000, 381000, 3807000, 260000),
end = c(26000, 384000, 3809000, 273000))
plot_vcf(vcf = "input.vcf",
gff = "reference.gff3",
ranges = test_ranges)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment