Skip to content

Instantly share code, notes, and snippets.

@mrvollger
Last active April 22, 2022 16:54
Show Gist options
  • Save mrvollger/3bdd2d34f312932c12917a4379a55973 to your computer and use it in GitHub Desktop.
Save mrvollger/3bdd2d34f312932c12917a4379a55973 to your computer and use it in GitHub Desktop.
Figure code for making the HPRC SD collapse results.
```{r HPRC SD intersect}
if (!require("karyoploteR")) BiocManager::install("karyoploteR")
if (!require("GenomicRanges")) BiocManager::install("GenomicRanges")
if (!require("argparse")) BiocManager::install("argparse")
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("ggnewscale")) install.packages("ggnewscale")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("data.table")) install.packages("data.table")
if (!require("glue")) install.packages("glue")
if (!require("RColorBrewer")) install.packages("RColorBrewer")
if (!require("scales")) install.packages("scales")
if (!require("cowplot")) install.packages("cowplot")
if (!require("argparse")) install.packages("argparse")
library(ggpubr)
library(ggforce)
library(IRanges)
library(openxlsx)
library(data.table)
library(circlize)
library(glue)
library(ggExtra)
library(HelloRanges)
library(gt)
library(valr)
load("Rdata/plotutils.data")
f="/net/eichler/vol27/projects/hprc/nobackups/assemblies/hifiasm_trio_yr1_genbank/unreliable_regions/ALL.window.bed"
f = "/net/eichler/vol27/projects/hprc/nobackups/assemblies/hifiasm_trio_yr1_genbank/unreliable_regions/2022-04-22/SD.window.bed.gz"
regions <- fread(f, nThread = 8)
colnames(regions) <- c("chrom", "start", "end", "name", "score", "strand", "ts", "te", "color", "source")[1:ncol(regions)]
regions
colnames(regions)
regions$row_id <- seq(nrow(regions))
regions = regions[chrom!="chrY"]
sds <- SEDEF_V1.1
colnames(sds)[1:3] <- c("chrom", "start", "end")
regions_sds <- valr::bed_intersect(regions, sds, suffix = c("", "_SD"))
colnames(regions_sds)
regions_sds_plot.df <- regions_sds %>%
#head(1e5) %>%
filter(.overlap > 0) %>%
group_by(chrom, start, end, row_id) %>%
filter(.overlap == max(.overlap)) %>%
slice(which.max(matchB_SD * fracMatch_SD)) %>%
data.table()
# all <- regions_sds_plot.df
#regions_sds_plot.df <- all
#regions_sds_plot.df <- all %>% sample_n(1e5)
regions_sds_plot.df$name <- factor(regions_sds_plot.df$name, levels = c("Hap", "Col", "Dup", "Err", "Unk"))
```
```{r HPRC SD intersect plot}
dim(regions) / 1e6
dim(regions_sds) / 1e6
dim(regions_sds_plot.df) / 1e6
cur_colors <- c(
Col = "#590181", Dup = "darkorange",
Err = NEWCOLOR, Hap = "darkgreen",
Unk = "darkgray"
)
p.hist <- ggplot(
data = regions_sds_plot.df,
aes(
y = factor(name, levels = rev(levels(name))),
weight = (end - start) / 94e6, fill = name
)
) +
geom_bar(alpha = 0.8) +
scale_x_continuous(label = comma, trans = "log10") +
ylab("") +
xlab("Mbp of sequence intersecting SDs per haplotype") +
annotation_logticks(side = "b") +
scale_fill_manual("", values = cur_colors) +
theme_minimal_grid() +
theme(legend.position = "none")
p <- ggplot(
data = regions_sds_plot.df,
aes(y = fracMatch_SD * 100, x = matchB_SD, color = name)
) +
geom_density_2d(bins = 50, alpha = 0.5, size = .1) +
scale_x_continuous(trans = "log10", label = comma) +
annotation_logticks(side = "b") +
ylab("SD % identity") +
xlab("SD length") +
scale_color_manual("", values = cur_colors) +
facet_col(~name) +
theme_minimal_grid() +
theme(legend.position = "none")
fig <- plot_grid(p.hist, p, ncol = 1, rel_heights = c(1, 3))
ggsave("hprc_sd_fig.pdf", plot = fig, height = 8, width = 12)
```
```{r}
regions_sds_plot.df %>%
mutate(is_error = name != "Hap") %>%
group_by(name) %>%
group_by(is_error) %>%
summarise(
Mbp = sum(end - start) / 94e6,
frac_match = median(fracMatch_SD)*100,
sd_len = median(matchB_SD)/1e3
)
```
@mrvollger
Copy link
Author

mrvollger commented Apr 22, 2022

Downloading the data:

src="s3://human-pangenomics/submissions/e9ad8022-1b30-11ec-ab04-0a13c5208311--COVERAGE_ANALYSIS_Y1_GENBANK/FLAGGER/APR_08_2022/FINAL_HIFI_BASED/FLAGGER_HIFI_PROJECTED_SIMPLIFIED_BEDS/"
dest="ALL"
mkdir -p $dest
aws s3 --no-sign-request sync $@ $src $dest 

Making SD windows and inputs for the Rscript:

ls ALL/*/*bed | parallel -n 1 $'sed \'s/$/\t{/.}/\' {} | grep -v "^track" ' |  bedtools sort -i - | bgzip -@ 12 > ALL.bed.gz
zcat ALL.bed.gz | sed 's/\t/__/4g' | bedtools makewindows -i src -b - -w 5000 | sed 's/__/\t/g' | bgzip -@ 30 > ./ALL.window.bed.gz
bedtools intersect -wa -a ALL.window.bed.gz -b <(bedtools merge -i ~/chm13_t2t/Assembly_analysis/SEDEF/T2T-CHM13v2.SDs.bed ) \
  | bgzip -@ 16 > ./SD.window.bed.gz 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment