Last active
August 25, 2022 13:10
-
-
Save pgonzale60/6ea9d2ae40f5a46614802d45f7a53989 to your computer and use it in GitHub Desktop.
Compares oxford plots of multiple species using shared coordinates rather than facets. Designed for BUSCO's full table.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# User input #### | |
x_species <- c("nxOscDolc1.1", "nxOscOnir1.2") | |
y_species <- c("nxOscSper1.1", "nxOscSpeu1.1") | |
# path to BUSCO table directory | |
# path must end in "/" (e.g. busco_dir/) | |
busco_dir <- "analyses/paper1/prelim_to_20220220/nemaChromQC/" | |
shared_busco_tsv_suffix <- ".primary_nematoda_odb10_full_table.tsv" | |
### Funcions #### | |
# Function for loading buscos | |
read_busco <- function(buscoFile){ | |
busco <- read_tsv(buscoFile, | |
col_names = c("busco_id", "Status", "Sequence", | |
"start", "end", "strand", "Score", "Length", | |
"OrthoDB_url", "Description"), | |
col_types = c("ccciicdicc"), | |
comment = "#") %>% | |
filter(Status == "Complete") %>% | |
select(busco_id, Sequence, start, end) | |
return(busco) | |
} | |
# Function for creating multispecies coordinates index | |
# From busco_tibble | |
# busco_tibble needs to have columns "multispecies_sequence" and "end" | |
buscos_to_multi_sp_index <- function(busco_tb){ | |
multi_sp_index <- group_by(busco_tb, multispecies_sequence) %>% | |
summarise(size = max(end), .groups = "drop") %>% | |
arrange(multispecies_sequence) %>% | |
# slice(gtools::mixedorder(multispecies_sequence, numeric.type = "roman")) %>% | |
mutate(multispecies_sequence = factor(multispecies_sequence), | |
multispecies_index = cumsum(size) - size) %>% | |
select(multispecies_sequence, multispecies_index) | |
return(multi_sp_index) | |
} | |
# Function for multispecies Oxfor plot | |
multispecies_oxford_plot <- function(busco_tbl, sp_x, sp_y){ | |
x_busco <- filter(busco_tbl, assembly %in% sp_x) | |
y_busco <- filter(busco_tbl, assembly %in% sp_y) | |
buscos_to_cmpr <- left_join(x_busco, y_busco, | |
by = "busco_id") | |
multi_sp_index_x <- select(buscos_to_cmpr, | |
multispecies_sequence = multispecies_sequence.x, | |
end = end.x) %>% buscos_to_multi_sp_index() | |
multi_sp_index_y <- select(buscos_to_cmpr, | |
multispecies_sequence = multispecies_sequence.y, | |
end = end.y) %>% buscos_to_multi_sp_index() | |
# Extrapolate USCOs to multispecies coordinates | |
multispecies_buscos <- left_join(buscos_to_cmpr, multi_sp_index_x, | |
by = c("multispecies_sequence.x" = "multispecies_sequence")) %>% | |
left_join(multi_sp_index_y, | |
by = c("multispecies_sequence.y" = "multispecies_sequence")) %>% | |
mutate(multispecies_start.x = start.x + multispecies_index.x, | |
multispecies_start.y = start.y + multispecies_index.y) %>% | |
select(busco_id, multispecies_start.x, multispecies_start.y, | |
multispecies_sequence.x, multispecies_sequence.y) | |
# Main plot | |
p <- ggplot(multispecies_buscos, aes(x = multispecies_start.x, | |
y = multispecies_start.y)) + | |
geom_point(alpha = 0.3) + | |
scale_x_continuous(breaks = multi_sp_index_x$multispecies_index, | |
labels = multi_sp_index_x$multispecies_sequence, | |
expand = c(0, 0)) + | |
scale_y_continuous(breaks = multi_sp_index_y$multispecies_index, | |
labels = multi_sp_index_y$multispecies_sequence, | |
expand = c(0, 0)) + | |
geom_vline(xintercept=multi_sp_index_x$multispecies_index, size = 0.2) + | |
geom_hline(yintercept=multi_sp_index_y$multispecies_index, size = 0.2) + | |
xlab("") + | |
ylab("") + | |
theme_bw() + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
axis.text.y = element_text(angle = 90, hjust = 0.5), | |
axis.ticks = element_blank()) | |
return(p) | |
} | |
### Read data #### | |
buscofiles <- list.files(busco_dir, full.names = T, | |
pattern = shared_busco_tsv_suffix) | |
names(buscofiles) <- make.names(sub(paste0(".+//(.+)", | |
shared_busco_tsv_suffix), "\\1", | |
buscofiles)) | |
buscos <- map_df(buscofiles, read_busco, | |
.id = "assembly") %>% | |
mutate(short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly), | |
multispecies_sequence = paste0(short_species_id, "_", Sequence)) | |
### Make plot #### | |
multispecies_oxford_plot(buscos, x_species, y_species) | |
# For cases with many sequences, you might want to explore | |
# the plot interactively | |
# You can do so by running the following line | |
plotly::ggplotly() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment