Created
March 13, 2023 14:04
-
-
Save pgonzale60/e66d03120764758544bba596d5e9821b to your computer and use it in GitHub Desktop.
Oxford plot with sequence coordinates transformed to genome wide coordinates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# requires(gtools) # for mixedsort function with roman numerals | |
### Funcions #### | |
# Function for loading buscos | |
read_busco <- function(buscoFile){ | |
read_tsv(buscoFile, | |
col_names = c("Busco_id", "Status", "Sequence", | |
"start", "end", "strand", "Score", "Length", | |
"OrthoDB_url", "Description"), | |
col_types = c("ccciicdicc"), | |
comment = "#") %>% | |
filter(Status == "Complete") %>% | |
select(Busco_id, Sequence, start, end) | |
} | |
# Function for making synteny plots | |
### Dictionaries #### | |
# Nigon colours | |
cols <- c("A" = "#af0e2b", "B" = "#e4501e", | |
"C" = "#4caae5", "D" = "#f3ac2c", | |
"E" = "#57b741", "N" = "#8880be", | |
"X" = "#81008b", "-" = "#aaaaaa") | |
nigonDict <- read_tsv("https://github.com/pgonzale60/vis_ALG/raw/main/data/gene2Nigon_busco20200927.tsv.gz", | |
col_types = "cc") | |
# Oscheius tipulae NCBI assembly sequence IDs | |
ncbi_otipu <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/425/905/GCA_013425905.1_ASM1342590v1/GCA_013425905.1_ASM1342590v1_assembly_report.txt", | |
comment = "#", col_names = c("Sequence_Name", "Sequence_Role", "Assigned_Molecule", | |
"Assigned_Molecule_Location", "GenBank_Accn", | |
"Relationship", "RefSeq_Accn", "Assembly_Unit", | |
"Sequence_Length", "UCSC_style_name")) | |
### Core tibble creation #### | |
# Break sites | |
break_files <- list.files("analyses/paper1/prelim_to_20220220/curated_GRS_coords/", | |
full.names = T, pattern = ".tsv") | |
names(break_files) <- make.names(sub(".+//(.+).chr_diminutions_sites.tsv", "\\1", break_files)) | |
break_sites <- map_df(break_files, read_tsv, | |
col_names = c("Sequence", "pos"), | |
col_types = c("ci"), | |
.id = "assembly") %>% | |
mutate(Sequence = sub("chr_", "", Sequence)) | |
# Sequence sizes | |
fai_files <- list.files("analyses/paper1/prelim_to_20220220/genome_features/sequence_sizes/", | |
full.names = T, pattern = ".fai") | |
names(fai_files) <- make.names(sub(".+//(.+).primary.fa.gz.fai", "\\1", fai_files)) | |
seq_sizes <- map_df(fai_files, read_tsv, | |
col_names = c("Sequence", "size", "L1", "L2", "L3"), | |
col_types = c("ciiii"), | |
.id = "assembly") %>% | |
select(assembly, Sequence, size) %>% | |
mutate(Sequence = sub("chr_", "", Sequence)) %>% | |
filter(!grepl("loc|MT", Sequence)) | |
# BUSCOs | |
buscofiles <- list.files("analyses/paper1/prelim_to_20220220/genome_features/busco/", | |
full.names = T, pattern = "nematoda_odb10_full_table.tsv") | |
names(buscofiles) <- make.names(sub(".+//(.+).primary_nematoda_odb10_full_table.tsv", "\\1", buscofiles)) | |
buscos <- map_df(buscofiles, read_busco, | |
.id = "assembly") %>% | |
mutate(Sequence = ifelse(assembly == "nxOscTipu1.1", | |
ncbi_otipu$Assigned_Molecule[match(Sequence, ncbi_otipu$GenBank_Accn)], | |
Sequence), | |
Sequence = sub("chr_", "", Sequence)) | |
### Data to shared coordinates #### | |
# Aim: convert USCO and break coordinates to shared coordinate system (for plotting) | |
# 1.- Create the multispecies sequence coordinates | |
multi_sp_index <- mutate(seq_sizes, short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly), | |
multispecies_sequence = paste0(short_species_id, "_", Sequence)) %>% | |
slice(gtools::mixedorder(multispecies_sequence, numeric.type = "roman")) %>% | |
mutate(multispecies_sequence = factor(multispecies_sequence), | |
multispecies_index = cumsum(size) - size) %>% | |
select(multispecies_sequence, multispecies_index, size) | |
# 2.- Extrapolate USCOs to multispecies coordinates | |
# Add fake USCOs located at end of sequence to maintain real sequence size when plotting USCOS | |
multispecies_nigons <- mutate(seq_sizes, Busco_id = paste0(assembly, "_", Sequence, "_fakeId")) %>% | |
# give same format as tibble of USCOs | |
select(assembly, Busco_id, Sequence, start = size, end = size) %>% | |
bind_rows(buscos) %>% | |
# Link USCOs to Nigons | |
left_join(nigonDict, by = c("Busco_id" = "Orthogroup")) %>% | |
mutate(nigon = ifelse(is.na(nigon), "-", nigon)) %>% | |
# add mulispecies sequence Id | |
mutate(short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly), | |
multispecies_sequence = paste0(short_species_id, "_", Sequence), | |
multispecies_sequence = factor(multispecies_sequence, levels = levels(multi_sp_index$multispecies_sequence))) %>% | |
# add relative coordinates | |
left_join(multi_sp_index, by = c("multispecies_sequence")) %>% | |
mutate(multispecies_start = multispecies_index + start) %>% | |
select(Busco_id, multispecies_sequence, multispecies_start, nigon) | |
# 3.- Extrapolate break sites to multispecies coordinates | |
multispecies_breaks <- mutate(break_sites, short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly), | |
multispecies_sequence = paste0(short_species_id, "_", Sequence)) %>% | |
# add relative coordinates | |
left_join(multi_sp_index, by = c("multispecies_sequence")) %>% | |
mutate(multispecies_pos = multispecies_index + pos) %>% | |
select(multispecies_sequence, multispecies_pos) | |
# Aim achieved: shared coordinates for break sites and USCOs achieved | |
### Plot synteny #### | |
# Make a tibble linking USCO coordinate of each speceies against all other species | |
# Keep fake IDs as self coordinates to keep sizes of sequences | |
multi_sp_usco_linked <- group_by(multispecies_nigons, Busco_id) %>% | |
mutate(occurrences = n()) %>% | |
ungroup() %>% | |
slice(rep(1:n(), occurrences)) %>% | |
group_by(Busco_id) %>% | |
mutate(counter_species_start = rep(unique(multispecies_start), n_distinct(multispecies_start)), | |
# remove rows linking USCO to itself in same species | |
# except when it is a fake USCO | |
is_self_coord_bool = !(counter_species_start != multispecies_start | grepl("fake", Busco_id))) %>% | |
ungroup() %>% | |
filter(!is_self_coord_bool) | |
# Main plot | |
longest_seqs <- group_by(multi_sp_usco_linked, multispecies_sequence) %>% summarise(longest = max(multispecies_start)) | |
group_by(multi_sp_index, multispecies_sequence) %>% summarise(longest = max(multispecies_index)) | |
p <- ggplot(multi_sp_usco_linked, aes(x = multispecies_start, y = counter_species_start, colour = nigon)) + | |
geom_point(alpha = 0.3, size = 0.2) + | |
scale_colour_manual(values = cols) + | |
scale_x_continuous(breaks = multi_sp_index$multispecies_index + multi_sp_index$size/2, | |
labels = multi_sp_index$multispecies_sequence, | |
expand = c(0, 0)) + | |
scale_y_continuous(breaks = multi_sp_index$multispecies_index + multi_sp_index$size/2, | |
labels = multi_sp_index$multispecies_sequence, | |
expand = c(0, 0)) + | |
geom_vline(xintercept=multi_sp_index$multispecies_index, size = 0.2) + | |
geom_hline(yintercept=multi_sp_index$multispecies_index, size = 0.2) + | |
geom_vline(xintercept=multispecies_breaks$multispecies_pos, linetype="dotted", alpha = 0.3) + | |
geom_hline(yintercept=multispecies_breaks$multispecies_pos, linetype="dotted", alpha = 0.3) + | |
xlab("") + | |
ylab("") + | |
theme_bw() + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
axis.text.y = element_text(angle = 90, hjust = 0.5, vjust = 0.5), | |
axis.ticks = element_blank()) | |
ggsave("synteny_t.pdf", p, width = 19, height = 18) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment