Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pgonzale60/e66d03120764758544bba596d5e9821b to your computer and use it in GitHub Desktop.
Save pgonzale60/e66d03120764758544bba596d5e9821b to your computer and use it in GitHub Desktop.
Oxford plot with sequence coordinates transformed to genome wide coordinates
library(tidyverse)
# requires(gtools) # for mixedsort function with roman numerals
### Funcions ####
# Function for loading buscos
read_busco <- function(buscoFile){
read_tsv(buscoFile,
col_names = c("Busco_id", "Status", "Sequence",
"start", "end", "strand", "Score", "Length",
"OrthoDB_url", "Description"),
col_types = c("ccciicdicc"),
comment = "#") %>%
filter(Status == "Complete") %>%
select(Busco_id, Sequence, start, end)
}
# Function for making synteny plots
### Dictionaries ####
# Nigon colours
cols <- c("A" = "#af0e2b", "B" = "#e4501e",
"C" = "#4caae5", "D" = "#f3ac2c",
"E" = "#57b741", "N" = "#8880be",
"X" = "#81008b", "-" = "#aaaaaa")
nigonDict <- read_tsv("https://github.com/pgonzale60/vis_ALG/raw/main/data/gene2Nigon_busco20200927.tsv.gz",
col_types = "cc")
# Oscheius tipulae NCBI assembly sequence IDs
ncbi_otipu <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/425/905/GCA_013425905.1_ASM1342590v1/GCA_013425905.1_ASM1342590v1_assembly_report.txt",
comment = "#", col_names = c("Sequence_Name", "Sequence_Role", "Assigned_Molecule",
"Assigned_Molecule_Location", "GenBank_Accn",
"Relationship", "RefSeq_Accn", "Assembly_Unit",
"Sequence_Length", "UCSC_style_name"))
### Core tibble creation ####
# Break sites
break_files <- list.files("analyses/paper1/prelim_to_20220220/curated_GRS_coords/",
full.names = T, pattern = ".tsv")
names(break_files) <- make.names(sub(".+//(.+).chr_diminutions_sites.tsv", "\\1", break_files))
break_sites <- map_df(break_files, read_tsv,
col_names = c("Sequence", "pos"),
col_types = c("ci"),
.id = "assembly") %>%
mutate(Sequence = sub("chr_", "", Sequence))
# Sequence sizes
fai_files <- list.files("analyses/paper1/prelim_to_20220220/genome_features/sequence_sizes/",
full.names = T, pattern = ".fai")
names(fai_files) <- make.names(sub(".+//(.+).primary.fa.gz.fai", "\\1", fai_files))
seq_sizes <- map_df(fai_files, read_tsv,
col_names = c("Sequence", "size", "L1", "L2", "L3"),
col_types = c("ciiii"),
.id = "assembly") %>%
select(assembly, Sequence, size) %>%
mutate(Sequence = sub("chr_", "", Sequence)) %>%
filter(!grepl("loc|MT", Sequence))
# BUSCOs
buscofiles <- list.files("analyses/paper1/prelim_to_20220220/genome_features/busco/",
full.names = T, pattern = "nematoda_odb10_full_table.tsv")
names(buscofiles) <- make.names(sub(".+//(.+).primary_nematoda_odb10_full_table.tsv", "\\1", buscofiles))
buscos <- map_df(buscofiles, read_busco,
.id = "assembly") %>%
mutate(Sequence = ifelse(assembly == "nxOscTipu1.1",
ncbi_otipu$Assigned_Molecule[match(Sequence, ncbi_otipu$GenBank_Accn)],
Sequence),
Sequence = sub("chr_", "", Sequence))
### Data to shared coordinates ####
# Aim: convert USCO and break coordinates to shared coordinate system (for plotting)
# 1.- Create the multispecies sequence coordinates
multi_sp_index <- mutate(seq_sizes, short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly),
multispecies_sequence = paste0(short_species_id, "_", Sequence)) %>%
slice(gtools::mixedorder(multispecies_sequence, numeric.type = "roman")) %>%
mutate(multispecies_sequence = factor(multispecies_sequence),
multispecies_index = cumsum(size) - size) %>%
select(multispecies_sequence, multispecies_index, size)
# 2.- Extrapolate USCOs to multispecies coordinates
# Add fake USCOs located at end of sequence to maintain real sequence size when plotting USCOS
multispecies_nigons <- mutate(seq_sizes, Busco_id = paste0(assembly, "_", Sequence, "_fakeId")) %>%
# give same format as tibble of USCOs
select(assembly, Busco_id, Sequence, start = size, end = size) %>%
bind_rows(buscos) %>%
# Link USCOs to Nigons
left_join(nigonDict, by = c("Busco_id" = "Orthogroup")) %>%
mutate(nigon = ifelse(is.na(nigon), "-", nigon)) %>%
# add mulispecies sequence Id
mutate(short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly),
multispecies_sequence = paste0(short_species_id, "_", Sequence),
multispecies_sequence = factor(multispecies_sequence, levels = levels(multi_sp_index$multispecies_sequence))) %>%
# add relative coordinates
left_join(multi_sp_index, by = c("multispecies_sequence")) %>%
mutate(multispecies_start = multispecies_index + start) %>%
select(Busco_id, multispecies_sequence, multispecies_start, nigon)
# 3.- Extrapolate break sites to multispecies coordinates
multispecies_breaks <- mutate(break_sites, short_species_id = sub("nxOsc([^0-9]+).+", "\\1", assembly),
multispecies_sequence = paste0(short_species_id, "_", Sequence)) %>%
# add relative coordinates
left_join(multi_sp_index, by = c("multispecies_sequence")) %>%
mutate(multispecies_pos = multispecies_index + pos) %>%
select(multispecies_sequence, multispecies_pos)
# Aim achieved: shared coordinates for break sites and USCOs achieved
### Plot synteny ####
# Make a tibble linking USCO coordinate of each speceies against all other species
# Keep fake IDs as self coordinates to keep sizes of sequences
multi_sp_usco_linked <- group_by(multispecies_nigons, Busco_id) %>%
mutate(occurrences = n()) %>%
ungroup() %>%
slice(rep(1:n(), occurrences)) %>%
group_by(Busco_id) %>%
mutate(counter_species_start = rep(unique(multispecies_start), n_distinct(multispecies_start)),
# remove rows linking USCO to itself in same species
# except when it is a fake USCO
is_self_coord_bool = !(counter_species_start != multispecies_start | grepl("fake", Busco_id))) %>%
ungroup() %>%
filter(!is_self_coord_bool)
# Main plot
longest_seqs <- group_by(multi_sp_usco_linked, multispecies_sequence) %>% summarise(longest = max(multispecies_start))
group_by(multi_sp_index, multispecies_sequence) %>% summarise(longest = max(multispecies_index))
p <- ggplot(multi_sp_usco_linked, aes(x = multispecies_start, y = counter_species_start, colour = nigon)) +
geom_point(alpha = 0.3, size = 0.2) +
scale_colour_manual(values = cols) +
scale_x_continuous(breaks = multi_sp_index$multispecies_index + multi_sp_index$size/2,
labels = multi_sp_index$multispecies_sequence,
expand = c(0, 0)) +
scale_y_continuous(breaks = multi_sp_index$multispecies_index + multi_sp_index$size/2,
labels = multi_sp_index$multispecies_sequence,
expand = c(0, 0)) +
geom_vline(xintercept=multi_sp_index$multispecies_index, size = 0.2) +
geom_hline(yintercept=multi_sp_index$multispecies_index, size = 0.2) +
geom_vline(xintercept=multispecies_breaks$multispecies_pos, linetype="dotted", alpha = 0.3) +
geom_hline(yintercept=multispecies_breaks$multispecies_pos, linetype="dotted", alpha = 0.3) +
xlab("") +
ylab("") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.y = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
axis.ticks = element_blank())
ggsave("synteny_t.pdf", p, width = 19, height = 18)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment