Skip to content

Instantly share code, notes, and snippets.

@theosanderson
Created August 17, 2023 15:05
Show Gist options
  • Save theosanderson/ebefd0fa9c8aab4f12d806553c690389 to your computer and use it in GitHub Desktop.
Save theosanderson/ebefd0fa9c8aab4f12d806553c690389 to your computer and use it in GitHub Desktop.
```{r}
mutations_in_highly_mutated_seq = "A543G, G1068A, G1186A, G1264A, T1370C, G1743A, A2497G, C2583T, T2824C, G3287A, C3393T, A3430G, C3604T, G3794A, A4179G, G4300A, G4352A, G4444A, G4474A, A4501G, G4975A, T5117C, G5230A, G5720A, G5861A, C6031T, G6123A, C6198T, G6362A, A6565G, T6661C, G6759A, A6833G, C7051T, C7086T, T7114C, A7558G, C7749T, A7795G, G7934A, C8169T, T8779C, T8875C, T9148C, G9209A, A9409G, C9438T, C9521T, G9575A, A9984G, C10015T, G11198A, C11199T, G11330A, G11605A, T11701C, G11837A, G11944A, A12030G, A12271G, G12442A, C12540T, C12633T, T12703C, C13818T, C14216T, G14622A, C15026T, C15656T, C15783T, C16767T, G16861A, C17481T, T17505C, A17993G, C18201T, C18888T, A19574G, G20062A, T20502C, C21297T, C21588T, G21668A, A21717G, T21752C, A21996G, A22101G, T22189C, T22254C, G22770A, C22783T, A22893G, A23004G, C23013T, G23048A, T23425C, A23734G, T23948C, A24003G, A24062G, C24961T, G24977A, C25066T, G25537A, T25779C, C25872T, C26176T, C26195T, C26335T, G26428A, C26645T, T26819G, G27014A, A27146G, A27386G, G28193A, C28567T, C28674T, G28803A, C28873T, G29081A, C1471T, C2695T, C4763T, C5192T, C7564T, C8841T, G22898A, C22916A, C24382T, C25889T, C27371T, C28720T"
mutations_in_highly_mutated_seq = str_replace_all(mutations_in_highly_mutated_seq, "nt:", "")
# Split the string by commas, and then extract the initial nucleotide, index, and final nucleotide
mutations_tibble <- str_split(mutations_in_highly_mutated_seq, ",\\s*") %>%
unlist() %>%
tibble(mutation = .) %>%
mutate(
par = str_extract(mutation, "^[A-Z]"),
index = str_extract(mutation, "[0-9]+"),
mut = str_extract(mutation, "[A-Z]$")
) %>%
select(-mutation) %>% mutate(index=as.numeric(index)) %>% inner_join(ref_tib)
of_interest = mutations_tibble %>% group_by(par,context_before,context_after,mut) %>% tally()%>% mutate(type=paste0(par,mut))
unnormalise <- function(df){
inner_join(df,nuc_genome_counts) %>% mutate(spectrum_value = spectrum_value * genome_count) %>% mutate(type=paste0(par,mut)) %>% group_by(par,mut,type) %>% mutate(spectrum_value=spectrum_value/sum(spectrum_value))
}
model1 = long %>% mutate(type=paste0(par,mut)) %>% rename(spectrum_value = Number_of_mutations) %>% unnormalise()
model2 = ba1 %>% mutate(type=paste0(par,mut)) %>% rename(spectrum_value = Number_of_mutations)%>% unnormalise()
types_of_interest = c("GA","CT","AG","TC")
library(nnet)
calc_bfs <- function(of_interest){
join_everything = full_join(model1,model2,by=c("par","mut","type","context_before","context_after"), suffix=c("_1","_2")) %>% full_join(of_interest,by=c("par","mut","type","context_before","context_after")) %>% replace_na(list(n=0))
bfs = c()
# Step 1: Filter data for each type of interest
for (mytype in types_of_interest) {
filtered = join_everything %>% filter(type==mytype)
prob1 = dmultinom(filtered$n, size = sum(filtered$n), prob = filtered$spectrum_value_1, log = F)
prob2 = dmultinom(filtered$n, size = sum(filtered$n), prob = filtered$spectrum_value_2, log = F)
bf = prob1/prob2
bfs[mytype] = bf
}
bfs
}
bfs = calc_bfs(of_interest)
prod(bfs)
```
```{r}
uk_branches = data_nodes %>% filter(total_muts> threshold_branch_length)
uk_node_id = uk_branches$node_id
uk_muts = data_muts %>% filter(node_id %in% uk_node_id)
uk_muts = uk_muts%>% inner_join(ref_tib,by=c("nt_index"="index")) %>% rename(par=original_nt, mut=alternative_nt ) %>% group_by(node_id,par,context_before,context_after,mut) %>% tally()%>% mutate(type=paste0(par,mut))
grouped = uk_muts %>%
group_by(node_id)
calced = grouped%>%
group_split() %>%
map(~ calc_bfs(.)) %>%
bind_rows()
together = bind_cols(grouped %>% summarise() , calced)
together= together %>% mutate(comb_bf = GA*CT*AG*TC)%>% inner_join(uk_branches)
top_10_countries <- together %>%
count(consensus_country) %>%
arrange(desc(n)) %>%
head(15) %>%
pull(consensus_country)
ggplot(together %>%
filter(flagged, consensus_country %in% top_10_countries) ,aes(y=log2(GA*CT*TC*AG),x=num_descendants,color=consensus_country)) + geom_point()
```
@liamxg
Copy link

liamxg commented Apr 30, 2024

Dear @theosanderson,

Could you please tell me where to find the ref_tib file?

@theosanderson
Copy link
Author

Sorry, I can't provide support on gists

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment