Skip to content

Instantly share code, notes, and snippets.

@KamilSJaron
Created June 2, 2022 16:43
Show Gist options
  • Save KamilSJaron/62266a05774d438739b590e15c191f94 to your computer and use it in GitHub Desktop.
Save KamilSJaron/62266a05774d438739b590e15c191f94 to your computer and use it in GitHub Desktop.
Fit basic genomic models to kmer spectrum
#!/usr/bin/env Rscript
# this is a minimalist version of two tissue mdoel from https://github.com/RossLab/genomic-evidence-of-PGE-in-globular-springtails
# which is just an application of GenomeTelescope: https://github.com/KamilSJaron/GenomeTelescope
suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("minpack.lm"))
suppressPackageStartupMessages(library("nlstools"))
###############
## FUNCTIONS ##
###############
### Original genomescope model
nls_4peak <- function(x, y, k, estKmercov, estLength, max_iterations = 40){
model4 = NULL
cat("Original genomescope model 'nls_4peak'\n")
try(model4 <- nlsLM(y ~ (((2*(1-d)*(1-(1-r)^k)) + (2*d*(1-(1-r)^k)^2) + (2*d*((1-r)^k)*(1-(1-r)^k))) * dnbinom(x, size = kmercov / bias, mu = kmercov) * length +
(((1-d)*((1-r)^k)) + (d*(1-(1-r)^k)^2)) * dnbinom(x, size = kmercov*2 / bias, mu = kmercov * 2) * length +
(2*d*((1-r)^k)*(1-(1-r)^k)) * dnbinom(x, size = kmercov*3 / bias, mu = kmercov * 3) * length +
(d*(1-r)^(2*k)) * dnbinom(x, size = kmercov*4 / bias, mu = kmercov * 4) * length),
start = list(d=0, r=0, kmercov=estKmercov, bias = 0.5, length=estLength),
control = list(minFactor=1e-12, maxiter=max_iterations)), silent = F)
if(class(model4) == "try-error"){
try(model4 <- nls(y ~ (((2*(1-d)*(1-(1-r)^k)) + (2*d*(1-(1-r)^k)^2) + (2*d*((1-r)^k)*(1-(1-r)^k))) * dnbinom(x, size = kmercov / bias, mu = kmercov) * length +
(((1-d)*((1-r)^k)) + (d*(1-(1-r)^k)^2)) * dnbinom(x, size = kmercov*2 / bias, mu = kmercov * 2) * length +
(2*d*((1-r)^k)*(1-(1-r)^k)) * dnbinom(x, size = kmercov*3 / bias, mu = kmercov * 3) * length +
(d*(1-r)^(2*k)) * dnbinom(x, size = kmercov*4 / bias, mu = kmercov * 4) * length),
start = list(d=0, r=0, kmercov=estKmercov, bias = 0.5, length=estLength),
algorithm="port", control = list(minFactor=1e-12, maxiter=max_iterations)), silent = F)
}
return(model4)
}
# two tissue model
nlsLM_2peak_peak_sizes <- function(x, y, kmerEst, lengthEst, hetEst = 0.6){
basic_model <- nlsLM(y ~ ((het * dnbinom(x, size = kmercov / bias, mu = kmercov)) +
((1 - het) * dnbinom(x, size = (2 * kmercov) / bias, mu = (2 * kmercov)))) * length,
start = list(kmercov = kmerEst, bias = 0.5, length = lengthEst, het = hetEst),
control = list(minFactor=1e-12, maxiter=40))
return(basic_model)
}
predict_1n_peak_nlsLM_2peak_unconditional <- function(model){
model_env <- model$m$getEnv()
cov_1n <- model_env$kmercov
model_env$het * dnbinom(model_env$x, size = cov_1n / model_env$bias, mu = cov_1n) * model_env$length
}
predict_2n_peak_nlsLM_2peak_unconditional <- function(model){
model_env <- model$m$getEnv()
(1 - model_env$het) * dnbinom(model_env$x, size = (model_env$kmercov * 2) / model_env$bias, mu = (model_env$kmercov * 2)) * model_env$length
}
coverage_barplot <- function(bar_heights, bar_positions, font_size = 1, width = 0.5, xlim = NA, bty = 'o'){
if ( any(is.na(xlim)) ){
xlim <- range(bar_positions)
}
plot(bar_heights, type="n", xlab="Coverage", ylab="Number of k-mers",
ylim=c(0, max(bar_heights)), xlim=xlim, bty = bty,
cex.lab=font_size, cex.axis=font_size, cex.main=font_size, cex.sub=font_size)
for ( i in 1:length(bar_heights)){
rect(bar_positions[i] - width, 0, bar_positions[i] + width, bar_heights[i], col = 'deepskyblue', border = F)
}
}
plot_PGE_model <- function(model, xlim = NA, vertical_lines = T, cex = 1, bty = 'o', legend = T){
x <- model$m$getEnv()$x
y <- model$m$getEnv()$y
cov_1n <- coef(model)['kmercov']
cov_2n <- 2 * cov_1n
monoploid_col <- 'darkorchid4'
diploid_col <- 'chocolate'
coverage_barplot(y, x, xlim = xlim, font_size = cex, bty = bty)
disomic_prediction <- predict_2n_peak_nlsLM_2peak_unconditional(model)
monosomic_prediction <- predict_1n_peak_nlsLM_2peak_unconditional(model)
lines(predict(model, response = T) ~ x, lwd = 3)
lines(monosomic_prediction ~ x, lwd = 3, col = monoploid_col)
lines(disomic_prediction ~ x, lwd = 3, col = diploid_col)
if ( vertical_lines ){
lines(c(cov_1n, cov_1n) - 0.5, c(0, max(y)), lty = 2)
lines(c(cov_2n, cov_2n) - 0.5, c(0, max(y)), lty = 2)
}
if ( legend ){
legend('topright',
c('kmer histogram','full model', 'monoploid', 'diploid'),
col = c('deepskyblue','black', monoploid_col, diploid_col),
lty = c(NA, 1, 1, 1), lwd = 3,
pch = c(15, NA, NA, NA), bty = 'n', cex = cex)
}
}
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# create parser object
parser <- ArgumentParser()
# specify our desired options
# parser$add_argument("-v", "--verbose", action="store_true", default=TRUE,
# help="Print extra output [default]")
parser$add_argument("-i", "--input", dest="kmer_hist_file",
help="Input histogram file")
parser$add_argument("-o", "--output_pattern", dest="out",
help="The output pattern for the summary and plot")
args <- parser$parse_args()
# kmer_spectrum <- read.table('output/kmcdb_k21.hist', col.names = c('coverage', 'frequency'))
kmer_spectrum <- read.table(args$kmer_hist_file, col.names = c('coverage', 'frequency'))
write("loaded k-mer spectra", stderr())
# removing errors from the model (removing all monotonicly decreasing values)
min_coverage <- min(which(diff(kmer_spectrum$frequency) > 0))
# and capping 99.9% of kmers in the dataset excluding erros
max_coverage <- min(which(cumsum(kmer_spectrum$frequency[min_coverage:nrow(kmer_spectrum)]) / sum(kmer_spectrum$frequency[min_coverage:nrow(kmer_spectrum)]) > 0.98))
cov_range <- min_coverage:max_coverage
# to get coverage mode I just use second derivation
second_deriv <- diff(sign(diff(kmer_spectrum$frequency[cov_range])))
peak_covs <- kmer_spectrum$coverage[cov_range][which(second_deriv == -2) + 1]
peak_heights <- kmer_spectrum$frequency[cov_range][which(second_deriv == -2) + 1]
coverage_mode <- peak_covs[which.max(peak_heights)]
symmetric_range <- min_coverage:(2 * coverage_mode - min_coverage)
coverage_mean <- weighted.mean(kmer_spectrum$coverage[symmetric_range], kmer_spectrum$frequency[symmetric_range])
length_prior <- sum(kmer_spectrum$coverage[cov_range] * kmer_spectrum$frequency[cov_range]) / coverage_mode
write(paste("Prior for coverage range: ", min_coverage, max_coverage), stderr())
write(paste("Prior for genome length:", length_prior), stderr())
write(paste("Trying several coverage priors"), stderr())
PGE_model_mode <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mode, length_prior, 0.5)
PGE_model_mean <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mean, length_prior, 0.5)
PGE_model_seven_tenths_mean <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mean * 0.7, length_prior, 0.5)
PGE_model_half_mean <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mean / 2, length_prior, 0.5)
GenomeScope_model <- nls_4peak(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], 21, coverage_mode / 2, length_prior)
PGE_models <- list(PGE_model_half_mean, PGE_model_seven_tenths_mean, PGE_model_mode, PGE_model_mean)
best_model <- which.min(sapply(PGE_models, deviance))
PGE_model <- PGE_models[[best_model]]
###
# summary of results
###
genomescope_estimates <- coef(GenomeScope_model)
peak_size_model_estimates <- coef(PGE_model)
propotion_CI <- confint2(PGE_model)['proportion',]
names(propotion_CI) <- c('CI_l_prop', 'CI_u_prop')
peak_size_model_estimates <- c(peak_size_model_estimates, propotion_CI)
write("GenomeScope estimates: ", stderr())
write(paste(names(genomescope_estimates), '=', round(genomescope_estimates, 4)), stderr())
write("Peak size model estimates: ", stderr())
write(paste(names(peak_size_model_estimates), '=', round(peak_size_model_estimates, 4)), stderr())
write(paste(c(genomescope_estimates, peak_size_model_estimates), sep = '\t'), stdout())
output_summary_file <- paste0(args$out, '_summary_tsv')
write(paste(c(names(genomescope_estimates), names(peak_size_model_estimates)), collapse = '\t'), output_summary_file)
write(paste(c(genomescope_estimates, peak_size_model_estimates), collapse = '\t'), output_summary_file, append = T)
###
# plot
###
output_plot <- paste0(args$out, '_plot.png')
png(output_plot)
plot(kmer_spectrum[cov_range, 1], kmer_spectrum[cov_range, 2], xlab = 'Coverage', ylab = 'Frequency')
lines(kmer_spectrum[cov_range, 1], predict(GenomeScope_model), lty = 1, lwd = 2, col = 'blue')
lines(kmer_spectrum[cov_range, 1], predict(PGE_model), lty = 2, lwd = 2, col = 'purple')
legend('topright', c('simulated k-mer spectra', 'GenomeScope model', 'Minimal model'), pch = c(1, NA, NA), lty = c(NA, 1, 2), lwd = 2, col = c('black', 'blue', 'purple'), bty = 'n')
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment