Created
June 2, 2022 16:43
-
-
Save KamilSJaron/62266a05774d438739b590e15c191f94 to your computer and use it in GitHub Desktop.
Fit basic genomic models to kmer spectrum
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# this is a minimalist version of two tissue mdoel from https://github.com/RossLab/genomic-evidence-of-PGE-in-globular-springtails | |
# which is just an application of GenomeTelescope: https://github.com/KamilSJaron/GenomeTelescope | |
suppressPackageStartupMessages(library("argparse")) | |
suppressPackageStartupMessages(library("minpack.lm")) | |
suppressPackageStartupMessages(library("nlstools")) | |
############### | |
## FUNCTIONS ## | |
############### | |
### Original genomescope model | |
nls_4peak <- function(x, y, k, estKmercov, estLength, max_iterations = 40){ | |
model4 = NULL | |
cat("Original genomescope model 'nls_4peak'\n") | |
try(model4 <- nlsLM(y ~ (((2*(1-d)*(1-(1-r)^k)) + (2*d*(1-(1-r)^k)^2) + (2*d*((1-r)^k)*(1-(1-r)^k))) * dnbinom(x, size = kmercov / bias, mu = kmercov) * length + | |
(((1-d)*((1-r)^k)) + (d*(1-(1-r)^k)^2)) * dnbinom(x, size = kmercov*2 / bias, mu = kmercov * 2) * length + | |
(2*d*((1-r)^k)*(1-(1-r)^k)) * dnbinom(x, size = kmercov*3 / bias, mu = kmercov * 3) * length + | |
(d*(1-r)^(2*k)) * dnbinom(x, size = kmercov*4 / bias, mu = kmercov * 4) * length), | |
start = list(d=0, r=0, kmercov=estKmercov, bias = 0.5, length=estLength), | |
control = list(minFactor=1e-12, maxiter=max_iterations)), silent = F) | |
if(class(model4) == "try-error"){ | |
try(model4 <- nls(y ~ (((2*(1-d)*(1-(1-r)^k)) + (2*d*(1-(1-r)^k)^2) + (2*d*((1-r)^k)*(1-(1-r)^k))) * dnbinom(x, size = kmercov / bias, mu = kmercov) * length + | |
(((1-d)*((1-r)^k)) + (d*(1-(1-r)^k)^2)) * dnbinom(x, size = kmercov*2 / bias, mu = kmercov * 2) * length + | |
(2*d*((1-r)^k)*(1-(1-r)^k)) * dnbinom(x, size = kmercov*3 / bias, mu = kmercov * 3) * length + | |
(d*(1-r)^(2*k)) * dnbinom(x, size = kmercov*4 / bias, mu = kmercov * 4) * length), | |
start = list(d=0, r=0, kmercov=estKmercov, bias = 0.5, length=estLength), | |
algorithm="port", control = list(minFactor=1e-12, maxiter=max_iterations)), silent = F) | |
} | |
return(model4) | |
} | |
# two tissue model | |
nlsLM_2peak_peak_sizes <- function(x, y, kmerEst, lengthEst, hetEst = 0.6){ | |
basic_model <- nlsLM(y ~ ((het * dnbinom(x, size = kmercov / bias, mu = kmercov)) + | |
((1 - het) * dnbinom(x, size = (2 * kmercov) / bias, mu = (2 * kmercov)))) * length, | |
start = list(kmercov = kmerEst, bias = 0.5, length = lengthEst, het = hetEst), | |
control = list(minFactor=1e-12, maxiter=40)) | |
return(basic_model) | |
} | |
predict_1n_peak_nlsLM_2peak_unconditional <- function(model){ | |
model_env <- model$m$getEnv() | |
cov_1n <- model_env$kmercov | |
model_env$het * dnbinom(model_env$x, size = cov_1n / model_env$bias, mu = cov_1n) * model_env$length | |
} | |
predict_2n_peak_nlsLM_2peak_unconditional <- function(model){ | |
model_env <- model$m$getEnv() | |
(1 - model_env$het) * dnbinom(model_env$x, size = (model_env$kmercov * 2) / model_env$bias, mu = (model_env$kmercov * 2)) * model_env$length | |
} | |
coverage_barplot <- function(bar_heights, bar_positions, font_size = 1, width = 0.5, xlim = NA, bty = 'o'){ | |
if ( any(is.na(xlim)) ){ | |
xlim <- range(bar_positions) | |
} | |
plot(bar_heights, type="n", xlab="Coverage", ylab="Number of k-mers", | |
ylim=c(0, max(bar_heights)), xlim=xlim, bty = bty, | |
cex.lab=font_size, cex.axis=font_size, cex.main=font_size, cex.sub=font_size) | |
for ( i in 1:length(bar_heights)){ | |
rect(bar_positions[i] - width, 0, bar_positions[i] + width, bar_heights[i], col = 'deepskyblue', border = F) | |
} | |
} | |
plot_PGE_model <- function(model, xlim = NA, vertical_lines = T, cex = 1, bty = 'o', legend = T){ | |
x <- model$m$getEnv()$x | |
y <- model$m$getEnv()$y | |
cov_1n <- coef(model)['kmercov'] | |
cov_2n <- 2 * cov_1n | |
monoploid_col <- 'darkorchid4' | |
diploid_col <- 'chocolate' | |
coverage_barplot(y, x, xlim = xlim, font_size = cex, bty = bty) | |
disomic_prediction <- predict_2n_peak_nlsLM_2peak_unconditional(model) | |
monosomic_prediction <- predict_1n_peak_nlsLM_2peak_unconditional(model) | |
lines(predict(model, response = T) ~ x, lwd = 3) | |
lines(monosomic_prediction ~ x, lwd = 3, col = monoploid_col) | |
lines(disomic_prediction ~ x, lwd = 3, col = diploid_col) | |
if ( vertical_lines ){ | |
lines(c(cov_1n, cov_1n) - 0.5, c(0, max(y)), lty = 2) | |
lines(c(cov_2n, cov_2n) - 0.5, c(0, max(y)), lty = 2) | |
} | |
if ( legend ){ | |
legend('topright', | |
c('kmer histogram','full model', 'monoploid', 'diploid'), | |
col = c('deepskyblue','black', monoploid_col, diploid_col), | |
lty = c(NA, 1, 1, 1), lwd = 3, | |
pch = c(15, NA, NA, NA), bty = 'n', cex = cex) | |
} | |
} | |
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
# create parser object | |
parser <- ArgumentParser() | |
# specify our desired options | |
# parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, | |
# help="Print extra output [default]") | |
parser$add_argument("-i", "--input", dest="kmer_hist_file", | |
help="Input histogram file") | |
parser$add_argument("-o", "--output_pattern", dest="out", | |
help="The output pattern for the summary and plot") | |
args <- parser$parse_args() | |
# kmer_spectrum <- read.table('output/kmcdb_k21.hist', col.names = c('coverage', 'frequency')) | |
kmer_spectrum <- read.table(args$kmer_hist_file, col.names = c('coverage', 'frequency')) | |
write("loaded k-mer spectra", stderr()) | |
# removing errors from the model (removing all monotonicly decreasing values) | |
min_coverage <- min(which(diff(kmer_spectrum$frequency) > 0)) | |
# and capping 99.9% of kmers in the dataset excluding erros | |
max_coverage <- min(which(cumsum(kmer_spectrum$frequency[min_coverage:nrow(kmer_spectrum)]) / sum(kmer_spectrum$frequency[min_coverage:nrow(kmer_spectrum)]) > 0.98)) | |
cov_range <- min_coverage:max_coverage | |
# to get coverage mode I just use second derivation | |
second_deriv <- diff(sign(diff(kmer_spectrum$frequency[cov_range]))) | |
peak_covs <- kmer_spectrum$coverage[cov_range][which(second_deriv == -2) + 1] | |
peak_heights <- kmer_spectrum$frequency[cov_range][which(second_deriv == -2) + 1] | |
coverage_mode <- peak_covs[which.max(peak_heights)] | |
symmetric_range <- min_coverage:(2 * coverage_mode - min_coverage) | |
coverage_mean <- weighted.mean(kmer_spectrum$coverage[symmetric_range], kmer_spectrum$frequency[symmetric_range]) | |
length_prior <- sum(kmer_spectrum$coverage[cov_range] * kmer_spectrum$frequency[cov_range]) / coverage_mode | |
write(paste("Prior for coverage range: ", min_coverage, max_coverage), stderr()) | |
write(paste("Prior for genome length:", length_prior), stderr()) | |
write(paste("Trying several coverage priors"), stderr()) | |
PGE_model_mode <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mode, length_prior, 0.5) | |
PGE_model_mean <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mean, length_prior, 0.5) | |
PGE_model_seven_tenths_mean <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mean * 0.7, length_prior, 0.5) | |
PGE_model_half_mean <- nlsLM_2peak_peak_sizes(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], coverage_mean / 2, length_prior, 0.5) | |
GenomeScope_model <- nls_4peak(kmer_spectrum[cov_range, 'coverage'], kmer_spectrum[cov_range, 'frequency'], 21, coverage_mode / 2, length_prior) | |
PGE_models <- list(PGE_model_half_mean, PGE_model_seven_tenths_mean, PGE_model_mode, PGE_model_mean) | |
best_model <- which.min(sapply(PGE_models, deviance)) | |
PGE_model <- PGE_models[[best_model]] | |
### | |
# summary of results | |
### | |
genomescope_estimates <- coef(GenomeScope_model) | |
peak_size_model_estimates <- coef(PGE_model) | |
propotion_CI <- confint2(PGE_model)['proportion',] | |
names(propotion_CI) <- c('CI_l_prop', 'CI_u_prop') | |
peak_size_model_estimates <- c(peak_size_model_estimates, propotion_CI) | |
write("GenomeScope estimates: ", stderr()) | |
write(paste(names(genomescope_estimates), '=', round(genomescope_estimates, 4)), stderr()) | |
write("Peak size model estimates: ", stderr()) | |
write(paste(names(peak_size_model_estimates), '=', round(peak_size_model_estimates, 4)), stderr()) | |
write(paste(c(genomescope_estimates, peak_size_model_estimates), sep = '\t'), stdout()) | |
output_summary_file <- paste0(args$out, '_summary_tsv') | |
write(paste(c(names(genomescope_estimates), names(peak_size_model_estimates)), collapse = '\t'), output_summary_file) | |
write(paste(c(genomescope_estimates, peak_size_model_estimates), collapse = '\t'), output_summary_file, append = T) | |
### | |
# plot | |
### | |
output_plot <- paste0(args$out, '_plot.png') | |
png(output_plot) | |
plot(kmer_spectrum[cov_range, 1], kmer_spectrum[cov_range, 2], xlab = 'Coverage', ylab = 'Frequency') | |
lines(kmer_spectrum[cov_range, 1], predict(GenomeScope_model), lty = 1, lwd = 2, col = 'blue') | |
lines(kmer_spectrum[cov_range, 1], predict(PGE_model), lty = 2, lwd = 2, col = 'purple') | |
legend('topright', c('simulated k-mer spectra', 'GenomeScope model', 'Minimal model'), pch = c(1, NA, NA), lty = c(NA, 1, 2), lwd = 2, col = c('black', 'blue', 'purple'), bty = 'n') | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment