Last active
August 20, 2023 04:58
-
-
Save ArtPoon/4c6cb4bdae02bf4bbd659b131a2afa9e to your computer and use it in GitHub Desktop.
Generate a Bayesian skyline plot from BEAST log files using R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(ape) | |
#' get.intervals | |
#' | |
#' Scan through BEAST tree log line-by-line and parse each | |
#' Newick string to retrieve the coalescent intervals | |
#' | |
#' @param treefile: (character) relative or absolute path to trees log file | |
#' @return List containing objects of class coalescentIntervals (ape) keyed | |
#' by state (MCMC step number) | |
get.intervals <- function(treefile) { | |
result <- list() | |
con <- file(treefile, open='r') | |
while (length(line <- readLines(con, n=1, warn=FALSE)) > 0) { | |
if (grepl("^tree", line)) { | |
state <- gsub("^.+STATE_([0-9]+) .+$", "\\1", line) | |
#nwkstr <- strsplit(line, " ")[[1]][4] # does not support BEAST1 | |
nwkstr <- gsub("^[^(]+\\((.+);.*$", "(\\1;", line) | |
phy <- read.tree(text=nwkstr) | |
intervals <- coalescent.intervals(phy) | |
result[[state]] <- intervals | |
} | |
} | |
close(con) | |
result | |
} | |
#' get.skyline | |
#' | |
#' Generate coordinates of Bayesian skyline plot by averaging the | |
#' effective population size estimates over time. | |
#' | |
#' @param logfile: path to BEAST trace log file | |
#' @param treefile: path to BEAST tree log file | |
#' @param p.burnin: proportion of chain sample to discard, starting from 0 | |
#' @param thin: thinning interval, defaults to 1 (retain all post-burnin samples) | |
#' @param bin.count: number of bins to calculate skyline trends | |
#' | |
#' @return S3 object of class 'skyline' | |
get.skyline <- function(logfile, treefile, p.burnin=0.1, thin=1, bin.count=100) { | |
# load log file | |
log <- read.table(logfile, sep='\t', comment.char='#', header=T) | |
# parse tree file | |
intervals <- get.intervals(treefile) | |
# determine if this is a BEAST1 or BEAST2 logfile | |
if (is.element("treeModel.rootHeight", names(log))) { | |
t.heights <- log$treeModel.rootHeight | |
} else if (is.element("TreeHeight", names(log))) { | |
t.heights <- log$TreeHeight | |
} else { | |
stop("Failed to locate tree height parameter in log file") | |
} | |
# maximum time is the root height's mean/median | |
t.mean <- mean(t.heights) | |
q <- quantile(t.heights, c(0.025, 0.5, 0.975)) | |
t.lower <- q[1] # <- default | |
t.median <- q[2] | |
t.upper <- q[3] | |
max.height <- t.lower | |
age.youngest <- 0 # plot backwards in time | |
min.time <- 0 | |
max.time <- max.height - age.youngest | |
delta <- (max.time - min.time) / (bin.count-1) | |
group.size.count <- sum(grepl("groupSize", names(log))) | |
# store age at the end of each group | |
group.times <- matrix(NA, nrow=nrow(log), ncol=group.size.count) | |
# determine time points of population size transitions for every sample | |
n.states <- nrow(log) | |
if (is.element("Sample", names(log))) { | |
states <- log$Sample | |
} else if (is.element("state", names(log))) { | |
states <- log$state | |
} else { | |
stop("Failed to locate chain step number in log") | |
} | |
thinned <- seq(round(n.states*p.burnin), n.states, thin) | |
for (i in thinned) { | |
state <- as.character(states[i]) | |
group.sizes <- log[i, grepl("groupSize", names(log))] | |
# get cumulative times | |
cumul.t <- cumsum(intervals[[state]]$interval.length) | |
group.times[i, ] <- cumul.t[ cumsum(as.integer(group.sizes)) ] | |
} | |
# assign population sizes to bins | |
heights <- seq(0, max.height, length.out=bin.count) | |
bins <- matrix(nrow=nrow(log), ncol=bin.count) | |
for (i in thinned) { | |
state <- as.character(states[i]) | |
pop.sizes <- log[i, grepl("popSize", names(log))] | |
index <- as.integer(cut(heights, c(0, group.times[i,]), right=F)) | |
bins[i,] <- unlist(sapply(index, function(i) { | |
ifelse (is.na(i), NA, pop.sizes[i]) | |
})) | |
} | |
# remove burnin rows before returning | |
bins <- na.omit(bins) | |
skyline <- list( | |
raw.data=bins, | |
mean=apply(bins, 2, mean), | |
median=apply(bins, 2, median), | |
lower=apply(bins, 2, function(x) quantile(x, 0.025)), | |
upper=apply(bins, 2, function(x) quantile(x, 0.975)), | |
time=heights | |
) | |
class(skyline) <- 'skyline' | |
skyline | |
} | |
#' plot | |
#' | |
#' Generic plot method for objects of class 'skyline' | |
#' @examples | |
#' logfile <- '~/Downloads/beast/bin/MTBC-skyline2.log' | |
#' treefile <- "~/Downloads/beast/bin/MTBC-skyline2.trees" | |
#' sky <- get.skyline(logfile, treefile) | |
#' plot(sky) | |
plot.skyline <- function(obj, ylim=NA, | |
poly.border=NA, poly.col='lightblue', | |
line.col='black', line.lwd=2, | |
xlab='Time', ylab='Effective population size', | |
...) { | |
if (is.na(ylim)) { | |
ylim <- range(obj$raw.data) | |
} | |
plot(obj$time, obj$mean, type='n', ylim=ylim, log='y', | |
xlab=xlab, ylab=ylab, ...) | |
polygon(x=c(obj$time, rev(obj$time)), | |
y=c(obj$upper, rev(obj$lower)), | |
border=poly.border, col=poly.col) | |
lines(obj$time, obj$mean, col=line.col, lwd=line.lwd) | |
} |
I had to make some changes recently to help another user - might have broken support for a particular BEAST version or model. Can you send me an XML by email to reproduce please?
Does your log file really contain the variable name Tree.height
?
If so you should be able to just tweak this code block:
if (is.element("treeModel.rootHeight", names(log))) {
t.heights <- log$treeModel.rootHeight
} else if (is.element("TreeHeight", names(log))) {
t.heights <- log$TreeHeight
} else {
stop("Failed to locate tree height parameter in log file")
}
So that it becomes:
if (is.element("treeModel.rootHeight", names(log))) {
t.heights <- log$treeModel.rootHeight
} else if (is.element("Tree.height", names(log))) {
t.heights <- log$Tree.height
} else {
stop("Failed to locate tree height parameter in log file")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello, I am trying to use your code, but when executing:
I get this error:
Error in get.skyline(logfile, treefile) :
Failed to locate tree height parameter in log file
Do you know what it could be?
NOTE: the Tree.height column is in a .log file with the respective data.