Created
June 13, 2018 00:08
-
-
Save geneva/b57ebd2008b114ef8ec2c19874f90e54 to your computer and use it in GitHub Desktop.
example execution of R package - rwty
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# aRe We There Yet (RWTY) Tutorial | |
# Bayesian Phylogenetic Inference using Markov Chain Monte Carlo (mcmc) | |
### Introduced in late 90s, now wildly popular | |
### Implemented in many software packages: MrBayes, BEAST, PhlyoBayes | |
# Phylogenetic inference computationally expensive, number of possible | |
# rooted trees for 60 taxa > number of protons in known universe | |
# Exhausitive tree search via optimality criteria such as maximum parsimony | |
# or maximum liklihood is NP-Hard | |
# mcmc - Efficient algorithm to rapidly explore vast parameter | |
# space (treespace) without getting stuck on local optima | |
# simplified MCMC proceedure | |
# propose topology and underlying model parameters | |
# | | |
# evaluate on some criterion (often LnL for phylo) | |
# | | |
# if criterion improved - accept | |
# | | |
# if not improved - accept with probability proportional to difference | |
# | | |
# perturb and estimate new topology and model parameters | |
# | | |
# repeat evaluation step | |
# For a detailed review of Bayesian Phylogenetics using | |
# Markov Chain Monte Carlo methods see: | |
# Nascimento et al 2017. Nature Ecology and Evolution 1:1446–1454. | |
# A unique issue to mcmc searches is that it is difficult to tell how | |
# long of a chain is long enough | |
# Key is to diagnose CONVERGENCE, point in a Markov chain where | |
# parameter estimates reach a stationary distribution, meaning | |
# that they have settled into a single region of parameter space. | |
# Portions of the chain prior to convergence are discarded as BURNIN | |
# Posterior parameter estimates are drawn from sampling values from the | |
# chain AFTER convergence. Setting burnin too early results in | |
# biased or poorly resolved topology and parameter estimates | |
# Many phylogenetic mcmc packages initiate multiple independent mcmc runs | |
# Provides added confidence in parameter estimates if these independent | |
# runs all converge on similar regions of parameter space | |
# Many ways phylogenetic mcmc can go wrong: | |
## Poor Mixing | |
### Chains get stuck in local optima | |
### Sampling too frequently (values too autocorrelated) | |
## Failure to Converge | |
### Flat likelihood space | |
### MOST COMMON: Chains are not given enough time to converge | |
# rwty - designed to assist in the evaluation of mcmc | |
#Other exisiting tools: | |
## Feature poor | |
## Rely on closed source software | |
####################### | |
# | |
# BEGIN TUTORIAL | |
# | |
####################### | |
# check if rwty is installed and if not, install it | |
if (!("rwty" %in% installed.packages())){ | |
install.packages("rwty", dependencies = TRUE) | |
} | |
#load rwty and dependencies | |
library(rwty) | |
####################### | |
# | |
# LOAD EXAMPLE DATA | |
# | |
####################### | |
# one locus four runs | |
data(fungus) | |
####################### | |
# | |
# DATA STRUCTURE | |
# | |
####################### | |
#show individual parts of data objects | |
names(fungus) | |
names(fungus$Fungus.Run1) | |
head(fungus$Fungus.Run1$ptable) | |
# four loci, 2 runs each | |
data(salamanders) | |
names(salamanders) | |
# reduce to single locus with 2 runs for comparison to fugus | |
salamanders.amotl <- list(salamanders[[1]], salamanders[[2]]) | |
##################################################################### | |
# | |
# | |
# GENERAL PARAMETER TRACE PLOTS | |
# | |
# plot and calculate Effective Sample Size (lagged autocorrelation) for | |
# params estimated by model. Phylo models can include 10s of params | |
# | |
# this work with any mcmc software: phylogenetics, phylogenetic | |
# comparative methods, and population structure analyses | |
# (e.g. Structure, Structurama) | |
# | |
##################################################################### | |
# plot LnL without burnin - variation obscured | |
makeplot.param(salamanders.amotl, burnin = 0, "LnL") | |
# plot with burnin | |
makeplot.param(salamanders.amotl, burnin = 50, "LnL") | |
makeplot.param(fungus, burnin = 50, "LnL") | |
# Can plot other logged parameters - here adenine base freq. | |
makeplot.param(fungus, burnin = 50, "pi.A.") | |
##################################################################### | |
# | |
# | |
# Tree-Specific Measures | |
# | |
# tools to assess convergence and mixing WITHIN chains of tree | |
# | |
##################################################################### | |
####################### | |
# | |
# TOPOLOGY PLOTS | |
# | |
# analogous to parameter | |
# plots above, this plots | |
# path distance to chain1 tree1 | |
# | |
# !!SLOW!! - ANY QUESTIONS? | |
# | |
####################### | |
makeplot.topology(salamanders.amotl, burnin = 50) | |
makeplot.topology(fungus, burnin = 50) | |
####################### | |
# | |
# Split frequency plots | |
# | |
# plots posterior prob. | |
# of 20 worst (most variable) nodes | |
# across mcmc generations | |
# | |
####################### | |
makeplot.splitfreqs.cumulative(salamanders.amotl, burnin = 50) | |
makeplot.splitfreqs.cumulative(fungus, burnin = 50) | |
####################### | |
# | |
# Parameter pairs plots | |
# | |
####################### | |
makeplot.pairs(salamanders[[1]], burnin = 50, params = c("LnL", "pi.A.", "pi.C.")) | |
makeplot.pairs(fungus[[1]], burnin = 50, params = c("LnL", "pi.A.", "pi.C.")) | |
####################### | |
# | |
# TreeSpace | |
# | |
# NMDS non-metric mulitdimensional | |
# scaling of tree disimilarity | |
# visualize treespace in 2D | |
# | |
####################### | |
makeplot.treespace(salamanders.amotl, burnin =50, fill.color = "LnL") | |
makeplot.treespace(fungus, burnin =50, fill.color = "LnL") | |
##################################################################### | |
# | |
# | |
# Comparing Independent Runs | |
# | |
# tools to assess convergence BETWEEN sets of trees | |
# | |
##################################################################### | |
####################### | |
# | |
# ASDSF PLOTS | |
# | |
# Average standard deviation | |
# of split frequencies | |
# (sd of node PP), commonly used | |
# to diagnose convergence between | |
# independent runs | |
# ribbons 75,95,100 CI | |
# | |
####################### | |
makeplot.asdsf(salamanders.amotl, burnin = 50) | |
makeplot.asdsf(fungus, burnin = 50) | |
####################### | |
# | |
# Split Frequency Plots | |
# | |
####################### | |
makeplot.splitfreq.matrix(salamanders, burnin = 50) | |
makeplot.splitfreq.matrix(fungus, burnin = 50) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment