Last active
March 31, 2020 04:58
-
-
Save christophergandrud/5675688 to your computer and use it in GitHub Desktop.
Function for creating nonparamentric multiple change point plots with estimates from the ecp package.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Function for creating nonparamentric multiple change point plots with | |
#' estimates from the ecp package. | |
#' | |
#' @param data A data frame with the covariates and time variable. | |
#' @param Vars A character vector listing the names of the variates from | |
#' \code{data} to include in the nonparametric multiple change point analysis. | |
#' @param TimeVar A character string naming the time variable in \code{data}. | |
#' Must be in a format handled by POSIX, e.g. YYYY-MM-DD. | |
#' @param sig.lvl The level at which to sequentially test if a proposed change | |
#' point is statistically significant. | |
#' @param R The maximum number of random permutations to use in each iteration | |
#' of the permutaiton test. The permutation test p-value is calculated usign the | |
#' method outlined in Gandy (2009). | |
#' @param k Number of change point locations to estimate, surpressing the | |
#' permutation based testing. If \code{k=NULL} then only the statistically | |
#' significant estimated change points are returned. | |
#' @param min.size Minimum number of observations between change points. | |
#' @param alpha The moment index used for determining the distance between and | |
#' within segments. | |
#' @param PlotVars A character string of the variables from the analysis to | |
#' plot. | |
#' @param Grid Logical. If \code{TRUE} then each variable will be given its own | |
#' plot. If \code{FALSE} then values from all of the variables in \code{Var} | |
#' will be plotted on the same figure. | |
#' @param base_size numeric. The plot theme's base font size. | |
#' See \link{ggplot2}. | |
#' @param palette If a string, it will use that named palette. If a number, will | |
#' index into the list of palettes of appropriate type, | |
#' @param leg.name A character string. If \code{Facet = FALSE}, it allows you to | |
#' specify the legend name. | |
#' @param Titles A character vector of the same lenght as \code{Vars} giving | |
#' alternative title names for the plot titles. Only applies if | |
#' \code{Grid = TRUE}. | |
#' @param JustGraph logical indicating whether or not to only create a | |
#' grid-style time-series graph of the variables, without running a change point | |
#' analysis. When \code{JustGraph = TRUE} all other arguments apart from | |
#' \code{data}, \code{Vars}, \code{TimeVar}, \code{Titles} are ignored. | |
#' | |
#' @details Uses \code{\link{e.divisive}} to run a nonparametric multiple chang | |
#' point analysis (James and Matteson, 2013) on variables in a data frame. It | |
#' then uses \code{\link{ggplot2}} to plot the variable values with vertical | |
#' dashed lines indicating the estimated change points. | |
#' | |
#' | |
#' @seealso \link{ecp} and \link{ggplot2} | |
#' | |
#' @source Matteson, David S and Nicholas A James. 2014. ''A Nonparametric | |
#' Approach for Multiple Change Point Analysis of Multivariate Data.'' Journal | |
#' of the American Statistical Association 109(505):334-345. | |
#' | |
#' Gandy, A. (2009) "Sequential implementation of Monte Carlo tests with | |
#' uniformly bounded resampling risk." Journal of the American Statistical | |
#' Association. | |
#' | |
e.divGG <- function(data, Vars, TimeVar, sig.lvl = 0.05, R = 199, k = NULL, | |
min.size = 30, alpha = 1, PlotVars = NULL, Grid = TRUE, | |
base_size = 12, palette = "Set1", leg.name = "", | |
Titles = NULL, JustGraph = FALSE) | |
{ | |
# Load required packages | |
require(ecp) | |
require(reshape2) | |
require(ggplot2) | |
require(gridExtra) | |
# Create plot titles if non specified | |
if (is.null(Titles)) Titles <- Vars | |
# Create T x d matrix | |
DataMatrix <- as.matrix(data[, Vars]) | |
# Turn of faceting if DataMatrix only has 1 column | |
if (ncol(DataMatrix) == 1) Facet <- FALSE | |
# Function to graph only variables, not change points | |
NoCPGraph <- function(){ | |
KeepVars <- c(TimeVar, Vars) | |
SubData <- data[, KeepVars] | |
SubMolten <- melt(data = SubData, id.vars = TimeVar, | |
meansure.vars = Vars) | |
SubMolten[, 1] <- as.POSIXct(SubMolten[, 1]) | |
eachVar <- unique(SubMolten$variable) | |
VarLabels <- data.frame(eachVar, Titles, stringsAsFactors = FALSE) | |
names(SubMolten) <- c("Time", "variable", "value") | |
Rows <- 1:nrow(VarLabels) | |
p <- list() | |
for (i in Rows){ | |
SubData <- subset(SubMolten, variable == VarLabels[i, 1]) | |
Title_i <- VarLabels[i, 2] | |
p[[i]] <- ggplot(data = SubData, | |
aes(x = Time, y = value)) + | |
geom_line() + | |
xlab("") + ylab("") + | |
ggtitle(paste(Title_i, "\n")) + | |
theme_bw(base_size = base_size) | |
} | |
suppressWarnings(do.call(grid.arrange, p)) | |
} | |
if (JustGraph) NoCPGraph() | |
else if (!isTRUE(JustGraph)){ | |
# Estimate change points | |
CP <- e.divisive(X = DataMatrix, sig.lvl = sig.lvl, R = R, k = k, | |
min.size = min.size, alpha = alpha) | |
# Extract change points | |
CPEstimates <- CP$estimates | |
# Remove first/last points | |
# (these are the first and last values of the matrix) | |
CPEstSub <- CPEstimates[c(-1, -length(CPEstimates))] | |
if (length(CPEstSub) == 0){ | |
NoCPGraph() | |
message(paste("\nNo change points found at the", sig.lvl, | |
"significance level.\n")) | |
} | |
else if (length(CPEstSub) != 0){ | |
#Find corresponding TimeVar value | |
CPTimes <- as.POSIXct(data[CPEstSub, TimeVar]) | |
# Report estimates | |
CPMessage <- lapply(CPTimes, function(x) paste(x, "\n")) | |
message("\nChange points estimated at:") | |
message(CPMessage) | |
# Melt data frame so that it can be plotted | |
if (!is.null(PlotVars)){ | |
DataSub <- data[, c(TimeVar, PlotVars)] | |
DataMolten <- melt(data = DataSub, id.vars = TimeVar, | |
measure.vars = PlotVars) | |
} | |
else if (is.null(PlotVars)){ | |
DataSub <- data[, c(TimeVar, Vars)] | |
DataMolten <- melt(data = DataSub, id.vars = TimeVar, | |
measure.vars = Vars) | |
} | |
# Clean pre plotting | |
names(DataMolten) <- c("Time", "GroupVar", "Value") | |
DataMolten$Time <- as.POSIXct(DataMolten$Time) | |
DataMolten <- merge(DataMolten, CPTimes, all = TRUE) | |
names(DataMolten) <- c("Time", "GroupVar", "Value", "Lines") | |
DataMolten$Lines[DataMolten$Time != DataMolten$Lines] <- NA | |
# Plot | |
if (length(unique(DataMolten$GroupVar)) == 1){ | |
ggplot(data = DataMolten, aes(x = Time, y = Value)) + | |
geom_line() + | |
geom_vline(aes(xintercept = as.numeric(Lines)), | |
linetype = "longdash", colour = "#DE2D26") + | |
xlab("") + ylab("") + | |
theme_bw(base_size = base_size) | |
} | |
else if (length(unique(DataMolten$GroupVar)) > 1) { | |
if (Grid == FALSE){ | |
ggplot(data = DataMolten, aes(x = Time, y = Value, | |
group = GroupVar, colour = GroupVar)) + | |
geom_line() + | |
geom_vline(aes(xintercept = as.numeric(Lines)), | |
linetype = "longdash") + | |
scale_colour_brewer(palette = palette, | |
name = leg.name) + | |
xlab("") + ylab("") + | |
theme_bw(base_size = base_size) | |
} | |
else if (Grid == TRUE){ | |
eachVar <- unique(DataMolten$GroupVar) | |
VarLabels <- data.frame(eachVar, Titles, | |
stringsAsFactors = FALSE) | |
Rows <- 1:nrow(VarLabels) | |
p <- list() | |
for (i in Rows){ | |
SubData <- subset(DataMolten, | |
GroupVar == VarLabels[i, 1]) | |
Title_i <- VarLabels[i, 2] | |
p[[i]] <- ggplot(data = SubData, | |
aes(x = Time, y = Value)) + | |
geom_line() + | |
geom_vline(aes(xintercept = | |
as.numeric(Lines)), | |
linetype = "longdash", | |
colour = "#DE2D26") + | |
xlab("") + ylab("") + | |
ggtitle(paste(Title_i, "\n")) + | |
theme_bw(base_size = base_size) | |
} | |
suppressWarnings(do.call(grid.arrange, p)) | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment