# mattbaggott/predictingbreaks.R

## predictingbreaks.R
#######################
#
# short investigation of id'ing breakpoints in data
#
# matt@baggott.net

library("bcp")
library("ggplot2")
library("changepoint")
library("reshape")
# should investigate newer library("cpm")

#######################
#
#  load data

# Observed values in time order. The series decays slowly and then jumps
# back up; those jumps are the breakpoints this script tries to identify.
data <- c(
  746, 743, 735, 735, 728, 717, 709, 691, 746, 738, 732, 728, 722, 782, 781, 778, 776,
  775, 766, 763, 763, 761, 758, 750, 744, 737, 731, 723, 712, 706, 767, 764, 755, 746, 746, 744, 738, 735,
  726, 718, 717, 708, 694, 759, 756, 749, 744, 744, 740, 738, 738, 737, 728, 711, 705, 699, 753, 749,
  744, 743, 735, 731, 723, 723, 718, 709, 703, 769, 764, 756, 750, 743, 738, 734, 725, 720, 712, 708,
  756, 749, 741, 737, 737, 731, 705, 705, 685, 779, 776, 775, 782, 785, 781, 776, 775, 769, 769, 759,
  759, 756, 752, 747, 738, 734, 725, 712, 767, 763, 758, 755, 753, 752, 744, 738, 729, 732, 725, 720,
  709, 702, 700, 779, 766, 763, 758, 755, 753, 753, 747, 744, 737, 729, 723, 715, 706, 705, 696, 782,
  778, 772, 769, 772, 775, 775, 775, 772, 772, 767, 750, 755, 753, 752, 749, 752, 752, 749, 743, 743,
  732, 732, 728, 725, 720, 715, 711, 703, 696
)

plot(data) # take a look

data1 <- data.frame(vals = data) # make a data frame

# 'time' is the observation index. seq_len(nrow()) gives a plain integer
# vector; row() on a data frame returns a one-column matrix, which would be
# stored as a matrix column instead of an ordinary one.
data1$time <- seq_len(nrow(data1))

#######################
#
#  many change point approaches are looking for changes in mean or variance
#  which we don't have here, and that is why the approach is failing, I think
#  with the increased data in the newer example, it is estimating the mean better.
#  One way around that would be to use sequential differences
#  or some other derived value, here we use diff()

# First differences of the series, left-padded with a zero (the first
# observation has no predecessor) so the column is as long as the data.
data1[["diffs"]] <- c(0, diff(data1[["vals"]]))

#######################
#
#  a simple answer might be something like this

# Flag a break wherever the step up from the previous point exceeds 30;
# the differences computed just above are reused rather than recomputed.
data1[["bigdiffs"]] <- data1[["diffs"]] > 30

#######################
# or we make classifiers using the differences between points

#######################
# BCP method on diffs

# Run bcp() (from the bcp package) on the differenced series and keep its
# per-observation posterior.prob component as a column.
data1[["bcpfit"]] <- bcp(data1[["diffs"]])[["posterior.prob"]]


#######################
# PELT method on diffs

# Fit PELT to the differenced series twice (both from changepoint): once for
# changes in variance only, once for joint changes in mean and variance.
# The raw @cpts slot ends with the series length as a closing entry, so
# reading it directly always flags the final observation as a break. The
# cpts() accessor returns the changepoint locations without that last entry.
cpts1 <- cpts(cpt.var(data1$diffs, method = "PELT"))
cpts2 <- cpts(cpt.meanvar(data1$diffs, method = "PELT"))

# Indicator columns: 1 at each detected changepoint, 0 everywhere else.
# Indexing the column itself (rather than a row subset of the data frame)
# is also safe when no changepoint was found.
data1$cptfit1 <- 0          # populate the variable
data1$cptfit1[cpts1] <- 1   # assign based on cpts1

data1$cptfit2 <- 0          # populate the variable
data1$cptfit2[cpts2] <- 1   # assign based on cpts2

#######################
# reorder columns for convenience: time should be first

# Put the columns in a fixed order with 'time' first, then the raw values,
# then each derived break indicator.
data1 <- data1[c(
  "time", "vals", "diffs", "bigdiffs",
  "cptfit1", "cptfit2", "bcpfit"
)]

#######################
# plot results

# Reshape to long format, keeping 'time' as the identifier, so every other
# column becomes a (variable, value) series; melt() comes from reshape.
data.m <- melt(data1, id.vars = "time")
head(data.m)
summary(data.m)

# Draw each variable as a line in its own facet row, with free scales.
ggplot(data.m, aes(x = time, y = value, colour = variable)) +
  geom_line() +
  facet_grid(variable ~ ., scales = "free")

#######################
#
# RESULTS: it looks like the simple bigdiffs approach outperforms the others
#          maybe simpler is better
#
# HOWEVER, if we wanted to hand label a bunch of data, we could use it to train a classifier
#          that could combine these approaches
	#######################
	#
	# short investigation of id'ing breakpoints in data
	#
	# matt@baggott.net

	library("bcp")
	library("ggplot2")
	library("changepoint")
	library("reshape")
	# should investigate newer library("cpm")

	#######################
	#
	# load data

	# Observed values in time order. The series decays slowly and then jumps
	# back up; those jumps are the breakpoints this script tries to identify.
	data <- c(
		746, 743, 735, 735, 728, 717, 709, 691, 746, 738, 732, 728, 722, 782, 781, 778, 776,
		775, 766, 763, 763, 761, 758, 750, 744, 737, 731, 723, 712, 706, 767, 764, 755, 746, 746, 744, 738, 735,
		726, 718, 717, 708, 694, 759, 756, 749, 744, 744, 740, 738, 738, 737, 728, 711, 705, 699, 753, 749,
		744, 743, 735, 731, 723, 723, 718, 709, 703, 769, 764, 756, 750, 743, 738, 734, 725, 720, 712, 708,
		756, 749, 741, 737, 737, 731, 705, 705, 685, 779, 776, 775, 782, 785, 781, 776, 775, 769, 769, 759,
		759, 756, 752, 747, 738, 734, 725, 712, 767, 763, 758, 755, 753, 752, 744, 738, 729, 732, 725, 720,
		709, 702, 700, 779, 766, 763, 758, 755, 753, 753, 747, 744, 737, 729, 723, 715, 706, 705, 696, 782,
		778, 772, 769, 772, 775, 775, 775, 772, 772, 767, 750, 755, 753, 752, 749, 752, 752, 749, 743, 743,
		732, 732, 728, 725, 720, 715, 711, 703, 696
	)

	plot(data) # take a look

	data1 <- data.frame(vals = data) # make a data frame

	# 'time' is the observation index. seq_len(nrow()) gives a plain integer
	# vector; row() on a data frame returns a one-column matrix, which would be
	# stored as a matrix column instead of an ordinary one.
	data1$time <- seq_len(nrow(data1))

	#######################
	#
	# many change point approaches are looking for changes in mean or variance
	# which we don't have here, and that is why the approach is failing, I think
	# with the increased data in the newer example, it is estimating the mean better.
	# One way around that would be to use sequential differences
	# or some other derived value, here we use diff()

	# First differences of the series, left-padded with a zero (the first
	# observation has no predecessor) so the column is as long as the data.
	data1[["diffs"]] <- c(0, diff(data1[["vals"]]))

	#######################
	#
	# a simple answer might be something like this

	# Flag a break wherever the step up from the previous point exceeds 30;
	# the differences computed just above are reused rather than recomputed.
	data1[["bigdiffs"]] <- data1[["diffs"]] > 30

	#######################
	# or we make classifiers using the differences between points

	#######################
	# BCP method on diffs

	# Run bcp() (from the bcp package) on the differenced series and keep its
	# per-observation posterior.prob component as a column.
	data1[["bcpfit"]] <- bcp(data1[["diffs"]])[["posterior.prob"]]


	#######################
	# PELT method on diffs

	# Fit PELT to the differenced series twice (both from changepoint): once for
	# changes in variance only, once for joint changes in mean and variance.
	# The raw @cpts slot ends with the series length as a closing entry, so
	# reading it directly always flags the final observation as a break. The
	# cpts() accessor returns the changepoint locations without that last entry.
	cpts1 <- cpts(cpt.var(data1$diffs, method = "PELT"))
	cpts2 <- cpts(cpt.meanvar(data1$diffs, method = "PELT"))

	# Indicator columns: 1 at each detected changepoint, 0 everywhere else.
	# Indexing the column itself (rather than a row subset of the data frame)
	# is also safe when no changepoint was found.
	data1$cptfit1 <- 0 # populate the variable
	data1$cptfit1[cpts1] <- 1 # assign based on cpts1

	data1$cptfit2 <- 0 # populate the variable
	data1$cptfit2[cpts2] <- 1 # assign based on cpts2

	#######################
	# reorder columns for convenience: time should be first

	# Put the columns in a fixed order with 'time' first, then the raw values,
	# then each derived break indicator.
	data1 <- data1[c(
		"time", "vals", "diffs", "bigdiffs",
		"cptfit1", "cptfit2", "bcpfit"
	)]

	#######################
	# plot results

	# Reshape to long format, keeping 'time' as the identifier, so every other
	# column becomes a (variable, value) series; melt() comes from reshape.
	data.m <- melt(data1, id.vars = "time")
	head(data.m)
	summary(data.m)

	# Draw each variable as a line in its own facet row, with free scales.
	ggplot(data.m, aes(x = time, y = value, colour = variable)) +
		geom_line() +
		facet_grid(variable ~ ., scales = "free")

	#######################
	#
	# RESULTS: it looks like the simple bigdiffs approach outperforms the others
	# maybe simpler is better
	#
	# HOWEVER, if we wanted to hand label a bunch of data, we could use it to train a classifier
	# that could combine these approaches