R script to run the Twitter AnomalyDetection algorithms on the NAB dataset.
################################################################################
# This script runs the Twitter AnomalyDetection algorithms on the NAB data set.
#
# You must first install the AnomalyDetection package:
# https://github.com/twitter/AnomalyDetection#how-to-get-started
#
# You must also have NAB installed and specify the path at the bottom of this
# script.
################################################################################
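# One way to install the package, per the README linked above (this assumes
# the devtools package is available; adjust to your setup):
#   install.packages("devtools")
#   devtools::install_github("twitter/AnomalyDetection")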
library(methods)
library(AnomalyDetection)
library(jsonlite)
# Populate an anomaly_score column: 1.0 for each row flagged by the detector,
# 0.0 everywhere else. AnomalyDetectionTs reports anomalies by timestamp,
# whereas AnomalyDetectionVec reports them by row index.
addDetections <- function(anomalyDataFrame, detections, algorithmName) {
  anomalyDataFrame$anomaly_score = 0.0
  if (length(detections$anoms) > 0) {
    for (i in 1:nrow(detections$anoms)) {
      if (algorithmName == "twitterADTs") {
        idx = match(detections$anoms[i, 1], anomalyDataFrame$timestamp)
      } else if (algorithmName == "twitterADVec") {
        idx = detections$anoms[i, 1]
      }
      anomalyDataFrame[idx,]$anomaly_score = 1.0
    }
  }
  return(anomalyDataFrame)
}
# Populate a label column: 1 for rows whose timestamp falls inside one of the
# ground truth anomaly windows, 0 otherwise.
addLabels <- function(anomalyDataFrame, anomalyBounds) {
  anomalyDataFrame$label = 0
  if (length(anomalyBounds) != 0) {
    for (i in 1:nrow(anomalyBounds)) {
      lower = anomalyBounds[i, 1]
      upper = anomalyBounds[i, 2]
      idx = anomalyDataFrame$timestamp >= lower & anomalyDataFrame$timestamp <= upper
      idx[is.na(idx)] = FALSE
      anomalyDataFrame[idx,]$label = 1
    }
  }
  return(anomalyDataFrame)
}
# Run the specified Twitter detector on a single NAB data file and return the
# results object (or NULL if AnomalyDetectionTs fails on that file).
runTwitter <- function(algorithmName, nab_data, filename) {
  if (algorithmName == "twitterADTs") {
    results = tryCatch(
      {
        message(paste(
          "Attempting detection w/ AnomalyDetectionTs on", filename))
        AnomalyDetectionTs(
          nab_data, max_anoms=0.0008, direction='both', plot=FALSE)
      },
      error = function(cond) {
        message(paste("Unable to run the algorithm for", filename))
        return(NULL)
      }
    )
  } else if (algorithmName == "twitterADVec") {
    message(paste("Detecting w/ AnomalyDetectionVec on", filename))
    results = AnomalyDetectionVec(
      nab_data[,2], alpha=0.05, period=150, max_anoms=0.0020, direction='both',
      plot=FALSE)
  }
  message("Results...")
  print(results$anoms)
  return(results)
}
main <- function(pathToNAB, algorithmName, skipFiles=list()) {
  # pathToNAB (character): string specifying path to the NAB dir.
  # algorithmName (character): either 'twitterADTs' or 'twitterADVec'.
  # skipFiles (list): file names to skip; useful in debugging.
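  # Example call (hypothetical path and file name, adjust to your setup):
  #   main("/home/user/NAB", "twitterADVec", skipFiles=list("example.csv"))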
  # Format dates: coerce from character class to nabDate class
  setClass("nabDate")
  setAs(
    "character",
    "nabDate",
    function(from) as.POSIXlt(from, format="%Y-%m-%d %H:%M:%OS"))

  # Setup paths to NAB data and results
  nabDataDir = paste(pathToNAB, "data", sep='/')
  dataDirs = list.files(nabDataDir)
  resultsDir = paste(pathToNAB, "results", algorithmName, sep='/')

  # Get the truth anomaly windows
  windows = fromJSON(paste(pathToNAB, "labels/combined_windows.json", sep='/'))

  for (dDir in dataDirs) {
    dataFiles = list.files(paste(nabDataDir, dDir, sep='/'))
    for (dFile in dataFiles) {
      if (is.element(dFile, skipFiles)) {
        next
      }
      # Get the data and run the detector
      dataName = paste(dDir, dFile, sep='/')
      dFilePath = paste(nabDataDir, dataName, sep='/')
      nab_data = read.csv(dFilePath, colClasses=c("nabDate", "numeric"))
      results = runTwitter(algorithmName, nab_data, dFilePath)

      # Populate dataframe with anomaly scores and truth labels
      nab_data = addDetections(nab_data, results, algorithmName)
      nab_data = addLabels(nab_data, windows[[dataName]])

      # Write results to csv
      resultsFileName = paste(algorithmName, dFile, sep='_')
      write.csv(
        nab_data,
        paste(resultsDir, dDir, resultsFileName, sep='/'),
        row.names=FALSE)
    }
  }
}
pathToNAB = "path/to/nab"
skipFiles = list()
algorithmNames = list("twitterADVec", "twitterADTs")

for (alg in algorithmNames) {
  main(pathToNAB, alg, skipFiles=skipFiles)
}