# christrotter/CanaryInterestingMetrics.ps1

## CanaryInterestingMetrics.ps1
# Treat every error as terminating so a failed import or lookup stops the script.
$ErrorActionPreference = "Stop"
# Read from the DeployScripts action's output variable; the project modules are loaded from beneath this path.
$modulePath = $OctopusParameters['Octopus.Action[DeployScripts].Output.Package.InstallationDirectoryPath']

# Modules that are resolved by name.
foreach ($installedModule in 'AWSPowerShell', 'WebAdministration') {
    Import-Module $installedModule
}

# Modules loaded by path from the package, in the same order as before.
$packagedModules = @(
    'Elastico\Elastico.psd1'
    'Elasticsearch\PS-Elasticsearch.psm1'
    'Manager-CanaryFunctions.psm1'
    'FilebeatFunctions.psm1'
)
foreach ($packagedModule in $packagedModules) {
    Import-Module "$modulePath\scripts\modules\$packagedModule"
}

####################################################################################################################
####################################################################################################################
# Configuration: every value below is read from the Octopus variable dictionary ($OctopusParameters).
# NOTE(review): several of these are not referenced again in this script; they may be consumed by the
# imported modules - confirm before removing any of them.

# AWS variables
$accessKey = $OctopusParameters['aws.Access']
$secretKey = $OctopusParameters['aws.Secret']
$region    = $OctopusParameters['aws.Region']

# ELBV2 variables
$appName        = $OctopusParameters['elbv2.appName'] # used in an IIS context - site name, appPool name
$oldBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisOldSiteId"]
# When CanaryStart reports 'onboarding' as the old site id, the old site name is the bare app name;
# otherwise it is the app name suffixed with the old build number.
if ($oldBuildNumber -eq 'onboarding') {
    Write-Output "ONBOARDING: Setting oldAppName to appName."
    $oldAppName = "$appName"
}
else {
    $oldAppName = "$appName-$oldBuildNumber"
}
$newBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisNewSiteId"]
# The new site name is also the application_name value used in the ELK queries further down.
$newAppName     = "$appName-$newBuildNumber"
$targetGroupArn = $OctopusParameters['elbv2.arnName']
$canaryTargetGroupArn = $OctopusParameters['elbv2.canaryArnName']
[int]$checkInterval = $OctopusParameters['elbv2.stateCheckInterval'] # seconds between instance ELBV2 state checks
[int]$maxChecks     = $OctopusParameters['elbv2.stateMaxChecks']  # maximum checks to run when checking instance ELBV2 state

# Warmup variables
$hostname = $appName # used in a DNS context
$uri      = $OctopusParameters['warmup.uri']
$MaxWarmupAttempts      = $OctopusParameters['warmup.maxWarmupAttempts']   # number of times to call the test URI
[int]$time_taken_target = $OctopusParameters['warmup.timeTakenTarget'] # the ELK query millisecond number

# Interesting metrics variables
$sleepTime                 = $OctopusParameters['metrics.sleepTime'] # how long to sleep and collect active traffic
$indexShortName            = $OctopusParameters['metrics.indexShortName']
[int]$lastMinutes          = $OctopusParameters['metrics.lastMinutes'] # how many minutes to look back (probably want to set this to "since deploy")
[int]$responseThresholdMs  = $OctopusParameters['metrics.responseThresholdMs'] # millisecond threshold for response times
# NOTE(review): the "P2" format specifier only applies to numeric values. If these Octopus variables
# arrive as strings (presumably they do - confirm), the raw text is passed through unchanged instead of
# being rendered as a percentage, so both thresholds below are strings either way.
$errorRateThreshold        = "{0:P2}" -f $OctopusParameters['metrics.errorRateThreshold'] # percent of requests that are 500s
$responseThreshold         = "{0:P2}" -f $OctopusParameters['metrics.responseThreshold'] # percent of requests that are over $responseThresholdMs

Write-Output "Populating AWS variables for the register-instance and deregister-instance functions..."
# Fetch this machine's instance id from the instance metadata endpoint ($response receives the same value).
$instanceId = $response = Invoke-RestMethod -Uri "http://169.254.169.254/latest/meta-data/instance-id" -Method Get
# Build the ELBV2 target description for this instance.
$target = New-Object -TypeName Amazon.ElasticLoadBalancingV2.Model.TargetDescription
$target.Id = $instanceId

####################################################################################################################
####################################################################################################################

<####################################################################################################################
                                                 SCRIPT STARTS HERE
####################################################################################################################>

<#
    This script is environment-agnostic, for now.  Once we get more metrics, we'll want to contextually target them.

    1. check interesting metrics
    2. determine pass/fail, send as an exit code

#>

Write-Output "Ok, our instance is showing as healthy in the ELB - time to run some tests of our interesting metrics!"

Write-Output "First off, we wait $($sleepTime)s to let the logs flood in..."
Start-Sleep -Seconds $sleepTime

# Query ELK four times for the new site (errors, not-errors, slow responses, fast responses) and derive
# two ratios from the hit counts:
#   $errorRate       = 500s / (500s + 200/301/302s)
#   $responsePercent = slow / (slow + fast), split at $responseThresholdMs
# The *Clean variants are the same numbers formatted as percentages for display.
# Any failure in here ends the script with exit code 1: without metrics the canary cannot be judged.
Try {
    Write-Output "To make our canary process actually useful, we should really target our log searching against a specific server."
    # NOTE(review): the hostname is only reported; none of the queries below filter on it.
    $beatHostname = hostname
    Write-Output "Our target hostname is: $beatHostname"

    $date = Get-Date
    # The search window has to start in the past. Taking the absolute value first keeps this correct
    # whether metrics.lastMinutes is configured as 15 or as -15.
    $lookBackMinutes = [math]::Abs($lastMinutes)
    $startDate = $date.AddMinutes(-$lookBackMinutes).ToUniversalTime().ToString("o")
    Write-Output "Our start date variable (modified by our lastMinutes variable: $lastMinutes) is: $startDate"
    Write-Output " "
    # Define the Lucene queries here
    #     the -f $newAppName stuff is 'Format String', i.e. find/replace
    #     ...because of how the Lucene query needs to be formatted, single vs. double quotes matter
    $responseQueryErrors       = 'application_name="{0}" AND response:500' -f $newAppName
    $responseQueryNotErrors    = 'application_name="{0}" AND response:(200 OR 301 OR 302)' -f $newAppName
    $responseQueryBadResponse  = 'application_name="{0}" AND time_taken:(>{1})' -f $newAppName, $responseThresholdMs
    $responseQueryGoodResponse = 'application_name="{0}" AND time_taken:(<{1})' -f $newAppName, $responseThresholdMs


    # Gather the data from ES
    Write-Output "Querying elk for errors: $responseQueryErrors"
    $searchOutputErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryErrors -startDate $startDate
    Write-Output "Querying elk for not-errors: $responseQueryNotErrors"
    $searchOutputNotErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryNotErrors -startDate $startDate
    Write-Output "Querying elk for bad response: $responseQueryBadResponse"
    $searchOutputHighResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryBadResponse -startDate $startDate
    Write-Output "Querying elk for good response: $responseQueryGoodResponse"
    $searchOutputLowResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryGoodResponse -startDate $startDate


    # Calculate the results
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for response errors vs. not-errors...i.e. Error Rate"
    Write-Output "We are taking the results for the search output and dividing error response count by total counts to find a percent error rate."
    # Zero errors short-circuits to a rate of 0, which also keeps the division away from a zero total.
    if ($searchOutputErrors.Documents.Count -eq 0) {
        $errorRate = 0
    }
    else {
        $totalDocumentsErrors = ($searchOutputErrors.Documents.Count + $searchOutputNotErrors.Documents.Count)
        $errorRate = ($searchOutputErrors.Documents.Count / $totalDocumentsErrors)
    }
    $errorRateClean = "{0:P2}" -f $errorRate
    Write-Output "Our error rate is $errorRateClean, and our threshold is $errorRateThreshold."
    Write-Output "######################################################"
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for high vs low response times..."
    Write-Output "We are taking the results from the search output and dividing above-time-taken-threshold counts vs. total counts to get a percent."
    # Same guard as above: no slow responses means 0, and the divisor is never zero.
    if ($searchOutputHighResponse.Documents.Count -eq 0) {
        $responsePercent = 0
    }
    else {
        $totalDocumentsResponseTime = ($searchOutputHighResponse.Documents.Count + $searchOutputLowResponse.Documents.Count)
        $responsePercent = ($searchOutputHighResponse.Documents.Count / $totalDocumentsResponseTime)
    }
    $responsePercentClean = "{0:P2}" -f $responsePercent
    Write-Output "$responsePercentClean of all requests are over $($responseThresholdMs)ms"
    Write-Output "######################################################"
    Write-Output " "


    # Some extra output just to lend easy/fast credence to the above
    Write-Output "######################################################"
    Write-Output "##            Some output for clarity               ##"
    Write-Output "######################################################"
    # Report the error-query hit count (this previously read an undefined variable and printed nothing).
    Write-Output "Here's how many hits for bad HTTP responses: $($searchOutputErrors.Documents.Count)"
    Write-Output "Here's how many hits for good HTTP responses: $($searchOutputNotErrors.Documents.Count)"
    Write-Output "Here's how many hits for high response time: $($searchOutputHighResponse.Documents.Count)"
    Write-Output "Here's how many hits for low response time: $($searchOutputLowResponse.Documents.Count)"

    Write-Output "If nothing shows below, no documents found!"
    Write-Output "Example good error document:"
    $searchOutputNotErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example bad error document:"
    $searchOutputErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example good response document: "
    $searchOutputLowResponse.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example bad response document: "
    $searchOutputHighResponse.Documents | Select-Object -First 1 | Format-List message

    Write-Output "######################################################"

}
Catch {
    Write-Output "Failed to query ELK!"
    Write-Output $_.Exception.Message
    Write-Output $_.Exception.StackTrace
    # A bare 'return' here would end the script without a failing exit code, letting an unmeasured
    # deploy through. Fail the step explicitly instead.
    Exit 1
}


##################################################
# Pass/fail gate: compare the measured ratios ($errorRate, $responsePercent) against the configured
# thresholds and report the verdict through the exit code (0 = pass, 1 = fail).
#
# The comparison is done on numbers. Comparing the "P2"-formatted strings is lexicographic, so for
# example "5.00 %" would rank above "10.00 %".

# Converts a threshold setting to a fraction. Accepts a plain number ("0.05") or a number with a
# trailing percent sign ("5 %"), both meaning 5%. An empty value converts to 0.
# NOTE(review): assumes plain numbers are fractions (0.05 = 5%), as implied by the P2 formatting
# used elsewhere in this script - confirm against the values configured in Octopus.
function ConvertTo-CanaryFraction {
    param([string]$RawValue)

    $text = "$RawValue".Trim()
    if ($text.EndsWith('%')) {
        $percent = [double]($text.TrimEnd('%').Trim())
        return ($percent / 100)
    }
    return [double]$text
}

$errorRateLimit      = ConvertTo-CanaryFraction $OctopusParameters['metrics.errorRateThreshold']
$responseLimit       = ConvertTo-CanaryFraction $OctopusParameters['metrics.responseThreshold']
$errorRateLimitClean = "{0:P2}" -f $errorRateLimit
$responseLimitClean  = "{0:P2}" -f $responseLimit

$failure = $false
Write-Output "We are now determining pass/fail based on the interesting metric thresholds..."

Write-Output "Checking interesting metric: Error Rate"
if ([double]$errorRate -gt $errorRateLimit) {
    Write-Output "Our error rate $errorRateClean is greater than the threshold of $errorRateLimitClean"
    $failure = $true
}
else {
    Write-Output "Successful error rate tests! We can move on..."
}

Write-Output "Checking interesting metric: % of responses over $responseThresholdMs"
if ([double]$responsePercent -gt $responseLimit) {
    # Reports the response-time threshold (the message used to print the error-rate threshold).
    Write-Output "Our % of requests over response time of $responseThresholdMs ($responsePercentClean) is greater than the threshold of $responseLimitClean"
    $failure = $true
}
else {
    Write-Output "Successful response time tests! We can move on..."
}


##################################################
if ($failure -eq $true) {
    Write-Output "Deploy failed on $hostname during our canary testing!!"
    Write-Output "Running the revert to remove from canary target group, stop new site, start old site, and put back into production target group!"
    Exit 1
}
else {
    Write-Output "Success!!  We have a good set of test results and interesting metrics!"

# Literal (single-quoted) here-string: a double-quoted one treats the backticks in the art as escape
# characters and drops them. The closing mark must stay at the start of its line.
$successMan = @'
.----------------.  .----------------.  .----------------.
| .--------------. || .--------------. || .--------------. |
| |    __        | || |     ____     | || |        __    | |
| |    \ \       | || |   .'    `.   | || |       / /    | |
| |     \ \      | || |  /  .--.  \  | || |      / /     | |
| |      \ \     | || |  | |    | |  | || |     / /      | |
| |       \ \    | || |  \  `--'  /  | || |    / /       | |
| |        \_\   | || |   `.____.'   | || |   /_/        | |
| |              | || |              | || |              | |
| '--------------' || '--------------' || '--------------' |
 '----------------'  '----------------'  '----------------'
'@
    Write-Output $successMan
    Exit 0
}
	# Treat every error as terminating so a failed import or lookup stops the script.
	$ErrorActionPreference = "Stop"
	# Read from the DeployScripts action's output variable; the project modules are loaded from beneath this path.
	$modulePath = $OctopusParameters['Octopus.Action[DeployScripts].Output.Package.InstallationDirectoryPath']

	# Modules that are resolved by name.
	foreach ($installedModule in 'AWSPowerShell', 'WebAdministration') {
		Import-Module $installedModule
	}

	# Modules loaded by path from the package, in the same order as before.
	$packagedModules = @(
		'Elastico\Elastico.psd1'
		'Elasticsearch\PS-Elasticsearch.psm1'
		'Manager-CanaryFunctions.psm1'
		'FilebeatFunctions.psm1'
	)
	foreach ($packagedModule in $packagedModules) {
		Import-Module "$modulePath\scripts\modules\$packagedModule"
	}

	####################################################################################################################
	####################################################################################################################
	# Configuration: every value below is read from the Octopus variable dictionary ($OctopusParameters).
	# NOTE(review): several of these are not referenced again in this script; they may be consumed by the
	# imported modules - confirm before removing any of them.

	# AWS variables
	$accessKey = $OctopusParameters['aws.Access']
	$secretKey = $OctopusParameters['aws.Secret']
	$region = $OctopusParameters['aws.Region']

	# ELBV2 variables
	$appName = $OctopusParameters['elbv2.appName'] # used in an IIS context - site name, appPool name
	$oldBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisOldSiteId"]
	# When CanaryStart reports 'onboarding' as the old site id, the old site name is the bare app name;
	# otherwise it is the app name suffixed with the old build number.
	if ($oldBuildNumber -eq 'onboarding') {
	Write-Output "ONBOARDING: Setting oldAppName to appName."
	$oldAppName = "$appName"
	}
	else {
	$oldAppName = "$appName-$oldBuildNumber"
	}
	$newBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisNewSiteId"]
	# The new site name is also the application_name value used in the ELK queries further down.
	$newAppName = "$appName-$newBuildNumber"
	$targetGroupArn = $OctopusParameters['elbv2.arnName']
	$canaryTargetGroupArn = $OctopusParameters['elbv2.canaryArnName']
	[int]$checkInterval = $OctopusParameters['elbv2.stateCheckInterval'] # seconds between instance ELBV2 state checks
	[int]$maxChecks = $OctopusParameters['elbv2.stateMaxChecks'] # maximum checks to run when checking instance ELBV2 state

	# Warmup variables
	$hostname = $appName # used in a DNS context
	$uri = $OctopusParameters['warmup.uri']
	$MaxWarmupAttempts = $OctopusParameters['warmup.maxWarmupAttempts'] # number of times to call the test URI
	[int]$time_taken_target = $OctopusParameters['warmup.timeTakenTarget'] # the ELK query millisecond number

	# Interesting metrics variables
	$sleepTime = $OctopusParameters['metrics.sleepTime'] # how long to sleep and collect active traffic
	$indexShortName = $OctopusParameters['metrics.indexShortName']
	[int]$lastMinutes = $OctopusParameters['metrics.lastMinutes'] # how many minutes to look back (probably want to set this to "since deploy")
	[int]$responseThresholdMs = $OctopusParameters['metrics.responseThresholdMs'] # millisecond threshold for response times
	# NOTE(review): the "P2" format specifier only applies to numeric values. If these Octopus variables
	# arrive as strings (presumably they do - confirm), the raw text is passed through unchanged instead of
	# being rendered as a percentage, so both thresholds below are strings either way.
	$errorRateThreshold = "{0:P2}" -f $OctopusParameters['metrics.errorRateThreshold'] # percent of requests that are 500s
	$responseThreshold = "{0:P2}" -f $OctopusParameters['metrics.responseThreshold'] # percent of requests that are over $responseThresholdMs

	Write-Output "Populating AWS variables for the register-instance and deregister-instance functions..."
	# Fetch this machine's instance id from the instance metadata endpoint ($response receives the same value).
	$instanceId = $response = Invoke-RestMethod -Uri "http://169.254.169.254/latest/meta-data/instance-id" -Method Get
	# Build the ELBV2 target description for this instance.
	$target = New-Object -TypeName Amazon.ElasticLoadBalancingV2.Model.TargetDescription
	$target.Id = $instanceId

	####################################################################################################################
	####################################################################################################################

	<####################################################################################################################
	SCRIPT STARTS HERE
	####################################################################################################################>

	<#
	This script is environment-agnostic, for now. Once we get more metrics, we'll want to contextually target them.

	1. check interesting metrics
	2. determine pass/fail, send as an exit code

	#>

	Write-Output "Ok, our instance is showing as healthy in the ELB - time to run some tests of our interesting metrics!"

	Write-Output "First off, we wait $($sleepTime)s to let the logs flood in..."
	Start-Sleep -Seconds $sleepTime

	# Query ELK four times for the new site (errors, not-errors, slow responses, fast responses) and derive
	# two ratios from the hit counts:
	#   $errorRate       = 500s / (500s + 200/301/302s)
	#   $responsePercent = slow / (slow + fast), split at $responseThresholdMs
	# The *Clean variants are the same numbers formatted as percentages for display.
	# Any failure in here ends the script with exit code 1: without metrics the canary cannot be judged.
	Try {
		Write-Output "To make our canary process actually useful, we should really target our log searching against a specific server."
		# NOTE(review): the hostname is only reported; none of the queries below filter on it.
		$beatHostname = hostname
		Write-Output "Our target hostname is: $beatHostname"

		$date = Get-Date
		# The search window has to start in the past. Taking the absolute value first keeps this correct
		# whether metrics.lastMinutes is configured as 15 or as -15.
		$lookBackMinutes = [math]::Abs($lastMinutes)
		$startDate = $date.AddMinutes(-$lookBackMinutes).ToUniversalTime().ToString("o")
		Write-Output "Our start date variable (modified by our lastMinutes variable: $lastMinutes) is: $startDate"
		Write-Output " "
		# Define the Lucene queries here
		# the -f $newAppName stuff is 'Format String', i.e. find/replace
		# ...because of how the Lucene query needs to be formatted, single vs. double quotes matter
		$responseQueryErrors = 'application_name="{0}" AND response:500' -f $newAppName
		$responseQueryNotErrors = 'application_name="{0}" AND response:(200 OR 301 OR 302)' -f $newAppName
		$responseQueryBadResponse = 'application_name="{0}" AND time_taken:(>{1})' -f $newAppName, $responseThresholdMs
		$responseQueryGoodResponse = 'application_name="{0}" AND time_taken:(<{1})' -f $newAppName, $responseThresholdMs


		# Gather the data from ES
		Write-Output "Querying elk for errors: $responseQueryErrors"
		$searchOutputErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryErrors -startDate $startDate
		Write-Output "Querying elk for not-errors: $responseQueryNotErrors"
		$searchOutputNotErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryNotErrors -startDate $startDate
		Write-Output "Querying elk for bad response: $responseQueryBadResponse"
		$searchOutputHighResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryBadResponse -startDate $startDate
		Write-Output "Querying elk for good response: $responseQueryGoodResponse"
		$searchOutputLowResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryGoodResponse -startDate $startDate


		# Calculate the results
		Write-Output " "
		Write-Output "######################################################"
		Write-Output "Calculating the metric for response errors vs. not-errors...i.e. Error Rate"
		Write-Output "We are taking the results for the search output and dividing error response count by total counts to find a percent error rate."
		# Zero errors short-circuits to a rate of 0, which also keeps the division away from a zero total.
		if ($searchOutputErrors.Documents.Count -eq 0) {
			$errorRate = 0
		}
		else {
			$totalDocumentsErrors = ($searchOutputErrors.Documents.Count + $searchOutputNotErrors.Documents.Count)
			$errorRate = ($searchOutputErrors.Documents.Count / $totalDocumentsErrors)
		}
		$errorRateClean = "{0:P2}" -f $errorRate
		Write-Output "Our error rate is $errorRateClean, and our threshold is $errorRateThreshold."
		Write-Output "######################################################"
		Write-Output " "
		Write-Output "######################################################"
		Write-Output "Calculating the metric for high vs low response times..."
		Write-Output "We are taking the results from the search output and dividing above-time-taken-threshold counts vs. total counts to get a percent."
		# Same guard as above: no slow responses means 0, and the divisor is never zero.
		if ($searchOutputHighResponse.Documents.Count -eq 0) {
			$responsePercent = 0
		}
		else {
			$totalDocumentsResponseTime = ($searchOutputHighResponse.Documents.Count + $searchOutputLowResponse.Documents.Count)
			$responsePercent = ($searchOutputHighResponse.Documents.Count / $totalDocumentsResponseTime)
		}
		$responsePercentClean = "{0:P2}" -f $responsePercent
		Write-Output "$responsePercentClean of all requests are over $($responseThresholdMs)ms"
		Write-Output "######################################################"
		Write-Output " "


		# Some extra output just to lend easy/fast credence to the above
		Write-Output "######################################################"
		Write-Output "## Some output for clarity ##"
		Write-Output "######################################################"
		# Report the error-query hit count (this previously read an undefined variable and printed nothing).
		Write-Output "Here's how many hits for bad HTTP responses: $($searchOutputErrors.Documents.Count)"
		Write-Output "Here's how many hits for good HTTP responses: $($searchOutputNotErrors.Documents.Count)"
		Write-Output "Here's how many hits for high response time: $($searchOutputHighResponse.Documents.Count)"
		Write-Output "Here's how many hits for low response time: $($searchOutputLowResponse.Documents.Count)"

		Write-Output "If nothing shows below, no documents found!"
		Write-Output "Example good error document:"
		# Pipelines restored: the escaped form "\|" is not valid PowerShell.
		$searchOutputNotErrors.Documents | Select-Object -First 1 | Format-List message
		Write-Output "Example bad error document:"
		$searchOutputErrors.Documents | Select-Object -First 1 | Format-List message
		Write-Output "Example good response document: "
		$searchOutputLowResponse.Documents | Select-Object -First 1 | Format-List message
		Write-Output "Example bad response document: "
		$searchOutputHighResponse.Documents | Select-Object -First 1 | Format-List message

		Write-Output "######################################################"

	}
	Catch {
		Write-Output "Failed to query ELK!"
		Write-Output $_.Exception.Message
		Write-Output $_.Exception.StackTrace
		# A bare 'return' here would end the script without a failing exit code, letting an unmeasured
		# deploy through. Fail the step explicitly instead.
		Exit 1
	}


	##################################################
	# Pass/fail gate: compare the measured ratios ($errorRate, $responsePercent) against the configured
	# thresholds and report the verdict through the exit code (0 = pass, 1 = fail).
	#
	# The comparison is done on numbers. Comparing the "P2"-formatted strings is lexicographic, so for
	# example "5.00 %" would rank above "10.00 %".

	# Converts a threshold setting to a fraction. Accepts a plain number ("0.05") or a number with a
	# trailing percent sign ("5 %"), both meaning 5%. An empty value converts to 0.
	# NOTE(review): assumes plain numbers are fractions (0.05 = 5%), as implied by the P2 formatting
	# used elsewhere in this script - confirm against the values configured in Octopus.
	function ConvertTo-CanaryFraction {
		param([string]$RawValue)

		$text = "$RawValue".Trim()
		if ($text.EndsWith('%')) {
			$percent = [double]($text.TrimEnd('%').Trim())
			return ($percent / 100)
		}
		return [double]$text
	}

	$errorRateLimit      = ConvertTo-CanaryFraction $OctopusParameters['metrics.errorRateThreshold']
	$responseLimit       = ConvertTo-CanaryFraction $OctopusParameters['metrics.responseThreshold']
	$errorRateLimitClean = "{0:P2}" -f $errorRateLimit
	$responseLimitClean  = "{0:P2}" -f $responseLimit

	$failure = $false
	Write-Output "We are now determining pass/fail based on the interesting metric thresholds..."

	Write-Output "Checking interesting metric: Error Rate"
	if ([double]$errorRate -gt $errorRateLimit) {
		Write-Output "Our error rate $errorRateClean is greater than the threshold of $errorRateLimitClean"
		$failure = $true
	}
	else {
		Write-Output "Successful error rate tests! We can move on..."
	}

	Write-Output "Checking interesting metric: % of responses over $responseThresholdMs"
	if ([double]$responsePercent -gt $responseLimit) {
		# Reports the response-time threshold (the message used to print the error-rate threshold).
		Write-Output "Our % of requests over response time of $responseThresholdMs ($responsePercentClean) is greater than the threshold of $responseLimitClean"
		$failure = $true
	}
	else {
		Write-Output "Successful response time tests! We can move on..."
	}


	##################################################
	if ($failure -eq $true) {
		Write-Output "Deploy failed on $hostname during our canary testing!!"
		Write-Output "Running the revert to remove from canary target group, stop new site, start old site, and put back into production target group!"
		Exit 1
	}
	else {
		Write-Output "Success!! We have a good set of test results and interesting metrics!"

		# Literal (single-quoted) here-string: a double-quoted one treats the backticks in the art as
		# escape characters and drops them. The closing mark must sit at the very start of its line;
		# indented, it leaves the here-string unterminated.
		$successMan = @'
.----------------.  .----------------.  .----------------.
| .--------------. || .--------------. || .--------------. |
| |    __        | || |     ____     | || |        __    | |
| |    \ \       | || |   .'    `.   | || |       / /    | |
| |     \ \      | || |  /  .--.  \  | || |      / /     | |
| |      \ \     | || |  | |    | |  | || |     / /      | |
| |       \ \    | || |  \  `--'  /  | || |    / /       | |
| |        \_\   | || |   `.____.'   | || |   /_/        | |
| |              | || |              | || |              | |
| '--------------' || '--------------' || '--------------' |
 '----------------'  '----------------'  '----------------'
'@
		Write-Output $successMan
		Exit 0
	}