# Created September 17, 2017 01:14
# Canary check setup: load modules and read every setting this step needs from
# the Octopus variable dictionary.

# Treat non-terminating errors as terminating so a broken step aborts the script.
$ErrorActionPreference = "Stop"

# Installation directory of the scripts package, read from the DeployScripts step's output variables.
$modulePath = $OctopusParameters['Octopus.Action[DeployScripts].Output.Package.InstallationDirectoryPath']

Import-Module AWSPowerShell
Import-Module WebAdministration
Import-Module $modulePath\scripts\modules\Elastico\Elastico.psd1
Import-Module $modulePath\scripts\modules\Elasticsearch\PS-Elasticsearch.psm1
Import-Module $modulePath\scripts\modules\Manager-CanaryFunctions.psm1
Import-Module $modulePath\scripts\modules\FilebeatFunctions.psm1

# AWS variables
$accessKey = $OctopusParameters['aws.Access']
$secretKey = $OctopusParameters['aws.Secret']
$region = $OctopusParameters['aws.Region']

# ELBV2 variables
$appName = $OctopusParameters['elbv2.appName'] # used in an IIS context - site name, appPool name
$oldBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisOldSiteId"]
if ($oldBuildNumber -eq 'onboarding') {
    # First deployment of this app: there is no build-suffixed old site to refer to.
    Write-Output "ONBOARDING: Setting oldAppName to appName."
    $oldAppName = "$appName"
}
else {
    $oldAppName = "$appName-$oldBuildNumber"
}
$newBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisNewSiteId"]
$newAppName = "$appName-$newBuildNumber"
$targetGroupArn = $OctopusParameters['elbv2.arnName']
$canaryTargetGroupArn = $OctopusParameters['elbv2.canaryArnName']
[int]$checkInterval = $OctopusParameters['elbv2.stateCheckInterval'] # seconds between instance ELBV2 state checks
[int]$maxChecks = $OctopusParameters['elbv2.stateMaxChecks'] # maximum checks to run when checking instance ELBV2 state

# Warmup variables
$hostname = $appName # used in a DNS context
$uri = $OctopusParameters['warmup.uri']
$MaxWarmupAttempts = $OctopusParameters['warmup.maxWarmupAttempts'] # number of times to call the test URI
[int]$time_taken_target = $OctopusParameters['warmup.timeTakenTarget'] # the ELK query millisecond number

# Interesting metrics variables
$sleepTime = $OctopusParameters['metrics.sleepTime'] # how long to sleep and collect active traffic
$indexShortName = $OctopusParameters['metrics.indexShortName']
[int]$lastMinutes = $OctopusParameters['metrics.lastMinutes'] # how many minutes to look back (prolly want to set this to "since deploy")
[int]$responseThresholdMs = $OctopusParameters['metrics.responseThresholdMs'] # millisecond threshold for response times

# Display strings for the two thresholds. The raw parameter is cast to [double]
# first: the P2 (percent) format specifier is ignored when the argument is a
# string, so without the cast the value is printed unformatted.
# NOTE(review): assumes the thresholds are configured as fractions (0.05 = 5%),
# as the P2 format implies - confirm against the Octopus variable set.
$errorRateThreshold = "{0:P2}" -f [double]$OctopusParameters['metrics.errorRateThreshold'] # percent of requests that are 500s
$responseThreshold = "{0:P2}" -f [double]$OctopusParameters['metrics.responseThreshold'] # percent of requests that are over $responseThresholdMs

Write-Output "Populating AWS variables for the register-instance and deregister-instance functions..."
# The original call passed an empty -Uri, which Invoke-RestMethod rejects.
# NOTE(review): the EC2 instance metadata endpoint is assumed here - confirm.
# Instances that enforce IMDSv2 additionally require a session token header.
$response = Invoke-RestMethod -Uri "http://169.254.169.254/latest/meta-data/instance-id" -Method Get
$instanceId = $response

# Target descriptor identifying this instance.
$target = New-Object -TypeName Amazon.ElasticLoadBalancingV2.Model.TargetDescription
$target.Id = $instanceId
# This script is environment-agnostic, for now. Once we get more metrics, we'll want to contextually target them.
#   1. Check the interesting metrics.
#   2. Determine pass/fail and send the result as an exit code.
# Metrics collection: wait for live traffic to be logged, query ELK for the new
# site's error and slow-response counts, and derive the two rates
# ($errorRate, $responsePercent) plus their display strings.
Write-Output "Ok, our instance is showing as healthy in the ELB - time to run some tests of our interesting metrics!"
Write-Output "First off, we wait $($sleepTime)s to let the logs flood in..."
Start-Sleep -Seconds $sleepTime

Try {
    Write-Output "To make our canary process actually useful, we should really target our log searching against a specific server."
    # NOTE(review): the hostname is only logged; none of the queries below filter on it.
    $beatHostname = hostname
    Write-Output "Our target hostname is: $beatHostname"

    # Start of the search window. The offset is forced negative so the window
    # always looks BACK from now, whether lastMinutes is configured as 5 or -5;
    # a positive offset would put the start date in the future.
    $startDate = (Get-Date).AddMinutes(-[math]::Abs($lastMinutes)).ToUniversalTime().ToString("o")
    Write-Output "Our start date variable (modified by our lastMinutes variable: $lastMinutes) is: $startDate"
    Write-Output " "

    # Define the Lucene queries here
    # the -f $newAppName stuff is 'Format String', i.e. find/replace
    # ...because of how the Lucene query needs to be formatted, single vs. double quotes matter
    $responseQueryErrors = 'application_name="{0}" AND response:500' -f $newAppName
    $responseQueryNotErrors = 'application_name="{0}" AND response:(200 OR 301 OR 302)' -f $newAppName
    $responseQueryBadResponse = 'application_name="{0}" AND time_taken:(>{1})' -f $newAppName, $responseThresholdMs
    $responseQueryGoodResponse = 'application_name="{0}" AND time_taken:(<{1})' -f $newAppName, $responseThresholdMs

    # Gather the data from ES
    Write-Output "Querying elk for errors: $responseQueryErrors"
    $searchOutputErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryErrors -startDate $startDate
    Write-Output "Querying elk for not-errors: $responseQueryNotErrors"
    $searchOutputNotErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryNotErrors -startDate $startDate
    Write-Output "Querying elk for bad response: $responseQueryBadResponse"
    $searchOutputHighResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryBadResponse -startDate $startDate
    Write-Output "Querying elk for good response: $responseQueryGoodResponse"
    $searchOutputLowResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryGoodResponse -startDate $startDate

    # Calculate the results
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for response errors vs. not-errors...i.e. Error Rate"
    Write-Output "We are taking the results for the search output and dividing error response count by total counts to find a percent error rate."
    if ($searchOutputErrors.Documents.Count -eq 0) {
        # No errors at all: short-circuit, which also avoids dividing by a zero total.
        $errorRate = 0
    }
    else {
        $totalDocumentsErrors = ($searchOutputErrors.Documents.Count + $searchOutputNotErrors.Documents.Count)
        $errorRate = ($searchOutputErrors.Documents.Count / $totalDocumentsErrors)
    }
    $errorRateClean = "{0:P2}" -f $errorRate
    Write-Output "Our error rate is $errorRateClean, and our threshold is $errorRateThreshold."
    Write-Output "######################################################"
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for high vs low response times..."
    Write-Output "We are taking the results from the search output and dividing above-time-taken-threshold counts vs. total counts to get a percent."
    if ($searchOutputHighResponse.Documents.Count -eq 0) {
        # No slow responses: short-circuit, which also avoids dividing by a zero total.
        $responsePercent = 0
    }
    else {
        $totalDocumentsResponseTime = ($searchOutputHighResponse.Documents.Count + $searchOutputLowResponse.Documents.Count)
        $responsePercent = ($searchOutputHighResponse.Documents.Count / $totalDocumentsResponseTime)
    }
    $responsePercentClean = "{0:P2}" -f $responsePercent
    Write-Output "$responsePercentClean of all requests are over $($responseThresholdMs)ms"
    Write-Output "######################################################"
    Write-Output " "

    # Some extra output just to lend easy/fast credence to the above
    Write-Output "######################################################"
    Write-Output "## Some output for clarity ##"
    Write-Output "######################################################"
    # Reports the error-query hit count; the variable previously referenced
    # here was never assigned, so this line always printed an empty count.
    Write-Output "Here's how many hits for bad HTTP responses: $($searchOutputErrors.Documents.Count)"
    Write-Output "Here's how many hits for good HTTP responses: $($searchOutputNotErrors.Documents.Count)"
    Write-Output "Here's how many hits for high response time: $($searchOutputHighResponse.Documents.Count)"
    Write-Output "Here's how many hits for low response time: $($searchOutputLowResponse.Documents.Count)"
    Write-Output "If nothing shows below, no documents found!"
    Write-Output "Example good error document:"
    $searchOutputNotErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example bad error document:"
    $searchOutputErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example good response document: "
    $searchOutputLowResponse.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example bad response document: "
    $searchOutputHighResponse.Documents | Select-Object -First 1 | Format-List message
    Write-Output "######################################################"
}
Catch {
    Write-Output "Failed to query ELK!"
    Write-Output $_.Exception.Message
    Write-Output $_.Exception.StackTrace
    # Fail closed: without metrics the canary cannot be judged. A bare 'return'
    # here ended the script without a failing exit code, so an ELK outage
    # looked like a passing canary.
    # NOTE(review): confirm that failing the step is the desired policy when ELK is unreachable.
    Exit 1
}
# Pass/fail decision: compare the measured rates against their thresholds and
# report the verdict through the exit code (1 = canary failed, 0 = passed).
$failure = $null

# Numeric thresholds for the comparisons. Comparing the formatted percent
# strings with -gt is a lexical comparison ("10.00 %" is NOT greater than
# "5.00 %"), so the rates are compared as numbers and the formatted strings
# are used for display only.
# NOTE(review): assumes the thresholds are configured as fractions (0.05 = 5%),
# matching the fractional rates computed from the document counts - confirm.
$errorRateLimit = [double]$OctopusParameters['metrics.errorRateThreshold']
$responseLimit = [double]$OctopusParameters['metrics.responseThreshold']
$errorRateLimitText = "{0:P2}" -f $errorRateLimit
$responseLimitText = "{0:P2}" -f $responseLimit

Write-Output "We are now determining pass/fail based on the interesting metric thresholds..."
Write-Output "Checking interesting metric: Error Rate"
if ([double]$errorRate -gt $errorRateLimit) {
    Write-Output "Our error rate $errorRateClean is greater than the threshold of $errorRateLimitText"
    $failure = $true
}
else {
    Write-Output "Successful error rate tests! We can move on..."
}

Write-Output "Checking interesting metric: % of responses over $responseThresholdMs"
if ([double]$responsePercent -gt $responseLimit) {
    # Reports the response-time threshold (the message previously printed the error-rate threshold).
    Write-Output "Our % of requests over response time of $responseThresholdMs ($responsePercentClean) is greater than the threshold of $responseLimitText"
    $failure = $true
}
else {
    Write-Output "Successful response time tests! We can move on..."
}

if ($failure -eq $true) {
    Write-Output "Deploy failed on $hostname during our canary testing!!"
    Write-Output "Running the revert to remove from canary target group, stop new site, start old site, and put back into production target group!"
    Exit 1
}
else {
    Write-Output "Success!! We have a good set of test results and interesting metrics!"
    # Literal (single-quoted) here-string: the banner contains backticks, which
    # an expandable here-string would treat as escape characters and drop.
    # The closing '@ must stay at the start of its line.
    $successMan = @'
.----------------. .----------------. .----------------.
| .--------------. || .--------------. || .--------------. |
| | __ | || | ____ | || | __ | |
| | \ \ | || | .' `. | || | / / | |
| | \ \ | || | / .--. \ | || | / / | |
| | \ \ | || | | | | | | || | / / | |
| | \ \ | || | \ `--' / | || | / / | |
| | \_\ | || | `.____.' | || | /_/ | |
| | | || | | || | | |
| '--------------' || '--------------' || '--------------' |
'----------------' '----------------' '----------------'
'@
    Write-Output $successMan
    Exit 0
}
