# Canary-InterestingMetrics
# Configuration for the canary "interesting metrics" check.
# Every setting is read from the $OctopusParameters table; the variables defined
# here are consumed by the rest of this script.

# Treat every error as terminating so failures are not silently ignored.
$ErrorActionPreference = "Stop"

# Installation directory output by the DeployScripts action; project modules are loaded from it.
$modulePath = $OctopusParameters['Octopus.Action[DeployScripts].Output.Package.InstallationDirectoryPath']

Import-Module AWSPowerShell
Import-Module WebAdministration
Import-Module $modulePath\scripts\modules\Elastico\Elastico.psd1
Import-Module $modulePath\scripts\modules\Elasticsearch\PS-Elasticsearch.psm1
Import-Module $modulePath\scripts\modules\Manager-CanaryFunctions.psm1
Import-Module $modulePath\scripts\modules\FilebeatFunctions.psm1

####################################################################################################################
####################################################################################################################

# AWS variables
$accessKey = $OctopusParameters['aws.Access']
$secretKey = $OctopusParameters['aws.Secret']
$region = $OctopusParameters['aws.Region']

# ELBV2 variables
$appName = $OctopusParameters['elbv2.appName'] # used in an IIS context - site name, appPool name
$oldBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisOldSiteId"]
if ($oldBuildNumber -eq 'onboarding') {
    # During onboarding the old site name carries no build-number suffix.
    Write-Output "ONBOARDING: Setting oldAppName to appName."
    $oldAppName = "$appName"
}
else {
    $oldAppName = "$appName-$oldBuildNumber"
}
$newBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisNewSiteId"]
$newAppName = "$appName-$newBuildNumber"
$targetGroupArn = $OctopusParameters['elbv2.arnName']
$canaryTargetGroupArn = $OctopusParameters['elbv2.canaryArnName']
[int]$checkInterval = $OctopusParameters['elbv2.stateCheckInterval'] # seconds between instance ELBV2 state checks
[int]$maxChecks = $OctopusParameters['elbv2.stateMaxChecks'] # maximum checks to run when checking instance ELBV2 state

# Warmup variables
$hostname = $appName # used in a DNS context
$uri = $OctopusParameters['warmup.uri']
$MaxWarmupAttempts = $OctopusParameters['warmup.maxWarmupAttempts'] # number of times to call the test URI
[int]$time_taken_target = $OctopusParameters['warmup.timeTakenTarget'] # the ELK query millisecond number

# Interesting metrics variables
$sleepTime = $OctopusParameters['metrics.sleepTime'] # how long to sleep and collect active traffic
$indexShortName = $OctopusParameters['metrics.indexShortName']
[int]$lastMinutes = $OctopusParameters['metrics.lastMinutes'] # how many minutes to look back (prolly want to set this to "since deploy")
[int]$responseThresholdMs = $OctopusParameters['metrics.responseThresholdMs'] # millisecond threshold for response times

# The "P2" format specifier only applies to numeric values; a string argument is
# emitted unchanged. Cast to [double] first so the thresholds are rendered as
# percentages in the same form as the measured rates.
# NOTE(review): assumes the thresholds are configured as fractions (e.g. 0.05 for 5%),
# which is what the P2 format implies - confirm against the Octopus variable set.
# A non-numeric value will now fail the cast (and the step) instead of passing through.
[double]$errorRateThresholdValue = $OctopusParameters['metrics.errorRateThreshold'] # fraction of requests that are 500s
[double]$responseThresholdValue = $OctopusParameters['metrics.responseThreshold'] # fraction of requests that are over $responseThresholdMs
$errorRateThreshold = "{0:P2}" -f $errorRateThresholdValue
$responseThreshold = "{0:P2}" -f $responseThresholdValue

Write-Output "Populating AWS variables for the register-instance and deregister-instance functions..."
# Ask the instance metadata endpoint which EC2 instance this script is running on.
$instanceId = Invoke-RestMethod -Uri "http://169.254.169.254/latest/meta-data/instance-id" -Method Get
$target = New-Object -TypeName Amazon.ElasticLoadBalancingV2.Model.TargetDescription
$target.Id = $instanceId
####################################################################################################################
####################################################################################################################
<####################################################################################################################
SCRIPT STARTS HERE
####################################################################################################################>
<#
This script is environment-agnostic, for now. Once we get more metrics, we'll want to contextually target them.
1. check interesting metrics
2. determine pass/fail, send as an exit code

This section waits for traffic to be logged, runs four searches against ELK and
derives two ratios from the hit counts:
  $errorRate       - 500 responses / (500 responses + 200/301/302 responses)
  $responsePercent - requests over $responseThresholdMs / (requests over + requests under)
#>
Write-Output "Ok, our instance is showing as healthy in the ELB - time to run some tests of our interesting metrics!"
Write-Output "First off, we wait $($sleepTime)s to let the logs flood in..."
Start-Sleep -Seconds $sleepTime

Try {
    Write-Output "To make our canary process actually useful, we should really target our log searching against a specific server."
    $beatHostname = hostname
    Write-Output "Our target hostname is: $beatHostname"

    # NOTE(review): AddMinutes adds its argument, so this only "looks back" when
    # metrics.lastMinutes is configured as a negative number - confirm the configured value.
    $startDate = ((Get-Date).AddMinutes($lastMinutes)).ToUniversalTime().ToString("o")
    Write-Output "Our start date variable (modified by our lastMinutes variable: $lastMinutes) is: $startDate"
    Write-Output " "

    # Define the Lucene queries here
    # the -f $newAppName stuff is 'Format String', i.e. find/replace
    # ...because of how the Lucene query needs to be formatted, single vs. double quotes matter
    $responseQueryErrors = 'application_name="{0}" AND response:500' -f $newAppName
    $responseQueryNotErrors = 'application_name="{0}" AND response:(200 OR 301 OR 302)' -f $newAppName
    $responseQueryBadResponse = 'application_name="{0}" AND time_taken:(>{1})' -f $newAppName, $responseThresholdMs
    $responseQueryGoodResponse = 'application_name="{0}" AND time_taken:(<{1})' -f $newAppName, $responseThresholdMs

    # Gather the data from ES
    Write-Output "Querying elk for errors: $responseQueryErrors"
    $searchOutputErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryErrors -startDate $startDate
    Write-Output "Querying elk for not-errors: $responseQueryNotErrors"
    $searchOutputNotErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryNotErrors -startDate $startDate
    Write-Output "Querying elk for bad response: $responseQueryBadResponse"
    $searchOutputHighResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryBadResponse -startDate $startDate
    Write-Output "Querying elk for good response: $responseQueryGoodResponse"
    $searchOutputLowResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryGoodResponse -startDate $startDate

    # Calculate the results
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for response errors vs. not-errors...i.e. Error Rate"
    Write-Output "We are taking the results for the search output and dividing error response count by total counts to find a percent error rate."
    if ($searchOutputErrors.Documents.Count -eq 0) {
        # No error hits: the rate is zero, and this also avoids dividing by a zero total.
        $errorRate = 0
    }
    else {
        $totalDocumentsErrors = ($searchOutputErrors.Documents.Count + $searchOutputNotErrors.Documents.Count)
        $errorRate = ($searchOutputErrors.Documents.Count / $totalDocumentsErrors)
    }
    $errorRateClean = "{0:P2}" -f $errorRate
    Write-Output "Our error rate is $errorRateClean, and our threshold is $errorRateThreshold."
    Write-Output "######################################################"
    Write-Output " "

    Write-Output "######################################################"
    Write-Output "Calculating the metric for high vs low response times..."
    Write-Output "We are taking the results from the search output and dividing above-time-taken-threshold counts vs. total counts to get a percent."
    if ($searchOutputHighResponse.Documents.Count -eq 0) {
        # No slow hits: the share is zero, and this also avoids dividing by a zero total.
        $responsePercent = 0
    }
    else {
        $totalDocumentsResponseTime = ($searchOutputHighResponse.Documents.Count + $searchOutputLowResponse.Documents.Count)
        $responsePercent = ($searchOutputHighResponse.Documents.Count / $totalDocumentsResponseTime)
    }
    $responsePercentClean = "{0:P2}" -f $responsePercent
    Write-Output "$responsePercentClean of all requests are over $($responseThresholdMs)ms"
    Write-Output "######################################################"
    Write-Output " "

    # Some extra output just to lend easy/fast credence to the above
    Write-Output "######################################################"
    Write-Output "## Some output for clarity ##"
    Write-Output "######################################################"
    # The error hits live in $searchOutputErrors (this line previously read a variable that is never assigned).
    Write-Output "Here's how many hits for bad HTTP responses: $($searchOutputErrors.Documents.Count)"
    Write-Output "Here's how many hits for good HTTP responses: $($searchOutputNotErrors.Documents.Count)"
    Write-Output "Here's how many hits for high response time: $($searchOutputHighResponse.Documents.Count)"
    Write-Output "Here's how many hits for low response time: $($searchOutputLowResponse.Documents.Count)"
    Write-Output "If nothing shows below, no documents found!"
    Write-Output "Example good error document:"
    $searchOutputNotErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example bad error document:"
    $searchOutputErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example good response document: "
    $searchOutputLowResponse.Documents | Select-Object -first 1 | Format-List message
    Write-Output "Example bad response document: "
    $searchOutputHighResponse.Documents | Select-Object -first 1 | Format-List message
    Write-Output "######################################################"
}
Catch {
    Write-Output "Failed to query ELK!"
    Write-Output $_.Exception.Message
    Write-Output $_.Exception.StackTrace
    # Fail closed: without metrics the canary cannot be judged, so report a failing
    # exit code instead of returning without one.
    Exit 1
}
##################################################
# Pass/fail decision.
# Compares the measured ratios ($errorRate, $responsePercent) with the configured
# thresholds and exits 1 if either is exceeded, 0 otherwise.
$failure = $false
Write-Output "We are now determining pass/fail based on the interesting metric thresholds..."

# Compare numbers, not formatted text: -gt on two strings is a lexical comparison,
# under which "10.00%" sorts below "5.00%".
# NOTE(review): assumes the thresholds are configured as fractions (e.g. 0.05 for 5%),
# matching the P2 percent formatting used for display - confirm against the Octopus variable set.
[double]$errorRateLimit = $OctopusParameters['metrics.errorRateThreshold']
[double]$responseLimit = $OctopusParameters['metrics.responseThreshold']
[double]$measuredErrorRate = $errorRate
[double]$measuredResponsePercent = $responsePercent

# Display forms, rendered here so measured value and threshold always share one format.
$errorRateLimitText = "{0:P2}" -f $errorRateLimit
$responseLimitText = "{0:P2}" -f $responseLimit
$measuredErrorRateText = "{0:P2}" -f $measuredErrorRate
$measuredResponsePercentText = "{0:P2}" -f $measuredResponsePercent

Write-Output "Checking interesting metric: Error Rate"
if ($measuredErrorRate -gt $errorRateLimit) {
    Write-Output "Our error rate $measuredErrorRateText is greater than the threshold of $errorRateLimitText"
    $failure = $true
}
else {
    Write-Output "Successful error rate tests! We can move on..."
}

Write-Output "Checking interesting metric: % of responses over $responseThresholdMs"
if ($measuredResponsePercent -gt $responseLimit) {
    # Report the response-time threshold here (the message previously printed the error-rate threshold).
    Write-Output "Our % of requests over response time of $responseThresholdMs ($measuredResponsePercentText) is greater than the threshold of $responseLimitText"
    $failure = $true
}
else {
    Write-Output "Successful response time tests! We can move on..."
}

##################################################
if ($failure) {
    Write-Output "Deploy failed on $hostname during our canary testing!!"
    Write-Output "Running the revert to remove from canary target group, stop new site, start old site, and put back into production target group!"
    Exit 1
}
else {
    Write-Output "Success!! We have a good set of test results and interesting metrics!"
    # Literal (single-quoted) here-string: the banner contains backticks, which an
    # expandable here-string would treat as escape characters.
    # NOTE(review): the banner's internal spacing looks collapsed in this copy of the script; kept as found.
    $successMan = @'
.----------------. .----------------. .----------------.
| .--------------. || .--------------. || .--------------. |
| | __ | || | ____ | || | __ | |
| | \ \ | || | .' `. | || | / / | |
| | \ \ | || | / .--. \ | || | / / | |
| | \ \ | || | | | | | | || | / / | |
| | \ \ | || | \ `--' / | || | / / | |
| | \_\ | || | `.____.' | || | /_/ | |
| | | || | | || | | |
| '--------------' || '--------------' || '--------------' |
'----------------' '----------------' '----------------'
'@
    Write-Output $successMan
    Exit 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment