Created
September 17, 2017 01:14
-
-
Save christrotter/f078584582267b7825575dc4cecfa95c to your computer and use it in GitHub Desktop.
Canary-InterestingMetrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Canary "interesting metrics" step - setup section.
# Loads the required modules and copies the Octopus parameters into script variables.

# Promote non-terminating errors to terminating ones so the step stops on the first failure.
$ErrorActionPreference = "Stop"

# Installation directory reported by the DeployScripts action; the project modules are loaded from beneath it.
$modulePath = $OctopusParameters['Octopus.Action[DeployScripts].Output.Package.InstallationDirectoryPath']

Import-Module AWSPowerShell
Import-Module WebAdministration
Import-Module $modulePath\scripts\modules\Elastico\Elastico.psd1
Import-Module $modulePath\scripts\modules\Elasticsearch\PS-Elasticsearch.psm1
Import-Module $modulePath\scripts\modules\Manager-CanaryFunctions.psm1
Import-Module $modulePath\scripts\modules\FilebeatFunctions.psm1

####################################################################################################################
####################################################################################################################

# --- AWS ---
$accessKey = $OctopusParameters['aws.Access']
$secretKey = $OctopusParameters['aws.Secret']
$region = $OctopusParameters['aws.Region']

# --- ELBV2 ---
# $appName doubles as the IIS site name and the appPool name.
$appName = $OctopusParameters['elbv2.appName']

# Old site name: "<appName>-<oldBuildNumber>" by default; when the CanaryStart step reports
# 'onboarding' as the old site id, the bare app name is used instead.
$oldBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisOldSiteId"]
$oldAppName = "$appName-$oldBuildNumber"
if ($oldBuildNumber -eq 'onboarding') {
    Write-Output "ONBOARDING: Setting oldAppName to appName."
    $oldAppName = "$appName"
}

# New site name is always "<appName>-<newBuildNumber>".
$newBuildNumber = $OctopusParameters["Octopus.Action[CanaryStart].Output.iisNewSiteId"]
$newAppName = "$appName-$newBuildNumber"

$targetGroupArn = $OctopusParameters['elbv2.arnName']
$canaryTargetGroupArn = $OctopusParameters['elbv2.canaryArnName']

# Polling settings for instance ELBV2 state checks: seconds between checks, and the maximum number of checks.
[int]$checkInterval = $OctopusParameters['elbv2.stateCheckInterval']
[int]$maxChecks = $OctopusParameters['elbv2.stateMaxChecks']

# --- Warmup ---
# Same value as $appName, but used where a DNS name is meant.
$hostname = $appName
$uri = $OctopusParameters['warmup.uri']
# Number of times to call the test URI.
$MaxWarmupAttempts = $OctopusParameters['warmup.maxWarmupAttempts']
# Millisecond figure used by the ELK query.
[int]$time_taken_target = $OctopusParameters['warmup.timeTakenTarget']

# --- Interesting metrics ---
# How long to sleep while live traffic is collected.
$sleepTime = $OctopusParameters['metrics.sleepTime']
$indexShortName = $OctopusParameters['metrics.indexShortName']
# How many minutes to look back (probably wants to become "since deploy").
[int]$lastMinutes = $OctopusParameters['metrics.lastMinutes']
# Millisecond threshold for response times.
[int]$responseThresholdMs = $OctopusParameters['metrics.responseThresholdMs']
# Percent of requests that are 500s / percent of requests slower than $responseThresholdMs.
# NOTE(review): the P2 specifier only applies to numeric values; if the Octopus value
# arrives as a string it is passed through unformatted - confirm what is stored here.
$errorRateThreshold = "{0:P2}" -f $OctopusParameters['metrics.errorRateThreshold']
$responseThreshold = "{0:P2}" -f $OctopusParameters['metrics.responseThreshold']

Write-Output "Populating AWS variables for the register-instance and deregister-instance functions..."
# Ask the instance metadata endpoint for this machine's instance id.
$response = Invoke-RestMethod -Method Get -Uri "http://169.254.169.254/latest/meta-data/instance-id"
$instanceId = $response
# Target description carrying this instance's id.
$target = New-Object -TypeName Amazon.ElasticLoadBalancingV2.Model.TargetDescription
$target.Id = $instanceId
####################################################################################################################
####################################################################################################################
<####################################################################################################################
SCRIPT STARTS HERE
####################################################################################################################>
<#
This script is environment-agnostic, for now. Once we get more metrics, we'll want to contextually target them.
1. check interesting metrics
2. determine pass/fail, send as an exit code

This section waits for traffic to be logged, runs four Lucene queries through Search-ESLastMinutes,
and derives two ratios that the pass/fail section reads afterwards:
  $errorRate       = 500 responses / (500 responses + 200/301/302 responses)
  $responsePercent = requests over $responseThresholdMs / (requests over + requests under)
#>
Write-Output "Ok, our instance is showing as healthy in the ELB - time to run some tests of our interesting metrics!"
Write-Output "First off, we wait $($sleepTime)s to let the logs flood in..."
Start-Sleep -Seconds $sleepTime

Try {
    Write-Output "To make our canary process actually useful, we should really target our log searching against a specific server."
    $beatHostname = hostname
    Write-Output "Our target hostname is: $beatHostname"
    $date = Get-Date
    # NOTE(review): AddMinutes() receives $lastMinutes as-is, so the window only reaches into the
    # past when that value is negative - confirm how metrics.lastMinutes is configured.
    $startDate = ((Get-Date).AddMinutes($lastMinutes)).ToUniversalTime().ToString("o")
    Write-Output "Our start date variable (modified by our lastMinutes variable: $lastMinutes) is: $startDate"
    Write-Output " "

    # Define the Lucene queries here
    # the -f $newAppName stuff is 'Format String', i.e. find/replace
    # ...because of how the Lucene query needs to be formatted, single vs. double quotes matter
    $responseQueryErrors = 'application_name="{0}" AND response:500' -f $newAppName
    $responseQueryNotErrors = 'application_name="{0}" AND response:(200 OR 301 OR 302)' -f $newAppName
    $responseQueryBadResponse = 'application_name="{0}" AND time_taken:(>{1})' -f $newAppName, $responseThresholdMs
    $responseQueryGoodResponse = 'application_name="{0}" AND time_taken:(<{1})' -f $newAppName, $responseThresholdMs

    # Gather the data from ES. Each result is used below through its .Documents collection.
    Write-Output "Querying elk for errors: $responseQueryErrors"
    $searchOutputErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryErrors -startDate $startDate
    Write-Output "Querying elk for not-errors: $responseQueryNotErrors"
    $searchOutputNotErrors = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryNotErrors -startDate $startDate
    Write-Output "Querying elk for bad response: $responseQueryBadResponse"
    $searchOutputHighResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryBadResponse -startDate $startDate
    Write-Output "Querying elk for good response: $responseQueryGoodResponse"
    $searchOutputLowResponse = Search-ESLastMinutes -indexShortName $indexShortName -querySearchTerms $responseQueryGoodResponse -startDate $startDate

    # Calculate the results
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for response errors vs. not-errors...i.e. Error Rate"
    Write-Output "We are taking the results for the search output and dividing error response count by total counts to find a percent error rate."
    # Zero errors short-circuits to a rate of 0; this also keeps the division below away from a zero denominator.
    if ($searchOutputErrors.Documents.Count -eq 0) {
        $errorRate = 0
    }
    else {
        $totalDocumentsErrors = ($searchOutputErrors.Documents.Count + $searchOutputNotErrors.Documents.Count)
        $errorRate = ($searchOutputErrors.Documents.Count / $totalDocumentsErrors)
    }
    $errorRateClean = "{0:P2}" -f $errorRate
    Write-Output "Our error rate is $errorRateClean, and our threshold is $errorRateThreshold."
    Write-Output "######################################################"
    Write-Output " "
    Write-Output "######################################################"
    Write-Output "Calculating the metric for high vs low response times..."
    Write-Output "We are taking the results from the search output and dividing above-time-taken-threshold counts vs. total counts to get a percent."
    # Same guard as above: no slow requests means 0, and no division is attempted.
    if ($searchOutputHighResponse.Documents.Count -eq 0) {
        $responsePercent = 0
    }
    else {
        $totalDocumentsResponseTime = ($searchOutputHighResponse.Documents.Count + $searchOutputLowResponse.Documents.Count)
        $responsePercent = ($searchOutputHighResponse.Documents.Count / $totalDocumentsResponseTime)
    }
    $responsePercentClean = "{0:P2}" -f $responsePercent
    Write-Output "$responsePercentClean of all requests are over $($responseThresholdMs)ms"
    Write-Output "######################################################"
    Write-Output " "

    # Some extra output just to lend easy/fast credence to the above
    Write-Output "######################################################"
    Write-Output "## Some output for clarity ##"
    Write-Output "######################################################"
    # The bad-response count comes from the 500 query result ($searchOutputErrors).
    Write-Output "Here's how many hits for bad HTTP responses: $($searchOutputErrors.Documents.Count)"
    Write-Output "Here's how many hits for good HTTP responses: $($searchOutputNotErrors.Documents.Count)"
    Write-Output "Here's how many hits for high response time: $($searchOutputHighResponse.Documents.Count)"
    Write-Output "Here's how many hits for low response time: $($searchOutputLowResponse.Documents.Count)"
    Write-Output "If nothing shows below, no documents found!"
    Write-Output "Example good error document:"
    $searchOutputNotErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example bad error document:"
    $searchOutputErrors.Documents | Select-Object -First 1 | Format-List message
    Write-Output "Example good response document: "
    $searchOutputLowResponse.Documents | Select-Object -first 1 | Format-List message
    Write-Output "Example bad response document: "
    $searchOutputHighResponse.Documents | Select-Object -first 1 | Format-List message
    Write-Output "######################################################"
}
Catch {
    Write-Output "Failed to query ELK!"
    Write-Output $_.Exception.Message
    # NOTE(review): this emits the stack trace and returns without an explicit non-zero Exit,
    # so the pass/fail section is never reached - confirm whether a failed query should fail the deploy.
    return $_.Exception.StackTrace
}
##################################################
# Pass/fail gate: compares the measured ratios with their thresholds and exits 1 (fail) or 0 (pass).
#
# The comparison is done on numbers, not on the "{0:P2}" display strings: -gt on two strings is a
# lexical comparison, under which e.g. "10.00 %" sorts below "5.00 %". The thresholds are therefore
# re-read from the Octopus parameters and parsed as fractions (0.05 = 5%), the same scale as
# $errorRate / $responsePercent. A value that cannot be parsed as a number raises a conversion error.
$errorRateLimit = [double]$OctopusParameters['metrics.errorRateThreshold']
$responseLimit = [double]$OctopusParameters['metrics.responseThreshold']
# Display forms, used only in the log messages below.
$errorRateLimitClean = "{0:P2}" -f $errorRateLimit
$responseLimitClean = "{0:P2}" -f $responseLimit

$failure = $false
Write-Output "We are now determining pass/fail based on the interesting metric thresholds..."

Write-Output "Checking interesting metric: Error Rate"
if ([double]$errorRate -gt $errorRateLimit) {
    Write-Output "Our error rate $errorRateClean is greater than the threshold of $errorRateLimitClean"
    $failure = $true
}
else {
    Write-Output "Successful error rate tests! We can move on..."
}

Write-Output "Checking interesting metric: % of responses over $responseThresholdMs"
if ([double]$responsePercent -gt $responseLimit) {
    # Report the response-time threshold here (not the error-rate one).
    Write-Output "Our % of requests over response time of $responseThresholdMs ($responsePercentClean) is greater than the threshold of $responseLimitClean"
    $failure = $true
}
else {
    Write-Output "Successful response time tests! We can move on..."
}
##################################################
if ($failure) {
    Write-Output "Deploy failed on $hostname during our canary testing!!"
    Write-Output "Running the revert to remove from canary target group, stop new site, start old site, and put back into production target group!"
    # Non-zero exit code signals the failed canary to the caller.
    Exit 1
}
else {
    Write-Output "Success!! We have a good set of test results and interesting metrics!"
    $successMan = @"
.----------------. .----------------. .----------------.
| .--------------. || .--------------. || .--------------. |
| | __ | || | ____ | || | __ | |
| | \ \ | || | .' `. | || | / / | |
| | \ \ | || | / .--. \ | || | / / | |
| | \ \ | || | | | | | | || | / / | |
| | \ \ | || | \ `--' / | || | / / | |
| | \_\ | || | `.____.' | || | /_/ | |
| | | || | | || | | |
| '--------------' || '--------------' || '--------------' |
'----------------' '----------------' '----------------'
"@
    Write-Output $successMan
    Exit 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment