Skip to content

Instantly share code, notes, and snippets.

@ezeeetm
Created December 18, 2015 14:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ezeeetm/1f2e72b8a68062ce9ba9 to your computer and use it in GitHub Desktop.
Save ezeeetm/1f2e72b8a68062ce9ba9 to your computer and use it in GitHub Desktop.
blueGreenDeploy.ps1
<#
.SYNOPSIS
Command line utility for initiating blue-green deployments, to be called by a build step, or by another script, in a Jenkins job.
.PARAMETER region
Optional/has default value. Specifies the AWS region, e.g. 'us-east-1'. See script Param section for allowed values.
.PARAMETER environment
Required. Specifies a friendly name for the environment, e.g. 'dev'. See script Param section for allowed values.
.PARAMETER product
Required. Specifies a friendly name for a product, e.g. 'fms'. See script Param section for allowed values.
.PARAMETER uid
Required. Specifies the uid for the product stack, e.g. 'uaa'.
.PARAMETER zip_name
Required. Specifies the value to apply to the 'deploymentZip' tag, to bind the new ASG to a single app version, e.g. 'dev-lms-49-2015-10-28.zip'.
.PARAMETER accessKey
Optional. Specifies an AWS access key to use for running locally/testing.
.PARAMETER secretKey
Optional. Specifies an AWS secret key to use for running locally/testing.
.EXAMPLE
.\deploy.ps1 -environment dev -product fms -uid uaa -zip_name dev-lms-49-2015-10-28.zip
.NOTES
DEPENDS: AWS Tools for Windows PowerShell: 3.1.23.0 or greater
TODO:
- add granular $rollBack levels and modify RollBack function to handle them
- make detachment from live elb conditional in RollBack
- make check for new scaling events occur during each CheckInService iteration, so new values can be applied as early as possible
#>
Param
(
# AWS region to operate in. Optional; defaults to us-east-1.
# Must be one of the listed regions; matching is case-sensitive (IgnoreCase = $false).
[Parameter(Mandatory=$False)]
[ValidateSet("us-east-1","us-west-1","us-west-2","ap-northeast-1","ap-southeast-1","ap-southeast-2","eu-central-1","eu-west-1","sa-east-1",IgnoreCase = $false)]
[string]$region = "us-east-1",
# Friendly environment name. Required; case-sensitive match against the listed values.
[Parameter(Mandatory=$True)]
[ValidateSet("dev","test","testfull","preview","prod",IgnoreCase = $false)]
[string]$environment,
# Friendly product name. Required; case-sensitive match against the listed values.
[Parameter(Mandatory=$True)]
[ValidateSet("fms","lms","cms","none",IgnoreCase = $false)]
[string]$product,
# uid of the product stack. Required; used to build the live ELB name,
# the source ELB stack name prefix and the template bucket name.
[Parameter(Mandatory=$True)]
[string]$uid,
# Name of the deployment zip. Required; written to the 'deploymentZip' tag as "$product/$zip_name".
[Parameter(Mandatory=$True)]
[string]$zip_name,
# Optional AWS credentials. They are only applied when BOTH values are supplied.
[Parameter(Mandatory=$False)]
[string]$accessKey,
[Parameter(Mandatory=$False)]
[string]$secretKey
)
# Pretty logging for Jenkins console output.
# Writes one line to the host in the form "<HH:mm:ss> <logLevel><tabs><logMessage>".
#   $indentLevel - number of tab characters inserted between the level and the message
#   $logLevel    - severity label, e.g. INFO / WARN / FATAL
#   $logMessage  - text to print
# Throws a string of the form "Exception in Log: <message>, exiting." if writing fails.
function Log ($indentLevel, $logLevel, $logMessage)
{
    try
    {
        # "HH" is the 24-hour clock; "hh" is the 12-hour clock and is ambiguous
        # in a build log because no AM/PM designator is printed.
        $now = Get-Date -Format "HH:mm:ss"
        $indent = "`t" * $indentLevel
        Write-Host $now $logLevel$indent$logMessage
    }
    Catch [Exception]
    {
        # Deliberately NOT routed through HandleException: that function calls Log,
        # so a persistent logging failure would recurse without end.
        $exMsg = $_.Exception.Message
        throw "Exception in Log: $exMsg, exiting."
    }
}
# Confirms that an AWS call which returns nothing was actually made, by logging
# the HTTP status code and request id recorded in $AWSHistory for the last response.
#   $indentLevel - indent level forwarded to Log
# The line is logged as INFO when the status code equals "OK", otherwise as WARN.
function awsLog ($indentLevel)
{
    try
    {
        $lastResponse = $AWSHistory.LastServiceResponse
        $requestId = $lastResponse.ResponseMetadata.RequestId
        $requestStatusCode = $lastResponse.HttpStatusCode
        $logLevel = if ($requestStatusCode -ne "OK") { "WARN" } else { "INFO" }
        Log $indentLevel $logLevel "AWS response: $requestStatusCode / Request ID: $requestId"
    }
    catch [Exception]
    {
        HandleException "awsLog" $_.Exception.Message
    }
}
# Central failure path: logs a FATAL line and aborts by throwing.
#   $function         - name of the function in which the failure occurred
#   $exceptionMessage - message describing the failure
# Always throws the string "Exception in <function>: <message>, exiting."
# Callers that invoke this from inside their own try block will catch that throw
# in their Catch and re-wrap it.
function HandleException($function,$exceptionMessage)
{
    $msg = "Exception in $($function): $exceptionMessage, exiting."
    Log 1 FATAL $msg
    # throw transfers control immediately; nothing placed after it can run
    # (the former trailing 'exit 1' was unreachable and has been removed).
    throw $msg
}
# Prepares the AWS session for this run.
#   $region - AWS region to set as the session default
# Stored defaults and credentials are cleared first so that a lingering AWS session
# cannot take precedence over the Jenkins role or over keys passed as parameters.
function Setup ($region)
{
    try
    {
        Clear-AWSDefaults
        Clear-AWSCredentials
        Set-DefaultAWSRegion -Region $region

        Log 2 INFO "script environment ready"
    }
    catch [Exception]
    {
        HandleException "Setup" $_.Exception.Message
    }
}
# Verifies that the stack is in an acceptable state before a deployment starts.
# Reads $environment, $product and $uid from the script scope.
# The stack is rejected (HandleException is called, which throws) when any of these hold:
#   - more than one matching ASG has a nonzero DesiredCapacity
#   - more than one matching ASG carries the tag status = active
#   - an ELB named "$environment-$product-elb-temp" already exists
#   - the CFN stack "$environment-$product-temp-elb" exists and cannot be cleaned up
#   - any instance behind the live ELB is not InService
function CheckStackState
{
    $isBad = $false
    try
    {
        $asgs = Get-ASAutoScalingGroup | where {$_.AutoScalingGroupName.StartsWith("$environment-$product")}

        # @() makes .Count reliable for zero, one or many matches
        $liveAsgCount = @($asgs | where {$_.DesiredCapacity -ne 0}).Count
        if ($liveAsgCount -gt 1)
        {
            Log 2 FATAL "there are $liveAsgCount ASGs with nonzero desired capacity for product $product in environment $environment"
            $isBad = $true
        }

        # Look at the 'status' tag specifically (same lookup GetAsg uses), so a tag with
        # another key, or a value that merely contains the text "active", is not counted.
        $activeAsgCount = @($asgs | where {($_.Tags | where {$_.Key -eq "status"}).Value -eq "active"}).Count
        if ($activeAsgCount -gt 1)
        {
            Log 2 FATAL "there are $activeAsgCount ASGs with state = 'active' for product $product in environment $environment"
            $isBad = $true
        }

        $tempElb = Get-ELBLoadBalancer | where {$_.LoadBalancerName -eq "$environment-$product-elb-temp"}
        if ($tempElb)
        {
            Log 2 FATAL "at least one ELB with name $environment-$product-elb-temp already exists"
            $isBad = $true
        }

        $tempElbStack = Get-CFNStack | where {$_.StackName -eq "$environment-$product-temp-elb"}
        if ($tempElbStack)
        {
            # very rarely, DeleteTempElb at the end of the script results in DELETE_FAILED
            # in those cases, we'll just delete it again.
            # including "DELETE_IN_PROGRESS" here so back-to-back deployments function correctly as well
            if (($tempElbStack.StackStatus.Value -eq "DELETE_FAILED") -or ($tempElbStack.StackStatus.Value -eq "DELETE_IN_PROGRESS"))
            {
                Log 2 WARN "CFN stack $environment-$product-temp-elb is in state $($tempElbStack.StackStatus.Value) from previous deployment, cleaning this up"
                # DeleteTempElb returns $true when the stack could not be removed.
                # Only ever raise the flag here: a successful cleanup must not erase
                # a failure recorded by one of the checks above.
                if (DeleteTempElb "recheck")
                {
                    $isBad = $true
                }
            }
            else
            {
                Log 2 FATAL "CFN stack $environment-$product-temp-elb is in an unexpected state"
                $isBad = $true
            }
        }

        # check that current live ASG instances are all InService
        $instanceStates = Get-ELBInstanceHealth -LoadBalancerName "$environment-$product-elb-$uid" | select State
        foreach ($instanceState in $instanceStates)
        {
            if ($instanceState.State -ne "InService")
            {
                Log 2 FATAL "elb for active ASG $environment-$product-elb-$uid has one or more instances not InService."
                $isBad = $true
            }
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "CheckStackState" $exMsg
    }

    # Evaluated outside the try block so that the throw raised by HandleException is
    # not caught by the Catch above and wrapped into a second, duplicated message.
    if ($isBad)
    {
        Log 2 FATAL "stack is not in acceptable state for deployment"
        Log 2 FATAL "is a previous deployment still in progress / teardown failed?"
        HandleException "CheckStackState" "FAILED"
    }
    Log 2 INFO "stack ready for deployment"
}
# Returns the first ASG whose name starts with "$environment-$product" and whose
# 'status' tag equals the requested value; emits nothing when no ASG matches.
#   $status - value of the 'status' tag to look for, e.g. active / inactive
# The ASG list is fetched fresh on every call.
function GetAsg ($status)
{
    try
    {
        # buffer to allow previous updates to asg values to propagate in AWS before making back-to-back calls
        Start-Sleep -Seconds 3

        $candidates = Get-ASAutoScalingGroup | Where-Object { $_.AutoScalingGroupName.StartsWith("$environment-$product") }
        $candidates |
            Where-Object { ($_.Tags | Where-Object { $_.Key -eq "status" }).Value -eq $status } |
            Select-Object -First 1
    }
    catch [Exception]
    {
        HandleException "GetAsg" $_.Exception.Message
    }
}
# Brings the inactive ASG's MaxSize, MinSize and DesiredCapacity (in that order)
# in line with the active ASG, issuing one update call per differing value.
#   $activeAsg   - ASG object whose sizes are the reference
#   $inactiveAsg - ASG object that is updated to match
# Returns $true when at least one value was updated, otherwise $false.
function UpdateInactiveASG ($activeAsg,$inactiveAsg)
{
    try
    {
        $targetName = $inactiveAsg.AutoScalingGroupName
        $changed = $false

        foreach ($sizeField in "MaxSize", "MinSize", "DesiredCapacity")
        {
            $current = $inactiveAsg.$sizeField
            $wanted = $activeAsg.$sizeField
            if ($current -eq $wanted)
            {
                continue
            }

            Log 2 INFO "updating inactive ASG $($sizeField): $current to match active ASG $($sizeField): $wanted"

            # splat so the size parameter name can be chosen per iteration
            $updateArgs = @{ AutoScalingGroupName = $targetName }
            $updateArgs[$sizeField] = $wanted
            Update-ASAutoScalingGroup @updateArgs
            awsLog 3

            $changed = $true
        }

        return $changed
    }
    catch [Exception]
    {
        HandleException "UpdateInactiveASG" $_.Exception.Message
    }
}
# Sets a tag on an ASG, but only when its current value differs from the requested one.
#   $asg      - ASG object to tag (its Tags collection supplies the current value)
#   $tagKey   - tag key to set
#   $tagValue - tag value to set
# The tag is written with PropagateAtLaunch = $true.
function UpdateAsgTag ($asg,$tagKey,$tagValue)
{
    try
    {
        $asgTags = $asg.Tags
        $asgName = $asg.AutoScalingGroupName
        $currentTagValue = ($asgTags | where {$_.Key -eq $tagKey}).Value
        if ($currentTagValue -ne $tagValue)
        {
            # ASG names and zips are long values, broken up to avoid line wrapping
            Log 2 INFO "updating tag for ASG: $asgName"
            Log 3 INFO "tag: $tagKey, currentValue: $currentTagValue, newValue: $tagValue"
            Set-ASTag -Tag @( @{ResourceType="auto-scaling-group"; ResourceId=$asgName; Key=$tagKey; Value=$tagValue; PropagateAtLaunch=$true} )
            # record the API response, as every other mutating call in this script does
            awsLog 3
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "UpdateAsgTag" $exMsg
    }
}
# Creates the temporary ELB by launching a CFN stack named "$environment-$product-temp-elb".
# The parameters are copied from the existing ELB stack (name starting with
# "$environment-$product-$uid-elb"), with 'uidParameter' overridden to "temp".
# Reads $environment, $product and $uid from the script scope.
# Blocks, polling every 15 seconds, until the stack reaches CREATE_COMPLETE.
# Fails via HandleException if the stack leaves the create path
# (any status other than CREATE_IN_PROGRESS / CREATE_COMPLETE).
function CreateTempElb
{
    try
    {
        $elbStack = Get-CFNStack | where {$_.StackName.StartsWith("$environment-$product-$uid-elb")}
        $tempElbStackName = "$environment-$product-temp-elb"
        $stackParams = $elbStack.Parameters
        $uidParam = $stackParams | where {$_.ParameterKey -eq "uidParameter"}
        $uidParam.ParameterValue = "temp"
        $cfnStatusCheckInterval = 15 #seconds
        $templateUrl = "https://s3.amazonaws.com/$environment-cfntemplates-$uid/$product/elb.json"
        $resp = New-CFNStack -StackName $tempElbStackName -Parameter $stackParams -TemplateURL $templateUrl -Capabilities "CAPABILITY_IAM"
        Log 2 INFO "CFN response: $resp"
        do
        {
            sleep -Seconds $cfnStatusCheckInterval
            $stack = Get-CFNStack -StackName $tempElbStackName
            # compare on the plain status text
            $stackStatus = [string]$stack.StackStatus
            Log 2 INFO "...$stackStatus"
            # A failed create moves on to CREATE_FAILED / ROLLBACK_* and never reaches
            # CREATE_COMPLETE; stop here instead of polling forever.
            if (($stackStatus -ne "CREATE_IN_PROGRESS") -and ($stackStatus -ne "CREATE_COMPLETE"))
            {
                throw "CFN stack $tempElbStackName entered state '$stackStatus' instead of CREATE_COMPLETE"
            }
        }
        until
        (
            $stackStatus -eq "CREATE_COMPLETE"
        )
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "CreateTempElb" $exMsg
    }
}
# Attaches an ELB to, or detaches it from, an ASG and logs what was done.
#   $asg     - ASG object to act on
#   $elbName - name of the load balancer
#   $action  - "mount" to attach, "dismount" to detach; any other value performs no AWS call
function UpdateElb ($asg,$elbName,$action)
{
    try
    {
        $asgName = $asg.AutoScalingGroupName

        switch ($action)
        {
            "mount"    { Add-ASLoadBalancer -LoadBalancerName $elbName -AutoScalingGroupName $asgName }
            "dismount" { Dismount-ASLoadBalancer -LoadBalancerName $elbName -AutoScalingGroupName $asgName }
        }

        Log 2 INFO "action: $action"
        Log 2 INFO "ELB: $elbName"
        Log 2 INFO "ASG: $asgName"
        awsLog 3
    }
    catch [Exception]
    {
        HandleException "UpdateElb" $_.Exception.Message
    }
}
# Polls an ELB every 15 seconds until the expected number of instances is InService
# or the retry limit is reached.
#   $elbName           - name of the load balancer to poll
#   $expectedInstances - number of InService instances required to succeed
# Calls RollBack (which exits the script) when the expected count was not reached.
function CheckInService ($elbName,$expectedInstances)
{
    try
    {
        $retries = 1
        $elbInstanceHealthCheckInterval = 15 #seconds
        $elbInstanceHealthRetries = 100
        do
        {
            sleep -Seconds $elbInstanceHealthCheckInterval
            $instances = Get-ELBInstanceHealth -LoadBalancerName $elbName
            # @() makes .Count reliable for zero, one or many InService instances
            $inServiceCount = @($instances | where {$_.State -eq "InService"}).Count
            $retryString = $retries.ToString("000")
            Log 2 INFO "retry $($retryString)/$($elbInstanceHealthRetries):`t$inServiceCount/$expectedInstances instances in service"
            $retries += 1
        }
        until
        (
            (($inServiceCount -eq $expectedInstances) -or ($retries -ge $elbInstanceHealthRetries))
        )
        # Decide on the observed health, not on the retry counter: instances that became
        # healthy on the final attempt must not trigger a rollback.
        if ($inServiceCount -ne $expectedInstances)
        {
            RollBack
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "CheckInService" $exMsg
    }
}
# Suspends or resumes scaling processes on an ASG.
#   $asg    - ASG object to act on
#   $action - "suspend"            : suspend processes (no specific process is named)
#             "resume"             : resume processes (no specific process is named)
#             "noCloudWatchAlarms" : suspend only the AlarmNotification process
#             any other value performs no AWS call
function UpdateAutoScaling ($asg,$action)
{
    try
    {
        $asgName = $asg.AutoScalingGroupName

        Log 2 INFO "action: $action"
        Log 2 INFO "ASG: $asgName"

        switch ($action)
        {
            "suspend"            { Suspend-ASProcess -AutoScalingGroupName $asgName }
            "resume"             { Resume-ASProcess -AutoScalingGroupName $asgName }
            "noCloudWatchAlarms" { Suspend-ASProcess -AutoScalingGroupName $asgName -ScalingProcess AlarmNotification }
        }

        awsLog 3
    }
    catch [Exception]
    {
        HandleException "UpdateAutoScaling" $_.Exception.Message
    }
}
# Scales an ASG down to nothing by setting MinSize, DesiredCapacity and MaxSize
# to 0, in that order, with one update call per value.
#   $asg - ASG object to purge (its current sizes are only used for the log lines)
function PurgeAsg ($asg)
{
    try
    {
        $asgName = $asg.AutoScalingGroupName

        foreach ($sizeField in "MinSize", "DesiredCapacity", "MaxSize")
        {
            Log 2 INFO "updating inactive ASG $($sizeField): $($asg.$sizeField) to 0"

            # splat so the size parameter name can be chosen per iteration
            $updateArgs = @{ AutoScalingGroupName = $asgName }
            $updateArgs[$sizeField] = 0
            Update-ASAutoScalingGroup @updateArgs
            awsLog 3
        }
    }
    catch [Exception]
    {
        HandleException "PurgeAsg" $_.Exception.Message
    }
}
# Deletes the CFN stack "$environment-$product-temp-elb" that backs the temporary ELB.
# Reads $environment and $product from the script scope.
#   $recheck - pass "recheck" to wait for the deletion (polling every 30 seconds, up to
#              20 polls) and re-issue the delete whenever the stack reports DELETE_FAILED;
#              any other value issues the delete once and returns nothing.
# With "recheck": returns $false once the stack is gone, $true if it still exists after
# the retry threshold (the caller treats $true as "stack is in a bad state").
function DeleteTempElb ($recheck)
{
    try
    {
        $tempElbStackName = "$environment-$product-temp-elb"
        Log 2 INFO "deleting CFN stack $tempElbStackName"
        Remove-CFNStack -StackName $tempElbStackName -Force
        awsLog 3
        if ($recheck -eq "recheck")
        {
            $retryThreshold = 20 #10m @ 30s. Usually takes 2m - 5m
            # Initialise explicitly: an unset local would otherwise resolve to a caller's
            # $retries through dynamic scoping (CheckInService keeps one).
            $retries = 0
            do
            {
                # exact name match, so a stack whose name merely contains this one is ignored
                $tempElbStack = Get-CFNStack | where {$_.StackName -eq $tempElbStackName}
                if ($tempElbStack)
                {
                    Log 3 INFO "...$($tempElbStack.StackStatus.Value)"
                    if ($($tempElbStack.StackStatus.Value) -eq "DELETE_FAILED")
                    {
                        Log 3 INFO "...(we'll see about that)"
                        Remove-CFNStack -StackName $tempElbStackName -Force
                    }
                    # only wait while the stack still exists
                    sleep -Seconds 30
                }
                $retries += 1
            }
            until
            (
                ($null -eq $tempElbStack) -or ($retries -ge $retryThreshold)
            )
            if ($null -eq $tempElbStack)
            {
                Log 3 INFO "...DELETED"
                return $false #tells CheckStackState the temp stack is gone
            }
            return $true #tells CheckStackState the stack refused to die. This should never happen.
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "DeleteTempElb" $exMsg
    }
}
# Undoes a deployment whose instances did not become healthy, then exits the script
# with code 1. Steps, in order:
#   1. resume scaling processes on both the inactive and the active ASG
#   2. purge the inactive ASG (sizes set to 0)
#   3. detach the temp ELB from the inactive ASG
#   4. delete the temp ELB stack without waiting for the deletion
# Reads $environment and $product from the script scope.
function RollBack
{
    try
    {
        Log 1 WARN "One or more instances unhealthy after retry threshold was reached. Rolling back."

        $standbyGroup = GetAsg inactive
        $liveGroup = GetAsg active

        UpdateAutoScaling $standbyGroup resume
        UpdateAutoScaling $liveGroup resume

        PurgeAsg $standbyGroup

        UpdateElb $standbyGroup "$environment-$product-elb-temp" dismount

        # Detaching the inactive ASG from the live ELB is intentionally disabled: the
        # call throws if the ASG is not actually attached. Make it conditional
        # (if attached) before enabling:
        #$activeElbName = "$environment-$product-elb-$uid"
        #UpdateElb $inactiveAsg $activeElbName dismount

        DeleteTempElb "noRecheck"

        Log 1 INFO "Rollback complete, script exiting"
        # "ROLLBACK" | Out-File -Append -FilePath .\deployTestLog.csv # FOR TESTING
        exit 1
    }
    catch [Exception]
    {
        HandleException "RollBack" $_.Exception.Message
    }
}
# --------------------------------- script entry point ---------------------------------
# Orchestrates the blue-green deployment. Steps run strictly in the order below; any
# failure inside a helper ends the run via HandleException (throw) or RollBack (exit 1).
# setup: clear any stored AWS defaults/credentials and set the region
Log 1 INFO "Setup:"
Setup $region
# if keys are passed, use them.
# (Setup has already set the default region; it is set again here before the optional credentials are applied)
Set-DefaultAWSRegion -Region $region
if (($accessKey) -and ($secretKey))
{
Initialize-AWSDefaults -AccessKey $accessKey -SecretKey $secretKey -Region $region
}
# ensure stack is in good state for a deployment
Log 1 INFO "CheckStackState:"
CheckStackState
# get asgs, identified by the value of their 'status' tag
Log 1 INFO "GetAsgs:"
$activeAsg = GetAsg active
Log 2 INFO "active ASG: $($activeAsg.AutoScalingGroupName)"
$inactiveAsg = GetAsg inactive
Log 2 INFO "inactive ASG: $($inactiveAsg.AutoScalingGroupName)"
# clone active elb into a temp elb and attach to inactive asg to satisfy health check
# this is done like this because each ELB is ~$20/month
Log 1 INFO "CreateTempElb:"
CreateTempElb
# mount temp elb on inactive asg
$tempElbName = "$environment-$product-elb-temp"
Log 1 INFO "UpdateElb:"
UpdateElb $inactiveAsg $tempElbName mount
# disable autoscaling triggered by low CPU/mem/network CW alarms on inactive asg
# to prevent unwanted scale-down events while asg is waiting for instances to come in service
Log 1 INFO "UpdateAutoScaling:"
UpdateAutoScaling $inactiveAsg noCloudWatchAlarms
# update inactive asg with new deployment.zip tag and same des/min/max as active asg
Log 1 INFO "UpdateInactiveASG:"
UpdateAsgTag $inactiveAsg deploymentZip "$product/$zip_name"
# the value stored here is reassigned inside the loop below before it is ever read
$wasUpdated = UpdateInactiveASG $activeAsg $inactiveAsg
# ensure all instances behind temp elb are inService
# if activeAsg scales during this check, adjust and recheck before continuing
Log 1 INFO "CheckInService on ELB $($tempElbName):"
do
{
$inactiveAsg = GetAsg inactive # to refresh DesiredCapacity
$expectedInstances = $inactiveAsg.DesiredCapacity
CheckInService $tempElbName $expectedInstances
$activeAsg = GetAsg active # to refresh desired/min/max values in case there was a scaling event during previous CheckInService
Log 1 INFO "Checking for min/max/desired changes caused by new scaling events in active ASG"
$wasUpdated = UpdateInactiveASG $activeAsg $inactiveAsg
if (!$wasUpdated)
{
Log 1 INFO "No new scaling events found, continuing with deployment"
}
}
until
(
$wasUpdated -eq $False
)
# suspend scaling on active asg, not done earlier because time window required by CheckInService is too long
# to disable autoscaling. Done here to ensure scaling events from this point forward do not interfere with the deployment.
# since elb health checks are identical, this window is always very brief (~30 seconds in testing)
Log 1 INFO "UpdateAutoScaling:"
UpdateAutoScaling $activeAsg suspend
# mount active elb on inactiveAsg
$activeElbName = "$environment-$product-elb-$uid"
Log 1 INFO "UpdateElb:"
UpdateElb $inactiveAsg $activeElbName mount
# ensure all instances behind active elb are inService
# the expected count is the sum of both ASGs' desired capacities
# (presumably the active ELB serves the instances of both ASGs at this point - confirm)
Log 1 INFO "CheckInService on ELB $($activeElbName):"
$expectedInstances = $inactiveAsg.DesiredCapacity + $activeAsg.DesiredCapacity
CheckInService $activeElbName $expectedInstances
# update asg status tags, and refresh asg objects
# after the refresh the roles are swapped: $activeAsg is the ASG just tagged active (the new
# deployment) and $inactiveAsg is the ASG just tagged inactive (the previous version)
Log 1 INFO "UpdateAsgTag:"
UpdateAsgTag $inactiveAsg status active
UpdateAsgTag $activeAsg status inactive
$activeAsg = GetAsg active
$inactiveAsg = GetAsg inactive
# turn autoscaling back on for activeAsg
# turn autoscaling back on for inactiveAsg so it's in a consistent state for next deployment
Log 1 INFO "UpdateAutoScaling:"
UpdateAutoScaling $activeAsg resume
UpdateAutoScaling $inactiveAsg resume
# dismount active elb from the inactiveAsg, and temp elb from the activeAsg
Log 1 INFO "UpdateElb:"
UpdateElb $inactiveAsg $activeElbName dismount
UpdateElb $activeAsg $tempElbName dismount
# set inactiveAsg des/min/max to 0
Log 1 INFO "PurgeAsg:"
PurgeAsg $inactiveAsg
# delete temp elb stack
# noRecheck keeps it from blocking in Jenkins
Log 1 INFO "DeleteTempElb:"
DeleteTempElb "noRecheck"
# have a nice day!
Log 1 INFO "Deployment complete, script exiting"
#"SUCCESS" | Out-File -Append -FilePath .\deployTestLog.csv # FOR TESTING
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment