Skip to content

Instantly share code, notes, and snippets.

Last active October 2, 2017 20:17
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save mdbooth/163f5fdf47ab45d7addd to your computer and use it in GitHub Desktop.
Put a nova compute node into 'maintenance mode'. Disable its service, and attempt to migrate all instances on it to other nodes.
#!/usr/bin/env python
import argparse
from collections import defaultdict
import os
import sys
import time
from novaclient import client
from novaclient import exceptions
# States which we can migrate
# States which indicate a migration is in progress
# States which should go away if we wait
class MissingClientOpt(Exception):
    """Raised when an environment variable needed to build the client is unset.

    The exception message names the missing environment variable.
    """
class InvalidHost(Exception):
    """Raised when the requested host is not a known nova compute host."""
class InstancesRemaining(Exception):
    """Raised when instances are still present on the host after migration.

    Attributes:
        instances: the instances that were passed to the constructor, kept
            so the handler can report each one.
    """

    def __init__(self, instances):
        """Remember the instances that remain.

        Args:
            instances: the instances still on the host.
        """
        self.instances = instances
        super(InstancesRemaining, self).__init__()
def _get_client():
    """Build a nova API client from OpenStack environment variables.

    Each keyword argument passed to the client constructor is read from the
    environment variable it is mapped to in ``opts`` below.

    Returns:
        The object returned by ``client.Client('2', **vals)``.

    Raises:
        MissingClientOpt: if one of the mapped environment variables is not
            set. The message names the variable.
    """
    opts = {
        'username': 'OS_USERNAME',
        'api_key': 'OS_PASSWORD',
        'project_id': 'OS_TENANT_NAME',
        'auth_url': 'OS_AUTH_URL',
    }

    vals = {}
    # items() instead of iteritems() so this runs on both Python 2 and 3.
    for opt, env in opts.items():
        val = os.environ.get(env)
        if val is None:
            raise MissingClientOpt('environment variable %s not set' % env)
        vals[opt] = val

    return client.Client('2', **vals)
# Start the migration of a single instance.
#
# Two branches are visible: one taken when the instance is ACTIVE and cold
# migration is not being forced, and one taken when cold migration is forced
# or the instance is SHUTOFF. The first branch appears to end in
# `return False` and the second in `return True`; the caller in
# _migrate_multiple stores the result in a variable named `resize`.
#
# NOTE(review): this listing has lost its indentation and several
# lines/tokens: the subscript inside `force_cold[]`, the `try:` that pairs
# with the `except` below, and the statements that actually start the
# migration. Restore them from the original script before running.
def _migrate_single(instance, force_cold):
if not force_cold[] and instance.status == 'ACTIVE':
except exceptions.BadRequest as ex:
# We have no robust way of knowing what this exception is beyond a
# 400, so we're just going to guess that it failed because of
# shared storage and try again with block_migration set. This
# potentially hides a real error, but we don't have any way to tell
# when we have a real error.
return False
elif force_cold[] or instance.status == 'SHUTOFF':
return True
# Make one pass over `instances`, deciding what to do with each one based on
# its status, and return the pair (migrating, busy). Both are sets created
# empty at the top of the function.
#
# Parameters, as used in the visible lines:
#   instances      - iterable of instances; each must expose `.status`.
#   max_migrations - a new migration is only started while
#                    len(migrating) is below this value.
#   cold_fallback  - when true, an ACTIVE instance whose attempt counter has
#                    reached 3 is switched to forced cold migration and its
#                    counter is reset to 0.
#   attempts       - mutable per-instance attempt counter (updated in place).
#   force_cold     - mutable per-instance flag (updated in place); also
#                    passed through to _migrate_single.
#   verify         - not referenced in the lines that survive here.
#
# NOTE(review): indentation and many lines/tokens are missing from this
# listing: the subscripts in `attempts[]` / `force_cold[]`, the bodies of
# most branches, the `try:` paired with `except Exception`, and whatever adds
# instances to `migrating`, `busy` and `verify`. The keys are presumably
# instance ids; confirm against the original script.
def _migrate_multiple(instances, max_migrations, cold_fallback,
attempts, force_cold, verify):
busy = set()
migrating = set()
for instance in instances:
# Instance is already migrating
if instance.status in STATES_MIGRATING:
# Instance is busy with something, and we should wait
if instance.status in STATES_EPHEMERAL:
# If we've tried 3 times to live migrate a guest and the user has
# specified cold fallback, shut the guest down and try a cold migration
if (cold_fallback and attempts[] == 3 and
instance.status == 'ACTIVE' and not force_cold[]):
print ('WARNING: falling back to cold migration of %s' %
force_cold[] = True
attempts[] = 0
# For safety, we don't attempt to do anything with an instance which is
# in a state we don't explicitly handle. Note that we can't safely
# handle PAUSED, SUSPENDED, and RESCUED in any case, as migration
# requires moving the instance out of this state, and we don't have the
# context to do that safely.
elif (instance.status not in STATES_MIGRATABLE or
attempts[] == 3):
if len(migrating) < max_migrations:
attempts[] += 1
resize = _migrate_single(instance, force_cold)
except Exception as ex:
# Almost everything is a 'BadRequest'. This covers such a wide
# variety of errors, some of them ephemeral, that we just retry
# here regardless
if resize:
return migrating, busy
# Return a display string for an instance, built with the '%s(%s)' format.
#
# NOTE(review): the two values interpolated into the format string were lost
# from this listing (the tuple is cut off after the opening parenthesis).
# Restore them from the original script.
def _pretty_instance(instance):
return '%s(%s)' % (,
# Drive the evacuation of `host`: list its instances, then loop calling
# _migrate_multiple and reporting progress, and finally raise
# InstancesRemaining if nova still lists instances on the host.
#
# Parameters, as used in the visible lines:
#   nova           - client object; `nova.servers.list(search_opts=...)` and
#                    `nova.servers.get(instance_id)` are called on it.
#   host           - value used for the 'host' key of the list filter.
#   max_migrations - passed through to _migrate_multiple.
#   poll_interval  - not referenced in the lines that survive here
#                    (presumably a sleep between rounds; confirm).
#   cold_fallback  - passed through to _migrate_multiple.
#
# Raises InstancesRemaining (carrying the listed instances) when the final
# listing for the host is non-empty.
#
# NOTE(review): indentation and many lines/tokens are missing from this
# listing: the expression inside `set([ for instance in instances])`, the
# bodies of the VERIFY_RESIZE / STATES_MIGRATABLE / RESIZE branches, the tail
# of two print calls, and the statement that leaves the `while True` loop.
# Restore them from the original script before running.
def _migrate_host(nova, host, max_migrations, poll_interval, cold_fallback):
host_filter = {'host': host}
# The number of times we've attempted to migrate an instance
attempts = defaultdict(int)
# Iterated below as a collection of instance ids.
verify = set()
# Per-instance flag; missing entries default to False.
force_cold = defaultdict(bool)
# Display
instances = nova.servers.list(search_opts=host_filter)
if len(instances) > 0:
print 'Found instances on host:'
for instance in instances:
print ' %s' % _pretty_instance(instance)
# Repeatedly attempt to evacuate the host until completion
while True:
# Attempt to migrate instances on the host
migrating, busy = _migrate_multiple(instances,
max_migrations, cold_fallback, attempts, force_cold, verify)
remaining_instance_ids = set([ for instance in instances])
# If we successfully cold migrated an instance, it will be left in
# VERIFY_RESIZE on a new host. Here we auto-confirm it, as it was a
# simple copy. This may involve an expensive data-scrubbing operation,
# so we start it as soon as possible.
verified = set()
for instance_id in verify:
# Instances in VERIFY_RESIZE have moved to a different host, so
# they won't be in instances, which only contains instances from
# the target host.
if instance_id in remaining_instance_ids:
instance = nova.servers.get(instance_id)
if instance.status == 'VERIFY_RESIZE':
elif instance.status in STATES_MIGRATABLE:
elif instance.status == 'RESIZE':
# Haven't finished on the target host
# Instance is in unexpected state on target host. Warn, and
# don't wait any longer.
print ('WARNING: Instance %s has migrated to host %s. '
'Expected status VERIFY_RESIZE, but status is %s'
% (_pretty_instance(instance),
getattr(instance, 'OS-EXT-SRV-ATTR:host'),
# Finish when we didn't do anything, and we're not waiting on anything
if len(migrating) == 0 and len(busy) == 0 and len(verify) == 0:
# Report status on every round
if len(migrating) > 0:
print 'Migrating:'
for instance in migrating:
print ' %s' % _pretty_instance(instance)
if len(busy) > 0:
print 'Busy:'
for instance in busy:
print ' %s: %s' % (_pretty_instance(instance),
# Re-list the host; anything still present is reported as a failure.
instances = nova.servers.list(search_opts=host_filter)
if len(instances) > 0:
raise InstancesRemaining(instances)
# Public entry point: disable the compute service for `host`, then call
# _migrate_host to move its instances elsewhere.
#
# Visible defaults: max_migrations=2, poll_interval=5.
# Raises InvalidHost when the service call raises exceptions.NotFound.
# InstancesRemaining raised by _migrate_host is not caught here.
#
# NOTE(review): the signature is truncated in this listing (the remaining
# parameter(s), including the `cold_fallback` used below, and the closing
# `):` are missing), and the call that disables the 'nova-compute' service
# has been reduced to `try:, 'nova-compute')`. Restore both from the original
# script before running.
def nova_compute_maintenance(nova, host, max_migrations=2, poll_interval=5,
# Disable the service in the scheduler to prevent new instances being sent
# there
try:, 'nova-compute')
except exceptions.NotFound:
raise InvalidHost
_migrate_host(nova, host, max_migrations, poll_interval, cold_fallback)
# Command-line entry point: parse the options, build a nova client from the
# environment, run nova_compute_maintenance, and print the outcome.
#
# Visible options: --max-migrations/-m (int), --poll-interval/-p (int) and
# --cold-fallback/-c (flag), plus a 'host' value popped from the parsed
# options.
#
# NOTE(review): indentation and several lines are missing from this listing:
# the `parser.add_argument(...` opening for the host argument, the end of the
# --poll-interval help string, the `try:` lines paired with the `except`
# clauses, and whatever follows each error message (presumably a non-zero
# exit; confirm). Restore them from the original script before running.
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Disable a nova compute service and move all its '
'instances to other hosts')
help='The nova compute host to evacuate for maintenance')
parser.add_argument('--max-migrations', '-m', type=int,
help='The maximum number of simultaneous migrations to perform')
parser.add_argument('--poll-interval', '-p', type=int,
help='The frequency at which we poll the status of the compute '
parser.add_argument('--cold-fallback', '-c', action='store_true',
help='Fall back to cold migration if live migration fails')
opts = vars(parser.parse_args())
host = opts.pop('host')
# Filter out unset options so they're defaulted by nova_compute_maintenance
opts = {k:v for k, v in opts.iteritems() if v is not None}
nova = _get_client()
except MissingClientOpt as ex:
print ex.message
# The remaining parsed options are forwarded as keyword arguments.
nova_compute_maintenance(nova, host, **opts)
print 'Success: No instances left on host'
except InvalidHost:
print "%s is not a nova compute host" % host
except InstancesRemaining as ex:
print 'Failed to migrate the following instances:'
for instance in ex.instances:
# 'fault' is read defensively; instances without it print an empty string.
print ' %s: %s %s' % (_pretty_instance(instance),
getattr(instance, 'fault', ''))
print 'See logs for details'
Copy link

sbauza commented Sep 7, 2015

LGTM, I could nitpick with some yakshaving (like I dislike catching the base Exception) but I think it's really good to provide this as a KB.

Copy link

mriedem commented Nov 20, 2015

Copy link

How would I go about using this script?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment