Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Put a nova compute node into 'maintenance mode'. Disable its service, and attempt to migrate all instances on it to other nodes.
#!/usr/bin/env python
import argparse
from collections import defaultdict
import os
import sys
import time
from novaclient import client
from novaclient import exceptions
# States which we can migrate
STATES_MIGRATABLE = set(['ACTIVE', 'SHUTOFF'])
# States which indicate a migration is in progress
STATES_MIGRATING = set(['MIGRATING', 'RESIZE'])
# States which should go away if we wait
STATES_EPHEMERAL = set(['REBOOT', 'HARD_REBOOT', 'REBUILD', 'REVERT_RESIZE'])
class MissingClientOpt(Exception):
pass
class InvalidHost(Exception):
pass
class InstancesRemaining(Exception):
def __init__(self, instances):
self.instances = instances
super(InstancesRemaining, self).__init__()
def _get_client():
opts = {
'username': 'OS_USERNAME',
'api_key': 'OS_PASSWORD',
'project_id': 'OS_TENANT_NAME',
'auth_url': 'OS_AUTH_URL',
}
vals = {}
for opt, env in opts.iteritems():
vals[opt] = val = os.environ.get(env)
if val is None:
raise MissingClientOpt('environment variable %s not set' % env)
return client.Client('2', **vals)
def _migrate_single(instance, force_cold):
if not force_cold[instance.id] and instance.status == 'ACTIVE':
try:
instance.live_migrate()
except exceptions.BadRequest as ex:
# We have no robust way of knowing what this exception is beyond a
# 400, so we're just going to guess that it failed because of
# shared storage and try again with block_migration set. This
# potentially hides a real error, but we don't have any way to tell
# when we have a real error.
instance.live_migrate(block_migration=True)
return False
elif force_cold[instance.id] or instance.status == 'SHUTOFF':
instance.migrate()
return True
def _migrate_multiple(instances, max_migrations, cold_fallback,
attempts, force_cold, verify):
busy = set()
migrating = set()
for instance in instances:
# Instance is already migrating
if instance.status in STATES_MIGRATING:
migrating.add(instance)
continue
# Instance is busy with something, and we should wait
if instance.status in STATES_EPHEMERAL:
busy.add(instance)
continue
# If we've tried 3 times to live migrate a guest and the user has
# specified cold fallback, shut the guest down and try a cold migration
if (cold_fallback and attempts[instance.id] == 3 and
instance.status == 'ACTIVE' and not force_cold[instance.id]):
print ('WARNING: falling back to cold migration of %s' %
_pretty_instance(instance))
force_cold[instance.id] = True
attempts[instance.id] = 0
# For safety, we don't attempt to do anything with an instance which is
# in a state we don't explicitly handle. Note that we can't safely
# handle PAUSED, SUSPENDED, and RESCUED in any case, as migration
# requires moving the instance out of this state, and we don't have the
# context to do that safely.
elif (instance.status not in STATES_MIGRATABLE or
attempts[instance.id] == 3):
continue
if len(migrating) < max_migrations:
migrating.add(instance)
attempts[instance.id] += 1
try:
resize = _migrate_single(instance, force_cold)
except Exception as ex:
# Almost everything is a 'BadRequest'. This covers such a wide
# variety of errors, some of them ephemeral, that we just retry
# here regardless
continue
if resize:
verify.add(instance.id)
return migrating, busy
def _pretty_instance(instance):
return '%s(%s)' % (instance.name, instance.id)
def _migrate_host(nova, host, max_migrations, poll_interval, cold_fallback):
host_filter = {'host': host}
# The number of times we've attempted to migrate an instance
attempts = defaultdict(int)
verify = set()
force_cold = defaultdict(bool)
# Display
instances = nova.servers.list(search_opts=host_filter)
if len(instances) > 0:
print 'Found instances on host:'
for instance in instances:
print ' %s' % _pretty_instance(instance)
# Repeatedly attempt to evacuate the host until completion
while True:
# Attempt to migrate instances on the host
migrating, busy = _migrate_multiple(instances,
max_migrations, cold_fallback, attempts, force_cold, verify)
remaining_instance_ids = set([instance.id for instance in instances])
# If we successfully cold migrated an instance, it will be left in
# VERIFY_RESIZE on a new host. Here we auto-confirm it, as it was a
# simple copy. This may involve an expensive data-scrubbing operation,
# so we start it as soon as possible.
verified = set()
for instance_id in verify:
# Instances in VERIFY_RESIZE have moved to a different host, so
# they won't be in instances, which only contains instances from
# the target host.
if instance_id in remaining_instance_ids:
continue
instance = nova.servers.get(instance_id)
if instance.status == 'VERIFY_RESIZE':
instance.confirm_resize()
elif instance.status in STATES_MIGRATABLE:
verified.add(instance_id)
elif instance.status == 'RESIZE':
# Haven't finished on the target host
pass
else:
# Instance is in unexpected state on target host. Warn, and
# don't wait any longer.
print ('WARNING: Instance %s has migrated to host %s. '
'Expected status VERIFY_RESIZE, but status is %s'
% (_pretty_instance(instance),
getattr(instance, 'OS-EXT-SRV-ATTR:host'),
instance.status))
verified.add(instance_id)
verify.difference_update(verified)
# Finish when we didn't do anything, and we're not waiting on anything
if len(migrating) == 0 and len(busy) == 0 and len(verify) == 0:
break
# Report status on every round
if len(migrating) > 0:
print 'Migrating:'
for instance in migrating:
print ' %s' % _pretty_instance(instance)
if len(busy) > 0:
print 'Busy:'
for instance in busy:
print ' %s: %s' % (_pretty_instance(instance),
instance.status)
time.sleep(poll_interval)
instances = nova.servers.list(search_opts=host_filter)
if len(instances) > 0:
raise InstancesRemaining(instances)
def nova_compute_maintenance(nova, host, max_migrations=2, poll_interval=5,
cold_fallback=False):
# Disable the service in the scheduler to prevent new instances being sent
# there
try:
nova.services.disable(host, 'nova-compute')
except exceptions.NotFound:
raise InvalidHost
_migrate_host(nova, host, max_migrations, poll_interval, cold_fallback)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Disable a nova compute service and move all its '
'instances to other hosts')
parser.add_argument('host',
help='The nova compute host to evacuate for maintenance')
parser.add_argument('--max-migrations', '-m', type=int,
help='The maximum number of simultaneous migrations to perform')
parser.add_argument('--poll-interval', '-p', type=int,
help='The frequency at which we poll the status of the compute '
'service')
parser.add_argument('--cold-fallback', '-c', action='store_true',
help='Fall back to cold migration if live migration fails')
opts = vars(parser.parse_args())
host = opts.pop('host')
# Filter out unset options so they're defaulted by nova_compute_maintenance
opts = {k:v for k, v in opts.iteritems() if v is not None}
try:
nova = _get_client()
except MissingClientOpt as ex:
print ex.message
sys.exit(1)
try:
nova_compute_maintenance(nova, host, **opts)
print 'Success: No instances left on host'
sys.exit(0)
except InvalidHost:
print "%s is not a nova compute host" % host
except InstancesRemaining as ex:
print 'Failed to migrate the following instances:'
for instance in ex.instances:
print ' %s: %s %s' % (_pretty_instance(instance),
instance.status,
getattr(instance, 'fault', ''))
print 'See logs for details'
sys.exit(1)
@sbauza

This comment has been minimized.

Copy link

sbauza commented Sep 7, 2015

LGTM, I could nitpick with some yakshaving (like I dislike catching the base Exception) but I think it's really good to provide this as a KB.

@mriedem

This comment has been minimized.

@alcamie101

This comment has been minimized.

Copy link

alcamie101 commented Jun 11, 2016

How would I go about using this script?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.