@yuvalta
Last active March 7, 2024 16:11
Script to move control services to control nodes
# coding=utf-8
import os
import sys
import waiting
from datetime import datetime
from time import sleep
import retrying
from strato_common import credentials
from vm_manager_client import client as vm_client_module
from strato_kv.clustermanagement import clustermanagementapi
MAINTENANCE_MODE_COOLDOWN_TIME = 2
COOLDOWN_TIME = 10
TIMEOUT_FOR_SERVICE_HEALTH_CHECK = 300
INTERVAL_FOR_SERVICE_HEALTH_CHECK = 10
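# Nodes that should end up running the control services - update this list for your environment before running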
CONTROL_NODES = ["stratonode1.node.strato"]
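# Compute-side services toggled by the compute-maint / compute-unmaint flags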
compute_services = ['neutron-openvswitch-agent', 'neutron-ovs-cleanup', 'neutron-metering-agent', 'multipathd',
'snapshot-manager-worker', 'servicesgw', 'neutron-l3-agent', 'nrad', 'neutron-dhcp-agent',
'strato-filebeat', 'neutron-metadata-agent', 'openstack-nova-compute']
SERVICES_WITH_NODE_TYPE = [
'opa-policy-manager',
'acm-api',
'alarms-engine',
'alarms-service',
'api-explorer',
'app-catalog-periodic-tasks',
'app-catalog',
'app-catalog-worker',
'asg-api',
'autoscaling-groups',
'autoscaling-groups-worker',
'aws-auth',
'cassandra-engine',
'certificate-manager-api',
'cloudwatch-api',
'cloudwatch-backend-api',
'cloudwatch-backend-worker',
'conversions',
'conversions-worker',
'credit-manager',
'crs-manager-api',
'dbc-manager-api',
'dbs-manager',
'docker-registry',
'ec2-compute',
'elb-api',
'emr-api',
'engine-manager-api',
'engine-manager-worker',
'events-service',
'external-endpoint-manager',
'galeramariadb-engine',
'gargantua',
'gcm',
'grafana',
'guestnet-admin-tool-api',
'guestnet-admin-tool-beat',
'guestnet-admin-tool-worker',
'hot-upgrade',
'hot-upgrade-worker',
'http-proxy-service',
'iam',
'identity-manager',
'image-manager-api',
'image-manager-worker',
# 'influxdb': ['control', 2],
'inspector-api',
'inspector-worker',
'jaeger',
'kafka-engine',
'kubernetes-manager',
'kubernetes-worker',
'lbaas-manager',
'lbaas-worker',
'logserver',
'maestro-auth',
'maestro-data-reporter',
'maestro-events-reporter',
'maestro-gotty',
'maestro-tunnel-client',
'mancala-dr',
'mancala-externalstorage',
'mapreduce-api',
'melet-api',
'metrics-service',
'mongodb-engine',
'mssql-engine',
# 'mysql': ['control', 3],
'net-metrics-collector-worker',
'neutron-db-init',
'neutron-rpc-server',
'neutron-server',
'nfs-manager-api',
'ntpd-server',
'oauth2-proxy',
'oort',
'openotp-ldap-bridge',
'openstack-cinder-api',
'openstack-cinder-scheduler',
'openstack-cinder-volume',
'openstack-keystone',
'openstack-nova-api',
'openstack-nova-cert',
'openstack-nova-conductor',
'openstack-nova-consoleauth',
'openstack-nova-novncproxy',
'openstack-nova-scheduler',
'placementapi',
'placement',
# 'policy-enforcer': ['control', 3],
'policy-store',
'protection-scheduler-api',
'protection-scheduler-worker',
'quotas-manager',
# 'rack-storage-mgr': ['control', 3],
# 'rack-storage-monitor': ['control', 3],
# 'rack-storage-radosgw': ['control', 3],
'rds-api',
'redis-cache',
'redis-engine',
'region',
'resource-tracker',
'route53',
's3-manager-api',
's3-manager-worker',
's3-scality',
's3-vault',
'scality-engine',
'service-provisioner',
'services-metrics-collector',
'snapshot-manager',
'sns-api',
'sns-backend',
'sqs-engine',
'sqs-service-api',
'strato-kapacitor',
'stratonet-frontend',
'stratonet-garbagecollector',
'stratonet-ipam',
'ui-backend',
'ui-console',
'updatemanagerapi',
'vault-manager',
'vault',
'virtual-api2',
'virtual-dr',
'virtual-installation',
'virtual-maestro',
'virtual-nb',
'virtual-region',
'virtual-servicesgw',
'vm-manager',
'vm-manager-worker',
'vms-monitor',
'volumehealth',
'volume-manager',
'vpc-backend-api',
'vpc-backend-periodic-tasks',
'vpc-backend-worker',
]
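# A service is considered healthy once its DNS record resolves (status: NOERROR) and it is listed in the Consul catalog for the target node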
def is_service_healthy(service, to_node):
to_node_real_name = to_node.split('.')[0]
command = (
"dig {service}.service.strato | grep 'status: NOERROR' && "
"consul catalog services -node {to_node} | grep '{service}' 2>&1 /dev/null ".format(
service=service, to_node=to_node_real_name
)
)
try:
res = os.system(command)
return res == 0
except:
return False
def flip_placement_map(placement_map):
'''Flips a nested dict inside out
takes {hostname: {service: state}}, returns {service: [hostnames]}
'''
res = {}
for hostname in placement_map:
for service in placement_map[hostname]:
res.setdefault(service, []).append(hostname)
return res
@retrying.retry(stop_max_attempt_number=3, wait_fixed=5000)
def move_service_with_retry(service, from_node, to_node):
'''Move a service to a node, retrying if it fails'''
os.system('timeout 45s inspector tools cm move-service {service} {from_node} {to_node} -q'.format(service=service,
from_node=from_node,
to_node=to_node))
def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
percent = ('{0:.' + str(decimals) + 'f}').format(100 * (iteration / float(total)))
filled_length = int(length * iteration // total)
bar = fill * filled_length + '-' * (length - filled_length)
sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
sys.stdout.flush()
if iteration == total:
sys.stdout.write('\n')
def print_moving_services_table(data):
print '{:<30} {:<30} {:<30}'.format('Service', 'From Node', 'To Node')
for k, v in data.iteritems():
from_node, to_node = v['from'], v['to']
print '{:<30} {:<30} {:<30}'.format(k, from_node, to_node)
print '-------------------------- Total services to move: {} --------------------------'.format(len(data))
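# Run a shell command, killing it after 45 seconds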
def system_command_with_timeout(command):
try:
os.system('timeout 45s {}'.format(command))
except Exception as e:
print(e.message)
raise e
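# Move each planned service to its target control node, waiting for it to become healthy before moving on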
def move_services_to_control_nodes(cmapi):
services_to_move = summary_of_moving_services(cmapi, print_results=False)
total_moves = len(services_to_move)
moves_counter = 0
for control_service, path in services_to_move.iteritems():
from_node, to_node = path['from'], path['to']
moves_counter += 1
progress_bar(moves_counter, total_moves, prefix='Step {}'.format(moves_counter),
suffix='Moving service {} from {} to node {}'.format(control_service, from_node, to_node))
try:
move_service_with_retry(control_service, from_node, to_node)
except:
print 'Failed to move service {} from {} to node {}'.format(control_service, from_node, to_node)
continue
waiting.wait(
lambda: is_service_healthy(control_service, to_node),
timeout_seconds=TIMEOUT_FOR_SERVICE_HEALTH_CHECK,
sleep_seconds=INTERVAL_FOR_SERVICE_HEALTH_CHECK,
waiting_for='Service {} to be healthy'.format(control_service))
print 'Done moving {} services'.format(moves_counter)
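# List any VMs that are currently running on the designated control nodes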
def find_vms_on_control_nodes(vm_client):
all_vms = vm_client.list()
vms_on_control_nodes = [vm for vm in all_vms if vm["hostname"] in CONTROL_NODES]
if vms_on_control_nodes:
print 'Found {} VMs on control nodes:'.format(len(vms_on_control_nodes))
for vm in vms_on_control_nodes:
print 'VM {} is on node {}'.format(vm["name"], vm["hostname"])
else:
print 'No VMs found on control nodes'
def run_validators(vm_client):
print '\n\n[] Validation that no VMs are found on a future control node'
find_vms_on_control_nodes(vm_client)
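# Build the move plan from the Consul placement map: services not already on a control node are assigned target nodes round-robin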
def summary_of_moving_services(cmapi, print_results=True):
control_map = cmapi.registry.get('cluster/control_services_placement_map')
flipped_control_map = flip_placement_map(control_map)
services_dict = {}
total_moves = 0
for control_service in SERVICES_WITH_NODE_TYPE:
from_node = flipped_control_map[control_service][0]
to_node = CONTROL_NODES[total_moves % len(CONTROL_NODES)]
if from_node in CONTROL_NODES:
continue
if from_node == to_node:
continue
total_moves += 1
services_dict[control_service] = {'from': str(from_node), 'to': to_node}
if print_results:
print_moving_services_table(services_dict)
return services_dict
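# Save a timestamped Consul snapshot in the root directory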
def consul_backup():
file_name = '/cluster_consul_backup_{}.snap'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
system_command_with_timeout('consul snapshot save {}'.format(file_name))
print 'Saved consul snapshot to {}'.format(file_name)
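# Delete the ceph storage monitor entries from Consul; /var/lib/ceph/mon/ must then be cleaned up manually on the relevant nodes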
def fix_rack_storage_monitor_issue(cmapi):
print 'Deleting all ceph storage monitors from Consul'
cmapi.registry.delete('cluster/cephMonitors', recursive=True)
print '*** Now delete /var/lib/ceph/mon/ from the relevant nodes, and restart the service ***'
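# Put every compute service into maintenance mode, pausing between services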
def run_compute_maintenance():
for service in compute_services:
print 'Putting service {} in maintenance mode'.format(service)
system_command_with_timeout('inspector tools cm service-to-maintenance {}'.format(service))
sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
print 'Done putting services in maintenance mode'
def run_compute_unmaintenance():
for service in compute_services:
print 'Putting service {} out of maintenance mode'.format(service)
system_command_with_timeout('inspector tools cm service-from-maintenance {}'.format(service))
sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
print 'Done putting services out of maintenance mode'
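# Entry point: dispatch on the flag passed as the first CLI argument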
def main():
cmapi = clustermanagementapi.ClusterManagementAPI()
vm_client = vm_client_module.Client(headers=credentials.get_internal_headers()).api.v2.compute.vms
if len(sys.argv) == 1:
print 'Choose one of the following flags: backup, validate, summary, move, fix-rack, compute-maint, compute-unmaint'
return
flag = sys.argv[1]
# run consul backup
if flag == 'backup':
try:
consul_backup()
except Exception as e:
print 'Error while trying to take a consul snapshot {} - try again!'.format(e.message)
return
# run validators
if flag == 'validate':
try:
run_validators(vm_client)
except Exception as e:
print 'Error while running validations {} - try again!'.format(e.message)
return
# run a summary of services to move
if flag == 'summary':
try:
summary_of_moving_services(cmapi)
except Exception as e:
print 'Error while trying to get a summary of all moving services {} - try again!'.format(e.message)
return
# run the actual move
if flag == 'move':
try:
move_services_to_control_nodes(cmapi)
except Exception as e:
print 'Error while trying to move services {} - try again!'.format(e.message)
return
# fix rack-storage-monitor issue
if flag == 'fix-rack':
print 'Fixing rack storage monitor issue'
try:
fix_rack_storage_monitor_issue(cmapi)
except Exception as e:
print 'Error while trying to fix rack-storage-monitor issue {} - try again!'.format(e.message)
return
# run compute services maintenance
if flag == 'compute-maint':
try:
run_compute_maintenance()
except Exception as e:
print 'Error while trying to run compute services maintenance {} - try again!'.format(e.message)
return
# run compute services un-maintenance
if flag == 'compute-unmaint':
try:
run_compute_unmaintenance()
except Exception as e:
print 'Error while trying to run compute services un-maintenance {} - try again!'.format(e.message)
return
main()

yuvalta commented Jan 11, 2023

Script for moving control services to selected nodes

This script helps reduce the "storm" of services moving across the cluster when converting to LCS (without maintenance mode)

How to use:

  • Copy and paste the code to a node
  • Choose your control nodes and update CONTROL_NODES accordingly
  • Update COOLDOWN_TIME if needed
  • Run it with one of the following flags (see the usage example below) -
    • backup - create a consul snapshot
    • validate - run validations on the cluster (currently only checking that no VMs are running on a future control node)
    • summary - print a table of all services that need to move, with their path (from node and to node)
    • move - move the services
    • fix-rack - delete the ceph storage monitor entries from Consul (rack-storage-monitor fix)
    • compute-maint - put compute services into maintenance mode
    • compute-unmaint - take compute services out of maintenance mode
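
For example, on a node (assuming the script was saved as move_control_services.py; the filename is arbitrary):

    python move_control_services.py backup
    python move_control_services.py summary
    python move_control_services.py move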


yuvalta commented Jan 15, 2023

move consul_backup to root dir instead of /tmp
