Script to move control services to control nodes
# coding=utf-8
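# Usage sketch (the flag names match the handlers in main(); the script file name here is
# only illustrative, run whichever name this gist is saved as):
#   python move_control_services.py backup           # save a consul snapshot first
#   python move_control_services.py validate         # check that no VMs sit on a future control node
#   python move_control_services.py summary          # print the services that would be moved
#   python move_control_services.py move             # move the services to CONTROL_NODES
#   python move_control_services.py fix-rack         # clear the rack-storage-monitor Consul keys
#   python move_control_services.py compute-maint    # put compute services into maintenance mode
#   python move_control_services.py compute-unmaint  # take compute services out of maintenance mode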
import os
import sys
import waiting
from datetime import datetime
from time import sleep
import retrying
from strato_common import credentials
from vm_manager_client import client as vm_client_module
from strato_kv.clustermanagement import clustermanagementapi

MAINTENANCE_MODE_COOLDOWN_TIME = 2
COOLDOWN_TIME = 10
TIMEOUT_FOR_SERVICE_HEALTH_CHECK = 300
INTERVAL_FOR_SERVICE_HEALTH_CHECK = 10
CONTROL_NODES = ["stratonode1.node.strato"]

compute_services = ['neutron-openvswitch-agent', 'neutron-ovs-cleanup', 'neutron-metering-agent', 'multipathd',
                    'snapshot-manager-worker', 'servicesgw', 'neutron-l3-agent', 'nrad', 'neutron-dhcp-agent',
                    'strato-filebeat', 'neutron-metadata-agent', 'openstack-nova-compute']
SERVICES_WITH_NODE_TYPE = [
'opa-policy-manager',
'acm-api',
'alarms-engine',
'alarms-service',
'api-explorer',
'app-catalog-periodic-tasks',
'app-catalog',
'app-catalog-worker',
'asg-api',
'autoscaling-groups',
'autoscaling-groups-worker',
'aws-auth',
'cassandra-engine',
'certificate-manager-api',
'cloudwatch-api',
'cloudwatch-backend-api',
'cloudwatch-backend-worker',
'conversions',
'conversions-worker',
'credit-manager',
'crs-manager-api',
'dbc-manager-api',
'dbs-manager',
'docker-registry',
'ec2-compute',
'elb-api',
'emr-api',
'engine-manager-api',
'engine-manager-worker',
'events-service',
'external-endpoint-manager',
'galeramariadb-engine',
'gargantua',
'gcm',
'grafana',
'guestnet-admin-tool-api',
'guestnet-admin-tool-beat',
'guestnet-admin-tool-worker',
'hot-upgrade',
'hot-upgrade-worker',
'http-proxy-service',
'iam',
'identity-manager',
'image-manager-api',
'image-manager-worker',
# 'influxdb': ['control', 2],
'inspector-api',
'inspector-worker',
'jaeger',
'kafka-engine',
'kubernetes-manager',
'kubernetes-worker',
'lbaas-manager',
'lbaas-worker',
'logserver',
'maestro-auth',
'maestro-data-reporter',
'maestro-events-reporter',
'maestro-gotty',
'maestro-tunnel-client',
'mancala-dr',
'mancala-externalstorage',
'mapreduce-api',
'melet-api',
'metrics-service',
'mongodb-engine',
'mssql-engine',
# 'mysql': ['control', 3],
'net-metrics-collector-worker',
'neutron-db-init',
'neutron-rpc-server',
'neutron-server',
'nfs-manager-api',
'ntpd-server',
'oauth2-proxy',
'oort',
'openotp-ldap-bridge',
'openstack-cinder-api',
'openstack-cinder-scheduler',
'openstack-cinder-volume',
'openstack-keystone',
'openstack-nova-api',
'openstack-nova-cert',
'openstack-nova-conductor',
'openstack-nova-consoleauth',
'openstack-nova-novncproxy',
'openstack-nova-scheduler',
'placementapi',
'placement',
# 'policy-enforcer': ['control', 3],
'policy-store',
'protection-scheduler-api',
'protection-scheduler-worker',
'quotas-manager',
# 'rack-storage-mgr': ['control', 3],
# 'rack-storage-monitor': ['control', 3],
# 'rack-storage-radosgw': ['control', 3],
'rds-api',
'redis-cache',
'redis-engine',
'region',
'resource-tracker',
'route53',
's3-manager-api',
's3-manager-worker',
's3-scality',
's3-vault',
'scality-engine',
'service-provisioner',
'services-metrics-collector',
'snapshot-manager',
'sns-api',
'sns-backend',
'sqs-engine',
'sqs-service-api',
'strato-kapacitor',
'stratonet-frontend',
'stratonet-garbagecollector',
'stratonet-ipam',
'ui-backend',
'ui-console',
'updatemanagerapi',
'vault-manager',
'vault',
'virtual-api2',
'virtual-dr',
'virtual-installation',
'virtual-maestro',
'virtual-nb',
'virtual-region',
'virtual-servicesgw',
'vm-manager',
'vm-manager-worker',
'vms-monitor',
'volumehealth',
'volume-manager',
'vpc-backend-api',
'vpc-backend-periodic-tasks',
'vpc-backend-worker',
]

def is_service_healthy(service, to_node):
    '''Check that the service resolves in DNS and is registered in Consul on the target node'''
    to_node_real_name = to_node.split('.')[0]
    command = (
        "dig {service}.service.strato | grep 'status: NOERROR' && "
        "consul catalog services -node {to_node} | grep '{service}' > /dev/null 2>&1".format(
            service=service, to_node=to_node_real_name
        )
    )
    try:
        res = os.system(command)
        return res == 0
    except Exception:
        return False

def flip_placement_map(placement_map):
    '''Flips a nested dict inside out
    takes {hostname: {service: state}}, returns {service: [hostnames]}
    '''
    res = {}
    for hostname in placement_map:
        for service in placement_map[hostname]:
            res.setdefault(service, []).append(hostname)
    return res

@retrying.retry(stop_max_attempt_number=3, wait_fixed=5000)
def move_service_with_retry(service, from_node, to_node):
    '''Move a service to a node, retrying if it fails'''
    ret = os.system('timeout 45s inspector tools cm move-service {service} {from_node} {to_node} -q'.format(
        service=service, from_node=from_node, to_node=to_node))
    if ret != 0:
        # os.system does not raise on a non-zero exit, so raise explicitly to trigger the retry decorator
        raise RuntimeError('move-service exited with status {}'.format(ret))

def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    percent = ('{0:.' + str(decimals) + 'f}').format(100 * (iteration / float(total)))
    filled_length = int(length * iteration // total)
    bar = fill * filled_length + '-' * (length - filled_length)
    sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
    sys.stdout.flush()
    if iteration == total:
        sys.stdout.write('\n')

def print_moving_services_table(data):
    print '{:<30} {:<30} {:<30}'.format('Service', 'From Node', 'To Node')
    for k, v in data.iteritems():
        from_node, to_node = v['from'], v['to']
        print '{:<30} {:<30} {:<30}'.format(k, from_node, to_node)
    print '-------------------------- Total services to move: {} --------------------------'.format(len(data))

def system_command_with_timeout(command):
    try:
        os.system('timeout 45s {}'.format(command))
    except Exception as e:
        print(e.message)
        raise e

def move_services_to_control_nodes(cmapi):
    services_to_move = summary_of_moving_services(cmapi, print_results=False)
    total_moves = len(services_to_move)
    moves_counter = 0
    for control_service, path in services_to_move.iteritems():
        from_node, to_node = path['from'], path['to']
        moves_counter += 1
        progress_bar(moves_counter, total_moves, prefix='Step {}'.format(moves_counter),
                     suffix='Moving service {} from {} to node {}'.format(control_service, from_node, to_node))
        try:
            move_service_with_retry(control_service, from_node, to_node)
        except Exception:
            print 'Failed to move service {} from {} to node {}'.format(control_service, from_node, to_node)
            continue
        waiting.wait(
            lambda: is_service_healthy(control_service, to_node),
            timeout_seconds=TIMEOUT_FOR_SERVICE_HEALTH_CHECK,
            sleep_seconds=INTERVAL_FOR_SERVICE_HEALTH_CHECK,
            waiting_for='Service {} to be healthy'.format(control_service))
    print 'Done moving {} services'.format(moves_counter)

def find_vms_on_control_nodes(vm_client):
    all_vms = vm_client.list()
    vms_on_control_nodes = [vm for vm in all_vms if vm["hostname"] in CONTROL_NODES]
    if vms_on_control_nodes:
        print 'Found {} VMs on control nodes:'.format(len(vms_on_control_nodes))
        for vm in vms_on_control_nodes:
            print 'VM {} is on node {}'.format(vm["name"], vm["hostname"])
    else:
        print 'No VMs found on control nodes'

def run_validators(vm_client):
    print '\n\n[] Validation that no VMs are found on a future control node'
    find_vms_on_control_nodes(vm_client)

def summary_of_moving_services(cmapi, print_results=True):
    control_map = cmapi.registry.get('cluster/control_services_placement_map')
    flipped_control_map = flip_placement_map(control_map)
    services_dict = {}
    total_moves = 0
    for control_service in SERVICES_WITH_NODE_TYPE:
        from_node = flipped_control_map[control_service][0]
        to_node = CONTROL_NODES[total_moves % len(CONTROL_NODES)]
        if from_node in CONTROL_NODES:
            continue
        if from_node == to_node:
            continue
        total_moves += 1
        services_dict[control_service] = {'from': str(from_node), 'to': to_node}
    if print_results:
        print_moving_services_table(services_dict)
    return services_dict

def consul_backup():
    file_name = '/cluster_consul_backup_{}.snap'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    system_command_with_timeout('consul snapshot save {}'.format(file_name))
    print 'Saved consul snapshot to {}'.format(file_name)

def fix_rack_storage_monitor_issue(cmapi):
    print 'Deleting all ceph storage monitors from Consul'
    cmapi.registry.delete('cluster/cephMonitors', recursive=True)
    print '*** Now delete /var/lib/ceph/mon/ from the relevant nodes, and restart the service ***'

def run_compute_maintenance():
    for service in compute_services:
        print 'Putting service {} in maintenance mode'.format(service)
        system_command_with_timeout('inspector tools cm service-to-maintenance {}'.format(service))
        sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
    print 'Done putting services in maintenance mode'


def run_compute_unmaintenance():
    for service in compute_services:
        print 'Taking service {} out of maintenance mode'.format(service)
        system_command_with_timeout('inspector tools cm service-from-maintenance {}'.format(service))
        sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
    print 'Done taking services out of maintenance mode'

def main():
    cmapi = clustermanagementapi.ClusterManagementAPI()
    vm_client = vm_client_module.Client(headers=credentials.get_internal_headers()).api.v2.compute.vms
    if len(sys.argv) == 1:
        print 'Choose one of the following flags: backup, validate, summary, move, fix-rack, compute-maint, compute-unmaint'
        return
    flag = sys.argv[1]
    # run consul backup
    if flag == 'backup':
        try:
            consul_backup()
        except Exception as e:
            print 'Error while trying to take a consul snapshot {} - try again!'.format(e.message)
        return
    # run validators
    if flag == 'validate':
        try:
            run_validators(vm_client)
        except Exception as e:
            print 'Error while running validations {} - try again!'.format(e.message)
        return
    # run a summary of services to move
    if flag == 'summary':
        try:
            summary_of_moving_services(cmapi)
        except Exception as e:
            print 'Error while trying to get a summary of all moving services {} - try again!'.format(e.message)
        return
    # run the actual move
    if flag == 'move':
        try:
            move_services_to_control_nodes(cmapi)
        except Exception as e:
            print 'Error while trying to move services {} - try again!'.format(e.message)
        return
    # fix rack-storage-monitor issue
    if flag == 'fix-rack':
        print 'Fixing rack storage monitor issue'
        try:
            fix_rack_storage_monitor_issue(cmapi)
        except Exception as e:
            print 'Error while trying to fix rack-storage-monitor issue {} - try again!'.format(e.message)
        return
    # run compute services maintenance
    if flag == 'compute-maint':
        try:
            run_compute_maintenance()
        except Exception as e:
            print 'Error while trying to run compute services maintenance {} - try again!'.format(e.message)
        return
    # run compute services un-maintenance
    if flag == 'compute-unmaint':
        try:
            run_compute_unmaintenance()
        except Exception as e:
            print 'Error while trying to run compute services un-maintenance {} - try again!'.format(e.message)
        return


if __name__ == '__main__':
    main()

yuvalta commented Jan 15, 2023: move consul_backup to root dir instead of /tmp
