@yuvalta
Last active March 7, 2024 16:11
Script to move control services to control nodes
# coding=utf-8
import os
import sys
import waiting
from datetime import datetime
from time import sleep
import retrying
from strato_common import credentials
from vm_manager_client import client as vm_client_module
from strato_kv.clustermanagement import clustermanagementapi
MAINTENANCE_MODE_COOLDOWN_TIME = 2
COOLDOWN_TIME = 10
TIMEOUT_FOR_SERVICE_HEALTH_CHECK = 300
INTERVAL_FOR_SERVICE_HEALTH_CHECK = 10
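# Nodes that should end up running the control services - update this list for your environment before running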
CONTROL_NODES = ["stratonode1.node.strato"]
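# Compute-side services toggled by the compute-maint / compute-unmaint flags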
compute_services = ['neutron-openvswitch-agent', 'neutron-ovs-cleanup', 'neutron-metering-agent', 'multipathd',
'snapshot-manager-worker', 'servicesgw', 'neutron-l3-agent', 'nrad', 'neutron-dhcp-agent',
'strato-filebeat', 'neutron-metadata-agent', 'openstack-nova-compute']
SERVICES_WITH_NODE_TYPE = [
'opa-policy-manager',
'acm-api',
'alarms-engine',
'alarms-service',
'api-explorer',
'app-catalog-periodic-tasks',
'app-catalog',
'app-catalog-worker',
'asg-api',
'autoscaling-groups',
'autoscaling-groups-worker',
'aws-auth',
'cassandra-engine',
'certificate-manager-api',
'cloudwatch-api',
'cloudwatch-backend-api',
'cloudwatch-backend-worker',
'conversions',
'conversions-worker',
'credit-manager',
'crs-manager-api',
'dbc-manager-api',
'dbs-manager',
'docker-registry',
'ec2-compute',
'elb-api',
'emr-api',
'engine-manager-api',
'engine-manager-worker',
'events-service',
'external-endpoint-manager',
'galeramariadb-engine',
'gargantua',
'gcm',
'grafana',
'guestnet-admin-tool-api',
'guestnet-admin-tool-beat',
'guestnet-admin-tool-worker',
'hot-upgrade',
'hot-upgrade-worker',
'http-proxy-service',
'iam',
'identity-manager',
'image-manager-api',
'image-manager-worker',
# 'influxdb': ['control', 2],
'inspector-api',
'inspector-worker',
'jaeger',
'kafka-engine',
'kubernetes-manager',
'kubernetes-worker',
'lbaas-manager',
'lbaas-worker',
'logserver',
'maestro-auth',
'maestro-data-reporter',
'maestro-events-reporter',
'maestro-gotty',
'maestro-tunnel-client',
'mancala-dr',
'mancala-externalstorage',
'mapreduce-api',
'melet-api',
'metrics-service',
'mongodb-engine',
'mssql-engine',
# 'mysql': ['control', 3],
'net-metrics-collector-worker',
'neutron-db-init',
'neutron-rpc-server',
'neutron-server',
'nfs-manager-api',
'ntpd-server',
'oauth2-proxy',
'oort',
'openotp-ldap-bridge',
'openstack-cinder-api',
'openstack-cinder-scheduler',
'openstack-cinder-volume',
'openstack-keystone',
'openstack-nova-api',
'openstack-nova-cert',
'openstack-nova-conductor',
'openstack-nova-consoleauth',
'openstack-nova-novncproxy',
'openstack-nova-scheduler',
'placementapi',
'placement',
# 'policy-enforcer': ['control', 3],
'policy-store',
'protection-scheduler-api',
'protection-scheduler-worker',
'quotas-manager',
# 'rack-storage-mgr': ['control', 3],
# 'rack-storage-monitor': ['control', 3],
# 'rack-storage-radosgw': ['control', 3],
'rds-api',
'redis-cache',
'redis-engine',
'region',
'resource-tracker',
'route53',
's3-manager-api',
's3-manager-worker',
's3-scality',
's3-vault',
'scality-engine',
'service-provisioner',
'services-metrics-collector',
'snapshot-manager',
'sns-api',
'sns-backend',
'sqs-engine',
'sqs-service-api',
'strato-kapacitor',
'stratonet-frontend',
'stratonet-garbagecollector',
'stratonet-ipam',
'ui-backend',
'ui-console',
'updatemanagerapi',
'vault-manager',
'vault',
'virtual-api2',
'virtual-dr',
'virtual-installation',
'virtual-maestro',
'virtual-nb',
'virtual-region',
'virtual-servicesgw',
'vm-manager',
'vm-manager-worker',
'vms-monitor',
'volumehealth',
'volume-manager',
'vpc-backend-api',
'vpc-backend-periodic-tasks',
'vpc-backend-worker',
]
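# A service is considered healthy once its DNS record resolves (status: NOERROR) and it is listed in the Consul catalog for the target node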
def is_service_healthy(service, to_node):
to_node_real_name = to_node.split('.')[0]
command = (
"dig {service}.service.strato | grep 'status: NOERROR' && "
"consul catalog services -node {to_node} | grep '{service}' 2>&1 /dev/null ".format(
service=service, to_node=to_node_real_name
)
)
try:
res = os.system(command)
return res == 0
except:
return False
def flip_placement_map(placement_map):
'''Flips a nested dict inside out
takes {hostname: {service: state}}, returns {service: [hostnames]}
'''
res = {}
for hostname in placement_map:
for service in placement_map[hostname]:
res.setdefault(service, []).append(hostname)
return res
@retrying.retry(stop_max_attempt_number=3, wait_fixed=5000)
def move_service_with_retry(service, from_node, to_node):
'''Move a service to a node, retrying if it fails'''
os.system('timeout 45s inspector tools cm move-service {service} {from_node} {to_node} -q'.format(service=service,
from_node=from_node,
to_node=to_node))
def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
percent = ('{0:.' + str(decimals) + 'f}').format(100 * (iteration / float(total)))
filled_length = int(length * iteration // total)
bar = fill * filled_length + '-' * (length - filled_length)
sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
sys.stdout.flush()
if iteration == total:
sys.stdout.write('\n')
def print_moving_services_table(data):
print '{:<30} {:<30} {:<30}'.format('Service', 'From Node', 'To Node')
for k, v in data.iteritems():
from_node, to_node = v['from'], v['to']
print '{:<30} {:<30} {:<30}'.format(k, from_node, to_node)
print '-------------------------- Total services to move: {} --------------------------'.format(len(data))
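# Run a shell command, killing it after 45 seconds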
def system_command_with_timeout(command):
try:
os.system('timeout 45s {}'.format(command))
except Exception as e:
print(e.message)
raise e
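# Move each planned service to its target control node, waiting for it to become healthy before moving on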
def move_services_to_control_nodes(cmapi):
services_to_move = summary_of_moving_services(cmapi, print_results=False)
total_moves = len(services_to_move)
moves_counter = 0
for control_service, path in services_to_move.iteritems():
from_node, to_node = path['from'], path['to']
moves_counter += 1
progress_bar(moves_counter, total_moves, prefix='Step {}'.format(moves_counter),
suffix='Moving service {} from {} to node {}'.format(control_service, from_node, to_node))
try:
move_service_with_retry(control_service, from_node, to_node)
except:
print 'Failed to move service {} from {} to node {}'.format(control_service, from_node, to_node)
continue
waiting.wait(
lambda: is_service_healthy(control_service, to_node),
timeout_seconds=TIMEOUT_FOR_SERVICE_HEALTH_CHECK,
sleep_seconds=INTERVAL_FOR_SERVICE_HEALTH_CHECK,
waiting_for='Service {} to be healthy'.format(control_service))
print 'Done moving {} services'.format(moves_counter)
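# List any VMs that are currently running on the designated control nodes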
def find_vms_on_control_nodes(vm_client):
all_vms = vm_client.list()
vms_on_control_nodes = [vm for vm in all_vms if vm["hostname"] in CONTROL_NODES]
if vms_on_control_nodes:
print 'Found {} VMs on control nodes:'.format(len(vms_on_control_nodes))
for vm in vms_on_control_nodes:
print 'VM {} is on node {}'.format(vm["name"], vm["hostname"])
else:
print 'No VMs found on control nodes'
def run_validators(vm_client):
print '\n\n[] Validation that no VMs are found on a future control node'
find_vms_on_control_nodes(vm_client)
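# Build the move plan from the Consul placement map: services not already on a control node are assigned target nodes round-robin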
def summary_of_moving_services(cmapi, print_results=True):
control_map = cmapi.registry.get('cluster/control_services_placement_map')
flipped_control_map = flip_placement_map(control_map)
services_dict = {}
total_moves = 0
for control_service in SERVICES_WITH_NODE_TYPE:
from_node = flipped_control_map[control_service][0]
to_node = CONTROL_NODES[total_moves % len(CONTROL_NODES)]
if from_node in CONTROL_NODES:
continue
if from_node == to_node:
continue
total_moves += 1
services_dict[control_service] = {'from': str(from_node), 'to': to_node}
if print_results:
print_moving_services_table(services_dict)
return services_dict
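# Save a timestamped Consul snapshot in the root directory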
def consul_backup():
file_name = '/cluster_consul_backup_{}.snap'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
system_command_with_timeout('consul snapshot save {}'.format(file_name))
print 'Saved consul snapshot to {}'.format(file_name)
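# Delete the ceph storage monitor entries from Consul; /var/lib/ceph/mon/ must then be cleaned up manually on the relevant nodes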
def fix_rack_storage_monitor_issue(cmapi):
print 'Deleting all ceph storage monitors from Consul'
cmapi.registry.delete('cluster/cephMonitors', recursive=True)
print '*** Now delete /var/lib/ceph/mon/ from the relevant nodes, and restart the service ***'
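# Put every compute service into maintenance mode, pausing between services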
def run_compute_maintenance():
for service in compute_services:
print 'Putting service {} in maintenance mode'.format(service)
system_command_with_timeout('inspector tools cm service-to-maintenance {}'.format(service))
sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
print 'Done putting services in maintenance mode'
def run_compute_unmaintenance():
for service in compute_services:
print 'Putting service {} out of maintenance mode'.format(service)
system_command_with_timeout('inspector tools cm service-from-maintenance {}'.format(service))
sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
print 'Done putting services out of maintenance mode'
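# Entry point: dispatch on the flag passed as the first CLI argument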
def main():
cmapi = clustermanagementapi.ClusterManagementAPI()
vm_client = vm_client_module.Client(headers=credentials.get_internal_headers()).api.v2.compute.vms
if len(sys.argv) == 1:
print 'Choose one of the following flags: backup, validate, summary, move, fix-rack, compute-maint, compute-unmaint'
return
flag = sys.argv[1]
# run consul backup
if flag == 'backup':
try:
consul_backup()
except Exception as e:
print 'Error while trying to take a consul snapshot {} - try again!'.format(e.message)
return
# run validators
if flag == 'validate':
try:
run_validators(vm_client)
except Exception as e:
print 'Error while running validations {} - try again!'.format(e.message)
return
# run a summary of services to move
if flag == 'summary':
try:
summary_of_moving_services(cmapi)
except Exception as e:
print 'Error while trying to get a summary of all moving services {} - try again!'.format(e.message)
return
# run the actual move
if flag == 'move':
try:
move_services_to_control_nodes(cmapi)
except Exception as e:
print 'Error while trying to move services {} - try again!'.format(e.message)
return
# fix rack-storage-monitor issue
if flag == 'fix-rack':
print 'Fixing rack storage monitor issue'
try:
fix_rack_storage_monitor_issue(cmapi)
except Exception as e:
print 'Error while trying to fix rack-storage-monitor issue {} - try again!'.format(e.message)
return
# run compute services maintenance
if flag == 'compute-maint':
try:
run_compute_maintenance()
except Exception as e:
print 'Error while trying to run compute services maintenance {} - try again!'.format(e.message)
return
# run compute services un-maintenance
if flag == 'compute-unmaint':
try:
run_compute_unmaintenance()
except Exception as e:
print 'Error while trying to run compute services un-maintenance {} - try again!'.format(e.message)
return
main()

yuvalta commented Jan 11, 2023

Script for moving control services to selected nodes

This script helps reduce the "storm" of services moving across the cluster when converting to LCS (without maintenance mode)

How to use:

  • Copy and paste the code to a node
  • Choose your control nodes and update CONTROL_NODES accordingly
  • Update COOLDOWN_TIME if needed
  • Run it with one of the following flags (see the usage example below) -
    • backup - create a consul snapshot
    • validate - run validations on the cluster (currently only checking that no VMs are running on a future control node)
    • summary - print a table of all services that need to move, with their path (from node and to node)
    • move - move the services
    • fix-rack - delete the ceph storage monitor entries from Consul (rack-storage-monitor fix)
    • compute-maint - put compute services into maintenance mode
    • compute-unmaint - take compute services out of maintenance mode
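
For example, on a node (assuming the script was saved as move_control_services.py; the filename is arbitrary):

    python move_control_services.py backup
    python move_control_services.py summary
    python move_control_services.py move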


yuvalta commented Jan 15, 2023

move consul_backup to root dir instead of /tmp
