Last active
March 7, 2024 16:11
-
-
Save yuvalta/5f3789a6c3bd2c6865d728d578e3b669 to your computer and use it in GitHub Desktop.
Script to move control services to control nodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
import os | |
import sys | |
import waiting | |
from datetime import datetime | |
from time import sleep | |
import retrying | |
from strato_common import credentials | |
from vm_manager_client import client as vm_client_module | |
from strato_kv.clustermanagement import clustermanagementapi | |
# Timing knobs (all values in seconds).
MAINTENANCE_MODE_COOLDOWN_TIME = 2  # pause between per-service maintenance toggles
COOLDOWN_TIME = 10  # general cooldown; tune per cluster if needed
TIMEOUT_FOR_SERVICE_HEALTH_CHECK = 300  # max wait for a moved service to become healthy
INTERVAL_FOR_SERVICE_HEALTH_CHECK = 10  # delay between health probes

# Target nodes that will host the control services; edit this list before running.
CONTROL_NODES = ["stratonode1.node.strato"]

# Compute-side services toggled by the 'compute-maint' / 'compute-unmaint' flags.
compute_services = ['neutron-openvswitch-agent', 'neutron-ovs-cleanup', 'neutron-metering-agent', 'multipathd',
                    'snapshot-manager-worker', 'servicesgw', 'neutron-l3-agent', 'nrad', 'neutron-dhcp-agent',
                    'strato-filebeat', 'neutron-metadata-agent', 'openstack-nova-compute']
# Control-plane services that should live on CONTROL_NODES.
# summary_of_moving_services() walks this list to build the move plan.
# Commented-out entries are deliberately excluded from the move
# (their original placement specs are kept here for reference).
SERVICES_WITH_NODE_TYPE = [
    'opa-policy-manager',
    'acm-api',
    'alarms-engine',
    'alarms-service',
    'api-explorer',
    'app-catalog-periodic-tasks',
    'app-catalog',
    'app-catalog-worker',
    'asg-api',
    'autoscaling-groups',
    'autoscaling-groups-worker',
    'aws-auth',
    'cassandra-engine',
    'certificate-manager-api',
    'cloudwatch-api',
    'cloudwatch-backend-api',
    'cloudwatch-backend-worker',
    'conversions',
    'conversions-worker',
    'credit-manager',
    'crs-manager-api',
    'dbc-manager-api',
    'dbs-manager',
    'docker-registry',
    'ec2-compute',
    'elb-api',
    'emr-api',
    'engine-manager-api',
    'engine-manager-worker',
    'events-service',
    'external-endpoint-manager',
    'galeramariadb-engine',
    'gargantua',
    'gcm',
    'grafana',
    'guestnet-admin-tool-api',
    'guestnet-admin-tool-beat',
    'guestnet-admin-tool-worker',
    'hot-upgrade',
    'hot-upgrade-worker',
    'http-proxy-service',
    'iam',
    'identity-manager',
    'image-manager-api',
    'image-manager-worker',
    # 'influxdb': ['control', 2],
    'inspector-api',
    'inspector-worker',
    'jaeger',
    'kafka-engine',
    'kubernetes-manager',
    'kubernetes-worker',
    'lbaas-manager',
    'lbaas-worker',
    'logserver',
    'maestro-auth',
    'maestro-data-reporter',
    'maestro-events-reporter',
    'maestro-gotty',
    'maestro-tunnel-client',
    'mancala-dr',
    'mancala-externalstorage',
    'mapreduce-api',
    'melet-api',
    'metrics-service',
    'mongodb-engine',
    'mssql-engine',
    # 'mysql': ['control', 3],
    'net-metrics-collector-worker',
    'neutron-db-init',
    'neutron-rpc-server',
    'neutron-server',
    'nfs-manager-api',
    'ntpd-server',
    'oauth2-proxy',
    'oort',
    'openotp-ldap-bridge',
    'openstack-cinder-api',
    'openstack-cinder-scheduler',
    'openstack-cinder-volume',
    'openstack-keystone',
    'openstack-nova-api',
    'openstack-nova-cert',
    'openstack-nova-conductor',
    'openstack-nova-consoleauth',
    'openstack-nova-novncproxy',
    'openstack-nova-scheduler',
    'placementapi',
    'placement',
    # 'policy-enforcer': ['control', 3],
    'policy-store',
    'protection-scheduler-api',
    'protection-scheduler-worker',
    'quotas-manager',
    # 'rack-storage-mgr': ['control', 3],
    # 'rack-storage-monitor': ['control', 3],
    # 'rack-storage-radosgw': ['control', 3],
    'rds-api',
    'redis-cache',
    'redis-engine',
    'region',
    'resource-tracker',
    'route53',
    's3-manager-api',
    's3-manager-worker',
    's3-scality',
    's3-vault',
    'scality-engine',
    'service-provisioner',
    'services-metrics-collector',
    'snapshot-manager',
    'sns-api',
    'sns-backend',
    'sqs-engine',
    'sqs-service-api',
    'strato-kapacitor',
    'stratonet-frontend',
    'stratonet-garbagecollector',
    'stratonet-ipam',
    'ui-backend',
    'ui-console',
    'updatemanagerapi',
    'vault-manager',
    'vault',
    'virtual-api2',
    'virtual-dr',
    'virtual-installation',
    'virtual-maestro',
    'virtual-nb',
    'virtual-region',
    'virtual-servicesgw',
    'vm-manager',
    'vm-manager-worker',
    'vms-monitor',
    'volumehealth',
    'volume-manager',
    'vpc-backend-api',
    'vpc-backend-periodic-tasks',
    'vpc-backend-worker',
]
def is_service_healthy(service, to_node):
    """Best-effort health probe for a moved service.

    Checks that `<service>.service.strato` resolves via dig AND that the
    service is registered on `to_node` in the Consul catalog.  `to_node`
    may be a FQDN; only the short hostname is passed to consul.

    Returns True when both shell checks exit 0, False otherwise.
    """
    to_node_real_name = to_node.split('.')[0]
    # FIX: the original redirect was "2>&1 /dev/null", which does not
    # silence anything (it passes /dev/null as an extra grep argument).
    # "> /dev/null 2>&1" discards both stdout and stderr.
    command = (
        "dig {service}.service.strato | grep 'status: NOERROR' && "
        "consul catalog services -node {to_node} | grep '{service}' > /dev/null 2>&1 ".format(
            service=service, to_node=to_node_real_name
        )
    )
    try:
        return os.system(command) == 0
    except Exception:
        # os.system rarely raises; treat any failure as "not healthy".
        return False
def flip_placement_map(placement_map):
    """Invert a placement mapping.

    Takes {hostname: {service: state}} and returns {service: [hostnames]},
    ignoring the per-service state values.
    """
    inverted = {}
    for hostname, services in placement_map.items():
        for service in services:
            inverted.setdefault(service, []).append(hostname)
    return inverted
@retrying.retry(stop_max_attempt_number=3, wait_fixed=5000)
def move_service_with_retry(service, from_node, to_node):
    """Ask cluster-management to relocate `service` from one node to another.

    The whole command is capped at 45s; the decorator retries up to 3 times
    with a 5-second pause between attempts.
    """
    template = 'timeout 45s inspector tools cm move-service {service} {from_node} {to_node} -q'
    os.system(template.format(service=service, from_node=from_node, to_node=to_node))
def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    """Draw a one-line terminal progress bar on stdout.

    Rewrites the same line via '\r'; emits a newline once iteration == total.
    """
    percent = ('{0:.' + str(decimals) + 'f}').format(100 * (iteration / float(total)))
    done = int(length * iteration // total)
    bar = '{}{}'.format(fill * done, '-' * (length - done))
    sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
    sys.stdout.flush()
    if iteration == total:
        sys.stdout.write('\n')
def print_moving_services_table(data): | |
print '{:<30} {:<30} {:<30}'.format('Service', 'From Node', 'To Node') | |
for k, v in data.iteritems(): | |
to_node, from_node = v.values() | |
print '{:<30} {:<30} {:<30}'.format(k, from_node, to_node) | |
print '-------------------------- Total services to move: {} --------------------------'.format(len(data)) | |
def system_command_with_timeout(command):
    """Run `command` through the shell, capped at 45 seconds via `timeout`.

    Note: os.system does not raise on a non-zero exit status, so the except
    branch only guards against unexpected interpreter-level failures, which
    are printed and re-raised.
    """
    full_command = 'timeout 45s {}'.format(command)
    try:
        os.system(full_command)
    except Exception as err:
        print(err.message)
        raise err
def move_services_to_control_nodes(cmapi): | |
services_to_move = summary_of_moving_services(cmapi, print_results=False) | |
total_moves = len(services_to_move) | |
moves_counter = 0 | |
for control_service, path in services_to_move.iteritems(): | |
to_node, from_node = path.values() | |
moves_counter += 1 | |
progress_bar(moves_counter, total_moves, prefix='Step {}'.format(moves_counter), | |
suffix='Moving service {} from {} to node {}'.format(control_service, from_node, to_node)) | |
try: | |
move_service_with_retry(control_service, from_node, to_node) | |
except: | |
print 'Failed to move service {} from {} to node {}'.format(control_service, from_node, to_node) | |
continue | |
waiting.wait( | |
lambda: is_service_healthy(control_service, to_node), | |
timeout_seconds=TIMEOUT_FOR_SERVICE_HEALTH_CHECK, | |
sleep_seconds=INTERVAL_FOR_SERVICE_HEALTH_CHECK, | |
waiting_for='Service {} to be healthy'.format(control_service)) | |
) | |
print 'Done moving {} services'.format(moves_counter) | |
def find_vms_on_control_nodes(vm_client): | |
all_vms = vm_client.list() | |
vms_on_control_nodes = [vm for vm in all_vms if vm["hostname"] in CONTROL_NODES] | |
if vms_on_control_nodes: | |
print 'Found {} VMs on control nodes:'.format(len(vms_on_control_nodes)) | |
for vm in vms_on_control_nodes: | |
print 'VM {} is on node {}'.format(vm["name"], vm["hostname"]) | |
else: | |
print 'No VMs found on control nodes' | |
def run_validators(vm_client):
    # Run pre-move cluster validations.  Currently the only check is that
    # no VMs are placed on a node that is about to become a control node.
    print '\n\n[] Validation that no VMs are found on a future control node'
    find_vms_on_control_nodes(vm_client)
def summary_of_moving_services(cmapi, print_results=True):
    """Compute which control services must move, and where.

    Reads the current placement map from the cluster registry and assigns a
    destination node (round-robin over CONTROL_NODES) to every service that
    is not already on a control node.

    Returns {service: {'from': node, 'to': node}}; also prints the plan as a
    table when print_results is True.
    """
    control_map = cmapi.registry.get('cluster/control_services_placement_map')
    flipped_control_map = flip_placement_map(control_map)
    services_dict = {}
    total_moves = 0
    for control_service in SERVICES_WITH_NODE_TYPE:
        # FIX: guard against services listed here but absent from the
        # cluster's placement map - the original raised KeyError.
        placements = flipped_control_map.get(control_service)
        if not placements:
            continue
        from_node = placements[0]
        to_node = CONTROL_NODES[total_moves % len(CONTROL_NODES)]
        if from_node in CONTROL_NODES:
            continue
        if from_node == to_node:
            continue
        total_moves += 1
        services_dict[control_service] = {'from': str(from_node), 'to': to_node}
    if print_results:
        print_moving_services_table(services_dict)
    return services_dict
def consul_backup(): | |
file_name = '/cluster_consul_backup_{}.snap'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) | |
system_command_with_timeout('consul snapshot save {}'.format(file_name)) | |
print 'Saved consul snapshot to {}'.format(file_name) | |
def fix_rack_storage_monitor_issue(cmapi):
    # Remove all ceph monitor entries from the Consul registry (recursive
    # delete of the whole subtree).  The on-node data still has to be
    # removed by hand, as the final message instructs the operator.
    print 'Deleting all ceph storage monitors from Consul'
    cmapi.registry.delete('cluster/cephMonitors', recursive=True)
    print '*** Now delete /var/lib/ceph/mon/ from the relevant nodes, and restart the service ***'
def run_compute_maintenance(): | |
for service in compute_services: | |
print 'Putting service {} in maintenance mode'.format(service) | |
system_command_with_timeout('inspector tools cm service-to-maintenance {}'.format(service)) | |
sleep(MAINTENANCE_MODE_COOLDOWN_TIME) | |
print 'Done putting services in maintenance mode' | |
def run_compute_unmaintenance(): | |
for service in compute_services: | |
print 'Putting service {} out of maintenance mode'.format(service) | |
system_command_with_timeout('inspector tools cm service-from-maintenance {}'.format(service)) | |
sleep(MAINTENANCE_MODE_COOLDOWN_TIME) | |
print 'Done putting services out of maintenance mode' | |
def main(): | |
cmapi = clustermanagementapi.ClusterManagementAPI() | |
vm_client = vm_client_module.Client(headers=credentials.get_internal_headers()).api.v2.compute.vms | |
if len(sys.argv) == 1: | |
print 'Choose the following flags: backup, validate, summary, move, fix-rack, compute-maint, compute-unmaint' | |
return | |
flag = sys.argv[1] | |
# run consul backup | |
if flag == 'backup': | |
try: | |
consul_backup() | |
except Exception as e: | |
print 'Error while trying to take a consul snapshot {} - try again!'.format(e.message) | |
return | |
# run validators | |
if flag == 'validate': | |
try: | |
run_validators(vm_client) | |
except Exception as e: | |
print 'Error while running validations {} - try again!'.format(e.message) | |
return | |
# run a summary of services to move | |
if flag == 'summary': | |
try: | |
summary_of_moving_services(cmapi) | |
except Exception as e: | |
print 'Error while trying get a summary of all moving services {} - try again!'.format(e.message) | |
return | |
# run the actual move | |
if flag == 'move': | |
try: | |
move_services_to_control_nodes(cmapi) | |
except Exception as e: | |
print 'Error while trying to move services {} - try again!'.format(e.message) | |
return | |
# fix rack-storage-monitor issue | |
if flag == 'fix-rack': | |
print 'Fixing rack storage monitor issue' | |
try: | |
fix_rack_storage_monitor_issue(cmapi) | |
except Exception as e: | |
print 'Error while trying to fix rack-storage-monitor issue {} - try again!'.format(e.message) | |
return | |
# run compute services maintenance | |
if flag == 'compute-maint': | |
try: | |
run_compute_maintenance() | |
except Exception as e: | |
print 'Error while trying to run compute services maintenance {} - try again!'.format(e.message) | |
return | |
# run compute services un-maintenance | |
if flag == 'compute-unmaint': | |
try: | |
run_compute_unmaintenance() | |
except Exception as e: | |
print 'Error while trying to run compute services un-maintenance {} - try again!'.format(e.message) | |
return | |
main() |
Suggestion: make `consul_backup` save its snapshot to the root directory instead of `/tmp`.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Script for moving control services to selected nodes.
This script helps to decrease the "storm" of services moving across the cluster when converting to LCS (without maintenance).
How to use:
- Edit `CONTROL_NODES` accordingly, and adjust `COOLDOWN_TIME` if needed.
- `backup` - create a consul snapshot.
- `validate` - run validations on the cluster (currently only checking if there are VMs on a future control node).
- `summary` - print a table listing all services that need to move, and their path (from node and to node).
- `move` - move the services.