@grishatsuker
Forked from yuvalta/service_mover.py
Last active April 14, 2024 13:37
Script to move control services to control nodes
# coding=utf-8
import os
import sys
import subprocess
import waiting
import json
from datetime import datetime, timedelta
from time import sleep
import retrying
from strato_common import credentials
from strato_common import admin_creds
from vm_manager_client import client as vm_client_module
from neutronclient.neutron import client as neutron_client_module
from strato_kv.clustermanagement import clustermanagementapi
from strato_kv.consulutils.consulkeyvaluestoreclient import ConsulKeyValueStoreClient
MAINTENANCE_MODE_COOLDOWN_TIME = 2
COOLDOWN_TIME = 10
TIMEOUT_FOR_SERVICE_HEALTH_CHECK = 300
INTERVAL_FOR_SERVICE_HEALTH_CHECK = 10
CONTROL_NODES = ["stratonode0.node.strato", "stratonode2.node.strato", "stratonode1.node.strato"]
openstack_services = [
'openstack-nova-api',
'openstack-nova-scheduler',
'openstack-nova-conductor',
'neutron-server',
'neutron-rpc-server',
'openstack-keystone',
]
compute_services = ['neutron-openvswitch-agent', 'neutron-ovs-cleanup', 'neutron-metering-agent', 'multipathd',
'snapshot-manager-worker', 'servicesgw', 'neutron-l3-agent', 'nrad', 'neutron-dhcp-agent',
'strato-filebeat', 'neutron-metadata-agent', 'openstack-nova-compute']
SERVICES_WITH_NODE_TYPE = [
'opa-policy-manager',
'acm-api',
'alarms-engine',
'alarms-service',
'api-explorer',
'app-catalog-periodic-tasks',
'app-catalog',
'app-catalog-worker',
'asg-api',
'autoscaling-groups',
'autoscaling-groups-worker',
'aws-auth',
'cassandra-engine',
'certificate-manager-api',
'cloudwatch-api',
'cloudwatch-backend-api',
'cloudwatch-backend-worker',
'conversions',
'conversions-worker',
'credit-manager',
'crs-manager-api',
'dbc-manager-api',
'dbs-manager',
'docker-registry',
'ec2-compute',
'elb-api',
'emr-api',
'engine-manager-api',
'engine-manager-worker',
'events-service',
'external-endpoint-manager',
'galeramariadb-engine',
'gargantua',
'gcm',
'grafana',
'guestnet-admin-tool-api',
'guestnet-admin-tool-beat',
'guestnet-admin-tool-worker',
'hot-upgrade',
'hot-upgrade-worker',
'http-proxy-service',
'iam',
'identity-manager',
'image-manager-api',
'image-manager-worker',
# 'influxdb': ['control', 2],
'inspector-api',
'inspector-worker',
'jaeger',
'kafka-engine',
'kubernetes-manager',
'kubernetes-worker',
'lbaas-manager',
'lbaas-worker',
'logserver',
'maestro-auth',
'maestro-data-reporter',
'maestro-events-reporter',
'maestro-gotty',
'maestro-tunnel-client',
'mancala-dr',
'mancala-externalstorage',
'mapreduce-api',
'melet-api',
'metrics-service',
'mongodb-engine',
'mssql-engine',
# 'mysql': ['control', 3],
'net-metrics-collector-worker',
'neutron-db-init',
#'neutron-rpc-server': ['control', 3],
#'neutron-server': ['control', 3],
'nfs-manager-api',
'ntpd-server',
'oauth2-proxy',
'oort',
'openotp-ldap-bridge',
'openstack-cinder-api',
'openstack-cinder-scheduler',
'openstack-cinder-volume',
# 'openstack-keystone': ['control', 3],
# 'openstack-nova-api': ['control', 3],
'openstack-nova-cert',
# 'openstack-nova-conductor': ['control', 3],
'openstack-nova-consoleauth',
'openstack-nova-novncproxy',
# 'openstack-nova-scheduler': ['control', 3],
'placementapi',
'placement',
# 'policy-enforcer': ['control', 3],
'policy-store',
'protection-scheduler-api',
'protection-scheduler-worker',
'quotas-manager',
# 'rack-storage-mgr': ['control', 3],
# 'rack-storage-monitor': ['control', 3],
# 'rack-storage-radosgw': ['control', 3],
'rds-api',
'redis-cache',
'redis-engine',
'region',
'resource-tracker',
'route53',
's3-manager-api',
's3-manager-worker',
's3-scality',
's3-vault',
'scality-engine',
'service-provisioner',
'services-metrics-collector',
'snapshot-manager',
'sns-api',
'sns-backend',
'sqs-engine',
'sqs-service-api',
'strato-kapacitor',
'stratonet-frontend',
'stratonet-garbagecollector',
'stratonet-ipam',
'ui-backend',
'ui-console',
'updatemanagerapi',
'vault-manager',
'vault',
'virtual-api2',
'virtual-dr',
'virtual-installation',
'virtual-maestro',
'virtual-nb',
'virtual-region',
'virtual-servicesgw',
'vm-manager',
'vm-manager-worker',
'vms-monitor',
'volumehealth',
'volume-manager',
'vpc-backend-api',
'vpc-backend-periodic-tasks',
'vpc-backend-worker',
]
def is_service_healthy(service, to_node):
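    '''Check service health via DNS and Consul.

    Returns True only if {service}.service.strato resolves with NOERROR and the
    service is registered in the Consul catalog on to_node.
    '''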
command = (
"dig {service}.service.strato | grep 'status: NOERROR' && "
"consul catalog services -node {to_node} | grep '{service}' ".format(
service=service, to_node=to_node
)
)
print("Running cmd: {}".format(command))
try:
res = os.system(command)
return res == 0
    except Exception:
        return False
def flip_placement_map(placement_map):
'''Flips a nested dict inside out
takes {hostname: {service: state}}, returns {service: [hostnames]}
'''
res = {}
remove_empty_values_in_dict(placement_map)
for hostname in placement_map:
for service in placement_map[hostname]:
res.setdefault(service, []).append(hostname)
return res
@retrying.retry(stop_max_attempt_number=3, wait_fixed=5000)
def move_service_with_retry(service, from_node, to_node):
'''Move a service to a node, retrying if it fails'''
command = 'inspector tools cm move-service {service} {from_node} {to_node} -q'.format(
service=service, from_node=from_node, to_node=to_node
)
print("Running CMD: {}".format(command))
os.system('timeout 45s {}'.format(command))
def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
percent = ('{0:.' + str(decimals) + 'f}').format(100 * (iteration / float(total)))
filled_length = int(length * iteration // total)
bar = fill * filled_length + '-' * (length - filled_length)
sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
sys.stdout.flush()
if iteration == total:
sys.stdout.write('\n')
def print_moving_services_table(data):
print('{:<30} {:<30} {:<30}'.format('Service', 'From Node', 'To Node'))
for service_name, node_move_mappings in data.iteritems():
for node_move_mapping in node_move_mappings:
            from_node, to_node = node_move_mapping['from'], node_move_mapping['to']
print('{:<30} {:<30} {:<30}'.format(service_name, from_node, to_node))
print('-------------------------- Total services to move: {} --------------------------'.format(len(data)))
def system_command_with_timeout(command):
try:
os.system('timeout 45s {}'.format(command))
except Exception as e:
print(e.message)
raise e
def move_services_to_control_nodes(cmapi, services_to_filter=None):
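    '''Build the migration plan and move each service instance to a control node,
    waiting for the service to become healthy after every move.
    Failed moves are reported and skipped.
    '''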
services_to_move = summary_of_moving_services(cmapi, print_results=False, services_to_filter=services_to_filter)
total_moves = len(services_to_move)
moves_counter = 0
for control_service, node_migrations in services_to_move.iteritems():
print("control services: {} node migrations:{}".format(control_service, node_migrations))
for path in node_migrations:
            from_node, to_node = path['from'], path['to']
moves_counter += 1
progress_bar(moves_counter, total_moves, prefix='Step {}'.format(moves_counter),
suffix='Moving service {} from {} to node {}'.format(control_service, from_node, to_node))
try:
move_service_with_retry(control_service, from_node, to_node)
wait_for_service_to_be_healthy(control_service, to_node)
            except Exception:
print('Failed to move service {} from {} to node {}'.format(control_service, from_node, to_node))
continue
print('Done moving {} services'.format(moves_counter))
def wait_for_service_to_be_healthy(service, host):
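    '''Poll is_service_healthy() until it succeeds or TIMEOUT_FOR_SERVICE_HEALTH_CHECK expires'''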
host_no_suffix = host.replace(".node.strato", "")
waiting.wait(
lambda: is_service_healthy(service, host_no_suffix),
timeout_seconds=TIMEOUT_FOR_SERVICE_HEALTH_CHECK,
sleep_seconds=INTERVAL_FOR_SERVICE_HEALTH_CHECK,
waiting_for='Service {} to be healthy'.format(service),
)
def find_vms_on_control_nodes(vm_client):
all_vms = vm_client.list()
vms_on_control_nodes = [vm for vm in all_vms if vm["hostname"] in CONTROL_NODES]
if vms_on_control_nodes:
print('Found {} VMs on control nodes:'.format(len(vms_on_control_nodes)))
for vm in vms_on_control_nodes:
print('VM {} is on node {}'.format(vm["name"], vm["hostname"]))
else:
print('No VMs found on control nodes')
def run_validators(vm_client):
    print('\n\nValidating that no VMs are running on a future control node')
find_vms_on_control_nodes(vm_client)
def get_services_to_node_mapping(cmapi, services_to_filter=None):
"""
Get a mapping of services to nodes
In the format: {service: [node1, node2, ...]}
"""
control_map = cmapi.registry.get('cluster/control_services_placement_map')
# print ("before filter control map: {}".format(control_map))
filtered_services = {}
if services_to_filter:
for node, node_services in control_map.iteritems():
filtered_services[node] = dict(filter(lambda item: item[0] in services_to_filter, node_services.items()))
control_map = filtered_services
flipped_control_map = flip_placement_map(control_map)
return flipped_control_map
def summary_of_moving_services(cmapi, print_results=True, services_to_filter=None):
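    '''Build the migration plan: for every service, pair each instance running on a
    non-control node with a control node that does not run it yet.
    Returns {service: [{'from': node, 'to': node}, ...]}.
    '''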
flipped_control_map = get_services_to_node_mapping(cmapi, services_to_filter)
services_dict = {}
total_moves = 0
services_to_move = services_to_filter or SERVICES_WITH_NODE_TYPE
print("services to move:{}".format(services_to_move))
for control_service in services_to_move:
services_dict[control_service] = []
        current_service_nodes = flipped_control_map.get(control_service, [])
        from_nodes = list(set(current_service_nodes) - set(CONTROL_NODES))  # Instances on non-control nodes need to be moved off them
        to_nodes = list(set(CONTROL_NODES) - set(current_service_nodes))  # Control nodes that already run the service cannot be targets
print("Service: {} to move from nodes: {}".format(control_service, from_nodes))
print("Service: {} to move to nodes: {}".format(control_service, to_nodes))
        if not from_nodes or not to_nodes:
            continue
for from_node in from_nodes:
to_node = to_nodes[total_moves % len(to_nodes)]
if from_node in CONTROL_NODES:
continue
if from_node == to_node:
continue
total_moves += 1
services_dict[control_service].append({'from': str(from_node), 'to': to_node})
remove_empty_values_in_dict(services_dict)
if print_results:
print_moving_services_table(services_dict)
return services_dict
def remove_empty_values_in_dict(dict_to_remove):
    '''Delete keys whose values are empty, in place'''
    for key, value in list(dict_to_remove.items()):
        if not value:
            del dict_to_remove[key]
def get_none_compute_from_nodes(service_nodes, control_nodes):
return list(set(service_nodes) - (set(control_nodes)))
def consul_backup():
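    '''Save a timestamped Consul snapshot to the root filesystem'''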
file_name = '/cluster_consul_backup_{}.snap'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
system_command_with_timeout('consul snapshot save {}'.format(file_name))
print('Saved consul snapshot to {}'.format(file_name))
def fix_rack_storage_monitor_issue(cmapi):
print('Deleting all ceph storage monitors from Consul')
cmapi.registry.delete('cluster/cephMonitors', recursive=True)
print('*** Now delete /var/lib/ceph/mon/ from the relevant nodes, and restart the service ***')
def run_compute_maintenance():
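    '''Put every compute-side service into cluster-manager maintenance mode, one by one'''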
for service in compute_services:
print('Putting service {} in maintenance mode'.format(service))
system_command_with_timeout('inspector tools cm service-to-maintenance {}'.format(service))
sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
print('Done putting services in maintenance mode')
def run_compute_unmaintenance():
for service in compute_services:
print('Putting service {} out of maintenance mode'.format(service))
system_command_with_timeout('inspector tools cm service-from-maintenance {}'.format(service))
sleep(MAINTENANCE_MODE_COOLDOWN_TIME)
print('Done putting services out of maintenance mode')
def print_mysql_info():
print_service_hosts("mysql")
print("mysql master info:")
os.system("consul kv get cluster/service_master_node/mysql")
print("noded process PIDs:")
os.system(""" inspector exec 'pgrep -f "python -m strato.noded.main" -P 1' """)
def print_service_hosts(service_name):
print('Nodes running service {}:'.format(service_name))
os.system('consul catalog nodes -service={}'.format(service_name))
def wait_for_mysql_synced(node_name, timeout=30):
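    '''Wait until Galera reports wsrep_local_state_comment == Synced on node_name'''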
print("Waiting for mysql synced on node: %s" % node_name)
def _check_mysql_wsrep_status():
command = '''inspector exec -n {} "mysql --skip-column-names -B -e \\"show global status like '%wsrep_local_state_comment%'\\""'''.format(node_name)
try:
value = subprocess.check_output([command], shell=True)
if "Synced" in str(value):
print("mysql is synced on %s" % node_name)
return True
print("mysql is not in sync on %s: %s" % node_name, value)
except Exception as ex: # pylint: disable=broad-except
print("Error in _wait_for_mysql_synced: %s" % ex)
raise
waiting.wait(_check_mysql_wsrep_status,
timeout_seconds=timeout,
sleep_seconds=10,
expected_exceptions=Exception)
return True
def does_node_have_mysql(node_name):
command = 'consul catalog nodes -service=mysql | grep {}'.format(node_name)
try:
res = subprocess.call([command], shell=True)
return res == 0
except Exception as e:
return False
def is_mysql_fit_for_master(node_name):
node_have_mysql = does_node_have_mysql(node_name)
if not node_have_mysql:
return False
return wait_for_mysql_synced(node_name)
def set_new_mysql_master(cmapi, node, noded_pid):
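    '''Point the Consul mysql master key at node, but only if the node runs mysql and is synced'''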
if is_mysql_fit_for_master(node):
set_key_for_mysql_master(cmapi, node, noded_pid)
else:
msg = "Node {} is not fit for mysql master".format(node)
print(msg)
raise Exception(msg)
def set_key_for_mysql_master(consul_kv_client, node, noded_pid):
consul_kv_client.set(
'cluster/service_master_node/mysql',
{"hostname": "{}.node.strato".format(node), "pid": noded_pid})
def increase_openstack_server_count(dry_run=False):
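    '''Resize the nova/keystone and neutron worker counts via the inspector worker aligners'''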
increase_compute_and_identity_server_count(dry_run=dry_run)
increase_neutron_server_count(dry_run=dry_run)
def increase_compute_and_identity_server_count(dry_run=False):
print("Increasing openstack server count according to worker alligner")
try:
compute_identity_worker_resizer_cmd = (
"inspector tools compute all-workers-resizer --lcs {}".format("--dry-run" if dry_run else '')
)
output = subprocess.check_output(compute_identity_worker_resizer_cmd, shell=True)
except Exception as e:
print("Failed to run all-workers-resizer: %s" % e)
return
def increase_neutron_server_count(dry_run=False):
    print("Increasing network server count according to worker aligner")
    try:
        neutron_worker_resizer_cmd = (
            "inspector tools network neutron worker-auto-align --lcs {}".format("--dry-run" if dry_run else '')
        )
        output = subprocess.check_output(neutron_worker_resizer_cmd, shell=True)
    except Exception as e:
        print("Failed to run worker-auto-align: %s" % e)
        return
def migrate_openstack_servers_to_future_control_nodes(cmapi):
print("Migrating openstack services to future control nodes")
move_services_to_control_nodes(cmapi=cmapi, services_to_filter=openstack_services)
def get_services_nodes(consul_kv_client, service_name):
nodes = consul_kv_client.get('cluster/services/{}/nodes'.format(service_name))
return nodes
def set_rpc_agent_down_time(agent_down_time, force_restart=False, cmapi=None):
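    '''Set neutron agent_down_time and nova service_down_time to the given value, copy the
    config files to all nodes, and optionally rolling-restart neutron-rpc-server and
    openstack-nova-conductor so the change takes effect.
    '''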
print('Setting Neutron RPC server agent down time to {}'.format(agent_down_time))
system_command_with_timeout('crudini --set /etc/neutron/neutron.conf DEFAULT agent_down_time {}'.format(agent_down_time))
system_command_with_timeout('inspector copy to-remotes --src-path /etc/neutron/neutron.conf')
if force_restart:
rolling_restart_for_service(cmapi, 'neutron-rpc-server')
print('Setting Nova RPC server agent down time to {}'.format(agent_down_time))
system_command_with_timeout('crudini --set /etc/nova/nova.conf DEFAULT service_down_time {}'.format(agent_down_time))
system_command_with_timeout('inspector copy to-remotes --src-path /etc/nova/nova.conf')
if force_restart:
rolling_restart_for_service(cmapi, 'openstack-nova-conductor')
def migrate_rabbit_to_control_node(cmapi, to_node):
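    '''Request a RabbitMQ leader migration via Consul and wait until the leader moves to to_node'''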
current_rabbit_host = cmapi.registry.get('cluster/rabbit/leader')['hostname']
if to_node in current_rabbit_host:
print("Rabbit is already on the requested node")
return
if not to_node.endswith('.node.strato'):
print('Invalid node name: {}'.format(to_node))
return
print("Migrating rabbit to node: %s" % to_node)
cmapi.registry.set('cluster/rabbit/requested-node', to_node)
print("Waiting for rabbit to be migrated to node: %s" % to_node)
waiting.wait(
lambda: cmapi.registry.get('cluster/rabbit/leader')['hostname'] == to_node,
timeout_seconds=30,
sleep_seconds=5,
)
def rolling_restart_for_services(consul_kv_client, services):
print("Restarting services: %s", services)
for service in services:
rolling_restart_for_service(consul_kv_client=consul_kv_client, service=service)
def rolling_restart_for_openstack_servers(consul_kv_client):
rolling_restart_for_services(consul_kv_client=consul_kv_client, services=openstack_services)
def rolling_restart_for_service(consul_kv_client, service):
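    '''Restart service on every node that currently runs it, waiting for health between hosts'''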
nodes_to_service_mapping = get_services_to_node_mapping(cmapi=consul_kv_client, services_to_filter=[service])
current_service_hosts = nodes_to_service_mapping[service]
for host in current_service_hosts:
print("Restarting service: %s on host: %s" % (service, host))
restart_service_on_host(service, host)
wait_for_service_to_be_healthy(service, host)
def restart_service_on_host(service, host):
command = "inspector exec -n {} \'systemctl restart {} \'".format(host, service)
print("Restarting service with command: {}".format(command))
system_command_with_timeout(command)
def verify_agents_are_up(neutron_client, restart_unsynced_agents=False):
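    '''Report neutron agents with stale heartbeats; if restart_unsynced_agents is set,
    restart their services on their hosts.
    '''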
print("Verifying that all agents are synced")
unsynced_agents = get_list_of_unsynced_agents(neutron_client)
if unsynced_agents:
print("Retrying to get agent list, incase of any unsynced agents on first try")
unsynced_agents = get_list_of_unsynced_agents(neutron_client)
print("Unsynced agents: %s" % unsynced_agents)
if unsynced_agents and restart_unsynced_agents:
print("Unsynced agents found, restarting them")
restart_agent_services(unsynced_agents)
def restart_agent_services(agent_to_restart_list):
for agent in agent_to_restart_list:
print("Restarting agent: %s" % agent['id'])
restart_service_on_host(service=agent['binary'], host=agent['host'])
# wait_for_service_to_be_healthy(service=agent['binary'], host=agent['host'])
def get_list_of_unsynced_agents(neutron_client):
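    '''Return the neutron agents whose last heartbeat is older than 20 seconds'''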
agent_list = neutron_client.list_agents()['agents']
unsynced_agents = []
most_graced_last_heartbeat_time = datetime.now() - timedelta(seconds=20)
for agent in agent_list:
agent_heartbeat_datetime = datetime.strptime(agent['heartbeat_timestamp'], '%Y-%m-%d %H:%M:%S')
print("last heartbeat: {}, agent heartbeats {}".format(most_graced_last_heartbeat_time, agent_heartbeat_datetime))
if agent_heartbeat_datetime < most_graced_last_heartbeat_time:
print("Agent %s is down" % agent['id'])
unsynced_agents.append(agent)
return unsynced_agents
def migrate_openstack_and_rabbit_stack_all_in_one(cmapi, neutron_client, to_node):
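    '''Full flow: raise the RPC agent down time, resize the openstack workers, move the
    openstack services to the control nodes, migrate RabbitMQ, rolling-restart the
    openstack servers and verify that the neutron agents report in.
    '''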
    set_rpc_agent_down_time(agent_down_time=900, force_restart=True, cmapi=cmapi)
increase_openstack_server_count()
migrate_openstack_servers_to_future_control_nodes(cmapi)
    migrate_rabbit_to_control_node(cmapi, to_node)
rolling_restart_for_openstack_servers(cmapi)
verify_agents_are_up(neutron_client)
def main():
cmapi = clustermanagementapi.ClusterManagementAPI()
vm_client = vm_client_module.Client(headers=credentials.get_internal_headers()).api.v2.compute.vms
token = admin_creds.get_credentials().token
neutron_client = neutron_client_module.Client(
'2.0',
token=token,
endpoint_url='http://neutron-server.service.strato:9696',
insecure=True
)
options = [
'backup', 'validate', 'summary', 'move',
'fix-rack', 'compute-maint', 'compute-unmaint',
'print-mysql-info', 'set-new-mysql-master',
'migrate-openstack-rabbit', 'set-rpc-agent-down-time',
'increase-openstack-server-count',
'migrate-openstack-servers-to-future-control-nodes',
'migrate-rabbit-to-control-node',
'rolling-restart-for-openstack-servers',
'verify-agents-are-up',
'rolling-restart-for-services',
]
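    # Examples (see the per-flag usage messages below):
    #   python move_services.py summary
    #   python move_services.py move
    #   python move_services.py set-new-mysql-master <node> <noded_pid>
    #   python move_services.py migrate-rabbit-to-control-node <node>.node.strato
    #   python move_services.py rolling-restart-for-services <service1> <service2> ...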
if len(sys.argv) == 1:
        print('Choose one of the following flags: {}'.format(", ".join(options)))
return
flag = sys.argv[1]
# run consul backup
if flag == 'backup':
try:
consul_backup()
except Exception as e:
print('Error while trying to take a consul snapshot {} - try again!'.format(e.message))
return
# run validators
if flag == 'validate':
try:
run_validators(vm_client)
except Exception as e:
print('Error while running validations {} - try again!'.format(e.message))
return
# run a summary of services to move
if flag == 'summary':
try:
summary_of_moving_services(cmapi)
except Exception as e:
print('Error while trying get a summary of all moving services {} - try again!'.format(e.message))
return
# run the actual move
if flag == 'move':
try:
move_services_to_control_nodes(cmapi)
except Exception as e:
print('Error while trying to move services {} - try again!'.format(e.message))
return
# fix rack-storage-monitor issue
if flag == 'fix-rack':
print('Fixing rack storage monitor issue')
try:
fix_rack_storage_monitor_issue(cmapi)
except Exception as e:
print('Error while trying to fix rack-storage-monitor issue {} - try again!'.format(e.message))
return
# run compute services maintenance
if flag == 'compute-maint':
try:
run_compute_maintenance()
except Exception as e:
print('Error while trying to run compute services maintenance {} - try again!'.format(e.message))
return
# run compute services un-maintenance
if flag == 'compute-unmaint':
try:
run_compute_unmaintenance()
except Exception as e:
print('Error while trying to run compute services un-maintenance {} - try again!'.format(e.message))
return
    # set new mysql master
    # usage: python move_services.py set-new-mysql-master <node> <noded_pid>
    # example: python move_services.py set-new-mysql-master stratonode2 <noded_pid>
    # this will set the mysql master to stratonode2
if flag == 'set-new-mysql-master':
if len(sys.argv) != 4:
print('Usage: python move_services.py set-new-mysql-master <node> <noded_pid>')
return
try:
consul_kv_client = ConsulKeyValueStoreClient()
set_new_mysql_master(consul_kv_client, sys.argv[2], sys.argv[3])
except Exception as e:
print('Error while trying to set new mysql master {} - try again!'.format(e.message))
return
    # print mysql info
if flag == 'print-mysql-info':
try:
print_mysql_info()
except Exception as e:
            print('Error while trying to print mysql info {} - try again!'.format(e.message))
return
# Migrate Openstack and rabbit
if flag == 'migrate-openstack-rabbit':
if len(sys.argv) != 3:
print('Usage: python move_services.py migrate-openstack-rabbit <node>')
return
try:
migrate_openstack_and_rabbit_stack_all_in_one(cmapi, neutron_client, sys.argv[2])
except Exception as e:
print('Error while trying to migrate openstack and rabbit {} - try again!'.format(e.message))
return
# Set RPC agent downtime to a set time
if flag == 'set-rpc-agent-down-time':
force_restart = False
agent_down_time = 900
if '--down-time' in sys.argv:
if sys.argv[2] == '--down-time':
print("Setting downtime:")
try:
if sys.argv[3].isdigit():
agent_down_time = int(sys.argv[3])
else:
print("Down time flag not set correctly,Usage: python move_services.py set-rpc-agent-down-time --down-time <down_time_int> --force-restart")
return
except Exception as e:
print("Down time flag not set correctly,Usage: python move_services.py set-rpc-agent-down-time --down-time <down_time_int> --force-restart ")
return
try:
if '--force-restart' in sys.argv:
if sys.argv[4] == '--force-restart':
force_restart = True
except Exception as e:
print("Force restart flag not set correctly, setting to False. Usage: python move_services.py set-rpc-agent-down-time --down-time <down_time_int> --force-restart")
force_restart = False
try:
set_rpc_agent_down_time(agent_down_time=agent_down_time, force_restart=force_restart, cmapi=cmapi)
except Exception as e:
print('Error while trying to set rpc agent down time {} - try again!'.format(e.message))
return
    # Increase openstack server count
if flag == 'increase-openstack-server-count':
dry_run = False
if "--dry-run" in sys.argv:
if sys.argv[2] == "--dry-run":
print("Running dry run")
dry_run = True
else:
print("Dry run flag not set correctly, Usage: python move_services.py increase-openstack-server-count --dry-run")
return
try:
increase_openstack_server_count(dry_run=dry_run)
except Exception as e:
print('Error while trying to increase openstack server count {} - try again!'.format(e.message))
return
#migrate_openstack_servers_to_future_control_nodes
if flag == 'migrate-openstack-servers-to-future-control-nodes':
try:
migrate_openstack_servers_to_future_control_nodes(cmapi)
except Exception as e:
print('Error while trying to migrate openstack servers to future control nodes {} - try again!'.format(e.message))
return
    # migrate_rabbit_to_control_node
if flag == 'migrate-rabbit-to-control-node':
if len(sys.argv) != 3:
print('Usage: python move_services.py migrate-rabbit-to-control-node <node>')
return
try:
            migrate_rabbit_to_control_node(cmapi, sys.argv[2])
except Exception as e:
print('Error while trying to migrate rabbit to control node {} - try again!'.format(e.message))
return
#rolling_restart_for_openstack_servers
if flag == 'rolling-restart-for-openstack-servers':
try:
rolling_restart_for_openstack_servers(cmapi)
except Exception as e:
print('Error while trying to rolling restart for openstack servers {} - try again!'.format(e.message))
return
# Rolling restart for any list of services
# usage: python move_services.py rolling-restart-for-services <service1> <service2> <service3> ...
# example: python move_services.py rolling-restart-for-services nova-api nova-scheduler nova-conductor
    # note: each service must appear in the control services placement map;
    # otherwise the restart for it fails and an error message is printed
if flag == 'rolling-restart-for-services':
if len(sys.argv) < 3:
print('Usage: python move_services.py rolling-restart-for-services <service1> <service2> <service3> ...')
return
try:
rolling_restart_for_services(cmapi, sys.argv[2:])
except Exception as e:
print('Error while trying to rolling restart for services {} - try again!'.format(e.message))
return
#verify_agents_are_up
if flag == 'verify-agents-are-up':
restart_unsynced_agents = False
if "--restart-unsynced-agents" in sys.argv:
if sys.argv[2] == "--restart-unsynced-agents":
print("Restarting unsynced agents")
restart_unsynced_agents = True
else:
print("Restart unsynced agents flag not set correctly, Usage: python move_services.py verify-agents-are-up --restart-unsynced-agents")
return
try:
verify_agents_are_up(neutron_client, restart_unsynced_agents)
except Exception as e:
print('Error while trying to verify agents are up {} - try again!'.format(e.message))
return
if flag not in options:
        print('Choose one of the following flags: {}'.format(", ".join(options)))
return
main()
yuvalta commented Mar 10, 2024

@grishatsuker
Is just setting the key for the mysql master node enough? Don't we need to also release the lock beforehand?
