Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rodrigogansobarbieri/1838356c6b40c6d4aa428c1aa1a1ac5f to your computer and use it in GitHub Desktop.
Save rodrigogansobarbieri/1838356c6b40c6d4aa428c1aa1a1ac5f to your computer and use it in GitHub Desktop.
Recover juju machine agents
#!/usr/bin/env python3
"""
This is a tool for recovering lost machine units in current model
Usage:
{0} model-name dest-dir
"""
# changes summary:
# added logic to prevent overwriting /var/lib/juju, to avoid removing non-machine units
# added logic for detecting path of mongo binary (e.g. if present in $PATH or is in /usr/lib/juju/mongo*/bin)
# added logic to determine primary mongo node to run queries against PRIMARY mongo
# added logic to also restore systemd unit files
# removed need to specify controller ip, as this is set when mongo PRIMARY detected
import json
import os
import shlex
import shutil
import subprocess
import sys
import tempfile
# Shell template; %s is the `juju ssh` target. It greps the `apipassword:`
# line out of the machine agent.conf on that target and prints its second
# whitespace-separated field.
MONGOPASS_CMD = "juju ssh ubuntu@%s \"sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf\" | awk -e '{print $2}'"
# Empty placeholder; determine_primary_mongo() rebinds this global to a
# template whose %s placeholders are, in order: ssh target, password, and the
# name of the script file under /home/ubuntu.
MONGO_CMD = ""
# Shell template used by gen_mongo_cmd(). The %s placeholders are, in order:
# ssh target, mongo binary, machine number (for the `machine-N` user),
# password, and the name of the script file under /home/ubuntu. Only the
# second-to-last line of the command's output is kept.
MONGO_TEMPLATE = "juju ssh -m controller ubuntu@%s \"sudo %s --port 37017 --sslAllowInvalidCertificates --ssl --authenticationDatabase admin -u machine-%s -p %s juju < /home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1"
def run(cmd, output=True, shell=True):
    """Echo *cmd* to stdout and execute it.

    Args:
        cmd: Command to execute; a shell string when ``shell`` is true.
        output: When true, return the command's stdout (decoded and stripped);
            a non-zero exit raises ``subprocess.CalledProcessError``. When
            false, discard all output and return only the exit status.
        shell: Passed through to ``subprocess``.

    Returns:
        The stripped stdout text, or the integer exit status when ``output``
        is false.
    """
    print(cmd)
    if output:
        return subprocess.check_output(cmd, shell=shell).decode().strip()
    # Discard output through DEVNULL rather than PIPE: nothing reads the
    # pipes here, so a child that writes more than the OS pipe buffer holds
    # would block forever.
    return subprocess.call(cmd, stderr=subprocess.DEVNULL,
                           stdout=subprocess.DEVNULL, shell=shell)
def update_machine_password(controller, model, machine_number, passwordhash):
    """Overwrite the ``passwordhash`` of one machine document in the juju db.

    A small mongo script is written to a local temporary file, copied to
    ``/home/ubuntu`` on *controller* with ``juju scp`` and executed there
    through ``MONGO_CMD``.

    Args:
        controller: ``juju ssh``/``juju scp`` target of the controller.
        model: Value matched against the document's ``model-uuid`` field.
        machine_number: Value matched against the ``machineid`` field.
        passwordhash: New value for the ``passwordhash`` field.
    """
    file_content = """use juju
db.machines.update({"model-uuid": "%s", "machineid": "%s"}, {$set:{"passwordhash": "%s"}})
""" % (model, machine_number, passwordhash)
    # delete=False so the file survives close() and can be copied by scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        run(MONGO_CMD % (controller, password, os.path.basename(tmp_file.name)))
    finally:
        # The script embeds the password hash; do not leave it on local disk.
        os.unlink(tmp_file.name)
def get_model_uuid(controller, model):
    """Look up a model by name in the juju db and return its ``_id``.

    A mongo script querying ``db.models`` for ``name == model`` is written to
    a local temporary file, copied to ``/home/ubuntu`` on *controller* and
    executed through ``MONGO_CMD``; the resulting line is parsed as JSON.

    Args:
        controller: ``juju ssh``/``juju scp`` target of the controller.
        model: Model name to search for.

    Returns:
        The ``_id`` field of the JSON document printed by the query.
    """
    file_content = """use juju
db.models.find({"name": "%s"}, {"modeluuid": 1})
""" % (model)
    # delete=False so the file survives close() and can be copied by scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        uuid_json = run(MONGO_CMD % (controller, password, os.path.basename(
            tmp_file.name)))
    finally:
        # Remove the local script whether or not the remote commands worked.
        os.unlink(tmp_file.name)
    return json.loads(uuid_json)['_id']
def determine_primary_mongo():
    """Find the mongo PRIMARY and point the global ``MONGO_CMD`` at it.

    Steps:
      1. Take the first ``started`` line of ``juju machines -m controller``
         and use its first space-delimited field as the initial controller.
      2. Build a mongo command for it with ``gen_mongo_cmd`` and run a script
         that evaluates ``rs.isMaster()['primary']``.
      3. Keep the part of the answer before the first ``:`` as the primary
         address, and grep ``juju machines -m controller`` for it to obtain
         the primary's machine number.
      4. Rebind ``MONGO_CMD`` to a template that authenticates as that
         machine; its remaining ``%s`` placeholders are ssh target, password
         and script file name.

    Returns:
        The primary's address as reported by mongo (text before the ``:``).
    """
    global MONGO_CMD
    file_content = """use juju
rs.isMaster()['primary']
"""
    # delete=False so the file survives close() and can be copied by scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        first_controller_num = run('juju machines -m controller |grep started|head -n1|cut -d " " -f1')
        mongo_cmd_tuple = gen_mongo_cmd(first_controller_num)
        temp_mongo_cmd = mongo_cmd_tuple[0] % os.path.basename(tmp_file.name)
        run("juju scp -m controller {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, first_controller_num))
        primary_ip = run(temp_mongo_cmd).split(':')[0]
    finally:
        # Remove the local script whether or not the remote commands worked.
        os.unlink(tmp_file.name)
    # Now that the primary is known, keep the old MONGO_CMD calling
    # convention but hard-code the mongo binary and the machine number.
    primary_controller_num = run('juju machines -m controller |grep {}|head -n1|cut -d " " -f1'.format(primary_ip))
    MONGO_CMD = (
        "juju ssh ubuntu@%s \"sudo {} --port 37017 --sslAllowInvalidCertificates "
        "--ssl --authenticationDatabase admin -u machine-{} -p %s juju < "
        "/home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1"
    ).format(mongo_cmd_tuple[1], primary_controller_num)
    return primary_ip
def gen_mongo_cmd(controller_num):
    """Build a mongo shell command template for one controller machine.

    Args:
        controller_num: Controller machine used as the ``juju ssh`` target
            and as the ``machine-N`` mongo user.

    Returns:
        A ``(command, binary)`` tuple. ``command`` is ``MONGO_TEMPLATE`` with
        everything filled in except one ``%s`` left for the script file name;
        ``binary`` is the mongo executable path detected on the controller.
    """
    # Column 11 of the first mongo line in `ps aux`, with its final character
    # dropped (presumably turning `mongod` into the `mongo` shell - confirm).
    locate_binary = '''juju ssh -m controller %s "ps aux|grep mongo"|grep -v grep|awk '{print $11}'|head -n1 | rev | cut -c 2- | rev''' % controller_num
    mongo_binary = run(locate_binary)

    read_password = "juju ssh -m controller %s 'sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf' | awk -e '{print $2}'" % controller_num
    api_password = run(read_password)

    # The trailing '%s' survives substitution so callers can add the script.
    template_args = (controller_num, mongo_binary, controller_num, api_password, '%s')
    return (MONGO_TEMPLATE % template_args, mongo_binary)
def get_donor_password(controller, donor, model_uuid):
    """Return the ``passwordhash`` stored for the donor machine in the juju db.

    A mongo script querying ``db.machines`` is written to a local temporary
    file, copied to ``/home/ubuntu`` on *controller* and executed through
    ``MONGO_CMD``; the resulting line is parsed as JSON.

    Args:
        controller: ``juju ssh``/``juju scp`` target of the controller.
        donor: Value matched against the document's ``machineid`` field.
        model_uuid: Value matched against the ``model-uuid`` field.

    Returns:
        The ``passwordhash`` field of the JSON document printed by the query.
    """
    file_content = """use juju
db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"passwordhash": 1})
""" % (model_uuid, donor)
    # delete=False so the file survives close() and can be copied by scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        attributes_json = run(MONGO_CMD % (controller, password, os.path.basename(
            tmp_file.name)))
    finally:
        # Remove the local script whether or not the remote commands worked.
        os.unlink(tmp_file.name)
    attributes = json.loads(attributes_json)
    return attributes['passwordhash']
def get_machine_nonce(controller, machine, model_uuid):
    """Return the ``nonce`` stored for a machine in the juju db.

    A mongo script querying ``db.machines`` is written to a local temporary
    file, copied to ``/home/ubuntu`` on *controller* and executed through
    ``MONGO_CMD``; the resulting line is parsed as JSON.

    Args:
        controller: ``juju ssh``/``juju scp`` target of the controller.
        machine: Value matched against the document's ``machineid`` field.
        model_uuid: Value matched against the ``model-uuid`` field.

    Returns:
        The ``nonce`` field of the JSON document printed by the query.
    """
    file_content = """use juju
db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"nonce": 1})
""" % (model_uuid, machine)
    # delete=False so the file survives close() and can be copied by scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    print('get machine nonce reached...')
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        attributes_json = run(MONGO_CMD % (controller, password, os.path.basename(
            tmp_file.name)))
    finally:
        # Remove the local script whether or not the remote commands worked.
        os.unlink(tmp_file.name)
    attributes = json.loads(attributes_json)
    return attributes['nonce']
def recover_machine(machine_number, juju_tar, donor, passwordhash, controller, model, systemd_tar):
    """Rebuild the machine agent of *machine_number* from the donor's files.

    Args:
        machine_number: Juju id of the machine to recover; also used as the
            ``juju ssh``/``juju scp`` target.
        juju_tar: Local path of the tarball holding the donor's ``juju`` dir.
        donor: Juju id of the donor machine whose files are being renamed.
        passwordhash: Password hash written to this machine's db document.
        controller: Controller target used for the database calls.
        model: Model identifier handed to the database helpers.
        systemd_tar: Local path of the tarball with the donor's systemd unit.
    """
    print("Recovering machine {}".format(machine_number))
    nonce = get_machine_nonce(controller, machine_number, model)
    check_juju_dir_or_create(machine_number)

    # Ids containing 'lxd' carry slashes: the on-disk agent name uses dashes
    # instead, and sed needs a delimiter other than '/'.
    in_lxd = 'lxd' in machine_number
    agent_name = machine_number.replace('/', '-') if in_lxd else machine_number
    sep = '|' if in_lxd else '/'

    def remote(template, **extra):
        # Fill in the shared fields and execute the resulting command.
        run(template.format(m=machine_number, donor=donor, name=agent_name,
                            sep=sep, nonce=nonce, **extra))

    # Unpack the donor's /var/lib/juju and rename its machine directories.
    run("juju scp {} {}:/home/ubuntu/juju.tar".format(juju_tar, machine_number))
    remote("juju ssh {m} 'sudo tar -xvf /home/ubuntu/juju.tar -C /var/lib --skip-old-files --keep-directory-symlink --dereference'")
    for subdir in ("agents", "tools"):
        remote("juju ssh {m} 'sudo mv -f /var/lib/juju/{sub}/machine-{donor} /var/lib/juju/{sub}/machine-{name}'",
               sub=subdir)
    remote("juju ssh {m} 'echo {nonce} | sudo tee /var/lib/juju/nonce.txt'")

    # Rewrite agent.conf so it names this machine and carries its nonce.
    agent_file = "/var/lib/juju/agents/machine-{}/agent.conf".format(agent_name)
    remote("juju ssh {m} 'sudo sed -i \"s{sep}tag: machine-{donor}{sep}tag: machine-{name}{sep}g\" {file}'",
           file=agent_file)
    remote("juju ssh {m} 'sudo sed -i \"s{sep}jujud-machine-{donor}{sep}jujud-machine-{name}{sep}g\" {file}'",
           file=agent_file)
    remote("juju ssh {m} 'sudo sed -i \"s/nonce: .*/nonce: {nonce}/g\" {file}'",
           file=agent_file)

    # Restore the systemd unit files from the donor's copy.
    run("juju scp {} {}:/home/ubuntu/systemd.tar".format(systemd_tar, machine_number))
    remote("juju ssh {m} 'sudo rm -rf /lib/systemd/system/jujud-machine-{name}'")
    remote("juju ssh {m} 'sudo tar -xvf /home/ubuntu/systemd.tar -C / --skip-old-files'")
    remote("juju ssh {m} 'sudo cp -nrp /lib/systemd/system/jujud-machine-{donor} /lib/systemd/system/jujud-machine-{name}'")
    remote("juju ssh {m} 'sudo cp -p /lib/systemd/system/jujud-machine-{donor}/jujud-machine-{donor}.service /lib/systemd/system/jujud-machine-{name}/jujud-machine-{name}.service'")
    unit_dir = "/lib/systemd/system/jujud-machine-{}".format(agent_name)
    exec_start_file = "{}/exec-start.sh".format(unit_dir)
    jujud_unit_file = "{}/jujud-machine-{}.service".format(unit_dir, agent_name)
    rename_agent = "juju ssh {m} 'sudo sed -i \"s{sep}machine-{donor}{sep}machine-{name}{sep}g\" {file}'"
    remote(rename_agent, file=exec_start_file)
    # --machine-id takes the raw machine id, slashes included.
    remote("juju ssh {m} 'sudo sed -i \"s{sep}--machine-id {donor}{sep}--machine-id {m}{sep}g\" {file}'",
           file=exec_start_file)
    remote(rename_agent, file=jujud_unit_file)

    # Update mongo
    update_machine_password(controller, model, machine_number, passwordhash)

    # Re-link the tools of every agent, then enable and restart the service.
    link_to = determine_juju_version(machine_number)
    remote("juju ssh {m} 'for u in $(sudo ls /var/lib/juju/agents/|sort); do sudo ln -sf /var/lib/juju/tools/{link} /var/lib/juju/tools/$u; done'",
           link=link_to)
    remote("juju ssh {m} sudo systemctl daemon-reload")
    remote("juju ssh {m} sudo systemctl enable /lib/systemd/system/jujud-machine-{name}/jujud-machine-{name}.service")
    remote("juju ssh {m} sudo systemctl restart jujud-machine-{name}")
def check_juju_dir_or_create(machine_num):
    """Prepare ``/var/lib/juju`` on the machine for the donor's files.

    If the directory is missing it is created. Otherwise an existing
    ``agents/machine-<name>`` directory is removed, so that the later ``mv``
    does not fail on a non-empty target.

    Args:
        machine_num: Juju id of the machine, used as the ``juju ssh`` target.
    """
    juju_dir_status = run(
        "juju ssh ubuntu@{} 'sudo test -d /var/lib/juju'".format(machine_num),
        output=False)
    if juju_dir_status != 0:
        run("juju ssh {} 'sudo mkdir /var/lib/juju'".format(machine_num))
        return

    # Ids containing 'lxd' carry slashes; the agent directory uses dashes.
    if 'lxd' in machine_num:
        agent_name = machine_num.replace('/', '-')
    else:
        agent_name = machine_num
    agent_dir = "/var/lib/juju/agents/machine-{}".format(agent_name)

    probe = "juju ssh ubuntu@{} 'sudo test -d {}'".format(machine_num, agent_dir)
    if run(probe, output=False) == 0:
        remove_stale = "juju ssh ubuntu@{} 'sudo rm -rf {}'".format(machine_num, agent_dir)
        run(remove_stale, output=False)
def get_agent_from_donor(donor, destdir):
    """Fetch the donor's ``/var/lib/juju`` and reduce it to the machine agent.

    The directory is archived on the donor, copied into *destdir*, unpacked,
    stripped of unit agents and per-machine state, and re-archived as
    ``<destdir>/juju.tar``. The donor's systemd unit files are fetched last.

    Args:
        donor: Juju id of the healthy machine to copy from.
        destdir: Local working directory receiving the tarballs.
    """
    print ("Getting agent from machine {}".format(donor))
    steps = (
        # Archive the donor's juju directory and bring it over.
        "juju ssh {donor} 'sudo chmod -R a+r /var/lib/juju'",
        "juju ssh {donor} 'sudo tar -C /var/lib/ -cvf /tmp/juju.tar juju'",
        "juju scp {donor}:/tmp/juju.tar {dest}",
        "tar -xvf {dest}/juju.tar -C {dest}",
        # Drop everything that is not the machine agent.
        "rm -rf {dest}/juju/agents/unit*",
        "rm -rf {dest}/juju/meter-status.yaml",
        "rm -rf {dest}/juju/locks/*",
        "rm -rf {dest}/juju/tools/unit*",
        "rm -rf {dest}/juju/metricspool",
        "rm -rf {dest}/juju/nonce.txt",
        # Repack the trimmed tree and clean up the unpacked copy.
        "rm -rf {dest}/juju.tar",
        "tar -cvf {dest}/juju.tar -C {dest} juju",
        "rm -rf {dest}/juju",
    )
    for step in steps:
        run(step.format(donor=donor, dest=destdir))
    get_donor_systemd_units(donor, destdir)
def get_donor_systemd_units(donor, destdir):
    """Archive the donor's jujud-machine systemd directory into *destdir*.

    Args:
        donor: Juju id of the donor machine.
        destdir: Local directory that receives ``systemd.tar``.
    """
    archive_cmd = "juju ssh {0} 'sudo tar -cvf /tmp/systemd.tar /lib/systemd/system/jujud-machine-{0}'".format(donor)
    run(archive_cmd)
    fetch_cmd = "juju scp {}:/tmp/systemd.tar {}".format(donor, destdir)
    run(fetch_cmd)
def determine_juju_version(machine_number):
    """Return the first tools entry on the machine that matches ``^2.``.

    Entries of ``/var/lib/juju/tools`` are filtered with ``egrep ^2.`` and
    sorted with ``sort -n``; the first one is returned.

    Args:
        machine_number: Juju id of the machine, used as ``juju ssh`` target.
    """
    list_cmd = "juju ssh {0} 'ls /var/lib/juju/tools |egrep ^2.|sort -n|head -n1'".format(machine_number)
    return run(list_cmd, output=True)
def parse_machines_to_recover(status_json):
    """Collect the machines and units that need recovery from a juju status.

    Args:
        status_json: Parsed ``juju status --format json`` output; its
            ``applications`` and ``machines`` mappings are read.

    Returns:
        A ``(machines, units)`` tuple of lists. ``units`` holds the names of
        units whose juju-status is ``lost``. ``machines`` holds the machines
        of those units followed by top-level machines whose juju-status is
        ``down``, without duplicates, in the order encountered.
    """
    machines = []
    units = []
    for app_data in status_json['applications'].values():
        # Tolerate application entries with no 'units' key (presumably
        # subordinates or applications with zero units - confirm against the
        # Juju version in use) instead of failing with KeyError.
        for unit_name, unit_data in app_data.get('units', {}).items():
            if unit_data['juju-status']['current'] != "lost":
                continue
            units.append(unit_name)
            machine = unit_data['machine']
            if machine not in machines:
                machines.append(machine)
    for machine_number, machine_data in status_json['machines'].items():
        is_down = machine_data['juju-status']['current'] == "down"
        if is_down and machine_number not in machines:
            machines.append(machine_number)
    return machines, units
def parse_donor(status_json):
    """Pick a healthy machine whose agent files can serve as a template.

    Candidates are the top-level machines whose juju-status is ``started``,
    minus any machine that hosts a unit whose juju-status is ``lost``.

    Args:
        status_json: Parsed ``juju status --format json`` output; its
            ``applications`` and ``machines`` mappings are read.

    Returns:
        The id of the first remaining candidate machine.

    Raises:
        Exception: If no candidate is left.
    """
    candidates = [
        machine_number
        for machine_number, machine_data in status_json['machines'].items()
        if machine_data['juju-status']['current'] == "started"
    ]
    for app_data in status_json['applications'].values():
        # Tolerate application entries with no 'units' key (presumably
        # subordinates or applications with zero units - confirm against the
        # Juju version in use) instead of failing with KeyError.
        for unit_data in app_data.get('units', {}).values():
            if unit_data['juju-status']['current'] != "lost":
                continue
            machine = unit_data['machine']
            if machine in candidates:
                candidates.remove(machine)
    if not candidates:
        raise Exception("No donor candidates found")
    return candidates[0]
def disable_units(units):
    """Stop and disable the systemd service of each given unit agent.

    For every unit, ``systemctl -a`` on the unit's machine is searched for
    the service name. When there is a match the service is disabled, and it
    is stopped first if the matched text contains ``active``.

    Args:
        units: Iterable of unit names such as ``app/0``.
    """
    for unit in units:
        service = unit.replace('/', '-')
        listing = run("juju ssh {} 'sudo systemctl -a | grep {}' || true".format(unit, service))
        if not listing:
            continue
        # NOTE(review): this substring test also matches "inactive".
        if 'active' in listing:
            run("juju ssh {} 'sudo systemctl stop jujud-unit-{}.service'".format(unit, service))
        run("juju ssh {} 'sudo systemctl disable jujud-unit-{}.service'".format(unit, service))
def main():
    """Recover every lost or down machine agent of the model given in argv.

    Reads the model name from ``sys.argv[1]`` and the working directory from
    ``sys.argv[2]``. The working directory is deleted and recreated.
    """
    model, destdir = sys.argv[1], sys.argv[2]

    print("Determining the Primary MongoDB unit")
    controller = determine_primary_mongo()

    # Start from an empty working directory.
    if os.path.exists(destdir):
        shutil.rmtree(destdir)
    os.mkdir(destdir)

    run("juju switch {}".format(model))
    model_uuid = get_model_uuid(controller, model)
    print(model_uuid)

    # Query first healthy unit to select as donor
    print("Attempting to find a healthy machine donor")
    donor = parse_donor(json.loads(run("juju status --format json")))
    passwordhash = get_donor_password(controller, donor, model_uuid)
    print(passwordhash)
    get_agent_from_donor(donor, destdir)

    # The status is queried again after the donor files have been collected.
    print("Getting list of machines to recover")
    current_status = json.loads(run("juju status --format json"))
    machines, units = parse_machines_to_recover(current_status)
    disable_units(units)
    print("Machines to be recovered: {}".format(machines))

    juju_tar = "{}/juju.tar".format(destdir)
    systemd_tar = destdir + "/systemd.tar"
    for machine in machines:
        recover_machine(machine, juju_tar, donor, passwordhash, controller,
                        model_uuid, systemd_tar)
def test():
    """Manual check: reset the working dir and print the mongo primary.

    Reads the model from ``sys.argv[1]`` and the working directory from
    ``sys.argv[3]``; ``sys.argv[2]`` is read but not used.
    """
    model, _controller, destdir = sys.argv[1], sys.argv[2], sys.argv[3]

    # Start from an empty working directory.
    if os.path.exists(destdir):
        shutil.rmtree(destdir)
    os.mkdir(destdir)

    run("juju switch {}".format(model))
    primary = determine_primary_mongo()
    print(primary)
if __name__ == "__main__":
    # main() reads sys.argv[1] (model name) and sys.argv[2] (dest dir), so
    # show the usage text unless both are present instead of failing later
    # with an IndexError.
    if len(sys.argv) < 3:
        print(__doc__.format(sys.argv[0]))
        sys.exit(-1)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment