Skip to content

Instantly share code, notes, and snippets.

@g-ramirez
Forked from tpsilva/recover_machine_agents.py
Last active November 13, 2020 19:17
Show Gist options
  • Save g-ramirez/643de4ade0e27897e72d7191810a08e3 to your computer and use it in GitHub Desktop.
Save g-ramirez/643de4ade0e27897e72d7191810a08e3 to your computer and use it in GitHub Desktop.
Recover juju machine agents
#!/usr/bin/env python
"""
This is a tool for recovering lost Juju machine agents in the given model.
Usage:
{0} model-name dest-dir
"""
# Summary of changes:
# - avoid overwriting /var/lib/juju on the target so that non-machine (unit) agents are not removed
# - detect the path of the mongo binary (e.g. when it is in $PATH or under /usr/lib/juju/mongo*/bin)
# - determine the PRIMARY mongo node so that queries are run against it
# - also restore the systemd unit files
# - the controller IP no longer has to be specified; it is set once the mongo PRIMARY is detected
import json
import os
import shlex
import shutil
import subprocess
import sys
import tempfile
# Shell pipeline that prints the second field of the "apipassword:" line found
# in the machine agent.conf on the given host; %s is the juju ssh target.
MONGOPASS_CMD = "juju ssh ubuntu@%s \"sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf\" | awk -e '{print $2}'"
# Previous form of MONGO_CMD with a hard-coded mongo 3.2 binary path and the
# machine-0 user; kept for reference only.
#MONGO_CMD = "juju ssh ubuntu@%s \"sudo /usr/lib/juju/mongo3.2/bin/mongo --port 37017 --sslAllowInvalidCertificates --ssl --authenticationDatabase admin -u machine-0 -p %s juju < /home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1"
# Placeholder; determine_primary_mongo() rebinds this global to a command
# template taking (ssh target, password, script file name).
MONGO_CMD = ""
# Template filled in by gen_mongo_cmd(). The five %s slots are, in order: juju
# ssh target, mongo binary path, machine number for the "machine-N" mongo user,
# password, and the name of the script file under /home/ubuntu fed to mongo on
# stdin. The trailing "tail -n 2 | head -n 1" keeps only the second-to-last
# line of the output.
MONGO_TEMPLATE = "juju ssh -m controller ubuntu@%s \"sudo %s --port 37017 --sslAllowInvalidCertificates --ssl --authenticationDatabase admin -u machine-%s -p %s juju < /home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1"
def run(cmd, output=True, shell=True):
    """Echo a command, execute it, and return its output or exit status.

    Args:
        cmd: Command to execute; a shell string when ``shell`` is true.
        output: When true, capture stdout and return it as stripped text; a
            non-zero exit raises ``subprocess.CalledProcessError``. When
            false, discard stdout/stderr and return the exit status instead.
        shell: Passed through to ``subprocess``.

    Returns:
        The stripped stdout as ``str`` when ``output`` is true, otherwise the
        integer exit status of the command.
    """
    print(cmd)
    if output:
        result = subprocess.check_output(cmd, shell=shell)
        # Python 3 returns bytes here. Callers interpolate the result into
        # further command strings and split it on str separators, so always
        # hand back text (a no-op on Python 2, where the result is already str).
        if not isinstance(result, str):
            result = result.decode("utf-8", "replace")
        return result.strip()
    # Discard output via the null device rather than unread pipes: with
    # stdout/stderr=PIPE and nobody reading them, a chatty child can fill the
    # pipe buffer and block forever.
    with open(os.devnull, "w") as devnull:
        return subprocess.call(cmd, stderr=devnull, stdout=devnull,
                               shell=shell)
def update_machine_password(controller, model, machine_number, passwordhash):
    """Overwrite a machine's ``passwordhash`` in the controller's mongo.

    A small mongo script is written to a local temporary file, copied to
    ``/home/ubuntu`` on the controller and fed to the mongo shell there.

    Args:
        controller: juju ssh/scp target of the mongo node (``ubuntu@<controller>``).
        model: Value matched against the ``model-uuid`` field of the machine
            document.
        machine_number: Value matched against the ``machineid`` field.
        passwordhash: Hash to store in the ``passwordhash`` field.
    """
    file_content = """use juju
db.machines.update({"model-uuid": "%s", "machineid": "%s"}, {$set:{"passwordhash": "%s"}})
""" % (model, machine_number, passwordhash)
    # delete=False so the file survives being closed (and flushed) before scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
    finally:
        # Only the remote copy is used from here on; don't leak the local one.
        os.remove(tmp_file.name)
    password = run(MONGOPASS_CMD % controller)
    run(MONGO_CMD % (controller, password, os.path.basename(tmp_file.name)))
def get_model_uuid(controller, model):
    """Return the ``_id`` of the model document whose ``name`` is ``model``.

    The lookup runs as a mongo script on the controller; the line of output
    selected by ``MONGO_CMD`` is parsed as JSON.

    Args:
        controller: juju ssh/scp target of the mongo node (``ubuntu@<controller>``).
        model: Model name to search for.

    Returns:
        The ``_id`` value of the matching document.

    Raises:
        ValueError: If the query output is not valid JSON (e.g. no match).
    """
    file_content = """use juju
db.models.find({"name": "%s"}, {"modeluuid": 1})
""" % (model)
    # delete=False so the file survives being closed (and flushed) before scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
    finally:
        # Only the remote copy is used from here on; don't leak the local one.
        os.remove(tmp_file.name)
    password = run(MONGOPASS_CMD % controller)
    uuid_json = run(MONGO_CMD % (controller, password, os.path.basename(
        tmp_file.name)))
    return json.loads(uuid_json)['_id']
def determine_primary_mongo():
    """Find the primary mongo node and configure ``MONGO_CMD`` for it.

    The first "started" machine of the controller model is asked for
    ``rs.isMaster()['primary']``. The host part of the answer is looked up in
    ``juju machines -m controller`` to obtain the primary's machine number,
    and the global ``MONGO_CMD`` template is rebuilt with the detected mongo
    binary path and that machine number.

    Returns:
        The host part (text before the first ``:``) of the primary's address.

    Raises:
        RuntimeError: If the query returned no primary address.
    """
    global MONGO_CMD
    file_content = """use juju
rs.isMaster()['primary']
"""
    # delete=False so the file survives being closed (and flushed) before scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        first_controller_num = run('juju machines -m controller |grep started|head -n1|cut -d " " -f1')
        mongo_cmd_tuple = gen_mongo_cmd(first_controller_num)
        temp_mongo_cmd = mongo_cmd_tuple[0] % os.path.basename(tmp_file.name)
        run("juju scp -m controller {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, first_controller_num))
    finally:
        # Only the remote copy is used from here on; don't leak the local one.
        os.remove(tmp_file.name)
    primary_ip = run(temp_mongo_cmd).split(':')[0]
    if not primary_ip:
        # An empty pattern would make the grep below match every line.
        raise RuntimeError("could not determine the primary mongo node")
    # now that primary established, we'll just use old logic for MONGO_CMD variable/pass
    # but hardcoding the machine number
    # -F: treat the address literally (dots are not wildcards);
    # -w: whole-word match, so 10.0.0.1 does not also select 10.0.0.10.
    primary_controller_num = run('juju machines -m controller |grep -wF -- {}|head -n1|cut -d " " -f1'.format(primary_ip))
    MONGO_CMD = (
        "juju ssh ubuntu@%s \"{} --port 37017 --sslAllowInvalidCertificates "
        "--ssl --authenticationDatabase admin -u machine-{} -p %s juju < "
        "/home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1"
    ).format(mongo_cmd_tuple[1], primary_controller_num)
    return primary_ip
def gen_mongo_cmd(controller_num):
    """Build a mongo query command for one machine of the controller model.

    Args:
        controller_num: Machine to use as juju ssh target and as the
            ``machine-N`` mongo user.

    Returns:
        A ``(command, binary_path)`` tuple. ``command`` is ``MONGO_TEMPLATE``
        with everything filled in except the script file name, which is left
        as a single ``%s`` for the caller; ``binary_path`` is the detected
        mongo executable.
    """
    # Column 11 of the first mongo line in "ps aux"; the rev/cut/rev step drops
    # the last character of that value (presumably a trailing carriage return
    # from the ssh session -- TODO confirm).
    find_binary = '''juju ssh -m controller %s "ps aux|grep mongo"|grep -v grep|awk '{print $11}'|head -n1 | rev | cut -c 2- | rev''' % controller_num
    mongo_binary = run(find_binary)

    read_password = "juju ssh -m controller %s 'sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf' | awk -e '{print $2}'" % controller_num
    api_password = run(read_password)

    # The literal '%s' keeps the script-name slot open in the result.
    command = MONGO_TEMPLATE % (
        controller_num, mongo_binary, controller_num, api_password, '%s')
    return (command, mongo_binary)
def get_donor_password(controller, donor, model_uuid):
    """Return the ``passwordhash`` stored in mongo for the donor machine.

    Args:
        controller: juju ssh/scp target of the mongo node (``ubuntu@<controller>``).
        donor: Value matched against the ``machineid`` field.
        model_uuid: Value matched against the ``model-uuid`` field.

    Returns:
        The ``passwordhash`` value of the matching machine document.

    Raises:
        ValueError: If the query output is not valid JSON (e.g. no match).
    """
    file_content = """use juju
db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"passwordhash": 1})
""" % (model_uuid, donor)
    # delete=False so the file survives being closed (and flushed) before scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
    finally:
        # Only the remote copy is used from here on; don't leak the local one.
        os.remove(tmp_file.name)
    password = run(MONGOPASS_CMD % controller)
    attributes_json = run(MONGO_CMD % (controller, password, os.path.basename(
        tmp_file.name)))
    attributes = json.loads(attributes_json)
    return attributes['passwordhash']
def get_machine_nonce(controller, machine, model_uuid):
    """Return the ``nonce`` stored in mongo for a machine.

    Args:
        controller: juju ssh/scp target of the mongo node (``ubuntu@<controller>``).
        machine: Value matched against the ``machineid`` field.
        model_uuid: Value matched against the ``model-uuid`` field.

    Returns:
        The ``nonce`` value of the matching machine document.

    Raises:
        ValueError: If the query output is not valid JSON (e.g. no match).
    """
    file_content = """use juju
db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"nonce": 1})
""" % (model_uuid, machine)
    # delete=False so the file survives being closed (and flushed) before scp.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    # Function-call form works on both Python 2 and 3 and matches the other
    # print() calls in this file.
    print('get machine nonce reached...')
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
    finally:
        # Only the remote copy is used from here on; don't leak the local one.
        os.remove(tmp_file.name)
    password = run(MONGOPASS_CMD % controller)
    attributes_json = run(MONGO_CMD % (controller, password, os.path.basename(
        tmp_file.name)))
    attributes = json.loads(attributes_json)
    return attributes['nonce']
def _remote_sed(machine_number, expression, path):
    """Run ``sudo sed -i "<expression>" <path>`` on the machine via juju ssh."""
    run("juju ssh {} 'sudo sed -i \"{}\" {}'".format(
        machine_number, expression, path))


def _restore_agent_files(machine_number, machine_string, donor, nonce, juju_tar):
    """Unpack the donor's /var/lib/juju tree and re-label it for this machine."""
    run("juju scp {} {}:/home/ubuntu/juju.tar".format(
        juju_tar, machine_number))
    # --skip-old-files leaves anything already present on the target untouched.
    run("juju ssh {} 'sudo tar -xvf /home/ubuntu/juju.tar -C /var/lib --skip-old-files --keep-directory-symlink --dereference'".format(machine_number))
    for subdir in ("agents", "tools"):
        run("juju ssh {0} 'sudo mv -f /var/lib/juju/{1}/machine-{2} /var/lib/juju/{1}/machine-{3}'".format(
            machine_number, subdir, donor, machine_string))
    run("juju ssh {} 'echo {} | sudo tee /var/lib/juju/nonce.txt'".format(machine_number, nonce))

    # Point agent.conf at this machine instead of the donor.
    agent_file = "/var/lib/juju/agents/machine-{}/agent.conf".format(machine_string)
    _remote_sed(machine_number,
                "s|tag: machine-{}|tag: machine-{}|g".format(donor, machine_string),
                agent_file)
    _remote_sed(machine_number,
                "s|jujud-machine-{}|jujud-machine-{}|g".format(donor, machine_string),
                agent_file)
    _remote_sed(machine_number,
                "s/nonce: .*/nonce: {}/g".format(nonce),
                agent_file)


def _restore_systemd_unit(machine_number, machine_string, donor, systemd_tar):
    """Install the donor's jujud-machine systemd unit under this machine's name."""
    unit_root = "/lib/systemd/system"
    donor_dir = "{}/jujud-machine-{}".format(unit_root, donor)
    target_dir = "{}/jujud-machine-{}".format(unit_root, machine_string)

    run("juju scp {} {}:/home/ubuntu/systemd.tar".format(
        systemd_tar, machine_number))
    run("juju ssh {} 'sudo rm -rf {}'".format(machine_number, target_dir))
    run("juju ssh {} 'sudo tar -xvf /home/ubuntu/systemd.tar -C / --skip-old-files'".format(machine_number))
    run("juju ssh {} 'sudo cp -nrp {} {}'".format(machine_number, donor_dir, target_dir))
    run("juju ssh {} 'sudo cp -p {}/jujud-machine-{}.service {}/jujud-machine-{}.service'".format(
        machine_number, donor_dir, donor, target_dir, machine_string))

    exec_start_file = "{}/exec-start.sh".format(target_dir)
    jujud_unit_file = "{}/jujud-machine-{}.service".format(target_dir, machine_string)
    _remote_sed(machine_number,
                "s|machine-{}|machine-{}|g".format(donor, machine_string),
                exec_start_file)
    # --machine-id takes the original id (with "/"), not the dashed form.
    _remote_sed(machine_number,
                "s|--machine-id {}|--machine-id {}|g".format(donor, machine_number),
                exec_start_file)
    _remote_sed(machine_number,
                "s|machine-{}|machine-{}|g".format(donor, machine_string),
                jujud_unit_file)


def _restart_machine_agent(machine_number, machine_string):
    """Re-link the agents' tools directories and (re)start the machine agent."""
    link_to = determine_juju_version(machine_number)
    run("juju ssh {} 'for u in $(sudo ls /var/lib/juju/agents/|sort); do sudo ln -sf /var/lib/juju/tools/{} /var/lib/juju/tools/$u; done'".format(machine_number, link_to))
    run("juju ssh {} sudo systemctl daemon-reload".format(machine_number))
    run("juju ssh {} sudo systemctl enable /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service".format(
        machine_number, machine_string, machine_string))
    run("juju ssh {} sudo systemctl restart jujud-machine-{}".format(machine_number, machine_string))


def recover_machine(machine_number, juju_tar, donor, passwordhash, controller, model, systemd_tar):
    """Rebuild the machine agent of ``machine_number`` from a donor's files.

    Steps: fetch the machine's nonce from mongo, prepare /var/lib/juju on the
    target, unpack and re-label the donor's agent files, install the systemd
    unit, store the donor's password hash for this machine in mongo, and
    restart the agent.

    Args:
        machine_number: Id of the machine to recover; container ids contain
            ``/`` (e.g. ``0/lxd/1``).
        juju_tar: Local path of the tarball with the donor's ``juju`` directory.
        donor: Machine id the tarballs were taken from.
        passwordhash: Donor's password hash, written to this machine's
            mongo document.
        controller: juju ssh/scp target of the mongo node.
        model: Value matched against the ``model-uuid`` field in mongo.
        systemd_tar: Local path of the tarball with the donor's systemd unit.
    """
    print("Recovering machine {}".format(machine_number))
    # Paths and unit names use "-" where the machine id has "/". Applying this
    # to every id (a no-op for plain machine numbers) also covers container
    # types other than lxd, which previously ended up with "/" inside paths
    # and "/"-delimited sed expressions.
    machine_string = machine_number.replace('/', '-')

    nonce = get_machine_nonce(controller, machine_number, model)
    check_juju_dir_or_create(machine_number)
    _restore_agent_files(machine_number, machine_string, donor, nonce, juju_tar)
    _restore_systemd_unit(machine_number, machine_string, donor, systemd_tar)
    # Update mongo
    update_machine_password(controller, model, machine_number, passwordhash)
    # Restart services
    _restart_machine_agent(machine_number, machine_string)
def check_juju_dir_or_create(machine_num):
    """Make sure /var/lib/juju exists on the machine and is ready for recovery.

    If the directory is missing it is created. If it exists, any existing
    ``agents/machine-<id>`` directory is removed so that the later ``mv`` of
    the donor's directory does not fail on a non-empty target.

    Args:
        machine_num: Machine id used as the juju ssh target.
    """
    juju_dir_missing = run(
        "juju ssh ubuntu@{} 'sudo test -d /var/lib/juju'".format(machine_num),
        output=False) != 0
    if juju_dir_missing:
        run("juju ssh {} 'sudo mkdir /var/lib/juju'".format(machine_num))
        return

    # lxd container ids are written with "-" instead of "/" in directory names.
    if 'lxd' in machine_num:
        machine_string = machine_num.replace('/', '-')
    else:
        machine_string = machine_num

    agent_dir = "/var/lib/juju/agents/machine-{}".format(machine_string)
    agent_dir_present = run(
        "juju ssh ubuntu@{} 'sudo test -d {}'".format(machine_num, agent_dir),
        output=False) == 0
    if agent_dir_present:
        run("juju ssh ubuntu@{} 'sudo rm -rf {}'".format(machine_num, agent_dir),
            output=False)
def get_agent_from_donor(donor, destdir):
    """Fetch the donor's /var/lib/juju and reduce it to the machine agent.

    The directory is tarred on the donor, copied into ``destdir``, stripped of
    unit agents and per-machine state, and re-packed as ``destdir/juju.tar``.
    The donor's systemd unit is then fetched as well.

    Args:
        donor: Machine id to copy the agent from.
        destdir: Local working directory for the tarballs.
    """
    print ("Getting agent from machine {}".format(donor))

    # Pull a world-readable copy of /var/lib/juju from the donor.
    fetch_steps = (
        "juju ssh {} 'sudo chmod -R a+r /var/lib/juju'".format(donor),
        "juju ssh {} 'sudo tar -C /var/lib/ -cvf /tmp/juju.tar juju'".format(donor),
        "juju scp {}:/tmp/juju.tar {}".format(donor, destdir),
        "tar -xvf {}/juju.tar -C {}".format(destdir, destdir),
    )
    for step in fetch_steps:
        run(step)

    # Drop everything except the machine agent, plus the downloaded tarball.
    leftovers = (
        "juju/agents/unit*",
        "juju/meter-status.yaml",
        "juju/locks/*",
        "juju/tools/unit*",
        "juju/metricspool",
        "juju/nonce.txt",
        "juju.tar",
    )
    for leftover in leftovers:
        run("rm -rf {}/{}".format(destdir, leftover))

    # Re-pack the trimmed tree and discard the extracted copy.
    run("tar -cvf {0}/juju.tar -C {0} juju".format(destdir))
    run("rm -rf {}/juju".format(destdir))

    get_donor_systemd_units(donor,destdir)
def get_donor_systemd_units(donor, destdir):
    """Tar the donor's jujud-machine systemd unit directory and download it.

    Args:
        donor: Machine id whose ``/lib/systemd/system/jujud-machine-<id>``
            directory is archived to ``/tmp/systemd.tar``.
        destdir: Local directory the archive is copied into.
    """
    pack_units = "juju ssh {0} 'sudo tar -cvf /tmp/systemd.tar /lib/systemd/system/jujud-machine-{0}'".format(donor)
    download_units = "juju scp {}:/tmp/systemd.tar {}".format(donor, destdir)
    run(pack_units)
    run(download_units)
def determine_juju_version(machine_number):
    """Return the tools directory name to link agents to on the machine.

    Lists ``/var/lib/juju/tools``, keeps entries starting with ``2`` and
    returns the first one after a numeric sort.

    Args:
        machine_number: Machine id used as the juju ssh target.
    """
    list_versions = "juju ssh {} 'ls /var/lib/juju/tools |egrep ^2.|sort -n|head -n1'".format(machine_number)
    return run(list_versions, output=True)
def main():
    """Recover every machine reported as "down" in the given model.

    Reads the model name from ``sys.argv[1]`` and a scratch directory from
    ``sys.argv[2]``. The scratch directory is deleted and recreated. The
    first started non-lxd machine is used as the donor of agent files and
    password hash.
    """
    model = sys.argv[1]
    destdir = sys.argv[2]
    controller = determine_primary_mongo()
    # Start from an empty scratch directory.
    if os.path.exists(destdir):
        shutil.rmtree(destdir)
    os.mkdir(destdir)
    run("juju switch {}".format(model))
    model_uuid = get_model_uuid(controller, model)
    print(model_uuid)
    # Query first healthy unit to select as donor
    donor = run("juju machines | grep started | grep -v lxd| head -n 1 | cut -d ' ' -f 1")
    passwordhash = get_donor_password(controller, donor, model_uuid)
    print(passwordhash)
    get_agent_from_donor(donor, destdir)
    try:
        machines = run("juju machines | grep down")
    except subprocess.CalledProcessError as err:
        # grep exits with 1 when no line matched, i.e. nothing is down.
        if err.returncode != 1:
            raise
        print("No machines are down; nothing to recover")
        return
    # Both tarball paths are the same for every machine.
    juju_tar = "{}/juju.tar".format(destdir)
    systemd_tar = destdir + "/systemd.tar"
    for line in machines.split("\n"):
        fields = line.split()
        if not fields:
            # Blank line in the listing; nothing to recover.
            continue
        recover_machine(fields[0], juju_tar, donor, passwordhash, controller, model_uuid, systemd_tar)
def test():
    """Manual smoke test: switch to the model and print the primary mongo host.

    Reads the model name from ``sys.argv[1]`` and a scratch directory from
    ``sys.argv[3]``; ``sys.argv[2]`` (a controller) is accepted positionally
    but not used. The scratch directory is deleted and recreated.
    """
    model = sys.argv[1]
    destdir = sys.argv[3]
    if os.path.exists(destdir):
        shutil.rmtree(destdir)
    os.mkdir(destdir)
    run("juju switch {}".format(model))
    # Function-call form works on both Python 2 and 3 and matches the other
    # print() calls in this file.
    print(determine_primary_mongo())
if __name__ == "__main__":
    # main() reads both model-name and dest-dir, so show the usage text unless
    # both were supplied (previously a single argument crashed with IndexError).
    if len(sys.argv) < 3:
        print(__doc__.format(sys.argv[0]))
        sys.exit(-1)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment