$ juju deployer -c autoheal.yaml rabbitmq-autoheal
$ chmod 0755 partition-recover-rabbitmq.py
$ ./partition-recover-rabbitmq.py
ubuntu@niedbalski-bastion:~/bundles$ ./partition-recover.py
Waiting for cluster partition to occur
Waiting for cluster partition to occur
Cluster is partitioned
Waiting for autoheal to recover
Recovered cluster
["/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log:Autoheal request received from 'rabbit@juju-amulet-machine-2'",
'/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-',
'/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-=ERROR REPORT==== 30-Sep-2015::16:07:48 ===',
"/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-Mnesia('rabbit@juju-amulet-machine-1'): ** ERROR ** mnesia_event got {inconsistent_database, running_partitioned_network, 'rabbit@juju-amulet-machine-3'}",
'/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-',
'/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-=INFO REPORT==== 30-Sep-2015::16:07:48 ===',
'/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log:Autoheal decision',
"/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log- * Partitions: [['rabbit@juju-amulet-machine-3'],",
"/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log- ['rabbit@juju-amulet-machine-2',",
"/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log- 'rabbit@juju-amulet-machine-1']]",
"/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log- * Winner: 'rabbit@juju-amulet-machine-2'",
"/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log- * Losers: ['rabbit@juju-amulet-machine-3']",
'/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-',
[...snip...]
Last active
September 30, 2015 19:19
-
-
Save niedbalski/aceba280b0365bdff46f to your computer and use it in GitHub Desktop.
partition-recover-rabbitmq.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rabbitmq-autoheal:
  series: "trusty"
  services:
    rabbitmq-server:
      branch: "lp:~openstack-charmers/charms/trusty/rabbitmq-server/next"
      constraints: mem=1G
      num_units: 3
      options:
        cluster-partition-handling: autoheal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
This script is intended to test the partition/recover mechanism | |
using autoheal on rabbitmq-server. | |
""" | |
__author__ = "Jorge Niedbalski <jnr@metaklass.org>" | |
from optparse import OptionParser | |
import subprocess | |
import yaml | |
import re | |
import time | |
import pprint | |
def juju_run(command, service=None, unit=None):
    """Run a shell command on deployed machines through ``juju run``.

    The target is picked in priority order: ``service`` when given, else
    ``unit`` when given, else every machine (``--all``).

    :param command: shell command line, passed to ``juju run`` as a single
        argument.
    :param service: optional service name, forwarded as ``--service``.
    :param unit: optional unit name, forwarded as ``--unit``; ignored when
        ``service`` is set.
    :returns: the YAML-decoded standard output of ``juju run``.  Callers in
        this file iterate it as a sequence of mappings and read the
        ``Stdout`` / ``UnitId`` keys.
    :raises subprocess.CalledProcessError: if ``juju run`` exits non-zero.
    """
    cmd = ["juju", "run"]
    if service:
        cmd.extend(["--service", service])
    elif unit:
        cmd.extend(["--unit", unit])
    else:
        cmd.append("--all")
    cmd.append(command)
    # safe_load: the output is plain data, so there is no reason to allow
    # arbitrary object construction; a bare yaml.load() without a Loader is
    # also rejected by recent PyYAML releases.
    return yaml.safe_load(subprocess.check_output(cmd))
def juju_status():
    """Return the decoded output of ``juju status --format=yaml``.

    :returns: the YAML-decoded status document.
    :raises subprocess.CalledProcessError: if ``juju status`` exits non-zero.
    """
    raw_status = subprocess.check_output(["juju", "status", "--format=yaml"])
    # safe_load: the status document is plain data; a bare yaml.load()
    # without a Loader is unsafe and rejected by recent PyYAML releases.
    return yaml.safe_load(raw_status)
def cluster_status():
    """Run ``rabbitmqctl cluster_status`` against the rabbitmq-server service.

    :returns: whatever juju_run() returns for that command.
    """
    command = "rabbitmqctl cluster_status"
    return juju_run(command, service="rabbitmq-server")
def get_unit_ip_address(unit_id):
    """Return the ``public-address`` of one rabbitmq-server unit.

    :param unit_id: unit number (or its string form); it is appended to
        ``rabbitmq-server/`` to build the unit name looked up in the status.
    :returns: the ``public-address`` value reported by ``juju status``.
    :raises KeyError: if the status document has no such unit (or the
        expected ``services`` / ``rabbitmq-server`` / ``units`` keys).
    """
    unit_name = "rabbitmq-server/{}".format(unit_id)
    units = juju_status()['services']['rabbitmq-server']['units']
    unit = units.get(unit_name)
    if unit is None:
        # Fail with the offending unit name instead of the opaque
        # "'NoneType' object is not subscriptable" the bare lookup produced.
        raise KeyError("unit {!r} not found in juju status".format(unit_name))
    return unit['public-address']
def get_running_nodes(stdout):
    """Extract the running node host names from ``cluster_status`` output.

    Only the bracketed list that follows ``running_nodes`` is scanned, so
    other entries that mention ``rabbit@...`` (for example a ``cluster_name``
    binary printed between ``running_nodes`` and ``partitions``) are not
    mistaken for nodes.

    :param stdout: text printed by ``rabbitmqctl cluster_status``.
    :returns: list of the host parts (the text after ``rabbit@``) in the
        order they appear; an empty list when the output contains no
        ``running_nodes`` section.
    """
    section = re.search(r"running_nodes,\s*\[(.*?)\]", stdout, re.DOTALL)
    if section is None:
        # e.g. rabbitmqctl printed an error instead of a status term.
        return []
    # Node atoms may be quoted or bare; stop at quote, comma, bracket or space.
    return re.findall(r"rabbit@([^',\]\s]+)", section.group(1))
def restore_network():
    """Flush iptables and reset the INPUT policy to ACCEPT.

    The command is sent through juju_run() with no service or unit, i.e. to
    its default target.
    """
    flush_and_accept = "sudo iptables -F && sudo iptables -P INPUT ACCEPT"
    juju_run(flush_and_accept)
def drop_network(unit_id, drop_unit_id):
    """Make ``unit_id`` drop incoming TCP traffic from another unit.

    A DROP rule for the source unit's public address is inserted into the
    INPUT chain (interface ``eth0``) on ``unit_id``.  The rule is added only
    if it is not already present, and existing rules are left in place, so
    calling this repeatedly for different ``drop_unit_id`` values blocks all
    of them.  (Flushing the chain before each insert, as was done before,
    discarded every rule except the last one added.)

    :param unit_id: unit name the rule is installed on (passed to
        ``juju run --unit``).
    :param drop_unit_id: unit number of the rabbitmq-server unit whose
        traffic is dropped; resolved through get_unit_ip_address().
    """
    source_address = get_unit_ip_address(drop_unit_id)
    rule = "INPUT -i eth0 -p tcp -s {} -j DROP".format(source_address)
    # `-C` exits non-zero when the rule is absent; only then insert it.
    command = (
        "sudo iptables -C {rule} 2>/dev/null || sudo iptables -I {rule}"
    ).format(rule=rule)
    juju_run(command, unit=unit_id)
class ClusterNotFormed(Exception):
    """Signals that the RabbitMQ cluster is not fully formed."""
def get_cluster_nodes():
    """Return the running nodes seen by each unit of the cluster.

    Every entry of cluster_status() that carries a non-empty ``Stdout`` is
    parsed with get_running_nodes(); entries without output are skipped.

    :returns: list of ``(UnitId, running_nodes)`` tuples.
    :raises ClusterNotFormed: as soon as one unit reports a number of
        running nodes different from the number of status entries.
    """
    reports = cluster_status()
    expected_count = len(reports)
    formed = []
    for report in reports:
        output = report.get('Stdout', None)
        if not output:
            continue
        running = get_running_nodes(output)
        if len(running) != expected_count:
            raise ClusterNotFormed()
        formed.append((report.get("UnitId"), running))
    return formed
def partition(units, poll_interval=1):
    """Cut the given units off from the rest of the cluster and wait.

    On every unit that is *not* listed in ``units``, traffic coming from
    each listed unit is dropped via drop_network().  The cluster is then
    polled until get_cluster_nodes() raises ClusterNotFormed, which is
    reported as the partition having occurred.  If the cluster is already
    not formed when this is called, that is reported immediately.

    :param units: iterable of rabbitmq-server unit numbers to isolate.
    :param poll_interval: seconds to pause between two polls of the cluster
        status.
    """
    # Build the set of isolated unit names once instead of per cluster unit.
    isolated = set("rabbitmq-server/%s" % unit for unit in units)
    try:
        for unit_id, _nodes in get_cluster_nodes():
            if unit_id in isolated:
                continue
            for drop_unit_id in units:
                drop_network(unit_id, drop_unit_id)
        while True:
            print("Waiting for cluster partition to occur")
            get_cluster_nodes()
            # Avoid hammering juju with back-to-back status requests.
            time.sleep(poll_interval)
    except ClusterNotFormed:
        print("Cluster is partitioned")
def recover(poll_interval=1):
    """Restore the network, wait for the cluster to heal, show autoheal logs.

    The firewall rules are reset once with restore_network(); the cluster is
    then polled in a loop (rather than by recursion, which could exhaust the
    interpreter's recursion limit on a slow recovery) until
    get_cluster_nodes() stops raising ClusterNotFormed.  Finally the
    autoheal-related lines of the RabbitMQ logs of each unit are
    pretty-printed.

    :param poll_interval: seconds to pause between two polls of the cluster
        status.
    """
    restore_network()
    while True:
        try:
            get_cluster_nodes()
        except ClusterNotFormed:
            print("Waiting for autoheal to recover")
            time.sleep(poll_interval)
        else:
            break
    print("Recovered cluster")
    output = juju_run(
        "sudo grep -A 10 -i autoheal /var/log/rabbitmq/*.log | head -n 50",
        service="rabbitmq-server")
    for unit in output:
        # A unit whose grep matched nothing may carry no Stdout at all.
        pprint.pprint((unit.get('Stdout') or '').splitlines())
# Option arguments (case-insensitive) that switch a boolean option off.
_FALSE_VALUES = frozenset(["", "0", "false", "no", "off"])


def _store_bool(option, opt_str, value, parser):
    """optparse callback: store the option's argument as a real boolean."""
    enabled = value.strip().lower() not in _FALSE_VALUES
    setattr(parser.values, option.dest, enabled)


def parse_options(args=None):
    """Parse the command line of the script.

    ``-p/--partition`` and ``-r/--recover`` take one argument, as before,
    but it is now interpreted as a boolean: ``0``, ``false``, ``no``,
    ``off`` or an empty string disable the step, anything else enables it.
    Previously the raw string was stored, so any non-empty argument --
    including ``false`` -- left the step enabled.

    :param args: optional list of arguments to parse; defaults to
        ``sys.argv[1:]`` (optparse's own default).
    :returns: the optparse values object with ``units`` (``[0]`` by default,
        otherwise the raw string given on the command line), ``partition``
        and ``recover`` (both ``True`` by default).
    """
    parser = OptionParser()
    parser.add_option("-u", "--units", dest="units", default=[0],
                      help="Units to partition", metavar="units")
    parser.add_option("-p", "--partition", dest="partition", default=True,
                      action="callback", callback=_store_bool, type="string",
                      help="Cause a partition")
    parser.add_option("-r", "--recover", dest="recover", default=True,
                      action="callback", callback=_store_bool, type="string",
                      help="Check autoheal")
    options, _positional = parser.parse_args(args)
    return options
def main():
    """Entry point: optionally partition the cluster, then optionally recover.

    When partitioning is requested, a ``units`` value that is not already a
    list is split on commas before being handed to partition().
    """
    opts = parse_options()
    if opts.partition:
        units = opts.units
        if not isinstance(units, list):
            units = units.split(",")
        partition(units)
    if opts.recover:
        recover()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment