Skip to content

Instantly share code, notes, and snippets.

@niedbalski
Last active September 30, 2015 19:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save niedbalski/aceba280b0365bdff46f to your computer and use it in GitHub Desktop.
Save niedbalski/aceba280b0365bdff46f to your computer and use it in GitHub Desktop.
partition-recover-rabbitmq.py
# juju-deployer bundle: three rabbitmq-server units with autoheal
# partition handling enabled.
rabbitmq-autoheal:
  series: "trusty"
  services:
    rabbitmq-server:
      branch: "lp:~openstack-charmers/charms/trusty/rabbitmq-server/next"
      constraints: mem=1G
      num_units: 3
      options:
        cluster-partition-handling: autoheal

How to test partition/autoheal.

$ juju deployer -c autoheal.yaml rabbitmq-autoheal
$ chmod 0755 partition-recover-rabbitmq.py
$ ./partition-recover-rabbitmq.py
ubuntu@niedbalski-bastion:~/bundles$ ./partition-recover.py 
Waiting for cluster partition to occur
Waiting for cluster partition to occur
Cluster is partitioned
Waiting for autoheal to recover
Recovered cluster
["/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log:Autoheal request received from 'rabbit@juju-amulet-machine-2'",
 '/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-',
 '/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-=ERROR REPORT==== 30-Sep-2015::16:07:48 ===',
 "/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-Mnesia('rabbit@juju-amulet-machine-1'): ** ERROR ** mnesia_event got {inconsistent_database, running_partitioned_network, 'rabbit@juju-amulet-machine-3'}",
 '/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-',
 '/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-=INFO REPORT==== 30-Sep-2015::16:07:48 ===',
 '/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log:Autoheal decision',
 "/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-  * Partitions: [['rabbit@juju-amulet-machine-3'],",
 "/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-                 ['rabbit@juju-amulet-machine-2',",
 "/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-                  'rabbit@juju-amulet-machine-1']]",
 "/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-  * Winner:     'rabbit@juju-amulet-machine-2'",
 "/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-  * Losers:     ['rabbit@juju-amulet-machine-3']",
 '/var/log/rabbitmq/rabbit@juju-amulet-machine-1.log-',
[...snip...]
#!/usr/bin/env python
"""
This script is intended to test the partition/recover mechanism
using autoheal on rabbitmq-server.
"""
__author__ = "Jorge Niedbalski <jnr@metaklass.org>"
from optparse import OptionParser
import subprocess
import yaml
import re
import time
import pprint
def juju_run(command, service=None, unit=None):
    """Run a shell command on remote machines through ``juju run``.

    The target is chosen in priority order: ``service`` (passed as
    ``--service``), then ``unit`` (passed as ``--unit``), otherwise ``--all``.

    :param command: shell command line to execute remotely.
    :param service: optional service name to target.
    :param unit: optional unit name to target; ignored when ``service``
        is given.
    :returns: the stdout of ``juju run`` parsed as YAML.
    :raises subprocess.CalledProcessError: if ``juju run`` exits non-zero.
    """
    cmd = ["juju", "run"]
    if service:
        cmd.extend(["--service", service])
    elif unit:
        cmd.extend(["--unit", unit])
    else:
        cmd.append("--all")
    cmd.append(command)
    # safe_load only builds plain YAML types. A bare yaml.load() can
    # construct arbitrary Python objects and requires an explicit Loader
    # argument on current PyYAML releases.
    return yaml.safe_load(subprocess.check_output(cmd))
def juju_status():
    """Return the output of ``juju status --format=yaml`` parsed as YAML.

    :raises subprocess.CalledProcessError: if ``juju status`` exits non-zero.
    """
    raw_status = subprocess.check_output(["juju", "status", "--format=yaml"])
    # safe_load only builds plain YAML types. A bare yaml.load() can
    # construct arbitrary Python objects and requires an explicit Loader
    # argument on current PyYAML releases.
    return yaml.safe_load(raw_status)
def cluster_status():
    """Run ``rabbitmqctl cluster_status`` against the rabbitmq-server service.

    :returns: whatever ``juju_run`` returns for that command.
    """
    status_command = "rabbitmqctl cluster_status"
    target_service = "rabbitmq-server"
    return juju_run(status_command, service=target_service)
def get_unit_ip_address(unit_id):
    """Return the public address of one rabbitmq-server unit.

    :param unit_id: unit number, i.e. the part after ``rabbitmq-server/``.
    :returns: the ``public-address`` value reported by ``juju_status``.
    :raises KeyError: if the unit, or its ``public-address``, is missing
        from the status output.
    """
    unit_name = "rabbitmq-server/{}".format(unit_id)
    units = juju_status()['services']['rabbitmq-server']['units']
    # Index directly instead of using .get(): for an unknown unit .get()
    # returns None and the next subscript fails with an opaque TypeError.
    try:
        return units[unit_name]['public-address']
    except KeyError:
        raise KeyError(
            "no public-address found for unit {}".format(unit_name))
def get_running_nodes(stdout):
    """Extract running node host names from ``rabbitmqctl cluster_status``.

    Only the text between the ``running_nodes`` marker and the next
    ``partitions`` marker is searched.

    :param stdout: raw text printed by ``rabbitmqctl cluster_status``.
    :returns: list of the strings that follow ``rabbit@`` (up to the closing
        quote) in that section; an empty list when the text has no
        ``running_nodes`` section at all.
    """
    pieces = stdout.split('running_nodes')
    if len(pieces) < 2:
        # No running_nodes section (e.g. rabbitmqctl printed an error):
        # report "no nodes running" instead of failing with IndexError.
        return []
    section = pieces[1].split("partitions")[0]
    return re.findall(r"rabbit@([^']+)", section)
def restore_network():
    """Flush iptables and set the INPUT policy to ACCEPT.

    The command is handed to ``juju_run`` without a service or unit.
    """
    flush_and_accept = "sudo iptables -F && sudo iptables -P INPUT ACCEPT"
    juju_run(flush_and_accept)
def drop_network(unit_id, drop_unit_id):
    """Make ``unit_id`` drop inbound TCP traffic sent by ``drop_unit_id``.

    The remote command first flushes iptables and then inserts a DROP rule
    on ``eth0`` for the public address of ``drop_unit_id``.

    NOTE(review): because of the leading ``iptables -F``, a rule inserted by
    an earlier call on the same unit is removed again -- confirm this is
    intended when several units are dropped.

    :param unit_id: unit on which the rule is installed (passed to
        ``juju_run`` as ``unit``).
    :param drop_unit_id: unit number whose address gets blocked.
    """
    blocked_address = get_unit_ip_address(drop_unit_id)
    rule_command = (
        "sudo iptables -F && sudo iptables -I INPUT -i eth0 -p tcp -s {} -j DROP"
    ).format(blocked_address)
    juju_run(rule_command, unit=unit_id)
class ClusterNotFormed(Exception):
    """Signals that a unit does not see the expected number of running nodes."""
def get_cluster_nodes():
    """Collect the running nodes reported by each entry of the cluster status.

    :returns: list of ``(UnitId, running node names)`` tuples, one for every
        status entry that produced some stdout.
    :raises ClusterNotFormed: when an entry reports a number of running
        nodes different from the number of entries in the status reply.
    """
    status = cluster_status()
    seen = []
    for entry in status:
        output = entry.get('Stdout')
        if not output:
            # Entries without stdout are skipped.
            continue
        running = get_running_nodes(output)
        if len(running) != len(status):
            raise ClusterNotFormed()
        seen.append((entry.get("UnitId"), running))
    return seen
def partition(units, poll_interval=1):
    """Cut ``units`` off from the other units and wait for the partition.

    Every unit that is not listed in ``units`` gets a rule dropping traffic
    from each listed unit. The cluster status is then polled until
    ``get_cluster_nodes`` raises ``ClusterNotFormed``. A ``ClusterNotFormed``
    raised by the very first status query is reported the same way.

    :param units: iterable of unit numbers to isolate.
    :param poll_interval: seconds to pause between two status polls.
    """
    # Built once: the full unit names of the units being isolated.
    isolated = {"rabbitmq-server/%s" % unit for unit in units}
    try:
        for unit_id, _nodes in get_cluster_nodes():
            if unit_id not in isolated:
                for drop_unit_id in units:
                    drop_network(unit_id, drop_unit_id)
        while True:
            # A single parenthesised string prints identically on
            # Python 2 and Python 3.
            print("Waiting for cluster partition to occur")
            get_cluster_nodes()
            time.sleep(poll_interval)
    except ClusterNotFormed:
        print("Cluster is partitioned")
def recover(poll_interval=1):
    """Restore the network, wait for the cluster to re-form, show the logs.

    The network is restored once, then the cluster status is polled until
    ``get_cluster_nodes`` stops raising ``ClusterNotFormed``. Finally the
    autoheal-related lines of the rabbitmq logs are pretty-printed per unit.

    :param poll_interval: seconds to pause between two status polls.
    """
    restore_network()
    # Poll in a loop rather than recursing: a slow recovery must not grow
    # the call stack or flush iptables again on every retry.
    while True:
        try:
            get_cluster_nodes()
        except ClusterNotFormed:
            print("Waiting for autoheal to recover")
            time.sleep(poll_interval)
        else:
            break
    print("Recovered cluster")
    output = juju_run(
        "sudo grep -A 10 -i autoheal /var/log/rabbitmq/*.log | head -n 50",
        service="rabbitmq-server")
    for unit in output or []:
        # A unit without any matching log line may carry no Stdout entry.
        pprint.pprint((unit.get('Stdout') or '').splitlines())
def _parse_flag(value):
    """Interpret a command-line flag value as a boolean."""
    if isinstance(value, str):
        return value.strip().lower() not in ("", "0", "false", "no", "off")
    return bool(value)


def parse_options(argv=None):
    """Parse the command-line options of the script.

    ``--partition`` and ``--recover`` take a value; ``0``, ``false``, ``no``,
    ``off`` and the empty string (case-insensitive) disable the step, any
    other value enables it. Both default to enabled.

    :param argv: optional list of arguments to parse; ``None`` lets optparse
        read ``sys.argv[1:]``.
    :returns: the optparse values object with ``units`` (the default list
        ``[0]`` or the raw string given on the command line), ``partition``
        and ``recover`` (booleans).
    """
    parser = OptionParser()
    parser.add_option("-u", "--units", dest="units", default=[0],
                      help="Units to partition", metavar="units")
    parser.add_option("-p", "--partition", default=True,
                      dest="partition",
                      help="Cause a partition")
    parser.add_option("-r", "--recover", default=True,
                      help="Check autoheal",
                      dest="recover")
    options, _args = parser.parse_args(argv)
    # Both options arrive as strings, so "false" would otherwise be truthy
    # and the steps could never be switched off.
    options.partition = _parse_flag(options.partition)
    options.recover = _parse_flag(options.recover)
    return options
def main():
    """Entry point: optionally partition the cluster, then optionally recover.

    When the partition step runs, a ``units`` value that is not already a
    list is split on commas before being passed to ``partition``.
    """
    opts = parse_options()
    if opts.partition:
        already_list = isinstance(opts.units, list)
        opts.units = opts.units if already_list else opts.units.split(",")
        partition(opts.units)
    if opts.recover:
        recover()
# Run the test only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment