# portante/are-shards-balanced.py

## are-shards-balanced.py
#!/usr/bin/env python2

# A script to generate a report of Elasticsearch shard usage
# (from _cat/shards?v&bytes=b) to see which shards of an index
# are not evenly spread across the cluster, and for a set of
# hard-coded date patterns, which date patterns are not spread
# evenly across the cluster.
#
# E.g.
# $ curl -X GET http://localhost:9200/_cat/shards?v\&bytes=b -o shards.lis
# $ ./are-shards-balanced.py shards.lis

import sys
import re
import collections
import operator


class Shard(object):
    """One data row of ``_cat/shards`` output, already split on whitespace.

    ``parts`` is read positionally, in the column order
    ``index shard prirep state docs store ip node``.  The ``shard``,
    ``docs`` and ``store`` columns are converted with ``int()``; the other
    stored columns are kept as the strings they arrived as.  The ``ip``
    column is not stored.

    Raises ``IndexError`` when ``parts`` is too short and ``ValueError``
    when a numeric column does not parse as an integer.
    """

    # Position of each column within a split row.
    _INDEX, _SHARD, _PRIREP, _STATE, _DOCS, _STORE, _IP, _NODE = range(8)

    def __init__(self, parts):
        self.index = parts[self._INDEX]
        self.shard = int(parts[self._SHARD])
        self.prirep = parts[self._PRIREP]
        self.state = parts[self._STATE]
        self.docs = int(parts[self._DOCS])
        self.store = int(parts[self._STORE])
        # parts[self._IP] is deliberately skipped: only the node name is kept.
        self.node = parts[self._NODE]


# Patterns that recognise a date at the very end of an index name.  In each
# one group(1) is a non-empty prefix and group(2) is the complete date text
# exactly as it appears in the name.  Every numeric field only has a minimum
# width, so longer digit runs are accepted too.
dotdate_r = re.compile(r"(.+)(([0-9]{4,})\.([0-9]{2,})\.([0-9]{2,}))$")  # <prefix>YYYY.MM.DD
dashdate_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,})-([0-9]{2,}))$")  # <prefix>YYYY-MM-DD
dashdateym_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,}))$")  # <prefix>YYYY-MM
numdate_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,})([0-9]{2,}))$")  # <prefix>YYYYMMDD
numdateym_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,}))$")  # <prefix>YYYYMM


class Index(object):
    """Running totals over every shard seen so far for a single index.

    Attributes:
        index:   the index name, taken from the first shard.
        shards:  every shard object passed to ``add()``, in arrival order.
        prirep:  number of shards seen per ``prirep`` value.
        nodes_p: number of primary ('p') shards per node name.
        nodes_r: number of replica ('r') shards per node name.
        states:  number of shards per ``state`` value.
        docs:    sum of the shards' ``docs``.
        store:   sum of the shards' ``store``.
        date:    date text found at the end of the index name, or ``None``.
    """

    # Tried in this order; the first pattern that matches supplies the date.
    _patterns = [dotdate_r, dashdate_r, dashdateym_r, numdate_r, numdateym_r]

    def __init__(self, shard):
        self.index = shard.index
        self.shards = []
        self.prirep = collections.defaultdict(int)
        self.nodes_p = collections.defaultdict(int)
        self.nodes_r = collections.defaultdict(int)
        self.states = collections.defaultdict(int)
        self.docs = 0
        self.store = 0
        self.add(shard)
        # Stays None unless one of the patterns matches the index name.
        self.date = None
        for pattern in self._patterns:
            found = self._date(pattern)
            if found:
                self.date = found
                break

    def add(self, shard):
        """Fold one more shard of this index into the totals.

        Raises ``Exception`` when the shard belongs to a different index or
        when its ``prirep`` is neither 'p' nor 'r'.
        """
        if self.index != shard.index:
            raise Exception("mismatched index/shard: %s != %s" % (self.index, shard.index))
        self.shards.append(shard)
        kind = shard.prirep
        self.prirep[kind] += 1
        # Pick the per-node counter that matches the shard's role.
        per_node = {'p': self.nodes_p, 'r': self.nodes_r}.get(kind)
        if per_node is None:
            raise Exception("Logic bomb!")
        per_node[shard.node] += 1
        self.states[shard.state] += 1
        self.docs += shard.docs
        self.store += shard.store

    def _date(self, pat):
        """Return the date text ``pat`` finds in the index name, else None."""
        match = pat.match(self.index)
        return match.group(2) if match is not None else None


# index                     shard prirep state      docs     store ip           node
# metaxor-stage-2018.08.20  0     p      STARTED  309761  85870742 10.16.153.15 mayo-gprfs006

# Index name -> Index aggregate, filled from the file named on the command
# line (the saved output of _cat/shards?v&bytes=b).
indices = {}

if len(sys.argv) < 2:
    sys.exit("usage: %s shards.lis" % sys.argv[0])

# Rows that stop before the node column cannot be placed on any node, so
# they are counted here and left out of the balance report.
skipped_rows = 0

with open(sys.argv[1], "r") as fp:
    # The first line is the column header; discard it.
    fp.readline()
    for line in fp:
        # split() already discards the line terminator.  Slicing the last
        # character off by hand would truncate the node name on a final
        # line that has no trailing newline.
        parts = line.split()
        if not parts:
            # Blank line: nothing to record.
            continue
        if len(parts) < 8:
            # docs/store/ip/node are missing (as _cat/shards prints for
            # shards that are not assigned to a node); Shard() would raise.
            skipped_rows += 1
            continue
        shd = Shard(parts)
        if shd.index not in indices:
            indices[shd.index] = Index(shd)
        else:
            indices[shd.index].add(shd)

if skipped_rows:
    # Reported on stderr so the report on stdout keeps its format.
    sys.stderr.write("NOTE: skipped %d shard row(s) with no node column\n" % skipped_rows)

# Per-index check: an index is reported when some node holds more than one
# of its primary shards ("pri") or more than one of its replicas ("rep").
# The scan is done once; both the verdict and the report are derived from it.
crowded = []
for name, idx in sorted(indices.items()):
    for label, per_node in (("pri", idx.nodes_p), ("rep", idx.nodes_r)):
        # Nodes are sorted so that the report is the same on every run.
        node_list = ["%s (%d)" % (node, count)
                     for node, count in sorted(per_node.items())
                     if count > 1]
        if node_list:
            crowded.append((label, name, node_list))

# One entry per (index, pri/rep) combination that has a crowded node.
need_rebalancing = len(crowded)

# print is called with a single parenthesised string throughout, which
# produces the same output under Python 2 and Python 3.
if need_rebalancing == 0:
    print("HEALTHY: all shards appear to be balanced properly.")
else:
    print("WARNING: shards appear to be unbalanced across nodes:")
    for label, name, node_list in crowded:
        print("%s %s: %s" % (label, name, ', '.join(node_list)))

print("")
# Now check, for each date found in the index names, whether the primary
# shards of all indices carrying that date are spread evenly over the nodes.

# date text -> {node name: number of primary shards on that node}
date_by_nodes = {}
for name, idx in indices.items():
    if idx.date is None:
        continue
    date_nodes = date_by_nodes.setdefault(idx.date, collections.defaultdict(int))
    for node, count in idx.nodes_p.items():
        date_nodes[node] += count

# A date counts as unbalanced when the busiest and the quietest node differ
# by more than one shard.  Comparing every node against the truncated
# average would also flag layouts such as 3/2/2, which cannot be made any
# more even.
# NOTE: only nodes holding at least one primary shard for the date take part
# in the comparison; a node with none of them is not visible here.
unbalanced = []
for date, node_counts in sorted(date_by_nodes.items()):
    if not node_counts:
        # No primary shards were seen for this date: nothing to compare.
        continue
    counts = list(node_counts.values())
    if max(counts) - min(counts) > 1:
        unbalanced.append((date, node_counts))

need_rebalancing = len(unbalanced)

if need_rebalancing == 0:
    print("HEALTHY: all dates appear to be balanced properly.")
else:
    print("WARNING: dates appear to be unbalanced across nodes:")
    # Only dates that passed the non-empty guard above are listed, so the
    # report can no longer divide by zero on a date without primaries.
    for date, node_counts in unbalanced:
        node_list = ["%s (%d)" % (node, count)
                     for node, count in sorted(node_counts.items())]
        print("%s: %s" % (date, ', '.join(node_list)))
	#!/usr/bin/env python2

	# A script to generate a report of Elasticsearch shard usage
	# (from _cat/shards?v&bytes=b) to see which shards of an index
	# are not evenly spread across the cluster, and for a set of
	# hard-coded date patterns, which date patterns are not spread
	# evenly across the cluster.
	#
	# E.g.
	# $ curl -X GET http://localhost:9200/_cat/shards?v\&bytes=b -o shards.lis
	# $ ./are-shards-balanced.py shards.lis

	import sys
	import re
	import collections
	import operator


	class Shard(object):
		"""One data row of ``_cat/shards`` output, already split on whitespace.

		``parts`` is read positionally, in the column order
		``index shard prirep state docs store ip node``.  The ``shard``,
		``docs`` and ``store`` columns are converted with ``int()``; the other
		stored columns are kept as the strings they arrived as.  The ``ip``
		column is not stored.

		Raises ``IndexError`` when ``parts`` is too short and ``ValueError``
		when a numeric column does not parse as an integer.
		"""

		# Position of each column within a split row.
		_INDEX, _SHARD, _PRIREP, _STATE, _DOCS, _STORE, _IP, _NODE = range(8)

		def __init__(self, parts):
			self.index = parts[self._INDEX]
			self.shard = int(parts[self._SHARD])
			self.prirep = parts[self._PRIREP]
			self.state = parts[self._STATE]
			self.docs = int(parts[self._DOCS])
			self.store = int(parts[self._STORE])
			# parts[self._IP] is deliberately skipped: only the node name is kept.
			self.node = parts[self._NODE]


	# Patterns that recognise a date at the very end of an index name.  In each
	# one group(1) is a non-empty prefix and group(2) is the complete date text
	# exactly as it appears in the name.  Every numeric field only has a minimum
	# width, so longer digit runs are accepted too.
	dotdate_r = re.compile(r"(.+)(([0-9]{4,})\.([0-9]{2,})\.([0-9]{2,}))$")  # <prefix>YYYY.MM.DD
	dashdate_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,})-([0-9]{2,}))$")  # <prefix>YYYY-MM-DD
	dashdateym_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,}))$")  # <prefix>YYYY-MM
	numdate_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,})([0-9]{2,}))$")  # <prefix>YYYYMMDD
	numdateym_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,}))$")  # <prefix>YYYYMM


	class Index(object):
		"""Running totals over every shard seen so far for a single index.

		Attributes:
			index:   the index name, taken from the first shard.
			shards:  every shard object passed to ``add()``, in arrival order.
			prirep:  number of shards seen per ``prirep`` value.
			nodes_p: number of primary ('p') shards per node name.
			nodes_r: number of replica ('r') shards per node name.
			states:  number of shards per ``state`` value.
			docs:    sum of the shards' ``docs``.
			store:   sum of the shards' ``store``.
			date:    date text found at the end of the index name, or ``None``.
		"""

		# Tried in this order; the first pattern that matches supplies the date.
		_patterns = [dotdate_r, dashdate_r, dashdateym_r, numdate_r, numdateym_r]

		def __init__(self, shard):
			self.index = shard.index
			self.shards = []
			self.prirep = collections.defaultdict(int)
			self.nodes_p = collections.defaultdict(int)
			self.nodes_r = collections.defaultdict(int)
			self.states = collections.defaultdict(int)
			self.docs = 0
			self.store = 0
			self.add(shard)
			# Stays None unless one of the patterns matches the index name.
			self.date = None
			for pattern in self._patterns:
				found = self._date(pattern)
				if found:
					self.date = found
					break

		def add(self, shard):
			"""Fold one more shard of this index into the totals.

			Raises ``Exception`` when the shard belongs to a different index or
			when its ``prirep`` is neither 'p' nor 'r'.
			"""
			if self.index != shard.index:
				raise Exception("mismatched index/shard: %s != %s" % (self.index, shard.index))
			self.shards.append(shard)
			kind = shard.prirep
			self.prirep[kind] += 1
			# Pick the per-node counter that matches the shard's role.
			per_node = {'p': self.nodes_p, 'r': self.nodes_r}.get(kind)
			if per_node is None:
				raise Exception("Logic bomb!")
			per_node[shard.node] += 1
			self.states[shard.state] += 1
			self.docs += shard.docs
			self.store += shard.store

		def _date(self, pat):
			"""Return the date text ``pat`` finds in the index name, else None."""
			match = pat.match(self.index)
			return match.group(2) if match is not None else None


	# index shard prirep state docs store ip node
	# metaxor-stage-2018.08.20 0 p STARTED 309761 85870742 10.16.153.15 mayo-gprfs006

	# Index name -> Index aggregate, filled from the file named on the command
	# line (the saved output of _cat/shards?v&bytes=b).
	indices = {}

	if len(sys.argv) < 2:
		sys.exit("usage: %s shards.lis" % sys.argv[0])

	# Rows that stop before the node column cannot be placed on any node, so
	# they are counted here and left out of the balance report.
	skipped_rows = 0

	with open(sys.argv[1], "r") as fp:
		# The first line is the column header; discard it.
		fp.readline()
		for line in fp:
			# split() already discards the line terminator.  Slicing the last
			# character off by hand would truncate the node name on a final
			# line that has no trailing newline.
			parts = line.split()
			if not parts:
				# Blank line: nothing to record.
				continue
			if len(parts) < 8:
				# docs/store/ip/node are missing (as _cat/shards prints for
				# shards that are not assigned to a node); Shard() would raise.
				skipped_rows += 1
				continue
			shd = Shard(parts)
			if shd.index not in indices:
				indices[shd.index] = Index(shd)
			else:
				indices[shd.index].add(shd)

	if skipped_rows:
		# Reported on stderr so the report on stdout keeps its format.
		sys.stderr.write("NOTE: skipped %d shard row(s) with no node column\n" % skipped_rows)

	# Per-index check: an index is reported when some node holds more than one
	# of its primary shards ("pri") or more than one of its replicas ("rep").
	# The scan is done once; both the verdict and the report are derived from it.
	crowded = []
	for name, idx in sorted(indices.items()):
		for label, per_node in (("pri", idx.nodes_p), ("rep", idx.nodes_r)):
			# Nodes are sorted so that the report is the same on every run.
			node_list = ["%s (%d)" % (node, count)
						 for node, count in sorted(per_node.items())
						 if count > 1]
			if node_list:
				crowded.append((label, name, node_list))

	# One entry per (index, pri/rep) combination that has a crowded node.
	need_rebalancing = len(crowded)

	# print is called with a single parenthesised string throughout, which
	# produces the same output under Python 2 and Python 3.
	if need_rebalancing == 0:
		print("HEALTHY: all shards appear to be balanced properly.")
	else:
		print("WARNING: shards appear to be unbalanced across nodes:")
		for label, name, node_list in crowded:
			print("%s %s: %s" % (label, name, ', '.join(node_list)))

	print("")
	# Now check, for each date found in the index names, whether the primary
	# shards of all indices carrying that date are spread evenly over the nodes.

	# date text -> {node name: number of primary shards on that node}
	date_by_nodes = {}
	for name, idx in indices.items():
		if idx.date is None:
			continue
		date_nodes = date_by_nodes.setdefault(idx.date, collections.defaultdict(int))
		for node, count in idx.nodes_p.items():
			date_nodes[node] += count

	# A date counts as unbalanced when the busiest and the quietest node differ
	# by more than one shard.  Comparing every node against the truncated
	# average would also flag layouts such as 3/2/2, which cannot be made any
	# more even.
	# NOTE: only nodes holding at least one primary shard for the date take part
	# in the comparison; a node with none of them is not visible here.
	unbalanced = []
	for date, node_counts in sorted(date_by_nodes.items()):
		if not node_counts:
			# No primary shards were seen for this date: nothing to compare.
			continue
		counts = list(node_counts.values())
		if max(counts) - min(counts) > 1:
			unbalanced.append((date, node_counts))

	need_rebalancing = len(unbalanced)

	if need_rebalancing == 0:
		print("HEALTHY: all dates appear to be balanced properly.")
	else:
		print("WARNING: dates appear to be unbalanced across nodes:")
		# Only dates that passed the non-empty guard above are listed, so the
		# report can no longer divide by zero on a date without primaries.
		for date, node_counts in unbalanced:
			node_list = ["%s (%d)" % (node, count)
						 for node, count in sorted(node_counts.items())]
			print("%s: %s" % (date, ', '.join(node_list)))