Skip to content

Instantly share code, notes, and snippets.

@portante
Last active September 13, 2019 03:10
Show Gist options
  • Save portante/bf99472fc7dddce915ed3d64431cc17e to your computer and use it in GitHub Desktop.
Save portante/bf99472fc7dddce915ed3d64431cc17e to your computer and use it in GitHub Desktop.
A simple script to generate a report of Elasticsearch shard usage (from _cat/shards/?v&bytes=b) to see which shards of an index are not evenly spread across the cluster.
#!/usr/bin/env python2
# A script to generate a report of Elasticsearch shard usage
# (from _cat/shards?v&bytes=b) to see which shards of an index
# are not evenly spread across the cluster, and for a set of
# hard-coded date patterns, which date patterns are not spread
# evenly across the cluster.
#
# E.g.
# $ curl -X GET http://localhost:9200/_cat/shards?v\&bytes=b -o shards.lis
# $ ./are-shards-balanced.py shards.lis
import sys
import re
import collections
import operator
class Shard(object):
    """One row of ``_cat/shards`` output, already split on whitespace.

    ``parts`` is read positionally: 0 = index, 1 = shard, 2 = prirep,
    3 = state, 4 = docs, 5 = store, 7 = node.  Column 6 is not kept.
    The shard number, docs and store columns are converted to ``int``;
    everything else is stored as the string found in the row.
    """

    def __init__(self, parts):
        # Columns are consumed strictly left to right, so a short or
        # malformed row fails on the first column that is missing or is
        # not a number.
        self.index, self.shard = parts[0], int(parts[1])
        self.prirep, self.state = parts[2], parts[3]
        self.docs, self.store = int(parts[4]), int(parts[5])
        self.node = parts[7]
# Patterns that recognise a date suffix at the very end of an index name.
# Every pattern requires at least one leading character (group 1), and
# group 2 is the complete date suffix -- the value Index._date() returns.
# The digit runs are lower bounds only ("{4,}" / "{2,}"), so longer runs of
# digits are accepted as well.
#
# <prefix>YYYY.MM.DD
dotdate_r = re.compile(r"(.+)(([0-9]{4,})\.([0-9]{2,})\.([0-9]{2,}))$")
# <prefix>YYYY-MM-DD
dashdate_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,})-([0-9]{2,}))$")
# <prefix>YYYY-MM
dashdateym_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,}))$")
# <prefix>YYYYMMDD -- note: matches any name that ends in 8 or more digits.
numdate_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,})([0-9]{2,}))$")
# <prefix>YYYYMM -- note: matches any name that ends in 6 or more digits.
numdateym_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,}))$")
class Index(object):
    """Running totals for every shard row that belongs to one index.

    Tracks the shard objects themselves, how many rows were primaries
    versus replicas, how many primaries and replicas sit on each node,
    how many rows were in each state, and the summed docs / store values.
    ``date`` is the date suffix of the index name as recognised by the
    first matching entry of ``_patterns``, or ``None`` when none match.
    """

    # Tried in this order; the first pattern that yields a date wins.
    _patterns = [dotdate_r, dashdate_r, dashdateym_r, numdate_r, numdateym_r]

    def __init__(self, shard):
        """Create the aggregate from its first shard row."""
        self.index = shard.index
        self.shards = []
        self.prirep = collections.defaultdict(int)
        self.nodes_p = collections.defaultdict(int)
        self.nodes_r = collections.defaultdict(int)
        self.states = collections.defaultdict(int)
        self.docs = self.store = 0
        self.add(shard)
        self.date = None
        for regex in self._patterns:
            found = self._date(regex)
            if found:
                self.date = found
                break

    def add(self, shard):
        """Fold one more shard row into the totals.

        Raises Exception when the row belongs to a different index, or
        when its prirep column is neither 'p' nor 'r'.
        """
        if self.index != shard.index:
            raise Exception("mismatched index/shard: %s != %s" % (self.index, shard.index))
        self.shards.append(shard)
        kind = shard.prirep
        self.prirep[kind] += 1
        # Pick the per-node counter that matches the shard's role.
        per_node = {'p': self.nodes_p, 'r': self.nodes_r}.get(kind)
        if per_node is None:
            raise Exception("Logic bomb!")
        per_node[shard.node] += 1
        self.states[shard.state] += 1
        self.docs += shard.docs
        self.store += shard.store

    def _date(self, pat):
        """Return group 2 of ``pat`` matched against the index name, else None."""
        hit = pat.match(self.index)
        return hit.group(2) if hit is not None else None
# Expected input is the output of "_cat/shards?v&bytes=b", one shard per row:
# index                    shard prirep state   docs   store    ip           node
# metaxor-stage-2018.08.20 0     p      STARTED 309761 85870742 10.16.153.15 mayo-gprfs006

# Shard reads the node name from column index 7, so a usable row needs at
# least this many whitespace-separated columns.
_MIN_COLUMNS = 8


def load_indices(path):
    """Parse a saved ``_cat/shards`` listing into ``{index name: Index}``.

    Blank lines and the ``index shard ...`` header row are ignored.  Rows
    with fewer than 8 columns (for example shards that have no node yet)
    cannot be attributed to a node and are skipped instead of crashing.

    Returns a ``(indices, skipped)`` tuple, where ``skipped`` is the number
    of short rows that were ignored.
    """
    indices = {}
    skipped = 0
    with open(path, "r") as fp:
        for line in fp:
            # split() already discards the trailing newline; slicing the
            # last character off would eat part of the node name when the
            # final line has no newline.
            parts = line.split()
            if not parts or parts[:2] == ["index", "shard"]:
                continue
            if len(parts) < _MIN_COLUMNS:
                skipped += 1
                continue
            shd = Shard(parts)
            if shd.index not in indices:
                indices[shd.index] = Index(shd)
            else:
                indices[shd.index].add(shd)
    return indices, skipped


def _crowded(node_counts):
    """Return sorted "node (count)" labels for nodes holding more than one shard."""
    return ["%s (%d)" % (node, node_counts[node])
            for node in sorted(node_counts) if node_counts[node] > 1]


def shard_balance_report(indices):
    """Return the report lines for the per-index shard check.

    An index is flagged when any single node holds more than one of its
    primaries ("pri" line) or more than one of its replicas ("rep" line).
    """
    findings = []
    for name in sorted(indices):
        idx = indices[name]
        for label, node_counts in (("pri", idx.nodes_p), ("rep", idx.nodes_r)):
            crowded = _crowded(node_counts)
            if crowded:
                findings.append("%s %s: %s" % (label, name, ', '.join(crowded)))
    if not findings:
        return ["HEALTHY: all shards appear to be balanced properly."]
    return ["WARNING: shards appear to be unbalanced across nodes:"] + findings


def _primaries_by_date(indices):
    """Sum primary-shard counts per node for every recognised index date."""
    by_date = {}
    for idx in indices.values():
        if idx.date is None:
            continue
        per_node = by_date.setdefault(idx.date, collections.defaultdict(int))
        for node, count in idx.nodes_p.items():
            per_node[node] += count
    return by_date


def _is_uneven(node_counts):
    """Return True when the counts are not as even as they could be.

    Only nodes present in ``node_counts`` are considered.  When the total
    does not divide evenly, some nodes must carry one extra shard, so any
    count between the floor and the ceiling of the average is accepted.
    """
    if not node_counts:
        return False
    floor, extra = divmod(sum(node_counts.values()), len(node_counts))
    ceiling = floor + (1 if extra else 0)
    return any(count < floor or count > ceiling
               for count in node_counts.values())


def date_balance_report(indices):
    """Return the report lines for the per-date primary-shard check.

    Primaries of all indices that share a date suffix are pooled, and a
    date is flagged when those primaries are not spread evenly over the
    nodes that hold them.
    """
    findings = []
    by_date = _primaries_by_date(indices)
    for date in sorted(by_date):
        node_counts = by_date[date]
        if _is_uneven(node_counts):
            labels = ["%s (%d)" % (node, node_counts[node])
                      for node in sorted(node_counts)]
            findings.append("%s: %s" % (date, ', '.join(labels)))
    if not findings:
        return ["HEALTHY: all dates appear to be balanced properly."]
    return ["WARNING: dates appear to be unbalanced across nodes:"] + findings


def main(argv):
    """Print both balance reports for the shard listing named in ``argv[1]``."""
    if len(argv) < 2:
        sys.exit("usage: %s <shards-file>"
                 % (argv[0] if argv else "are-shards-balanced.py"))
    indices, skipped = load_indices(argv[1])
    if skipped:
        # Diagnostics go to stderr so the report on stdout stays clean.
        sys.stderr.write("NOTE: skipped %d row(s) with fewer than %d columns"
                         " (e.g. UNASSIGNED shards)\n" % (skipped, _MIN_COLUMNS))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for line in shard_balance_report(indices):
        print(line)
    print("")
    # Now see whether, for each date, the primaries are balanced.
    for line in date_balance_report(indices):
        print(line)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment