Last active
September 13, 2019 03:10
-
-
Save portante/bf99472fc7dddce915ed3d64431cc17e to your computer and use it in GitHub Desktop.
A simple script to generate a report of Elasticsearch shard usage (from _cat/shards?v&bytes=b) to see which shards of an index are not evenly spread across the cluster.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# A script to generate a report of Elasticsearch shard usage | |
# (from _cat/shards?v&bytes=b) to see which shards of an index | |
# are not evenly spread across the cluster, and for a set of | |
# hard-coded date patterns, which date patterns are not spread | |
# evenly across the cluster. | |
# | |
# E.g. | |
# $ curl -X GET http://localhost:9200/_cat/shards?v\&bytes=b -o shards.lis | |
# $ ./are-shards-balanced.py shards.lis | |
import sys | |
import re | |
import collections | |
import operator | |
class Shard(object):
    """One whitespace-split row of `_cat/shards?v&bytes=b` output.

    Column 6 (the node's IP address) is intentionally ignored; only
    the node name (column 7) is kept.
    """

    # (attribute name, column index, converter or None) for each field.
    _FIELDS = (
        ("index", 0, None),
        ("shard", 1, int),
        ("prirep", 2, None),
        ("state", 3, None),
        ("docs", 4, int),
        ("store", 5, int),
        ("node", 7, None),
    )

    def __init__(self, parts):
        for name, col, conv in self._FIELDS:
            value = parts[col]
            setattr(self, name, conv(value) if conv else value)
# Patterns that recognize a trailing date stamp in an index name, so
# indices can later be grouped by date.  In every pattern, group 2 is
# the full date portion of the name.
# NOTE(review): {4,}/{2,} match *at least* that many digits, not exactly
# 4/2 -- presumably {4}/{2} was intended; confirm before tightening,
# since it would change how oddly-named indices are grouped.
dotdate_r = re.compile(r"(.+)(([0-9]{4,})\.([0-9]{2,})\.([0-9]{2,}))$")  # e.g. name-2018.08.20
dashdate_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,})-([0-9]{2,}))$")   # e.g. name-2018-08-20
dashdateym_r = re.compile(r"(.+)(([0-9]{4,})-([0-9]{2,}))$")             # e.g. name-2018-08
numdate_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,})([0-9]{2,}))$")      # e.g. name20180820
numdateym_r = re.compile(r"(.+)(([0-9]{4,})([0-9]{2,}))$")               # e.g. name201808
class Index(object):
    """Aggregated view of all Shard rows belonging to one index."""

    # Date-stamp patterns tried in declaration order; first match wins.
    _patterns = [ dotdate_r, dashdate_r, dashdateym_r, numdate_r, numdateym_r ]

    def __init__(self, shard):
        self.index = shard.index
        self.shards = []
        self.prirep = collections.defaultdict(int)
        self.nodes_p = collections.defaultdict(int)   # primary count per node
        self.nodes_r = collections.defaultdict(int)   # replica count per node
        self.states = collections.defaultdict(int)
        self.docs = 0
        self.store = 0
        self.add(shard)
        # Derive the index's date stamp from the first pattern that hits;
        # None when the name carries no recognizable date.
        candidates = (self._date(regex) for regex in self._patterns)
        self.date = next((stamp for stamp in candidates if stamp), None)

    def add(self, shard):
        """Fold one Shard row into this index's aggregate counters."""
        if self.index != shard.index:
            raise Exception("mismatched index/shard: %s != %s" % (self.index, shard.index))
        self.shards.append(shard)
        self.prirep[shard.prirep] += 1
        counter = {'p': self.nodes_p, 'r': self.nodes_r}.get(shard.prirep)
        if counter is None:
            raise Exception("Logic bomb!")
        counter[shard.node] += 1
        self.states[shard.state] += 1
        self.docs += shard.docs
        self.store += shard.store

    def _date(self, regex):
        """Return the date portion of the index name, or None on no match."""
        hit = regex.match(self.index)
        return hit.group(2) if hit else None
# Parse the saved _cat/shards listing into per-index aggregates.  Sample:
#   index                    shard prirep state   docs   store    ip           node
#   metaxor-stage-2018.08.20 0     p      STARTED 309761 85870742 10.16.153.15 mayo-gprfs006
indices = {}
with open(sys.argv[1], "r") as fp:
    fp.readline()  # discard the column-header line
    for line in fp:
        # str.split() already discards the trailing newline; the old
        # line[:-1] chopped the final character of a last line that had
        # no newline.  Skip blank lines instead of crashing on them.
        parts = line.split()
        if not parts:
            continue
        shd = Shard(parts)
        if shd.index in indices:
            indices[shd.index].add(shd)
        else:
            indices[shd.index] = Index(shd)
def _overloaded(node_counts):
    """Return ["node (count)", ...] for every node holding more than one
    shard of the same index; empty list means the index is balanced."""
    return ["%s (%d)" % (node, count)
            for node, count in node_counts.items() if count > 1]

# Report indices whose primary or replica shards double up on a node.
# An index counts twice if both its primaries and replicas are skewed.
# (print with a single parenthesized argument is valid Python 2 and 3,
# replacing the Python-2-only print statements.)
need_rebalancing = 0
for name, idx in sorted(indices.items()):
    if _overloaded(idx.nodes_p):
        need_rebalancing += 1
    if _overloaded(idx.nodes_r):
        need_rebalancing += 1
if need_rebalancing == 0:
    print("HEALTHY: all shards appear to be balanced properly.")
else:
    print("WARNING: shards appear to be unbalanced across nodes:")
    for name, idx in sorted(indices.items()):
        pri = _overloaded(idx.nodes_p)
        if pri:
            print("pri %s: %s" % (name, ', '.join(pri)))
        rep = _overloaded(idx.nodes_r)
        if rep:
            print("rep %s: %s" % (name, ', '.join(rep)))
print("")
# Now group the primary-shard counts of every dated index by its date
# stamp and check that each date's shards are spread evenly across the
# nodes.  (Only primaries are considered here; replicas are ignored.)
date_by_nodes = {}
for idx in indices.values():
    if idx.date is None:
        continue  # index name carries no recognizable date stamp
    date_nodes = date_by_nodes.setdefault(idx.date, collections.defaultdict(int))
    for node, count in idx.nodes_p.items():
        date_nodes[node] += count

def _expected_per_node(node_counts):
    """Truncated average shard count per node; 0 for an empty map.

    The empty-map guard fixes a ZeroDivisionError the original hit in
    the printing loop, which lacked the counting loop's emptiness check.
    """
    if not node_counts:
        return 0
    return int(sum(node_counts.values()) / float(len(node_counts)))

# A node whose count differs from the per-node average flags the date.
need_rebalancing = 0
for date, node_counts in date_by_nodes.items():
    per_node = _expected_per_node(node_counts)
    if per_node > 0:
        need_rebalancing += sum(
            1 for count in node_counts.values() if count != per_node)
if need_rebalancing == 0:
    print("HEALTHY: all dates appear to be balanced properly.")
else:
    print("WARNING: dates appear to be unbalanced across nodes:")
    for date, node_counts in date_by_nodes.items():
        per_node = _expected_per_node(node_counts)
        if per_node > 0 and any(count != per_node
                                for count in node_counts.values()):
            node_list = ["%s (%d)" % (node, count)
                         for node, count in node_counts.items()]
            print("%s: %s" % (date, ', '.join(node_list)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment