Skip to content

Instantly share code, notes, and snippets.

@Kami
Last active December 11, 2017 10:34
Show Gist options
  • Save Kami/5810229 to your computer and use it in GitHub Desktop.
Save Kami/5810229 to your computer and use it in GitHub Desktop.
Turns output of Cassandra nodetool cfhistograms command into pretty graphs (histograms)
#!/usr/bin/env python
# Licensed to Tomaz Muraus under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# Tomaz Muraus licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import sys
from os.path import join as pjoin
from collections import defaultdict
from optparse import OptionParser
try:
    import matplotlib

    # Work without an x server. The backend has to be selected before
    # matplotlib.pyplot is imported; older matplotlib releases ignore a
    # use() call that is made after pyplot has been loaded.
    matplotlib.use('Agg')

    import matplotlib.pyplot as plt
except ImportError:
    raise ImportError('Missing matplotlib dependency. You can install it' +
                      ' using pip:\n pip install matplotlib')

try:
    # When available, prettyplotlib replaces the plain pyplot interface
    import prettyplotlib.pyplot as plt
except ImportError:
    # prettyplotlib dependency is optional
    pass
# Graph title and x axis label for every metric which can be plotted. The keys
# are the metric names produced by parse_and_format_data.
LEGEND = {
    'sstables': {
        'title': 'Cassandra - Number of accessed SSTables per read',
        'x_axis': 'Number of SSTables accessed per read'
    },
    'write_latency': {
        'title': 'Cassandra - Write Latency',
        'x_axis': 'Write latency in microseconds'
    },
    'read_latency': {
        'title': 'Cassandra - Read Latency',
        'x_axis': 'Read latency in microseconds'
    },
    'row_size': {
        'title': 'Cassandra - Row size',
        'x_axis': 'Row size in bytes'
    },
    'column_count': {
        'title': 'Cassandra - Column count',
        'x_axis': 'Column count'
    }
}
# Maps the position of a column in a data row to the name of the metric stored
# in that column. Position 0 holds the offset and is therefore not listed.
INDEX_MAP = {
    1: 'sstables',
    2: 'write_latency',
    3: 'read_latency',
    4: 'row_size',
    5: 'column_count'
}


def parse_and_format_data(data):
    """
    Turn cfhistograms text output into per-metric lists of samples.

    Everything up to and including the header line (the first line which
    contains the word "offset", case-insensitive) is ignored. Every following
    row is expected to contain an integer offset followed by five integer
    counts; a row with only five columns is treated as if the sstables count
    was 0. Rows with fewer columns or with non-integer values are skipped.

    For each non-zero count the row's offset is appended ``count`` times to
    the list of the corresponding metric, so the lists can be handed to a
    histogram function directly.

    :param data: Whole cfhistograms output as a single string.

    :return: ``defaultdict(list)`` mapping a metric name (see ``INDEX_MAP``)
             to a list of offsets. Metrics whose counts are all zero are
             absent.
    """
    result = defaultdict(list)
    seen_header = False

    for line in data.split('\n'):
        if 'offset' in line.lower():
            # Header
            seen_header = True
            continue

        if not seen_header:
            # Ignore any additional lines before the actual header
            continue

        # split() without arguments also discards leading and trailing
        # whitespace, so padded rows don't produce empty tokens
        columns = line.split()

        if len(columns) < 5:
            # bad line
            continue

        if len(columns) == 5:
            # Insert 0 for sstable count
            columns.insert(1, '0')

        try:
            offset = int(columns[0])
            counts = [int(column) for column in columns[1:6]]
        except ValueError:
            # bad line (non-numeric values)
            continue

        for index, count in enumerate(counts, 1):
            if count != 0:
                # "count" observations fell into the bucket "offset"
                result[INDEX_MAP[index]].extend([offset] * count)

    return result
def main():
    """
    Command line entry point: read cfhistograms output and save histograms.

    Reads the file passed via ``--input``, parses it using
    ``parse_and_format_data`` and, for every metric in ``LEGEND`` which has
    data, saves ``<metric>_histogram.png`` into the directory passed via
    ``--output`` (the directory is created if it doesn't exist).

    Exits with status 1 if a required option is missing and with status 2 if
    ``--input`` is not an existing file.
    """
    usage = ('usage: %prog --input=<path to a file with cfhistograms output>'
             ' --output=<path to the output directory>')
    parser = OptionParser(usage=usage)
    parser.add_option('--input', dest='input',
                      help='Path to a file with cfhistograms output')
    parser.add_option('--output', dest='output',
                      help='Path to a directory where the graphs are saved')

    (options, args) = parser.parse_args()

    for name in ('input', 'output'):
        if not getattr(options, name):
            print('Error: Missing "--%s" option' % (name))
            # print_usage() writes the usage itself and returns None, so its
            # result must not be printed
            parser.print_usage()
            sys.exit(1)

    # isfile() is also False for paths which don't exist
    if not os.path.isfile(options.input):
        print('--input argument is not a valid file path')
        sys.exit(2)

    if not os.path.exists(options.output):
        os.makedirs(options.output)

    print('Processing file...')
    with open(options.input, 'r') as fp:
        content = fp.read()

    parsed = parse_and_format_data(content)

    for key, values in LEGEND.items():
        data = parsed.get(key)

        if not data:
            # All the columns are zero, skip it
            print('Skipping %s, all the values are zero...' % (key))
            continue

        output_path = pjoin(options.output, '%s_histogram.png' % (key))
        print('Saving histogram: %s' % (output_path))

        fig = plt.gcf()
        plt.title(values['title'])
        plt.ylabel('Frequency')
        plt.xlabel(values['x_axis'])
        plt.hist(data, facecolor='green', alpha=0.75,
                 range=(min(data), max(data)))
        plt.grid(True)

        # savefig renders the figure on its own; no show() / draw() is needed
        # with the non-interactive backend
        fig.savefig(output_path, dpi=100)

        # Wipe the figure so the next metric doesn't get drawn on top of the
        # histogram which has just been saved
        fig.clf()
# Only run the command line interface when the file is executed as a script,
# not when it is imported as a module
if __name__ == '__main__':
    main()
@cris7M
Copy link

cris7M commented Dec 11, 2017

usage = ('usage: %prog --input='
' --output=')

Please explain this statement to me. Where can I get the cfhistograms file location?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment