Skip to content

Instantly share code, notes, and snippets.

@brockpalen
Last active March 29, 2017 20:40
Show Gist options
  • Save brockpalen/b4e59efb94ade080d17d to your computer and use it in GitHub Desktop.
Save brockpalen/b4e59efb94ade080d17d to your computer and use it in GitHub Desktop.
Lustre Logstash Files
#!/usr/bin/python
#Brock Palen
# brockp@umich.edu
#
'''
Takes data in the form of:
metric number
snapshot_time 1396141904.951010 secs.usecs
open 28540765918 samples [reqs]
close 10256936166 samples [reqs]
mknod 30061 samples [reqs]
and creates a json version:
metric: number
'''
import sys
try: #rhel5 doesn't have json
import json
except ImportError:
import simplejson as json
def dictify(filename):
    """Print the contents of one Lustre stats file as a single JSON line.

    Each non-empty line of the file is split on whitespace.  The first word
    is used as the key.  The value is the last word when that word is all
    digits (OST-style lines such as
    ``read_bytes 100121201 samples [bytes] 0 1048576 54023523712987``),
    otherwise the second word (``open 28540765918 samples [reqs]``).
    Values are kept as strings.  The file name is stored under ``source``.

    Lines with fewer than two words (blank lines included) are skipped
    instead of raising IndexError.

    filename -- path of the stats file to read.

    Writes the JSON object followed by a newline to stdout.  If the file
    cannot be opened, a message is written to stderr and the process exits
    with status -1.
    """
    try:
        f = open(filename, 'r')
    except (IOError, OSError):
        sys.stderr.write("failed to open " + filename + "\n")
        sys.exit(-1)
    data = {'source': f.name}
    # try/finally instead of "with": the script targets old Pythons (see the
    # simplejson fallback) that lack the with statement.
    try:
        # read the structure a line at a time and build a dict out of it
        for line in f:
            words = line.split()
            if len(words) < 2:
                # nothing usable on this line (blank or key without a value)
                continue
            # OSTs use formats where the last number is the one you want
            if words[-1].isdigit():
                data[words[0]] = words[-1]
            else:
                data[words[0]] = words[1]
    finally:
        f.close()
    # sys.stdout.write behaves the same on Python 2 and 3, unlike "print x"
    sys.stdout.write(json.JSONEncoder().encode(data) + "\n")
# Emit one JSON line for every stats file named on the command line.
for filename in sys.argv[1:]:
    dictify(filename)
#collection of loggrabs for lustre nodes
#Brock Palen brockp@umich.edu
#Inputs for this config: one stdin stream plus two exec collectors that
#periodically run the json-stats-wrapper.py helper over Lustre MDT stats files.
input {
#metadata server inputs, gets size of filesystem, number of files etc
#events read here are tagged "lustre-stdin"; the filter section groks their message
stdin {
type => "lustre-stdin"
}
exec {
#these are the per client stats, there are lots, so if things get overloaded this is the first place to look
#one stats file per export under each MDT; the helper's output is decoded with the json_lines codec
#interval is 120 (seconds, per the Logstash exec input documentation)
type => "lustre-client-stats"
command => "/root/logstash.git/helpers/json-stats-wrapper.py /proc/fs/lustre/mdt/*/exports/*/stats"
codec => json_lines
interval => 120
}
exec {
#these are the per MDT/OST stats, these are probably the most interesting
#reads the md_stats file of each MDT, polled more often (interval 10) than the per client stats
type => "lustre-server-stats"
command => "/root/logstash.git/helpers/json-stats-wrapper.py /proc/fs/lustre/mdt/*/md_stats"
codec => json_lines
interval => 10
}
}
#Filters: extract fsname, ltype (MDT or OST) and ldevid (plus client for the
#per client stats) so the output section can build graphite metric paths.
filter {
if [type] == "lustre-stdin" {
grok {
#grab the filesystem name
#also captures ltype, ldevid, metric and count from a message shaped like ".../<fsname>-<MDT|OST><id>.../<metric> <count>"
match => [ "message", "(mdt|obdfilter)/%{WORD:fsname}-(?<ltype>(MDT|OST))%{BASE16NUM:ldevid}%{GREEDYDATA}/%{WORD:metric} %{NUMBER:count}"]
}
}
if [type] == "lustre-client-stats" or [type] == "lustre-server-stats" {
#example of the fields produced by the helper script (shown as a Python dict, all values are strings):
#{'rename': '6703220', 'sync': '7016939', 'llog_init': '30', 'mknod': '30071', 'connect': '2266', 'reconnect': '780', 'close': '10260863572', 'open': '28550349796', 'disconnect': '2109', 'create': '3186', 'quotactl': '4609', 'mkdir': '24263000', 'source': '/proc/fs/lustre/mds/scratch-MDT0000/stats', 'getattr': '4903414744', 'rmdir': '17575276', 'destroy': '3147', 'snapshot_time': '1396145902.593900', 'getxattr': '5825862', 'link': '4331053', 'unlink': '208715050', 'process_config': '2', 'setattr': '284966835', 'statfs': '222', 'notify': '163'}
#decode the JSON held in the message field and drop the command and snapshot_time fields
json{
source => "message"
remove_field => [ "command", "snapshot_time" ]
}
if[type] == "lustre-client-stats" {
#the source field is the stats file path; pull fsname, ltype, ldevid and the client IP out of it
grok {
match => [ "source", "(mdt|obdfilter)/%{WORD:fsname}-(?<ltype>(MDT|OST))%{BASE16NUM:ldevid}/exports/%{IP:client}"]
}
#clients are identified by ip@nid eg 10.255.255.1@o2ib
#graphite doesn't like all the dots and makes the splits, thus making grouping hard
#change them! Be best if we could just do a lookup on them oh well
#replaces every "." in the client field with "-"
mutate {
gsub => [
"client", "\.", "-"
]
}
} else if [type] == "lustre-server-stats" {
#same path parsing as above, without the exports/<client> part
grok {
match => [ "source", "(mdt|obdfilter)/%{WORD:fsname}-(?<ltype>(MDT|OST))%{BASE16NUM:ldevid}"]
}
}
}
}
#Outputs: send the metadata operation counters to graphite, one metric path
#per counter, built from the fields extracted in the filter section.
output {
if [type] == "lustre-client-stats" {
#stdout { codec => rubydebug }
graphite{
host => "GRAPHITE.SERVER.EDU"
#per client metric paths include the (dash separated) client address, eg:
# #lustre.scratch.MDT.0000.client.open
# #lustre.scratch.OST.00a1.client.*
#the list alternates metric path, value
metrics => [
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.open", "%{open}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.close", "%{close}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.rename", "%{rename}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.samedir_rename", "%{samedir_rename}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.crossdir_rename", "%{crossdir_rename}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.sync", "%{sync}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.mknod", "%{mknod}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.mkdir", "%{mkdir}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.rmdir", "%{rmdir}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.getattr", "%{getattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.setattr", "%{setattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.getxattr", "%{getxattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.setxattr", "%{setxattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.link", "%{link}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.unlink", "%{unlink}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.statfs", "%{statfs}"
]
}
}
if [type] == "lustre-server-stats" {
#stdout { codec => rubydebug }
graphite{
host => "GRAPHITE.SERVER.EDU"
#per server metric paths have no client component, eg:
# #lustre.scratch.MDT.0000.open
# #lustre.scratch.OST.00a1.*
metrics => [
"lustre.%{fsname}.%{ltype}.%{ldevid}.open", "%{open}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.close", "%{close}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.rename", "%{rename}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.samedir_rename", "%{samedir_rename}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.crossdir_rename", "%{crossdir_rename}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.sync", "%{sync}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.mknod", "%{mknod}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.mkdir", "%{mkdir}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.rmdir", "%{rmdir}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.getattr", "%{getattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.setattr", "%{setattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.getxattr", "%{getxattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.setxattr", "%{setxattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.link", "%{link}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.unlink", "%{unlink}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.statfs", "%{statfs}"
]
}
}
}
#collection of loggrabs for lustre nodes
#Brock Palen brockp@umich.edu
#Inputs for this config: one stdin stream plus two exec collectors that
#periodically run the json-stats-wrapper.py helper over Lustre obdfilter stats files.
input {
#NOTE(review): the original comment here read "metadata server inputs, gets size of filesystem, number of files etc";
#the exec inputs below read obdfilter stats, so it looks carried over from the MDT config - confirm what is piped to stdin
#events read here are tagged "lustre-stdin"; the filter section groks their message
stdin {
type => "lustre-stdin"
}
exec {
#these are the per client stats, there are lots, so if things get overloaded this is the first place to look
#one stats file per export under each obdfilter target; the helper's output is decoded with the json_lines codec
#interval is 120 (seconds, per the Logstash exec input documentation)
type => "lustre-client-stats"
command => "/root/logstash.git/helpers/json-stats-wrapper.py /proc/fs/lustre/obdfilter/*/exports/*/stats"
codec => json_lines
interval => 120
}
exec {
#these are the per MDT/OST stats, these are probably the most interesting
#reads the stats file of each obdfilter target, polled more often (interval 10) than the per client stats
type => "lustre-server-stats"
command => "/root/logstash.git/helpers/json-stats-wrapper.py /proc/fs/lustre/obdfilter/*/stats"
codec => json_lines
interval => 10
}
}
#Filters: extract fsname, ltype (MDT or OST) and ldevid (plus client for the
#per client stats) so the output section can build graphite metric paths.
#The path patterns accept "mds" or "obdfilter"; the exec inputs of this config read obdfilter paths.
filter {
if [type] == "lustre-stdin" {
grok {
#grab the filesystem name
#also captures ltype, ldevid, metric and count from a message shaped like ".../<fsname>-<MDT|OST><id>.../<metric> <count>"
match => [ "message", "(mds|obdfilter)/%{WORD:fsname}-(?<ltype>(MDT|OST))%{BASE16NUM:ldevid}%{GREEDYDATA}/%{WORD:metric} %{NUMBER:count}"]
}
}
if [type] == "lustre-client-stats" or [type] == "lustre-server-stats" {
#example of the fields produced by the helper script (shown as a Python dict, all values are strings);
#this sample was taken from an mds stats file, not an obdfilter one:
#{'rename': '6703220', 'sync': '7016939', 'llog_init': '30', 'mknod': '30071', 'connect': '2266', 'reconnect': '780', 'close': '10260863572', 'open': '28550349796', 'disconnect': '2109', 'create': '3186', 'quotactl': '4609', 'mkdir': '24263000', 'source': '/proc/fs/lustre/mds/scratch-MDT0000/stats', 'getattr': '4903414744', 'rmdir': '17575276', 'destroy': '3147', 'snapshot_time': '1396145902.593900', 'getxattr': '5825862', 'link': '4331053', 'unlink': '208715050', 'process_config': '2', 'setattr': '284966835', 'statfs': '222', 'notify': '163'}
#decode the JSON held in the message field and drop the command and snapshot_time fields
json{
source => "message"
remove_field => [ "command", "snapshot_time" ]
}
if[type] == "lustre-client-stats" {
#the source field is the stats file path; pull fsname, ltype, ldevid and the client IP out of it
grok {
match => [ "source", "(mds|obdfilter)/%{WORD:fsname}-(?<ltype>(MDT|OST))%{BASE16NUM:ldevid}/exports/%{IP:client}"]
}
#clients are identified by ip@nid eg 10.255.255.1@o2ib
#graphite doesn't like all the dots and makes the splits, thus making grouping hard
#change them! Be best if we could just do a lookup on them oh well
#replaces every "." in the client field with "-"
mutate {
gsub => [
"client", "\.", "-"
]
}
} else if [type] == "lustre-server-stats" {
#same path parsing as above, without the exports/<client> part
grok {
match => [ "source", "(mds|obdfilter)/%{WORD:fsname}-(?<ltype>(MDT|OST))%{BASE16NUM:ldevid}"]
}
}
}
}
#Outputs: send the object-storage counters to graphite, one metric path per
#counter, built from the fields extracted in the filter section.
output {
#stdout { codec => rubydebug }
if [type] == "lustre-client-stats" {
#stdout { codec => rubydebug }
graphite{
host => "GRAPHITE.SERVER.EDU"
#per client metric paths include the (dash separated) client address, eg:
# #lustre.scratch.MDT.0000.client.open
# #lustre.scratch.OST.00a1.client.*
#only sync, write_bytes, read_bytes and ping are sent per client; the other counters are left commented out
metrics => [
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.sync", "%{sync}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.write_bytes", "%{write_bytes}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.read_bytes", "%{read_bytes}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.get_info", "%{get_info}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.set_info_async", "%{set_info_async}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.process_config", "%{process_config}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.connect", "%{connect}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.reconnect", "%{reconnect}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.disconnect", "%{disconnect}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.statfs", "%{statfs}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.create", "%{create}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.destroy", "%{destroy}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.setattr", "%{setattr}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.punch", "%{punch}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.preprw", "%{preprw}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.commitrw", "%{commitrw}",
#"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.quotactl", "%{quotactl}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.%{client}.ping", "%{ping}"
]
}
}
if [type] == "lustre-server-stats" {
#stdout { codec => rubydebug }
graphite{
host => "GRAPHITE.SERVER.EDU"
#per server metric paths have no client component, eg:
# #lustre.scratch.MDT.0000.open
# #lustre.scratch.OST.00a1.*
#the full counter list is sent for the per server stats
metrics => [
"lustre.%{fsname}.%{ltype}.%{ldevid}.sync", "%{sync}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.write_bytes", "%{write_bytes}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.read_bytes", "%{read_bytes}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.get_info", "%{get_info}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.set_info_async", "%{set_info_async}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.process_config", "%{process_config}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.connect", "%{connect}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.reconnect", "%{reconnect}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.disconnect", "%{disconnect}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.statfs", "%{statfs}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.create", "%{create}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.destroy", "%{destroy}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.setattr", "%{setattr}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.punch", "%{punch}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.preprw", "%{preprw}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.commitrw", "%{commitrw}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.quotactl", "%{quotactl}",
"lustre.%{fsname}.%{ltype}.%{ldevid}.ping", "%{ping}"
]
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment