Skip to content

Instantly share code, notes, and snippets.

@irvifa
Created February 7, 2018 05:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save irvifa/35ab729a6a91ae003720897fcf2e35c8 to your computer and use it in GitHub Desktop.
Save irvifa/35ab729a6a91ae003720897fcf2e35c8 to your computer and use it in GitHub Desktop.
datadog custom metrics
import os
import pwd
import resource
import subprocess
import time

from datadog import api, initialize, statsd
class OpenFileCheck(object):
    """Send file-handle and open-file-limit gauges to Datadog through DogStatsD.

    ``check`` emits three groups of gauges:

    * system-wide file-handle counters read from ``/proc/sys/fs/file-nr``;
    * the number of processes matched by ``pgrep -f`` and the min/max of
      their soft and hard "open files" limits from ``/proc/<pid>/limits``;
    * the number of entries found under every readable ``/proc/<pid>/fd``.

    Failures are reported as Datadog events via ``fail_event`` and
    ``fail_with_exception_event``.  The code avoids syntax that is specific to
    either Python 2 or Python 3.
    """

    def __init__(self):
        # Host of the DogStatsD agent; None when the variable is not set.
        self.DATADOG_AGENT_HOST_IP = os.getenv("DATADOG_AGENT_HOST_IP")
        # Read from the environment but not used anywhere in this class.
        self.MY_POD_IP = os.getenv("MY_POD_IP")

    def check(self, instance, port):
        """Collect every metric once and send it to the DogStatsD agent.

        Args:
            instance: mapping with a required ``process_name_pattern`` key
                (passed to ``pgrep -f``) and an optional ``alias`` key used as
                the ``process_name`` tag instead of the pattern.
            port: DogStatsD port handed to ``datadog.initialize``.

        Any exception raised while collecting is reported as a failure event
        that includes the instance configuration; it is not re-raised.
        """
        initialize(statsd_host=self.DATADOG_AGENT_HOST_IP, statsd_port=port)
        try:
            # TODO LOW move this to another custom check
            self._report_system_file_handles()

            proc_name = instance['process_name_pattern']
            name_tag = instance.get('alias', proc_name)
            self._report_process_limits(proc_name, name_tag)

            # Sending this metric to help setting up alerts.
            statsd.gauge(
                'custom.system.open_file.ddagent_count',
                self._count_open_fds()
            )
        except Exception as exc:
            self.fail_with_exception_event(
                '[ERROR] config: ' + str(instance),
                exc
            )

    def _report_system_file_handles(self):
        """Emit the three counters of ``/proc/sys/fs/file-nr`` as gauges."""
        try:
            allocated, unused, maximum = self._read_file_handle_stats()
        except (IOError, OSError, ValueError, IndexError):
            self.fail_event('Cannot extract system file handles stats')
            return
        statsd.gauge('custom.system.fs.allocated_fh', allocated)
        statsd.gauge('custom.system.fs.allocated_unused_fh', unused)
        statsd.gauge('custom.system.fs.max_fh', maximum)

    def _report_process_limits(self, proc_name, name_tag):
        """Emit the process count and open-file limit gauges for ``proc_name``."""
        pids, pgrep_err = self._find_pids(proc_name)
        if pgrep_err:  # pgrep wrote something to stderr
            self.fail_with_exception_event(
                '[ERROR] process name: ' + proc_name,
                pgrep_err
            )

        limits = []
        for pid in pids:
            try:
                pid_limits = self._read_open_file_limits(pid)
            except (ValueError, IndexError):
                # The limits line exists but is not two integers
                # (for example a limit spelled "unlimited").
                self.fail_event(
                    '[ERROR] failed to get the process information, '
                    'pid: ' + str(pid) + ', process name: ' + proc_name
                )
                continue
            if pid_limits is not None:
                limits.append(pid_limits)

        tags = ['process_name:' + name_tag]
        statsd.gauge('custom.system.process.count', len(limits), tags=tags)

        summary = self._summarize_limits(limits)
        if summary is None:
            # No process was measured: there is no limit value to report.
            return
        min_soft, max_soft, min_hard, max_hard = summary
        statsd.gauge(
            'custom.system.open_file.soft_limit_min', min_soft, tags=tags)
        statsd.gauge(
            'custom.system.open_file.soft_limit_max', max_soft, tags=tags)
        statsd.gauge(
            'custom.system.open_file.hard_limit_min', min_hard, tags=tags)
        statsd.gauge(
            'custom.system.open_file.hard_limit_max', max_hard, tags=tags)

    @staticmethod
    def _read_file_handle_stats(path='/proc/sys/fs/file-nr'):
        """Return the first three whitespace-separated fields of ``path`` as floats.

        Raises IOError/OSError when the file cannot be read, and
        ValueError/IndexError when it does not hold three numbers.
        """
        with open(path, 'r') as file_handle:
            fields = file_handle.read().split()
        return float(fields[0]), float(fields[1]), float(fields[2])

    @staticmethod
    def _find_pids(proc_name):
        """Run ``pgrep -f proc_name``; return ``(list_of_pid_strings, stderr_text)``."""
        process = subprocess.Popen(
            ['pgrep', '-f', proc_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # Text output on both Python 2 and 3, so it can be split as str.
            universal_newlines=True
        )
        out, err = process.communicate()
        return out.split(), err

    @staticmethod
    def _parse_open_file_limits(limits_line):
        """Return ``(soft, hard)`` parsed from an "open files" limits line.

        The fourth and fifth whitespace-separated fields are converted with
        ``int``; ValueError or IndexError is raised when that is not possible.
        """
        fields = limits_line.split()
        return int(fields[3]), int(fields[4])

    @staticmethod
    def _read_open_file_limits(pid, proc_root='/proc'):
        """Return ``(soft, hard)`` open-file limits for ``pid``, or None.

        None is returned when ``<proc_root>/<pid>/limits`` cannot be opened
        (presumably because the process exited after pgrep listed it) or when
        it contains no line mentioning "open files".
        """
        limits_path = os.path.join(proc_root, str(pid), 'limits')
        try:
            with open(limits_path, 'r') as limits_file:
                for line in limits_file:
                    if 'open files' in line:
                        return OpenFileCheck._parse_open_file_limits(line)
        except (IOError, OSError):
            return None
        return None

    @staticmethod
    def _summarize_limits(limits):
        """Return ``(min_soft, max_soft, min_hard, max_hard)`` or None if empty.

        ``limits`` is a sequence of ``(soft, hard)`` pairs.
        """
        if not limits:
            return None
        soft_limits = [soft for soft, _ in limits]
        hard_limits = [hard for _, hard in limits]
        return (
            min(soft_limits), max(soft_limits),
            min(hard_limits), max(hard_limits),
        )

    @staticmethod
    def _count_open_fds(proc_root='/proc'):
        """Count the entries of every listable ``<proc_root>/<digits>/fd`` directory.

        Directories that cannot be listed (missing, or permission denied) are
        skipped rather than treated as errors.
        """
        total = 0
        for entry in os.listdir(proc_root):
            if not entry.isdigit():
                continue
            try:
                total += len(os.listdir(os.path.join(proc_root, entry, 'fd')))
            except OSError:
                continue
        return total

    @staticmethod
    def fail_with_exception_event(message, exception):
        """Create a Datadog 'failure' event carrying ``message`` and ``exception``.

        The event text is the message, the exception and the current Unix
        timestamp separated by spaces.
        """
        api.Event.create(
            title='failure',
            text='{} {} {}'.format(message, exception, int(time.time())),
            # Passed as a list: the events API takes a list of tags.
            tags=['Caught exception']
        )

    @staticmethod
    def fail_event(message):
        """Create a Datadog 'failure' event carrying ``message`` and a timestamp."""
        api.Event.create(
            title='failure',
            text='{} {}'.format(message, int(time.time()))
        )
if __name__ == '__main__':
    # Script entry point: run one check for processes matching "jetty",
    # tagged as "java", against DogStatsD on port 8125.
    # The parenthesised print is valid on both Python 2 and Python 3.
    print("running..")
    port = 8125
    instances = dict(
        process_name_pattern='jetty',
        alias='java'
    )
    openfiles_metrics = OpenFileCheck()
    openfiles_metrics.check(instances, port)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment