Skip to content

Instantly share code, notes, and snippets.

@zircote zircote/aurora.py
Last active Jul 4, 2017

Embed
What would you like to do?
WIP DataDog check for Apache Aurora
"""
Aurora Scheduler check
Collects metrics from aurora scheduler.
"""
import requests
from checks import AgentCheck, CheckException
class AuroraCheck(AgentCheck):
    """Agent check that collects metrics from an Aurora scheduler.

    Each run fetches ``<url>/vars.json`` from the configured instance, reports
    the ``aurora_master.can_connect`` service check and submits every metric
    listed in the tables below that is present in the returned document.
    """

    # AgentCheck submission methods, looked up on the class; ``check`` invokes
    # them as ``metric_func(self, name, value, tags=...)``.
    GAUGE = AgentCheck.gauge
    MONOTONIC_COUNT = AgentCheck.monotonic_count
    COUNT = AgentCheck.count

    SERVICE_CHECK_NAME = "aurora_master.can_connect"

    # True while the next ``_get_json`` call should report the service check
    # even when the status is OK; cleared by ``_get_json`` and re-armed by
    # ``check`` after every run.
    service_check_needed = True

    # Default until the first successful ``_get_json`` call decides it.
    is_leader = False

    # Each table maps a key of the vars.json document to a
    # ``(datadog_metric_name, submission_method)`` pair.

    # Submitted for every scheduler instance.
    SYSTEM_METRICS = {
        "jvm_uptime_secs": ('aurora.scheduler.jvm_uptime_secs', MONOTONIC_COUNT),
        "system_load_avg": ('aurora.scheduler.system_load_avg', GAUGE),
        "http_500_responses_events": ('aurora.scheduler.http_500_responses_events', MONOTONIC_COUNT),
        "system_env_SHLVL": ('aurora.scheduler.system_env_SHLVL', GAUGE),
        "system_free_physical_memory_mb": ('aurora.scheduler.system_free_physical_memory_mb', GAUGE),
        "system_free_swap_mb": ('aurora.scheduler.system_free_swap_mb', GAUGE),
    }

    # The remaining tables are submitted only when ``is_leader`` is true.
    LEADER_METRICS = {
        "process_cpu_cores_utilized": ('aurora.scheduler.process_cpu_cores_utilized', GAUGE),
        # NOTE(review): "task_store_LOST" is also listed in TASK_STORE_METRICS
        # as MONOTONIC_COUNT, so on a leader the same metric name is submitted
        # twice with different types -- confirm which type is intended.
        "task_store_LOST": ('aurora.scheduler.task_store_LOST', GAUGE),
        "scheduler_resource_offers": ('aurora.scheduler.scheduler_resource_offers', MONOTONIC_COUNT),
        "framework_registered": ('aurora.scheduler.framework_registered', COUNT),
        "scheduler_log_native_append_nanos_total": (
            'aurora.scheduler.scheduler_log_native_append_nanos_total', MONOTONIC_COUNT),
        "scheduler_log_native_append_events": ('aurora.scheduler.scheduler_log_native_append_events', MONOTONIC_COUNT),
        "timed_out_tasks": ('aurora.scheduler.timed_out_tasks', MONOTONIC_COUNT),
    }
    CRON_METRICS = {
        "cron_job_collisions": ("aurora.scheduler.cron_job_collisions", MONOTONIC_COUNT),
        "cron_job_launch_failures": ("aurora.scheduler.cron_job_launch_failures", GAUGE),
        "cron_job_misfires": ("aurora.scheduler.cron_job_misfires", GAUGE),
        "cron_job_parse_failures": ("aurora.scheduler.cron_job_parse_failures", GAUGE),
        "cron_job_triggers": ("aurora.scheduler.cron_job_triggers", GAUGE),
        "cron_jobs_loaded": ("aurora.scheduler.cron_jobs_loaded", GAUGE),
    }
    PREEMPTION_METRICS = {
        "preemptor_tasks_preempted_non_prod": ("aurora.scheduler.preemptor_tasks_preempted_non_prod", MONOTONIC_COUNT),
        "preemptor_tasks_preempted_prod": ("aurora.scheduler.preemptor_tasks_preempted_prod", MONOTONIC_COUNT),
    }
    QUARTZ_METRICS = {
        'quartz_scheduler_running': ('aurora.scheduler.quartz_scheduler_running', COUNT)
    }
    TASK_STORE_METRICS = {
        "task_store_ASSIGNED": ("aurora.scheduler.task_store_ASSIGNED", MONOTONIC_COUNT),
        "task_store_DRAINING": ("aurora.scheduler.task_store_DRAINING", MONOTONIC_COUNT),
        "task_store_FAILED": ("aurora.scheduler.task_store_FAILED", MONOTONIC_COUNT),
        "task_store_FINISHED": ("aurora.scheduler.task_store_FINISHED", MONOTONIC_COUNT),
        "task_store_INIT": ("aurora.scheduler.task_store_INIT", MONOTONIC_COUNT),
        "task_store_KILLED": ("aurora.scheduler.task_store_KILLED", MONOTONIC_COUNT),
        "task_store_KILLING": ("aurora.scheduler.task_store_KILLING", MONOTONIC_COUNT),
        "task_store_LOST": ("aurora.scheduler.task_store_LOST", MONOTONIC_COUNT),
        "task_store_PENDING": ("aurora.scheduler.task_store_PENDING", MONOTONIC_COUNT),
        "task_store_PREEMPTING": ("aurora.scheduler.task_store_PREEMPTING", MONOTONIC_COUNT),
        "task_store_RESTARTING": ("aurora.scheduler.task_store_RESTARTING", MONOTONIC_COUNT),
        "task_store_RUNNING": ("aurora.scheduler.task_store_RUNNING", MONOTONIC_COUNT),
        "task_store_STARTING": ("aurora.scheduler.task_store_STARTING", MONOTONIC_COUNT),
        "task_store_THROTTLED": ("aurora.scheduler.task_store_THROTTLED", MONOTONIC_COUNT),
    }

    def _get_json(self, url, timeout):
        """Fetch ``url`` and return its decoded JSON body.

        Side effects:
            * sets ``self.is_leader`` (False for a redirect response, True
              for a 200 response);
            * sends the ``SERVICE_CHECK_NAME`` service check, tagged with the
              URL, when ``service_check_needed`` is set or the status is
              CRITICAL, then clears ``service_check_needed``.

        :param url: full URL to request.
        :param timeout: request timeout in seconds, passed to ``requests``.
        :returns: the decoded JSON document.
        :raises CheckException: on timeout, on any other request or decoding
            error, or when the response is neither a redirect nor a 200.
        """
        tags = ["url:%s" % url]
        payload = None
        try:
            response = requests.get(url, timeout=timeout)
            # NOTE(review): requests follows redirects by default, so
            # ``is_redirect`` is presumably never true on the response seen
            # here -- confirm whether ``allow_redirects=False`` was intended.
            if response.is_redirect:
                status = AgentCheck.OK
                self.is_leader = False
                msg = "Aurora Scheduler instance detected at %s but is not master" % url
            elif response.status_code != 200:
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (response.status_code, url)
            else:
                status = AgentCheck.OK
                self.is_leader = True
                msg = "Aurora Scheduler instance detected at %s " % url
            # Decode only after the status is known: an error page is rarely
            # JSON, and decoding it first would hide the HTTP status behind a
            # parse error message.
            if status == AgentCheck.OK:
                payload = response.json()
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            # Deliberately broad: any failure to reach or decode the endpoint
            # is reported through the service check and re-raised below.
            msg = str(e)
            status = AgentCheck.CRITICAL

        is_critical = status == AgentCheck.CRITICAL
        # Report exactly once per call: always for the first request of a
        # run, and for any later request only when it failed.
        if self.service_check_needed or is_critical:
            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
        self.service_check_needed = False
        if is_critical:
            raise CheckException("Cannot connect to aurora scheduler, please check your configuration.")
        return payload

    def _get_state(self, url, timeout):
        """Return the decoded ``/vars.json`` document of the scheduler at ``url``."""
        return self._get_json(url + '/vars.json', timeout)

    def check(self, instance):
        """Run the check for one configured instance.

        Reads ``url`` (required), ``tags`` and ``timeout`` from ``instance``;
        the timeout falls back to ``default_timeout`` from ``init_config``
        and then to 5 seconds.

        :param instance: instance configuration mapping.
        :raises Exception: when ``instance`` has no ``url`` key.
        :raises CheckException: propagated from ``_get_state`` when the
            scheduler cannot be queried.
        """
        if 'url' not in instance:
            raise Exception('Aurora scheduler instance missing "url" value.')
        url = instance['url']
        # ``tags:`` left empty in the YAML arrives as None.
        instance_tags = instance.get('tags') or []
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        try:
            stats = self._get_state(url, timeout)
        finally:
            # Re-arm even when the request failed, so the next run reports
            # its service check status again.
            self.service_check_needed = True

        if not stats:
            return

        tags = ['aurora', 'mesos:framework'] + list(instance_tags)

        metric_tables = [self.SYSTEM_METRICS]
        if self.is_leader:
            metric_tables += [self.LEADER_METRICS, self.QUARTZ_METRICS, self.PREEMPTION_METRICS,
                              self.TASK_STORE_METRICS, self.CRON_METRICS]

        for table in metric_tables:
            for key_name, (metric_name, metric_func) in table.items():
                # Skip keys the scheduler did not export rather than
                # submitting a None sample.
                if key_name in stats:
                    metric_func(self, metric_name, stats[key_name], tags=tags)
init_config:
  default_timeout: 5
instances:
  - url: http://localhost:8081
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.