Skip to content

Instantly share code, notes, and snippets.

@frimik
Forked from zircote/aurora.py
Last active July 5, 2017 13:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frimik/bec45ff66b979098931f7e4b3b218167 to your computer and use it in GitHub Desktop.
WIP DataDog check for Apache Aurora
"""
Aurora Scheduler check
Collects metrics from aurora scheduler.
"""
import requests
from checks import AgentCheck, CheckException
class AuroraCheck(AgentCheck):
    """Datadog check for the Apache Aurora scheduler.

    Polls each configured scheduler's ``/vars.json`` endpoint and submits
    the subset of exported variables named in the metric tables below.
    Leader-only tables are submitted only when the polled node is the
    elected master (standby schedulers redirect to the leader).
    """

    # Plain function references so the metric tables can name the submit
    # method; each is invoked as metric_func(self, name, value, tags=...).
    GAUGE = AgentCheck.gauge
    MONOTONIC_COUNT = AgentCheck.monotonic_count
    COUNT = AgentCheck.count

    SERVICE_CHECK_NAME = "aurora_master.can_connect"
    # True while the service check has not yet been emitted for the current
    # run; reset to True at the end of check().
    service_check_needed = True

    # Metrics every scheduler node exports, leader or standby.
    SYSTEM_METRICS = {
        "jvm_uptime_secs": ('aurora.scheduler.jvm_uptime_secs', MONOTONIC_COUNT),
        "system_load_avg": ('aurora.scheduler.system_load_avg', GAUGE),
        "http_500_responses_events": ('aurora.scheduler.http_500_responses_events', MONOTONIC_COUNT),
        "system_env_SHLVL": ('aurora.scheduler.system_env_SHLVL', GAUGE),
        "system_free_physical_memory_mb": ('aurora.scheduler.system_free_physical_memory_mb', GAUGE),
        "system_free_swap_mb": ('aurora.scheduler.system_free_swap_mb', GAUGE),
    }

    # Metrics that are only meaningful on the elected leader.
    LEADER_METRICS = {
        "process_cpu_cores_utilized": ('aurora.scheduler.process_cpu_cores_utilized', GAUGE),
        "task_store_LOST": ('aurora.scheduler.task_store_LOST', GAUGE),
        "scheduler_resource_offers": ('aurora.scheduler.scheduler_resource_offers', MONOTONIC_COUNT),
        "framework_registered": ('aurora.scheduler.framework_registered', COUNT),
        "scheduler_log_native_append_nanos_total": (
            'aurora.scheduler.scheduler_log_native_append_nanos_total', MONOTONIC_COUNT),
        "scheduler_log_native_append_events": ('aurora.scheduler.scheduler_log_native_append_events', MONOTONIC_COUNT),
        "timed_out_tasks": ('aurora.scheduler.timed_out_tasks', MONOTONIC_COUNT),
    }

    CRON_METRICS = {
        "cron_job_collisions": ("aurora.scheduler.cron_job_collisions", MONOTONIC_COUNT),
        "cron_job_launch_failures": ("aurora.scheduler.cron_job_launch_failures", GAUGE),
        "cron_job_misfires": ("aurora.scheduler.cron_job_misfires", GAUGE),
        "cron_job_parse_failures": ("aurora.scheduler.cron_job_parse_failures", GAUGE),
        "cron_job_triggers": ("aurora.scheduler.cron_job_triggers", GAUGE),
        "cron_jobs_loaded": ("aurora.scheduler.cron_jobs_loaded", GAUGE),
    }

    PREEMPTION_METRICS = {
        "preemptor_tasks_preempted_non_prod": ("aurora.scheduler.preemptor_tasks_preempted_non_prod", MONOTONIC_COUNT),
        "preemptor_tasks_preempted_prod": ("aurora.scheduler.preemptor_tasks_preempted_prod", MONOTONIC_COUNT),
    }

    QUARTZ_METRICS = {
        'quartz_scheduler_running': ('aurora.scheduler.quartz_scheduler_running', COUNT)
    }

    # Per-state task-store counters, one Datadog metric per Aurora state.
    TASK_STORE_METRICS = {
        "task_store_ASSIGNED": ("aurora.scheduler.task_store_ASSIGNED", MONOTONIC_COUNT),
        "task_store_DRAINING": ("aurora.scheduler.task_store_DRAINING", MONOTONIC_COUNT),
        "task_store_FAILED": ("aurora.scheduler.task_store_FAILED", MONOTONIC_COUNT),
        "task_store_FINISHED": ("aurora.scheduler.task_store_FINISHED", MONOTONIC_COUNT),
        "task_store_INIT": ("aurora.scheduler.task_store_INIT", MONOTONIC_COUNT),
        "task_store_KILLED": ("aurora.scheduler.task_store_KILLED", MONOTONIC_COUNT),
        "task_store_KILLING": ("aurora.scheduler.task_store_KILLING", MONOTONIC_COUNT),
        "task_store_LOST": ("aurora.scheduler.task_store_LOST", MONOTONIC_COUNT),
        "task_store_PENDING": ("aurora.scheduler.task_store_PENDING", MONOTONIC_COUNT),
        "task_store_PREEMPTING": ("aurora.scheduler.task_store_PREEMPTING", MONOTONIC_COUNT),
        "task_store_RESTARTING": ("aurora.scheduler.task_store_RESTARTING", MONOTONIC_COUNT),
        "task_store_RUNNING": ("aurora.scheduler.task_store_RUNNING", MONOTONIC_COUNT),
        "task_store_STARTING": ("aurora.scheduler.task_store_STARTING", MONOTONIC_COUNT),
        "task_store_THROTTLED": ("aurora.scheduler.task_store_THROTTLED", MONOTONIC_COUNT),
    }

    _task_store_states = [
        "ASSIGNED",
        "DRAINING",
        "FAILED",
        "FINISHED",
        "INIT",
        "KILLED",
        "KILLING",
        "LOST",
        "PENDING",
        "PREEMPTING",
        "RESTARTING",
        "RUNNING",
        "STARTING",
        "THROTTLED",
    ]

    # Same per-state counters, re-submitted as a single gauge metric
    # ('aurora.scheduler.task_store_gauge') distinguished by a 'taskstate' tag.
    TASK_STORE_GAUGES = {}
    for taskstate in _task_store_states:
        aurora_metric = "task_store_{taskstate}".format(taskstate=taskstate)
        metric_tags = ["taskstate:{taskstate}".format(taskstate=taskstate.lower())]
        TASK_STORE_GAUGES[aurora_metric] = (
            "aurora.scheduler.task_store_gauge", GAUGE, metric_tags
        )

    def _get_json(self, url, timeout):
        """GET ``url``, emit the can_connect service check, and return JSON.

        Side effect: sets ``self.is_leader`` — False when the request was
        redirected (standby schedulers redirect to the leader), True on a
        direct 200 response.

        Raises:
            CheckException: when the scheduler cannot be reached, times
                out, or answers with a non-200 status.
        """
        tags = ["url:%s" % url]
        msg = None
        status = None
        payload = None
        try:
            response = requests.get(url, timeout=timeout)
            payload = response.json()
            # requests follows redirects by default, so the *final* response
            # is never itself a redirect; inspect the redirect history
            # instead. (The original tested response.is_redirect, which is
            # always False here, so is_leader could never become False.)
            if response.history:
                status = AgentCheck.OK
                self.is_leader = False
                msg = "Aurora Scheduler instance detected at %s but is not master" % url
            elif response.status_code != 200:
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (response.status_code, url)
            else:
                status = AgentCheck.OK
                self.is_leader = True
                msg = "Aurora Scheduler instance detected at %s " % url
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            msg = str(e)
            status = AgentCheck.CRITICAL
        finally:
            # Emit the service check at most once per run.
            if self.service_check_needed:
                self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
                self.service_check_needed = False
            # Equality, not identity: the status constants are not
            # guaranteed to be interned singletons.
            if status == AgentCheck.CRITICAL:
                self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
                raise CheckException("Cannot connect to aurora scheduler, please check your configuration.")
        return payload

    def _get_state(self, url, timeout):
        """Fetch the scheduler's exported variables from ``/vars.json``."""
        return self._get_json(url + '/vars.json', timeout)

    def check(self, instance):
        """Agent entry point: poll one scheduler instance, submit metrics.

        Instance keys: 'url' (required), 'tags' (optional list of extra
        tags), 'timeout' (optional; falls back to init_config's
        default_timeout, then 5 seconds).
        """
        if 'url' not in instance:
            raise Exception('Aurora scheduler instance missing "url" value.')
        url = instance['url']
        instance_tags = instance.get('tags', [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        # Fetch /vars.json exactly once; the original fetched it twice
        # per run (state_metrics and stats_metrics were the same call).
        stats_metrics = self._get_state(url, timeout)
        if stats_metrics is not None:
            tags = [
                'aurora',
                'mesos:framework',
                'url:%s' % url,
            ]
            tags += instance_tags
            metric_tables = [self.SYSTEM_METRICS]
            if self.is_leader:
                # Leader-only stats are absent or meaningless on standbys.
                metric_tables += [self.LEADER_METRICS, self.QUARTZ_METRICS,
                                  self.PREEMPTION_METRICS, self.TASK_STORE_METRICS,
                                  self.CRON_METRICS, self.TASK_STORE_GAUGES]
            for table in metric_tables:
                # .items() instead of Python-2-only .iteritems().
                for key_name, metric_prop in table.items():
                    metric_name = metric_prop[0]
                    metric_func = metric_prop[1]
                    extra_tags = metric_prop[2] if len(metric_prop) == 3 else []
                    my_tags = tags + extra_tags
                    if key_name in stats_metrics:
                        self.log.debug("%s: %s, tags: %s", metric_name,
                                       stats_metrics[key_name], my_tags)
                        metric_func(self, metric_name, stats_metrics[key_name],
                                    tags=my_tags)
                    else:
                        # The original submitted None for absent keys, which
                        # metric aggregators reject; just record the gap.
                        self.log.debug("metric %s not found in /vars.json", key_name)
        # Re-arm the service check for the next run.
        self.service_check_needed = True
# Example aurora.yaml configuration for the check above
# (indentation reconstructed — the pasted version was flattened and
# was not valid YAML).
init_config:
  default_timeout: 5

# Add all master nodes:
instances:
  - url: http://10.0.0.10:8081
    tags:
      - cluster:foo
  - url: http://10.0.0.11:8081
    tags:
      - cluster:foo
  - url: http://10.0.0.12:8081
    tags:
      - cluster:foo
  - url: http://10.0.0.13:8081
    tags:
      - cluster:foo
  # NOTE(review): 10.0.0.10 is already listed above — this fifth entry is
  # probably a copy/paste slip (10.0.0.14?); confirm before deploying.
  - url: http://10.0.0.10:8081
    tags:
      - cluster:foo
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment