Updated Prometheus OpenStack Exporter for Identity v3
#!/usr/bin/python
"""
OpenStack exporter for the Prometheus monitoring system
Copyright (C) 2016 Canonical, Ltd.
Authors:
Jacek Nykis <jacek.nykis@canonical.com>
Laurent Sesques <laurent.sesques@canonical.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 3,
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranties of
MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
import yaml
from os import environ as env
from os import rename, path
import traceback
import urlparse
from threading import Thread
import pickle
import requests
from time import sleep, time
from neutronclient.v2_0 import client as neutron_client
from keystoneclient.v3 import client as keystone_client
from keystoneauth1 import loading
from keystoneauth1 import session
# http://docs.openstack.org/developer/python-novaclient/api.html
from novaclient import client as nova_client
from BaseHTTPServer import BaseHTTPRequestHandler
from BaseHTTPServer import HTTPServer
from SocketServer import ForkingMixIn
from prometheus_client import CollectorRegistry, generate_latest, Gauge, CONTENT_TYPE_LATEST
from netaddr import iter_iprange
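# An illustrative configuration file (values below are assumptions; the keys
# are the ones this script actually reads via config[...] and config.get(...)):
#
#   listen_port: 9183
#   cache_refresh_interval: 900  # seconds
#   cache_file: /var/cache/prometheus-openstack-exporter/cache
#   cloud: mycloud
#   openstack_allocation_ratio_vcpu: 2.0
#   openstack_allocation_ratio_ram: 1.0
#   openstack_allocation_ratio_disk: 1.0
#   schedulable_instance_size:
#     vcpu: 2
#     ram_mbs: 2048
#     disk_gbs: 20
#   swift_hosts: []


# DataGatherer: daemon thread that periodically authenticates against the
# cloud, snapshots project/hypervisor/network/instance data, and atomically
# writes it to cache_file, which the per-request collectors below read back.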
class DataGatherer(Thread):
def __init__(self):
Thread.__init__(self)
self.daemon = True
self.duration = 0
self.refresh_interval = config.get('cache_refresh_interval', 900)
self.cache_file = config['cache_file']
def run(self):
prodstack = {}
while True:
start_time = time()
try:
                # Build a Keystone v3 password auth and share one session
                # across all clients. NOTE: verify=False disables TLS
                # certificate verification.
                loader = loading.get_plugin_loader('password')
                auth = loader.load_from_options(auth_url=env['OS_AUTH_URL'],
                                                username=env['OS_USERNAME'],
                                                password=env['OS_PASSWORD'],
                                                project_name=env['OS_TENANT_NAME'],
                                                user_domain_name=env['OS_USER_DOMAIN_NAME'],
                                                project_domain_name=env['OS_PROJECT_DOMAIN_NAME'])
                sess = session.Session(auth=auth, verify=False)
                keystone = keystone_client.Client(session=sess, region_name=env['OS_REGION_NAME'])
                nova = nova_client.Client(2, session=sess, region_name=env['OS_REGION_NAME'])
                neutron = neutron_client.Client(session=sess, region_name=env['OS_REGION_NAME'])
prodstack['projects'] = [x._info for x in keystone.projects.list()]
prodstack['hypervisors'] = [x._info for x in nova.hypervisors.list()]
prodstack['services'] = [x._info for x in nova.services.list()]
prodstack['networks'] = neutron.list_networks()['networks']
prodstack['flavors'] = [x._info for x in nova.flavors.list()]
prodstack['aggregates'] = [x.to_dict() for x in nova.aggregates.list()]
prodstack['subnets'] = neutron.list_subnets()['subnets']
prodstack['routers'] = neutron.list_routers()['routers']
prodstack['ports'] = neutron.list_ports()['ports']
prodstack['floatingips'] = neutron.list_floatingips()['floatingips']
                # Instance info is heavy to gather, so page through the
                # server list 100 at a time using marker-based pagination.
prodstack['instances'] = []
marker = ''
while True:
                        # Nova expects 'all_tenants' (not 'all_projects') to
                        # list servers across all projects.
                        search_opts = {'all_tenants': '1', 'limit': '100', 'marker': marker}
new_instances = [x._info for x in nova.servers.list(search_opts=search_opts)]
if new_instances:
marker = new_instances[-1]['id']
prodstack['instances'].extend(new_instances)
else:
break
            except Exception:
                # Ignore failures; we will try again after refresh_interval.
                # Most are temporary, e.g. connectivity problems.
                # To alert on a stale cache, use the
                # openstack_exporter_cache_age_seconds metric.
                print(traceback.format_exc())
else:
with open(self.cache_file + '.new', "wb+") as f:
pickle.dump((prodstack, ), f, pickle.HIGHEST_PROTOCOL)
rename(self.cache_file + '.new', self.cache_file)
self.duration = time() - start_time
sleep(self.refresh_interval)
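    # Exporter self-metrics: cache age is derived from the cache file's
    # mtime, refresh duration from the last run() iteration.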
def get_stats(self):
registry = CollectorRegistry()
labels = ['cloud']
        age = Gauge('openstack_exporter_cache_age_seconds',
                    'Cache age in seconds. It can reset more frequently '
                    'than the scrape interval, so we use a Gauge',
                    labels, registry=registry)
l = [config['cloud']]
age.labels(*l).set(time() - path.getmtime(self.cache_file))
duration = Gauge('openstack_exporter_cache_refresh_duration_seconds',
'Cache refresh duration in seconds.',
labels, registry=registry)
duration.labels(*l).set(self.duration)
return generate_latest(registry)
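# Neutron: per-scrape collector built from the pickled cache. Reports
# floating IP and router gateway IP usage plus network sizes.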
class Neutron():
def __init__(self):
self.registry = CollectorRegistry()
self.prodstack = {}
with open(config['cache_file'], 'rb') as f:
self.prodstack = pickle.load(f)[0]
self.tenant_map = {t['id']: t['name'] for t in self.prodstack['projects']}
self.network_map = {n['id']: n['name'] for n in self.prodstack['networks']}
self.subnet_map = {n['id']: {'name': n['name'], 'pool': n['allocation_pools']} for n in self.prodstack['subnets']}
self.routers = self.prodstack['routers']
self.ports = self.prodstack['ports']
self.floating_ips = self.prodstack['floatingips']
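    # A router's external IP lives on the port whose device_owner is
    # "network:router_gateway"; return the first fixed IP on that port.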
def _get_router_ip(self, uuid):
owner = "network:router_gateway"
for port in self.ports:
if port["device_id"] == uuid and port["device_owner"] == owner:
return port["fixed_ips"][0]["ip_address"]
def get_floating_ips(self):
ips = {}
for ip in self.floating_ips:
subnet = self.network_map[ip['floating_network_id']]
try:
tenant = self.tenant_map[ip['tenant_id']]
except KeyError:
tenant = 'Unknown tenant ({})'.format(ip['tenant_id'])
key = (config['cloud'], subnet, tenant, 'floatingip', ip['status'])
if key in ips:
ips[key] += 1
else:
ips[key] = 1
return ips
def get_router_ips(self):
ips = {}
for r in self.routers:
if self._get_router_ip(r['id']):
tenant = self.tenant_map[r['tenant_id']]
subnet = self.network_map[r['external_gateway_info']['network_id']]
key = (config['cloud'], subnet, tenant, 'routerip', r['status'])
if key in ips:
ips[key] += 1
else:
ips[key] = 1
return ips
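    # A network's size is the sum of its subnets' allocation-pool ranges,
    # counted with netaddr.iter_iprange.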
def gen_subnet_size(self):
labels = ['cloud', 'network_name']
net_size = Gauge('neutron_net_size',
'Neutron networks size',
labels, registry=self.registry)
for n in self.prodstack['networks']:
size = 0
for subnet in n['subnets']:
for pool in self.subnet_map[subnet]['pool']:
size += len(list(iter_iprange(pool['start'], pool['end'])))
l = [config['cloud'], self.network_map[n['id']]]
net_size.labels(*l).set(size)
def get_stats(self):
labels = ['cloud', 'subnet_name', 'tenant', 'ip_type', 'ip_status']
ips = self.get_floating_ips()
ips.update(self.get_router_ips())
metrics = Gauge('neutron_public_ip_usage',
'Neutron floating IP and router IP usage statistics',
labels, registry=self.registry)
for k, v in ips.items():
metrics.labels(*k).set(v)
self.gen_subnet_size()
return generate_latest(self.registry)
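# Nova: per-scrape collector for hypervisor capacity, per-tenant instance
# and resource usage, and the configured overcommit ratios.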
class Nova():
def __init__(self):
self.registry = CollectorRegistry()
self.prodstack = {}
with open(config['cache_file'], 'rb') as f:
self.prodstack = pickle.load(f)[0]
self.hypervisors = self.prodstack['hypervisors']
self.tenant_map = {t['id']: t['name'] for t in self.prodstack['projects']}
self.flavor_map = {f['id']: {'ram': f['ram'], 'disk': f['disk'], 'vcpus': f['vcpus']}
for f in self.prodstack['flavors']}
self.aggregate_map = {}
self.services_map = {}
for s in self.prodstack['services']:
if s['binary'] == 'nova-compute':
self.services_map[s['host']] = s['status']
for agg in self.prodstack['aggregates']:
self.aggregate_map.update({i: agg['name'] for i in agg['hosts']})
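    # Schedulable-instance math: headroom per resource is
    # total * allocation_ratio - used; the number of instances that still
    # fit is the minimum of floor(headroom / size) across vCPU, RAM and
    # disk, where "size" comes from the schedulable_instance_size config.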
def _get_schedulable_instances(self, host):
free_vcpus = host['vcpus'] * config['openstack_allocation_ratio_vcpu'] - host['vcpus_used']
free_ram_mbs = host['memory_mb'] * config['openstack_allocation_ratio_ram'] - host['memory_mb_used']
free_disk_gbs = host['local_gb'] * config['openstack_allocation_ratio_disk'] - host['local_gb_used']
s = config['schedulable_instance_size']
return min(int(free_vcpus / s['vcpu']),
int(free_ram_mbs / s['ram_mbs']),
int(free_disk_gbs / s['disk_gbs']))
def _get_schedulable_instances_capacity(self, host):
capacity_vcpus = host['vcpus'] * config['openstack_allocation_ratio_vcpu']
capacity_ram_mbs = host['memory_mb'] * config['openstack_allocation_ratio_ram']
capacity_disk_gbs = host['local_gb'] * config['openstack_allocation_ratio_disk']
s = config['schedulable_instance_size']
return min(int(capacity_vcpus / s['vcpu']),
int(capacity_ram_mbs / s['ram_mbs']),
int(capacity_disk_gbs / s['disk_gbs']))
def gen_hypervisor_stats(self):
labels = ['cloud', 'hypervisor_hostname', 'aggregate', 'nova_service_status']
vms = Gauge('hypervisor_running_vms', 'Number of running VMs', labels, registry=self.registry)
vcpus_total = Gauge('hypervisor_vcpus_total', 'Total number of vCPUs', labels, registry=self.registry)
vcpus_used = Gauge('hypervisor_vcpus_used', 'Number of used vCPUs', labels, registry=self.registry)
mem_total = Gauge('hypervisor_memory_mbs_total', 'Total amount of memory in MBs', labels, registry=self.registry)
mem_used = Gauge('hypervisor_memory_mbs_used', 'Used memory in MBs', labels, registry=self.registry)
disk_total = Gauge('hypervisor_disk_gbs_total', 'Total amount of disk space in GBs', labels, registry=self.registry)
disk_used = Gauge('hypervisor_disk_gbs_used', 'Used disk space in GBs', labels, registry=self.registry)
schedulable_instances = Gauge('hypervisor_schedulable_instances',
'Number of schedulable instances, see "schedulable_instance_size" option',
labels, registry=self.registry)
schedulable_instances_capacity = Gauge('hypervisor_schedulable_instances_capacity',
'Number of schedulable instances we have capacity for',
labels, registry=self.registry)
for h in self.hypervisors:
host = h['service']['host']
l = [config['cloud'], host, self.aggregate_map.get(host, 'unknown'), self.services_map[host]]
vms.labels(*l).set(h['running_vms'])
vcpus_total.labels(*l).set(h['vcpus'])
vcpus_used.labels(*l).set(h['vcpus_used'])
mem_total.labels(*l).set(h['memory_mb'])
mem_used.labels(*l).set(h['memory_mb_used'])
disk_total.labels(*l).set(h['local_gb'])
disk_used.labels(*l).set(h['local_gb_used'])
if config.get("schedulable_instance_size", False):
schedulable_instances.labels(*l).set(self._get_schedulable_instances(h))
schedulable_instances_capacity.labels(*l).set(self._get_schedulable_instances_capacity(h))
def gen_instance_stats(self):
instances = Gauge('nova_instances',
'Nova instances metrics',
['cloud', 'tenant', 'instance_state'], registry=self.registry)
res_ram = Gauge('nova_resources_ram_mbs',
'Nova RAM usage metric',
['cloud', 'tenant'], registry=self.registry)
res_vcpus = Gauge('nova_resources_vcpus',
'Nova vCPU usage metric',
['cloud', 'tenant'], registry=self.registry)
res_disk = Gauge('nova_resources_disk_gbs',
'Nova disk usage metric',
['cloud', 'tenant'], registry=self.registry)
for i in self.prodstack['instances']:
if i['tenant_id'] in self.tenant_map:
tenant = self.tenant_map[i['tenant_id']]
else:
tenant = 'orphaned'
flavor = self.flavor_map[i['flavor']['id']]
instances.labels(config['cloud'], tenant, i['status']).inc()
res_ram.labels(config['cloud'], tenant).inc(flavor['ram'])
res_vcpus.labels(config['cloud'], tenant).inc(flavor['vcpus'])
res_disk.labels(config['cloud'], tenant).inc(flavor['disk'])
def gen_overcommit_stats(self):
labels = ['cloud', 'resource']
openstack_overcommit = Gauge('openstack_allocation_ratio', 'Openstack overcommit ratios',
labels, registry=self.registry)
l = [config['cloud'], 'vcpu']
openstack_overcommit.labels(*l).set(config['openstack_allocation_ratio_vcpu'])
l = [config['cloud'], 'ram']
openstack_overcommit.labels(*l).set(config['openstack_allocation_ratio_ram'])
l = [config['cloud'], 'disk']
openstack_overcommit.labels(*l).set(config['openstack_allocation_ratio_disk'])
def get_stats(self):
self.gen_hypervisor_stats()
self.gen_instance_stats()
self.gen_overcommit_stats()
return generate_latest(self.registry)
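# Swift: per-scrape collector that polls each configured host's swift-recon
# endpoints (port 6000) for disk usage, quarantine and replication stats.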
class Swift():
def __init__(self):
self.registry = CollectorRegistry()
self.baseurl = 'http://{}:6000/recon/{}'
self.swift_hosts = config.get('swift_hosts', [])
def gen_disk_usage_stats(self):
labels = ['cloud', 'hostname', 'device', 'type']
swift_disk = Gauge('swift_disk_usage_bytes', 'Swift disk usage in bytes',
labels, registry=self.registry)
for h in self.swift_hosts:
r = requests.get(self.baseurl.format(h, 'diskusage'))
for disk in r.json():
if not all([disk.get(i, False) for i in ['size', 'used', 'device']]):
continue
swift_disk.labels(config['cloud'], h, disk['device'], 'size').set(int(disk['size']))
swift_disk.labels(config['cloud'], h, disk['device'], 'used').set(int(disk['used']))
def gen_quarantine_stats(self):
labels = ['cloud', 'hostname', 'ring']
swift_quarantine = Gauge('swift_quarantined_objects', 'Number of quarantined objects',
labels, registry=self.registry)
for h in self.swift_hosts:
r = requests.get(self.baseurl.format(h, 'quarantined'))
for ring in ['accounts', 'objects', 'containers']:
swift_quarantine.labels(config['cloud'], h, ring).set(r.json().get(ring))
def gen_replication_stats(self):
labels = ['cloud', 'hostname', 'ring', 'type']
swift_repl = Gauge('swift_replication_stats', 'Swift replication stats', labels, registry=self.registry)
labels = ['cloud', 'hostname', 'ring']
swift_repl_duration = Gauge('swift_replication_duration_seconds', 'Swift replication duration in seconds',
labels, registry=self.registry)
for h in self.swift_hosts:
metrics = ['attempted', 'diff', 'diff_capped', 'empty',
'failure', 'hashmatch', 'no_change', 'remote_merge',
'remove', 'rsync', 'success', 'ts_repl']
            # Object replication is special: it reports
            # 'object_replication_time' rather than the 'replication_time'
            # key used by the account and container rings.
r = requests.get(self.baseurl.format(h, 'replication/object'))
try:
swift_repl_duration.labels(config['cloud'], h, 'object').set(r.json()['object_replication_time'])
except TypeError:
print(traceback.format_exc())
for ring in ['account', 'container']:
r = requests.get(self.baseurl.format(h, 'replication/' + ring))
try:
swift_repl_duration.labels(config['cloud'], h, ring).set(r.json()['replication_time'])
except TypeError:
print(traceback.format_exc())
for metric in metrics:
try:
swift_repl.labels(config['cloud'], h, ring, metric).set(r.json()['replication_stats'][metric])
except TypeError:
print(traceback.format_exc())
def get_stats(self):
self.gen_disk_usage_stats()
self.gen_quarantine_stats()
self.gen_replication_stats()
return generate_latest(self.registry)
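# Fork a child process per request so a slow scrape cannot block others.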
class ForkingHTTPServer(ForkingMixIn, HTTPServer):
pass
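# HTTP handler: /metrics renders all collectors from the cache, / serves a
# small landing page, and anything else is a 404.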
class OpenstackExporterHandler(BaseHTTPRequestHandler):
def __init__(self, *args, **kwargs):
BaseHTTPRequestHandler.__init__(self, *args, **kwargs)
def do_GET(self):
url = urlparse.urlparse(self.path)
if url.path == '/metrics':
try:
neutron = Neutron()
nova = Nova()
swift = Swift()
output = neutron.get_stats() + \
nova.get_stats() + \
swift.get_stats() + \
data_gatherer.get_stats()
self.send_response(200)
self.send_header('Content-Type', CONTENT_TYPE_LATEST)
self.end_headers()
self.wfile.write(output)
            except Exception:
self.send_response(500)
self.end_headers()
self.wfile.write(traceback.format_exc())
elif url.path == '/':
self.send_response(200)
self.end_headers()
self.wfile.write("""<html>
<head><title>OpenStack Exporter</title></head>
<body>
<h1>OpenStack Exporter</h1>
<p>Visit <code>/metrics</code> to use.</p>
</body>
</html>""")
else:
self.send_response(404)
self.end_headers()
def handler(*args, **kwargs):
OpenstackExporterHandler(*args, **kwargs)
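# Entry point: load the YAML config, start the cache-refreshing
# DataGatherer thread, and serve scrapes on config['listen_port'], e.g.
#   ./prometheus-openstack-exporter.py /etc/prometheus/prometheus-openstack-exporter.yaml
# (invocation path is illustrative).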
if __name__ == '__main__':
parser = argparse.ArgumentParser(usage=__doc__,
description='Prometheus OpenStack exporter',
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('config_file', nargs='?',
help='Configuration file path',
default='/etc/prometheus/prometheus-openstack-exporter.yaml',
type=argparse.FileType('r'))
args = parser.parse_args()
config = yaml.safe_load(args.config_file.read())
data_gatherer = DataGatherer()
data_gatherer.start()
server = ForkingHTTPServer(('', config.get('listen_port')), handler)
server.serve_forever()