#!/usr/bin/env nix-shell
#! nix-shell -i python -p pythonPackages.treq pythonPackages.twisted pythonPackages.prometheus_client
# Created February 1, 2017 18:14
import os
import time
from twisted.internet import reactor
from twisted.internet.task import LoopingCall, deferLater, react
from twisted.web.client import Agent
from twisted.web.server import Site
from twisted.web.resource import Resource
from treq import post
from prometheus_client import Gauge, Histogram
from prometheus_client.twisted import MetricsResource
# URI to check; checkTahoe appends it to the gateway's /uri/ path.
# Required: a missing VERIFIER_URI raises KeyError at import time.
URI = os.environ["VERIFIER_URI"]

# Connection timeout (seconds) handed to the HTTP Agent below.
# NOTE(review): CONNECT_TIMEOUT was referenced but never defined in the
# visible file, which raised NameError on import.  30 is a placeholder
# (it appears to match Twisted's default TCP connect timeout) -- confirm
# the intended value.
CONNECT_TIMEOUT = 30

# Timestamp of the most recent completed check, per (deep, repair) pair.
LAST_CHECK = Gauge(
    "verifier_last_check",
    "Moment of last check (timestamp)",
    ["deep", "repair"],
)

# Wall-clock duration of each check, per (deep, repair) pair.
CHECK_LATENCY = Histogram(
    "verifier_check_latency",
    "Check latency (seconds)",
    ["deep", "repair"],
)

AVAILABLE = Gauge(
    "verifier_available",
    "Whether the grid is available (bool)",
)

SHARES = Gauge("verifier_shares", "Number of Shares", ["type"])

# Shared HTTP agent used for every request issued by checkTahoe.
agent = Agent(reactor, connectTimeout=CONNECT_TIMEOUT)
def checkTahoe(repair):
    """Run one check of ``URI`` against the local gateway and record metrics.

    POSTs ``t=check&output=json`` (plus ``repair=true`` when *repair* is
    set) for ``URI`` to ``http://localhost:3456``, then updates
    ``LAST_CHECK`` and ``CHECK_LATENCY`` under the labels ``deep=False`` and
    ``repair=<repair>``.  When a plain check reports the file as not
    healthy, a follow-up check with repair enabled is chained on.

    :param repair: whether to ask the gateway to repair while checking.
    :return: a Deferred that fires once the check (and any follow-up repair
        check) has finished.  Failures from the request or from response
        handling are logged and consumed rather than propagated.
    """
    # To try to reduce coordinated omission, we start tracking time
    # immediately and record it as late as possible.
    before = time.time()

    # XXX urlparse
    params = "?t=check&output=json"
    if repair:
        params += "&repair=true"
    d = post("http://localhost:3456/uri/" + URI + params, agent=agent)

    def jsonify(response):
        return response.json()

    def measure(body):
        if repair:
            results = body["post-repair-results"]
            # NOTE(review): the original overwrote this with
            # body["results"] unconditionally, leaving the repair branch
            # dead.  Whether the counters sit directly under
            # "post-repair-results" or one level down under a nested
            # "results" key should be confirmed against the gateway's JSON;
            # both shapes are accepted here.
            results = results.get("results", results)
        else:
            results = body["results"]
        healthy = results["healthy"]

        # NOTE(review): SHARES and AVAILABLE are declared at module level
        # but nothing updates them.  The original read
        # results["count-corrupt-shares"] and results["count-wrong-shares"]
        # here without using them (and had results["recoverable"] commented
        # out); those look like the intended inputs -- confirm label values
        # and semantics before wiring them up.

        rv = None
        if not repair and not healthy:
            # deferLater takes (clock, delay, callable, ...); the original
            # call omitted the delay.  Returning the Deferred makes the
            # outer check wait for the repair check to finish.
            rv = deferLater(reactor, 0, checkTahoe, repair=True)

        LAST_CHECK.labels(deep=False, repair=repair).set_to_current_time()
        # As late as possible.
        after = time.time()
        CHECK_LATENCY.labels(deep=False, repair=repair).observe(after - before)
        return rv

    def fail(failure):
        # Log and consume the failure: a LoopingCall stops rescheduling when
        # the Deferred returned by its function fails, so one bad check
        # would otherwise end all future checks.
        from twisted.python import log
        log.err(failure, "Tahoe check failed")

    # The original defined these callbacks but never attached them, so no
    # response was ever parsed or measured.
    d.addCallback(jsonify)
    d.addCallback(measure)
    d.addErrback(fail)
    return d
def main(reactor):
    """Serve Prometheus metrics over HTTP and start the periodic check.

    Publishes ``MetricsResource`` at ``/metrics`` on TCP port 8000, then
    runs ``checkTahoe(repair=False)`` immediately and every 60 seconds
    thereafter.

    NOTE(review): the signature (takes the reactor, returns a Deferred)
    fits ``twisted.internet.task.react``, which this file imports, but no
    call to ``react`` is visible here -- confirm the entry point exists.

    :param reactor: the reactor used to open the listening port.
    :return: the Deferred returned by ``LoopingCall.start``.
    """
    root = Resource()
    root.putChild(b'metrics', MetricsResource())
    factory = Site(root)
    port = reactor.listenTCP(8000, factory)
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("Listening on port %s" % (port.getHost(),))
    lc = LoopingCall(checkTahoe, repair=False)
    return lc.start(60, now=True)
