Created
February 1, 2017 18:14
-
-
Save MostAwesomeDude/2fed99e5cf630639df42df7997e12d1e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env nix-shell | |
#! nix-shell -i python -p pythonPackages.treq pythonPackages.twisted pythonPackages.prometheus_client | |
import os | |
import time | |
from twisted.internet import reactor | |
from twisted.internet.task import LoopingCall, deferLater, react | |
from twisted.web.client import Agent | |
from twisted.web.server import Site | |
from twisted.web.resource import Resource | |
from treq import post | |
from prometheus_client import Gauge, Histogram | |
from prometheus_client.twisted import MetricsResource | |
# Handed to the HTTP Agent below as its connectTimeout (seconds).
CONNECT_TIMEOUT = 7

# Identifier appended to the local gateway's /uri/ path by checkTahoe().
# Read once at import time; a missing VERIFIER_URI raises KeyError here.
URI = os.environ["VERIFIER_URI"]

# Prometheus metrics exported by this process. The per-check metrics are
# labelled by whether the check was "deep" and whether it was a "repair" run.
LAST_CHECK = Gauge(
    "verifier_last_check",
    "Moment of last check (timestamp)",
    ["deep", "repair"],
)
CHECK_LATENCY = Histogram(
    "verifier_check_latency",
    "Check latency (seconds)",
    ["deep", "repair"],
)
AVAILABLE = Gauge(
    "verifier_available",
    "Whether the grid is available (bool)",
)
SHARES = Gauge("verifier_shares", "Number of Shares", ["type"])

# Single shared Agent so every request uses the same connect timeout.
agent = Agent(reactor, connectTimeout=CONNECT_TIMEOUT)
def checkTahoe(repair):
    """
    Run one check of URI against the local gateway and record metrics.

    POSTs a ``t=check`` request (JSON output) for ``URI`` to
    ``http://localhost:3456/uri/``, then updates the share-count gauges,
    the last-check timestamp, the availability gauge and the latency
    histogram from the response.

    If a plain (non-repair) check reports the object as unhealthy, a
    follow-up call with ``repair=True`` is scheduled, and this call's
    Deferred waits on it.

    :param repair: when true, add ``repair=true`` to the request and read
        the ``post-repair-results`` section of the response instead of
        ``results``.
    :return: a Deferred that fires once the check (and any chained repair)
        has been processed. Failures are absorbed by the final errback, so
        the Deferred never fires with a failure.
    """
    # To try to reduce coordinated omission, we start tracking time
    # immediately and record it as late as possible.
    before = time.time()

    # XXX urlparse
    params = "?t=check&output=json"
    if repair:
        params += "&repair=true"
    d = post("http://localhost:3456/uri/" + URI + params, agent=agent)

    @d.addCallback
    def jsonify(response):
        # Decode the response body as JSON.
        return response.json()

    @d.addCallback
    def measure(json):
        # A repair request reports the state after repairing under a
        # different key than a plain check does.
        if repair:
            results = json["post-repair-results"]
        else:
            results = json["results"]
        corruptShares = results["count-corrupt-shares"]
        wrongShares = results["count-wrong-shares"]
        healthy = results["healthy"]
        # NOTE: results["recoverable"] is also present but currently unused.

        rv = None
        if not repair and not healthy:
            # deferLater's signature is (clock, delay, callable, ...). The
            # delay must be given explicitly; without it checkTahoe would be
            # taken as the delay and the repair pass would never be run.
            rv = deferLater(reactor, 0, checkTahoe, repair=True)

        SHARES.labels(type="corrupt").set(corruptShares)
        SHARES.labels(type="wrong").set(wrongShares)
        LAST_CHECK.labels(deep=False, repair=repair).set_to_current_time()
        AVAILABLE.set(1)

        # As late as possible.
        after = time.time()
        CHECK_LATENCY.labels(deep=False, repair=repair).observe(after - before)
        # Returning the repair Deferred (if any) chains it onto this one.
        return rv

    @d.addErrback
    def fail(failure):
        # Deliberately best-effort: any failure (request error, unexpected
        # response shape, ...) is reported as "grid unavailable" and then
        # absorbed, so the periodic LoopingCall in main() is not stopped by
        # a failed Deferred.
        AVAILABLE.set(0)

    return d
def main(reactor):
    """
    Serve the Prometheus metrics and start the periodic check loop.

    Publishes the metrics at ``/metrics`` on TCP port 8000, then runs
    ``checkTahoe(repair=False)`` immediately and every 60 seconds after.

    :param reactor: the reactor supplied by ``twisted.internet.task.react``.
    :return: the Deferred returned by ``LoopingCall.start``; ``react`` keeps
        the process running until it fires.
    """
    root = Resource()
    root.putChild(b'metrics', MetricsResource())
    factory = Site(root)
    port = reactor.listenTCP(8000, factory)
    # Single-argument print() call: same output on Python 2 and Python 3,
    # unlike the Python 2-only print statement.
    print("Listening on port %s" % (port.getHost(),))

    lc = LoopingCall(checkTahoe, repair=False)
    return lc.start(60, now=True)


react(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment