Skip to content

Instantly share code, notes, and snippets.

@walac
Created August 22, 2017 13:18
Show Gist options
  • Save walac/bddbca20afe84611336dba808c75c8e7 to your computer and use it in GitHub Desktop.
Save walac/bddbca20afe84611336dba808c75c8e7 to your computer and use it in GitHub Desktop.
import os
import sys
from datetime import datetime
import psycopg2
import pytz
# Report test workers that look dead.
#
# For every worker id "t-yosemite-r7-NNNN" with NNNN in [LOWER, UPPER) the most
# recently modified row of the `tasks` table is inspected:
#   * no row at all                       -> "<worker> dead forever"
#   * state 'running', started too long ago   -> "<worker> dead for <h> hours"
#   * any other state, resolved too long ago  -> "<worker> dead for <h> hours"
# "Too long ago" means more than THRESHOLD hours.
#
# Usage: <script> LOWER UPPER THRESHOLD
#   LOWER, UPPER  integers; UPPER is exclusive (plain range() semantics)
#   THRESHOLD     number of hours, int or float
# The PostgreSQL connection string is read from the PGCONNECTIONSTRING
# environment variable.
if len(sys.argv) < 4:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: {} LOWER UPPER THRESHOLD".format(sys.argv[0]))

lower = int(sys.argv[1])
upper = int(sys.argv[2])
try:
    threshold = int(sys.argv[3])
except ValueError:
    # Not an integer literal; accept fractional hours such as "1.5".
    threshold = float(sys.argv[3])

workers = ["t-yosemite-r7-{0:04d}".format(x) for x in range(lower, upper)]

# NOTE(review): this counter is maintained but never reported anywhere in the
# visible script; kept so that any later code relying on it still works.
total_broken_workers = 0

conn = psycopg2.connect(os.environ['PGCONNECTIONSTRING'])
try:
    cur = conn.cursor()
    try:
        for worker in workers:
            cur.execute("select state, started, resolved from tasks where worker_id = %s order by modified desc limit 1", (worker,))

            # The query is limited to one row, so fetch it directly.
            row = cur.fetchone()
            if row is None:
                total_broken_workers += 1
                print("{} dead forever".format(worker))
                continue

            state, started, resolved = row

            # A running task is measured from its start; if the task is not
            # running, assume that it has been resolved and measure from then.
            if state == 'running':
                reference, column = started, 'started'
            else:
                reference, column = resolved, 'resolved'

            if reference is None:
                # The assumption above does not hold for this row (timestamp
                # is NULL), so the elapsed time cannot be computed. Warn on
                # stderr and keep going rather than aborting the whole report.
                sys.stderr.write(
                    "{}: latest task is '{}' but has no {} timestamp; skipped\n".format(
                        worker, state, column))
                continue

            # NOTE(review): presumably the timestamps come back as naive UTC
            # datetimes; subtracting a tz-aware value from utcnow() would
            # raise TypeError -- confirm against the column types.
            elapsed = (datetime.utcnow() - reference).total_seconds() / 3600.0
            if elapsed > threshold:
                total_broken_workers += 1
                print("{} dead for {} hours".format(worker, elapsed))
    finally:
        cur.close()
finally:
    # Close the connection even when creating or using the cursor failed.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment