Skip to content

Instantly share code, notes, and snippets.

@lukauskas
Last active August 17, 2018 10:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lukauskas/8acae8ada7fcc7c17c0dd3a115aef556 to your computer and use it in GitHub Desktop.
Save lukauskas/8acae8ada7fcc7c17c0dd3a115aef556 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Wrapper around qstat: list jobs for all users (-u '*') with the extended
# column set (-ext). The Python monitor invokes this script and parses its
# standard output.
qstat -u '*' -ext
#!/usr/bin/env python
from sh import qhost, Command
import io
import re
import humanfriendly
import logging
import logging.config
import time
import os
# Absolute path of the directory containing this script.
here = os.path.abspath(os.path.dirname(__file__))
# Callable wrapper (sh.Command) around the 'full_qstat.sh' script that is
# expected to sit next to this file.
qstat = Command(os.path.join(here, 'full_qstat.sh'))
# NOTE(review): 'logging.conf' is a relative path, so it is resolved against
# the current working directory rather than against `here` -- confirm this
# is intended.
logging.config.fileConfig('logging.conf')
logger = logging.getLogger('sge_monitoring')
def _cpu_time_to_seconds(cpu):
    """Convert a colon-separated CPU time string into a number of seconds.

    Fields are read right to left as seconds, minutes, hours and days, so
    shorter strings (e.g. ``'12:34'``) are accepted; fields beyond the fourth
    are ignored.

    Raises:
        ValueError: if any field is not an integer.
    """
    multipliers = (1, 60, 60 * 60, 24 * 60 * 60)
    fields = [int(field) for field in cpu.split(':')]
    return sum(mult * value for mult, value in zip(multipliers, reversed(fields)))


def get_qstat_output():
    """Run the qstat wrapper and log one 'SGE task report' record per job.

    The first two lines of the output are treated as a header and skipped;
    blank lines are ignored. Every remaining line is split on whitespace and
    its fields are read by position. Each job is reported through
    ``logger.info`` with an ``extra`` dict holding:

    * ``task``, ``user``, ``state_raw`` -- fields 3, 4 and 7 of the line;
    * ``state`` -- ``'running'`` (raw ``r``), ``'waiting'`` (raw ``qw``),
      ``'errored'`` (raw state contains ``E``) or ``'other'``;
    * ``slots`` -- always present; ``queue`` -- only for lines that carry a
      queue column;
    * ``avg_mem``, ``avg_io``, ``avg_mem_per_slot``, ``avg_io_per_slot`` --
      only for running jobs that report a non-zero CPU time.

    Returns:
        None. The function only emits log records.

    Raises:
        IndexError, ValueError: if a line has fewer fields than expected or a
            numeric field cannot be parsed.
    """
    buf = io.StringIO()
    qstat(_out=buf)
    # Skip the two header lines.
    rows = buf.getvalue().split('\n')[2:]

    for row in rows:
        # str.split() with no argument splits on runs of whitespace and
        # ignores leading/trailing whitespace, so blank rows yield [].
        line = row.split()
        if not line:
            continue

        task = line[3]
        user = line[4]
        state_raw = line[7]

        if state_raw == 'r':
            state = 'running'
        elif state_raw == 'qw':
            state = 'waiting'
        elif 'E' in state_raw:
            state = 'errored'
        else:
            state = 'other'

        extras = {
            'task': task,
            'user': user,
            'state_raw': state_raw,
            'state': state,
        }

        if state == 'running':
            cpu = line[8]
            queue = line[17]
            slots = int(line[18])
            extras['queue'] = queue
            extras['slots'] = slots

            if cpu != 'NA':
                elapsed_cpu_seconds = _cpu_time_to_seconds(cpu)
                # A job that has not accumulated any CPU time yet (or reports
                # zero slots) has no meaningful averages; skip them instead of
                # dividing by zero and crashing the whole monitoring loop.
                if elapsed_cpu_seconds > 0 and slots > 0:
                    mem = float(line[9])
                    io_counter = float(line[10])

                    # Usage divided by elapsed CPU seconds; the original
                    # author annotated the memory average as "Gb".
                    avg_mem = mem / elapsed_cpu_seconds
                    avg_io = io_counter / elapsed_cpu_seconds

                    extras['avg_mem'] = avg_mem
                    extras['avg_io'] = avg_io
                    extras['avg_mem_per_slot'] = avg_mem / slots
                    extras['avg_io_per_slot'] = avg_io / slots
        elif state == 'other' and len(line) > 15:
            # Long lines are assumed to carry the same queue/slots columns as
            # running jobs.
            queue = line[17]
            slots = int(line[18])
            extras['queue'] = queue
            extras['slots'] = slots
        else:
            # Shorter lines (no usage/queue columns) have slots at field 14.
            slots = int(line[14])
            extras['slots'] = slots

        logger.info(f'SGE task report {task} {user} {state_raw} slots={slots}',
                    extra=extras)
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def get_qhost_output():
    """Run ``qhost -q`` and log one aggregated 'SGE status' record.

    The first three output lines are skipped. The remainder is consumed in
    pairs: a host line followed by one queue line. Parsing stops at the first
    blank host line. Hosts whose queue state is ``d`` or ``au`` are excluded
    from all totals.

    The log record's ``extra`` dict contains the totals (nodes, cpu, load,
    slots, reserved/used/free slots, memory, used memory, swap, used swap)
    and the ratios ``mem_pct``, ``swap_pct`` and ``slots_pct``. A ratio whose
    denominator is zero (e.g. a cluster without swap, or no enabled hosts)
    is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Returns:
        None. The function only emits log records.

    Raises:
        IndexError, ValueError: if a line has fewer fields than expected or a
            numeric field cannot be parsed. Size fields are parsed by
            ``humanfriendly.parse_size``, which raises its own error on
            invalid input.
    """
    buf = io.StringIO()
    qhost('-q', _out=buf)
    # Skip the first three lines of the report.
    it = iter(buf.getvalue().split('\n')[3:])

    total_load = 0
    total_memory = 0
    total_used = 0
    total_swap = 0
    total_used_swap = 0
    total_slots = 0
    total_reserved_slots = 0
    total_used_slots = 0
    total_nodes = 0
    total_cpu = 0

    for firstline in it:
        if firstline.strip() == '':
            break

        # Each host line is expected to be followed by exactly one queue
        # line; tolerate truncated output instead of leaking StopIteration.
        secondline = next(it, None)
        if secondline is None:
            logger.warning('qhost output ended before the queue line of a host')
            break

        # Not stripped on purpose: the field indices below rely on the queue
        # line's leading whitespace producing an empty field at index 0.
        firstline = re.split(r'\s+', firstline)
        secondline = re.split(r'\s+', secondline)

        # The state column may be missing entirely when the queue has no
        # state flags and the line carries no trailing whitespace.
        queue_state = secondline[4] if len(secondline) > 4 else ''
        if queue_state in ('d', 'au'):
            continue

        # Slots field is three '/'-separated integers, read here as
        # reserved/used/total.
        slots = secondline[3].split('/')
        total_reserved_slots += int(slots[0])
        total_used_slots += int(slots[1])
        total_slots += int(slots[2])

        total_nodes += 1
        total_cpu += int(firstline[2])

        try:
            total_load += float(firstline[3])
        except ValueError:
            # Non-numeric load values contribute nothing to the total.
            pass

        total_memory += humanfriendly.parse_size(firstline[4])
        total_used += humanfriendly.parse_size(firstline[5])
        total_swap += humanfriendly.parse_size(firstline[6])
        total_used_swap += humanfriendly.parse_size(firstline[7])

    total_free_slots = total_slots - total_used_slots - total_reserved_slots

    mem_pct = _safe_ratio(total_used, total_memory)
    swap_pct = _safe_ratio(total_used_swap, total_swap)
    slots_pct = _safe_ratio(total_reserved_slots + total_used_slots, total_slots)

    extras = dict(
        total_nodes=total_nodes,
        total_cpu=total_cpu,
        total_load=total_load,
        total_slots=total_slots,
        total_reserved_slots=total_reserved_slots,
        total_used_slots=total_used_slots,
        total_free_slots=total_free_slots,
        total_memory=total_memory,
        total_used_memory=total_used,
        total_swap=total_swap,
        total_used_swap=total_used_swap,
        mem_pct=mem_pct,
        swap_pct=swap_pct,
        slots_pct=slots_pct,
    )

    logger.info('SGE status: {slots_pct:.2%} slots used, {mem_pct:.2%} mem used, {swap_pct:.2%} swap used'.format(**extras), extra=extras)
if __name__ == '__main__':
    # Poll the cluster forever: report host totals first, then per-job
    # details, and wait ten minutes before the next round.
    poll_interval_seconds = 10 * 60
    while True:
        get_qhost_output()
        get_qstat_output()
        time.sleep(poll_interval_seconds)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment