Skip to content

Instantly share code, notes, and snippets.

@aadityabhatia
Last active February 27, 2024 14:05
Show Gist options
  • Save aadityabhatia/50574836c727a1add565c7908e22cb98 to your computer and use it in GitHub Desktop.
Save aadityabhatia/50574836c727a1add565c7908e22cb98 to your computer and use it in GitHub Desktop.
Monitor squeue and send a notification when a node is allocated.
import subprocess
import requests
import sys
from datetime import datetime
import time
import random
import signal
# Inclusive lower/upper bounds, in seconds, of the randomized pause between
# two squeue polls. The values are arbitrarily chosen prime numbers.
SLEEP_MIN, SLEEP_MAX = 127, 157
def get_slurm_status_status(username):
    """Return the current squeue status lines for *username* as a set.

    Runs ``squeue -u <username> -h -o "%i %N %T"`` (no header; job id,
    node list and job state per line) and returns one whitespace-trimmed
    string per non-empty output line. Duplicate lines collapse because the
    result is a set.

    Raises:
        subprocess.CalledProcessError: if squeue exits with a non-zero status.
        FileNotFoundError: if the squeue executable cannot be found.
    """
    # The command is passed as an argv list (no shell), so the format string
    # must NOT be wrapped in extra quote characters: squeue would receive the
    # quotes literally and echo them on every output line.
    raw = subprocess.check_output(
        ['squeue', '-u', username, '-h', '-o', '%i %N %T']).decode('utf-8')
    # Trim each line and drop blank ones (e.g. when the user has no jobs).
    stripped = (line.strip() for line in raw.splitlines())
    return {line for line in stripped if line}
def send_notification(post_url, message, timeout=30):
    """Send *message* as the body of an HTTP POST to *post_url*.

    The request carries fixed ``Title``, ``Priority`` and ``Tags`` headers.
    NOTE(review): these look like ntfy-style headers -- confirm against the
    service that receives the POST.

    Args:
        post_url: URL the notification is posted to.
        message: Notification text (``str``) or an already-encoded body.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    # Encode explicitly so non-ASCII text is always sent as UTF-8 instead of
    # depending on how the underlying HTTP stack encodes str bodies.
    body = message.encode("utf-8") if isinstance(message, str) else message
    response = requests.post(
        post_url,
        data=body,
        headers={
            "Title": "Slurm Status Update",
            "Priority": "max",
            "Tags": "computer",
        },
        # Without a timeout requests waits indefinitely, which would stall
        # the whole monitoring loop on an unresponsive server.
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(
            f"Failed HTTP POST with {response.status_code}")
def monitor_slurm_status(username, post_url):
    """Poll squeue forever and report changes in *username*'s status lines.

    Each cycle compares the current set of status lines with the previous
    one. Added and removed lines are printed with a timestamp. When at least
    one added line contains "RUNNING", all added lines are sent as a single
    notification to *post_url*.

    Transient failures do not stop the monitor:
      * if squeue exits non-zero, the previous snapshot is kept and the poll
        is retried on the next cycle;
      * if sending the notification fails, the added lines are not recorded
        as seen, so they are detected and sent again on the next cycle.

    This function never returns normally.
    """
    previous_status = set()
    while True:
        try:
            current_status = get_slurm_status_status(username)
        except subprocess.CalledProcessError as err:
            # squeue ran but reported an error; treat it as "no change" and
            # try again after the usual pause.
            print(f"{datetime.now()} squeue failed: {err}")
            current_status = previous_status

        # Check for changes in the status lines
        added_status = current_status - previous_status
        removed_status = previous_status - current_status

        if added_status:
            print(f"{datetime.now()} Added: {added_status}")
            # Sets are unordered; sort so the message text is deterministic.
            message = "; ".join(sorted(added_status))
            # send a notification only if any of the added lines contain "RUNNING"
            if any("RUNNING" in line for line in added_status):
                try:
                    send_notification(post_url, message)
                except Exception as err:
                    # Keep monitoring; forget the unsent lines so the next
                    # cycle sees them as new and retries the notification.
                    print(f"{datetime.now()} Notification failed: {err}")
                    current_status = current_status - added_status
                else:
                    print(f"{datetime.now()} Notification sent: {message}")

        if removed_status:
            print(f"{datetime.now()} Removed: {removed_status}")

        # Remember this snapshot for the next comparison
        previous_status = current_status

        # sleep for a random interval between SLEEP_MIN and SLEEP_MAX seconds
        time.sleep(random.randint(SLEEP_MIN, SLEEP_MAX))
if __name__ == '__main__':
    # Both positional arguments are required; fail with a usage message
    # instead of an IndexError traceback when they are missing.
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <username> <post_url>", file=sys.stderr)
        sys.exit(2)

    # first argument is the username to monitor
    username = sys.argv[1]
    # second argument is the URL to send the notification
    post_url = sys.argv[2]

    # trap SIGINT so Ctrl-C exits cleanly with status 0 instead of a traceback
    def signal_handler(sig, frame):
        print("Exiting...")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    monitor_slurm_status(username, post_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment