Skip to content

Instantly share code, notes, and snippets.

@koyo922
Created September 7, 2019 14:14
Show Gist options
  • Save koyo922/db8548e15484675965c5aef99056d3ee to your computer and use it in GitHub Desktop.
Save koyo922/db8548e15484675965c5aef99056d3ee to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 expandtab number
"""
每15钟检测一次 如果发现开机太久且GPU空闲,就告警
有个免费的运维告警平台,参见 https://caweb.aiops.com/#/integrate 下面的REST API集成
弄完后添加开机自启动
chmod 755 alert_gpu_idle.py
sudo vim /etc/rc.local # 将此脚本的绝对路径写进去
"""
from __future__ import unicode_literals
import subprocess
import sys
import time
import json
import logging
# `requests` is the only third-party dependency. If it is missing, install it
# on the fly and retry the import.
try:
    import requests
except ImportError:
    # Run pip through the interpreter that is executing this script; a bare
    # `pip` found on PATH may install into a different Python environment,
    # in which case the retried import below would still fail.
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'requests'])
    import requests
# Tunables of the idle-GPU watchdog.
ALERT_URL = 'http://api.aiops.com/alert/api/event'
CHECK_INTERVAL_SECONDS = 15 * 60  # pause between two consecutive checks
MIN_UPTIME_SECONDS = 2 * 3600  # never alert during the first two hours after boot
IDLE_USAGE_THRESHOLD = 0.7  # mean utilisation (1.0 == 100%) below which GPUs count as idle
REQUEST_TIMEOUT_SECONDS = 30  # keeps a stalled alert endpoint from blocking the loop forever


def average_gpu_usage(nv_output, num_gpus=None):
    """Return the mean GPU utilisation as a fraction (1.0 means 100%).

    Args:
        nv_output: Output of ``nvidia-smi --query-gpu=utilization.gpu
            --format=csv,noheader,nounits`` as bytes or text, holding one
            percentage per line.
        num_gpus: Number of GPUs to average over. When ``None`` the number of
            readings found in ``nv_output`` is used, so the result is correct
            whatever the number of installed cards.

    Raises:
        ValueError: If there is nothing to average over, or if a reading is
            not a number.
    """
    if isinstance(nv_output, bytes):
        # subprocess.check_output returns bytes on Python 3.
        nv_output = nv_output.decode('ascii')
    readings = [float(line) for line in nv_output.splitlines() if line.strip()]
    gpu_count = len(readings) if num_gpus is None else num_gpus
    if gpu_count <= 0:
        raise ValueError('no GPU utilisation readings to average')
    return sum(readings) / (100.0 * gpu_count)


def read_uptime_seconds(path='/proc/uptime'):
    """Return the first field of ``path`` (seconds since boot) as a float."""
    with open(path, 'rt') as uptime_file:  # closed deterministically on every check
        return float(uptime_file.readline().split()[0])


def should_alert(avg_usage, up_seconds):
    """Tell whether the GPUs are idle on a machine that has been up long enough."""
    return avg_usage < IDLE_USAGE_THRESHOLD and up_seconds > MIN_UPTIME_SECONDS


def send_alert():
    """POST the idle-GPU event to the alert platform.

    Returns:
        ``True`` when the platform answered with a success status, else ``False``.

    Raises:
        requests.RequestException: On connection errors or when the request
            exceeds ``REQUEST_TIMEOUT_SECONDS``.
    """
    payload = json.dumps({
        # Placeholder value: replace it with the appKey created on the alert platform.
        "app": "7e79d4b9-43c7-你在睿智云上创建的appKey", "eventId": "0",
        "eventType": "trigger",
        "priority": 1,
        "alarmName": "AWS上的GPU闲着啦",
        "alarmContent": {"k1": "开机超过2小时且每块GPU平均使用率低于70%", "k2": 0.00},
    }, ensure_ascii=True)  # the request body must be a JSON string, hence json.dumps
    response = requests.post(ALERT_URL, headers={'Content-type': 'application/json'},
                             data=payload, timeout=REQUEST_TIMEOUT_SECONDS)
    return response.ok


def check_once(num_gpus=None):
    """Run one check: query the GPUs and the uptime, then alert or log the state.

    Args:
        num_gpus: Optional GPU count forwarded to ``average_gpu_usage``.
    """
    nv_output = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'])
    avg_usage = average_gpu_usage(nv_output, num_gpus)
    up_seconds = read_uptime_seconds()
    if should_alert(avg_usage, up_seconds):
        # Report "sent" only once the platform has actually accepted the event.
        if send_alert():
            logging.warning('[ALERT] sent')
        else:
            logging.error('[ALERT] rejected by the alert platform')
    else:
        logging.warning('[SAFE] avg_gpu_usage=%.2f up_seconds=%d seconds', avg_usage, up_seconds)


def main(argv=None):
    """Check the GPUs every ``CHECK_INTERVAL_SECONDS`` seconds, forever.

    Args:
        argv: Command-line arguments, defaulting to ``sys.argv``. An optional
            first argument gives the number of GPUs to average over; without
            it the count is taken from the nvidia-smi output.
    """
    argv = sys.argv if argv is None else argv
    num_gpus = int(argv[1]) if len(argv) >= 2 else None
    while True:
        try:
            check_once(num_gpus)
        except Exception:
            # Top-level boundary of a boot-time daemon: a transient nvidia-smi
            # or network failure must not end the monitoring for good.
            logging.exception('[ERROR] GPU idle check failed, retrying at the next interval')
        time.sleep(CHECK_INTERVAL_SECONDS)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment