Skip to content

Instantly share code, notes, and snippets.

@zhw12
Created April 1, 2021 19:13
Show Gist options
  • Save zhw12/16616cf69e63cbfe38c37b854f2ea2f5 to your computer and use it in GitHub Desktop.
Save zhw12/16616cf69e63cbfe38c37b854f2ea2f5 to your computer and use it in GitHub Desktop.
GPU info logging and scheduling
"""GPU info logging and scheduling"""
import gpustat
import time
import argparse
import json
import logging
# Configure the root logger once at import time: emit INFO and above,
# formatted as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
def _truncate_log(path, keep):
    """Rewrite the file at *path* so it holds only its last *keep* lines.

    Returns the number of lines that were kept.
    """
    with open(path, 'r') as fin:
        lines = fin.readlines()
    # lines[-0:] would select every line, so a non-positive keep is
    # handled explicitly instead of relying on the slice.
    kept = lines[-keep:] if keep > 0 else []
    with open(path, 'w') as fout:
        fout.writelines(kept)
    return len(kept)


def log(params):
    """Background job that appends gpustat snapshots to a JSON-lines file.

    Runs forever: every ``params.logging_every`` seconds it queries gpustat,
    appends one JSON record (``query_time`` and ``gpus``) to
    ``params.logging_file`` and logs the same record at INFO level.  Once
    more than ``params.max_records`` records have been written since the
    start (or since the previous truncation), the file is cut back to its
    last ``params.min_records`` lines.

    Args:
        params: namespace-like object providing ``logging_file``,
            ``logging_every``, ``max_records`` and ``min_records``.
    """
    steps = 0  # records written by this run since start / last truncation
    while True:
        res = gpustat.new_query().jsonify()
        # query_time is stringified so the record can be dumped as JSON
        logging_record = {'query_time': str(res['query_time']),
                          'gpus': res['gpus']}
        with open(params.logging_file, 'a') as fout:
            fout.write(json.dumps(logging_record) + '\n')  # each line is a subset of gpustat
        steps += 1

        if steps > params.max_records:
            # Truncate from the file's own lines on every pass; no record
            # list is carried over between truncations, so nothing stale
            # accumulates in memory.
            steps = _truncate_log(params.logging_file, params.min_records)

        logging.info("%s", logging_record)
        time.sleep(params.logging_every)
def do_process():
    """Placeholder for the job to launch once a GPU is available; does nothing."""
    return None
def check_availability(params):
    """Poll gpustat until some GPU meets the availability criterion.

    Each check queries every GPU; as soon as one reports ``memory.used``
    below 1000, ``do_process()`` is called and the function returns at once.
    After an unsuccessful check it waits 1, 2, 5, 10 and then 30 minutes,
    repeating the 30-minute wait from there on.

    Args:
        params: namespace-like object.  ``params.max_tries`` is the maximum
            number of checks to perform; if the attribute is missing or
            ``None`` the function keeps checking until a GPU is available.

    Returns:
        None.
    """
    time_gaps = [1, 2, 5, 10, 30]  # minutes to wait after the n-th failed check

    max_tries = getattr(params, 'max_tries', None)
    if max_tries is None:
        max_tries = float('inf')

    # The try counter is kept separate from the schedule index so that
    # max_tries still bounds the loop once the schedule has reached its
    # last (repeating) entry.
    num_try = 0
    while num_try < max_tries:
        res = gpustat.new_query().jsonify()
        # write your available criteria here
        if any(gpu_info['memory.used'] < 1000 for gpu_info in res['gpus']):  # M
            do_process()  # fork a process and exit checking program
            return

        num_try += 1
        if num_try >= max_tries:
            break  # no point in waiting after the final check

        gap_index = min(num_try - 1, len(time_gaps) - 1)  # repeat at 30 mins
        time.sleep(time_gaps[gap_index] * 60)  # schedule is in minutes, sleep takes seconds
# Command-line entry point: --log runs the logging job, --check runs the
# availability poller; --log takes precedence when both are given.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--logging_every", type=int, default=60,
                        help="logging every x seconds")
    parser.add_argument("--logging_file", type=str, default="/tmp/gpuinfo_log.jsonl",
                        help="logging file")
    parser.add_argument("--max_records", type=int, default=1000,
                        help="max records")
    parser.add_argument("--min_records", type=int, default=500,
                        help="truncate records to x lines")
    # check_availability() reads params.max_tries, so it must be defined here
    parser.add_argument("--max_tries", type=int, default=100,
                        help="max number of availability checks for --check")
    parser.add_argument('--check', default=False, action='store_true',
                        help='check availability')
    parser.add_argument('--log', default=False, action='store_true',
                        help='log gpustat records to the logging file')
    params = parser.parse_args()

    if params.log:
        log(params)
    elif params.check:
        check_availability(params)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment