Skip to content

Instantly share code, notes, and snippets.

@guyskk
Last active May 11, 2023 12:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guyskk/594f2c4eb9ac665a40e862f4f08240ac to your computer and use it in GitHub Desktop.
Save guyskk/594f2c4eb9ac665a40e862f4f08240ac to your computer and use it in GitHub Desktop.
Monitor system memory and execute a recovery command when the system runs out of memory (OOM)
# systemd service unit that runs memory-monitor.py as a long-lived process.
[Unit]
Description=memory-monitor-jupyterhub
After=network.target
# 0 disables systemd's start rate limiting, so repeated restarts are never blocked.
StartLimitIntervalSec=0
[Service]
Type=simple
# Restart only after an unclean exit, waiting 10 seconds before each attempt.
Restart=on-failure
RestartSec=10
User=root
# Arguments are <FREE-MB> (1024 here) and <RECOVER-CMD>, the shell command the
# script runs each time its memory-probing worker process exits.
ExecStart=/usr/bin/python3 /root/opt/memory-monitor.py 1024 'bash /root/opt/recover-jupyterhub.sh'
[Install]
WantedBy=multi-user.target
import array
import logging
import multiprocessing
import os
import sys
import time
# Log line layout: "[LEVEL] timestamp L<line number> message".
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
LOG_FORMAT = "[%(levelname)s] %(asctime)s L%(lineno)d %(message)s"

# Module-level logger used by every function in this script.
LOG = logging.getLogger(__name__)

# Configure the root logger once at import time; INFO and above are emitted.
logging.basicConfig(level=logging.INFO, datefmt=LOG_DATE_FORMAT, format=LOG_FORMAT)
def allocate_memory(mb):
    """Return a signed-byte array that occupies ``mb`` mebibytes.

    ``mb`` may be fractional; the byte count is truncated to an integer.
    Every element is the byte value of ``b'x'``. ``MemoryError`` propagates
    to the caller when the allocation cannot be satisfied.
    """
    byte_count = int(1024 * 1024 * mb)
    one_byte = array.array('b', b'x')
    # Repeating a one-element array builds the full-size buffer in one step.
    return one_byte * byte_count
def memory_worker(free_mb):
    """Probe whether about ``free_mb`` MB of memory can still be allocated.

    Intended to run in the child process started by ``check_memory``. Half of
    ``free_mb`` is allocated up front and held for the life of the process;
    the other half is allocated over and over in ten chunks of
    ``free_mb / 20`` MB each.

    The function does not return normally. It exits the process with status 1
    when the initial reservation fails and with status 2 when a later probe
    allocation fails.
    """
    os.nice(19)  # lower the process priority
    try:
        reserved = allocate_memory(free_mb / 2)
    except MemoryError:
        LOG.warning('memory monitor worker reserve memory failed!')
        sys.exit(1)
    reserved_mb = int(len(reserved) / 1024 / 1024)
    LOG.info('memory monitor worker reserved memory {}MB'.format(reserved_mb))
    chunks = []
    try:
        while True:
            # Rebinding drops the previous round's chunks before probing again.
            chunks = []
            for _ in range(10):
                chunks.append(allocate_memory(free_mb / 20))
                # NOTE(review): pausing after every chunk makes usage ramp up
                # over ~10 seconds per round; confirm this is the intended
                # nesting of the sleep.
                time.sleep(1)
            LOG.info('memory monitor worker check memory OK')
    except MemoryError:
        # Release everything this process holds before logging, so that
        # emitting the warning cannot itself fail for lack of memory.
        chunks.clear()
        del reserved
        LOG.warning('memory monitor worker out of memory!')
        sys.exit(2)
def check_memory(free_mb):
    """Run ``memory_worker`` in a child process and block until it exits.

    The worker's exit code is logged once it has terminated. If waiting is
    interrupted by an exception (for example ``KeyboardInterrupt``), the
    worker is terminated and reaped before the exception propagates, so no
    orphaned process keeps holding the reserved memory.
    """
    proc = multiprocessing.Process(
        target=memory_worker,
        args=(free_mb,),
        name='memory-monitor-worker',
    )
    proc.start()
    try:
        # Process.join() returns None on timeout rather than raising, so the
        # only reliable way to tell whether the worker finished is to poll.
        while proc.is_alive():
            proc.join(timeout=1)
    finally:
        if proc.is_alive():
            proc.terminate()
        proc.join()
    LOG.info('memory-monitor-worker exited {}'.format(proc.exitcode))
def execute_recover(cmd):
    """Run the recovery command through the shell and log its exit code.

    ``cmd`` is a complete shell command line taken from the script's
    command-line arguments, so it is deliberately executed with a shell.
    """
    import subprocess  # local import: only this function needs it

    LOG.info('execute recover {!r}'.format(cmd))
    # subprocess.call returns the command's real exit code (negative when it
    # was killed by a signal). os.system returns the raw wait status on Unix,
    # which would log an exit of 1 as 256.
    code = subprocess.call(cmd, shell=True)
    LOG.info('execute recover exit code {}'.format(code))
def main():
    """Command-line entry point: ``memory-monitor <FREE-MB> <RECOVER-CMD>``.

    Prints a usage line and returns when the argument count is wrong.
    Otherwise loops forever: run the memory check, run the recovery command
    once the check returns, then wait 10 seconds before starting over.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print('Usage: memory-monitor <FREE-MB> <RECOVER-CMD>')
        return
    raw_free_mb, recover_cmd = cli_args
    free_mb = int(raw_free_mb)
    LOG.info('free_mb={} recover_cmd={!r}'.format(free_mb, recover_cmd))
    while True:
        # check_memory only returns once its worker process has exited.
        check_memory(free_mb)
        execute_recover(recover_cmd)
        time.sleep(10)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment