Skip to content

Instantly share code, notes, and snippets.

@microlinux
Last active May 2, 2018 09:54
Show Gist options
  • Save microlinux/0b3f47cb657cb61bb89b to your computer and use it in GitHub Desktop.
Save microlinux/0b3f47cb657cb61bb89b to your computer and use it in GitHub Desktop.
OpenVZ container abusive process killer
#!/usr/bin/python
"""
nfcpud: a daemon for killing abusive container processes on openvz nodes
requires: python daemon module (centos: yum install python-daemon)
nfcpud measures average process cpu usage over 5 second intervals. based on
parameters defined in config files, processes are killed if they meet any of 3
criteria. processes belonging to the node itself are exempted.
cpu usage is measured relative to total available cpu power. if a process is
measured at 50% cpu usage, the remaining 50% could be either idle or used by
other processes.
to run, execute nfcpud.py
to install, create the following directory structure:
/opt/nforge/
/opt/nforge/bin/
/opt/nforge/etc/nfcpud/
/opt/nforge/log/
/opt/nforge/run/
activity is logged to /opt/nforge/log/nfcpud.log (don't forget to rotate!)
create banned.conf and nfcpud.conf in /opt/nforge/etc/nfcpud/
place nfcpud.py in /opt/nforge/bin/
banned.conf: this file is not required, processes listed one per line are
killed immediately
e.g. minecraft
java
superduperircbouncer
nfcpud.conf: this file is required, all listed variables must be defined
defines user and group to run as, must be able to kill any
process
e.g. user = root
group = root
defines two criteria for killing processes, hard limit and soft
limit
processes exceeding the "hard limit" are killed immediately
e.g. hard_limit = 60
this would immediately kill processes exceeding 60% cpu
usage
processes exceeding the "soft limit" X times within Y checks are
killed, allows processes to burst between the soft and hard limits
e.g. soft_limit = 40
soft_limit_count = 6
soft_limit_checks = 24
this would kill processes that burst over 40% cpu usage 6
times within 120 seconds (5 secs x 24 checks); the hard
limit still applies
"""
from collections import namedtuple
from daemon import DaemonContext
from glob import glob
from grp import getgrnam
from lockfile import FileLock
from os import kill
from os.path import isfile
from pwd import getpwnam
from re import match
from signal import SIGKILL
from sys import argv, stderr, stdout
from textwrap import TextWrapper
from time import sleep, strftime
from traceback import format_exc
class Logger(object):
    """Append-only, timestamped, line-wrapped text logger.

    Messages are accumulated in ``self.buffer`` and appended to the log
    file by ``flush()``. The file is opened and closed on every flush, so
    no file handle is held between writes.
    """

    def __init__(self, file):
        """Remember the log path and make sure the file can be opened.

        file: path of the log file; it is created if it does not exist.
        Raises whatever ``open`` raises when the path cannot be opened for
        appending.
        """
        self.buffer = ''
        self.file = file

        # Open once up front so an unwritable path fails at construction
        # time rather than on the first flush.
        with open(self.file, 'a+'):
            pass

    def __del__(self):
        # Write out anything still buffered when the logger is collected.
        self.flush()

    def flush(self):
        """Append the buffered text to the log file and clear the buffer."""
        if not self.buffer:
            return

        with open(self.file, 'a+') as handle:
            handle.write(self.buffer)

        self.buffer = ''

    def write(self, message, flush=True):
        """Queue ``message`` for logging, one timestamped entry per line.

        message: a string (split on line breaks) or an iterable of lines.
        flush:   when true (the default) the buffer is written to disk
                 immediately; otherwise it stays in ``self.buffer`` until
                 the next flush.

        Every line is prefixed with the current ``MM/DD HH:MM:SS`` time and
        wrapped at 100 columns; continuation lines are indented.
        """
        stamp = strftime('%m/%d %H:%M:%S')
        wrapper = TextWrapper(subsequent_indent=' ', width=100)

        # Anything string-like is split into lines. Testing for the method
        # instead of ``isinstance(message, str)`` keeps Python 2 ``unicode``
        # text from being iterated (and logged) one character at a time.
        if hasattr(message, 'splitlines'):
            message = message.splitlines()

        wrapped = []
        for line in message:
            wrapped.extend(wrapper.wrap('%s %s' % (stamp, line)))

        # Single join instead of growing the buffer once per output line.
        self.buffer += ''.join('%s\n' % text for text in wrapped)

        if flush:
            self.flush()
class nfcpud(object):
    """Daemon that kills abusive container processes on an OpenVZ node.

    Each pass of ``run()`` scans /proc for container processes, kills the
    ones named in the banned list, measures per-process cpu usage over a
    sampling period, and then applies the hard and soft cpu limits taken
    from the config file.
    """

    CONFIG_FILE = '/opt/nforge/etc/nfcpud/nfcpud.conf'
    BANNED_FILE = '/opt/nforge/etc/nfcpud/banned.conf'
    LOCK_FILE = '/opt/nforge/run/nfcpud.lock'

    def __init__(self):
        """Load the config and prepare the daemon context.

        Exits with status 1 when the lock file already exists.
        """
        if isfile(self.LOCK_FILE):
            print('existing lock file found')
            # SystemExit directly: the ``exit`` builtin is only installed
            # by the ``site`` module.
            raise SystemExit(1)

        self.load_config(self.CONFIG_FILE)

        self.context = DaemonContext(working_directory='/opt/nforge/',
                                     umask=0o002,
                                     pidfile=FileLock('/opt/nforge/run/nfcpud'),
                                     uid=getpwnam(self.config.user).pw_uid,
                                     gid=getgrnam(self.config.group).gr_gid,
                                     stdout=stdout)

    def load_config(self, config_file):
        """Parse ``name = value`` lines into the ``self.config`` namedtuple.

        Blank lines and lines starting with ``#`` are ignored. Values made
        only of digits become ``int``, ``digits.digits`` becomes ``float``,
        everything else stays a string.

        Raises ValueError for a line that has no ``=``.
        """
        names = []
        values = []

        with open(config_file) as handle:
            for lineno, raw in enumerate(handle, 1):
                line = raw.strip()
                if not line or line.startswith('#'):
                    continue

                # Split on the first '=' only, so a value may itself
                # contain '='.
                name, sep, value = line.partition('=')
                if not sep:
                    raise ValueError('%s:%d: expected "name = value", got %r'
                                     % (config_file, lineno, line))

                name = name.strip()
                value = value.strip()

                if match(r'^[0-9]+$', value):
                    value = int(value)
                elif match(r'^[0-9]+\.[0-9]+$', value):
                    value = float(value)

                names.append(name)
                values.append(value)

        self.config = namedtuple('Config', names)._make(values)

    def banned_kill(self, banned_file=None):
        """SIGKILL every tracked process whose name is in the banned list.

        banned_file: file with one program name per line; defaults to
                     ``BANNED_FILE``.

        Killed processes are moved from ``self.procs`` to
        ``self.ban_killed``. A process that cannot be signalled is left in
        ``self.procs``.
        """
        if banned_file is None:
            banned_file = self.BANNED_FILE

        with open(banned_file) as handle:
            banned = set(line.strip() for line in handle)
        # A blank line must not ban a process whose name came out empty.
        banned.discard('')

        # Iterate over a snapshot: entries are deleted inside the loop.
        for pid, info in list(self.procs.items()):
            if info['prog'] not in banned:
                continue
            try:
                kill(pid, SIGKILL)
            except OSError:
                # Already gone, or not permitted; nothing to record.
                continue
            self.ban_killed[pid] = info
            del self.procs[pid]

    def get_procs(self):
        """Fill ``self.procs`` with the container processes found in /proc.

        Each entry maps pid -> {'prog', 'ctid', 'cmd'}. Processes whose
        container id is 0 are skipped, as are processes that cannot be read
        or parsed.
        """
        for proc_dir in glob('/proc/[0-9]*'):
            try:
                with open(proc_dir + '/status') as handle:
                    status = [line.split() for line in handle]

                # NOTE(review): the parser is positional and only accepts a
                # 46-line status file. Lines 0, 3 and 11 are presumably
                # Name, Pid and envID on the OpenVZ kernel this was written
                # for -- confirm against the target kernel.
                if len(status) != 46:
                    continue

                ctid = int(status[11][1])
                if ctid == 0:
                    continue

                with open(proc_dir + '/cmdline') as handle:
                    cmd = handle.read().replace('\x00', ' ')

                self.procs[int(status[3][1])] = {'prog': status[0][1],
                                                 'ctid': ctid,
                                                 'cmd': cmd}
            except (EnvironmentError, IndexError, ValueError):
                # The process exited mid-read or has an unexpected layout.
                continue

    @staticmethod
    def _parse_cpu_line(line):
        """Return (total, idle) jiffies from the first line of /proc/stat."""
        fields = [float(value) for value in line.split()[1:]]
        # Columns after the "cpu" label are user, nice, system, idle, ...
        # so idle is index 3 (index 2 is system time).
        return (sum(fields), fields[3])

    @staticmethod
    def _parse_stat_jiffies(text):
        """Return utime + stime from the contents of /proc/<pid>/stat."""
        # The command name is parenthesised and may contain spaces, so the
        # fixed-position fields are counted from the last ')'. After it,
        # index 0 is the state (field 3), which puts utime (field 14) and
        # stime (field 15) at indexes 11 and 12.
        fields = text.rsplit(')', 1)[1].split()
        return float(fields[11]) + float(fields[12])

    def get_sys_jiff(self):
        """Return (total, idle) jiffies summed over all cpus."""
        with open('/proc/stat') as handle:
            return self._parse_cpu_line(handle.readline())

    def get_proc_jiff(self):
        """Return {pid: user + system jiffies} for the tracked processes.

        Processes that cannot be read or parsed are left out of the result.
        """
        jiffs = {}

        for pid in self.procs:
            try:
                with open('/proc/%s/stat' % pid) as handle:
                    jiffs[pid] = self._parse_stat_jiffies(handle.read())
            except (EnvironmentError, IndexError, ValueError):
                continue

        return jiffs

    def get_stats(self, period=5):
        """Measure cpu usage of every tracked process over ``period`` seconds.

        Sets ``info['cpu']`` (percent of total cpu time, one decimal) on
        each entry of ``self.procs`` and ``self.sys_cpu`` for the whole
        node. Processes that could not be sampled at both ends of the
        period are removed from ``self.procs``.
        """
        total_before, idle_before = self.get_sys_jiff()
        procs_before = self.get_proc_jiff()

        sleep(period)

        total_after, idle_after = self.get_sys_jiff()
        procs_after = self.get_proc_jiff()

        total = total_after - total_before
        idle = idle_after - idle_before

        # Iterate over a snapshot: entries are deleted inside the loop.
        for pid in list(self.procs):
            if pid not in procs_before or pid not in procs_after:
                del self.procs[pid]
            elif total > 0:
                used = procs_after[pid] - procs_before[pid]
                self.procs[pid]['cpu'] = round(used / total * 100.0, 1)
            else:
                # No jiffies elapsed, so there is nothing to attribute.
                self.procs[pid]['cpu'] = 0.0

        if total > 0:
            self.sys_cpu = round(100.0 - (idle / total * 100.0), 1)
        else:
            self.sys_cpu = 0.0

    def hard_limit_kill(self):
        """SIGKILL every tracked process above ``config.hard_limit`` % cpu.

        Killed processes are moved from ``self.procs`` to
        ``self.hard_limit_killed``.
        """
        # Iterate over a snapshot: entries are deleted inside the loop.
        for pid, info in list(self.procs.items()):
            if info['cpu'] <= self.config.hard_limit:
                continue
            try:
                kill(pid, SIGKILL)
            except OSError:
                continue
            self.hard_limit_killed[pid] = info
            del self.procs[pid]

    def soft_limit_kill(self):
        """Kill processes that exceeded the soft limit too often.

        A process is killed once it has been measured above
        ``config.soft_limit`` ``config.soft_limit_count`` times within the
        last ``config.soft_limit_checks`` checks, the current one included.
        Killed processes are moved from ``self.procs`` to
        ``self.soft_limit_killed`` with their offending samples stored
        under 'entries'.
        """
        # Oldest run id that is still inside the window.
        oldest_valid = self.run_id - self.config.soft_limit_checks + 1

        for pid, info in self.procs.items():
            if info['cpu'] > self.config.soft_limit:
                self.history.setdefault(pid, []).append([self.run_id, info['cpu']])

        # Iterate over a snapshot: entries are deleted inside the loop.
        for pid in list(self.history):
            # A pid that is no longer tracked has exited or was killed
            # earlier in this pass. Dropping its history means a reused pid
            # never inherits it and an untracked pid is never signalled.
            if pid not in self.procs:
                del self.history[pid]
                continue

            entries = [entry for entry in self.history[pid]
                       if entry[0] >= oldest_valid]
            if not entries:
                del self.history[pid]
                continue
            self.history[pid] = entries

            if len(entries) < self.config.soft_limit_count:
                continue

            try:
                kill(pid, SIGKILL)
            except OSError:
                continue

            info = self.procs.pop(pid)
            info['entries'] = entries
            self.soft_limit_killed[pid] = info
            del self.history[pid]

    def run(self):
        """Run checks forever, logging every kill to the run log."""
        self.run_log = Logger('/opt/nforge/log/nfcpud.log')
        self.debug_log = Logger('/opt/nforge/log/nfcpud_debug.log')
        self.history = {}
        self.run_id = 1

        while True:
            self.ban_killed = {}
            self.hard_limit_killed = {}
            self.soft_limit_killed = {}
            self.procs = {}

            self.run_log.write('starting check %s' % self.run_id)

            self.get_procs()
            self.run_log.write('found %s processes' % len(self.procs))

            # The banned list is optional.
            if isfile(self.BANNED_FILE):
                self.banned_kill()
                for info in self.ban_killed.values():
                    self.run_log.write("killed banned process '%s' in container %s"
                                       % (info['prog'], info['ctid']))

            # Sleeps for the sampling period; this paces the loop.
            self.get_stats()

            self.hard_limit_kill()
            for info in self.hard_limit_killed.values():
                self.run_log.write("hard killed process '%s' in container %s at %s%% cpu"
                                   % (info['prog'], info['ctid'], info['cpu']))

            self.soft_limit_kill()
            for info in self.soft_limit_killed.values():
                self.run_log.write("soft killed process '%s' in container %s"
                                   % (info['prog'], info['ctid']))

            self.run_log.write('checks complete')
            self.run_id += 1
if __name__ == '__main__':
    # Script entry point: build the daemon, detach, and run until killed.
    try:
        daemon = nfcpud()

        with daemon.context:
            daemon.run()
    except Exception:
        # Only real errors are reported here. A bare ``except`` would also
        # catch the SystemExit(1) raised for an existing lock file and turn
        # it into a traceback with exit status 254.
        print(format_exc())
        raise SystemExit(254)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment