Last active
September 22, 2024 20:06
-
-
Save bdowling/48470325d08b216db5d4b6972695dd6b to your computer and use it in GitHub Desktop.
This script started out briefly, but turned into an actual generally useful tool for inspecting cgroup hierarchy. Particularly useful for seeing the effect of Kubernetes CPU Requests and Limits and throttling that is occurring.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
# Script to help find cgroup cpu throttling in a k8s cluster | |
# | |
# Brian Dowling | |
# | |
import subprocess | |
import argparse | |
import textwrap | |
from warnings import warn | |
import json | |
import sys | |
import os | |
import re | |
from collections import defaultdict | |
class Range(object):
    """Closed interval ``[start, end]`` usable as an argparse ``choices`` value.

    argparse tests ``value in choices`` and iterates ``choices`` when it
    builds help and error text, so the class supports membership tests and
    iterates as a single item (itself) whose repr is ``[start,end]``.
    """

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __contains__(self, item):
        """Return True when ``start <= item <= end``.

        Values that cannot be ordered against the bounds (a string, another
        Range, ...) are simply reported as not contained instead of raising
        TypeError.
        """
        try:
            return self.start <= item <= self.end
        except TypeError:
            return False

    def __eq__(self, other):
        """Compare equal to any value inside the interval.

        Two Range objects are equal when their bounds match; every other
        operand is treated as a membership test, which keeps the original
        ``Range(0, 10) == 5`` behaviour.
        """
        if isinstance(other, Range):
            return self.start == other.start and self.end == other.end
        return other in self

    # Defining __eq__ already makes instances unhashable; say so explicitly.
    __hash__ = None

    def __iter__(self):
        # Yield the range itself so it is rendered as one "choice".
        yield self

    def __repr__(self):
        return '[{0},{1}]'.format(self.start, self.end)
def run_command(command):
    """Run *command* and lazily yield each line of its standard output.

    Args:
        command: argument list handed to ``subprocess.Popen``.

    Yields:
        Each output line decoded as UTF-8 with trailing whitespace removed.
    """
    # The context manager closes the pipe and waits for the child when the
    # generator finishes or is closed, so no process or descriptor is left
    # behind.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        for raw_line in proc.stdout:
            yield raw_line.rstrip().decode('UTF-8')
def calculate_percent(f):
    """Return ``nr_throttled / nr_periods`` from a cgroup's cpu.stat as a percentage.

    Args:
        f: a cgroup directory, or any file inside it; ``filepath`` resolves
            it to the sibling ``cpu.stat`` file.

    Returns:
        The percentage as a float, or 0.0 when either counter is zero or the
        file cannot be read or parsed (a warning is issued in that case).
    """
    stat_path = filepath(f, 'cpu.stat')
    try:
        periods = 0
        throttled = 0
        with open(stat_path, 'r') as stat_file:
            for entry in stat_file:
                entry = entry.rstrip()
                # Each line is "<name> <value>"; keep the text after the name.
                if entry.startswith('nr_periods'):
                    periods = float(entry[entry.find(' '):])
                if entry.startswith('nr_throttled'):
                    throttled = float(entry[entry.find(' '):])
        if periods == 0 or throttled == 0:
            return 0.0
        return throttled / periods * 100.0
    except Exception as e:
        warn(e)
        return 0.0
def cgroup_cpu_limit(f):
    """Read the CPU shares and quota configured for a cgroup.

    Args:
        f: a cgroup directory, or any file inside it.

    Returns:
        ``(shares, quota)`` where shares is the integer from ``cpu.shares``
        and quota is ``cpu.cfs_quota_us`` divided by 1000. A negative quota
        is returned as exactly -1. ``(0, 0)`` is returned, after a warning,
        when either value cannot be read or parsed.
    """
    try:
        shares_path = filepath(f, 'cpu.shares')
        quota_path = filepath(f, 'cpu.cfs_quota_us')
        quota_ms = int(firstline(quota_path)) / 1000
        # A stored quota of -1 would scale to a tiny negative fraction;
        # report a plain -1 instead.
        if quota_ms < 0:
            quota_ms = -1
        return int(firstline(shares_path)), quota_ms
    except Exception as e:
        warn(e)
        return 0, 0
def to_hier(f):
    """Render a cgroup and all of its ancestors as an indented listing.

    Args:
        f: a cgroup directory, or a file inside it (its directory is used).

    Returns:
        One ``format_limits`` line per path component, each level indented
        two more spaces than its parent, joined with newlines.
    """
    if os.path.isfile(f):
        f = os.path.dirname(f)
    # Path of the cgroup relative to the controller mount.
    relative = f.replace(r'/sys/fs/cgroup/', '').replace(r'cpu,cpuacct/', '')
    # What is left once the relative part is removed is the starting directory.
    current = f.replace(relative, '')
    rows = []
    for depth, child in enumerate(relative.split('/')):
        current = os.path.join(current, child)
        percent = calculate_percent(current)
        shares, quota = cgroup_cpu_limit(current)
        rows.append(' ' * (2 * depth) + format_limits(child, shares, quota, percent))
    return '\n'.join(rows).rstrip()
def collect_containers():
    """Fill ``containers_by_cgroup`` from ``docker ps`` / ``docker inspect``.

    Each output line holds four space-separated fields: cgroup parent,
    12-character container id, k8s namespace label and k8s pod name label.
    Lines failing the --docker, --namespace or --podname regex (read from
    the global ``args``) are skipped; the rest are appended, without the
    cgroup parent, to the list stored under that cgroup parent.
    """
    getcontainers = """docker ps -q |xargs --no-run-if-empty -n1 -- docker inspect --format='{{ .HostConfig.CgroupParent }} {{ printf "%.12s" .Id}} {{index .Config.Labels "io.kubernetes.pod.namespace" }} {{index .Config.Labels "io.kubernetes.pod.name" }}'"""
    # (pattern, index of the field it applies to), checked in this order.
    filters = ((args.docker, 1), (args.namespace, 2), (args.podname, 3))
    for line in run_command(['sh', '-c', getcontainers]):
        fields = line.split(' ')
        rejected = any(
            pattern and not re.match(pattern, fields[index])
            for pattern, index in filters
        )
        if rejected:
            continue
        containers_by_cgroup[fields[0]].append(fields[1:])
def cgroup_parent(f):
    """Return the second-to-last ``/``-separated component of *f*.

    Args:
        f: a cgroup path string such as ``a.slice/b.slice/docker-1234``.

    Returns:
        The parent component (``b.slice`` in the example), or *f* unchanged
        when it has no ``/`` and therefore no parent component.
    """
    parts = f.split('/')
    # An explicit length check replaces the former bare ``except``, which
    # also swallowed KeyboardInterrupt and SystemExit.
    if len(parts) < 2:
        return f
    return parts[-2]
def readfile(f):
    """Return the lines of file *f* with trailing whitespace stripped.

    Any error while opening or reading is reported with ``warn`` and an
    empty list is returned instead.
    """
    try:
        stripped = []
        with open(f, 'r') as handle:
            for raw in handle:
                stripped.append(raw.rstrip())
        return stripped
    except Exception as e:
        warn(e)
        return []
def cgroup_cpuacct_pid(pid):
    """Locate the cpuacct cgroup directory of process *pid*.

    Scans ``/proc/<pid>/cgroup`` for the first line whose controller list is
    ``cpuacct...`` or ``cpu,cpuacct...`` and maps it to a path below
    ``/sys/fs/cgroup``.

    Args:
        pid: process id (anything ``str()`` turns into the /proc entry name).

    Returns:
        A one-element list holding the cgroup directory path.

    Raises:
        SystemExit: with status 1, after a warning, when no line matches.
    """
    cgroups = readfile(os.path.join('/proc', str(pid), 'cgroup'))
    for line in cgroups:
        # Accept both orderings of the combined controller name; the former
        # pattern required the list to start with "cpuacct" and so missed
        # lines written as "cpu,cpuacct".
        m = re.match(r'\d+:((?:cpu,)?cpuacct[^:]*):(.*)$', line)
        if m:
            controllers = m.group(1)
            if controllers == 'cpuacct,cpu':
                # /proc lists it as "cpuacct,cpu" here, but the path is built
                # with the "cpu,cpuacct" directory name.
                controllers = 'cpu,cpuacct'
            return ['/sys/fs/cgroup/{}{}'.format(controllers, m.group(2))]
    warn("No match for cpuacct in cgroups")
    sys.exit(1)
def firstline(f, nonulls=True):
    """Return the first line of file *f*, or '' when it has none.

    Args:
        f: path of the file to read (via ``readfile``).
        nonulls: when true, every NUL character in the line is replaced by
            a space.

    Returns:
        The right-stripped first line; '' for an empty or unreadable file.
    """
    try:
        lines = readfile(f)
        line = lines[0].rstrip() if lines else ''
        if nonulls:
            line = line.replace('\000', ' ')
        return line
    except Exception as e:
        warn(e)
        return ''
def get_cgroup_procs(f, short=False):
    """List the processes attached directly to a cgroup.

    Args:
        f: a cgroup directory, or any file inside it; its ``cgroup.procs``
            file supplies the pids.
        short: when true, keep only the text before the first space of each
            command line.

    Returns:
        A list of ``[pid, process]`` pairs, pid being the string read from
        ``cgroup.procs``. ``process`` is the first line of
        ``/proc/<pid>/cmdline`` or, when that is empty, the contents of
        ``/proc/<pid>/comm`` wrapped in square brackets.
    """
    procs_path = filepath(f, 'cgroup.procs')
    procs = []
    for pid in readfile(procs_path):
        process = firstline('/proc/{}/cmdline'.format(pid))
        if process == '':
            process = '[' + firstline('/proc/{}/comm'.format(pid)) + ']'
        if short:
            # split() keeps the whole string when it contains no space; the
            # former find()-based slice used -1 in that case and dropped the
            # final character (e.g. the closing ']').
            process = process.split(' ', 1)[0]
        procs.append([pid, process])
    return procs
def filepath(f, filename):
    """Return the path of *filename* located in the directory denoted by *f*.

    When *f* is an existing directory it is used as is; otherwise *f* is
    treated as a file path and its dirname is used.
    """
    base = f if os.path.isdir(f) else os.path.dirname(f)
    return os.path.join(base, filename)
def format_limits(cgroup, shares, quota, throttled):
    """Format one report line for a cgroup.

    Args:
        cgroup: name to show at the start of the line.
        shares: CPU shares value.
        quota: CPU quota value; a negative value means the quota and
            throttling parts are left out.
        throttled: throttling percentage, printed with two significant digits.

    Returns:
        ``"<cgroup>: <shares> shares"`` for a negative quota, otherwise the
        same text followed by the limit and the throttling percentage.
    """
    if quota < 0:
        return '{0}: {1} shares'.format(cgroup, shares)
    return '{0}: {1} shares, limit: {2}ms, throttled {3:.2g}%'.format(
        cgroup, shares, quota, throttled)
# ------------------------------------------------------------
# Module-level state and command-line definition.

# Maps a container's cgroup parent (first field of the docker inspect output)
# to the list of remaining fields collected for each container under it.
containers_by_cgroup = defaultdict(list)

# NOTE(review): the period/limit arithmetic described in the epilog should be
# checked against the kernel's CFS bandwidth defaults - TODO confirm.
parser = argparse.ArgumentParser(
    prog='cgroup-cpu-throttling',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=textwrap.dedent('''\
        This script is useful to rout out cpu throttling or just inspect the cgroup
        hierarchy on the system.
        Note: "limits" displayed include cpu_shares/(cpu_quota_us/1000)ms
        If no cpu quota is set in a given cgroup level, only 'X shares' is displayed.
        As long as the default period remains 1000/ms (1s), then
        cpu_quota is k8s cpu limit x1000 or when using 'Nm' units it is just N.
        e.g. CPU Limit of 2 is 2000ms or CPU limit of 500m is 500ms
        Shares is cpu requests x1024, '2' in k8s is 2048, 500m is 512.
        It is important to note that a really small k8s cpu requests
        (translates to cpu_shares) can also impact the performance of your
        k8s pods, because these shares are utilmiately a % of the
        alloction to all containers.
        ex:
        cgroup-cpu-throttling -t 10 --brief -s --nosep | column -t
        cgroup-cpu-throttling -t 10 --hier --namespace infra
        '''))

# Minimum throttling percentage a cgroup must reach to be reported.
parser.add_argument(
    '-t', '--threshold',
    help='min %% threshold to print',
    type=float,
    default=2,
    choices=Range(0.0,100),
)

# Boolean switches, registered in the order they appear in --help.
for flag_names, flag_help in (
    (('--hier',), 'Print the output in a hierarchy'),
    (('-c', '--commands'), 'Print commands (processes) instead of containers ids when available'),
    (('-b', '--both'), 'Print both commands and container info when available'),
    (('-s', '--short'), 'Print short pid names (command only) in pid mode'),
    (('--brief',), 'Print a brief, grepable format that just includes the container or commands with limits'),
    (('--nosep',), 'In "brief" mode, don\'t print the cgroup separator (useful for piping | column -t)'),
):
    parser.add_argument(*flag_names, help=flag_help, action='store_true')

# Filters that narrow the output; each takes one value of the given type.
for option_names, option_help, option_type in (
    (('-g', '--cgroup'), 'Limit output to cgroups that match (regex)', str),
    (('-p', '--pid'), 'Limit output to the given process id', int),
    (('-d', '--docker'), 'Limit output to the given docker id (regex)', str),
    (('-n', '--namespace'), 'Limit output to the given k8s namespace (regex)', str),
    (('--podname',), 'Limit output to the given k8s podname (regex)', str),
):
    parser.add_argument(*option_names, help=option_help, type=option_type)
# ------------------------------------------------------------
args = parser.parse_args()

# Container details are needed when any container filter is active, or when
# container info will be printed (--both, or --commands not given).
if args.namespace or args.podname or args.docker or (args.both or not args.commands):
    collect_containers()

# Remembers which one-time headers have been printed already.
printed = defaultdict(lambda: False)

# Work out which cgroup paths to inspect.
cgroups = []
if args.pid:
    cgroups = cgroup_cpuacct_pid(args.pid)
elif args.podname or args.namespace or args.docker:
    # collect_containers retrieved only matching namespaces.
    # Build the find(1) argv directly rather than a shell string, so the
    # -path patterns reach find untouched by shell globbing or word
    # splitting.
    find_command = ['find', '/sys/fs/cgroup']
    for position, cgroup_name in enumerate(containers_by_cgroup.keys()):
        if position:
            find_command.append('-o')
        find_command += ['-path', '*/' + cgroup_name + '*/cpu.stat']
    if len(find_command) == 2:
        # No container survived the filters, so there is nothing to search for.
        print("No matches were found")
        sys.exit(1)
    cgroups = list(run_command(find_command))
else:
    cgroups = run_command(['find', '/sys/fs/cgroup', '-name', 'cpu.stat'])
# Main report loop: for every discovered cgroup that passes the --cgroup and
# --threshold filters, print its limits followed by the containers and/or
# processes attached to it.
# NOTE(review): nesting below was reconstructed from a copy whose indentation
# had been stripped; confirm block boundaries against the original script.
for f in cgroups:
    # print("IN: " + f)
    # Reduce the path to a display name: drop the /sys/fs/cgroup/ prefix, the
    # cpu,cpuacct/ directory, the trailing /cpu.stat and any '.scope' text.
    cgroup = f.replace(r'/sys/fs/cgroup/', '')
    cgroup = cgroup.replace(r'cpu,cpuacct/', '')
    cgroup = cgroup.replace(r'/cpu.stat', '').replace(r'.scope', '')
    if args.cgroup and not re.search(args.cgroup, cgroup):
        continue
    # Skip cgroups throttled less than the requested threshold.
    percent = calculate_percent(f)
    if percent < args.threshold:
        continue
    # "parent" is the key used to look up containers in containers_by_cgroup:
    # the last path component for *.slice names, the second-to-last otherwise.
    if cgroup.endswith('.slice'):
        try:
            parent = cgroup.split('/')[-1]
        except IndexError:
            # str.split always returns at least one element, so this handler
            # is not expected to run.
            parent = cgroup
    else:  # .scope
        parent = cgroup_parent(cgroup)
    shares, quota = cgroup_cpu_limit(f)
    if not args.brief:
        if (args.hier):
            # hier = hier.replace(r'kubepods.slice/', '') # just to remove some spacing
            hier = to_hier(f)
            print(hier)
        else:
            print(format_limits(cgroup, shares, quota, percent))
    else:
        # The three limit columns that prefix every row in brief mode.
        brief_limits = [shares, '{:g}ms'.format(quota), '{:.2g}%'.format(percent)]
    # Containers are looked up unless only --commands was requested.
    if ((not args.commands) or args.both) and parent in containers_by_cgroup:
        containers = containers_by_cgroup[parent]
    else:
        containers = []
    # Processes are read when no container matched, or when explicitly asked for.
    if len(containers) == 0 or args.commands or args.both:
        procs = get_cgroup_procs(f, short=args.short)
    else:
        procs = []
    if args.brief and len(containers + procs) > 0 and not args.nosep:
        print("----\t" + parent + ":")
    if args.brief:
        # Build the column format for process rows. Widths come from the
        # headings and the rows, capped at 20; for data rows the first three
        # heading cells stand in for the limit columns while measuring.
        pids_headings = [['Shares', 'Limit ', 'Throttling', 'ID', 'Process']]
        width = defaultdict(int)
        max_width = defaultdict(lambda:20)
        for row in pids_headings + procs:
            if row[0] != 'Shares':  # stuff record rows with limits
                row = pids_headings[0][0:3] + row
            for i, col in enumerate(row):
                width[i] = min(max(len(col), width[i]), max_width[i])
        num_columns = len(width)
        brief_pids_fmt = ' '.join('{{:<{}}}'.format(width[i]) for i in range(num_columns))
        # With --nosep the process header is printed once for the whole run.
        if args.nosep and not printed['pids_header'] and len(procs):
            print(brief_pids_fmt.format(*pids_headings[0]))
            printed['pids_header'] = True
        # Same computation for container rows.
        cont_headings = [['Shares', 'Limit ', 'Throttling', 'ID', 'Namespace', 'Container']]
        width = defaultdict(int)
        max_width = defaultdict(lambda:20)
        for row in cont_headings + containers:
            if row[0] != 'Shares':  # stuff record rows with limits
                row = cont_headings[0][0:3] + row
            for i, col in enumerate(row):
                width[i] = min(max(len(col), width[i]), max_width[i])
        num_columns = len(width)
        brief_cont_fmt = ' '.join('{{:<{}}}'.format(width[i]) for i in range(num_columns))
        # With --nosep the container header is printed once for the whole run.
        if args.nosep and not printed['cont_header'] and len(containers):
            print(brief_cont_fmt.format(*cont_headings[0]))
            printed['cont_header'] = True
    # count is incremented per container below but is not read in this loop.
    count=0
    # Container section.
    if ((not args.commands) or args.both) and parent in containers_by_cgroup:
        if not args.brief:
            print('\t Affected containers ns/pods:')
        else:
            # Without --nosep the header is repeated for every cgroup.
            if not args.nosep:
                print(brief_cont_fmt.format(*cont_headings[0]))
        for container in containers:
            if args.brief:
                print(brief_cont_fmt.format(*brief_limits, *container))
            else:
                print('\t\t{} {}/{}'.format(*container))
            count += 1
    # Process section.
    if (len(containers) == 0 and len(procs)) or args.commands or args.both:
        if len(procs) > 0:
            if not args.brief:
                print('\t Affected Processes:')
            else:
                # Without --nosep the header is repeated for every cgroup.
                if not args.nosep:
                    print(brief_pids_fmt.format(*pids_headings[0]))
            for proc in procs:
                if args.brief:
                    print(brief_pids_fmt.format(*brief_limits, *proc))
                else:
                    print('\t\t{} {}'.format(*proc))
        # NOTE(review): this elif is assumed to pair with `if len(procs) > 0`
        # above - confirm against the original indentation.
        elif not args.brief:
            print("\t Affected Processes: None at this level of the cgroup hierarchy.")
fwiw, systemd-cgls
and systemd-cgtop
are useful tools for looking at cgroup hierarchy as well as the libcgroup-tools
suite...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you've found this script, you're likely on a journey troubleshooting k8s resource limits and how they can have a dramatic effect on your workloads if you don't get them right. CPU limits are a silent killer, as there is currently not much in the ecosystem that exposes when these limits are encountered; your pods just get CPU squelched. While you can use Prometheus, sysdig, datadog and other tools to pull out this metadata, I thought a quick and dirty script to get the job done also helps. So if you find this useful, great.
Some links related to this topic and script...
https://home.robusta.dev/blog/stop-using-cpu-limits
https://medium.com/@betz.mark/understanding-resource-limits-in-kubernetes-cpu-time-9eff74d3161b
https://medium.com/omio-engineering/cpu-limits-and-aggressive-throttling-in-kubernetes-c5b20bd8a718
https://danluu.com/cgroup-throttling/
https://erickhun.com/posts/kubernetes-faster-services-no-cpu-limits/