Skip to content

Instantly share code, notes, and snippets.

@bdowling
Last active February 10, 2023 13:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bdowling/48470325d08b216db5d4b6972695dd6b to your computer and use it in GitHub Desktop.
Save bdowling/48470325d08b216db5d4b6972695dd6b to your computer and use it in GitHub Desktop.
This script started out as a quick hack, but turned into a generally useful tool for inspecting the cgroup hierarchy. It is particularly useful for seeing the effect of Kubernetes CPU Requests and Limits and any throttling that is occurring.
#!/usr/bin/env python3
#
# Script to help find cgroup cpu throttling in a k8s cluster
#
# Brian Dowling
#
import subprocess
import argparse
import textwrap
from warnings import warn
import json
import sys
import os
import re
from collections import defaultdict
class Range(object):
    """Closed interval ``[start, end]`` that can be passed as argparse ``choices``.

    Membership (``x in rng``) and equality against a plain value (``rng == x``)
    both mean "``start <= x <= end``".  Iterating yields the range itself, so
    code that joins the choices for display shows a single ``[start,end]``
    entry instead of enumerating values.
    """

    # __eq__ is overloaded to mean "falls inside the interval", so no hash can
    # be consistent with it; state explicitly that instances are unhashable.
    __hash__ = None

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        """Return True if *other* lies inside the interval (bounds included).

        Two ``Range`` objects are equal when their bounds match.  Operands that
        cannot be ordered against the bounds yield ``NotImplemented`` so that
        ``==`` evaluates to False instead of raising ``TypeError``.
        """
        if isinstance(other, Range):
            return (self.start, self.end) == (other.start, other.end)
        try:
            return self.start <= other <= self.end
        except TypeError:
            return NotImplemented

    def __contains__(self, item):
        """Return True if *item* lies inside the interval, False otherwise."""
        result = self.__eq__(item)
        # Never let the NotImplemented sentinel leak out as a truth value.
        return False if result is NotImplemented else bool(result)

    def __iter__(self):
        yield self

    def __repr__(self):
        return '[{0},{1}]'.format(self.start, self.end)
def run_command(command):
    """Run *command* and lazily yield each line of its standard output.

    *command* is an argument list handed straight to ``subprocess.Popen``.
    Every line is stripped of trailing whitespace and decoded as UTF-8.

    The child is managed by a ``with`` block so its stdout pipe is closed and
    the process is waited on once the output is exhausted (or the generator is
    closed early), instead of leaving an unreaped child behind.
    """
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        for raw in proc.stdout:
            yield str(raw.rstrip(), 'UTF-8')
def calculate_percent(f):
    """Return ``nr_throttled / nr_periods`` from ``cpu.stat`` as a percentage.

    *f* is a cgroup directory or a path inside one; the ``cpu.stat`` file next
    to it is read.  The result is 0.0 when either counter is zero.  Any error
    while reading or parsing is reported through ``warn`` and also gives 0.0.
    """
    stat_path = filepath(f, 'cpu.stat')
    try:
        periods = 0
        throttled = 0
        with open(stat_path, 'r') as stat_file:
            for entry in stat_file.readlines():
                entry = entry.rstrip()
                # The counter value is whatever follows the first space.
                if entry.startswith('nr_periods'):
                    periods = float(entry[entry.find(' '):])
                if entry.startswith('nr_throttled'):
                    throttled = float(entry[entry.find(' '):])
        if periods == 0 or throttled == 0:
            return 0.0
        return throttled / periods * 100.0
    except Exception as e:
        warn(e)
        return 0.0
def cgroup_cpu_limit(f):
    """Return ``(shares, quota)`` for the cgroup that *f* belongs to.

    ``shares`` is the integer in ``cpu.shares``.  ``quota`` is the value of
    ``cpu.cfs_quota_us`` divided by 1000; a negative quota is normalised to
    exactly ``-1``.  On any failure the error is passed to ``warn`` and
    ``(0, 0)`` is returned.
    """
    try:
        shares_path = filepath(f, 'cpu.shares')
        quota_path = filepath(f, 'cpu.cfs_quota_us')
        quota_us = int(firstline(quota_path))
        # Return a clean -1 rather than a small negative fraction.
        quota = -1 if quota_us < 0 else quota_us / 1000
        return int(firstline(shares_path)), quota
    except Exception as e:
        warn(e)
        return 0, 0
def to_hier(f):
    """Render the cgroup path of *f* as an indented tree, one level per line.

    *f* may be a cgroup directory or a file inside one.  Every path component
    below the cgroup mount is printed with its shares, quota and throttling
    figures, indented two more spaces than its parent.
    """
    if os.path.isfile(f):
        f = os.path.dirname(f)
    cgroup = f.replace(r'/sys/fs/cgroup/', '').replace(r'cpu,cpuacct/', '')
    # What remains after removing the relative cgroup path is the mount prefix.
    current = f.replace(cgroup, '')
    rows = []
    for depth, child in enumerate(cgroup.split('/')):
        current = os.path.join(current, child)
        percent = calculate_percent(current)
        shares, quota = cgroup_cpu_limit(current)
        rows.append(' ' * (2 * depth) + format_limits(child, shares, quota, percent))
    return "\n".join(rows).rstrip()
def collect_containers():
    """Fill ``containers_by_cgroup`` from the running docker containers.

    Runs ``docker ps`` / ``docker inspect`` through the shell; each output line
    carries the cgroup parent, the short container id, the k8s namespace label
    and the k8s pod name label, separated by single spaces.  Lines that fail
    any active ``--docker`` / ``--namespace`` / ``--podname`` regex are
    skipped; the rest are grouped under their cgroup parent.
    """
    getcontainers = """docker ps -q |xargs --no-run-if-empty -n1 -- docker inspect --format='{{ .HostConfig.CgroupParent }} {{ printf "%.12s" .Id}} {{index .Config.Labels "io.kubernetes.pod.namespace" }} {{index .Config.Labels "io.kubernetes.pod.name" }}'"""
    # (regex from the command line, index of the field it applies to)
    filters = ((args.docker, 1), (args.namespace, 2), (args.podname, 3))
    for line in run_command(['sh', '-c', getcontainers]):
        fields = line.split(' ')
        rejected = any(
            pattern and not re.match(pattern, fields[idx])
            for pattern, idx in filters
        )
        if rejected:
            continue
        containers_by_cgroup[fields[0]].append(fields[1:])
def cgroup_parent(f):
    """Return the second-to-last ``/``-separated component of *f*.

    When *f* has fewer than two components it is returned unchanged.  The
    length is checked explicitly instead of relying on a bare ``except:``,
    which would also have hidden unrelated errors.
    """
    parts = f.split('/')
    if len(parts) < 2:
        return f
    return parts[-2]
def readfile(f):
    """Return the lines of file *f* with trailing whitespace removed.

    If the file cannot be opened or read, the error is passed to ``warn`` and
    an empty list is returned.
    """
    try:
        with open(f, 'r') as handle:
            return list(map(str.rstrip, handle))
    except Exception as e:
        warn(e)
        return []
def cgroup_cpuacct_pid(pid):
    """Return a one-element list with the cpuacct cgroup directory of *pid*.

    Scans ``/proc/<pid>/cgroup`` for the entry whose controller list contains
    ``cpuacct`` and maps it to a path below ``/sys/fs/cgroup``.  The controller
    list is accepted in either order (``cpuacct,cpu`` or ``cpu,cpuacct``); the
    ``cpuacct,cpu`` spelling is rewritten to the ``cpu,cpuacct`` directory name.

    If no such entry exists a warning is emitted and the script exits with
    status 1.
    """
    cgroups = readfile(os.path.join('/proc', str(pid), 'cgroup'))
    for line in cgroups:
        # "(?:cpu,)?" also accepts lines that list the pair as "cpu,cpuacct";
        # without it only lists starting with "cpuacct" would match.
        m = re.match(r'\d+:((?:cpu,)?cpuacct[^:]*):(.*)$', line)
        if not m:
            continue
        controllers = m.group(1)
        if controllers == 'cpuacct,cpu':
            # The directory is named cpu,cpuacct even when the proc entry
            # lists the controllers the other way round.
            controllers = 'cpu,cpuacct'
        cpuacct = '/sys/fs/cgroup/{}{}'.format(controllers, m.group(2))
        break
    else:
        warn("No match for cpuacct in cgroups")
        sys.exit(1)
    return [cpuacct]
def firstline(f, nonulls=True):
    """Return the first line of file *f*, or ``''`` if there is none.

    Trailing whitespace is stripped.  With *nonulls* set (the default) every
    NUL character in the line is replaced by a space.  Unexpected errors are
    passed to ``warn`` and produce ``''``.
    """
    try:
        lines = readfile(f)
        if not lines:
            return ''
        first = lines[0].rstrip()
        return first.replace('\000', ' ') if nonulls else first
    except Exception as e:
        warn(e)
        return ''
def get_cgroup_procs(f, short=False):
    """Return ``[pid, command]`` pairs for the processes listed in a cgroup.

    *f* is a cgroup directory or a path inside one; pids are read from its
    ``cgroup.procs`` file.  The command is the first line of
    ``/proc/<pid>/cmdline``; when that is empty the ``comm`` name wrapped in
    square brackets is used instead.

    With *short* set, only the text before the first space is kept.  A command
    that contains no space is kept whole; slicing up to ``find(' ')`` would cut
    off its last character because ``find`` returns -1 in that case.
    """
    procs_path = filepath(f, 'cgroup.procs')
    procs = []
    for pid in readfile(procs_path):
        process = firstline('/proc/{}/cmdline'.format(pid))
        if process == '':
            process = '[' + firstline('/proc/{}/comm'.format(pid)) + ']'
        if short:
            process = process.split(' ', 1)[0]
        procs.append([pid, process])
    return procs
def filepath(f, filename):
    """Return the path of *filename* located alongside *f*.

    If *f* is an existing directory, *filename* is joined onto it; otherwise it
    is joined onto the directory part of *f*.
    """
    parent = f if os.path.isdir(f) else os.path.dirname(f)
    return os.path.join(parent, filename)
def format_limits(cgroup, shares, quota, throttled):
    """Return a one-line summary of a cgroup's cpu settings.

    A negative *quota* produces the shares-only form; otherwise the quota (in
    ms) and the *throttled* percentage, to two significant digits, are
    appended.
    """
    if quota < 0:
        return '{0}: {1} shares'.format(cgroup, shares)
    return '{0}: {1} shares, limit: {2}ms, throttled {3:.2g}%'.format(
        cgroup, shares, quota, throttled)
# ------------------------------------------------------------
# Maps a docker cgroup-parent string to the rows collected for it by
# collect_containers(); each row is the remaining fields of one inspect line
# (short container id, k8s namespace label, k8s pod name label).
containers_by_cgroup = defaultdict(list)
# Command-line interface.  RawDescriptionHelpFormatter is used so the epilog
# below is shown with its own line breaks.
# NOTE(review): the epilog's millisecond figures assume a 1s period; the code
# never reads cpu.cfs_period_us, so confirm that assumption on the target hosts.
parser=argparse.ArgumentParser(
prog='cgroup-cpu-throttling',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=textwrap.dedent('''\
This script is useful to rout out cpu throttling or just inspect the cgroup
hierarchy on the system.
Note: "limits" displayed include cpu_shares/(cpu_quota_us/1000)ms
If no cpu quota is set in a given cgroup level, only 'X shares' is displayed.
As long as the default period remains 1000/ms (1s), then
cpu_quota is k8s cpu limit x1000 or when using 'Nm' units it is just N.
e.g. CPU Limit of 2 is 2000ms or CPU limit of 500m is 500ms
Shares is cpu requests x1024, '2' in k8s is 2048, 500m is 512.
It is important to note that a really small k8s cpu requests
(translates to cpu_shares) can also impact the performance of your
k8s pods, because these shares are utilmiately a % of the
alloction to all containers.
ex:
cgroup-cpu-throttling -t 10 --brief -s --nosep | column -t
cgroup-cpu-throttling -t 10 --hier --namespace infra
'''))
# Range(0.0,100) lets argparse accept any float in the closed interval instead
# of a fixed list of values.
parser.add_argument('-t', '--threshold', help='min %% threshold to print', type=float, default=2, choices=Range(0.0,100))
# Output-shaping switches.
parser.add_argument('--hier', help='Print the output in a hierarchy', action='store_true')
parser.add_argument('-c', '--commands', help='Print commands (processes) instead of containers ids when available', action='store_true')
parser.add_argument('-b', '--both', help='Print both commands and container info when available', action='store_true')
parser.add_argument('-s', '--short', help='Print short pid names (command only) in pid mode', action='store_true')
parser.add_argument('--brief', help='Print a brief, grepable format that just includes the container or commands with limits', action='store_true')
parser.add_argument('--nosep', help='In "brief" mode, don\'t print the cgroup separator (useful for piping | column -t)', action='store_true')
# Filters that restrict which cgroups are reported.
parser.add_argument('-g', '--cgroup', help='Limit output to cgroups that match (regex)', type=str)
parser.add_argument('-p', '--pid', help='Limit output to the given process id', type=int)
parser.add_argument('-d', '--docker', help='Limit output to the given docker id (regex)', type=str)
parser.add_argument('-n', '--namespace', help='Limit output to the given k8s namespace (regex)', type=str)
parser.add_argument('--podname', help='Limit output to the given k8s podname (regex)', type=str)
# ------------------------------------------------------------
args = parser.parse_args()

# Container metadata is needed for the container filters and whenever
# containers (rather than only commands) are going to be printed.
if args.namespace or args.podname or args.docker or (args.both or not args.commands):
    collect_containers()

# Remembers which one-off column headers have already been printed.
printed = defaultdict(lambda: False)

# Decide which cgroup paths to inspect.  ``find`` is invoked with an argument
# list instead of through ``sh -c`` so the ``*`` patterns and the cgroup names
# reach it verbatim, without shell globbing, word splitting or quoting issues.
cgroups = []
if args.pid:
    cgroups = cgroup_cpuacct_pid(args.pid)
elif args.podname or args.namespace or args.docker:
    # collect_containers() kept only the containers matching the filters.
    if not containers_by_cgroup:
        print("No matches were found")
        sys.exit(1)
    find_args = []
    for cgroup_name in containers_by_cgroup:
        if find_args:
            find_args.append('-o')
        find_args += ['-path', '*/' + cgroup_name + '*/cpu.stat']
    cgroups = list(run_command(['find', '/sys/fs/cgroup'] + find_args))
else:
    cgroups = run_command(['find', '/sys/fs/cgroup', '-name', 'cpu.stat'])
# NOTE(review): this section was reconstructed from a copy of the script whose
# indentation had been lost.  The final ``elif not args.brief`` is taken to
# pair with the "are there any processes" test -- confirm against the original.

_PIDS_HEADINGS = ['Shares', 'Limit ', 'Throttling', 'ID', 'Process']
_CONT_HEADINGS = ['Shares', 'Limit ', 'Throttling', 'ID', 'Namespace', 'Container']


def _brief_row_format(headings, rows, max_width=20):
    """Return a ``str.format`` template with one left-aligned column per cell.

    Each column is as wide as its widest cell, capped at *max_width*.  Data
    rows only hold the trailing columns, so the first three headings stand in
    for the shares/limit/throttling cells while measuring.
    """
    widths = defaultdict(int)
    for row in [headings] + [headings[0:3] + row for row in rows]:
        for i, col in enumerate(row):
            widths[i] = min(max(len(col), widths[i]), max_width)
    return ' '.join('{{:<{}}}'.format(widths[i]) for i in range(len(widths)))


for f in cgroups:
    # Reduce the absolute path to a short cgroup name for matching/printing.
    cgroup = f.replace(r'/sys/fs/cgroup/', '')
    cgroup = cgroup.replace(r'cpu,cpuacct/', '')
    cgroup = cgroup.replace(r'/cpu.stat', '').replace(r'.scope', '')
    if args.cgroup and not re.search(args.cgroup, cgroup):
        continue

    percent = calculate_percent(f)
    if percent < args.threshold:
        continue

    # Key used to look the cgroup up in containers_by_cgroup.  str.split()
    # always returns at least one element, so [-1] cannot fail.
    if cgroup.endswith('.slice'):
        parent = cgroup.split('/')[-1]
    else:  # .scope
        parent = cgroup_parent(cgroup)

    shares, quota = cgroup_cpu_limit(f)
    if not args.brief:
        if args.hier:
            print(to_hier(f))
        else:
            print(format_limits(cgroup, shares, quota, percent))
    else:
        brief_limits = [shares, '{:g}ms'.format(quota), '{:.2g}%'.format(percent)]

    show_containers = ((not args.commands) or args.both) and parent in containers_by_cgroup
    containers = containers_by_cgroup[parent] if show_containers else []

    # Processes are only read when they may be printed.
    if len(containers) == 0 or args.commands or args.both:
        procs = get_cgroup_procs(f, short=args.short)
    else:
        procs = []

    if args.brief:
        if (containers or procs) and not args.nosep:
            print("----\t" + parent + ":")

        brief_pids_fmt = _brief_row_format(_PIDS_HEADINGS, procs)
        # With --nosep each header is printed once for the whole run.
        if args.nosep and not printed['pids_header'] and procs:
            print(brief_pids_fmt.format(*_PIDS_HEADINGS))
            printed['pids_header'] = True

        brief_cont_fmt = _brief_row_format(_CONT_HEADINGS, containers)
        if args.nosep and not printed['cont_header'] and containers:
            print(brief_cont_fmt.format(*_CONT_HEADINGS))
            printed['cont_header'] = True

    if show_containers:
        if not args.brief:
            print('\t Affected containers ns/pods:')
        elif not args.nosep:
            print(brief_cont_fmt.format(*_CONT_HEADINGS))
        for container in containers:
            if args.brief:
                print(brief_cont_fmt.format(*brief_limits, *container))
            else:
                print('\t\t{} {}/{}'.format(*container))

    if (len(containers) == 0 and procs) or args.commands or args.both:
        if procs:
            if not args.brief:
                print('\t Affected Processes:')
            elif not args.nosep:
                print(brief_pids_fmt.format(*_PIDS_HEADINGS))
            for proc in procs:
                if args.brief:
                    print(brief_pids_fmt.format(*brief_limits, *proc))
                else:
                    print('\t\t{} {}'.format(*proc))
        elif not args.brief:
            print("\t Affected Processes: None at this level of the cgroup hierarchy.")
@bdowling
Copy link
Author

bdowling commented Feb 7, 2023

If you've found this script, you're likely on a journey troubleshooting k8s resource limits and how they can have a dramatic effect on your workloads if you don't get them right. CPU limits are a silent killer: there is currently not much in the ecosystem that exposes when these limits are encountered, so your pods just get CPU-squelched. While you can use Prometheus, sysdig, datadog and other tools to pull out this metadata, I thought a quick and dirty script to get the job done also helps. So if you find this useful, great.

Some links related to this topic and script...

https://home.robusta.dev/blog/stop-using-cpu-limits
https://medium.com/@betz.mark/understanding-resource-limits-in-kubernetes-cpu-time-9eff74d3161b
https://medium.com/omio-engineering/cpu-limits-and-aggressive-throttling-in-kubernetes-c5b20bd8a718
https://danluu.com/cgroup-throttling/
https://erickhun.com/posts/kubernetes-faster-services-no-cpu-limits/

@bdowling
Copy link
Author

bdowling commented Feb 8, 2023

fwiw, systemd-cgls and systemd-cgtop are useful tools for looking at cgroup hierarchy as well as the libcgroup-tools suite...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment