Skip to content

Instantly share code, notes, and snippets.

@PaulFurtado
Last active July 30, 2019 03:08
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PaulFurtado/ff6c67ec87416b66ba1c6fc70f7beec1 to your computer and use it in GitHub Desktop.
Save PaulFurtado/ff6c67ec87416b66ba1c6fc70f7beec1 to your computer and use it in GitHub Desktop.

This is a kernel 4.14.133 backport of:

[PATCH v6 1/1] sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices

From Dave Chiluk's patch here: https://lkml.org/lkml/2019/7/23/673


Results of Dave's fibtest reproducer (https://github.com/indeedeng/fibtest)

On a c5.9xlarge ec2 instance:

  • CPU: 36 core Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
  • Mem: 68 GB

which is one of our primary kubernetes instance types. These are CPU-optimized instances with 36 cores.

Before:

# ./runfibtest 
Iterations Completed(M): 228 
Throttled for: 53 
CPU Usage (msecs) = 231

After:

# ./runfibtest 
Iterations Completed(M): 498 
Throttled for: 50 
CPU Usage (msecs) = 492

got through 2.2x more iterations

was able to use 2.1x more CPU time


On an r4.16xlarge (prev generation):

  • CPU: 64 core Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
  • Mem: 480GB

Without patch:

# ./runfibtest 
Iterations Completed(M): 170 
Throttled for: 60 
CPU Usage (msecs) = 336

With patch:

# ./runfibtest 
Iterations Completed(M): 179 
Throttled for: 55 
CPU Usage (msecs) = 512

got through only 1.05x more iterations

was able to use 1.52x CPU


m5.24xlarge:

  • CPU: 96 core Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
  • Mem: 369 GB

Without patch:

Iterations Completed(M): 192 
Throttled for: 62 
CPU Usage (msecs) = 246

With patch:

Iterations Completed(M): 366 
Throttled for: 51 
CPU Usage (msecs) = 505

1.91x more iterations

2.05x more CPU

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af7de1f9906c..75eab302d79d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4090,8 +4090,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4113,8 +4111,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4131,61 +4128,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
@@ -4369,8 +4322,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4392,7 +4344,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4417,7 +4368,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4445,8 +4396,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4459,8 +4408,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
cfs_b->distribute_running = 1;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock(&cfs_b->lock);
cfs_b->distribute_running = 0;
@@ -4537,8 +4485,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4570,7 +4517,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock(&cfs_b->lock);
@@ -4587,7 +4533,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4596,11 +4541,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock(&cfs_b->lock);
- if (expires == cfs_b->runtime_expires)
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 452b56923c6d..268f560ec998 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -280,8 +280,6 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota, runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
short idle, period_active;
struct hrtimer period_timer, slack_timer;
@@ -489,8 +487,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock, throttled_clock_task;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment