PaulFurtado/NOTES.md

## NOTES.md

      
    Raw
  

              NOTES.md
            
          
    This is a kernel 4.14.133 backport of:

[PATCH v6 1/1] sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices

From Dave Chiluk's patch here: https://lkml.org/lkml/2019/7/23/673

Results of Dave's fibtest reproducer (https://github.com/indeedeng/fibtest)
On a c5.9xlarge EC2 instance:

CPU: 36 core Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
Mem: 68 GB

This is one of our primary Kubernetes instance types: a CPU-optimized
instance with 36 cores.

Before:
# ./runfibtest 
Iterations Completed(M): 228 
Throttled for: 53 
CPU Usage (msecs) = 231

After:
# ./runfibtest 
Iterations Completed(M): 498 
Throttled for: 50 
CPU Usage (msecs) = 492

With the patch, the test got through 2.2x as many iterations
and was able to use 2.1x as much CPU time.

On an r4.16xlarge (prev generation):

CPU: 64 core Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
Mem: 480GB

Without patch:
# ./runfibtest 
Iterations Completed(M): 170 
Throttled for: 60 
CPU Usage (msecs) = 336

With patch:
# ./runfibtest 
Iterations Completed(M): 179 
Throttled for: 55 
CPU Usage (msecs) = 512

With the patch, the test got through only 1.05x as many iterations
but was able to use 1.52x as much CPU time.

m5.24xlarge:

CPU: 96 core Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
Mem: 369 GB

Without patch:
Iterations Completed(M): 192 
Throttled for: 62 
CPU Usage (msecs) = 246

With patch:
Iterations Completed(M): 366 
Throttled for: 51 
CPU Usage (msecs) = 505

With the patch, the test got through 1.91x as many iterations
and was able to use 2.05x as much CPU time.

  
## sched-fair-Fix-low-cpu-usage-with-high-throttling-by.patch
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af7de1f9906c..75eab302d79d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4090,8 +4090,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)

 	now = sched_clock_cpu(smp_processor_id());
 	cfs_b->runtime = cfs_b->quota;
-	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
-	cfs_b->expires_seq++;
 }

 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4113,8 +4111,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-	u64 amount = 0, min_amount, expires;
-	int expires_seq;
+	u64 amount = 0, min_amount;

 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4131,61 +4128,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 			cfs_b->idle = 0;
 		}
 	}
-	expires_seq = cfs_b->expires_seq;
-	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);

 	cfs_rq->runtime_remaining += amount;
-	/*
-	 * we may have advanced our local expiration to account for allowed
-	 * spread between our sched_clock and the one on which runtime was
-	 * issued.
-	 */
-	if (cfs_rq->expires_seq != expires_seq) {
-		cfs_rq->expires_seq = expires_seq;
-		cfs_rq->runtime_expires = expires;
-	}

 	return cfs_rq->runtime_remaining > 0;
 }

-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
-	/* if the deadline is ahead of our clock, nothing to do */
-	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
-		return;
-
-	if (cfs_rq->runtime_remaining < 0)
-		return;
-
-	/*
-	 * If the local deadline has passed we have to consider the
-	 * possibility that our sched_clock is 'fast' and the global deadline
-	 * has not truly expired.
-	 *
-	 * Fortunately we can check determine whether this the case by checking
-	 * whether the global deadline(cfs_b->expires_seq) has advanced.
-	 */
-	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
-		/* extend local deadline, drift is bounded above by 2 ticks */
-		cfs_rq->runtime_expires += TICK_NSEC;
-	} else {
-		/* global deadline is ahead, expiration has passed */
-		cfs_rq->runtime_remaining = 0;
-	}
-}
-
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
-	expire_cfs_rq_runtime(cfs_rq);

 	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
@@ -4369,8 +4322,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		resched_curr(rq);
 }

-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
-		u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
 {
 	struct cfs_rq *cfs_rq;
 	u64 runtime;
@@ -4392,7 +4344,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 		remaining -= runtime;

 		cfs_rq->runtime_remaining += runtime;
-		cfs_rq->runtime_expires = expires;

 		/* we check whether we're throttled above */
 		if (cfs_rq->runtime_remaining > 0)
@@ -4417,7 +4368,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
  */
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
-	u64 runtime, runtime_expires;
+	u64 runtime;
 	int throttled;

 	/* no need to continue the timer with no bandwidth constraint */
@@ -4445,8 +4396,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	/* account preceding periods in which throttling occurred */
 	cfs_b->nr_throttled += overrun;

-	runtime_expires = cfs_b->runtime_expires;
-
 	/*
 	 * This check is repeated as we are holding onto the new bandwidth while
 	 * we unthrottle. This can potentially race with an unthrottled group
@@ -4459,8 +4408,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
-		runtime = distribute_cfs_runtime(cfs_b, runtime,
-						 runtime_expires);
+		runtime = distribute_cfs_runtime(cfs_b, runtime);
 		raw_spin_lock(&cfs_b->lock);

 		cfs_b->distribute_running = 0;
@@ -4537,8 +4485,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 		return;

 	raw_spin_lock(&cfs_b->lock);
-	if (cfs_b->quota != RUNTIME_INF &&
-	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+	if (cfs_b->quota != RUNTIME_INF) {
 		cfs_b->runtime += slack_runtime;

 		/* we are under rq->lock, defer unthrottling using a timer */
@@ -4570,7 +4517,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
-	u64 expires;

 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
@@ -4587,7 +4533,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
 		runtime = cfs_b->runtime;

-	expires = cfs_b->runtime_expires;
 	if (runtime)
 		cfs_b->distribute_running = 1;

@@ -4596,11 +4541,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	if (!runtime)
 		return;

-	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+	runtime = distribute_cfs_runtime(cfs_b, runtime);

 	raw_spin_lock(&cfs_b->lock);
-	if (expires == cfs_b->runtime_expires)
-		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->runtime -= min(runtime, cfs_b->runtime);
 	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 452b56923c6d..268f560ec998 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -280,8 +280,6 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchical_quota;
-	u64 runtime_expires;
-	int expires_seq;

 	short idle, period_active;
 	struct hrtimer period_timer, slack_timer;
@@ -489,8 +487,6 @@ struct cfs_rq {

 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
-	int expires_seq;
-	u64 runtime_expires;
 	s64 runtime_remaining;

 	u64 throttled_clock, throttled_clock_task;
	diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
	index af7de1f9906c..75eab302d79d 100644
	--- a/kernel/sched/fair.c
	+++ b/kernel/sched/fair.c
	@@ -4090,8 +4090,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)

	now = sched_clock_cpu(smp_processor_id());
	cfs_b->runtime = cfs_b->quota;
	- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
	- cfs_b->expires_seq++;
	}

	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
	@@ -4113,8 +4111,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
	{
	struct task_group *tg = cfs_rq->tg;
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
	- u64 amount = 0, min_amount, expires;
	- int expires_seq;
	+ u64 amount = 0, min_amount;

	/* note: this is a positive sum as runtime_remaining <= 0 */
	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
	@@ -4131,61 +4128,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
	cfs_b->idle = 0;
	}
	}
	- expires_seq = cfs_b->expires_seq;
	- expires = cfs_b->runtime_expires;
	raw_spin_unlock(&cfs_b->lock);

	cfs_rq->runtime_remaining += amount;
	- /*
	- * we may have advanced our local expiration to account for allowed
	- * spread between our sched_clock and the one on which runtime was
	- * issued.
	- */
	- if (cfs_rq->expires_seq != expires_seq) {
	- cfs_rq->expires_seq = expires_seq;
	- cfs_rq->runtime_expires = expires;
	- }

	return cfs_rq->runtime_remaining > 0;
	}

	-/*
	- * Note: This depends on the synchronization provided by sched_clock and the
	- * fact that rq->clock snapshots this value.
	- */
	-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
	-{
	- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	-
	- /* if the deadline is ahead of our clock, nothing to do */
	- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
	- return;
	-
	- if (cfs_rq->runtime_remaining < 0)
	- return;
	-
	- /*
	- * If the local deadline has passed we have to consider the
	- * possibility that our sched_clock is 'fast' and the global deadline
	- * has not truly expired.
	- *
	- * Fortunately we can check determine whether this the case by checking
	- * whether the global deadline(cfs_b->expires_seq) has advanced.
	- */
	- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
	- /* extend local deadline, drift is bounded above by 2 ticks */
	- cfs_rq->runtime_expires += TICK_NSEC;
	- } else {
	- /* global deadline is ahead, expiration has passed */
	- cfs_rq->runtime_remaining = 0;
	- }
	-}
	-
	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
	{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;
	- expire_cfs_rq_runtime(cfs_rq);

	if (likely(cfs_rq->runtime_remaining > 0))
	return;
	@@ -4369,8 +4322,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
	resched_curr(rq);
	}

	-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
	- u64 remaining, u64 expires)
	+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
	{
	struct cfs_rq *cfs_rq;
	u64 runtime;
	@@ -4392,7 +4344,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
	remaining -= runtime;

	cfs_rq->runtime_remaining += runtime;
	- cfs_rq->runtime_expires = expires;

	/* we check whether we're throttled above */
	if (cfs_rq->runtime_remaining > 0)
	@@ -4417,7 +4368,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
	*/
	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
	{
	- u64 runtime, runtime_expires;
	+ u64 runtime;
	int throttled;

	/* no need to continue the timer with no bandwidth constraint */
	@@ -4445,8 +4396,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
	/* account preceding periods in which throttling occurred */
	cfs_b->nr_throttled += overrun;

	- runtime_expires = cfs_b->runtime_expires;
	-
	/*
	* This check is repeated as we are holding onto the new bandwidth while
	* we unthrottle. This can potentially race with an unthrottled group
	@@ -4459,8 +4408,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
	cfs_b->distribute_running = 1;
	raw_spin_unlock(&cfs_b->lock);
	/* we can't nest cfs_b->lock while distributing bandwidth */
	- runtime = distribute_cfs_runtime(cfs_b, runtime,
	- runtime_expires);
	+ runtime = distribute_cfs_runtime(cfs_b, runtime);
	raw_spin_lock(&cfs_b->lock);

	cfs_b->distribute_running = 0;
	@@ -4537,8 +4485,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
	return;

	raw_spin_lock(&cfs_b->lock);
	- if (cfs_b->quota != RUNTIME_INF &&
	- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
	+ if (cfs_b->quota != RUNTIME_INF) {
	cfs_b->runtime += slack_runtime;

	/* we are under rq->lock, defer unthrottling using a timer */
	@@ -4570,7 +4517,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
	{
	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
	- u64 expires;

	/* confirm we're still not at a refresh boundary */
	raw_spin_lock(&cfs_b->lock);
	@@ -4587,7 +4533,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
	runtime = cfs_b->runtime;

	- expires = cfs_b->runtime_expires;
	if (runtime)
	cfs_b->distribute_running = 1;

	@@ -4596,11 +4541,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
	if (!runtime)
	return;

	- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
	+ runtime = distribute_cfs_runtime(cfs_b, runtime);

	raw_spin_lock(&cfs_b->lock);
	- if (expires == cfs_b->runtime_expires)
	- cfs_b->runtime -= min(runtime, cfs_b->runtime);
	+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
	cfs_b->distribute_running = 0;
	raw_spin_unlock(&cfs_b->lock);
	}
	diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
	index 452b56923c6d..268f560ec998 100644
	--- a/kernel/sched/sched.h
	+++ b/kernel/sched/sched.h
	@@ -280,8 +280,6 @@ struct cfs_bandwidth {
	ktime_t period;
	u64 quota, runtime;
	s64 hierarchical_quota;
	- u64 runtime_expires;
	- int expires_seq;

	short idle, period_active;
	struct hrtimer period_timer, slack_timer;
	@@ -489,8 +487,6 @@ struct cfs_rq {

	#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	- int expires_seq;
	- u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_clock, throttled_clock_task;