leverich/patch-3.5.0-bvt1

## patch-3.5.0-bvt1
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a1f493..ef258ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,10 @@ struct sched_entity {
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
+#ifdef CONFIG_CFS_BVT
+	u64			effective_vruntime;
+	unsigned int		is_warped;
+#endif
 	u64			prev_sum_exec_runtime;

 	u64			nr_migrations;
@@ -2061,6 +2065,10 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif

+#ifdef CONFIG_CFS_BVT
+extern unsigned int sysctl_sched_bvt_place_epsilon;
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/init/Kconfig b/init/Kconfig
index d07dcf9..b0ac6b3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -787,6 +787,20 @@ config CFS_BANDWIDTH
 	  restriction.
 	  See tip/Documentation/scheduler/sched-bwc.txt for more information.

+config CFS_BVT
+	bool "Borrowed-Virtual Time support for CFS"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This feature enables BVT extensions to CFS.  This feature
+	  allows you to bias the wakeup preemption decisions that the
+	  CFS scheduler makes.  For instance, you can guarantee that
+	  latency-sensitive task groups are not preempted by waking
+	  batch-oriented task groups, independent of how much CPU share
+	  you allocate to each group.
+	  FIXME: More documentation.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd4..497d946 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1711,6 +1711,10 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+#ifdef CONFIG_CFS_BVT
+	p->se.effective_vruntime	= 0;
+	p->se.is_warped			= 0;
+#endif
 	INIT_LIST_HEAD(&p->se.group_node);

 #ifdef CONFIG_SCHEDSTATS
@@ -7284,6 +7288,9 @@ void __init sched_init(void)
 		 */
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
+#ifdef CONFIG_CFS_BVT
+		root_task_group.bvt_warp_ns = 0;
+#endif /* CONFIG_CFS_BVT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */

 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
@@ -8217,6 +8224,23 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
+
+#ifdef CONFIG_CFS_BVT
+static int cpu_bvt_warp_write_s64(struct cgroup *cgrp,
+				struct cftype *cftype, s64 warp_ns)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	tg->bvt_warp_ns = warp_ns;
+	return 0;
+}
+
+static s64 cpu_bvt_warp_read_s64(struct cgroup *cgrp,
+				struct cftype *cftype)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	return tg->bvt_warp_ns;
+}
+#endif /* CONFIG_CFS_BVT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */

 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8267,6 +8291,13 @@ static struct cftype cpu_files[] = {
 		.read_map = cpu_stats_show,
 	},
 #endif
+#ifdef CONFIG_CFS_BVT
+	{
+		.name = "bvt_warp_ns",
+		.read_s64 = cpu_bvt_warp_read_s64,
+		.write_s64 = cpu_bvt_warp_write_s64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6..6bfe63e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -110,6 +110,22 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif

+#ifdef CONFIG_CFS_BVT
+/*
+ * If the BVT_PLACEMENT scheduler feature is enabled, waking BVT tasks
+ * are placed differently from CFS tasks when they wake up.  Rather
+ * than being placed some large factor (i.e. sched_latency >> 1)
+ * before min_vruntime (which gives waking tasks an unfair advantage
+ * in preempting currently running tasks), they are placed
+ * sched_bvt_place_epsilon nanoseconds after min_vruntime.  If
+ * you really want a BVT task to preempt currently running tasks, it
+ * should have a greater "warp" value than the current running task.
+ *
+ * Default: 1us in the future, units: nanoseconds
+ */
+unsigned int sysctl_sched_bvt_place_epsilon = 1000UL;
+#endif
+
 /*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
@@ -416,6 +432,26 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)

 #endif	/* CONFIG_FAIR_GROUP_SCHED */

+#ifdef CONFIG_CFS_BVT
+static inline void update_effective_vruntime(struct sched_entity *se)
+{
+	s64 warp;
+	struct task_group *tg;
+	/* Recompute se->effective_vruntime from the current se->vruntime. */
+	if (entity_is_task(se)) {
+		se->effective_vruntime = se->vruntime;
+		return;		/* is_warped is left untouched for tasks */
+	}
+	/* Group entity: subtract the warp of the group owning se->my_q. */
+	tg = se->my_q->tg;
+	warp = tg->bvt_warp_ns;
+
+	/* FIXME: Should we calc_delta_fair on warp_ns? */
+	se->effective_vruntime = se->vruntime - warp;
+	se->is_warped = warp ? 1 : 0;	/* read by place_entity() */
+}
+#endif /* CONFIG_CFS_BVT */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

@@ -444,7 +480,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 static inline int entity_before(struct sched_entity *a,
 				struct sched_entity *b)
 {
+#ifdef CONFIG_CFS_BVT
+	return (s64)(a->effective_vruntime - b->effective_vruntime) < 0;
+#else
 	return (s64)(a->vruntime - b->vruntime) < 0;
+#endif
 }

 static void update_min_vruntime(struct cfs_rq *cfs_rq)
@@ -674,6 +714,9 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

 	curr->vruntime += delta_exec_weighted;
+#ifdef CONFIG_CFS_BVT
+	update_effective_vruntime(curr);
+#endif
 	update_min_vruntime(cfs_rq);

 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
@@ -1074,10 +1117,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 		vruntime -= thresh;
 	}

+#ifdef CONFIG_CFS_BVT
+	if (sched_feat(BVT_PLACEMENT) && !entity_is_task(se) && se->is_warped) {
+		vruntime = cfs_rq->min_vruntime + sysctl_sched_bvt_place_epsilon;
+	}
+#endif
+
 	/* ensure we never gain time by being placed backwards. */
 	vruntime = max_vruntime(se->vruntime, vruntime);

 	se->vruntime = vruntime;
+#ifdef CONFIG_CFS_BVT
+	update_effective_vruntime(se);
+#endif
 }

 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1089,8 +1141,13 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update the normalized vruntime before updating min_vruntime
 	 * through callig update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) {
 		se->vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+		update_effective_vruntime(se);
+#endif
+	}
+

 	/*
 	 * Update run-time statistics of the 'current'.
@@ -1199,8 +1256,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * update can refer to the ->curr item and we need to reflect this
 	 * movement in our normalized position.
 	 */
-	if (!(flags & DEQUEUE_SLEEP))
+	if (!(flags & DEQUEUE_SLEEP)) {
 		se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+		update_effective_vruntime(se);
+#endif
+	}

 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
@@ -1240,7 +1301,11 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;

 	se = __pick_first_entity(cfs_rq);
+#ifdef CONFIG_CFS_BVT
+	delta = curr->effective_vruntime - se->effective_vruntime;
+#else
 	delta = curr->vruntime - se->vruntime;
+#endif

 	if (delta < 0)
 		return;
@@ -2351,6 +2416,9 @@ static void task_waking_fair(struct task_struct *p)
 #endif

 	se->vruntime -= min_vruntime;
+#ifdef CONFIG_CFS_BVT
+	update_effective_vruntime(se);
+#endif
 }

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2849,7 +2917,11 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
+#ifdef CONFIG_CFS_BVT
+	s64 gran, vdiff = curr->effective_vruntime - se->effective_vruntime;
+#else
 	s64 gran, vdiff = curr->vruntime - se->vruntime;
+#endif

 	if (vdiff <= 0)
 		return -1;
@@ -4937,8 +5009,12 @@ static void task_fork_fair(struct task_struct *p)

 	update_curr(cfs_rq);

-	if (curr)
+	if (curr) {
 		se->vruntime = curr->vruntime;
+#ifdef CONFIG_CFS_BVT
+		update_effective_vruntime(se);
+#endif
+	}
 	place_entity(cfs_rq, se, 1);

 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -4947,10 +5023,17 @@ static void task_fork_fair(struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
+#ifdef CONFIG_CFS_BVT
+		update_effective_vruntime(curr);
+		update_effective_vruntime(se);
+#endif
 		resched_task(rq->curr);
 	}

 	se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+	update_effective_vruntime(se);
+#endif

 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -4998,6 +5081,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		 */
 		place_entity(cfs_rq, se, 0);
 		se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+		update_effective_vruntime(se);
+#endif
 	}
 }

@@ -5083,6 +5169,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	set_task_rq(p, task_cpu(p));
 	if (!on_rq)
 		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+	update_effective_vruntime(&p->se);
+#endif
 }

 void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a48..256b29c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,5 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+SCHED_FEAT(BVT_PLACEMENT, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f2..e2173b3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -114,6 +114,10 @@ struct task_group {
 	atomic_t load_weight;
 #endif

+#ifdef CONFIG_CFS_BVT
+	s64 bvt_warp_ns;
+#endif
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct sched_rt_entity **rt_se;
 	struct rt_rq **rt_rq;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab1187..fd86401 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -373,6 +373,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_BVT
+	{
+		.procname	= "sched_bvt_place_epsilon",
+		.data		= &sysctl_sched_bvt_place_epsilon,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		/* The variable is unsigned: refuse negative writes. */
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
	diff --git a/include/linux/sched.h b/include/linux/sched.h
	index 4a1f493..ef258ef 100644
	--- a/include/linux/sched.h
	+++ b/include/linux/sched.h
	@@ -1179,6 +1179,10 @@ struct sched_entity {
	u64 exec_start;
	u64 sum_exec_runtime;
	u64 vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ u64 effective_vruntime;
	+ unsigned int is_warped;
	+#endif
	u64 prev_sum_exec_runtime;

	u64 nr_migrations;
	@@ -2061,6 +2065,10 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
	extern unsigned int sysctl_sched_cfs_bandwidth_slice;
	#endif

	+#ifdef CONFIG_CFS_BVT
	+extern unsigned int sysctl_sched_bvt_place_epsilon;
	+#endif
	+
	#ifdef CONFIG_RT_MUTEXES
	extern int rt_mutex_getprio(struct task_struct *p);
	extern void rt_mutex_setprio(struct task_struct *p, int prio);
	diff --git a/init/Kconfig b/init/Kconfig
	index d07dcf9..b0ac6b3 100644
	--- a/init/Kconfig
	+++ b/init/Kconfig
	@@ -787,6 +787,20 @@ config CFS_BANDWIDTH
	restriction.
	See tip/Documentation/scheduler/sched-bwc.txt for more information.

	+config CFS_BVT
	+ bool "Borrowed-Virtual Time support for CFS"
	+ depends on EXPERIMENTAL
	+ depends on FAIR_GROUP_SCHED
	+ default n
	+ help
	+ This feature enables BVT extensions to CFS. This feature
	+ allows you to bias the wakeup preemption decisions that the
	+ CFS scheduler makes. For instance, you can guarantee that
	+ latency-sensitive task groups are not preempted by waking
	+ batch-oriented task groups, independent of how much CPU share
	+ you allocate to each group.
	+ FIXME: More documentation.
	+
	config RT_GROUP_SCHED
	bool "Group scheduling for SCHED_RR/FIFO"
	depends on EXPERIMENTAL
	diff --git a/kernel/sched/core.c b/kernel/sched/core.c
	index 468bdd4..497d946 100644
	--- a/kernel/sched/core.c
	+++ b/kernel/sched/core.c
	@@ -1711,6 +1711,10 @@ static void __sched_fork(struct task_struct *p)
	p->se.prev_sum_exec_runtime = 0;
	p->se.nr_migrations = 0;
	p->se.vruntime = 0;
	+#ifdef CONFIG_CFS_BVT
	+ p->se.effective_vruntime = 0;
	+ p->se.is_warped = 0;
	+#endif
	INIT_LIST_HEAD(&p->se.group_node);

	#ifdef CONFIG_SCHEDSTATS
	@@ -7284,6 +7288,9 @@ void __init sched_init(void)
	*/
	init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
	init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
	+#ifdef CONFIG_CFS_BVT
	+ root_task_group.bvt_warp_ns = 0;
	+#endif /* CONFIG_CFS_BVT */
	#endif /* CONFIG_FAIR_GROUP_SCHED */

	rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
	@@ -8217,6 +8224,23 @@ static int cpu_stats_show(struct cgroup cgrp, struct cftype cft,
	return 0;
	}
	#endif /* CONFIG_CFS_BANDWIDTH */
	+
	+#ifdef CONFIG_CFS_BVT
	+static int cpu_bvt_warp_write_s64(struct cgroup *cgrp,
	+ struct cftype *cftype, s64 warp_ns)
	+{
	+ struct task_group *tg = cgroup_tg(cgrp);
	+ tg->bvt_warp_ns = warp_ns;
	+ return 0;
	+}
	+
	+static s64 cpu_bvt_warp_read_s64(struct cgroup *cgrp,
	+ struct cftype *cftype)
	+{
	+ struct task_group *tg = cgroup_tg(cgrp);
	+ return tg->bvt_warp_ns;
	+}
	+#endif /* CONFIG_CFS_BVT */
	#endif /* CONFIG_FAIR_GROUP_SCHED */

	#ifdef CONFIG_RT_GROUP_SCHED
	@@ -8267,6 +8291,13 @@ static struct cftype cpu_files[] = {
	.read_map = cpu_stats_show,
	},
	#endif
	+#ifdef CONFIG_CFS_BVT
	+ {
	+ .name = "bvt_warp_ns",
	+ .read_s64 = cpu_bvt_warp_read_s64,
	+ .write_s64 = cpu_bvt_warp_write_s64,
	+ },
	+#endif
	#ifdef CONFIG_RT_GROUP_SCHED
	{
	.name = "rt_runtime_us",
	diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
	index c099cc6..6bfe63e 100644
	--- a/kernel/sched/fair.c
	+++ b/kernel/sched/fair.c
	@@ -110,6 +110,22 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
	#endif

	+#ifdef CONFIG_CFS_BVT
	+/*
	+ * If the BVT_PLACEMENT scheduler feature is enabled, waking BVT tasks
	+ * are placed differently from CFS tasks when they wake up. Rather
	+ * than being placed some large factor (i.e. sched_latency >> 1)
	+ * before min_vruntime (which gives waking tasks an unfair advantage
	+ * in preempting currently running tasks), they are placed
	+ * sched_bvt_place_epsilon nanoseconds after min_vruntime. If
	+ * you really want a BVT task to preempt currently running tasks, it
	+ * should have a greater "warp" value than the current running task.
	+ *
	+ * Default: 1us in the future, units: nanoseconds
	+ */
	+unsigned int sysctl_sched_bvt_place_epsilon = 1000UL;
	+#endif
	+
	/*
	* Increase the granularity value when there are more CPUs,
	* because with more CPUs the 'effective latency' as visible
	@@ -416,6 +432,26 @@ find_matching_se(struct sched_entity se, struct sched_entity pse)

	#endif /* CONFIG_FAIR_GROUP_SCHED */

	+#ifdef CONFIG_CFS_BVT
	+static inline void update_effective_vruntime(struct sched_entity *se)
	+{
	+ s64 warp;
	+ struct task_group *tg;
	+
	+ if (entity_is_task(se)) {
	+ se->effective_vruntime = se->vruntime;
	+ return;
	+ }
	+
	+ tg = se->my_q->tg;
	+ warp = tg->bvt_warp_ns;
	+
	+ /* FIXME: Should we calc_delta_fair on warp_ns? */
	+ se->effective_vruntime = se->vruntime - warp;
	+ se->is_warped = warp ? 1 : 0;
	+}
	+#endif /* CONFIG_CFS_BVT */
	+
	static __always_inline
	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

	@@ -444,7 +480,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
	static inline int entity_before(struct sched_entity *a,
	struct sched_entity *b)
	{
	+#ifdef CONFIG_CFS_BVT
	+ return (s64)(a->effective_vruntime - b->effective_vruntime) < 0;
	+#else
	return (s64)(a->vruntime - b->vruntime) < 0;
	+#endif
	}

	static void update_min_vruntime(struct cfs_rq *cfs_rq)
	@@ -674,6 +714,9 @@ __update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(curr);
	+#endif
	update_min_vruntime(cfs_rq);

	#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
	@@ -1074,10 +1117,19 @@ place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
	vruntime -= thresh;
	}

	+#ifdef CONFIG_CFS_BVT
	+ if (sched_feat(BVT_PLACEMENT) && !entity_is_task(se) && se->is_warped) {
	+ vruntime = cfs_rq->min_vruntime + sysctl_sched_bvt_place_epsilon;
	+ }
	+#endif
	+
	/* ensure we never gain time by being placed backwards. */
	vruntime = max_vruntime(se->vruntime, vruntime);

	se->vruntime = vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif
	}

	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
	@@ -1089,8 +1141,13 @@ enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
	* Update the normalized vruntime before updating min_vruntime
	* through callig update_curr().
	*/
	- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
	+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) {
	se->vruntime += cfs_rq->min_vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif
	+ }
	+

	/*
	* Update run-time statistics of the 'current'.
	@@ -1199,8 +1256,12 @@ dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
	* update can refer to the ->curr item and we need to reflect this
	* movement in our normalized position.
	*/
	- if (!(flags & DEQUEUE_SLEEP))
	+ if (!(flags & DEQUEUE_SLEEP)) {
	se->vruntime -= cfs_rq->min_vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif
	+ }

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);
	@@ -1240,7 +1301,11 @@ check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
	return;

	se = __pick_first_entity(cfs_rq);
	+#ifdef CONFIG_CFS_BVT
	+ delta = curr->effective_vruntime - se->effective_vruntime;
	+#else
	delta = curr->vruntime - se->vruntime;
	+#endif

	if (delta < 0)
	return;
	@@ -2351,6 +2416,9 @@ static void task_waking_fair(struct task_struct *p)
	#endif

	se->vruntime -= min_vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif
	}

	#ifdef CONFIG_FAIR_GROUP_SCHED
	@@ -2849,7 +2917,11 @@ wakeup_gran(struct sched_entity curr, struct sched_entity se)
	static int
	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
	{
	+#ifdef CONFIG_CFS_BVT
	+ s64 gran, vdiff = curr->effective_vruntime - se->effective_vruntime;
	+#else
	s64 gran, vdiff = curr->vruntime - se->vruntime;
	+#endif

	if (vdiff <= 0)
	return -1;
	@@ -4937,8 +5009,12 @@ static void task_fork_fair(struct task_struct *p)

	update_curr(cfs_rq);

	- if (curr)
	+ if (curr) {
	se->vruntime = curr->vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif
	+ }
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
	@@ -4947,10 +5023,17 @@ static void task_fork_fair(struct task_struct *p)
	* 'current' within the tree based on its new key value.
	*/
	swap(curr->vruntime, se->vruntime);
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(curr);
	+ update_effective_vruntime(se);
	+#endif
	resched_task(rq->curr);
	}

	se->vruntime -= cfs_rq->min_vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif

	raw_spin_unlock_irqrestore(&rq->lock, flags);
	}
	@@ -4998,6 +5081,9 @@ static void switched_from_fair(struct rq rq, struct task_struct p)
	*/
	place_entity(cfs_rq, se, 0);
	se->vruntime -= cfs_rq->min_vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(se);
	+#endif
	}
	}

	@@ -5083,6 +5169,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
	set_task_rq(p, task_cpu(p));
	if (!on_rq)
	p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
	+#ifdef CONFIG_CFS_BVT
	+ update_effective_vruntime(&p->se);
	+#endif
	}

	void free_fair_sched_group(struct task_group *tg)
	diff --git a/kernel/sched/features.h b/kernel/sched/features.h
	index de00a48..256b29c 100644
	--- a/kernel/sched/features.h
	+++ b/kernel/sched/features.h
	@@ -69,3 +69,5 @@ SCHED_FEAT(TTWU_QUEUE, true)
	SCHED_FEAT(FORCE_SD_OVERLAP, false)
	SCHED_FEAT(RT_RUNTIME_SHARE, true)
	SCHED_FEAT(LB_MIN, false)
	+
	+SCHED_FEAT(BVT_PLACEMENT, true)
	diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
	index 55844f2..e2173b3 100644
	--- a/kernel/sched/sched.h
	+++ b/kernel/sched/sched.h
	@@ -114,6 +114,10 @@ struct task_group {
	atomic_t load_weight;
	#endif

	+#ifdef CONFIG_CFS_BVT
	+ s64 bvt_warp_ns;
	+#endif
	+
	#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;
	diff --git a/kernel/sysctl.c b/kernel/sysctl.c
	index 4ab1187..fd86401 100644
	--- a/kernel/sysctl.c
	+++ b/kernel/sysctl.c
	@@ -373,6 +373,15 @@ static struct ctl_table kern_table[] = {
	.extra1 = &one,
	},
	#endif
	+#ifdef CONFIG_CFS_BVT
	+ {
	+ .procname = "sched_bvt_place_epsilon",
	+ .data = &sysctl_sched_bvt_place_epsilon,
	+ .maxlen = sizeof(unsigned int),
	+ .mode = 0644,
	+ .proc_handler = proc_dointvec,
	+ },
	+#endif
	#ifdef CONFIG_PROVE_LOCKING
	{
	.procname = "prove_locking",