Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Borrowed Virtual Time patch against Linux 3.5.0
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a1f493..ef258ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,10 @@ struct sched_entity {
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
+#ifdef CONFIG_CFS_BVT
+ u64 effective_vruntime;
+ unsigned int is_warped;
+#endif
u64 prev_sum_exec_runtime;
u64 nr_migrations;
@@ -2061,6 +2065,10 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
+#ifdef CONFIG_CFS_BVT
+extern unsigned int sysctl_sched_bvt_place_epsilon;
+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/init/Kconfig b/init/Kconfig
index d07dcf9..b0ac6b3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -787,6 +787,20 @@ config CFS_BANDWIDTH
restriction.
See tip/Documentation/scheduler/sched-bwc.txt for more information.
+config CFS_BVT
+ bool "Borrowed-Virtual Time support for CFS"
+ depends on EXPERIMENTAL
+ depends on FAIR_GROUP_SCHED
+ default n
+ help
+ This feature enables BVT extensions to CFS. This feature
+ allows you to bias the wakeup preemption decisions that the
+ CFS scheduler makes. For instance, you can guarantee that
+ latency-sensitive task groups are not preempted by waking
+ batch-oriented task groups, independent of how much CPU share
+ you allocate to each group.
+ FIXME: More documentation.
+
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on EXPERIMENTAL
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd4..497d946 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1711,6 +1711,10 @@ static void __sched_fork(struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_CFS_BVT
+ p->se.effective_vruntime = 0;
+ p->se.is_warped = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -7284,6 +7288,9 @@ void __init sched_init(void)
*/
init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
+#ifdef CONFIG_CFS_BVT
+ root_task_group.bvt_warp_ns = 0;
+#endif /* CONFIG_CFS_BVT */
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
@@ -8217,6 +8224,23 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
+
+#ifdef CONFIG_CFS_BVT
+static int cpu_bvt_warp_write_s64(struct cgroup *cgrp,
+ struct cftype *cftype, s64 warp_ns)
+{ /* .write_s64 handler for the "bvt_warp_ns" cgroup file */
+ struct task_group *tg = cgroup_tg(cgrp);
+ tg->bvt_warp_ns = warp_ns; /* stored as-is, no range check */
+ return 0; /* this handler never reports failure */
+}
+
+static s64 cpu_bvt_warp_read_s64(struct cgroup *cgrp,
+ struct cftype *cftype)
+{ /* .read_s64 handler for the "bvt_warp_ns" cgroup file */
+ struct task_group *tg = cgroup_tg(cgrp);
+ return tg->bvt_warp_ns; /* current warp of this task group */
+}
+#endif /* CONFIG_CFS_BVT */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8267,6 +8291,13 @@ static struct cftype cpu_files[] = {
.read_map = cpu_stats_show,
},
#endif
+#ifdef CONFIG_CFS_BVT
+ {
+ .name = "bvt_warp_ns",
+ .read_s64 = cpu_bvt_warp_read_s64,
+ .write_s64 = cpu_bvt_warp_write_s64,
+ },
+#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
.name = "rt_runtime_us",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6..6bfe63e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -110,6 +110,22 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+#ifdef CONFIG_CFS_BVT
+/*
+ * If the BVT_PLACEMENT scheduler feature is enabled, waking BVT tasks
+ * are placed differently from CFS tasks when they wake up. Rather
+ * than being placed some large factor (i.e. sched_latency >> 1)
+ * before min_vruntime (which gives waking tasks an unfair advantage
+ * in preempting currently running tasks), they are placed
+ * sched_bvt_place_epsilon nanoseconds relative to min_vruntime. If
+ * you really want a BVT task to preempt currently running tasks, it
+ * should have a greater "warp" value than the current running task.
+ *
+ * Default: 1us in the future, units: nanoseconds
+ */
+unsigned int sysctl_sched_bvt_place_epsilon = 1000UL;
+#endif
+
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
@@ -416,6 +432,26 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_CFS_BVT
+static inline void update_effective_vruntime(struct sched_entity *se)
+{ /* Recompute se->effective_vruntime from se->vruntime and the group warp. */
+ s64 warp;
+ struct task_group *tg;
+
+ if (entity_is_task(se)) { /* task entity: key is the plain vruntime */
+ se->effective_vruntime = se->vruntime;
+ return; /* is_warped is left untouched on this path */
+ }
+
+ tg = se->my_q->tg; /* group entity: use the warp of my_q's task group */
+ warp = tg->bvt_warp_ns; /* written via the "bvt_warp_ns" cgroup file */
+
+ /* FIXME: Should we calc_delta_fair on warp_ns? */
+ se->effective_vruntime = se->vruntime - warp; /* larger warp => earlier */
+ se->is_warped = warp ? 1 : 0; /* any non-zero warp, negative included */
+}
+#endif /* CONFIG_CFS_BVT */
+
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
@@ -444,7 +480,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
static inline int entity_before(struct sched_entity *a,
struct sched_entity *b)
{
+#ifdef CONFIG_CFS_BVT
+ return (s64)(a->effective_vruntime - b->effective_vruntime) < 0;
+#else
return (s64)(a->vruntime - b->vruntime) < 0;
+#endif
}
static void update_min_vruntime(struct cfs_rq *cfs_rq)
@@ -674,6 +714,9 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(curr);
+#endif
update_min_vruntime(cfs_rq);
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
@@ -1074,10 +1117,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh;
}
+#ifdef CONFIG_CFS_BVT
+ if (sched_feat(BVT_PLACEMENT) && !entity_is_task(se) && se->is_warped) {
+ vruntime = cfs_rq->min_vruntime + sysctl_sched_bvt_place_epsilon;
+ }
+#endif
+
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
se->vruntime = vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1089,8 +1141,13 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) {
se->vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
+ }
+
/*
* Update run-time statistics of the 'current'.
@@ -1199,8 +1256,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
- if (!(flags & DEQUEUE_SLEEP))
+ if (!(flags & DEQUEUE_SLEEP)) {
se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
+ }
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -1240,7 +1301,11 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
se = __pick_first_entity(cfs_rq);
+#ifdef CONFIG_CFS_BVT
+ delta = curr->effective_vruntime - se->effective_vruntime;
+#else
delta = curr->vruntime - se->vruntime;
+#endif
if (delta < 0)
return;
@@ -2351,6 +2416,9 @@ static void task_waking_fair(struct task_struct *p)
#endif
se->vruntime -= min_vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2849,7 +2917,11 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
+#ifdef CONFIG_CFS_BVT
+ s64 gran, vdiff = curr->effective_vruntime - se->effective_vruntime;
+#else
s64 gran, vdiff = curr->vruntime - se->vruntime;
+#endif
if (vdiff <= 0)
return -1;
@@ -4937,8 +5009,12 @@ static void task_fork_fair(struct task_struct *p)
update_curr(cfs_rq);
- if (curr)
+ if (curr) {
se->vruntime = curr->vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -4947,10 +5023,17 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(curr);
+ update_effective_vruntime(se);
+#endif
resched_task(rq->curr);
}
se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -4998,6 +5081,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(se);
+#endif
}
}
@@ -5083,6 +5169,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
set_task_rq(p, task_cpu(p));
if (!on_rq)
p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+#ifdef CONFIG_CFS_BVT
+ update_effective_vruntime(&p->se);
+#endif
}
void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a48..256b29c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,5 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
+
+SCHED_FEAT(BVT_PLACEMENT, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f2..e2173b3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -114,6 +114,10 @@ struct task_group {
atomic_t load_weight;
#endif
+#ifdef CONFIG_CFS_BVT
+ s64 bvt_warp_ns;
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab1187..fd86401 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -373,6 +373,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#ifdef CONFIG_CFS_BVT
+ {
+ .procname = "sched_bvt_place_epsilon",
+ .data = &sysctl_sched_bvt_place_epsilon,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment