sched/tune: Fix improper accounting of tasks
cgroup_migrate_execute() calls can_attach() and css_set_move_task()
separately, without holding rq->lock. The schedtune implementation breaks
here, since can_attach() accounts for the task move well before the group
move is committed. If the task sleeps right after can_attach(), the sleep
is accounted towards the previous group, and the per-group task counts end
up out of sync.

Consider this race: TaskA is being moved from root_grp to topapp_grp by
TaskB. Right before the move, root_grp.tasks = 1 and topapp_grp.tasks = 0.

On CPU X, where TaskA runs:

  * cgroup_migrate_execute()
      schedtune_can_attach()
        root_grp.tasks--; topapp_grp.tasks++;
        (root_grp.tasks = 0 and topapp_grp.tasks = 1)

  * right at this moment the context is switched and TaskA runs

  * TaskA sleeps
      dequeue_task()
        schedtune_dequeue_task()
          schedtune_task_update()
            root_grp.tasks--;

    TaskA has not really "switched" group, so the dequeue decrements
    root_grp. However, can_attach() has already accounted for the move,
    which leaves us with root_grp.tasks = 0 (it is protected against going
    negative) and topapp_grp.tasks = 1.

Now, even when CPU X is idle (TaskA is long gone sleeping), its
topapp_grp.tasks count stays positive and the CPU is subject to topapp's
boost unnecessarily.

An easy way to fix this is to move the group change accounting into the
attach() callback, which is called _after_ css_set_move_task(). Also
maintain the task's current boost group index in struct task_struct as the
task moves between groups, and account its enqueue/dequeue against that
cached index. If the task dequeues just before the group change, it is
subtracted from the old group, which is correct because the task bumped up
the old group's count when it was enqueued. If the task changes group while
it is running, the attach() callback has to decrement the old group's count
and increment the new group's, so that the next dequeue subtracts from the
new group. In other words, attach() adjusts the counts only for running
tasks, but updates the cached index for both running and sleeping tasks.

The current code uses a task->on_rq != 0 check to determine whether a task
is queued on the runqueue. This check is incorrect, because task->on_rq is
set to TASK_ON_RQ_MIGRATING (value = 2) during migration. Fix this by using
task_on_rq_queued() to check whether a task is queued.

Change-Id: If412da5a239c18d9122cfad2be59b355c14c068f
Signed-off-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
Co-developed-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
parent e63cbea4a5
commit bca62a0ae5

3 changed files with 79 additions and 89 deletions
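To make the accounting scheme in the commit message concrete, here is a
minimal, self-contained userspace C sketch, not the kernel code itself: the
per-CPU, per-group boost bookkeeping is reduced to a plain array, the cached
index lives in a tiny struct task, and a queued flag stands in for
task_on_rq_queued(). All names in it (group_tasks, enqueue, dequeue, attach)
are illustrative stand-ins rather than schedtune's real API.

/*
 * Illustrative userspace model (hypothetical names, not schedtune's API):
 * per-group counts in an array, a cached index in the task, and attach()
 * committing the move only after the cgroup switch, only for queued tasks.
 */
#include <stdio.h>
#include <stdbool.h>

#define NGROUPS 2              /* 0 = root_grp, 1 = topapp_grp */

struct task {
	int stune_idx;         /* cached boost group index */
	bool queued;           /* stand-in for task_on_rq_queued() */
};

static int group_tasks[NGROUPS];

static void enqueue(struct task *p)
{
	group_tasks[p->stune_idx]++;
	p->queued = true;
}

static void dequeue(struct task *p)
{
	/* always charge the cached index, never the (possibly moved) cgroup */
	if (group_tasks[p->stune_idx] > 0)
		group_tasks[p->stune_idx]--;
	p->queued = false;
}

/* attach(): runs after the cgroup move has been committed */
static void attach(struct task *p, int dst_idx)
{
	int src_idx = p->stune_idx;

	if (src_idx == dst_idx)
		return;

	p->stune_idx = dst_idx;         /* update the cache for every task */

	if (!p->queued)                 /* sleeping task: nothing to move */
		return;

	if (group_tasks[src_idx] > 0)   /* running task: move the count now */
		group_tasks[src_idx]--;
	group_tasks[dst_idx]++;
}

int main(void)
{
	struct task a = { .stune_idx = 0, .queued = false };

	enqueue(&a);    /* TaskA runs in root_grp: counts 1/0 */
	dequeue(&a);    /* TaskA sleeps before the move commits: 0/0 */
	attach(&a, 1);  /* TaskB's move lands while TaskA sleeps: still 0/0 */
	enqueue(&a);    /* next wakeup is charged to topapp_grp: 0/1 */

	printf("root_grp=%d topapp_grp=%d\n", group_tasks[0], group_tasks[1]);
	return 0;
}

Running this walks through the race described above: the dequeue that
happens before the move commits is charged to root_grp, attach() then only
updates the cached index for the sleeping task, and the next enqueue is
charged to topapp_grp, so no stale boost is left behind on an idle CPU.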
@@ -885,6 +885,9 @@ struct task_struct {
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group		*sched_task_group;
 #endif
+#ifdef CONFIG_SCHED_TUNE
+	int				stune_idx;
+#endif
 	struct sched_dl_entity		dl;
@@ -93,6 +93,9 @@ struct task_struct init_task
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 	.sched_task_group = &root_task_group,
 #endif
+#ifdef CONFIG_SCHED_TUNE
+	.stune_idx	= 0,
+#endif
 	.ptraced	= LIST_HEAD_INIT(init_task.ptraced),
 	.ptrace_entry	= LIST_HEAD_INIT(init_task.ptrace_entry),
@@ -420,7 +420,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
 {
 	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
 	unsigned long irq_flags;
-	struct schedtune *st;
 	int idx;
 
 	if (unlikely(!schedtune_initialized))
@@ -432,90 +431,16 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
 	 * do_exit()::cgroup_exit() and task migration.
 	 */
 	raw_spin_lock_irqsave(&bg->lock, irq_flags);
-	rcu_read_lock();
 
-	st = task_schedtune(p);
-	idx = st->idx;
+	idx = p->stune_idx;
 
 	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
 
-	rcu_read_unlock();
 	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
 }
 
 int schedtune_can_attach(struct cgroup_taskset *tset)
 {
-	struct task_struct *task;
-	struct cgroup_subsys_state *css;
-	struct boost_groups *bg;
-	struct rq_flags rq_flags;
-	unsigned int cpu;
-	struct rq *rq;
-	int src_bg; /* Source boost group index */
-	int dst_bg; /* Destination boost group index */
-	int tasks;
-	u64 now;
-
-	if (unlikely(!schedtune_initialized))
-		return 0;
-
-	cgroup_taskset_for_each(task, css, tset) {
-
-		/*
-		 * Lock the CPU's RQ the task is enqueued to avoid race
-		 * conditions with migration code while the task is being
-		 * accounted
-		 */
-		rq = task_rq_lock(task, &rq_flags);
-
-		if (!task->on_rq) {
-			task_rq_unlock(rq, task, &rq_flags);
-			continue;
-		}
-
-		/*
-		 * Boost group accouting is protected by a per-cpu lock and requires
-		 * interrupt to be disabled to avoid race conditions on...
-		 */
-		cpu = cpu_of(rq);
-		bg = &per_cpu(cpu_boost_groups, cpu);
-		raw_spin_lock(&bg->lock);
-
-		dst_bg = css_st(css)->idx;
-		src_bg = task_schedtune(task)->idx;
-
-		/*
-		 * Current task is not changing boostgroup, which can
-		 * happen when the new hierarchy is in use.
-		 */
-		if (unlikely(dst_bg == src_bg)) {
-			raw_spin_unlock(&bg->lock);
-			task_rq_unlock(rq, task, &rq_flags);
-			continue;
-		}
-
-		/*
-		 * This is the case of a RUNNABLE task which is switching its
-		 * current boost group.
-		 */
-
-		/* Move task from src to dst boost group */
-		tasks = bg->group[src_bg].tasks - 1;
-		bg->group[src_bg].tasks = max(0, tasks);
-		bg->group[dst_bg].tasks += 1;
-
-		/* Update boost hold start for this group */
-		now = sched_clock_cpu(cpu);
-		bg->group[dst_bg].ts = now;
-
-		/* Force boost group re-evaluation at next boost check */
-		bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
-
-		raw_spin_unlock(&bg->lock);
-		task_rq_unlock(rq, task, &rq_flags);
-	}
-
 	return 0;
 }
 
@@ -580,7 +505,6 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu)
 {
 	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
 	unsigned long irq_flags;
-	struct schedtune *st;
 	int idx;
 
 	if (unlikely(!schedtune_initialized))
@@ -591,14 +515,11 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu)
 	 * interrupt to be disabled to avoid race conditions on...
 	 */
 	raw_spin_lock_irqsave(&bg->lock, irq_flags);
-	rcu_read_lock();
 
-	st = task_schedtune(p);
-	idx = st->idx;
+	idx = p->stune_idx;
 
 	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
 
-	rcu_read_unlock();
 	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
 }
 
@@ -677,11 +598,19 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
 	return st->boost;
 }
 
-#ifdef CONFIG_SCHED_WALT
 static void schedtune_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct cgroup_subsys_state *css;
+	struct boost_groups *bg;
+	struct rq_flags rq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int src_idx; /* Source boost group index */
+	int dst_idx; /* Destination boost group index */
+	int tasks;
+	u64 now;
+#ifdef CONFIG_SCHED_WALT
 	struct schedtune *st;
 	bool colocate;
 
@@ -692,13 +621,68 @@ static void schedtune_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_for_each(task, css, tset)
 		sync_cgroup_colocation(task, colocate);
-}
-#else
-static void schedtune_attach(struct cgroup_taskset *tset)
-{
-}
 #endif
 
+	cgroup_taskset_for_each(task, css, tset) {
+		/*
+		 * Lock the CPU's RQ the task is enqueued to avoid race
+		 * conditions with migration code while the task is being
+		 * accounted
+		 */
+		rq = task_rq_lock(task, &rq_flags);
+
+		/*
+		 * Boost group accouting is protected by a per-cpu lock and
+		 * requires interrupt to be disabled to avoid race conditions
+		 * on...
+		 */
+		cpu = cpu_of(rq);
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		raw_spin_lock(&bg->lock);
+
+		dst_idx = task_schedtune(task)->idx;
+		src_idx = task->stune_idx;
+
+		/*
+		 * Current task is not changing boostgroup, which can
+		 * happen when the new hierarchy is in use.
+		 */
+		if (unlikely(dst_idx == src_idx)) {
+			raw_spin_unlock(&bg->lock);
+			task_rq_unlock(rq, task, &rq_flags);
+			continue;
+		}
+
+		task->stune_idx = dst_idx;
+
+		if (!task_on_rq_queued(task)) {
+			raw_spin_unlock(&bg->lock);
+			task_rq_unlock(rq, task, &rq_flags);
+			continue;
+		}
+
+		/*
+		 * This is the case of a RUNNABLE task which is switching its
+		 * current boost group.
+		 */
+
+		/* Move task from src to dst boost group */
+		tasks = bg->group[src_idx].tasks - 1;
+		bg->group[src_idx].tasks = max(0, tasks);
+		bg->group[dst_idx].tasks += 1;
+
+		/* Update boost hold start for this group */
+		now = sched_clock_cpu(cpu);
+		bg->group[dst_idx].ts = now;
+
+		/* Force boost group re-evaluation at next boost check */
+		bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
+
+		raw_spin_unlock(&bg->lock);
+		task_rq_unlock(rq, task, &rq_flags);
+	}
+}
+
 static int
 boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
 	    s64 boost)
@@ -831,8 +815,8 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
 	.css_alloc	= schedtune_css_alloc,
 	.css_free	= schedtune_css_free,
 	.attach		= schedtune_attach,
-	.can_attach     = schedtune_can_attach,
-	.cancel_attach  = schedtune_cancel_attach,
+	.can_attach	= schedtune_can_attach,
+	.cancel_attach	= schedtune_cancel_attach,
 	.legacy_cftypes	= files,
 	.early_init	= 1,
 };