kernel-fxtec-pro1x/init/init_task.c
Abhijeet Dharmapurikar bca62a0ae5 sched/tune: Fix improper accounting of tasks
cgroup_migrate_execute() calls can_attach() and css_set_move_task()
separately without holding rq->lock.

The schedtune implementation breaks here, since can_attach() accounts
for the task move way before the group move is committed. If the task
sleeps right after can_attach(), the sleep is accounted towards the
previous group. This ends up in disparity of counts between group.

Consider this race:

TaskA is moved from root_grp to topapp_grp, root_grp's tasks = 1 and
topapp tasks =0 right before the move and TaskB is moving it.

On cpu X
TaskA runs
* cgroup_migrate_execute()
  schedtune_can_attach()
   root_grp.tasks--; topapp_grp.tasks++;
   (root_grp.tasks =  0 and topapp_grp.tasks = 1)

*right at this moment context is switched and TaskA runs.

*TaskA sleeps
 dequeue_task()
   schedtune_dequeue_task()
    schedtune_task_update
     root_grp.tasks--; //TaskA has not really "switched" group, so it
     decrements from the root_grp, however can_attach() has accounted
     the task move and this leaves us with
     root_grp.tasks = 0 (it is -ve value protected)
     topapp.grp.tasks = 1

Now even if cpuX is idle (TaskA is long gone sleeping), its
topapp_grp.tasks continues to stay +ve and it is subject to topapp's
boost unnecessarily.

An easy way to fix this is to move the group change accounting in
attach() callback which gets called _after_ css_set_move_task(). Also
maintain the task's current idx in struct task_struct as it moves
between groups. The task's enqueue/dequeue is accounted towards the
cached idx value. In an event when the task dequeues just
before group changes, it gets subtracted from the old group, which is
correct because the task would have bumped up the old group's count. If
the task changes group while its running, the attach() callback has to
decrement from the old group and increment from the new group so that
the next dequeue will subtract from the new group. IOW the attach()
callback has to account only for running task but has to update the
cached index for both running and sleeping task.

The current uses task->on_rq != 0 check to determine whether a task
is queued on the runqueue or not. This is an incorrect check. Because
task->on_rq is set to TASK_ON_RQ_MIGRATING (value = 2) during
migration. Fix this by using task_on_rq_queued() to check if a task
is queued or not.

Change-Id: If412da5a239c18d9122cfad2be59b355c14c068f
Signed-off-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
Co-developed-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
2020-10-14 11:29:10 +05:30

203 lines
5.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/init_task.h>
#include <linux/export.h>
#include <linux/mqueue.h>
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/audit.h>
#include <linux/scs.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
static struct signal_struct init_signals = {
.nr_threads = 1,
.thread_head = LIST_HEAD_INIT(init_task.thread_node),
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit),
.shared_pending = {
.list = LIST_HEAD_INIT(init_signals.shared_pending.list),
.signal = {{0}}
},
.multiprocess = HLIST_HEAD_INIT,
.rlim = INIT_RLIMITS,
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
#ifdef CONFIG_POSIX_TIMERS
.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
.cputimer = {
.cputime_atomic = INIT_CPUTIME_ATOMIC,
.running = false,
.checking_timer = false,
},
#endif
INIT_CPU_TIMERS(init_signals)
.pids = {
[PIDTYPE_PID] = &init_struct_pid,
[PIDTYPE_TGID] = &init_struct_pid,
[PIDTYPE_PGID] = &init_struct_pid,
[PIDTYPE_SID] = &init_struct_pid,
},
INIT_PREV_CPUTIME(init_signals)
};
static struct sighand_struct init_sighand = {
.count = ATOMIC_INIT(1),
.action = { { { .sa_handler = SIG_DFL, } }, },
.siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
.signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
};
/*
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
*/
struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
__init_task_data
#endif
= {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = ATOMIC_INIT(1),
#endif
.state = 0,
.stack = init_stack,
.usage = ATOMIC_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_allowed = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
.cpus_requested = CPU_MASK_ALL,
.mm = NULL,
.active_mm = &init_mm,
.restart_block = {
.fn = do_no_restart_syscall,
},
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
.tasks = LIST_HEAD_INIT(init_task.tasks),
#ifdef CONFIG_SMP
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
#endif
#ifdef CONFIG_CGROUP_SCHED
.sched_task_group = &root_task_group,
#endif
#ifdef CONFIG_SCHED_TUNE
.stune_idx = 0,
#endif
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
.parent = &init_task,
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
.fs = &init_fs,
.files = &init_files,
.signal = &init_signals,
.sighand = &init_sighand,
.nsproxy = &init_nsproxy,
.pending = {
.list = LIST_HEAD_INIT(init_task.pending.list),
.signal = {{0}}
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
.thread_group = LIST_HEAD_INIT(init_task.thread_group),
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_AUDITSYSCALL
.loginuid = INVALID_UID,
.sessionid = AUDIT_SID_UNSET,
#endif
#ifdef CONFIG_PERF_EVENTS
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
.perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
#endif
#ifdef CONFIG_PREEMPT_RCU
.rcu_read_lock_nesting = 0,
.rcu_read_unlock_special.s = 0,
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
.rcu_blocked_node = NULL,
#endif
#ifdef CONFIG_TASKS_RCU
.rcu_tasks_holdout = false,
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
.rcu_tasks_idle_cpu = -1,
#endif
#ifdef CONFIG_CPUSETS
.mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq),
#endif
#ifdef CONFIG_RT_MUTEXES
.pi_waiters = RB_ROOT_CACHED,
.pi_top_task = NULL,
#endif
INIT_PREV_CPUTIME(init_task)
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
.vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
.vtime.starttime = 0,
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
.numa_preferred_nid = -1,
.numa_group = NULL,
.numa_faults = NULL,
#endif
#ifdef CONFIG_KASAN
.kasan_depth = 1,
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
.softirqs_enabled = 1,
#endif
#ifdef CONFIG_LOCKDEP
.lockdep_recursion = 0,
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.ret_stack = NULL,
#endif
#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT)
.trace_recursion = 0,
#endif
#ifdef CONFIG_LIVEPATCH
.patch_state = KLP_UNDEFINED,
#endif
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
};
EXPORT_SYMBOL(init_task);
#ifdef CONFIG_SHADOW_CALL_STACK
unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] __init_task_data
__aligned(SCS_SIZE) = {
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
};
#endif
/*
* Initial thread structure. Alignment of this is handled by a special
* linker map entry.
*/
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task);
#endif