bca62a0ae5
cgroup_migrate_execute() calls can_attach() and css_set_move_task() separately without holding rq->lock. The schedtune implementation breaks here, since can_attach() accounts for the task move way before the group move is committed. If the task sleeps right after can_attach(), the sleep is accounted towards the previous group. This ends up in disparity of counts between group. Consider this race: TaskA is moved from root_grp to topapp_grp, root_grp's tasks = 1 and topapp tasks =0 right before the move and TaskB is moving it. On cpu X TaskA runs * cgroup_migrate_execute() schedtune_can_attach() root_grp.tasks--; topapp_grp.tasks++; (root_grp.tasks = 0 and topapp_grp.tasks = 1) *right at this moment context is switched and TaskA runs. *TaskA sleeps dequeue_task() schedtune_dequeue_task() schedtune_task_update root_grp.tasks--; //TaskA has not really "switched" group, so it decrements from the root_grp, however can_attach() has accounted the task move and this leaves us with root_grp.tasks = 0 (it is -ve value protected) topapp.grp.tasks = 1 Now even if cpuX is idle (TaskA is long gone sleeping), its topapp_grp.tasks continues to stay +ve and it is subject to topapp's boost unnecessarily. An easy way to fix this is to move the group change accounting in attach() callback which gets called _after_ css_set_move_task(). Also maintain the task's current idx in struct task_struct as it moves between groups. The task's enqueue/dequeue is accounted towards the cached idx value. In an event when the task dequeues just before group changes, it gets subtracted from the old group, which is correct because the task would have bumped up the old group's count. If the task changes group while its running, the attach() callback has to decrement from the old group and increment from the new group so that the next dequeue will subtract from the new group. IOW the attach() callback has to account only for running task but has to update the cached index for both running and sleeping task. The current uses task->on_rq != 0 check to determine whether a task is queued on the runqueue or not. This is an incorrect check. Because task->on_rq is set to TASK_ON_RQ_MIGRATING (value = 2) during migration. Fix this by using task_on_rq_queued() to check if a task is queued or not. Change-Id: If412da5a239c18d9122cfad2be59b355c14c068f Signed-off-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org> Co-developed-by: Pavankumar Kondeti <pkondeti@codeaurora.org> Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
203 lines
5.5 KiB
C
203 lines
5.5 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/init_task.h>
|
|
#include <linux/export.h>
|
|
#include <linux/mqueue.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/sysctl.h>
|
|
#include <linux/sched/rt.h>
|
|
#include <linux/sched/task.h>
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/audit.h>
|
|
#include <linux/scs.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
static struct signal_struct init_signals = {
|
|
.nr_threads = 1,
|
|
.thread_head = LIST_HEAD_INIT(init_task.thread_node),
|
|
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit),
|
|
.shared_pending = {
|
|
.list = LIST_HEAD_INIT(init_signals.shared_pending.list),
|
|
.signal = {{0}}
|
|
},
|
|
.multiprocess = HLIST_HEAD_INIT,
|
|
.rlim = INIT_RLIMITS,
|
|
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
|
|
#ifdef CONFIG_POSIX_TIMERS
|
|
.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
|
|
.cputimer = {
|
|
.cputime_atomic = INIT_CPUTIME_ATOMIC,
|
|
.running = false,
|
|
.checking_timer = false,
|
|
},
|
|
#endif
|
|
INIT_CPU_TIMERS(init_signals)
|
|
.pids = {
|
|
[PIDTYPE_PID] = &init_struct_pid,
|
|
[PIDTYPE_TGID] = &init_struct_pid,
|
|
[PIDTYPE_PGID] = &init_struct_pid,
|
|
[PIDTYPE_SID] = &init_struct_pid,
|
|
},
|
|
INIT_PREV_CPUTIME(init_signals)
|
|
};
|
|
|
|
static struct sighand_struct init_sighand = {
|
|
.count = ATOMIC_INIT(1),
|
|
.action = { { { .sa_handler = SIG_DFL, } }, },
|
|
.siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
|
|
.signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
|
|
};
|
|
|
|
/*
|
|
* Set up the first task table, touch at your own risk!. Base=0,
|
|
* limit=0x1fffff (=2MB)
|
|
*/
|
|
struct task_struct init_task
|
|
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
|
|
__init_task_data
|
|
#endif
|
|
= {
|
|
#ifdef CONFIG_THREAD_INFO_IN_TASK
|
|
.thread_info = INIT_THREAD_INFO(init_task),
|
|
.stack_refcount = ATOMIC_INIT(1),
|
|
#endif
|
|
.state = 0,
|
|
.stack = init_stack,
|
|
.usage = ATOMIC_INIT(2),
|
|
.flags = PF_KTHREAD,
|
|
.prio = MAX_PRIO - 20,
|
|
.static_prio = MAX_PRIO - 20,
|
|
.normal_prio = MAX_PRIO - 20,
|
|
.policy = SCHED_NORMAL,
|
|
.cpus_allowed = CPU_MASK_ALL,
|
|
.nr_cpus_allowed= NR_CPUS,
|
|
.cpus_requested = CPU_MASK_ALL,
|
|
.mm = NULL,
|
|
.active_mm = &init_mm,
|
|
.restart_block = {
|
|
.fn = do_no_restart_syscall,
|
|
},
|
|
.se = {
|
|
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
|
},
|
|
.rt = {
|
|
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
|
|
.time_slice = RR_TIMESLICE,
|
|
},
|
|
.tasks = LIST_HEAD_INIT(init_task.tasks),
|
|
#ifdef CONFIG_SMP
|
|
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
|
|
#endif
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
.sched_task_group = &root_task_group,
|
|
#endif
|
|
#ifdef CONFIG_SCHED_TUNE
|
|
.stune_idx = 0,
|
|
#endif
|
|
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
|
|
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
|
|
.real_parent = &init_task,
|
|
.parent = &init_task,
|
|
.children = LIST_HEAD_INIT(init_task.children),
|
|
.sibling = LIST_HEAD_INIT(init_task.sibling),
|
|
.group_leader = &init_task,
|
|
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
|
|
RCU_POINTER_INITIALIZER(cred, &init_cred),
|
|
.comm = INIT_TASK_COMM,
|
|
.thread = INIT_THREAD,
|
|
.fs = &init_fs,
|
|
.files = &init_files,
|
|
.signal = &init_signals,
|
|
.sighand = &init_sighand,
|
|
.nsproxy = &init_nsproxy,
|
|
.pending = {
|
|
.list = LIST_HEAD_INIT(init_task.pending.list),
|
|
.signal = {{0}}
|
|
},
|
|
.blocked = {{0}},
|
|
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
|
|
.journal_info = NULL,
|
|
INIT_CPU_TIMERS(init_task)
|
|
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
|
|
.timer_slack_ns = 50000, /* 50 usec default slack */
|
|
.thread_pid = &init_struct_pid,
|
|
.thread_group = LIST_HEAD_INIT(init_task.thread_group),
|
|
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
|
|
#ifdef CONFIG_AUDITSYSCALL
|
|
.loginuid = INVALID_UID,
|
|
.sessionid = AUDIT_SID_UNSET,
|
|
#endif
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
|
|
.perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
|
|
#endif
|
|
#ifdef CONFIG_PREEMPT_RCU
|
|
.rcu_read_lock_nesting = 0,
|
|
.rcu_read_unlock_special.s = 0,
|
|
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
|
|
.rcu_blocked_node = NULL,
|
|
#endif
|
|
#ifdef CONFIG_TASKS_RCU
|
|
.rcu_tasks_holdout = false,
|
|
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
|
|
.rcu_tasks_idle_cpu = -1,
|
|
#endif
|
|
#ifdef CONFIG_CPUSETS
|
|
.mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq),
|
|
#endif
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
.pi_waiters = RB_ROOT_CACHED,
|
|
.pi_top_task = NULL,
|
|
#endif
|
|
INIT_PREV_CPUTIME(init_task)
|
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
|
.vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
|
|
.vtime.starttime = 0,
|
|
.vtime.state = VTIME_SYS,
|
|
#endif
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
.numa_preferred_nid = -1,
|
|
.numa_group = NULL,
|
|
.numa_faults = NULL,
|
|
#endif
|
|
#ifdef CONFIG_KASAN
|
|
.kasan_depth = 1,
|
|
#endif
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
.softirqs_enabled = 1,
|
|
#endif
|
|
#ifdef CONFIG_LOCKDEP
|
|
.lockdep_recursion = 0,
|
|
#endif
|
|
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
|
.ret_stack = NULL,
|
|
#endif
|
|
#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT)
|
|
.trace_recursion = 0,
|
|
#endif
|
|
#ifdef CONFIG_LIVEPATCH
|
|
.patch_state = KLP_UNDEFINED,
|
|
#endif
|
|
#ifdef CONFIG_SECURITY
|
|
.security = NULL,
|
|
#endif
|
|
};
|
|
EXPORT_SYMBOL(init_task);
|
|
|
|
#ifdef CONFIG_SHADOW_CALL_STACK
|
|
unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] __init_task_data
|
|
__aligned(SCS_SIZE) = {
|
|
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
|
|
};
|
|
#endif
|
|
|
|
/*
|
|
* Initial thread structure. Alignment of this is handled by a special
|
|
* linker map entry.
|
|
*/
|
|
#ifndef CONFIG_THREAD_INFO_IN_TASK
|
|
struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task);
|
|
#endif
|