kernel-fxtec-pro1x/kernel/sched/stop_task.c
Brendan Jackman 88d05f9257 FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide
This patch adds a parameter to select_task_rq, sibling_count_hint
allowing the caller, where it has this information, to inform the
sched_class the number of tasks that are being woken up as part of
the same event.

The wake_q mechanism is one case where this information is available.

select_task_rq_fair can then use the information to detect that it
needs to widen the search space for task placement in order to avoid
overloading the last-level cache domain's CPUs.

                               * * *

The reason I am investigating this change is the following use case
on ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which
all repeatedly do X amount of work then
pthread_barrier_wait (i.e. sleep until the last task finishes its X
and hits the barrier). On big.LITTLE, the tasks which get a "big" CPU
finish faster, and then those CPUs pull over the tasks that are still
running:

     v CPU v           ->time->

                    -------------
   0  (big)         11111  /333
                    -------------
   1  (big)         22222   /444|
                    -------------
   2  (LITTLE)      333333/
                    -------------
   3  (LITTLE)      444444/
                    -------------

Now when task 4 hits the barrier (at |) and wakes the others up,
there are 4 tasks with prev_cpu=<big> and 0 tasks with
prev_cpu=<little>. want_affine therefore means that we'll only look
in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled
on the bigs until the next load balance, something like this:

     v CPU v           ->time->

                    ------------------------
   0  (big)         11111  /333  31313\33333
                    ------------------------
   1  (big)         22222   /444|424\4444444
                    ------------------------
   2  (LITTLE)      333333/          \222222
                    ------------------------
   3  (LITTLE)      444444/            \1111
                    ------------------------
                                 ^^^
                           underutilization

So, I'm trying to get want_affine = 0 for these tasks.

I don't _think_ any incarnation of the wakee_flips mechanism can help
us here because which task is waker and which tasks are wakees
generally changes with each iteration.

However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the
nice property that we know exactly how many tasks are being woken, so
we can cheat.

It might be a disadvantage that we "widen" _every_ task that's woken in
an event, while select_idle_sibling would work fine for the first
sd_llc_size - 1 tasks.

IIUC, if wake_affine() behaves correctly this trick wouldn't be
necessary on SMP systems, so it might be best guarded by the presence
of SD_ASYM_CPUCAPACITY?

                               * * *

Final note..

In order to observe "perfect" behaviour for this use case, I also had
to disable the TTWU_QUEUE sched feature. Suppose during the wakeup
above we are working through the work queue and have placed tasks 3
and 2, and are about to place task 1:

     v CPU v           ->time->

                    --------------
   0  (big)         11111  /333  3
                    --------------
   1  (big)         22222   /444|4
                    --------------
   2  (LITTLE)      333333/      2
                    --------------
   3  (LITTLE)      444444/          <- Task 1 should go here
                    --------------

If TTWU_QUEUE is enabled, we will not yet have enqueued task
2 (having instead sent a reschedule IPI) or attached its load to CPU
2. So we are likely to also place task 1 on cpu 2. Disabling
TTWU_QUEUE means that we enqueue task 2 before placing task 1,
solving this issue. TTWU_QUEUE is there to minimise rq lock
contention, and I guess that this contention is less of an issue on
big.LITTLE systems since they have relatively few CPUs, which
suggests the trade-off makes sense here.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
( - Applied from https://patchwork.kernel.org/patch/9895261/
  - Fixed trivial conflict in kernel/sched/core.c
  - Fixed select_task_rq_idle, now in kernel/sched/idle.c
  - Fixed trivial conflict in select_task_rq_fair )
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Change-Id: I3cfc4bf48c3d7feef969db4d22449f4fbb4f795d
2018-10-26 12:15:52 +01:00

146 lines
3.3 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* stop-task scheduling class.
*
* The stop task is the highest priority task in the system, it preempts
* everything and will be preempted by nothing.
*
* See kernel/stop_machine.c
*/
#include "sched.h"
#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
int sibling_count_hint)
{
return task_cpu(p); /* stop tasks as never migrate */
}
#endif /* CONFIG_SMP */
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
/* we're never preempted */
}
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *stop = rq->stop;
if (!stop || !task_on_rq_queued(stop))
return NULL;
put_prev_task(rq, prev);
stop->se.exec_start = rq_clock_task(rq);
return stop;
}
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)
{
BUG(); /* the stop task should never yield, its pointless. */
}
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
cgroup_account_cputime(curr, delta_exec);
}
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
stop->se.exec_start = rq_clock_task(rq);
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
static void
prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG(); /* how!?, what priority? */
}
static unsigned int
get_rr_interval_stop(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_stop(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
.next = &dl_sched_class,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
.check_preempt_curr = check_preempt_curr_stop,
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
};