Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Thomas Gleixner:

 - Cleanup and improvement of NUMA balancing

 - Refactoring and improvements to the PELT (Per Entity Load Tracking)
   code

 - Watchdog simplification and related cleanups

 - The usual pile of small incremental fixes and improvements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  watchdog: Reduce message verbosity
  stop_machine: Reflow cpu_stop_queue_two_works()
  sched/numa: Move task_numa_placement() closer to numa_migrate_preferred()
  sched/numa: Use group_weights to identify if migration degrades locality
  sched/numa: Update the scan period without holding the numa_group lock
  sched/numa: Remove numa_has_capacity()
  sched/numa: Modify migrate_swap() to accept additional parameters
  sched/numa: Remove unused task_capacity from 'struct numa_stats'
  sched/numa: Skip nodes that are at 'hoplimit'
  sched/debug: Reverse the order of printing faults
  sched/numa: Use task faults only if numa_group is not yet set up
  sched/numa: Set preferred_node based on best_cpu
  sched/numa: Simplify load_too_imbalanced()
  sched/numa: Evaluate move once per node
  sched/numa: Remove redundant field
  sched/debug: Show the sum wait time of a task group
  sched/fair: Remove #ifdefs from scale_rt_capacity()
  sched/core: Remove get_cpu() from sched_fork()
  sched/cpufreq: Clarify sugov_get_util()
  sched/sysctl: Remove unused sched_time_avg_ms sysctl
  ...
This commit is contained in:
Linus Torvalds 2018-08-13 11:25:07 -07:00
commit f7951c33f0
38 changed files with 1015 additions and 876 deletions

View file

@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
dvcpu->arch.wait = 0;
if (swq_has_sleeper(&dvcpu->wq))
swake_up(&dvcpu->wq);
swake_up_one(&dvcpu->wq);
return 0;
}
@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
vcpu->arch.wait = 0;
if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
swake_up_one(&vcpu->wq);
}
/* low level hrtimer wake routine */

View file

@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
wqp = kvm_arch_vcpu_wq(vcpu);
if (swq_has_sleeper(wqp)) {
swake_up(wqp);
swake_up_one(wqp);
++vcpu->stat.halt_wakeup;
}
@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
}
}
prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
if (kvmppc_vcore_check_block(vc)) {
finish_swait(&vc->wq, &wait);
@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
} else if (vc->vcore_state == VCORE_SLEEPING) {
swake_up(&vc->wq);
swake_up_one(&vc->wq);
}
}

View file

@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
* yield-candidate.
*/
vcpu->preempted = true;
swake_up(&vcpu->wq);
swake_up_one(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
/*

View file

@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
for (;;) {
if (!n.halted)
prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
if (n->halted)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
swake_up(&n->wq);
swake_up_one(&n->wq);
}
static void apf_task_wake_all(void)

View file

@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
* using swait_active() is safe.
*/
if (swait_active(q))
swake_up(q);
swake_up_one(q);
if (apic_lvtt_tscdeadline(apic))
ktimer->expired_tscdeadline = ktimer->tscdeadline;

View file

@ -164,6 +164,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_ONLINE_DYN,

View file

@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
#else
extern int lockup_detector_online_cpu(unsigned int cpu);
extern int lockup_detector_offline_cpu(unsigned int cpu);
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
static inline void touch_softlockup_watchdog_sched(void) { }
static inline void touch_softlockup_watchdog(void) { }
static inline void touch_softlockup_watchdog_sync(void) { }
static inline void touch_all_softlockup_watchdogs(void) { }
#endif
#define lockup_detector_online_cpu NULL
#define lockup_detector_offline_cpu NULL
#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
#ifdef CONFIG_DETECT_HUNG_TASK
void reset_hung_task_detector(void);

View file

@ -1017,7 +1017,6 @@ struct task_struct {
u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct list_head numa_entry;
struct numa_group *numa_group;
/*

View file

@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
extern __read_mostly unsigned int sysctl_sched_time_avg;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,

View file

@ -25,8 +25,6 @@ struct smpboot_thread_data;
* parked (cpu offline)
* @unpark: Optional unpark function, called when the thread is
* unparked (cpu online)
* @cpumask: Internal state. To update which threads are unparked,
* call smpboot_update_cpumask_percpu_thread().
* @selfparking: Thread is not parked by the park function.
* @thread_comm: The base name of the thread
*/
@ -40,23 +38,12 @@ struct smp_hotplug_thread {
void (*cleanup)(unsigned int cpu, bool online);
void (*park)(unsigned int cpu);
void (*unpark)(unsigned int cpu);
cpumask_var_t cpumask;
bool selfparking;
const char *thread_comm;
};
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
const struct cpumask *cpumask);
static inline int
smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
return smpboot_register_percpu_thread_cpumask(plug_thread,
cpu_possible_mask);
}
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *);
#endif

View file

@ -16,7 +16,7 @@
* wait-queues, but the semantics are actually completely different, and
* every single user we have ever had has been buggy (or pointless).
*
* A "swake_up()" only wakes up _one_ waiter, which is not at all what
* A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
* "wake_up()" does, and has led to problems. In other cases, it has
* been fine, because there's only ever one waiter (kvm), but in that
* case gthe whole "simple" wait-queue is just pointless to begin with,
@ -38,8 +38,8 @@
* all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
* sleeper state.
*
* - the exclusive mode; because this requires preserving the list order
* and this is hard.
* - the !exclusive mode; because that leads to O(n) wakeups, everything is
* exclusive.
*
* - custom wake callback functions; because you cannot give any guarantees
* about random code. This also allows swait to be used in RT, such that
@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
* CPU0 - waker CPU1 - waiter
*
* for (;;) {
* @cond = true; prepare_to_swait(&wq_head, &wait, state);
* @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state);
* smp_mb(); // smp_mb() from set_current_state()
* if (swait_active(wq_head)) if (@cond)
* wake_up(wq_head); break;
@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
return swait_active(wq);
}
extern void swake_up(struct swait_queue_head *q);
extern void swake_up_one(struct swait_queue_head *q);
extern void swake_up_all(struct swait_queue_head *q);
extern void swake_up_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
/* as per ___wait_event() but for swait, therefore "exclusive == 1" */
#define ___swait_event(wq, condition, state, ret, cmd) \
({ \
__label__ __out; \
struct swait_queue __wait; \
long __ret = ret; \
\
@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
break; \
goto __out; \
} \
\
cmd; \
} \
finish_swait(&wq, &__wait); \
__ret; \
__out: __ret; \
})
#define __swait_event(wq, condition) \
(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
schedule())
#define swait_event(wq, condition) \
#define swait_event_exclusive(wq, condition) \
do { \
if (condition) \
break; \
@ -208,7 +208,7 @@ do { \
TASK_UNINTERRUPTIBLE, timeout, \
__ret = schedule_timeout(__ret))
#define swait_event_timeout(wq, condition, timeout) \
#define swait_event_timeout_exclusive(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
@ -220,7 +220,7 @@ do { \
___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
schedule())
#define swait_event_interruptible(wq, condition) \
#define swait_event_interruptible_exclusive(wq, condition) \
({ \
int __ret = 0; \
if (!(condition)) \
@ -233,7 +233,7 @@ do { \
TASK_INTERRUPTIBLE, timeout, \
__ret = schedule_timeout(__ret))
#define swait_event_interruptible_timeout(wq, condition, timeout) \
#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
@ -246,7 +246,7 @@ do { \
(void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
/**
* swait_event_idle - wait without system load contribution
* swait_event_idle_exclusive - wait without system load contribution
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
@ -257,7 +257,7 @@ do { \
* condition and doesn't want to contribute to system load. Signals are
* ignored.
*/
#define swait_event_idle(wq, condition) \
#define swait_event_idle_exclusive(wq, condition) \
do { \
if (condition) \
break; \
@ -270,7 +270,7 @@ do { \
__ret = schedule_timeout(__ret))
/**
* swait_event_idle_timeout - wait up to timeout without load contribution
* swait_event_idle_timeout_exclusive - wait up to timeout without load contribution
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout at which we'll give up in jiffies
@ -288,7 +288,7 @@ do { \
* or the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed.
*/
#define swait_event_idle_timeout(wq, condition, timeout) \
#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \

View file

@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
[CPUHP_AP_WATCHDOG_ONLINE] = {
.name = "lockup_detector:online",
.startup.single = lockup_detector_online_cpu,
.teardown.single = lockup_detector_offline_cpu,
},
[CPUHP_AP_WORKQUEUE_ONLINE] = {
.name = "workqueue:online",
.startup.single = workqueue_online_cpu,

View file

@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self)
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
complete_all(&self->parked);
complete(&self->parked);
schedule();
}
__set_current_state(TASK_RUNNING);
@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k)
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
return -EBUSY;
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);

View file

@ -92,7 +92,7 @@ static void s2idle_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
swait_event(s2idle_wait_head,
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
@ -160,7 +160,7 @@ void s2idle_wake(void)
raw_spin_lock_irqsave(&s2idle_lock, flags);
if (s2idle_state > S2IDLE_STATE_NONE) {
s2idle_state = S2IDLE_STATE_WAKE;
swake_up(&s2idle_wait_head);
swake_up_one(&s2idle_wait_head);
}
raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}

View file

@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
if (!newval && READ_ONCE(sp->srcu_gp_waiting))
swake_up(&sp->srcu_wq);
swake_up_one(&sp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
idx = sp->srcu_idx;
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
/* Invoke the callbacks we removed above. */

View file

@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
swake_up(&rsp->gp_wq);
swake_up_one(&rsp->gp_wq);
}
/*
@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
* Helper function for swait_event_idle() wakeup at force-quiescent-state
* Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
* time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = swait_event_idle_timeout(rsp->gp_wq,
ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */

View file

@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
swake_up(&rsp->expedited_wq);
swake_up_one(&rsp->expedited_wq);
}
break;
}
@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
ret = swait_event_timeout(
ret = swait_event_timeout_exclusive(
rsp->expedited_wq,
sync_rcu_preempt_exp_done_unlocked(rnp_root),
jiffies_stall);

View file

@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
del_timer(&rdp->nocb_timer);
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
swake_up_one(&rdp_leader->nocb_wq);
} else {
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
swait_event_interruptible(
swait_event_interruptible_exclusive(
rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
(d = rcu_seq_done(&rnp->gp_seq, c)));
if (likely(d))
@ -2188,7 +2188,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
swait_event_interruptible_exclusive(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = true;
@ -2253,7 +2253,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
swake_up_one(&rdp->nocb_wq);
}
}
@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
{
for (;;) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
swait_event_interruptible(rdp->nocb_wq,
swait_event_interruptible_exclusive(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */

View file

@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o

View file

@ -17,6 +17,8 @@
#include "../workqueue_internal.h"
#include "../smpboot.h"
#include "pelt.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@ -44,14 +46,6 @@ const_debug unsigned int sysctl_sched_features =
*/
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* period over which we average the RT time consumption, measured
* in ms.
*
* default: 1s
*/
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#ifdef HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
update_irq_load_avg(rq, irq_delta + steal);
#endif
}
@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
* See __iter_div_u64_rem() for another example of this.
*/
asm("" : "+rm" (rq->age_stamp));
rq->age_stamp += period;
rq->rt_avg /= 2;
}
}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
@ -1280,16 +1258,17 @@ static int migrate_swap_stop(void *data)
/*
* Cross migrate two tasks
*/
int migrate_swap(struct task_struct *cur, struct task_struct *p)
int migrate_swap(struct task_struct *cur, struct task_struct *p,
int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = task_cpu(cur),
.src_cpu = curr_cpu,
.dst_task = p,
.dst_cpu = task_cpu(p),
.dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
out:
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {}
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
if (dl_prio(p->prio)) {
put_cpu();
if (dl_prio(p->prio))
return -EAGAIN;
} else if (rt_prio(p->prio)) {
else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
} else {
else
p->sched_class = &fair_sched_class;
}
init_entity_runnable_average(&p->se);
@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
__set_task_cpu(p, smp_processor_id());
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
put_cpu();
return 0;
}
@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq)
}
}
static void set_cpu_rq_start_time(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
rq->age_stamp = sched_clock_cpu(cpu);
}
/*
* used to mark begin/end of suspend/resume:
*/
@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@ -6106,7 +6073,6 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();
@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
if (schedstat_enabled() && tg != &root_task_group) {
u64 ws = 0;
int i;
for_each_possible_cpu(i)
ws += schedstat_val(tg->se[i]->statistics.wait_sum);
seq_printf(sf, "wait_sum %llu\n", ws);
}
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */

View file

@ -53,9 +53,7 @@ struct sugov_cpu {
unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy: */
unsigned long util_cfs;
unsigned long util_dl;
unsigned long bw_dl;
unsigned long max;
/* The field below is for single-CPU policies only: */
@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
static void sugov_get_util(struct sugov_cpu *sg_cpu)
/*
* This function computes an effective utilization for the given CPU, to be
* used for frequency selection given the linear relation: f = u * f_max.
*
* The scheduler tracks the following metrics:
*
* cpu_util_{cfs,rt,dl,irq}()
* cpu_bw_dl()
*
* Where the cfs,rt and dl util numbers are tracked with the same metric and
* synchronized windows and are thus directly comparable.
*
* The cfs,rt,dl utilization are the running times measured with rq->clock_task
* which excludes things like IRQ and steal-time. These latter are then accrued
* in the irq utilization.
*
* The DL bandwidth number otoh is not a measured metric but a value computed
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util, irq, max;
sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
sg_cpu->util_cfs = cpu_util_cfs(rq);
sg_cpu->util_dl = cpu_util_dl(rq);
}
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
if (rt_rq_is_runnable(&rq->rt))
return sg_cpu->max;
return max;
/*
* Utilization required by DEADLINE must always be granted while, for
* FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
* gracefully reduce the frequency when no tasks show up for longer
* Early check to see if IRQ/steal time saturates the CPU, can be
* because of inaccuracies in how we track these -- see
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return max;
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*/
util = cpu_util_cfs(rq);
util += cpu_util_rt(rq);
/*
* We do not make cpu_util_dl() a permanent part of this sum because we
* want to use cpu_bw_dl() later on, but we need to check if the
* CFS+RT+DL sum is saturated (ie. no idle time) such that we select
* f_max when there is no idle time.
*
* NOTE: numerical errors or stop class might cause us to not quite hit
* saturation when we should -- something for later.
*/
if ((util + cpu_util_dl(rq)) >= max)
return max;
/*
* There is still idle time; further improve the number by using the
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
* 1 - irq
* U' = irq + ------- * U
* max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
/*
* Bandwidth required by DEADLINE must always be granted while, for
* FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
* to gracefully reduce the frequency when no tasks show up for longer
* periods of time.
*
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
* Ideally we would like to set bw_dl as min/guaranteed freq and util +
* bw_dl as requested freq. However, cpufreq is not yet ready for such
* an interface. So, we only do the latter for now.
*/
return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
return min(max, util + sg_cpu->bw_dl);
}
/**
@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
sg_policy->need_freq_update = true;
}
@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu);
sugov_get_util(sg_cpu);
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
sugov_iowait_apply(sg_cpu, time, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
if (j_util * max > j_max * util) {

View file

@ -16,6 +16,7 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
#include "sched.h"
#include "pelt.h"
struct dl_bandwidth def_dl_bandwidth;
@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq)
curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (dl_entity_is_special(dl_se))
return;
@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
return p;
}
@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will

View file

@ -111,8 +111,10 @@ static int sched_feat_set(char *cmp)
cmp += 3;
}
for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
if (i < 0)
return i;
if (neg) {
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
@ -120,11 +122,8 @@ static int sched_feat_set(char *cmp)
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
break;
}
}
return i;
return 0;
}
static ssize_t
@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
{
char buf[64];
char *cmp;
int i;
int ret;
struct inode *inode;
if (cnt > 63)
@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
inode_lock(inode);
i = sched_feat_set(cmp);
ret = sched_feat_set(cmp);
inode_unlock(inode);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
if (ret < 0)
return ret;
*ppos += cnt;
@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf)
{
SEQ_printf(m, "numa_faults node=%d ", node);
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf);
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf);
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
}
#endif

View file

@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return cfs_rq->rq;
}
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return container_of(cfs_rq, struct rq, cfs);
}
#define entity_is_task(se) 1
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
#include "pelt.h"
#include "sched-pelt.h"
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
* To solve this problem, we also cap the util_avg of successive tasks to
* only 1/2 of the left utilization budget:
*
* util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
* util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
*
* where n denotes the nth task.
* where n denotes the nth task and cpu_scale the CPU capacity.
*
* For example, a simplest series from the beginning would be like:
* For example, for a CPU with 1024 of capacity, a simplest series from
* the beginning would be like:
*
* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* of each group. Skip other nodes.
*/
if (sched_numa_topology_type == NUMA_BACKPLANE &&
dist > maxdist)
dist >= maxdist)
continue;
/* Add up the faults from nearby nodes. */
@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long nr_running;
unsigned long load;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
/* Approximate capacity in terms of runnable tasks on a node */
unsigned long task_capacity;
int has_free_capacity;
unsigned int nr_running;
};
/*
@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
* We'll either bail at !has_free_capacity, or we'll detect a huge
* imbalance and bail there.
* We'll detect a huge imbalance and bail there.
*/
if (!cpus)
return;
@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
capacity = cpus / smt; /* cores */
ns->task_capacity = min_t(unsigned, capacity,
capacity = min_t(unsigned, capacity,
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
if (dst_load < src_load)
swap(dst_load, src_load);
imb = abs(dst_load * src_capacity - src_load * dst_capacity);
/* Is the difference below the threshold? */
imb = dst_load * src_capacity * 100 -
src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
* Compare it with the old imbalance.
*/
orig_src_load = env->src_stats.load;
orig_dst_load = env->dst_stats.load;
if (orig_dst_load < orig_src_load)
swap(orig_dst_load, orig_src_load);
old_imb = orig_dst_load * src_capacity * 100 -
orig_src_load * dst_capacity * env->imbalance_pct;
old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
/* Would this change make things worse? */
return (imb > old_imb);
@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
* be exchanged with the source task
*/
static void task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp)
long taskimp, long groupimp, bool maymove)
{
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
long src_load, dst_load;
@ -1605,15 +1581,21 @@ static void task_numa_compare(struct task_numa_env *env,
if (cur == env->p)
goto unlock;
if (!cur) {
if (maymove || imp > env->best_imp)
goto assign;
else
goto unlock;
}
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
* the value is, the more rmeote accesses that would be expected to
* the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
*/
if (cur) {
/* Skip this swap candidate if cannot move to the source CPU: */
/* Skip this swap candidate if cannot move to the source cpu */
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
goto unlock;
@ -1629,73 +1611,43 @@ static void task_numa_compare(struct task_numa_env *env,
* tasks within a group over tiny differences.
*/
if (cur->numa_group)
imp -= imp/16;
imp -= imp / 16;
} else {
/*
* Compare the group weights. If a task is all by
* itself (not part of a group), use the task weight
* instead.
* Compare the group weights. If a task is all by itself
* (not part of a group), use the task weight instead.
*/
if (cur->numa_group)
if (cur->numa_group && env->p->numa_group)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist);
else
imp += task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
}
}
if (imp <= env->best_imp && moveimp <= env->best_imp)
goto unlock;
if (!cur) {
/* Is there capacity at our destination? */
if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
goto balance;
}
/* Balance doesn't matter much if we're running a task per CPU: */
if (imp > env->best_imp && src_rq->nr_running == 1 &&
dst_rq->nr_running == 1)
goto assign;
/*
* In the overloaded case, try and keep the load balanced.
*/
balance:
load = task_h_load(env->p);
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
if (moveimp > imp && moveimp > env->best_imp) {
/*
* If the improvement from just moving env->p direction is
* better than swapping tasks around, check if a move is
* possible. Store a slightly smaller score than moveimp,
* so an actually idle CPU will win.
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
cur = NULL;
goto assign;
}
}
if (imp <= env->best_imp)
goto unlock;
if (cur) {
load = task_h_load(cur);
dst_load -= load;
src_load += load;
if (maymove && moveimp > imp && moveimp > env->best_imp) {
imp = moveimp - 1;
cur = NULL;
goto assign;
}
/*
* In the overloaded case, try and keep the load balanced.
*/
load = task_h_load(env->p) - task_h_load(cur);
if (!load)
goto assign;
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
assign:
/*
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
@ -1711,7 +1663,6 @@ static void task_numa_compare(struct task_numa_env *env,
local_irq_enable();
}
assign:
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
@ -1720,43 +1671,30 @@ static void task_numa_compare(struct task_numa_env *env,
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
long src_load, dst_load, load;
bool maymove = false;
int cpu;
load = task_h_load(env->p);
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
/*
* If the improvement from just moving env->p direction is better
* than swapping tasks around, check if a move is possible.
*/
maymove = !load_too_imbalanced(src_load, dst_load, env);
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
continue;
env->dst_cpu = cpu;
task_numa_compare(env, taskimp, groupimp);
task_numa_compare(env, taskimp, groupimp, maymove);
}
}
/* Only move tasks to a NUMA node less busy than the current node. */
static bool numa_has_capacity(struct task_numa_env *env)
{
struct numa_stats *src = &env->src_stats;
struct numa_stats *dst = &env->dst_stats;
if (src->has_free_capacity && !dst->has_free_capacity)
return false;
/*
* Only consider a task move if the source has a higher load
* than the destination, corrected for CPU capacity on each node.
*
* src->load dst->load
* --------------------- vs ---------------------
* src->compute_capacity dst->compute_capacity
*/
if (src->load * dst->compute_capacity * env->imbalance_pct >
dst->load * src->compute_capacity * 100)
return true;
return false;
}
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
* elsewhere, so there is no point in (re)trying.
*/
if (unlikely(!sd)) {
p->numa_preferred_nid = task_node(p);
sched_setnuma(p, task_node(p));
return -EINVAL;
}
@ -1811,7 +1749,6 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
if (numa_has_capacity(&env))
task_numa_find_cpu(&env, taskimp, groupimp);
/*
@ -1842,7 +1779,6 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
if (numa_has_capacity(&env))
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
struct numa_group *ng = p->numa_group;
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = env.dst_nid;
nid = cpu_to_node(env.best_cpu);
if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid);
if (nid != p->numa_preferred_nid)
sched_setnuma(p, nid);
}
/* No better CPU than the current one was found. */
@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
return ret;
}
ret = migrate_swap(p, env.best_task);
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
put_task_struct(env.best_task);
@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults = 0;
int seq, nid, max_nid = -1;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
u64 runtime, period;
@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
}
}
if (!p->numa_group) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
}
if (group_faults > max_group_faults) {
max_group_faults = group_faults;
max_group_nid = nid;
} else if (group_faults > max_faults) {
max_faults = group_faults;
max_nid = nid;
}
}
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
max_nid = preferred_group_nid(p, max_nid);
}
if (max_faults) {
/* Set the new preferred node */
if (max_nid != p->numa_preferred_nid)
sched_setnuma(p, max_nid);
if (task_node(p) != p->numa_preferred_nid)
numa_migrate_preferred(p);
}
update_task_scan_period(p, fault_types[0], fault_types[1]);
}
static inline int get_numa_group(struct numa_group *grp)
@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
numa_is_active_node(mem_node, ng))
local = 1;
task_numa_placement(p);
/*
* Retry task to preferred node migration periodically, in case it
* case it previously failed, or the scheduler moved us.
*/
if (time_after(jiffies, p->numa_migrate_retry))
if (time_after(jiffies, p->numa_migrate_retry)) {
task_numa_placement(p);
numa_migrate_preferred(p);
}
if (migrated)
p->numa_pages_migrated += pages;
@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
static inline long se_weight(struct sched_entity *se)
{
return scale_load_down(se->load.weight);
}
static inline long se_runnable(struct sched_entity *se)
{
return scale_load_down(se->runnable_weight);
}
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
#ifdef CONFIG_SMP
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
local_n = n;
/*
* As y^PERIOD = 1/2, we can combine
* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
* With a look-up table which covers y^n (n<PERIOD)
*
* To achieve constant time decay_load.
*/
if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
val >>= local_n / LOAD_AVG_PERIOD;
local_n %= LOAD_AVG_PERIOD;
}
val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
return val;
}
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
u32 c1, c2, c3 = d3; /* y^0 == 1 */
/*
* c1 = d1 y^p
*/
c1 = decay_load((u64)d1, periods);
/*
* p-1
* c2 = 1024 \Sum y^n
* n=1
*
* inf inf
* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
* n=0 n=p
*/
c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
return c1 + c2 + c3;
}
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
* the remainder of the (incomplete) current period.
*
* d1 d2 d3
* ^ ^ ^
* | | |
* |<->|<----------------->|<--->|
* ... |---x---|------| ... |------|-----x (now)
*
* p-1
* u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
* n=1
*
* = u y^p + (Step 1)
*
* p-1
* d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
* n=1
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
scale_freq = arch_scale_freq_capacity(cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
/*
* Step 1: decay old *_sum if we crossed period boundaries.
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
sa->runnable_load_sum =
decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
* Step 2
*/
delta %= 1024;
contrib = __accumulate_pelt_segments(periods,
1024 - sa->period_contrib, delta);
}
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;
return periods;
}
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
*
* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
* p0 p1 p2
* (now) (~1ms ago) (~2ms ago)
*
* Let u_i denote the fraction of p_i that the entity was runnable.
*
* We then designate the fractions u_i as our co-efficients, yielding the
* following representation of historical load:
* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
*
* We choose y based on the with of a reasonably scheduling period, fixing:
* y^32 = 0.5
*
* This means that the contribution to load ~32ms ago (u_32) will be weighted
* approximately half as much as the contribution to load within the last ms
* (u_0).
*
* When a period "rolls over" and we have new u_0`, multiplying the previous
* sum again by y is sufficient to update:
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
sa->last_update_time = now;
return 0;
}
/*
* Use 1024ns as the unit of measurement since it's a reasonable
* approximation of 1us and fast to compute.
*/
delta >>= 10;
if (!delta)
return 0;
sa->last_update_time += delta << 10;
/*
* running is a subset of runnable (weight) so running can't be set if
* runnable is clear. But there are some corner cases where the current
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
* update_blocked_averages()
*/
if (!load)
runnable = running = 0;
/*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;
return 1;
}
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
/*
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
sa->util_avg = sa->util_sum / divider;
}
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.
* This flag is used to synchronize util_avg updates with util_est updates.
* We map this information into the LSB bit of the utilization saved at
* dequeue time (i.e. util_est.dequeued).
*/
#define UTIL_AVG_UNCHANGED 0x1
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Avoid store if the flag has been already set */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
/*
* sched_entity:
*
* task:
* se_runnable() == se_weight()
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
* se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
*
* load_sum := runnable_sum
* load_avg = se_weight(se) * runnable_avg
*
* runnable_load_sum := runnable_sum
* runnable_load_avg = se_runnable(se) * runnable_avg
*
* XXX collapse load_sum and runnable_load_sum
*
* cfq_rs:
*
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
*
* runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
return 0;
}
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
return 1;
}
return 0;
}
static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, cpu, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
return 1;
}
return 0;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
#else /* CONFIG_SMP */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
/* updated child weight may affect parent so we have to do this bottom up */
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
sched_avg_update(this_rq);
}
/* Used instead of source_load when we know the type == 0 */
@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
return -1;
@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
return 0;
/* Leaving a core idle is often worse than degrading locality. */
if (env->idle != CPU_NOT_IDLE)
if (env->idle == CPU_IDLE)
return -1;
dist = node_distance(src_nid, dst_nid);
if (numa_group) {
src_faults = group_faults(p, src_nid);
dst_faults = group_faults(p, dst_nid);
src_weight = group_weight(p, src_nid, dist);
dst_weight = group_weight(p, dst_nid, dist);
} else {
src_faults = task_faults(p, src_nid);
dst_faults = task_faults(p, dst_nid);
src_weight = task_weight(p, src_nid, dist);
dst_weight = task_weight(p, dst_nid, dist);
}
return dst_faults < src_faults;
return dst_weight < src_weight;
}
#else
@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
return false;
}
static inline bool others_have_blocked(struct rq *rq)
{
if (READ_ONCE(rq->avg_rt.util_avg))
return true;
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
#endif
return false;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
done = false;
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
if (!cfs_rq_has_blocked(cfs_rq))
if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, used, age_stamp, avg;
s64 delta;
unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
unsigned long used, free;
unsigned long irq;
/*
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
age_stamp = READ_ONCE(rq->age_stamp);
avg = READ_ONCE(rq->rt_avg);
delta = __rq_clock_broken(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
total = sched_avg_period() + delta;
used = div_u64(avg, total);
if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return 1;
used = READ_ONCE(rq->avg_rt.util_avg);
used += READ_ONCE(rq->avg_dl.util_avg);
if (unlikely(used >= max))
return 1;
free = max - used;
return scale_irq_capacity(free, irq, max);
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
if (!capacity)
capacity = 1;

399
kernel/sched/pelt.c Normal file
View file

@ -0,0 +1,399 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Per Entity Load Tracking
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Interactivity improvements by Mike Galbraith
* (C) 2007 Mike Galbraith <efault@gmx.de>
*
* Various enhancements by Dmitry Adamushko.
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation, 2007
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
*
* Scaled math optimizations by Thomas Gleixner
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*
* Move PELT related code from fair.c into this pelt.c file
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
#include <linux/sched.h>
#include "sched.h"
#include "sched-pelt.h"
#include "pelt.h"
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
local_n = n;
/*
* As y^PERIOD = 1/2, we can combine
* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
* With a look-up table which covers y^n (n<PERIOD)
*
* To achieve constant time decay_load.
*/
if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
val >>= local_n / LOAD_AVG_PERIOD;
local_n %= LOAD_AVG_PERIOD;
}
val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
return val;
}
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
u32 c1, c2, c3 = d3; /* y^0 == 1 */
/*
* c1 = d1 y^p
*/
c1 = decay_load((u64)d1, periods);
/*
* p-1
* c2 = 1024 \Sum y^n
* n=1
*
* inf inf
* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
* n=0 n=p
*/
c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
return c1 + c2 + c3;
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
* the remainder of the (incomplete) current period.
*
* d1 d2 d3
* ^ ^ ^
* | | |
* |<->|<----------------->|<--->|
* ... |---x---|------| ... |------|-----x (now)
*
* p-1
* u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
* n=1
*
* = u y^p + (Step 1)
*
* p-1
* d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
* n=1
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
scale_freq = arch_scale_freq_capacity(cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
/*
* Step 1: decay old *_sum if we crossed period boundaries.
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
sa->runnable_load_sum =
decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
* Step 2
*/
delta %= 1024;
contrib = __accumulate_pelt_segments(periods,
1024 - sa->period_contrib, delta);
}
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;
return periods;
}
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
*
* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
* p0 p1 p2
* (now) (~1ms ago) (~2ms ago)
*
* Let u_i denote the fraction of p_i that the entity was runnable.
*
* We then designate the fractions u_i as our co-efficients, yielding the
* following representation of historical load:
* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
*
* We choose y based on the with of a reasonably scheduling period, fixing:
* y^32 = 0.5
*
* This means that the contribution to load ~32ms ago (u_32) will be weighted
* approximately half as much as the contribution to load within the last ms
* (u_0).
*
* When a period "rolls over" and we have new u_0`, multiplying the previous
* sum again by y is sufficient to update:
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
sa->last_update_time = now;
return 0;
}
/*
* Use 1024ns as the unit of measurement since it's a reasonable
* approximation of 1us and fast to compute.
*/
delta >>= 10;
if (!delta)
return 0;
sa->last_update_time += delta << 10;
/*
* running is a subset of runnable (weight) so running can't be set if
* runnable is clear. But there are some corner cases where the current
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
* update_blocked_averages()
*/
if (!load)
runnable = running = 0;
/*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;
return 1;
}
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
/*
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
/*
* sched_entity:
*
* task:
* se_runnable() == se_weight()
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
* se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
*
* load_sum := runnable_sum
* load_avg = se_weight(se) * runnable_avg
*
* runnable_load_sum := runnable_sum
* runnable_load_avg = se_runnable(se) * runnable_avg
*
* XXX collapse load_sum and runnable_load_sum
*
* cfq_rq:
*
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
*
* runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
return 0;
}
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
return 1;
}
return 0;
}
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, cpu, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
return 1;
}
return 0;
}
/*
* rt_rq:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
* runnable_load_sum = load_sum
*
* load_avg and runnable_load_avg are not supported and meaningless.
*
*/
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
running,
running,
running)) {
___update_load_avg(&rq->avg_rt, 1, 1);
return 1;
}
return 0;
}
/*
* dl_rq:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
* runnable_load_sum = load_sum
*
*/
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
running,
running,
running)) {
___update_load_avg(&rq->avg_dl, 1, 1);
return 1;
}
return 0;
}
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
/*
* irq:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
* runnable_load_sum = load_sum
*
*/
int update_irq_load_avg(struct rq *rq, u64 running)
{
int ret = 0;
/*
* We know the time that has been used by interrupt since last update
* but we don't when. Let be pessimistic and assume that interrupt has
* happened just before the update. This is not so far from reality
* because interrupt will most probably wake up task and trig an update
* of rq clock during which the metric si updated.
* We start to decay with normal context time and then we add the
* interrupt context time.
* We can safely remove running from rq->clock because
* rq->clock += delta with delta >= running
*/
ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
0,
0,
0);
ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
1,
1,
1);
if (ret)
___update_load_avg(&rq->avg_irq, 1, 1);
return ret;
}
#endif

72
kernel/sched/pelt.h Normal file
View file

@ -0,0 +1,72 @@
#ifdef CONFIG_SMP
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
int update_irq_load_avg(struct rq *rq, u64 running);
#else
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
#endif
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.
* This flag is used to synchronize util_avg updates with util_est updates.
* We map this information into the LSB bit of the utilization saved at
* dequeue time (i.e. util_est.dequeued).
*/
#define UTIL_AVG_UNCHANGED 0x1
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Avoid store if the flag has been already set */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
#else
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int
update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
static inline int
update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
#endif

View file

@ -5,6 +5,8 @@
*/
#include "sched.h"
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled())
return;
@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rt_queue_push_tasks(rq);
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
return p;
}
@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
* if it is still active
@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
watchdog(rq, p);

View file

@ -594,6 +594,7 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
int rt_queued;
@ -673,7 +674,26 @@ struct dl_rq {
u64 bw_ratio;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
#else
#define entity_is_task(se) 1
#endif
#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
static inline long se_weight(struct sched_entity *se)
{
return scale_load_down(se->load.weight);
}
static inline long se_runnable(struct sched_entity *se)
{
return scale_load_down(se->runnable_weight);
}
static inline bool sched_asym_prefer(int a, int b)
{
@ -833,8 +853,12 @@ struct rq {
struct list_head cfs_tasks;
u64 rt_avg;
u64 age_stamp;
struct sched_avg avg_rt;
struct sched_avg avg_dl;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#define HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
u64 idle_stamp;
u64 avg_idle;
@ -1075,7 +1099,8 @@ enum numa_faults_stats {
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
static inline void
@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_time_avg;
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
static inline u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
#ifdef CONFIG_SCHED_HRTICK
/*
@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
sched_avg_update(rq);
}
#else
#ifndef arch_scale_cpu_capacity
static __always_inline
@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
static inline unsigned long cpu_util_dl(struct rq *rq)
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}
static inline unsigned long cpu_util_dl(struct rq *rq)
{
return READ_ONCE(rq->avg_dl.util_avg);
}
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
return util;
}
static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
#endif
#ifdef HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return rq->avg_irq.util_avg;
}
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
util *= (max - irq);
util /= max;
return util;
}
#else
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return 0;
}
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
return util;
}
#endif

View file

@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
void swake_up(struct swait_queue_head *q)
void swake_up_one(struct swait_queue_head *q)
{
unsigned long flags;
@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up);
EXPORT_SYMBOL(swake_up_one);
/*
* Does not allow usage from IRQ disabled, since we must be able to
@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_all);
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
list_add(&wait->task_list, &q->task_list);
list_add_tail(&wait->task_list, &q->task_list);
}
void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
unsigned long flags;
@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
set_current_state(state);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_swait);
EXPORT_SYMBOL(prepare_to_swait_exclusive);
long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
if (signal_pending_state(state, current))
return -ERESTARTSYS;
unsigned long flags;
long ret = 0;
prepare_to_swait(q, wait, state);
raw_spin_lock_irqsave(&q->lock, flags);
if (unlikely(signal_pending_state(state, current))) {
/*
* See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
* must not see us.
*/
list_del_init(&wait->task_list);
ret = -ERESTARTSYS;
} else {
__prepare_to_swait(q, wait);
set_current_state(state);
}
raw_spin_unlock_irqrestore(&q->lock, flags);
return 0;
return ret;
}
EXPORT_SYMBOL(prepare_to_swait_event);

View file

@ -238,7 +238,6 @@ int smpboot_unpark_threads(unsigned int cpu)
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list)
if (cpumask_test_cpu(cpu, cur->cpumask))
smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
return 0;
@ -280,33 +279,25 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
* smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
* smpboot_register_percpu_thread - Register a per_cpu thread related
* to hotplug
* @plug_thread: Hotplug thread descriptor
* @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
const struct cpumask *cpumask)
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
free_cpumask_var(plug_thread->cpumask);
goto out;
}
if (cpumask_test_cpu(cpu, cpumask))
smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
@ -315,7 +306,7 @@ int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_threa
put_online_cpus();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
/**
* smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
* @plug_thread: Hotplug thread descriptor
* @new: Revised mask to use
*
* The cpumask field in the smp_hotplug_thread must not be updated directly
* by the client, but only by calling this function.
* This function can only be called on a registered smp_hotplug_thread.
*/
void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *new)
{
struct cpumask *old = plug_thread->cpumask;
static struct cpumask tmp;
unsigned int cpu;
lockdep_assert_cpus_held();
mutex_lock(&smpboot_threads_lock);
/* Park threads that were exclusively enabled on the old mask. */
cpumask_andnot(&tmp, old, new);
for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_park_thread(plug_thread, cpu);
/* Unpark threads that are exclusively enabled on the new mask. */
cpumask_andnot(&tmp, new, old);
for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_unpark_thread(plug_thread, cpu);
cpumask_copy(old, new);
mutex_unlock(&smpboot_threads_lock);
}
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
/*

View file

@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
DEFINE_WAKE_Q(wakeq);
int err;
retry:
/*
* The waking up of stopper threads has to happen in the same
* scheduling context as the queueing. Otherwise, there is a
* possibility of one of the above stoppers being woken up by another
* CPU, and preempting us. This will cause us to not wake up the other
* stopper forever.
*/
preempt_disable();
raw_spin_lock_irq(&stopper1->lock);
raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
if (!stopper1->enabled || !stopper2->enabled) {
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
goto unlock;
}
/*
* Ensure that if we race with __stop_cpus() the stoppers won't get
* queued up in reverse order leading to system deadlock.
@ -255,36 +266,30 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
* It can be falsely true but it is safe to spin until it is cleared,
* queue_stop_cpus_work() does everything under preempt_disable().
*/
if (unlikely(stop_cpus_in_progress)) {
err = -EDEADLK;
if (unlikely(stop_cpus_in_progress))
goto unlock;
}
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
/*
* The waking up of stopper threads has to happen
* in the same scheduling context as the queueing.
* Otherwise, there is a possibility of one of the
* above stoppers being woken up by another CPU,
* and preempting us. This will cause us to n ot
* wake up the other stopper forever.
*/
preempt_disable();
unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
if (unlikely(err == -EDEADLK)) {
preempt_enable();
while (stop_cpus_in_progress)
cpu_relax();
goto retry;
}
if (!err) {
wake_up_q(&wakeq);
preempt_enable();
}
return err;
}

View file

@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_time_avg_ms",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",

View file

@ -18,18 +18,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/kthread.h>
static DEFINE_MUTEX(watchdog_mutex);
@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
static bool softlockup_threads_initialized __read_mostly;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/*
* The watchdog thread function - touches the timestamp.
*
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
* for more than 2*watchdog_thresh seconds then the debug-printout
* triggers in watchdog_timer_fn().
*/
static int softlockup_fn(void *data)
{
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
}
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
watchdog_interrupt_count();
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
reinit_completion(this_cpu_ptr(&softlockup_completion));
stop_one_cpu_nowait(smp_processor_id(),
softlockup_fn, NULL,
this_cpu_ptr(&softlockup_stop_work));
}
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
struct sched_param param = { .sched_priority = prio };
sched_setscheduler(current, policy, &param);
}
static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
struct completion *done = this_cpu_ptr(&softlockup_completion);
WARN_ON_ONCE(cpu != smp_processor_id());
init_completion(done);
complete(done);
/*
* Start the timer first to prevent the NMI watchdog triggering
@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu)
/* Enable the perf event */
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
watchdog_nmi_enable(cpu);
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
WARN_ON_ONCE(cpu != smp_processor_id());
/*
* Disable the perf event first. That prevents that a large delay
* between disabling the timer and disabling the perf event causes
@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu)
*/
watchdog_nmi_disable(cpu);
hrtimer_cancel(hrtimer);
wait_for_completion(this_cpu_ptr(&softlockup_completion));
}
static void watchdog_cleanup(unsigned int cpu, bool online)
static int softlockup_stop_fn(void *data)
{
watchdog_disable(cpu);
watchdog_disable(smp_processor_id());
return 0;
}
static int watchdog_should_run(unsigned int cpu)
static void softlockup_stop_all(void)
{
return __this_cpu_read(hrtimer_interrupts) !=
__this_cpu_read(soft_lockup_hrtimer_cnt);
}
int cpu;
/*
* The watchdog thread function - touches the timestamp.
*
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
* for more than 2*watchdog_thresh seconds then the debug-printout
* triggers in watchdog_timer_fn().
*/
static void watchdog(unsigned int cpu)
{
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
}
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run,
.thread_fn = watchdog,
.thread_comm = "watchdog/%u",
.setup = watchdog_enable,
.cleanup = watchdog_cleanup,
.park = watchdog_disable,
.unpark = watchdog_enable,
};
static void softlockup_update_smpboot_threads(void)
{
lockdep_assert_held(&watchdog_mutex);
if (!softlockup_threads_initialized)
if (!softlockup_initialized)
return;
smpboot_update_cpumask_percpu_thread(&watchdog_threads,
&watchdog_allowed_mask);
}
for_each_cpu(cpu, &watchdog_allowed_mask)
smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
/* Temporarily park all watchdog threads */
static void softlockup_park_all_threads(void)
{
cpumask_clear(&watchdog_allowed_mask);
softlockup_update_smpboot_threads();
}
/* Unpark enabled threads */
static void softlockup_unpark_threads(void)
static int softlockup_start_fn(void *data)
{
watchdog_enable(smp_processor_id());
return 0;
}
static void softlockup_start_all(void)
{
int cpu;
cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
softlockup_update_smpboot_threads();
for_each_cpu(cpu, &watchdog_allowed_mask)
smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}
int lockup_detector_online_cpu(unsigned int cpu)
{
watchdog_enable(cpu);
return 0;
}
int lockup_detector_offline_cpu(unsigned int cpu)
{
watchdog_disable(cpu);
return 0;
}
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
watchdog_nmi_stop();
softlockup_park_all_threads();
softlockup_stop_all();
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
softlockup_unpark_threads();
softlockup_start_all();
watchdog_nmi_start();
cpus_read_unlock();
/*
@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void)
*/
static __init void lockup_detector_setup(void)
{
int ret;
/*
* If sysctl is off and watchdog got disabled on the command line,
* nothing to do here.
@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void)
!(watchdog_enabled && watchdog_thresh))
return;
ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
&watchdog_allowed_mask);
if (ret) {
pr_err("Failed to initialize soft lockup detector threads\n");
return;
}
mutex_lock(&watchdog_mutex);
softlockup_threads_initialized = true;
lockup_detector_reconfigure();
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
static inline int watchdog_park_threads(void) { return 0; }
static inline void watchdog_unpark_threads(void) { }
static inline int watchdog_enable_all_cpus(void) { return 0; }
static inline void watchdog_disable_all_cpus(void) { }
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();

View file

@ -175,7 +175,7 @@ static int hardlockup_detector_event_create(void)
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
PTR_ERR(evt));
return PTR_ERR(evt);
}

View file

@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.pause = false;
swake_up(kvm_arch_vcpu_wq(vcpu));
swake_up_one(kvm_arch_vcpu_wq(vcpu));
}
}
@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
{
struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
(!vcpu->arch.pause)));
if (vcpu->arch.power_off || vcpu->arch.pause) {

View file

@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
smp_mb(); /* Make sure the above is visible */
wq = kvm_arch_vcpu_wq(vcpu);
swake_up(wq);
swake_up_one(wq);
return PSCI_RET_SUCCESS;
}

View file

@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva);
if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
swake_up_one(&vcpu->wq);
mmput(mm);
kvm_put_kvm(vcpu->kvm);

View file

@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_blocking(vcpu);
for (;;) {
prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
if (kvm_vcpu_check_block(vcpu) < 0)
break;
@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
wqp = kvm_arch_vcpu_wq(vcpu);
if (swq_has_sleeper(wqp)) {
swake_up(wqp);
swake_up_one(wqp);
++vcpu->stat.halt_wakeup;
return true;
}