Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner: - Cleanup and improvement of NUMA balancing - Refactoring and improvements to the PELT (Per Entity Load Tracking) code - Watchdog simplification and related cleanups - The usual pile of small incremental fixes and improvements * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits) watchdog: Reduce message verbosity stop_machine: Reflow cpu_stop_queue_two_works() sched/numa: Move task_numa_placement() closer to numa_migrate_preferred() sched/numa: Use group_weights to identify if migration degrades locality sched/numa: Update the scan period without holding the numa_group lock sched/numa: Remove numa_has_capacity() sched/numa: Modify migrate_swap() to accept additional parameters sched/numa: Remove unused task_capacity from 'struct numa_stats' sched/numa: Skip nodes that are at 'hoplimit' sched/debug: Reverse the order of printing faults sched/numa: Use task faults only if numa_group is not yet set up sched/numa: Set preferred_node based on best_cpu sched/numa: Simplify load_too_imbalanced() sched/numa: Evaluate move once per node sched/numa: Remove redundant field sched/debug: Show the sum wait time of a task group sched/fair: Remove #ifdefs from scale_rt_capacity() sched/core: Remove get_cpu() from sched_fork() sched/cpufreq: Clarify sugov_get_util() sched/sysctl: Remove unused sched_time_avg_ms sysctl ...
This commit is contained in:
commit
f7951c33f0
38 changed files with 1015 additions and 876 deletions
|
@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
|
|||
dvcpu->arch.wait = 0;
|
||||
|
||||
if (swq_has_sleeper(&dvcpu->wq))
|
||||
swake_up(&dvcpu->wq);
|
||||
swake_up_one(&dvcpu->wq);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
|
|||
|
||||
vcpu->arch.wait = 0;
|
||||
if (swq_has_sleeper(&vcpu->wq))
|
||||
swake_up(&vcpu->wq);
|
||||
swake_up_one(&vcpu->wq);
|
||||
}
|
||||
|
||||
/* low level hrtimer wake routine */
|
||||
|
|
|
@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
|
|||
|
||||
wqp = kvm_arch_vcpu_wq(vcpu);
|
||||
if (swq_has_sleeper(wqp)) {
|
||||
swake_up(wqp);
|
||||
swake_up_one(wqp);
|
||||
++vcpu->stat.halt_wakeup;
|
||||
}
|
||||
|
||||
|
@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
|
|||
}
|
||||
}
|
||||
|
||||
prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
|
||||
prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
|
||||
|
||||
if (kvmppc_vcore_check_block(vc)) {
|
||||
finish_swait(&vc->wq, &wait);
|
||||
|
@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
|
|||
kvmppc_start_thread(vcpu, vc);
|
||||
trace_kvm_guest_enter(vcpu);
|
||||
} else if (vc->vcore_state == VCORE_SLEEPING) {
|
||||
swake_up(&vc->wq);
|
||||
swake_up_one(&vc->wq);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
|
|||
* yield-candidate.
|
||||
*/
|
||||
vcpu->preempted = true;
|
||||
swake_up(&vcpu->wq);
|
||||
swake_up_one(&vcpu->wq);
|
||||
vcpu->stat.halt_wakeup++;
|
||||
}
|
||||
/*
|
||||
|
|
|
@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
|
|||
|
||||
for (;;) {
|
||||
if (!n.halted)
|
||||
prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
|
||||
prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
|
||||
if (hlist_unhashed(&n.link))
|
||||
break;
|
||||
|
||||
|
@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
|
|||
if (n->halted)
|
||||
smp_send_reschedule(n->cpu);
|
||||
else if (swq_has_sleeper(&n->wq))
|
||||
swake_up(&n->wq);
|
||||
swake_up_one(&n->wq);
|
||||
}
|
||||
|
||||
static void apf_task_wake_all(void)
|
||||
|
|
|
@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
|
|||
* using swait_active() is safe.
|
||||
*/
|
||||
if (swait_active(q))
|
||||
swake_up(q);
|
||||
swake_up_one(q);
|
||||
|
||||
if (apic_lvtt_tscdeadline(apic))
|
||||
ktimer->expired_tscdeadline = ktimer->tscdeadline;
|
||||
|
|
|
@ -164,6 +164,7 @@ enum cpuhp_state {
|
|||
CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
|
||||
CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
|
||||
CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
|
||||
CPUHP_AP_WATCHDOG_ONLINE,
|
||||
CPUHP_AP_WORKQUEUE_ONLINE,
|
||||
CPUHP_AP_RCUTREE_ONLINE,
|
||||
CPUHP_AP_ONLINE_DYN,
|
||||
|
|
|
@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void);
|
|||
extern void touch_softlockup_watchdog_sync(void);
|
||||
extern void touch_all_softlockup_watchdogs(void);
|
||||
extern unsigned int softlockup_panic;
|
||||
#else
|
||||
|
||||
extern int lockup_detector_online_cpu(unsigned int cpu);
|
||||
extern int lockup_detector_offline_cpu(unsigned int cpu);
|
||||
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
static inline void touch_softlockup_watchdog_sched(void) { }
|
||||
static inline void touch_softlockup_watchdog(void) { }
|
||||
static inline void touch_softlockup_watchdog_sync(void) { }
|
||||
static inline void touch_all_softlockup_watchdogs(void) { }
|
||||
#endif
|
||||
|
||||
#define lockup_detector_online_cpu NULL
|
||||
#define lockup_detector_offline_cpu NULL
|
||||
#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
|
||||
#ifdef CONFIG_DETECT_HUNG_TASK
|
||||
void reset_hung_task_detector(void);
|
||||
|
|
|
@ -1017,7 +1017,6 @@ struct task_struct {
|
|||
u64 last_sum_exec_runtime;
|
||||
struct callback_head numa_work;
|
||||
|
||||
struct list_head numa_entry;
|
||||
struct numa_group *numa_group;
|
||||
|
||||
/*
|
||||
|
|
|
@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
|
|||
#ifdef CONFIG_SCHED_DEBUG
|
||||
extern __read_mostly unsigned int sysctl_sched_migration_cost;
|
||||
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
|
||||
extern __read_mostly unsigned int sysctl_sched_time_avg;
|
||||
|
||||
int sched_proc_update_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *length,
|
||||
|
|
|
@ -25,8 +25,6 @@ struct smpboot_thread_data;
|
|||
* parked (cpu offline)
|
||||
* @unpark: Optional unpark function, called when the thread is
|
||||
* unparked (cpu online)
|
||||
* @cpumask: Internal state. To update which threads are unparked,
|
||||
* call smpboot_update_cpumask_percpu_thread().
|
||||
* @selfparking: Thread is not parked by the park function.
|
||||
* @thread_comm: The base name of the thread
|
||||
*/
|
||||
|
@ -40,23 +38,12 @@ struct smp_hotplug_thread {
|
|||
void (*cleanup)(unsigned int cpu, bool online);
|
||||
void (*park)(unsigned int cpu);
|
||||
void (*unpark)(unsigned int cpu);
|
||||
cpumask_var_t cpumask;
|
||||
bool selfparking;
|
||||
const char *thread_comm;
|
||||
};
|
||||
|
||||
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
|
||||
const struct cpumask *cpumask);
|
||||
|
||||
static inline int
|
||||
smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
|
||||
{
|
||||
return smpboot_register_percpu_thread_cpumask(plug_thread,
|
||||
cpu_possible_mask);
|
||||
}
|
||||
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
|
||||
|
||||
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
|
||||
void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
|
||||
const struct cpumask *);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
* wait-queues, but the semantics are actually completely different, and
|
||||
* every single user we have ever had has been buggy (or pointless).
|
||||
*
|
||||
* A "swake_up()" only wakes up _one_ waiter, which is not at all what
|
||||
* A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
|
||||
* "wake_up()" does, and has led to problems. In other cases, it has
|
||||
* been fine, because there's only ever one waiter (kvm), but in that
|
||||
* case the whole "simple" wait-queue is just pointless to begin with,
|
||||
|
@ -38,8 +38,8 @@
|
|||
* all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
|
||||
* sleeper state.
|
||||
*
|
||||
* - the exclusive mode; because this requires preserving the list order
|
||||
* and this is hard.
|
||||
* - the !exclusive mode; because that leads to O(n) wakeups, everything is
|
||||
* exclusive.
|
||||
*
|
||||
* - custom wake callback functions; because you cannot give any guarantees
|
||||
* about random code. This also allows swait to be used in RT, such that
|
||||
|
@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
|
|||
* CPU0 - waker CPU1 - waiter
|
||||
*
|
||||
* for (;;) {
|
||||
* @cond = true; prepare_to_swait(&wq_head, &wait, state);
|
||||
* @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state);
|
||||
* smp_mb(); // smp_mb() from set_current_state()
|
||||
* if (swait_active(wq_head)) if (@cond)
|
||||
* wake_up(wq_head); break;
|
||||
|
@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
|
|||
return swait_active(wq);
|
||||
}
|
||||
|
||||
extern void swake_up(struct swait_queue_head *q);
|
||||
extern void swake_up_one(struct swait_queue_head *q);
|
||||
extern void swake_up_all(struct swait_queue_head *q);
|
||||
extern void swake_up_locked(struct swait_queue_head *q);
|
||||
|
||||
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
|
||||
extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
|
||||
extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
|
||||
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
|
||||
|
||||
extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
|
||||
extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
|
||||
|
||||
/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
|
||||
/* as per ___wait_event() but for swait, therefore "exclusive == 1" */
|
||||
#define ___swait_event(wq, condition, state, ret, cmd) \
|
||||
({ \
|
||||
__label__ __out; \
|
||||
struct swait_queue __wait; \
|
||||
long __ret = ret; \
|
||||
\
|
||||
|
@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
|
|||
\
|
||||
if (___wait_is_interruptible(state) && __int) { \
|
||||
__ret = __int; \
|
||||
break; \
|
||||
goto __out; \
|
||||
} \
|
||||
\
|
||||
cmd; \
|
||||
} \
|
||||
finish_swait(&wq, &__wait); \
|
||||
__ret; \
|
||||
__out: __ret; \
|
||||
})
|
||||
|
||||
#define __swait_event(wq, condition) \
|
||||
(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
|
||||
schedule())
|
||||
|
||||
#define swait_event(wq, condition) \
|
||||
#define swait_event_exclusive(wq, condition) \
|
||||
do { \
|
||||
if (condition) \
|
||||
break; \
|
||||
|
@ -208,7 +208,7 @@ do { \
|
|||
TASK_UNINTERRUPTIBLE, timeout, \
|
||||
__ret = schedule_timeout(__ret))
|
||||
|
||||
#define swait_event_timeout(wq, condition, timeout) \
|
||||
#define swait_event_timeout_exclusive(wq, condition, timeout) \
|
||||
({ \
|
||||
long __ret = timeout; \
|
||||
if (!___wait_cond_timeout(condition)) \
|
||||
|
@ -220,7 +220,7 @@ do { \
|
|||
___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
|
||||
schedule())
|
||||
|
||||
#define swait_event_interruptible(wq, condition) \
|
||||
#define swait_event_interruptible_exclusive(wq, condition) \
|
||||
({ \
|
||||
int __ret = 0; \
|
||||
if (!(condition)) \
|
||||
|
@ -233,7 +233,7 @@ do { \
|
|||
TASK_INTERRUPTIBLE, timeout, \
|
||||
__ret = schedule_timeout(__ret))
|
||||
|
||||
#define swait_event_interruptible_timeout(wq, condition, timeout) \
|
||||
#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\
|
||||
({ \
|
||||
long __ret = timeout; \
|
||||
if (!___wait_cond_timeout(condition)) \
|
||||
|
@ -246,7 +246,7 @@ do { \
|
|||
(void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
|
||||
|
||||
/**
|
||||
* swait_event_idle - wait without system load contribution
|
||||
* swait_event_idle_exclusive - wait without system load contribution
|
||||
* @wq: the waitqueue to wait on
|
||||
* @condition: a C expression for the event to wait for
|
||||
*
|
||||
|
@ -257,7 +257,7 @@ do { \
|
|||
* condition and doesn't want to contribute to system load. Signals are
|
||||
* ignored.
|
||||
*/
|
||||
#define swait_event_idle(wq, condition) \
|
||||
#define swait_event_idle_exclusive(wq, condition) \
|
||||
do { \
|
||||
if (condition) \
|
||||
break; \
|
||||
|
@ -270,7 +270,7 @@ do { \
|
|||
__ret = schedule_timeout(__ret))
|
||||
|
||||
/**
|
||||
* swait_event_idle_timeout - wait up to timeout without load contribution
|
||||
* swait_event_idle_timeout_exclusive - wait up to timeout without load contribution
|
||||
* @wq: the waitqueue to wait on
|
||||
* @condition: a C expression for the event to wait for
|
||||
* @timeout: timeout at which we'll give up in jiffies
|
||||
|
@ -288,7 +288,7 @@ do { \
|
|||
* or the remaining jiffies (at least 1) if the @condition evaluated
|
||||
* to %true before the @timeout elapsed.
|
||||
*/
|
||||
#define swait_event_idle_timeout(wq, condition, timeout) \
|
||||
#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \
|
||||
({ \
|
||||
long __ret = timeout; \
|
||||
if (!___wait_cond_timeout(condition)) \
|
||||
|
|
|
@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
|
|||
.startup.single = perf_event_init_cpu,
|
||||
.teardown.single = perf_event_exit_cpu,
|
||||
},
|
||||
[CPUHP_AP_WATCHDOG_ONLINE] = {
|
||||
.name = "lockup_detector:online",
|
||||
.startup.single = lockup_detector_online_cpu,
|
||||
.teardown.single = lockup_detector_offline_cpu,
|
||||
},
|
||||
[CPUHP_AP_WORKQUEUE_ONLINE] = {
|
||||
.name = "workqueue:online",
|
||||
.startup.single = workqueue_online_cpu,
|
||||
|
|
|
@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self)
|
|||
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
|
||||
break;
|
||||
|
||||
complete_all(&self->parked);
|
||||
complete(&self->parked);
|
||||
schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k)
|
|||
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
|
||||
__kthread_bind(k, kthread->cpu, TASK_PARKED);
|
||||
|
||||
reinit_completion(&kthread->parked);
|
||||
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
|
||||
/*
|
||||
* __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
|
||||
|
@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k)
|
|||
if (WARN_ON(k->flags & PF_EXITING))
|
||||
return -ENOSYS;
|
||||
|
||||
if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
|
||||
return -EBUSY;
|
||||
|
||||
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
|
||||
if (k != current) {
|
||||
wake_up_process(k);
|
||||
|
|
|
@ -92,7 +92,7 @@ static void s2idle_enter(void)
|
|||
/* Push all the CPUs into the idle loop. */
|
||||
wake_up_all_idle_cpus();
|
||||
/* Make the current CPU wait so it can enter the idle loop too. */
|
||||
swait_event(s2idle_wait_head,
|
||||
swait_event_exclusive(s2idle_wait_head,
|
||||
s2idle_state == S2IDLE_STATE_WAKE);
|
||||
|
||||
cpuidle_pause();
|
||||
|
@ -160,7 +160,7 @@ void s2idle_wake(void)
|
|||
raw_spin_lock_irqsave(&s2idle_lock, flags);
|
||||
if (s2idle_state > S2IDLE_STATE_NONE) {
|
||||
s2idle_state = S2IDLE_STATE_WAKE;
|
||||
swake_up(&s2idle_wait_head);
|
||||
swake_up_one(&s2idle_wait_head);
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&s2idle_lock, flags);
|
||||
}
|
||||
|
|
|
@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
|
|||
|
||||
WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
|
||||
if (!newval && READ_ONCE(sp->srcu_gp_waiting))
|
||||
swake_up(&sp->srcu_wq);
|
||||
swake_up_one(&sp->srcu_wq);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
|
||||
|
||||
|
@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
|
|||
idx = sp->srcu_idx;
|
||||
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
|
||||
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
|
||||
swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
|
||||
swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
|
||||
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
|
||||
|
||||
/* Invoke the callbacks we removed above. */
|
||||
|
|
|
@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
|
|||
!READ_ONCE(rsp->gp_flags) ||
|
||||
!rsp->gp_kthread)
|
||||
return;
|
||||
swake_up(&rsp->gp_wq);
|
||||
swake_up_one(&rsp->gp_wq);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
|
|||
}
|
||||
|
||||
/*
|
||||
* Helper function for swait_event_idle() wakeup at force-quiescent-state
|
||||
* Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
|
||||
* time.
|
||||
*/
|
||||
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
|
||||
|
@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
|
|||
READ_ONCE(rsp->gp_seq),
|
||||
TPS("reqwait"));
|
||||
rsp->gp_state = RCU_GP_WAIT_GPS;
|
||||
swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
|
||||
swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
|
||||
RCU_GP_FLAG_INIT);
|
||||
rsp->gp_state = RCU_GP_DONE_GPS;
|
||||
/* Locking provides needed memory barrier. */
|
||||
|
@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
|
|||
READ_ONCE(rsp->gp_seq),
|
||||
TPS("fqswait"));
|
||||
rsp->gp_state = RCU_GP_WAIT_FQS;
|
||||
ret = swait_event_idle_timeout(rsp->gp_wq,
|
||||
ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
|
||||
rcu_gp_fqs_check_wake(rsp, &gf), j);
|
||||
rsp->gp_state = RCU_GP_DOING_FQS;
|
||||
/* Locking provides needed memory barriers. */
|
||||
|
|
|
@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
|
|||
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
|
||||
if (wake) {
|
||||
smp_mb(); /* EGP done before wake_up(). */
|
||||
swake_up(&rsp->expedited_wq);
|
||||
swake_up_one(&rsp->expedited_wq);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
|
|||
jiffies_start = jiffies;
|
||||
|
||||
for (;;) {
|
||||
ret = swait_event_timeout(
|
||||
ret = swait_event_timeout_exclusive(
|
||||
rsp->expedited_wq,
|
||||
sync_rcu_preempt_exp_done_unlocked(rnp_root),
|
||||
jiffies_stall);
|
||||
|
|
|
@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
|
|||
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
|
||||
del_timer(&rdp->nocb_timer);
|
||||
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
|
||||
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
|
||||
swake_up(&rdp_leader->nocb_wq);
|
||||
smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
|
||||
swake_up_one(&rdp_leader->nocb_wq);
|
||||
} else {
|
||||
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
|
||||
}
|
||||
|
@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
|
|||
*/
|
||||
trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
|
||||
for (;;) {
|
||||
swait_event_interruptible(
|
||||
swait_event_interruptible_exclusive(
|
||||
rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
|
||||
(d = rcu_seq_done(&rnp->gp_seq, c)));
|
||||
if (likely(d))
|
||||
|
@ -2188,7 +2188,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
|
|||
/* Wait for callbacks to appear. */
|
||||
if (!rcu_nocb_poll) {
|
||||
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
|
||||
swait_event_interruptible(my_rdp->nocb_wq,
|
||||
swait_event_interruptible_exclusive(my_rdp->nocb_wq,
|
||||
!READ_ONCE(my_rdp->nocb_leader_sleep));
|
||||
raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
|
||||
my_rdp->nocb_leader_sleep = true;
|
||||
|
@ -2253,7 +2253,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
|
|||
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
|
||||
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
|
||||
/* List was empty, so wake up the follower. */
|
||||
swake_up(&rdp->nocb_wq);
|
||||
swake_up_one(&rdp->nocb_wq);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
|
|||
{
|
||||
for (;;) {
|
||||
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
|
||||
swait_event_interruptible(rdp->nocb_wq,
|
||||
swait_event_interruptible_exclusive(rdp->nocb_wq,
|
||||
READ_ONCE(rdp->nocb_follower_head));
|
||||
if (smp_load_acquire(&rdp->nocb_follower_head)) {
|
||||
/* ^^^ Ensure CB invocation follows _head test. */
|
||||
|
|
|
@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
|
|||
obj-y += idle.o fair.o rt.o deadline.o
|
||||
obj-y += wait.o wait_bit.o swait.o completion.o
|
||||
|
||||
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
|
||||
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
|
||||
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
|
||||
obj-$(CONFIG_SCHEDSTATS) += stats.o
|
||||
obj-$(CONFIG_SCHED_DEBUG) += debug.o
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#include "../workqueue_internal.h"
|
||||
#include "../smpboot.h"
|
||||
|
||||
#include "pelt.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/sched.h>
|
||||
|
||||
|
@ -44,14 +46,6 @@ const_debug unsigned int sysctl_sched_features =
|
|||
*/
|
||||
const_debug unsigned int sysctl_sched_nr_migrate = 32;
|
||||
|
||||
/*
|
||||
* period over which we average the RT time consumption, measured
|
||||
* in ms.
|
||||
*
|
||||
* default: 1s
|
||||
*/
|
||||
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
|
||||
|
||||
/*
|
||||
* period over which we measure -rt task CPU usage in us.
|
||||
* default: 1s
|
||||
|
@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
|
|||
|
||||
rq->clock_task += delta;
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
#ifdef HAVE_SCHED_AVG_IRQ
|
||||
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
|
||||
sched_rt_avg_update(rq, irq_delta + steal);
|
||||
update_irq_load_avg(rq, irq_delta + steal);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
|
|||
return true;
|
||||
}
|
||||
#endif /* CONFIG_NO_HZ_FULL */
|
||||
|
||||
void sched_avg_update(struct rq *rq)
|
||||
{
|
||||
s64 period = sched_avg_period();
|
||||
|
||||
while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
|
||||
/*
|
||||
* Inline assembly required to prevent the compiler
|
||||
* optimising this loop into a divmod call.
|
||||
* See __iter_div_u64_rem() for another example of this.
|
||||
*/
|
||||
asm("" : "+rm" (rq->age_stamp));
|
||||
rq->age_stamp += period;
|
||||
rq->rt_avg /= 2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
|
||||
|
@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
|
|||
__set_task_cpu(p, new_cpu);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static void __migrate_swap_task(struct task_struct *p, int cpu)
|
||||
{
|
||||
if (task_on_rq_queued(p)) {
|
||||
|
@ -1280,16 +1258,17 @@ static int migrate_swap_stop(void *data)
|
|||
/*
|
||||
* Cross migrate two tasks
|
||||
*/
|
||||
int migrate_swap(struct task_struct *cur, struct task_struct *p)
|
||||
int migrate_swap(struct task_struct *cur, struct task_struct *p,
|
||||
int target_cpu, int curr_cpu)
|
||||
{
|
||||
struct migration_swap_arg arg;
|
||||
int ret = -EINVAL;
|
||||
|
||||
arg = (struct migration_swap_arg){
|
||||
.src_task = cur,
|
||||
.src_cpu = task_cpu(cur),
|
||||
.src_cpu = curr_cpu,
|
||||
.dst_task = p,
|
||||
.dst_cpu = task_cpu(p),
|
||||
.dst_cpu = target_cpu,
|
||||
};
|
||||
|
||||
if (arg.src_cpu == arg.dst_cpu)
|
||||
|
@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
|
|||
out:
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
/*
|
||||
* wait_task_inactive - wait for a thread to unschedule.
|
||||
|
@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {}
|
|||
int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
{
|
||||
unsigned long flags;
|
||||
int cpu = get_cpu();
|
||||
|
||||
__sched_fork(clone_flags, p);
|
||||
/*
|
||||
|
@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|||
p->sched_reset_on_fork = 0;
|
||||
}
|
||||
|
||||
if (dl_prio(p->prio)) {
|
||||
put_cpu();
|
||||
if (dl_prio(p->prio))
|
||||
return -EAGAIN;
|
||||
} else if (rt_prio(p->prio)) {
|
||||
else if (rt_prio(p->prio))
|
||||
p->sched_class = &rt_sched_class;
|
||||
} else {
|
||||
else
|
||||
p->sched_class = &fair_sched_class;
|
||||
}
|
||||
|
||||
init_entity_runnable_average(&p->se);
|
||||
|
||||
|
@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|||
* We're setting the CPU for the first time, we don't migrate,
|
||||
* so use __set_task_cpu().
|
||||
*/
|
||||
__set_task_cpu(p, cpu);
|
||||
__set_task_cpu(p, smp_processor_id());
|
||||
if (p->sched_class->task_fork)
|
||||
p->sched_class->task_fork(p);
|
||||
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
||||
|
@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|||
plist_node_init(&p->pushable_tasks, MAX_PRIO);
|
||||
RB_CLEAR_NODE(&p->pushable_dl_tasks);
|
||||
#endif
|
||||
|
||||
put_cpu();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq)
|
|||
}
|
||||
}
|
||||
|
||||
static void set_cpu_rq_start_time(unsigned int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
rq->age_stamp = sched_clock_cpu(cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* used to mark begin/end of suspend/resume:
|
||||
*/
|
||||
|
@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
|
|||
|
||||
int sched_cpu_starting(unsigned int cpu)
|
||||
{
|
||||
set_cpu_rq_start_time(cpu);
|
||||
sched_rq_cpu_starting(cpu);
|
||||
sched_tick_start(cpu);
|
||||
return 0;
|
||||
|
@ -6106,7 +6073,6 @@ void __init sched_init(void)
|
|||
|
||||
#ifdef CONFIG_SMP
|
||||
idle_thread_set_boot_cpu();
|
||||
set_cpu_rq_start_time(smp_processor_id());
|
||||
#endif
|
||||
init_sched_fair_class();
|
||||
|
||||
|
@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
|
|||
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
|
||||
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
|
||||
|
||||
if (schedstat_enabled() && tg != &root_task_group) {
|
||||
u64 ws = 0;
|
||||
int i;
|
||||
|
||||
for_each_possible_cpu(i)
|
||||
ws += schedstat_val(tg->se[i]->statistics.wait_sum);
|
||||
|
||||
seq_printf(sf, "wait_sum %llu\n", ws);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
|
|
|
@ -53,9 +53,7 @@ struct sugov_cpu {
|
|||
unsigned int iowait_boost_max;
|
||||
u64 last_update;
|
||||
|
||||
/* The fields below are only needed when sharing a policy: */
|
||||
unsigned long util_cfs;
|
||||
unsigned long util_dl;
|
||||
unsigned long bw_dl;
|
||||
unsigned long max;
|
||||
|
||||
/* The field below is for single-CPU policies only: */
|
||||
|
@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
|
|||
return cpufreq_driver_resolve_freq(policy, freq);
|
||||
}
|
||||
|
||||
static void sugov_get_util(struct sugov_cpu *sg_cpu)
|
||||
/*
|
||||
* This function computes an effective utilization for the given CPU, to be
|
||||
* used for frequency selection given the linear relation: f = u * f_max.
|
||||
*
|
||||
* The scheduler tracks the following metrics:
|
||||
*
|
||||
* cpu_util_{cfs,rt,dl,irq}()
|
||||
* cpu_bw_dl()
|
||||
*
|
||||
* Where the cfs,rt and dl util numbers are tracked with the same metric and
|
||||
* synchronized windows and are thus directly comparable.
|
||||
*
|
||||
* The cfs,rt,dl utilization are the running times measured with rq->clock_task
|
||||
* which excludes things like IRQ and steal-time. These latter are then accrued
|
||||
* in the irq utilization.
|
||||
*
|
||||
* The DL bandwidth number otoh is not a measured metric but a value computed
|
||||
* based on the task model parameters and gives the minimal utilization
|
||||
* required to meet deadlines.
|
||||
*/
|
||||
static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(sg_cpu->cpu);
|
||||
unsigned long util, irq, max;
|
||||
|
||||
sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
|
||||
sg_cpu->util_cfs = cpu_util_cfs(rq);
|
||||
sg_cpu->util_dl = cpu_util_dl(rq);
|
||||
}
|
||||
|
||||
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(sg_cpu->cpu);
|
||||
sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
|
||||
sg_cpu->bw_dl = cpu_bw_dl(rq);
|
||||
|
||||
if (rt_rq_is_runnable(&rq->rt))
|
||||
return sg_cpu->max;
|
||||
return max;
|
||||
|
||||
/*
|
||||
* Utilization required by DEADLINE must always be granted while, for
|
||||
* FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
|
||||
* gracefully reduce the frequency when no tasks show up for longer
|
||||
* Early check to see if IRQ/steal time saturates the CPU; this can happen
|
||||
* because of inaccuracies in how we track these -- see
|
||||
* update_irq_load_avg().
|
||||
*/
|
||||
irq = cpu_util_irq(rq);
|
||||
if (unlikely(irq >= max))
|
||||
return max;
|
||||
|
||||
/*
|
||||
* Because the time spent on RT/DL tasks is visible as 'lost' time to
|
||||
* CFS tasks and we use the same metric to track the effective
|
||||
* utilization (PELT windows are synchronized) we can directly add them
|
||||
* to obtain the CPU's actual utilization.
|
||||
*/
|
||||
util = cpu_util_cfs(rq);
|
||||
util += cpu_util_rt(rq);
|
||||
|
||||
/*
|
||||
* We do not make cpu_util_dl() a permanent part of this sum because we
|
||||
* want to use cpu_bw_dl() later on, but we need to check if the
|
||||
* CFS+RT+DL sum is saturated (ie. no idle time) such that we select
|
||||
* f_max when there is no idle time.
|
||||
*
|
||||
* NOTE: numerical errors or stop class might cause us to not quite hit
|
||||
* saturation when we should -- something for later.
|
||||
*/
|
||||
if ((util + cpu_util_dl(rq)) >= max)
|
||||
return max;
|
||||
|
||||
/*
|
||||
* There is still idle time; further improve the number by using the
|
||||
* irq metric. Because IRQ/steal time is hidden from the task clock we
|
||||
* need to scale the task numbers:
|
||||
*
|
||||
* max - irq
|
||||
* U' = irq + ------- * U
|
||||
* max
|
||||
*/
|
||||
util = scale_irq_capacity(util, irq, max);
|
||||
util += irq;
|
||||
|
||||
/*
|
||||
* Bandwidth required by DEADLINE must always be granted while, for
|
||||
* FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
|
||||
* to gracefully reduce the frequency when no tasks show up for longer
|
||||
* periods of time.
|
||||
*
|
||||
* Ideally we would like to set util_dl as min/guaranteed freq and
|
||||
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
|
||||
* ready for such an interface. So, we only do the latter for now.
|
||||
* Ideally we would like to set bw_dl as min/guaranteed freq and util +
|
||||
* bw_dl as requested freq. However, cpufreq is not yet ready for such
|
||||
* an interface. So, we only do the latter for now.
|
||||
*/
|
||||
return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
|
||||
return min(max, util + sg_cpu->bw_dl);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
|
|||
*/
|
||||
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
|
||||
{
|
||||
if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
|
||||
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
|
||||
sg_policy->need_freq_update = true;
|
||||
}
|
||||
|
||||
|
@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
|
|||
|
||||
busy = sugov_cpu_is_busy(sg_cpu);
|
||||
|
||||
sugov_get_util(sg_cpu);
|
||||
util = sugov_get_util(sg_cpu);
|
||||
max = sg_cpu->max;
|
||||
util = sugov_aggregate_util(sg_cpu);
|
||||
sugov_iowait_apply(sg_cpu, time, &util, &max);
|
||||
next_f = get_next_freq(sg_policy, util, max);
|
||||
/*
|
||||
|
@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
|
|||
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
|
||||
unsigned long j_util, j_max;
|
||||
|
||||
sugov_get_util(j_sg_cpu);
|
||||
j_util = sugov_get_util(j_sg_cpu);
|
||||
j_max = j_sg_cpu->max;
|
||||
j_util = sugov_aggregate_util(j_sg_cpu);
|
||||
sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
|
||||
|
||||
if (j_util * max > j_max * util) {
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
* Fabio Checconi <fchecconi@gmail.com>
|
||||
*/
|
||||
#include "sched.h"
|
||||
#include "pelt.h"
|
||||
|
||||
struct dl_bandwidth def_dl_bandwidth;
|
||||
|
||||
|
@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq)
|
|||
curr->se.exec_start = now;
|
||||
cgroup_account_cputime(curr, delta_exec);
|
||||
|
||||
sched_rt_avg_update(rq, delta_exec);
|
||||
|
||||
if (dl_entity_is_special(dl_se))
|
||||
return;
|
||||
|
||||
|
@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|||
|
||||
deadline_queue_push_tasks(rq);
|
||||
|
||||
if (rq->curr->sched_class != &dl_sched_class)
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
|
|||
{
|
||||
update_curr_dl(rq);
|
||||
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
|
||||
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
|
||||
enqueue_pushable_dl_task(rq, p);
|
||||
}
|
||||
|
@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
|
|||
{
|
||||
update_curr_dl(rq);
|
||||
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
|
||||
/*
|
||||
* Even when we have runtime, update_curr_dl() might have resulted in us
|
||||
* not being the leftmost task anymore. In that case NEED_RESCHED will
|
||||
|
|
|
@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp)
|
|||
cmp += 3;
|
||||
}
|
||||
|
||||
for (i = 0; i < __SCHED_FEAT_NR; i++) {
|
||||
if (strcmp(cmp, sched_feat_names[i]) == 0) {
|
||||
if (neg) {
|
||||
sysctl_sched_features &= ~(1UL << i);
|
||||
sched_feat_disable(i);
|
||||
} else {
|
||||
sysctl_sched_features |= (1UL << i);
|
||||
sched_feat_enable(i);
|
||||
}
|
||||
break;
|
||||
}
|
||||
i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
|
||||
if (i < 0)
|
||||
return i;
|
||||
|
||||
if (neg) {
|
||||
sysctl_sched_features &= ~(1UL << i);
|
||||
sched_feat_disable(i);
|
||||
} else {
|
||||
sysctl_sched_features |= (1UL << i);
|
||||
sched_feat_enable(i);
|
||||
}
|
||||
|
||||
return i;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
|
@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
|
|||
{
|
||||
char buf[64];
|
||||
char *cmp;
|
||||
int i;
|
||||
int ret;
|
||||
struct inode *inode;
|
||||
|
||||
if (cnt > 63)
|
||||
|
@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
|
|||
/* Ensure the static_key remains in a consistent state */
|
||||
inode = file_inode(filp);
|
||||
inode_lock(inode);
|
||||
i = sched_feat_set(cmp);
|
||||
ret = sched_feat_set(cmp);
|
||||
inode_unlock(inode);
|
||||
if (i == __SCHED_FEAT_NR)
|
||||
return -EINVAL;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
*ppos += cnt;
|
||||
|
||||
|
@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
|
|||
unsigned long tpf, unsigned long gsf, unsigned long gpf)
|
||||
{
|
||||
SEQ_printf(m, "numa_faults node=%d ", node);
|
||||
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf);
|
||||
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf);
|
||||
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
|
||||
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
|
|||
return cfs_rq->rq;
|
||||
}
|
||||
|
||||
/* An entity is a task if it doesn't "own" a runqueue */
|
||||
#define entity_is_task(se) (!se->my_q)
|
||||
|
||||
static inline struct task_struct *task_of(struct sched_entity *se)
|
||||
{
|
||||
SCHED_WARN_ON(!entity_is_task(se));
|
||||
|
@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
|
|||
return container_of(cfs_rq, struct rq, cfs);
|
||||
}
|
||||
|
||||
#define entity_is_task(se) 1
|
||||
|
||||
#define for_each_sched_entity(se) \
|
||||
for (; se; se = NULL)
|
||||
|
@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#include "pelt.h"
|
||||
#include "sched-pelt.h"
|
||||
|
||||
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
|
||||
|
@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
|
|||
* To solve this problem, we also cap the util_avg of successive tasks to
|
||||
* only 1/2 of the left utilization budget:
|
||||
*
|
||||
* util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
|
||||
* util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
|
||||
*
|
||||
* where n denotes the nth task.
|
||||
* where n denotes the nth task and cpu_scale the CPU capacity.
|
||||
*
|
||||
* For example, a simplest series from the beginning would be like:
|
||||
* For example, for a CPU with 1024 of capacity, a simplest series from
|
||||
* the beginning would be like:
|
||||
*
|
||||
* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
|
||||
* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
|
||||
|
@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
|
|||
{
|
||||
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
struct sched_avg *sa = &se->avg;
|
||||
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
|
||||
long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
|
||||
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
|
||||
|
||||
if (cap > 0) {
|
||||
if (cfs_rq->avg.util_avg != 0) {
|
||||
|
@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
|
|||
* of each group. Skip other nodes.
|
||||
*/
|
||||
if (sched_numa_topology_type == NUMA_BACKPLANE &&
|
||||
dist > maxdist)
|
||||
dist >= maxdist)
|
||||
continue;
|
||||
|
||||
/* Add up the faults from nearby nodes. */
|
||||
|
@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
|
|||
|
||||
/* Cached statistics for all CPUs within a node */
|
||||
struct numa_stats {
|
||||
unsigned long nr_running;
|
||||
unsigned long load;
|
||||
|
||||
/* Total compute capacity of CPUs on a node */
|
||||
unsigned long compute_capacity;
|
||||
|
||||
/* Approximate capacity in terms of runnable tasks on a node */
|
||||
unsigned long task_capacity;
|
||||
int has_free_capacity;
|
||||
unsigned int nr_running;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
|
|||
* the @ns structure is NULL'ed and task_numa_compare() will
|
||||
* not find this node attractive.
|
||||
*
|
||||
* We'll either bail at !has_free_capacity, or we'll detect a huge
|
||||
* imbalance and bail there.
|
||||
* We'll detect a huge imbalance and bail there.
|
||||
*/
|
||||
if (!cpus)
|
||||
return;
|
||||
|
@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
|
|||
smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
|
||||
capacity = cpus / smt; /* cores */
|
||||
|
||||
ns->task_capacity = min_t(unsigned, capacity,
|
||||
capacity = min_t(unsigned, capacity,
|
||||
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
|
||||
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
|
||||
}
|
||||
|
||||
struct task_numa_env {
|
||||
|
@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
|
|||
src_capacity = env->src_stats.compute_capacity;
|
||||
dst_capacity = env->dst_stats.compute_capacity;
|
||||
|
||||
/* We care about the slope of the imbalance, not the direction. */
|
||||
if (dst_load < src_load)
|
||||
swap(dst_load, src_load);
|
||||
imb = abs(dst_load * src_capacity - src_load * dst_capacity);
|
||||
|
||||
/* Is the difference below the threshold? */
|
||||
imb = dst_load * src_capacity * 100 -
|
||||
src_load * dst_capacity * env->imbalance_pct;
|
||||
if (imb <= 0)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The imbalance is above the allowed threshold.
|
||||
* Compare it with the old imbalance.
|
||||
*/
|
||||
orig_src_load = env->src_stats.load;
|
||||
orig_dst_load = env->dst_stats.load;
|
||||
|
||||
if (orig_dst_load < orig_src_load)
|
||||
swap(orig_dst_load, orig_src_load);
|
||||
|
||||
old_imb = orig_dst_load * src_capacity * 100 -
|
||||
orig_src_load * dst_capacity * env->imbalance_pct;
|
||||
old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
|
||||
|
||||
/* Would this change make things worse? */
|
||||
return (imb > old_imb);
|
||||
|
@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
|
|||
* be exchanged with the source task
|
||||
*/
|
||||
static void task_numa_compare(struct task_numa_env *env,
|
||||
long taskimp, long groupimp)
|
||||
long taskimp, long groupimp, bool maymove)
|
||||
{
|
||||
struct rq *src_rq = cpu_rq(env->src_cpu);
|
||||
struct rq *dst_rq = cpu_rq(env->dst_cpu);
|
||||
struct task_struct *cur;
|
||||
long src_load, dst_load;
|
||||
|
@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,
|
|||
if (cur == env->p)
|
||||
goto unlock;
|
||||
|
||||
if (!cur) {
|
||||
if (maymove || imp > env->best_imp)
|
||||
goto assign;
|
||||
else
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* "imp" is the fault differential for the source task between the
|
||||
* source and destination node. Calculate the total differential for
|
||||
* the source task and potential destination task. The more negative
|
||||
* the value is, the more rmeote accesses that would be expected to
|
||||
* the value is, the more remote accesses that would be expected to
|
||||
* be incurred if the tasks were swapped.
|
||||
*/
|
||||
if (cur) {
|
||||
/* Skip this swap candidate if cannot move to the source CPU: */
|
||||
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* If dst and source tasks are in the same NUMA group, or not
|
||||
* in any group then look only at task weights.
|
||||
*/
|
||||
if (cur->numa_group == env->p->numa_group) {
|
||||
imp = taskimp + task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
/*
|
||||
* Add some hysteresis to prevent swapping the
|
||||
* tasks within a group over tiny differences.
|
||||
*/
|
||||
if (cur->numa_group)
|
||||
imp -= imp/16;
|
||||
} else {
|
||||
/*
|
||||
* Compare the group weights. If a task is all by
|
||||
* itself (not part of a group), use the task weight
|
||||
* instead.
|
||||
*/
|
||||
if (cur->numa_group)
|
||||
imp += group_weight(cur, env->src_nid, dist) -
|
||||
group_weight(cur, env->dst_nid, dist);
|
||||
else
|
||||
imp += task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
}
|
||||
}
|
||||
|
||||
if (imp <= env->best_imp && moveimp <= env->best_imp)
|
||||
/* Skip this swap candidate if cannot move to the source cpu */
|
||||
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
|
||||
goto unlock;
|
||||
|
||||
if (!cur) {
|
||||
/* Is there capacity at our destination? */
|
||||
if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
|
||||
!env->dst_stats.has_free_capacity)
|
||||
goto unlock;
|
||||
|
||||
goto balance;
|
||||
}
|
||||
|
||||
/* Balance doesn't matter much if we're running a task per CPU: */
|
||||
if (imp > env->best_imp && src_rq->nr_running == 1 &&
|
||||
dst_rq->nr_running == 1)
|
||||
goto assign;
|
||||
|
||||
/*
|
||||
* In the overloaded case, try and keep the load balanced.
|
||||
* If dst and source tasks are in the same NUMA group, or not
|
||||
* in any group then look only at task weights.
|
||||
*/
|
||||
balance:
|
||||
load = task_h_load(env->p);
|
||||
dst_load = env->dst_stats.load + load;
|
||||
src_load = env->src_stats.load - load;
|
||||
|
||||
if (moveimp > imp && moveimp > env->best_imp) {
|
||||
if (cur->numa_group == env->p->numa_group) {
|
||||
imp = taskimp + task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
/*
|
||||
* If the improvement from just moving env->p direction is
|
||||
* better than swapping tasks around, check if a move is
|
||||
* possible. Store a slightly smaller score than moveimp,
|
||||
* so an actually idle CPU will win.
|
||||
* Add some hysteresis to prevent swapping the
|
||||
* tasks within a group over tiny differences.
|
||||
*/
|
||||
if (!load_too_imbalanced(src_load, dst_load, env)) {
|
||||
imp = moveimp - 1;
|
||||
cur = NULL;
|
||||
goto assign;
|
||||
}
|
||||
if (cur->numa_group)
|
||||
imp -= imp / 16;
|
||||
} else {
|
||||
/*
|
||||
* Compare the group weights. If a task is all by itself
|
||||
* (not part of a group), use the task weight instead.
|
||||
*/
|
||||
if (cur->numa_group && env->p->numa_group)
|
||||
imp += group_weight(cur, env->src_nid, dist) -
|
||||
group_weight(cur, env->dst_nid, dist);
|
||||
else
|
||||
imp += task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
}
|
||||
|
||||
if (imp <= env->best_imp)
|
||||
goto unlock;
|
||||
|
||||
if (cur) {
|
||||
load = task_h_load(cur);
|
||||
dst_load -= load;
|
||||
src_load += load;
|
||||
if (maymove && moveimp > imp && moveimp > env->best_imp) {
|
||||
imp = moveimp - 1;
|
||||
cur = NULL;
|
||||
goto assign;
|
||||
}
|
||||
|
||||
/*
|
||||
* In the overloaded case, try and keep the load balanced.
|
||||
*/
|
||||
load = task_h_load(env->p) - task_h_load(cur);
|
||||
if (!load)
|
||||
goto assign;
|
||||
|
||||
dst_load = env->dst_stats.load + load;
|
||||
src_load = env->src_stats.load - load;
|
||||
|
||||
if (load_too_imbalanced(src_load, dst_load, env))
|
||||
goto unlock;
|
||||
|
||||
assign:
|
||||
/*
|
||||
* One idle CPU per node is evaluated for a task numa move.
|
||||
* Call select_idle_sibling to maybe find a better one.
|
||||
|
@ -1711,7 +1663,6 @@ static void task_numa_compare(struct task_numa_env *env,
|
|||
local_irq_enable();
|
||||
}
|
||||
|
||||
assign:
|
||||
task_numa_assign(env, cur, imp);
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
|
@ -1720,43 +1671,30 @@ static void task_numa_compare(struct task_numa_env *env,
|
|||
static void task_numa_find_cpu(struct task_numa_env *env,
|
||||
long taskimp, long groupimp)
|
||||
{
|
||||
long src_load, dst_load, load;
|
||||
bool maymove = false;
|
||||
int cpu;
|
||||
|
||||
load = task_h_load(env->p);
|
||||
dst_load = env->dst_stats.load + load;
|
||||
src_load = env->src_stats.load - load;
|
||||
|
||||
/*
|
||||
* If the improvement from just moving env->p direction is better
|
||||
* than swapping tasks around, check if a move is possible.
|
||||
*/
|
||||
maymove = !load_too_imbalanced(src_load, dst_load, env);
|
||||
|
||||
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
|
||||
/* Skip this CPU if the source task cannot migrate */
|
||||
if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
|
||||
continue;
|
||||
|
||||
env->dst_cpu = cpu;
|
||||
task_numa_compare(env, taskimp, groupimp);
|
||||
task_numa_compare(env, taskimp, groupimp, maymove);
|
||||
}
|
||||
}
|
||||
|
||||
/* Only move tasks to a NUMA node less busy than the current node. */
|
||||
static bool numa_has_capacity(struct task_numa_env *env)
|
||||
{
|
||||
struct numa_stats *src = &env->src_stats;
|
||||
struct numa_stats *dst = &env->dst_stats;
|
||||
|
||||
if (src->has_free_capacity && !dst->has_free_capacity)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Only consider a task move if the source has a higher load
|
||||
* than the destination, corrected for CPU capacity on each node.
|
||||
*
|
||||
* src->load dst->load
|
||||
* --------------------- vs ---------------------
|
||||
* src->compute_capacity dst->compute_capacity
|
||||
*/
|
||||
if (src->load * dst->compute_capacity * env->imbalance_pct >
|
||||
|
||||
dst->load * src->compute_capacity * 100)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int task_numa_migrate(struct task_struct *p)
|
||||
{
|
||||
struct task_numa_env env = {
|
||||
|
@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
|
|||
* elsewhere, so there is no point in (re)trying.
|
||||
*/
|
||||
if (unlikely(!sd)) {
|
||||
p->numa_preferred_nid = task_node(p);
|
||||
sched_setnuma(p, task_node(p));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)
|
|||
update_numa_stats(&env.dst_stats, env.dst_nid);
|
||||
|
||||
/* Try to find a spot on the preferred nid. */
|
||||
if (numa_has_capacity(&env))
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
|
||||
/*
|
||||
* Look at other nodes in these cases:
|
||||
|
@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)
|
|||
env.dist = dist;
|
||||
env.dst_nid = nid;
|
||||
update_numa_stats(&env.dst_stats, env.dst_nid);
|
||||
if (numa_has_capacity(&env))
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
|
|||
* trying for a better one later. Do not set the preferred node here.
|
||||
*/
|
||||
if (p->numa_group) {
|
||||
struct numa_group *ng = p->numa_group;
|
||||
|
||||
if (env.best_cpu == -1)
|
||||
nid = env.src_nid;
|
||||
else
|
||||
nid = env.dst_nid;
|
||||
nid = cpu_to_node(env.best_cpu);
|
||||
|
||||
if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
|
||||
sched_setnuma(p, env.dst_nid);
|
||||
if (nid != p->numa_preferred_nid)
|
||||
sched_setnuma(p, nid);
|
||||
}
|
||||
|
||||
/* No better CPU than the current one was found. */
|
||||
|
@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
|
|||
return ret;
|
||||
}
|
||||
|
||||
ret = migrate_swap(p, env.best_task);
|
||||
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
|
||||
|
||||
if (ret != 0)
|
||||
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
|
||||
put_task_struct(env.best_task);
|
||||
|
@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
|
|||
|
||||
static void task_numa_placement(struct task_struct *p)
|
||||
{
|
||||
int seq, nid, max_nid = -1, max_group_nid = -1;
|
||||
unsigned long max_faults = 0, max_group_faults = 0;
|
||||
int seq, nid, max_nid = -1;
|
||||
unsigned long max_faults = 0;
|
||||
unsigned long fault_types[2] = { 0, 0 };
|
||||
unsigned long total_faults;
|
||||
u64 runtime, period;
|
||||
|
@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
|
|||
}
|
||||
}
|
||||
|
||||
if (faults > max_faults) {
|
||||
max_faults = faults;
|
||||
if (!p->numa_group) {
|
||||
if (faults > max_faults) {
|
||||
max_faults = faults;
|
||||
max_nid = nid;
|
||||
}
|
||||
} else if (group_faults > max_faults) {
|
||||
max_faults = group_faults;
|
||||
max_nid = nid;
|
||||
}
|
||||
|
||||
if (group_faults > max_group_faults) {
|
||||
max_group_faults = group_faults;
|
||||
max_group_nid = nid;
|
||||
}
|
||||
}
|
||||
|
||||
update_task_scan_period(p, fault_types[0], fault_types[1]);
|
||||
|
||||
if (p->numa_group) {
|
||||
numa_group_count_active_nodes(p->numa_group);
|
||||
spin_unlock_irq(group_lock);
|
||||
max_nid = preferred_group_nid(p, max_group_nid);
|
||||
max_nid = preferred_group_nid(p, max_nid);
|
||||
}
|
||||
|
||||
if (max_faults) {
|
||||
/* Set the new preferred node */
|
||||
if (max_nid != p->numa_preferred_nid)
|
||||
sched_setnuma(p, max_nid);
|
||||
|
||||
if (task_node(p) != p->numa_preferred_nid)
|
||||
numa_migrate_preferred(p);
|
||||
}
|
||||
|
||||
update_task_scan_period(p, fault_types[0], fault_types[1]);
|
||||
}
|
||||
|
||||
static inline int get_numa_group(struct numa_group *grp)
|
||||
|
@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
|||
numa_is_active_node(mem_node, ng))
|
||||
local = 1;
|
||||
|
||||
task_numa_placement(p);
|
||||
|
||||
/*
|
||||
* Retry task to preferred node migration periodically, in case it
|
||||
* previously failed, or the scheduler moved us.
|
||||
*/
|
||||
if (time_after(jiffies, p->numa_migrate_retry))
|
||||
if (time_after(jiffies, p->numa_migrate_retry)) {
|
||||
task_numa_placement(p);
|
||||
numa_migrate_preferred(p);
|
||||
}
|
||||
|
||||
if (migrated)
|
||||
p->numa_pages_migrated += pages;
|
||||
|
@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
} while (0)
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* XXX we want to get rid of these helpers and use the full load resolution.
|
||||
*/
|
||||
static inline long se_weight(struct sched_entity *se)
|
||||
{
|
||||
return scale_load_down(se->load.weight);
|
||||
}
|
||||
|
||||
static inline long se_runnable(struct sched_entity *se)
|
||||
{
|
||||
return scale_load_down(se->runnable_weight);
|
||||
}
|
||||
|
||||
static inline void
|
||||
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
|
@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Approximate:
|
||||
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
|
||||
*/
|
||||
static u64 decay_load(u64 val, u64 n)
|
||||
{
|
||||
unsigned int local_n;
|
||||
|
||||
if (unlikely(n > LOAD_AVG_PERIOD * 63))
|
||||
return 0;
|
||||
|
||||
/* after bounds checking we can collapse to 32-bit */
|
||||
local_n = n;
|
||||
|
||||
/*
|
||||
* As y^PERIOD = 1/2, we can combine
|
||||
* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
|
||||
* With a look-up table which covers y^n (n<PERIOD)
|
||||
*
|
||||
* To achieve constant time decay_load.
|
||||
*/
|
||||
if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
|
||||
val >>= local_n / LOAD_AVG_PERIOD;
|
||||
local_n %= LOAD_AVG_PERIOD;
|
||||
}
|
||||
|
||||
val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
|
||||
return val;
|
||||
}
|
||||
|
||||
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
|
||||
{
|
||||
u32 c1, c2, c3 = d3; /* y^0 == 1 */
|
||||
|
||||
/*
|
||||
* c1 = d1 y^p
|
||||
*/
|
||||
c1 = decay_load((u64)d1, periods);
|
||||
|
||||
/*
|
||||
* p-1
|
||||
* c2 = 1024 \Sum y^n
|
||||
* n=1
|
||||
*
|
||||
* inf inf
|
||||
* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
|
||||
* n=0 n=p
|
||||
*/
|
||||
c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
|
||||
|
||||
return c1 + c2 + c3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Accumulate the three separate parts of the sum; d1 the remainder
|
||||
* of the last (incomplete) period, d2 the span of full periods and d3
|
||||
* the remainder of the (incomplete) current period.
|
||||
*
|
||||
* d1 d2 d3
|
||||
* ^ ^ ^
|
||||
* | | |
|
||||
* |<->|<----------------->|<--->|
|
||||
* ... |---x---|------| ... |------|-----x (now)
|
||||
*
|
||||
* p-1
|
||||
* u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
|
||||
* n=1
|
||||
*
|
||||
* = u y^p + (Step 1)
|
||||
*
|
||||
* p-1
|
||||
* d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
|
||||
* n=1
|
||||
*/
|
||||
static __always_inline u32
|
||||
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
||||
unsigned long load, unsigned long runnable, int running)
|
||||
{
|
||||
unsigned long scale_freq, scale_cpu;
|
||||
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
|
||||
u64 periods;
|
||||
|
||||
scale_freq = arch_scale_freq_capacity(cpu);
|
||||
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
|
||||
|
||||
delta += sa->period_contrib;
|
||||
periods = delta / 1024; /* A period is 1024us (~1ms) */
|
||||
|
||||
/*
|
||||
* Step 1: decay old *_sum if we crossed period boundaries.
|
||||
*/
|
||||
if (periods) {
|
||||
sa->load_sum = decay_load(sa->load_sum, periods);
|
||||
sa->runnable_load_sum =
|
||||
decay_load(sa->runnable_load_sum, periods);
|
||||
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
|
||||
|
||||
/*
|
||||
* Step 2
|
||||
*/
|
||||
delta %= 1024;
|
||||
contrib = __accumulate_pelt_segments(periods,
|
||||
1024 - sa->period_contrib, delta);
|
||||
}
|
||||
sa->period_contrib = delta;
|
||||
|
||||
contrib = cap_scale(contrib, scale_freq);
|
||||
if (load)
|
||||
sa->load_sum += load * contrib;
|
||||
if (runnable)
|
||||
sa->runnable_load_sum += runnable * contrib;
|
||||
if (running)
|
||||
sa->util_sum += contrib * scale_cpu;
|
||||
|
||||
return periods;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can represent the historical contribution to runnable average as the
|
||||
* coefficients of a geometric series. To do this we sub-divide our runnable
|
||||
* history into segments of approximately 1ms (1024us); label the segment that
|
||||
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
|
||||
*
|
||||
* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
|
||||
* p0 p1 p2
|
||||
* (now) (~1ms ago) (~2ms ago)
|
||||
*
|
||||
* Let u_i denote the fraction of p_i that the entity was runnable.
|
||||
*
|
||||
* We then designate the fractions u_i as our co-efficients, yielding the
|
||||
* following representation of historical load:
|
||||
* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
|
||||
*
|
||||
* We choose y based on the width of a reasonable scheduling period, fixing:
|
||||
* y^32 = 0.5
|
||||
*
|
||||
* This means that the contribution to load ~32ms ago (u_32) will be weighted
|
||||
* approximately half as much as the contribution to load within the last ms
|
||||
* (u_0).
|
||||
*
|
||||
* When a period "rolls over" and we have new u_0`, multiplying the previous
|
||||
* sum again by y is sufficient to update:
|
||||
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
|
||||
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
||||
*/
|
||||
static __always_inline int
|
||||
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
|
||||
unsigned long load, unsigned long runnable, int running)
|
||||
{
|
||||
u64 delta;
|
||||
|
||||
delta = now - sa->last_update_time;
|
||||
/*
|
||||
* This should only happen when time goes backwards, which it
|
||||
* unfortunately does during sched clock init when we swap over to TSC.
|
||||
*/
|
||||
if ((s64)delta < 0) {
|
||||
sa->last_update_time = now;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use 1024ns as the unit of measurement since it's a reasonable
|
||||
* approximation of 1us and fast to compute.
|
||||
*/
|
||||
delta >>= 10;
|
||||
if (!delta)
|
||||
return 0;
|
||||
|
||||
sa->last_update_time += delta << 10;
|
||||
|
||||
/*
|
||||
* running is a subset of runnable (weight) so running can't be set if
|
||||
* runnable is clear. But there are some corner cases where the current
|
||||
* se has been already dequeued but cfs_rq->curr still points to it.
|
||||
* This means that weight will be 0 but not running for a sched_entity
|
||||
* but also for a cfs_rq if the latter becomes idle. As an example,
|
||||
* this happens during idle_balance() which calls
|
||||
* update_blocked_averages()
|
||||
*/
|
||||
if (!load)
|
||||
runnable = running = 0;
|
||||
|
||||
/*
|
||||
* Now we know we crossed measurement unit boundaries. The *_avg
|
||||
* accrues by two steps:
|
||||
*
|
||||
* Step 1: accumulate *_sum since last_update_time. If we haven't
|
||||
* crossed period boundaries, finish.
|
||||
*/
|
||||
if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
|
||||
{
|
||||
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
||||
|
||||
/*
|
||||
* Step 2: update *_avg.
|
||||
*/
|
||||
sa->load_avg = div_u64(load * sa->load_sum, divider);
|
||||
sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
|
||||
sa->util_avg = sa->util_sum / divider;
|
||||
}
|
||||
|
||||
/*
|
||||
* When a task is dequeued, its estimated utilization should not be updated if
|
||||
* its util_avg has not been updated at least once.
|
||||
* This flag is used to synchronize util_avg updates with util_est updates.
|
||||
* We map this information into the LSB bit of the utilization saved at
|
||||
* dequeue time (i.e. util_est.enqueued).
|
||||
*/
|
||||
#define UTIL_AVG_UNCHANGED 0x1
|
||||
|
||||
static inline void cfs_se_util_change(struct sched_avg *avg)
|
||||
{
|
||||
unsigned int enqueued;
|
||||
|
||||
if (!sched_feat(UTIL_EST))
|
||||
return;
|
||||
|
||||
/* Avoid store if the flag has been already set */
|
||||
enqueued = avg->util_est.enqueued;
|
||||
if (!(enqueued & UTIL_AVG_UNCHANGED))
|
||||
return;
|
||||
|
||||
/* Reset flag to report util_avg has been updated */
|
||||
enqueued &= ~UTIL_AVG_UNCHANGED;
|
||||
WRITE_ONCE(avg->util_est.enqueued, enqueued);
|
||||
}
|
||||
|
||||
/*
|
||||
* sched_entity:
|
||||
*
|
||||
* task:
|
||||
* se_runnable() == se_weight()
|
||||
*
|
||||
* group: [ see update_cfs_group() ]
|
||||
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
|
||||
* se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
|
||||
*
|
||||
* load_sum := runnable_sum
|
||||
* load_avg = se_weight(se) * runnable_avg
|
||||
*
|
||||
* runnable_load_sum := runnable_sum
|
||||
* runnable_load_avg = se_runnable(se) * runnable_avg
|
||||
*
|
||||
* XXX collapse load_sum and runnable_load_sum
|
||||
*
|
||||
* cfs_rq:
|
||||
*
|
||||
* load_sum = \Sum se_weight(se) * se->avg.load_sum
|
||||
* load_avg = \Sum se->avg.load_avg
|
||||
*
|
||||
* runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
|
||||
* runnable_load_avg = \Sum se->avg.runnable_load_avg
|
||||
*/
|
||||
|
||||
static int
|
||||
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
||||
{
|
||||
if (entity_is_task(se))
|
||||
se->runnable_weight = se->load.weight;
|
||||
|
||||
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
if (entity_is_task(se))
|
||||
se->runnable_weight = se->load.weight;
|
||||
|
||||
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
|
||||
cfs_rq->curr == se)) {
|
||||
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
cfs_se_util_change(&se->avg);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (___update_load_sum(now, cpu, &cfs_rq->avg,
|
||||
scale_load_down(cfs_rq->load.weight),
|
||||
scale_load_down(cfs_rq->runnable_weight),
|
||||
cfs_rq->curr != NULL)) {
|
||||
|
||||
___update_load_avg(&cfs_rq->avg, 1, 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
/**
|
||||
* update_tg_load_avg - update the tg's load avg
|
||||
|
@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
|
|||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
static inline int
|
||||
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define UPDATE_TG 0x0
|
||||
#define SKIP_AGE_LOAD 0x0
|
||||
#define DO_ATTACH 0x0
|
||||
|
@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
|
|||
throttled_hierarchy(dest_cfs_rq);
|
||||
}
|
||||
|
||||
/* updated child weight may affect parent so we have to do this bottom up */
|
||||
static int tg_unthrottle_up(struct task_group *tg, void *data)
|
||||
{
|
||||
struct rq *rq = data;
|
||||
|
@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
|
|||
|
||||
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
||||
}
|
||||
|
||||
sched_avg_update(this_rq);
|
||||
}
|
||||
|
||||
/* Used instead of source_load when we know the type == 0 */
|
||||
|
@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
|
|||
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
||||
unsigned long src_faults, dst_faults;
|
||||
int src_nid, dst_nid;
|
||||
unsigned long src_weight, dst_weight;
|
||||
int src_nid, dst_nid, dist;
|
||||
|
||||
if (!static_branch_likely(&sched_numa_balancing))
|
||||
return -1;
|
||||
|
@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|||
return 0;
|
||||
|
||||
/* Leaving a core idle is often worse than degrading locality. */
|
||||
if (env->idle != CPU_NOT_IDLE)
|
||||
if (env->idle == CPU_IDLE)
|
||||
return -1;
|
||||
|
||||
dist = node_distance(src_nid, dst_nid);
|
||||
if (numa_group) {
|
||||
src_faults = group_faults(p, src_nid);
|
||||
dst_faults = group_faults(p, dst_nid);
|
||||
src_weight = group_weight(p, src_nid, dist);
|
||||
dst_weight = group_weight(p, dst_nid, dist);
|
||||
} else {
|
||||
src_faults = task_faults(p, src_nid);
|
||||
dst_faults = task_faults(p, dst_nid);
|
||||
src_weight = task_weight(p, src_nid, dist);
|
||||
dst_weight = task_weight(p, dst_nid, dist);
|
||||
}
|
||||
|
||||
return dst_faults < src_faults;
|
||||
return dst_weight < src_weight;
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline bool others_have_blocked(struct rq *rq)
|
||||
{
|
||||
if (READ_ONCE(rq->avg_rt.util_avg))
|
||||
return true;
|
||||
|
||||
if (READ_ONCE(rq->avg_dl.util_avg))
|
||||
return true;
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
if (READ_ONCE(rq->avg_irq.util_avg))
|
||||
return true;
|
||||
#endif
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
|
||||
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
|
||||
|
@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu)
|
|||
if (cfs_rq_has_blocked(cfs_rq))
|
||||
done = false;
|
||||
}
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
update_irq_load_avg(rq, 0);
|
||||
/* Don't need periodic decay once load/util_avg are null */
|
||||
if (others_have_blocked(rq))
|
||||
done = false;
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
rq->last_blocked_load_update_tick = jiffies;
|
||||
|
@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu)
|
|||
rq_lock_irqsave(rq, &rf);
|
||||
update_rq_clock(rq);
|
||||
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
update_irq_load_avg(rq, 0);
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
rq->last_blocked_load_update_tick = jiffies;
|
||||
if (!cfs_rq_has_blocked(cfs_rq))
|
||||
if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
|
||||
rq->has_blocked_load = 0;
|
||||
#endif
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
|
@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
|
|||
static unsigned long scale_rt_capacity(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
u64 total, used, age_stamp, avg;
|
||||
s64 delta;
|
||||
unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
|
||||
unsigned long used, free;
|
||||
unsigned long irq;
|
||||
|
||||
/*
|
||||
* Since we're reading these variables without serialization make sure
|
||||
* we read them once before doing sanity checks on them.
|
||||
*/
|
||||
age_stamp = READ_ONCE(rq->age_stamp);
|
||||
avg = READ_ONCE(rq->rt_avg);
|
||||
delta = __rq_clock_broken(rq) - age_stamp;
|
||||
irq = cpu_util_irq(rq);
|
||||
|
||||
if (unlikely(delta < 0))
|
||||
delta = 0;
|
||||
if (unlikely(irq >= max))
|
||||
return 1;
|
||||
|
||||
total = sched_avg_period() + delta;
|
||||
used = READ_ONCE(rq->avg_rt.util_avg);
|
||||
used += READ_ONCE(rq->avg_dl.util_avg);
|
||||
|
||||
used = div_u64(avg, total);
|
||||
if (unlikely(used >= max))
|
||||
return 1;
|
||||
|
||||
if (likely(used < SCHED_CAPACITY_SCALE))
|
||||
return SCHED_CAPACITY_SCALE - used;
|
||||
free = max - used;
|
||||
|
||||
return 1;
|
||||
return scale_irq_capacity(free, irq, max);
|
||||
}
|
||||
|
||||
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
||||
{
|
||||
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
|
||||
unsigned long capacity = scale_rt_capacity(cpu);
|
||||
struct sched_group *sdg = sd->groups;
|
||||
|
||||
cpu_rq(cpu)->cpu_capacity_orig = capacity;
|
||||
|
||||
capacity *= scale_rt_capacity(cpu);
|
||||
capacity >>= SCHED_CAPACITY_SHIFT;
|
||||
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
|
||||
|
||||
if (!capacity)
|
||||
capacity = 1;
|
||||
|
|
399
kernel/sched/pelt.c
Normal file
399
kernel/sched/pelt.c
Normal file
|
@ -0,0 +1,399 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Per Entity Load Tracking
|
||||
*
|
||||
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
* Interactivity improvements by Mike Galbraith
|
||||
* (C) 2007 Mike Galbraith <efault@gmx.de>
|
||||
*
|
||||
* Various enhancements by Dmitry Adamushko.
|
||||
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
|
||||
*
|
||||
* Group scheduling enhancements by Srivatsa Vaddagiri
|
||||
* Copyright IBM Corporation, 2007
|
||||
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
|
||||
*
|
||||
* Scaled math optimizations by Thomas Gleixner
|
||||
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
|
||||
*
|
||||
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
|
||||
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
|
||||
*
|
||||
* Move PELT related code from fair.c into this pelt.c file
|
||||
* Author: Vincent Guittot <vincent.guittot@linaro.org>
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include "sched.h"
|
||||
#include "sched-pelt.h"
|
||||
#include "pelt.h"
|
||||
|
||||
/*
|
||||
* Approximate:
|
||||
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
|
||||
*/
|
||||
/*
 * decay_load - approximate val * y^n, where y^LOAD_AVG_PERIOD == 1/2
 * @val: value to decay
 * @n:   number of periods to decay over
 *
 * Whole multiples of LOAD_AVG_PERIOD are applied as right shifts, the
 * remainder through the runnable_avg_yN_inv[] look-up table, so the cost is
 * constant regardless of @n.
 */
static u64 decay_load(u64 val, u64 n)
{
	unsigned int idx, halvings;

	/* More than 63 halvings: report the value as fully decayed. */
	if (unlikely(n > LOAD_AVG_PERIOD * 63))
		return 0;

	/* The bound check above makes the narrowing to 32 bits safe. */
	idx = n;

	if (unlikely(idx >= LOAD_AVG_PERIOD)) {
		halvings = idx / LOAD_AVG_PERIOD;
		idx %= LOAD_AVG_PERIOD;
		val >>= halvings;
	}

	return mul_u64_u32_shr(val, runnable_avg_yN_inv[idx], 32);
}
|
||||
|
||||
/*
 * Sum the three segments of a span that crosses @periods period boundaries:
 *
 *   head   = d1 * y^periods                  (rest of the oldest period)
 *   middle = 1024 * \Sum_{n=1}^{periods-1} y^n   (the full periods)
 *   tail   = d3 * y^0                        (the current, partial period)
 */
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
	u32 head = decay_load((u64)d1, periods);
	/*
	 * The finite sum is obtained from the infinite one:
	 *   1024 * (\Sum_{n>=0} y^n - \Sum_{n>=periods} y^n - y^0)
	 */
	u32 middle = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

	return head + middle + d3;
}
|
||||
|
||||
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
|
||||
|
||||
/*
|
||||
* Accumulate the three separate parts of the sum; d1 the remainder
|
||||
* of the last (incomplete) period, d2 the span of full periods and d3
|
||||
* the remainder of the (incomplete) current period.
|
||||
*
|
||||
* d1 d2 d3
|
||||
* ^ ^ ^
|
||||
* | | |
|
||||
* |<->|<----------------->|<--->|
|
||||
* ... |---x---|------| ... |------|-----x (now)
|
||||
*
|
||||
* p-1
|
||||
* u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
|
||||
* n=1
|
||||
*
|
||||
* = u y^p + (Step 1)
|
||||
*
|
||||
* p-1
|
||||
* d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
|
||||
* n=1
|
||||
*/
|
||||
static __always_inline u32
|
||||
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
||||
unsigned long load, unsigned long runnable, int running)
|
||||
{
|
||||
unsigned long scale_freq, scale_cpu;
|
||||
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
|
||||
u64 periods;
|
||||
|
||||
scale_freq = arch_scale_freq_capacity(cpu);
|
||||
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
|
||||
|
||||
delta += sa->period_contrib;
|
||||
periods = delta / 1024; /* A period is 1024us (~1ms) */
|
||||
|
||||
/*
|
||||
* Step 1: decay old *_sum if we crossed period boundaries.
|
||||
*/
|
||||
if (periods) {
|
||||
sa->load_sum = decay_load(sa->load_sum, periods);
|
||||
sa->runnable_load_sum =
|
||||
decay_load(sa->runnable_load_sum, periods);
|
||||
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
|
||||
|
||||
/*
|
||||
* Step 2
|
||||
*/
|
||||
delta %= 1024;
|
||||
contrib = __accumulate_pelt_segments(periods,
|
||||
1024 - sa->period_contrib, delta);
|
||||
}
|
||||
sa->period_contrib = delta;
|
||||
|
||||
contrib = cap_scale(contrib, scale_freq);
|
||||
if (load)
|
||||
sa->load_sum += load * contrib;
|
||||
if (runnable)
|
||||
sa->runnable_load_sum += runnable * contrib;
|
||||
if (running)
|
||||
sa->util_sum += contrib * scale_cpu;
|
||||
|
||||
return periods;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can represent the historical contribution to runnable average as the
|
||||
* coefficients of a geometric series. To do this we sub-divide our runnable
|
||||
* history into segments of approximately 1ms (1024us); label the segment that
|
||||
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
|
||||
*
|
||||
* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
|
||||
* p0 p1 p2
|
||||
* (now) (~1ms ago) (~2ms ago)
|
||||
*
|
||||
* Let u_i denote the fraction of p_i that the entity was runnable.
|
||||
*
|
||||
* We then designate the fractions u_i as our co-efficients, yielding the
|
||||
* following representation of historical load:
|
||||
* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
|
||||
*
|
||||
* We choose y based on the width of a reasonable scheduling period, fixing:
|
||||
* y^32 = 0.5
|
||||
*
|
||||
* This means that the contribution to load ~32ms ago (u_32) will be weighted
|
||||
* approximately half as much as the contribution to load within the last ms
|
||||
* (u_0).
|
||||
*
|
||||
* When a period "rolls over" and we have new u_0`, multiplying the previous
|
||||
* sum again by y is sufficient to update:
|
||||
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
|
||||
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
||||
*/
|
||||
static __always_inline int
|
||||
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
|
||||
unsigned long load, unsigned long runnable, int running)
|
||||
{
|
||||
u64 delta;
|
||||
|
||||
delta = now - sa->last_update_time;
|
||||
/*
|
||||
* This should only happen when time goes backwards, which it
|
||||
* unfortunately does during sched clock init when we swap over to TSC.
|
||||
*/
|
||||
if ((s64)delta < 0) {
|
||||
sa->last_update_time = now;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use 1024ns as the unit of measurement since it's a reasonable
|
||||
* approximation of 1us and fast to compute.
|
||||
*/
|
||||
delta >>= 10;
|
||||
if (!delta)
|
||||
return 0;
|
||||
|
||||
sa->last_update_time += delta << 10;
|
||||
|
||||
/*
|
||||
* running is a subset of runnable (weight) so running can't be set if
|
||||
* runnable is clear. But there are some corner cases where the current
|
||||
* se has been already dequeued but cfs_rq->curr still points to it.
|
||||
* This means that weight will be 0 but not running for a sched_entity
|
||||
* but also for a cfs_rq if the latter becomes idle. As an example,
|
||||
* this happens during idle_balance() which calls
|
||||
* update_blocked_averages()
|
||||
*/
|
||||
if (!load)
|
||||
runnable = running = 0;
|
||||
|
||||
/*
|
||||
* Now we know we crossed measurement unit boundaries. The *_avg
|
||||
* accrues by two steps:
|
||||
*
|
||||
* Step 1: accumulate *_sum since last_update_time. If we haven't
|
||||
* crossed period boundaries, finish.
|
||||
*/
|
||||
if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
|
||||
{
|
||||
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
||||
|
||||
/*
|
||||
* Step 2: update *_avg.
|
||||
*/
|
||||
sa->load_avg = div_u64(load * sa->load_sum, divider);
|
||||
sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
|
||||
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
|
||||
}
|
||||
|
||||
/*
|
||||
* sched_entity:
|
||||
*
|
||||
* task:
|
||||
* se_runnable() == se_weight()
|
||||
*
|
||||
* group: [ see update_cfs_group() ]
|
||||
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
|
||||
* se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
|
||||
*
|
||||
* load_sum := runnable_sum
|
||||
* load_avg = se_weight(se) * runnable_avg
|
||||
*
|
||||
* runnable_load_sum := runnable_sum
|
||||
* runnable_load_avg = se_runnable(se) * runnable_avg
|
||||
*
|
||||
* XXX collapse load_sum and runnable_load_sum
|
||||
*
|
||||
* cfs_rq:
|
||||
*
|
||||
* load_sum = \Sum se_weight(se) * se->avg.load_sum
|
||||
* load_avg = \Sum se->avg.load_avg
|
||||
*
|
||||
* runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
|
||||
* runnable_load_avg = \Sum se->avg.runnable_load_avg
|
||||
*/
|
||||
|
||||
/*
 * Update the load tracking of @se up to @now while feeding zero load,
 * runnable and running contributions into the sums.
 *
 * Returns 1 when the *_avg values were recomputed, 0 otherwise.
 */
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
	/* For a task the runnable weight simply mirrors the load weight. */
	if (entity_is_task(se))
		se->runnable_weight = se->load.weight;

	if (!___update_load_sum(now, cpu, &se->avg, 0, 0, 0))
		return 0;

	___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
	return 1;
}
|
||||
|
||||
/*
 * Update the load tracking of @se up to @now. The entity contributes load
 * and runnable time while it is on a runqueue (se->on_rq) and running time
 * while it is @cfs_rq's current entity.
 *
 * Returns 1 when the *_avg values were recomputed, 0 otherwise.
 */
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* For a task the runnable weight simply mirrors the load weight. */
	if (entity_is_task(se))
		se->runnable_weight = se->load.weight;

	if (!___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
				cfs_rq->curr == se))
		return 0;

	___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
	/* Let util_est know that util_avg has changed. */
	cfs_se_util_change(&se->avg);
	return 1;
}
|
||||
|
||||
/*
 * Update the load tracking of @cfs_rq up to @now, using its scaled-down load
 * and runnable weights and treating it as running whenever it has a current
 * entity.
 *
 * Returns 1 when the *_avg values were recomputed, 0 otherwise.
 */
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
	if (!___update_load_sum(now, cpu, &cfs_rq->avg,
				scale_load_down(cfs_rq->load.weight),
				scale_load_down(cfs_rq->runnable_weight),
				cfs_rq->curr != NULL))
		return 0;

	/* The weights are already folded into the sums, hence factors of 1. */
	___update_load_avg(&cfs_rq->avg, 1, 1);
	return 1;
}
|
||||
|
||||
/*
|
||||
* rt_rq:
|
||||
*
|
||||
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
|
||||
* util_sum = cpu_scale * load_sum
|
||||
* runnable_load_sum = load_sum
|
||||
*
|
||||
* load_avg and runnable_load_avg are not supported and meaningless.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Update rq->avg_rt up to @now. @running is used for the load, runnable and
 * running inputs alike.
 *
 * Returns 1 when the averages were recomputed, 0 otherwise.
 */
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
	struct sched_avg *sa = &rq->avg_rt;

	if (!___update_load_sum(now, rq->cpu, sa, running, running, running))
		return 0;

	___update_load_avg(sa, 1, 1);
	return 1;
}
|
||||
|
||||
/*
|
||||
* dl_rq:
|
||||
*
|
||||
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
|
||||
* util_sum = cpu_scale * load_sum
|
||||
* runnable_load_sum = load_sum
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Update rq->avg_dl up to @now. @running is used for the load, runnable and
 * running inputs alike.
 *
 * Returns 1 when the averages were recomputed, 0 otherwise.
 */
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
	struct sched_avg *sa = &rq->avg_dl;

	if (!___update_load_sum(now, rq->cpu, sa, running, running, running))
		return 0;

	___update_load_avg(sa, 1, 1);
	return 1;
}
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
/*
|
||||
* irq:
|
||||
*
|
||||
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
|
||||
* util_sum = cpu_scale * load_sum
|
||||
* runnable_load_sum = load_sum
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Update rq->avg_irq with @running units of interrupt time spent since the
 * last update.
 *
 * Returns non-zero when the averages were recomputed, 0 otherwise.
 */
int update_irq_load_avg(struct rq *rq, u64 running)
{
	struct sched_avg *sa = &rq->avg_irq;
	int decayed, accrued, ret;

	/*
	 * We know how much time interrupts have used since the last update,
	 * but not when they used it. Be pessimistic and assume the interrupt
	 * happened just before this update. That is not far from reality: an
	 * interrupt will most probably wake up a task and trigger an update
	 * of the rq clock, during which this metric is updated.
	 *
	 * So first decay over the normal-context time, then add the
	 * interrupt-context time. Subtracting @running from rq->clock is
	 * safe because rq->clock += delta with delta >= running.
	 */
	decayed = ___update_load_sum(rq->clock - running, rq->cpu, sa, 0, 0, 0);
	accrued = ___update_load_sum(rq->clock, rq->cpu, sa, 1, 1, 1);
	ret = decayed + accrued;

	if (ret)
		___update_load_avg(sa, 1, 1);

	return ret;
}
|
||||
#endif
|
72
kernel/sched/pelt.h
Normal file
72
kernel/sched/pelt.h
Normal file
|
@ -0,0 +1,72 @@
|
|||
#ifdef CONFIG_SMP
|
||||
|
||||
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
|
||||
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
|
||||
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
|
||||
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
int update_irq_load_avg(struct rq *rq, u64 running);
|
||||
#else
|
||||
static inline int
|
||||
update_irq_load_avg(struct rq *rq, u64 running)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When a task is dequeued, its estimated utilization should not be updated if
|
||||
* its util_avg has not been updated at least once.
|
||||
* This flag is used to synchronize util_avg updates with util_est updates.
|
||||
* We map this information into the LSB bit of the utilization saved at
|
||||
* dequeue time (i.e. util_est.enqueued).
|
||||
*/
|
||||
#define UTIL_AVG_UNCHANGED 0x1
|
||||
|
||||
static inline void cfs_se_util_change(struct sched_avg *avg)
|
||||
{
|
||||
unsigned int enqueued;
|
||||
|
||||
if (!sched_feat(UTIL_EST))
|
||||
return;
|
||||
|
||||
/* Avoid store if the flag has been already set */
|
||||
enqueued = avg->util_est.enqueued;
|
||||
if (!(enqueued & UTIL_AVG_UNCHANGED))
|
||||
return;
|
||||
|
||||
/* Reset flag to report util_avg has been updated */
|
||||
enqueued &= ~UTIL_AVG_UNCHANGED;
|
||||
WRITE_ONCE(avg->util_est.enqueued, enqueued);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline int
|
||||
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int
|
||||
update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int
|
||||
update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int
|
||||
update_irq_load_avg(struct rq *rq, u64 running)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
@ -5,6 +5,8 @@
|
|||
*/
|
||||
#include "sched.h"
|
||||
|
||||
#include "pelt.h"
|
||||
|
||||
int sched_rr_timeslice = RR_TIMESLICE;
|
||||
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
|
||||
|
||||
|
@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq)
|
|||
curr->se.exec_start = now;
|
||||
cgroup_account_cputime(curr, delta_exec);
|
||||
|
||||
sched_rt_avg_update(rq, delta_exec);
|
||||
|
||||
if (!rt_bandwidth_enabled())
|
||||
return;
|
||||
|
||||
|
@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|||
|
||||
rt_queue_push_tasks(rq);
|
||||
|
||||
/*
|
||||
* If prev task was rt, put_prev_task() has already updated the
|
||||
* utilization. We only care about the case where we start to schedule a
|
||||
* rt task
|
||||
*/
|
||||
if (rq->curr->sched_class != &rt_sched_class)
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
|
|||
{
|
||||
update_curr_rt(rq);
|
||||
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
|
||||
|
||||
/*
|
||||
* The previous task needs to be made eligible for pushing
|
||||
* if it is still active
|
||||
|
@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
|
|||
struct sched_rt_entity *rt_se = &p->rt;
|
||||
|
||||
update_curr_rt(rq);
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
|
||||
|
||||
watchdog(rq, p);
|
||||
|
||||
|
|
|
@ -594,6 +594,7 @@ struct rt_rq {
|
|||
unsigned long rt_nr_total;
|
||||
int overloaded;
|
||||
struct plist_head pushable_tasks;
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
int rt_queued;
|
||||
|
||||
|
@ -673,7 +674,26 @@ struct dl_rq {
|
|||
u64 bw_ratio;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * An entity is a task if it doesn't "own" a runqueue.
 *
 * The argument is parenthesized so that any pointer expression (e.g. &entity
 * or base + i) expands correctly, not just a plain identifier.
 */
#define entity_is_task(se)	(!(se)->my_q)
#else
/* Without group scheduling every entity is a task. */
#define entity_is_task(se)	1
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* XXX we want to get rid of these helpers and use the full load resolution.
|
||||
*/
|
||||
static inline long se_weight(struct sched_entity *se)
|
||||
{
|
||||
return scale_load_down(se->load.weight);
|
||||
}
|
||||
|
||||
static inline long se_runnable(struct sched_entity *se)
|
||||
{
|
||||
return scale_load_down(se->runnable_weight);
|
||||
}
|
||||
|
||||
static inline bool sched_asym_prefer(int a, int b)
|
||||
{
|
||||
|
@ -833,8 +853,12 @@ struct rq {
|
|||
|
||||
struct list_head cfs_tasks;
|
||||
|
||||
u64 rt_avg;
|
||||
u64 age_stamp;
|
||||
struct sched_avg avg_rt;
|
||||
struct sched_avg avg_dl;
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
#define HAVE_SCHED_AVG_IRQ
|
||||
struct sched_avg avg_irq;
|
||||
#endif
|
||||
u64 idle_stamp;
|
||||
u64 avg_idle;
|
||||
|
||||
|
@ -1075,7 +1099,8 @@ enum numa_faults_stats {
|
|||
};
|
||||
extern void sched_setnuma(struct task_struct *p, int node);
|
||||
extern int migrate_task_to(struct task_struct *p, int cpu);
|
||||
extern int migrate_swap(struct task_struct *, struct task_struct *);
|
||||
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
|
||||
int cpu, int scpu);
|
||||
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
|
||||
#else
|
||||
static inline void
|
||||
|
@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
|
|||
|
||||
extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
extern const_debug unsigned int sysctl_sched_time_avg;
|
||||
extern const_debug unsigned int sysctl_sched_nr_migrate;
|
||||
extern const_debug unsigned int sysctl_sched_migration_cost;
|
||||
|
||||
static inline u64 sched_avg_period(void)
|
||||
{
|
||||
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_HRTICK
|
||||
|
||||
/*
|
||||
|
@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
extern void sched_avg_update(struct rq *rq);
|
||||
|
||||
#ifndef arch_scale_cpu_capacity
|
||||
static __always_inline
|
||||
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
|
||||
|
@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
|
|||
return SCHED_CAPACITY_SCALE;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
|
||||
{
|
||||
rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
|
||||
sched_avg_update(rq);
|
||||
}
|
||||
#else
|
||||
#ifndef arch_scale_cpu_capacity
|
||||
static __always_inline
|
||||
|
@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
|
|||
return SCHED_CAPACITY_SCALE;
|
||||
}
|
||||
#endif
|
||||
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
|
||||
static inline void sched_avg_update(struct rq *rq) { }
|
||||
#endif
|
||||
|
||||
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
|
||||
|
@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
|
||||
static inline unsigned long cpu_util_dl(struct rq *rq)
|
||||
static inline unsigned long cpu_bw_dl(struct rq *rq)
|
||||
{
|
||||
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
|
||||
}
|
||||
|
||||
static inline unsigned long cpu_util_dl(struct rq *rq)
|
||||
{
|
||||
return READ_ONCE(rq->avg_dl.util_avg);
|
||||
}
|
||||
|
||||
static inline unsigned long cpu_util_cfs(struct rq *rq)
|
||||
{
|
||||
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
|
||||
|
@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
|
|||
|
||||
return util;
|
||||
}
|
||||
|
||||
static inline unsigned long cpu_util_rt(struct rq *rq)
|
||||
{
|
||||
return READ_ONCE(rq->avg_rt.util_avg);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SCHED_AVG_IRQ
|
||||
static inline unsigned long cpu_util_irq(struct rq *rq)
|
||||
{
|
||||
return rq->avg_irq.util_avg;
|
||||
}
|
||||
|
||||
/*
 * Scale @util by the share of @max that is not consumed by @irq:
 *
 *   util * (max - irq) / max
 */
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
	return util * (max - irq) / max;
}
|
||||
#else
|
||||
/* No irq utilization signal is available in this configuration. */
static inline unsigned long cpu_util_irq(struct rq *rq)
{
	return 0;
}
|
||||
|
||||
/* Without an irq utilization signal @util is passed through unscaled. */
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
	return util;
}
|
||||
#endif
|
||||
|
|
|
@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
|
|||
}
|
||||
EXPORT_SYMBOL(swake_up_locked);
|
||||
|
||||
void swake_up(struct swait_queue_head *q)
|
||||
void swake_up_one(struct swait_queue_head *q)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
|
@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
|
|||
swake_up_locked(q);
|
||||
raw_spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(swake_up);
|
||||
EXPORT_SYMBOL(swake_up_one);
|
||||
|
||||
/*
|
||||
* Does not allow usage from IRQ disabled, since we must be able to
|
||||
|
@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q)
|
|||
}
|
||||
EXPORT_SYMBOL(swake_up_all);
|
||||
|
||||
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
|
||||
static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
|
||||
{
|
||||
wait->task = current;
|
||||
if (list_empty(&wait->task_list))
|
||||
list_add(&wait->task_list, &q->task_list);
|
||||
list_add_tail(&wait->task_list, &q->task_list);
|
||||
}
|
||||
|
||||
void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
|
||||
void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
|
@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
|
|||
set_current_state(state);
|
||||
raw_spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(prepare_to_swait);
|
||||
EXPORT_SYMBOL(prepare_to_swait_exclusive);
|
||||
|
||||
long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
|
||||
{
|
||||
if (signal_pending_state(state, current))
|
||||
return -ERESTARTSYS;
|
||||
unsigned long flags;
|
||||
long ret = 0;
|
||||
|
||||
prepare_to_swait(q, wait, state);
|
||||
raw_spin_lock_irqsave(&q->lock, flags);
|
||||
if (unlikely(signal_pending_state(state, current))) {
|
||||
/*
|
||||
* See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
|
||||
* must not see us.
|
||||
*/
|
||||
list_del_init(&wait->task_list);
|
||||
ret = -ERESTARTSYS;
|
||||
} else {
|
||||
__prepare_to_swait(q, wait);
|
||||
set_current_state(state);
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&q->lock, flags);
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(prepare_to_swait_event);
|
||||
|
||||
|
|
|
@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu)
|
|||
|
||||
mutex_lock(&smpboot_threads_lock);
|
||||
list_for_each_entry(cur, &hotplug_threads, list)
|
||||
if (cpumask_test_cpu(cpu, cur->cpumask))
|
||||
smpboot_unpark_thread(cur, cpu);
|
||||
smpboot_unpark_thread(cur, cpu);
|
||||
mutex_unlock(&smpboot_threads_lock);
|
||||
return 0;
|
||||
}
|
||||
|
@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
|
|||
}
|
||||
|
||||
/**
|
||||
* smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
|
||||
* smpboot_register_percpu_thread - Register a per_cpu thread related
|
||||
* to hotplug
|
||||
* @plug_thread: Hotplug thread descriptor
|
||||
* @cpumask: The cpumask where threads run
|
||||
*
|
||||
* Creates and starts the threads on all online cpus.
|
||||
*/
|
||||
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
|
||||
const struct cpumask *cpumask)
|
||||
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
|
||||
{
|
||||
unsigned int cpu;
|
||||
int ret = 0;
|
||||
|
||||
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
cpumask_copy(plug_thread->cpumask, cpumask);
|
||||
|
||||
get_online_cpus();
|
||||
mutex_lock(&smpboot_threads_lock);
|
||||
for_each_online_cpu(cpu) {
|
||||
ret = __smpboot_create_thread(plug_thread, cpu);
|
||||
if (ret) {
|
||||
smpboot_destroy_threads(plug_thread);
|
||||
free_cpumask_var(plug_thread->cpumask);
|
||||
goto out;
|
||||
}
|
||||
if (cpumask_test_cpu(cpu, cpumask))
|
||||
smpboot_unpark_thread(plug_thread, cpu);
|
||||
smpboot_unpark_thread(plug_thread, cpu);
|
||||
}
|
||||
list_add(&plug_thread->list, &hotplug_threads);
|
||||
out:
|
||||
|
@ -315,7 +306,7 @@ int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_threa
|
|||
put_online_cpus();
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
|
||||
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
|
||||
|
||||
/**
|
||||
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
|
||||
|
@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
|
|||
smpboot_destroy_threads(plug_thread);
|
||||
mutex_unlock(&smpboot_threads_lock);
|
||||
put_online_cpus();
|
||||
free_cpumask_var(plug_thread->cpumask);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
|
||||
|
||||
/**
|
||||
* smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
|
||||
* @plug_thread: Hotplug thread descriptor
|
||||
* @new: Revised mask to use
|
||||
*
|
||||
* The cpumask field in the smp_hotplug_thread must not be updated directly
|
||||
* by the client, but only by calling this function.
|
||||
* This function can only be called on a registered smp_hotplug_thread.
|
||||
*/
|
||||
void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
|
||||
const struct cpumask *new)
|
||||
{
|
||||
struct cpumask *old = plug_thread->cpumask;
|
||||
static struct cpumask tmp;
|
||||
unsigned int cpu;
|
||||
|
||||
lockdep_assert_cpus_held();
|
||||
mutex_lock(&smpboot_threads_lock);
|
||||
|
||||
/* Park threads that were exclusively enabled on the old mask. */
|
||||
cpumask_andnot(&tmp, old, new);
|
||||
for_each_cpu_and(cpu, &tmp, cpu_online_mask)
|
||||
smpboot_park_thread(plug_thread, cpu);
|
||||
|
||||
/* Unpark threads that are exclusively enabled on the new mask. */
|
||||
cpumask_andnot(&tmp, new, old);
|
||||
for_each_cpu_and(cpu, &tmp, cpu_online_mask)
|
||||
smpboot_unpark_thread(plug_thread, cpu);
|
||||
|
||||
cpumask_copy(old, new);
|
||||
|
||||
mutex_unlock(&smpboot_threads_lock);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
|
||||
|
||||
/*
|
||||
|
|
|
@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
|
|||
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
|
||||
DEFINE_WAKE_Q(wakeq);
|
||||
int err;
|
||||
|
||||
retry:
|
||||
/*
|
||||
* The waking up of stopper threads has to happen in the same
|
||||
* scheduling context as the queueing. Otherwise, there is a
|
||||
* possibility of one of the above stoppers being woken up by another
|
||||
* CPU, and preempting us. This will cause us to not wake up the other
|
||||
* stopper forever.
|
||||
*/
|
||||
preempt_disable();
|
||||
raw_spin_lock_irq(&stopper1->lock);
|
||||
raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
|
||||
|
||||
err = -ENOENT;
|
||||
if (!stopper1->enabled || !stopper2->enabled)
|
||||
if (!stopper1->enabled || !stopper2->enabled) {
|
||||
err = -ENOENT;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that if we race with __stop_cpus() the stoppers won't get
|
||||
* queued up in reverse order leading to system deadlock.
|
||||
|
@ -255,36 +266,30 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
|
|||
* It can be falsely true but it is safe to spin until it is cleared,
|
||||
* queue_stop_cpus_work() does everything under preempt_disable().
|
||||
*/
|
||||
err = -EDEADLK;
|
||||
if (unlikely(stop_cpus_in_progress))
|
||||
goto unlock;
|
||||
if (unlikely(stop_cpus_in_progress)) {
|
||||
err = -EDEADLK;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
err = 0;
|
||||
__cpu_stop_queue_work(stopper1, work1, &wakeq);
|
||||
__cpu_stop_queue_work(stopper2, work2, &wakeq);
|
||||
/*
|
||||
* The waking up of stopper threads has to happen
|
||||
* in the same scheduling context as the queueing.
|
||||
* Otherwise, there is a possibility of one of the
|
||||
* above stoppers being woken up by another CPU,
|
||||
* and preempting us. This will cause us to not
|
||||
* wake up the other stopper forever.
|
||||
*/
|
||||
preempt_disable();
|
||||
|
||||
unlock:
|
||||
raw_spin_unlock(&stopper2->lock);
|
||||
raw_spin_unlock_irq(&stopper1->lock);
|
||||
|
||||
if (unlikely(err == -EDEADLK)) {
|
||||
preempt_enable();
|
||||
|
||||
while (stop_cpus_in_progress)
|
||||
cpu_relax();
|
||||
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (!err) {
|
||||
wake_up_q(&wakeq);
|
||||
preempt_enable();
|
||||
}
|
||||
wake_up_q(&wakeq);
|
||||
preempt_enable();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
|
|
@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
|
|||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_time_avg_ms",
|
||||
.data = &sysctl_sched_time_avg,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &one,
|
||||
},
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
{
|
||||
.procname = "sched_schedstats",
|
||||
|
|
|
@ -18,18 +18,14 @@
|
|||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/smpboot.h>
|
||||
#include <linux/sched/rt.h>
|
||||
#include <uapi/linux/sched/types.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/sched/clock.h>
|
||||
#include <linux/sched/debug.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/stop_machine.h>
|
||||
|
||||
#include <asm/irq_regs.h>
|
||||
#include <linux/kvm_para.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
static DEFINE_MUTEX(watchdog_mutex);
|
||||
|
||||
|
@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
|
|||
unsigned int __read_mostly softlockup_panic =
|
||||
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
|
||||
|
||||
static bool softlockup_threads_initialized __read_mostly;
|
||||
static bool softlockup_initialized __read_mostly;
|
||||
static u64 __read_mostly sample_period;
|
||||
|
||||
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
|
||||
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
|
||||
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
|
||||
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
|
||||
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
|
||||
|
@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void)
|
|||
__this_cpu_inc(hrtimer_interrupts);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct completion, softlockup_completion);
|
||||
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
|
||||
|
||||
/*
|
||||
* The watchdog thread function - touches the timestamp.
|
||||
*
|
||||
* It only runs once every sample_period seconds (4 seconds by
|
||||
* default) to reset the softlockup timestamp. If this gets delayed
|
||||
* for more than 2*watchdog_thresh seconds then the debug-printout
|
||||
* triggers in watchdog_timer_fn().
|
||||
*/
|
||||
static int softlockup_fn(void *data)
|
||||
{
|
||||
__this_cpu_write(soft_lockup_hrtimer_cnt,
|
||||
__this_cpu_read(hrtimer_interrupts));
|
||||
__touch_watchdog();
|
||||
complete(this_cpu_ptr(&softlockup_completion));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* watchdog kicker functions */
|
||||
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
|
||||
{
|
||||
|
@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
|
|||
watchdog_interrupt_count();
|
||||
|
||||
/* kick the softlockup detector */
|
||||
wake_up_process(__this_cpu_read(softlockup_watchdog));
|
||||
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
|
||||
reinit_completion(this_cpu_ptr(&softlockup_completion));
|
||||
stop_one_cpu_nowait(smp_processor_id(),
|
||||
softlockup_fn, NULL,
|
||||
this_cpu_ptr(&softlockup_stop_work));
|
||||
}
|
||||
|
||||
/* .. and repeat */
|
||||
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
|
||||
|
@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
|
|||
return HRTIMER_RESTART;
|
||||
}
|
||||
|
||||
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
|
||||
{
|
||||
struct sched_param param = { .sched_priority = prio };
|
||||
|
||||
sched_setscheduler(current, policy, &param);
|
||||
}
|
||||
|
||||
static void watchdog_enable(unsigned int cpu)
|
||||
{
|
||||
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
|
||||
struct completion *done = this_cpu_ptr(&softlockup_completion);
|
||||
|
||||
WARN_ON_ONCE(cpu != smp_processor_id());
|
||||
|
||||
init_completion(done);
|
||||
complete(done);
|
||||
|
||||
/*
|
||||
* Start the timer first to prevent the NMI watchdog triggering
|
||||
|
@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu)
|
|||
/* Enable the perf event */
|
||||
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
|
||||
watchdog_nmi_enable(cpu);
|
||||
|
||||
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
|
||||
}
|
||||
|
||||
static void watchdog_disable(unsigned int cpu)
|
||||
{
|
||||
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
|
||||
|
||||
watchdog_set_prio(SCHED_NORMAL, 0);
|
||||
WARN_ON_ONCE(cpu != smp_processor_id());
|
||||
|
||||
/*
|
||||
* Disable the perf event first. That prevents that a large delay
|
||||
* between disabling the timer and disabling the perf event causes
|
||||
|
@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu)
|
|||
*/
|
||||
watchdog_nmi_disable(cpu);
|
||||
hrtimer_cancel(hrtimer);
|
||||
wait_for_completion(this_cpu_ptr(&softlockup_completion));
|
||||
}
|
||||
|
||||
static void watchdog_cleanup(unsigned int cpu, bool online)
|
||||
static int softlockup_stop_fn(void *data)
|
||||
{
|
||||
watchdog_disable(cpu);
|
||||
watchdog_disable(smp_processor_id());
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int watchdog_should_run(unsigned int cpu)
|
||||
static void softlockup_stop_all(void)
|
||||
{
|
||||
return __this_cpu_read(hrtimer_interrupts) !=
|
||||
__this_cpu_read(soft_lockup_hrtimer_cnt);
|
||||
}
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* The watchdog thread function - touches the timestamp.
|
||||
*
|
||||
* It only runs once every sample_period seconds (4 seconds by
|
||||
* default) to reset the softlockup timestamp. If this gets delayed
|
||||
* for more than 2*watchdog_thresh seconds then the debug-printout
|
||||
* triggers in watchdog_timer_fn().
|
||||
*/
|
||||
static void watchdog(unsigned int cpu)
|
||||
{
|
||||
__this_cpu_write(soft_lockup_hrtimer_cnt,
|
||||
__this_cpu_read(hrtimer_interrupts));
|
||||
__touch_watchdog();
|
||||
}
|
||||
|
||||
static struct smp_hotplug_thread watchdog_threads = {
|
||||
.store = &softlockup_watchdog,
|
||||
.thread_should_run = watchdog_should_run,
|
||||
.thread_fn = watchdog,
|
||||
.thread_comm = "watchdog/%u",
|
||||
.setup = watchdog_enable,
|
||||
.cleanup = watchdog_cleanup,
|
||||
.park = watchdog_disable,
|
||||
.unpark = watchdog_enable,
|
||||
};
|
||||
|
||||
static void softlockup_update_smpboot_threads(void)
|
||||
{
|
||||
lockdep_assert_held(&watchdog_mutex);
|
||||
|
||||
if (!softlockup_threads_initialized)
|
||||
if (!softlockup_initialized)
|
||||
return;
|
||||
|
||||
smpboot_update_cpumask_percpu_thread(&watchdog_threads,
|
||||
&watchdog_allowed_mask);
|
||||
}
|
||||
for_each_cpu(cpu, &watchdog_allowed_mask)
|
||||
smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
|
||||
|
||||
/* Temporarily park all watchdog threads */
|
||||
static void softlockup_park_all_threads(void)
|
||||
{
|
||||
cpumask_clear(&watchdog_allowed_mask);
|
||||
softlockup_update_smpboot_threads();
|
||||
}
|
||||
|
||||
/* Unpark enabled threads */
|
||||
static void softlockup_unpark_threads(void)
|
||||
static int softlockup_start_fn(void *data)
|
||||
{
|
||||
watchdog_enable(smp_processor_id());
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void softlockup_start_all(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
|
||||
softlockup_update_smpboot_threads();
|
||||
for_each_cpu(cpu, &watchdog_allowed_mask)
|
||||
smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
|
||||
}
|
||||
|
||||
int lockup_detector_online_cpu(unsigned int cpu)
|
||||
{
|
||||
watchdog_enable(cpu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int lockup_detector_offline_cpu(unsigned int cpu)
|
||||
{
|
||||
watchdog_disable(cpu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void lockup_detector_reconfigure(void)
|
||||
{
|
||||
cpus_read_lock();
|
||||
watchdog_nmi_stop();
|
||||
softlockup_park_all_threads();
|
||||
|
||||
softlockup_stop_all();
|
||||
set_sample_period();
|
||||
lockup_detector_update_enable();
|
||||
if (watchdog_enabled && watchdog_thresh)
|
||||
softlockup_unpark_threads();
|
||||
softlockup_start_all();
|
||||
|
||||
watchdog_nmi_start();
|
||||
cpus_read_unlock();
|
||||
/*
|
||||
|
@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void)
|
|||
*/
|
||||
static __init void lockup_detector_setup(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* If sysctl is off and watchdog got disabled on the command line,
|
||||
* nothing to do here.
|
||||
|
@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void)
|
|||
!(watchdog_enabled && watchdog_thresh))
|
||||
return;
|
||||
|
||||
ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
|
||||
&watchdog_allowed_mask);
|
||||
if (ret) {
|
||||
pr_err("Failed to initialize soft lockup detector threads\n");
|
||||
return;
|
||||
}
|
||||
|
||||
mutex_lock(&watchdog_mutex);
|
||||
softlockup_threads_initialized = true;
|
||||
lockup_detector_reconfigure();
|
||||
softlockup_initialized = true;
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
}
|
||||
|
||||
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
static inline int watchdog_park_threads(void) { return 0; }
|
||||
static inline void watchdog_unpark_threads(void) { }
|
||||
static inline int watchdog_enable_all_cpus(void) { return 0; }
|
||||
static inline void watchdog_disable_all_cpus(void) { }
|
||||
static void lockup_detector_reconfigure(void)
|
||||
{
|
||||
cpus_read_lock();
|
||||
|
|
|
@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void)
|
|||
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
|
||||
watchdog_overflow_callback, NULL);
|
||||
if (IS_ERR(evt)) {
|
||||
pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
|
||||
PTR_ERR(evt));
|
||||
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
|
||||
PTR_ERR(evt));
|
||||
return PTR_ERR(evt);
|
||||
}
|
||||
this_cpu_write(watchdog_ev, evt);
|
||||
|
|
|
@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm)
|
|||
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
vcpu->arch.pause = false;
|
||||
swake_up(kvm_arch_vcpu_wq(vcpu));
|
||||
swake_up_one(kvm_arch_vcpu_wq(vcpu));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
|
||||
|
||||
swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
|
||||
swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
|
||||
(!vcpu->arch.pause)));
|
||||
|
||||
if (vcpu->arch.power_off || vcpu->arch.pause) {
|
||||
|
|
|
@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
|
|||
smp_mb(); /* Make sure the above is visible */
|
||||
|
||||
wq = kvm_arch_vcpu_wq(vcpu);
|
||||
swake_up(wq);
|
||||
swake_up_one(wq);
|
||||
|
||||
return PSCI_RET_SUCCESS;
|
||||
}
|
||||
|
|
|
@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work)
|
|||
trace_kvm_async_pf_completed(addr, gva);
|
||||
|
||||
if (swq_has_sleeper(&vcpu->wq))
|
||||
swake_up(&vcpu->wq);
|
||||
swake_up_one(&vcpu->wq);
|
||||
|
||||
mmput(mm);
|
||||
kvm_put_kvm(vcpu->kvm);
|
||||
|
|
|
@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
|
|||
kvm_arch_vcpu_blocking(vcpu);
|
||||
|
||||
for (;;) {
|
||||
prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
|
||||
prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
|
||||
|
||||
if (kvm_vcpu_check_block(vcpu) < 0)
|
||||
break;
|
||||
|
@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
|
|||
|
||||
wqp = kvm_arch_vcpu_wq(vcpu);
|
||||
if (swq_has_sleeper(wqp)) {
|
||||
swake_up(wqp);
|
||||
swake_up_one(wqp);
|
||||
++vcpu->stat.halt_wakeup;
|
||||
return true;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue