Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

   - Make schedstats a runtime tunable (disabled by default) and
     optimize it via static keys.

     As most distributions enable CONFIG_SCHEDSTATS=y due to its
     instrumentation value, this is a nice performance enhancement.
     (Mel Gorman)

   - Implement 'simple waitqueues' (swait): these are just pure
     waitqueues without any of the more complex features of full-blown
     waitqueues (callbacks, wake flags, wake keys, etc.).  Simple
     waitqueues have less memory overhead and are faster.

     Use simple waitqueues in the RCU code (in 4 different places) and
     for handling KVM vCPU wakeups.

     (Peter Zijlstra, Daniel Wagner, Thomas Gleixner, Paul Gortmaker,
     Marcelo Tosatti)

   - sched/numa enhancements (Rik van Riel)

   - NOHZ performance enhancements (Rik van Riel)

   - Various sched/deadline enhancements (Steven Rostedt)

   - Various fixes (Peter Zijlstra)

   - ... and a number of other fixes, cleanups and smaller enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (29 commits)
  sched/cputime: Fix steal_account_process_tick() to always return jiffies
  sched/deadline: Remove dl_new from struct sched_dl_entity
  Revert "kbuild: Add option to turn incompatible pointer check into error"
  sched/deadline: Remove superfluous call to switched_to_dl()
  sched/debug: Fix preempt_disable_ip recording for preempt_disable()
  sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity
  time, acct: Drop irq save & restore from __acct_update_integrals()
  acct, time: Change indentation in __acct_update_integrals()
  sched, time: Remove non-power-of-two divides from __acct_update_integrals()
  sched/rt: Kick RT bandwidth timer immediately on start up
  sched/debug: Add deadline scheduler bandwidth ratio to /proc/sched_debug
  sched/debug: Move sched_domain_sysctl to debug.c
  sched/debug: Move the /sys/kernel/debug/sched_features file setup into debug.c
  sched/rt: Fix PI handling vs. sched_setscheduler()
  sched/core: Remove duplicated sched_group_set_shares() prototype
  sched/fair: Consolidate nohz CPU load update code
  sched/fair: Avoid using decay_load_missed() with a negative value
  sched/deadline: Always calculate end of period on sched_yield()
  sched/cgroup: Fix cgroup entity load tracking tear-down
  rcu: Use simple wait queues where possible in rcutree
  ...
This commit is contained in:
Linus Torvalds 2016-03-14 19:14:06 -07:00
commit d4e796152a
37 changed files with 1298 additions and 713 deletions

View file

@ -3532,6 +3532,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
sched_debug [KNL] Enables verbose scheduler debug messages. sched_debug [KNL] Enables verbose scheduler debug messages.
schedstats= [KNL,X86] Enable or disable scheduler statistics.
Allowed values are enable and disable. This feature
incurs a small amount of overhead in the scheduler
but is useful for debugging and performance tuning.
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
xtime_lock contention on larger systems, and/or RCU lock xtime_lock contention on larger systems, and/or RCU lock
contention on all systems with CONFIG_MAXSMP set. contention on all systems with CONFIG_MAXSMP set.

View file

@ -773,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued.
============================================================== ==============================================================
sched_schedstats:
Enables/disables scheduler statistics. Enabling this feature
incurs a small amount of overhead in the scheduler but is
useful for debugging and performance tuning.
==============================================================
sg-big-buff: sg-big-buff:
This file shows the size of the generic SCSI (sg) buffer. This file shows the size of the generic SCSI (sg) buffer.

View file

@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
vcpu->arch.pause = false; vcpu->arch.pause = false;
wake_up_interruptible(wq); swake_up(wq);
} }
} }
static void vcpu_sleep(struct kvm_vcpu *vcpu) static void vcpu_sleep(struct kvm_vcpu *vcpu)
{ {
wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
(!vcpu->arch.pause))); (!vcpu->arch.pause)));
} }

View file

@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{ {
struct kvm *kvm = source_vcpu->kvm; struct kvm *kvm = source_vcpu->kvm;
struct kvm_vcpu *vcpu = NULL; struct kvm_vcpu *vcpu = NULL;
wait_queue_head_t *wq; struct swait_queue_head *wq;
unsigned long cpu_id; unsigned long cpu_id;
unsigned long context_id; unsigned long context_id;
phys_addr_t target_pc; phys_addr_t target_pc;
@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
smp_mb(); /* Make sure the above is visible */ smp_mb(); /* Make sure the above is visible */
wq = kvm_arch_vcpu_wq(vcpu); wq = kvm_arch_vcpu_wq(vcpu);
wake_up_interruptible(wq); swake_up(wq);
return PSCI_RET_SUCCESS; return PSCI_RET_SUCCESS;
} }

View file

@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
dvcpu->arch.wait = 0; dvcpu->arch.wait = 0;
if (waitqueue_active(&dvcpu->wq)) if (swait_active(&dvcpu->wq))
wake_up_interruptible(&dvcpu->wq); swake_up(&dvcpu->wq);
return 0; return 0;
} }
@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
kvm_mips_callbacks->queue_timer_int(vcpu); kvm_mips_callbacks->queue_timer_int(vcpu);
vcpu->arch.wait = 0; vcpu->arch.wait = 0;
if (waitqueue_active(&vcpu->wq)) if (swait_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq); swake_up(&vcpu->wq);
} }
/* low level hrtimer wake routine */ /* low level hrtimer wake routine */

View file

@ -289,7 +289,7 @@ struct kvmppc_vcore {
struct list_head runnable_threads; struct list_head runnable_threads;
struct list_head preempt_list; struct list_head preempt_list;
spinlock_t lock; spinlock_t lock;
wait_queue_head_t wq; struct swait_queue_head wq;
spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
u64 stolen_tb; u64 stolen_tb;
u64 preempt_tb; u64 preempt_tb;
@ -629,7 +629,7 @@ struct kvm_vcpu_arch {
u8 prodded; u8 prodded;
u32 last_inst; u32 last_inst;
wait_queue_head_t *wqp; struct swait_queue_head *wqp;
struct kvmppc_vcore *vcore; struct kvmppc_vcore *vcore;
int ret; int ret;
int trap; int trap;

View file

@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{ {
int cpu; int cpu;
wait_queue_head_t *wqp; struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu); wqp = kvm_arch_vcpu_wq(vcpu);
if (waitqueue_active(wqp)) { if (swait_active(wqp)) {
wake_up_interruptible(wqp); swake_up(wqp);
++vcpu->stat.halt_wakeup; ++vcpu->stat.halt_wakeup;
} }
@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
tvcpu->arch.prodded = 1; tvcpu->arch.prodded = 1;
smp_mb(); smp_mb();
if (vcpu->arch.ceded) { if (vcpu->arch.ceded) {
if (waitqueue_active(&vcpu->wq)) { if (swait_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq); swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++; vcpu->stat.halt_wakeup++;
} }
} }
@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
INIT_LIST_HEAD(&vcore->runnable_threads); INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock); spin_lock_init(&vcore->lock);
spin_lock_init(&vcore->stoltb_lock); spin_lock_init(&vcore->stoltb_lock);
init_waitqueue_head(&vcore->wq); init_swait_queue_head(&vcore->wq);
vcore->preempt_tb = TB_NIL; vcore->preempt_tb = TB_NIL;
vcore->lpcr = kvm->arch.lpcr; vcore->lpcr = kvm->arch.lpcr;
vcore->first_vcpuid = core * threads_per_subcore; vcore->first_vcpuid = core * threads_per_subcore;
@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{ {
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
int do_sleep = 1; int do_sleep = 1;
DECLARE_SWAITQUEUE(wait);
DEFINE_WAIT(wait); prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
/* /*
* Check one last time for pending exceptions and ceded state after * Check one last time for pending exceptions and ceded state after
@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
} }
if (!do_sleep) { if (!do_sleep) {
finish_wait(&vc->wq, &wait); finish_swait(&vc->wq, &wait);
return; return;
} }
@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
trace_kvmppc_vcore_blocked(vc, 0); trace_kvmppc_vcore_blocked(vc, 0);
spin_unlock(&vc->lock); spin_unlock(&vc->lock);
schedule(); schedule();
finish_wait(&vc->wq, &wait); finish_swait(&vc->wq, &wait);
spin_lock(&vc->lock); spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE; vc->vcore_state = VCORE_INACTIVE;
trace_kvmppc_vcore_blocked(vc, 1); trace_kvmppc_vcore_blocked(vc, 1);
@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_start_thread(vcpu, vc); kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu); trace_kvm_guest_enter(vcpu);
} else if (vc->vcore_state == VCORE_SLEEPING) { } else if (vc->vcore_state == VCORE_SLEEPING) {
wake_up(&vc->wq); swake_up(&vc->wq);
} }
} }

View file

@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
struct kvm_s390_local_interrupt { struct kvm_s390_local_interrupt {
spinlock_t lock; spinlock_t lock;
struct kvm_s390_float_interrupt *float_int; struct kvm_s390_float_interrupt *float_int;
wait_queue_head_t *wq; struct swait_queue_head *wq;
atomic_t *cpuflags; atomic_t *cpuflags;
DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
struct kvm_s390_irq_payload irq; struct kvm_s390_irq_payload irq;

View file

@ -966,13 +966,13 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
{ {
if (waitqueue_active(&vcpu->wq)) { if (swait_active(&vcpu->wq)) {
/* /*
* The vcpu gave up the cpu voluntarily, mark it as a good * The vcpu gave up the cpu voluntarily, mark it as a good
* yield-candidate. * yield-candidate.
*/ */
vcpu->preempted = true; vcpu->preempted = true;
wake_up_interruptible(&vcpu->wq); swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++; vcpu->stat.halt_wakeup++;
} }
} }

View file

@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
static void apic_timer_expired(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic)
{ {
struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq; struct swait_queue_head *q = &vcpu->wq;
struct kvm_timer *ktimer = &apic->lapic_timer; struct kvm_timer *ktimer = &apic->lapic_timer;
if (atomic_read(&apic->lapic_timer.pending)) if (atomic_read(&apic->lapic_timer.pending))
@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
atomic_inc(&apic->lapic_timer.pending); atomic_inc(&apic->lapic_timer.pending);
kvm_set_pending_timer(vcpu); kvm_set_pending_timer(vcpu);
if (waitqueue_active(q)) if (swait_active(q))
wake_up_interruptible(q); swake_up(q);
if (apic_lvtt_tscdeadline(apic)) if (apic_lvtt_tscdeadline(apic))
ktimer->expired_tscdeadline = ktimer->tscdeadline; ktimer->expired_tscdeadline = ktimer->tscdeadline;

View file

@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled)
#define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
#define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
/*
 * Pick the return address to attribute a lock operation to.
 *
 * Starts with CALLER_ADDR0; if in_lock_functions() says that address is
 * inside the lock functions, tries CALLER_ADDR1, and if that one is also
 * inside them, settles for CALLER_ADDR2 without checking it.  Deeper
 * caller addresses are only evaluated when the shallower one was rejected.
 */
static inline unsigned long get_lock_parent_ip(void)
{
	unsigned long ip = CALLER_ADDR0;

	if (in_lock_functions(ip)) {
		ip = CALLER_ADDR1;
		if (in_lock_functions(ip))
			ip = CALLER_ADDR2;
	}

	return ip;
}
#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_IRQSOFF_TRACER
extern void time_hardirqs_on(unsigned long a0, unsigned long a1); extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
extern void time_hardirqs_off(unsigned long a0, unsigned long a1); extern void time_hardirqs_off(unsigned long a0, unsigned long a1);

View file

@ -25,6 +25,7 @@
#include <linux/irqflags.h> #include <linux/irqflags.h>
#include <linux/context_tracking.h> #include <linux/context_tracking.h>
#include <linux/irqbypass.h> #include <linux/irqbypass.h>
#include <linux/swait.h>
#include <asm/signal.h> #include <asm/signal.h>
#include <linux/kvm.h> #include <linux/kvm.h>
@ -218,7 +219,7 @@ struct kvm_vcpu {
int fpu_active; int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded; int guest_fpu_loaded, guest_xcr0_loaded;
unsigned char fpu_counter; unsigned char fpu_counter;
wait_queue_head_t wq; struct swait_queue_head wq;
struct pid *pid; struct pid *pid;
int sigset_active; int sigset_active;
sigset_t sigset; sigset_t sigset;
@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
} }
#endif #endif
static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
{ {
#ifdef __KVM_HAVE_ARCH_WQP #ifdef __KVM_HAVE_ARCH_WQP
return vcpu->arch.wqp; return vcpu->arch.wqp;

View file

@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
void clear_all_latency_tracing(struct task_struct *p); void clear_all_latency_tracing(struct task_struct *p);
extern int sysctl_latencytop(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#else #else
static inline void static inline void

View file

@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active);
static inline void update_cpu_load_nohz(int active) { } static inline void update_cpu_load_nohz(int active) { }
#endif #endif
extern unsigned long get_parent_ip(unsigned long addr);
extern void dump_cpu_task(int cpu); extern void dump_cpu_task(int cpu);
struct seq_file; struct seq_file;
@ -920,6 +918,10 @@ static inline int sched_info_on(void)
#endif #endif
} }
#ifdef CONFIG_SCHEDSTATS
void force_schedstat_enabled(void);
#endif
enum cpu_idle_type { enum cpu_idle_type {
CPU_IDLE, CPU_IDLE,
CPU_NOT_IDLE, CPU_NOT_IDLE,
@ -1289,6 +1291,8 @@ struct sched_rt_entity {
unsigned long timeout; unsigned long timeout;
unsigned long watchdog_stamp; unsigned long watchdog_stamp;
unsigned int time_slice; unsigned int time_slice;
unsigned short on_rq;
unsigned short on_list;
struct sched_rt_entity *back; struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
@ -1329,10 +1333,6 @@ struct sched_dl_entity {
* task has to wait for a replenishment to be performed at the * task has to wait for a replenishment to be performed at the
* next firing of dl_timer. * next firing of dl_timer.
* *
* @dl_new tells if a new instance arrived. If so we must
* start executing it with full runtime and reset its absolute
* deadline;
*
* @dl_boosted tells if we are boosted due to DI. If so we are * @dl_boosted tells if we are boosted due to DI. If so we are
* outside bandwidth enforcement mechanism (but only until we * outside bandwidth enforcement mechanism (but only until we
* exit the critical section); * exit the critical section);
@ -1340,7 +1340,7 @@ struct sched_dl_entity {
* @dl_yielded tells if task gave up the cpu before consuming * @dl_yielded tells if task gave up the cpu before consuming
* all its available runtime during the last job. * all its available runtime during the last job.
*/ */
int dl_throttled, dl_new, dl_boosted, dl_yielded; int dl_throttled, dl_boosted, dl_yielded;
/* /*
* Bandwidth enforcement timer. Each -deadline task has its * Bandwidth enforcement timer. Each -deadline task has its

View file

@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, void __user *buffer, size_t *lenp,
loff_t *ppos); loff_t *ppos);
extern int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
#endif /* _SCHED_SYSCTL_H */ #endif /* _SCHED_SYSCTL_H */

172
include/linux/swait.h Normal file
View file

@ -0,0 +1,172 @@
#ifndef _LINUX_SWAIT_H
#define _LINUX_SWAIT_H
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <asm/current.h>
/*
* Simple wait queues
*
* While these are very similar to the other/complex wait queues (wait.h) the
* most important difference is that the simple waitqueue allows for
* deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
* times.
*
* In order to make this so, we had to drop a fair number of features of the
* other waitqueue code; notably:
*
* - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
* all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
* sleeper state.
*
* - the exclusive mode; because this requires preserving the list order
* and this is hard.
*
* - custom wake functions; because you cannot give any guarantees about
* random code.
*
* As a side effect of this; the data structures are slimmer.
*
* One would recommend using this wait queue where possible.
*/
/* Forward declaration; this header only stores a pointer to the task. */
struct task_struct;
/*
 * Head of a simple wait queue: a raw spinlock and a list anchor.
 */
struct swait_queue_head {
/* NOTE(review): presumably serializes task_list; the locking code is not in this header. */
raw_spinlock_t lock;
/* List anchor; swait_active() reports waiters when this list is non-empty. */
struct list_head task_list;
};
/*
 * A single wait entry.  The helpers in this header create it either with
 * DECLARE_SWAITQUEUE() or as an on-stack variable inside ___swait_event().
 */
struct swait_queue {
/* Task tied to this entry; DECLARE_SWAITQUEUE() sets it to current. */
struct task_struct *task;
/* List node, set up with LIST_HEAD_INIT()/INIT_LIST_HEAD() by the helpers here. */
struct list_head task_list;
};
/*
 * Initialiser for a struct swait_queue called @name: stores current in
 * .task and initialises .task_list with LIST_HEAD_INIT().
 */
#define __SWAITQUEUE_INITIALIZER(name) { \
.task = current, \
.task_list = LIST_HEAD_INIT((name).task_list), \
}
/* Define a struct swait_queue variable @name using the initialiser above. */
#define DECLARE_SWAITQUEUE(name) \
struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
/*
 * Initialiser for a struct swait_queue_head called @name: .lock comes
 * from __RAW_SPIN_LOCK_UNLOCKED() and .task_list from LIST_HEAD_INIT().
 */
#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
.task_list = LIST_HEAD_INIT((name).task_list), \
}
/* Define a struct swait_queue_head variable @name using the initialiser above. */
#define DECLARE_SWAIT_QUEUE_HEAD(name) \
struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
/*
 * Out-of-line initialiser for a queue head; defined outside this header.
 * Takes the head, a name string and a lock_class_key.
 */
extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key);
/*
 * Runtime initialisation of the queue head @q.
 *
 * Each expansion declares its own block-scope static lock_class_key and
 * hands it to __init_swait_queue_head() together with the stringified
 * argument expression (#q) as the name.
 */
#define init_swait_queue_head(q) \
do { \
static struct lock_class_key __key; \
__init_swait_queue_head((q), #q, &__key); \
} while (0)
/*
 * Declare a queue head @name for on-stack use.
 *
 * With CONFIG_LOCKDEP the variable is initialised at run time by calling
 * init_swait_queue_head() from inside a statement expression whose value
 * is @name itself.  Without CONFIG_LOCKDEP this is simply
 * DECLARE_SWAIT_QUEUE_HEAD().
 */
#ifdef CONFIG_LOCKDEP
# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
({ init_swait_queue_head(&name); name; })
# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \
struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \
DECLARE_SWAIT_QUEUE_HEAD(name)
#endif
/*
 * swait_active - test whether anything is queued on @q
 *
 * Returns 1 if q->task_list is non-empty and 0 if it is empty.  The list
 * is inspected without taking q->lock.
 * NOTE(review): callers presumably have to provide their own ordering
 * against a concurrent waiter - confirm at each call site.
 */
static inline int swait_active(struct swait_queue_head *q)
{
	if (list_empty(&q->task_list))
		return 0;

	return 1;
}
/*
 * Wake-up entry points.  Only declared here; the implementations are not
 * part of this header.
 * NOTE(review): judging by the names, swake_up() wakes one waiter,
 * swake_up_all() wakes every waiter and swake_up_locked() expects q->lock
 * to be held already - confirm against the definitions.
 */
extern void swake_up(struct swait_queue_head *q);
extern void swake_up_all(struct swait_queue_head *q);
extern void swake_up_locked(struct swait_queue_head *q);
/*
 * Wait-side primitives.  Only declared here; the implementations are not
 * part of this header.  prepare_to_swait_event() returns a long;
 * ___swait_event() treats a non-zero return as its result when the sleep
 * state is interruptible.
 */
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
#define ___swait_event(wq, condition, state, ret, cmd) \
({ \
struct swait_queue __wait; \
long __ret = ret; \
\
INIT_LIST_HEAD(&__wait.task_list); \
for (;;) { \
long __int = prepare_to_swait_event(&wq, &__wait, state);\
\
if (condition) \
break; \
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
break; \
} \
\
cmd; \
} \
finish_swait(&wq, &__wait); \
__ret; \
})
/*
 * Slow path of swait_event(): runs ___swait_event() in
 * TASK_UNINTERRUPTIBLE with schedule() as the sleep command and discards
 * the result.
 */
#define __swait_event(wq, condition) \
(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
schedule())
/*
 * swait_event - wait on @wq until @condition is true
 *
 * Checks @condition first and returns immediately if it already holds;
 * only otherwise enters the wait loop.  Produces no value.
 */
#define swait_event(wq, condition) \
do { \
if (condition) \
break; \
__swait_event(wq, condition); \
} while (0)
/*
 * Slow path of swait_event_timeout(): TASK_UNINTERRUPTIBLE wait whose
 * sleep command is schedule_timeout(); the value it returns is stored
 * back into the loop's __ret, which starts out as @timeout.
 */
#define __swait_event_timeout(wq, condition, timeout) \
___swait_event(wq, ___wait_cond_timeout(condition), \
TASK_UNINTERRUPTIBLE, timeout, \
__ret = schedule_timeout(__ret))
/*
 * swait_event_timeout - wait on @wq for @condition, bounded by @timeout
 *
 * The expression evaluates to a long: it starts as @timeout and, if the
 * initial ___wait_cond_timeout() check fails, is replaced by the result of
 * __swait_event_timeout().
 *
 * NOTE(review): ___wait_cond_timeout() is not defined in this header;
 * presumably it comes from <linux/wait.h> and may itself read or adjust
 * __ret - confirm against its definition before relying on exact return
 * values.
 */
#define swait_event_timeout(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
__ret = __swait_event_timeout(wq, condition, timeout); \
__ret; \
})
/*
 * Slow path of swait_event_interruptible(): runs ___swait_event() in
 * TASK_INTERRUPTIBLE with schedule() as the sleep command and an initial
 * result of 0.
 */
#define __swait_event_interruptible(wq, condition) \
___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
schedule())
/*
 * swait_event_interruptible - interruptible wait on @wq for @condition
 *
 * Evaluates to 0 if @condition already holds, or if the wait loop ended
 * because it became true.  Otherwise evaluates to the non-zero value that
 * prepare_to_swait_event() reported, stored here in an int.
 */
#define swait_event_interruptible(wq, condition) \
({ \
int __ret = 0; \
if (!(condition)) \
__ret = __swait_event_interruptible(wq, condition); \
__ret; \
})
/*
 * Slow path of swait_event_interruptible_timeout(): TASK_INTERRUPTIBLE
 * wait whose sleep command is schedule_timeout(); the value it returns is
 * stored back into the loop's __ret, which starts out as @timeout.
 */
#define __swait_event_interruptible_timeout(wq, condition, timeout) \
___swait_event(wq, ___wait_cond_timeout(condition), \
TASK_INTERRUPTIBLE, timeout, \
__ret = schedule_timeout(__ret))
/*
 * swait_event_interruptible_timeout - interruptible, time-bounded wait
 *
 * The expression evaluates to a long: it starts as @timeout and, if the
 * initial ___wait_cond_timeout() check fails, is replaced by the result of
 * __swait_event_interruptible_timeout().  That result may be the non-zero
 * value reported by prepare_to_swait_event().
 *
 * NOTE(review): ___wait_cond_timeout() is not defined in this header;
 * presumably it comes from <linux/wait.h> and may itself read or adjust
 * __ret - confirm against its definition.
 */
#define swait_event_interruptible_timeout(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
__ret = __swait_event_interruptible_timeout(wq, \
condition, timeout); \
__ret; \
})
#endif /* _LINUX_SWAIT_H */

View file

@ -338,7 +338,7 @@ do { \
schedule(); try_to_freeze()) schedule(); try_to_freeze())
/** /**
* wait_event - sleep (or freeze) until a condition gets true * wait_event_freezable - sleep (or freeze) until a condition gets true
* @wq: the waitqueue to wait on * @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for * @condition: a C expression for the event to wait for
* *

View file

@ -47,12 +47,12 @@
* of times) * of times)
*/ */
#include <linux/latencytop.h>
#include <linux/kallsyms.h> #include <linux/kallsyms.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/proc_fs.h> #include <linux/proc_fs.h>
#include <linux/latencytop.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/list.h> #include <linux/list.h>
@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops); proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0; return 0;
} }
/*
 * sysctl handler for the latencytop switch.
 *
 * Delegates the actual read/write to proc_dointvec() and returns its
 * result unchanged.  After that call, whenever latencytop_enabled is
 * non-zero, force_schedstat_enabled() is invoked; this check runs
 * regardless of what proc_dointvec() returned.
 */
int sysctl_latencytop(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (latencytop_enabled)
		force_schedstat_enabled();

	return ret;
}
device_initcall(init_lstats_procfs); device_initcall(init_lstats_procfs);

View file

@ -59,6 +59,7 @@ int profile_setup(char *str)
if (!strncmp(str, sleepstr, strlen(sleepstr))) { if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING; prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',') if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1; str += strlen(sleepstr) + 1;

View file

@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
int needmore; int needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0; rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1]; needmore = rnp->need_future_gp[(c + 1) & 0x1];
trace_rcu_future_gp(rnp, rdp, c, trace_rcu_future_gp(rnp, rdp, c,
@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) || !READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread) !rsp->gp_kthread)
return; return;
wake_up(&rsp->gp_wq); swake_up(&rsp->gp_wq);
} }
/* /*
@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
int nocb = 0; int nocb = 0;
struct rcu_data *rdp; struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp); struct rcu_node *rnp = rcu_get_root(rsp);
struct swait_queue_head *sq;
WRITE_ONCE(rsp->gp_activity, jiffies); WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp); raw_spin_lock_irq_rcu_node(rnp);
@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */ /* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp); nocb += rcu_future_gp_cleanup(rsp, rnp);
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq(&rnp->lock); raw_spin_unlock_irq(&rnp->lock);
rcu_nocb_gp_cleanup(sq);
cond_resched_rcu_qs(); cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies); WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay); rcu_gp_slow(rsp, gp_cleanup_delay);
@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum), READ_ONCE(rsp->gpnum),
TPS("reqwait")); TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS; rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq, swait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) & READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT); RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS; rsp->gp_state = RCU_GP_DONE_GPS;
@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum), READ_ONCE(rsp->gpnum),
TPS("fqswait")); TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS; rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq, ret = swait_event_interruptible_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j); rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS; rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */ /* Locking provides needed memory barriers. */
@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
rcu_gp_kthread_wake(rsp); swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
} }
/* /*
@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
} }
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags); raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
rcu_gp_kthread_wake(rsp); swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
} }
/* /*
@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_unlock_irqrestore(&rnp->lock, flags); raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (wake) { if (wake) {
smp_mb(); /* EGP done before wake_up(). */ smp_mb(); /* EGP done before wake_up(). */
wake_up(&rsp->expedited_wq); swake_up(&rsp->expedited_wq);
} }
break; break;
} }
@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies; jiffies_start = jiffies;
for (;;) { for (;;) {
ret = wait_event_interruptible_timeout( ret = swait_event_timeout(
rsp->expedited_wq, rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root), sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall); jiffies_stall);
@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
return; return;
if (ret < 0) { if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */ /* Hit a signal, disable CPU stall warnings. */
wait_event(rsp->expedited_wq, swait_event(rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root)); sync_rcu_preempt_exp_done(rnp_root));
return; return;
} }
@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
} }
} }
init_waitqueue_head(&rsp->gp_wq); init_swait_queue_head(&rsp->gp_wq);
init_waitqueue_head(&rsp->expedited_wq); init_swait_queue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1]; rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
while (i > rnp->grphi) while (i > rnp->grphi)

View file

@ -27,6 +27,7 @@
#include <linux/threads.h> #include <linux/threads.h>
#include <linux/cpumask.h> #include <linux/cpumask.h>
#include <linux/seqlock.h> #include <linux/seqlock.h>
#include <linux/swait.h>
#include <linux/stop_machine.h> #include <linux/stop_machine.h>
/* /*
@ -243,7 +244,7 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */ /* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */ /* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU #ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq[2]; struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */ /* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2]; int need_future_gp[2];
@ -399,7 +400,7 @@ struct rcu_data {
atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail; struct rcu_head **nocb_follower_tail;
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread; struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@ -478,7 +479,7 @@ struct rcu_state {
unsigned long gpnum; /* Current gp number. */ unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */ unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */ struct task_struct *gp_kthread; /* Task for grace periods. */
wait_queue_head_t gp_wq; /* Where GP task waits. */ struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */ short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */ short gp_state; /* GP kthread sleep state. */
@ -506,7 +507,7 @@ struct rcu_state {
unsigned long expedited_sequence; /* Take a ticket. */ unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */
wait_queue_head_t expedited_wq; /* Wait for check-ins. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */ int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */ unsigned long jiffies_force_qs; /* Time at which to invoke */
@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void); static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags); bool lazy, unsigned long flags);

View file

@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period. * grace period.
*/ */
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{ {
wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); swake_up_all(sq);
} }
/* /*
@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
} }
/*
 * Return the address of the @rnp->nocb_gp_wq[] element selected by the
 * low-order bit of @rnp->completed.
 */
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
	int idx = rnp->completed & 0x1;

	return &rnp->nocb_gp_wq[idx];
}
static void rcu_init_one_nocb(struct rcu_node *rnp) static void rcu_init_one_nocb(struct rcu_node *rnp)
{ {
init_waitqueue_head(&rnp->nocb_gp_wq[0]); init_swait_queue_head(&rnp->nocb_gp_wq[0]);
init_waitqueue_head(&rnp->nocb_gp_wq[1]); init_swait_queue_head(&rnp->nocb_gp_wq[1]);
} }
#ifndef CONFIG_RCU_NOCB_CPU_ALL #ifndef CONFIG_RCU_NOCB_CPU_ALL
@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */ /* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
wake_up(&rdp_leader->nocb_wq); swake_up(&rdp_leader->nocb_wq);
} }
} }
@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/ */
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) { for (;;) {
wait_event_interruptible( swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1], rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d)) if (likely(d))
@ -2097,7 +2102,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
/* Wait for callbacks to appear. */ /* Wait for callbacks to appear. */
if (!rcu_nocb_poll) { if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq, swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep)); !READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */ /* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) { } else if (firsttime) {
@ -2172,7 +2177,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
* List was empty, wake up the follower. * List was empty, wake up the follower.
* Memory barriers supplied by atomic_long_add(). * Memory barriers supplied by atomic_long_add().
*/ */
wake_up(&rdp->nocb_wq); swake_up(&rdp->nocb_wq);
} }
} }
@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
if (!rcu_nocb_poll) { if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep"); "FollowerSleep");
wait_event_interruptible(rdp->nocb_wq, swait_event_interruptible(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head)); READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) { } else if (firsttime) {
/* Don't drown trace log with "Poll"! */ /* Don't drown trace log with "Poll"! */
@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void)
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{ {
rdp->nocb_tail = &rdp->nocb_head; rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq); init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head; rdp->nocb_follower_tail = &rdp->nocb_follower_head;
} }
@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
return false; return false;
} }
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{ {
} }
@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{ {
} }
/*
 * Variant of rcu_nocb_gp_get() that has no wait queue to hand out: @rnp is
 * ignored and NULL is always returned.
 */
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return NULL;
}
static void rcu_init_one_nocb(struct rcu_node *rnp) static void rcu_init_one_nocb(struct rcu_node *rnp)
{ {
} }

View file

@ -13,7 +13,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o obj-y += wait.o swait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHEDSTATS) += stats.o

View file

@ -67,12 +67,10 @@
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
#include <linux/tick.h> #include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/ftrace.h> #include <linux/ftrace.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/init_task.h> #include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h> #include <linux/context_tracking.h>
#include <linux/compiler.h> #include <linux/compiler.h>
@ -125,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
#undef SCHED_FEAT #undef SCHED_FEAT
#ifdef CONFIG_SCHED_DEBUG
/*
 * Table of scheduler feature names. While "features.h" is included below,
 * SCHED_FEAT(name, enabled) expands to the stringified name followed by a
 * comma, so every SCHED_FEAT() use in that header contributes one string.
 */
#define SCHED_FEAT(name, enabled) \
#name ,
static const char * const sched_feat_names[] = {
#include "features.h"
};
/* Drop the temporary expansion so SCHED_FEAT can be given another meaning. */
#undef SCHED_FEAT
/*
 * seq_file show routine: print every scheduler feature name on a single
 * line, each followed by a space. A feature whose bit is clear in
 * sysctl_sched_features is printed with a "NO_" prefix. Always returns 0.
 */
static int sched_feat_show(struct seq_file *m, void *v)
{
	int feat = 0;

	while (feat < __SCHED_FEAT_NR) {
		unsigned long mask = 1UL << feat;

		/* Cleared bit: mark the feature as disabled. */
		if ((sysctl_sched_features & mask) == 0)
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[feat]);
		feat++;
	}
	seq_puts(m, "\n");

	return 0;
}
#ifdef HAVE_JUMP_LABEL
/*
 * One static_key per scheduler feature. SCHED_FEAT(name, enabled) pastes its
 * second argument onto "jump_label_key__", which the two helper macros map
 * to STATIC_KEY_INIT_TRUE or STATIC_KEY_INIT_FALSE; including "features.h"
 * therefore emits one initializer per SCHED_FEAT() entry.
 */
#define jump_label_key__true STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE
#define SCHED_FEAT(name, enabled) \
jump_label_key__##enabled ,
struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};
/* Drop the temporary expansion of SCHED_FEAT. */
#undef SCHED_FEAT
/* Disable the static key stored at index @i of sched_feat_keys[]. */
static void sched_feat_disable(int i)
{
static_key_disable(&sched_feat_keys[i]);
}
/* Enable the static key stored at index @i of sched_feat_keys[]. */
static void sched_feat_enable(int i)
{
static_key_enable(&sched_feat_keys[i]);
}
#else
/*
 * No-op variants for builds without HAVE_JUMP_LABEL: there is no static key
 * to flip, so @i is ignored. (No ';' after the bodies: an empty declaration
 * at file scope is not valid ISO C.)
 */
static void sched_feat_disable(int i) { }
static void sched_feat_enable(int i) { }
#endif /* HAVE_JUMP_LABEL */
/*
 * Enable or disable the scheduler feature named by @cmp.
 *
 * A leading "NO_" selects "disable" and is skipped before the name is
 * compared against sched_feat_names[]. On a match the corresponding bit of
 * sysctl_sched_features is updated and the matching enable/disable helper
 * is called.
 *
 * Returns the index of the matched feature, or __SCHED_FEAT_NR when no
 * name matched.
 */
static int sched_feat_set(char *cmp)
{
	int idx;
	int negate = 0;

	if (!strncmp(cmp, "NO_", 3)) {
		negate = 1;
		cmp += 3;
	}

	for (idx = 0; idx < __SCHED_FEAT_NR; idx++) {
		if (strcmp(cmp, sched_feat_names[idx]) != 0)
			continue;

		if (negate) {
			sysctl_sched_features &= ~(1UL << idx);
			sched_feat_disable(idx);
		} else {
			sysctl_sched_features |= (1UL << idx);
			sched_feat_enable(idx);
		}
		break;
	}

	return idx;
}
/*
 * Write handler (see sched_feat_fops): take one feature name from user
 * space and apply it through sched_feat_set().
 *
 * At most 63 bytes of @ubuf are consumed. Returns the number of bytes
 * consumed, -EFAULT if the copy from user space fails, or -EINVAL if
 * sched_feat_set() found no matching feature (returned __SCHED_FEAT_NR).
 */
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
char *cmp;
int i;
struct inode *inode;
/* Clamp so that the terminating NUL written below fits in buf[]. */
if (cnt > 63)
cnt = 63;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
/* NOTE(review): strstrip() presumably trims surrounding whitespace - confirm. */
cmp = strstrip(buf);
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
inode_lock(inode);
i = sched_feat_set(cmp);
inode_unlock(inode);
/* sched_feat_set() returns __SCHED_FEAT_NR when no name matched. */
if (i == __SCHED_FEAT_NR)
return -EINVAL;
*ppos += cnt;
return cnt;
}
/*
 * Open handler: pass sched_feat_show() to single_open() with no private
 * data (NULL) and return its result.
 */
static int sched_feat_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_feat_show, NULL);
}
/*
 * File operations for the scheduler-features file: open and write use the
 * handlers defined in this file; read, llseek and release are delegated to
 * seq_read, seq_lseek and single_release.
 */
static const struct file_operations sched_feat_fops = {
.open = sched_feat_open,
.write = sched_feat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*
 * Create the "sched_features" debugfs file (mode 0644) backed by
 * sched_feat_fops. The parent argument is NULL - presumably this places the
 * file at the debugfs root; confirm. The result of debugfs_create_file() is
 * not checked and the function always returns 0.
 */
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
return 0;
}
/* Registered through late_initcall(). */
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */
/* /*
* Number of tasks to iterate in a single balance run. * Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled. * Limited because this is done with IRQs disabled.
@ -2094,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu); ttwu_queue(p, cpu);
stat: stat:
ttwu_stat(p, cpu, wake_flags); if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
out: out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@ -2142,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p)
ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0); ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0); if (schedstat_enabled())
ttwu_stat(p, smp_processor_id(), 0);
out: out:
raw_spin_unlock(&p->pi_lock); raw_spin_unlock(&p->pi_lock);
} }
@ -2184,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0; dl_se->dl_bw = 0;
dl_se->dl_throttled = 0; dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
dl_se->dl_yielded = 0; dl_se->dl_yielded = 0;
} }
@ -2211,6 +2078,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif #endif
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics)); memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif #endif
@ -2219,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p); __dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list); INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers); INIT_HLIST_HEAD(&p->preempt_notifiers);
@ -2282,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif #endif
#endif #endif
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
#ifdef CONFIG_SCHEDSTATS
/*
 * Switch the sched_schedstats static branch on when @enabled is true and
 * off otherwise.
 */
static void set_schedstats(bool enabled)
{
	if (!enabled) {
		static_branch_disable(&sched_schedstats);
		return;
	}

	static_branch_enable(&sched_schedstats);
}
/*
 * Turn schedstats on if it is not already enabled, logging an informational
 * message when the state actually changes. Does nothing when
 * schedstat_enabled() already reports true.
 */
void force_schedstat_enabled(void)
{
	if (schedstat_enabled())
		return;

	pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
	static_branch_enable(&sched_schedstats);
}
/*
 * Handler for the "schedstats=" boot parameter.
 *
 * "enable" and "disable" switch schedstats on or off through
 * set_schedstats(). A NULL or any other value leaves the state untouched
 * and logs a warning.
 *
 * Returns 1 when the value was recognised, 0 otherwise.
 */
static int __init setup_schedstats(char *str)
{
	int parsed = 0;

	if (str) {
		if (strcmp(str, "enable") == 0) {
			set_schedstats(true);
			parsed = 1;
		} else if (strcmp(str, "disable") == 0) {
			set_schedstats(false);
			parsed = 1;
		}
	}

	if (parsed == 0)
		pr_warn("Unable to parse schedstats=\n");

	return parsed;
}
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
/*
 * sysctl handler that reports and sets the state of the sched_schedstats
 * static branch.
 *
 * The branch state is sampled into a local int, and a local copy of @table
 * is pointed at that int so that proc_dointvec_minmax() operates on the
 * local value rather than on @table->data. After a successful write the
 * parsed value is applied with set_schedstats().
 *
 * Returns -EPERM for a write without CAP_SYS_ADMIN, otherwise the return
 * value of proc_dointvec_minmax().
 */
int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
int state = static_branch_likely(&sched_schedstats);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* Work on a copy so the caller's table is never modified. */
t = *table;
t.data = &state;
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
/* Only a write changes the branch; a read just reported 'state'. */
if (write)
set_schedstats(state);
return err;
}
#endif
#endif
/* /*
* fork()/clone()-time setup: * fork()/clone()-time setup:
*/ */
@ -3011,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void)
} }
#endif #endif
/*
 * Pick the address to report as the parent IP.
 *
 * @addr is returned unchanged unless in_lock_functions() says it lies in
 * the lock functions. In that case CALLER_ADDR2 is tried next, and if that
 * is also in the lock functions CALLER_ADDR3 is returned.
 */
notrace unsigned long get_parent_ip(unsigned long addr)
{
	if (!in_lock_functions(addr))
		return addr;

	addr = CALLER_ADDR2;
	if (!in_lock_functions(addr))
		return addr;

	return CALLER_ADDR3;
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER)) defined(CONFIG_PREEMPT_TRACER))
@ -3042,7 +2967,7 @@ void preempt_count_add(int val)
PREEMPT_MASK - 10); PREEMPT_MASK - 10);
#endif #endif
if (preempt_count() == val) { if (preempt_count() == val) {
unsigned long ip = get_parent_ip(CALLER_ADDR1); unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip; current->preempt_disable_ip = ip;
#endif #endif
@ -3069,7 +2994,7 @@ void preempt_count_sub(int val)
#endif #endif
if (preempt_count() == val) if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
__preempt_count_sub(val); __preempt_count_sub(val);
} }
EXPORT_SYMBOL(preempt_count_sub); EXPORT_SYMBOL(preempt_count_sub);
@ -3281,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next); trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */ rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else { } else {
lockdep_unpin_lock(&rq->lock); lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock); raw_spin_unlock_irq(&rq->lock);
@ -3467,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function);
*/ */
void rt_mutex_setprio(struct task_struct *p, int prio) void rt_mutex_setprio(struct task_struct *p, int prio)
{ {
int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq; struct rq *rq;
const struct sched_class *prev_class; const struct sched_class *prev_class;
@ -3495,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio); trace_sched_pi_setprio(p, prio);
oldprio = p->prio; oldprio = p->prio;
if (oldprio == prio)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class; prev_class = p->sched_class;
queued = task_on_rq_queued(p); queued = task_on_rq_queued(p);
running = task_current(rq, p); running = task_current(rq, p);
if (queued) if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, queue_flag);
if (running) if (running)
put_prev_task(rq, p); put_prev_task(rq, p);
@ -3517,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) || if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1; p->dl.dl_boosted = 1;
enqueue_flag |= ENQUEUE_REPLENISH; queue_flag |= ENQUEUE_REPLENISH;
} else } else
p->dl.dl_boosted = 0; p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class; p->sched_class = &dl_sched_class;
@ -3525,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio)) if (dl_prio(oldprio))
p->dl.dl_boosted = 0; p->dl.dl_boosted = 0;
if (oldprio < prio) if (oldprio < prio)
enqueue_flag |= ENQUEUE_HEAD; queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class; p->sched_class = &rt_sched_class;
} else { } else {
if (dl_prio(oldprio)) if (dl_prio(oldprio))
@ -3540,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running) if (running)
p->sched_class->set_curr_task(rq); p->sched_class->set_curr_task(rq);
if (queued) if (queued)
enqueue_task(rq, p, enqueue_flag); enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio);
out_unlock: out_unlock:
@ -3896,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class; const struct sched_class *prev_class;
struct rq *rq; struct rq *rq;
int reset_on_fork; int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* may grab non-irq protected spin_locks */ /* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt()); BUG_ON(in_interrupt());
@ -4078,17 +4007,14 @@ static int __sched_setscheduler(struct task_struct *p,
* itself. * itself.
*/ */
new_effective_prio = rt_mutex_get_effective_prio(p, newprio); new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
if (new_effective_prio == oldprio) { if (new_effective_prio == oldprio)
__setscheduler_params(p, attr); queue_flags &= ~DEQUEUE_MOVE;
task_rq_unlock(rq, p, &flags);
return 0;
}
} }
queued = task_on_rq_queued(p); queued = task_on_rq_queued(p);
running = task_current(rq, p); running = task_current(rq, p);
if (queued) if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, queue_flags);
if (running) if (running)
put_prev_task(rq, p); put_prev_task(rq, p);
@ -4098,15 +4024,14 @@ static int __sched_setscheduler(struct task_struct *p,
if (running) if (running)
p->sched_class->set_curr_task(rq); p->sched_class->set_curr_task(rq);
if (queued) { if (queued) {
int enqueue_flags = ENQUEUE_RESTORE;
/* /*
* We enqueue to tail when the priority of a task is * We enqueue to tail when the priority of a task is
* increased (user space view). * increased (user space view).
*/ */
if (oldprio <= p->prio) if (oldprio < p->prio)
enqueue_flags |= ENQUEUE_HEAD; queue_flags |= ENQUEUE_HEAD;
enqueue_task(rq, p, enqueue_flags); enqueue_task(rq, p, queue_flags);
} }
check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio);
@ -5408,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq)
} }
#endif /* CONFIG_HOTPLUG_CPU */ #endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
/*
 * "sched_domain" directory entry (mode 0555) followed by an empty
 * terminator. No child is set here; register_sched_domain_sysctl() assigns
 * sd_ctl_dir[0].child.
 */
static struct ctl_table sd_ctl_dir[] = {
{
.procname = "sched_domain",
.mode = 0555,
},
{}
};
/*
 * Root table: a "kernel" directory (mode 0555) whose child is sd_ctl_dir,
 * followed by an empty terminator.
 */
static struct ctl_table sd_ctl_root[] = {
{
.procname = "kernel",
.mode = 0555,
.child = sd_ctl_dir,
},
{}
};
/*
 * Allocate an array of @n ctl_table entries with kcalloc(GFP_KERNEL).
 * Returns whatever kcalloc() returns, i.e. NULL on allocation failure.
 */
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
	return kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
}
/*
 * Free the ctl_table array at *@tablep, recursing into child tables first,
 * and reset *@tablep to NULL.
 */
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
	struct ctl_table *table = *tablep;
	struct ctl_table *cur = table;

	/*
	 * Intermediate directories have a dynamically allocated child table
	 * and procname, either of which may have failed to allocate, but
	 * their mode is always set - so mode marks the end of the array.
	 * Leaf entries use static name strings and all have a proc handler,
	 * so only entries without a handler own their procname.
	 */
	while (cur->mode) {
		if (cur->child)
			sd_free_ctl_entry(&cur->child);
		if (!cur->proc_handler)
			kfree(cur->procname);
		cur++;
	}

	kfree(table);
	*tablep = NULL;
}
/* Bounds attached through extra1/extra2 to entries created with load_idx set. */
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;

/*
 * Fill in one ctl_table slot with the given name, data pointer, length,
 * mode and handler. When @load_idx is true the entry's extra1/extra2 are
 * additionally pointed at min_load_idx/max_load_idx.
 */
static void
set_table_entry(struct ctl_table *entry,
		const char *procname, void *data, int maxlen,
		umode_t mode, proc_handler *proc_handler,
		bool load_idx)
{
	entry->procname		= procname;
	entry->data		= data;
	entry->maxlen		= maxlen;
	entry->mode		= mode;
	entry->proc_handler	= proc_handler;

	if (!load_idx)
		return;

	entry->extra1 = &min_load_idx;
	entry->extra2 = &max_load_idx;
}
/*
 * Allocate and fill the sysctl table that exposes the fields of one
 * sched_domain. Returns the table, or NULL if the allocation fails.
 *
 * 14 entries are requested: 13 are filled in below and the last stays as
 * allocated to act as the terminator. The five *_idx entries pass
 * load_idx=true to set_table_entry(); "name" is the only entry created with
 * mode 0444, all others use 0644.
 */
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost",
&sd->max_newidle_lb_cost,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
return table;
}
/*
 * Build the sysctl directory table for one CPU: a "domainN" directory (mode
 * 0555) for each sched_domain visited by for_each_domain(@cpu), with one
 * extra slot left over as the terminator. Returns NULL if the table itself
 * cannot be allocated.
 *
 * The results of kstrdup() and sd_alloc_ctl_domain_table() are not checked
 * here, so a returned entry may carry a NULL procname or child.
 */
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
int domain_num = 0, i;
char buf[32];
/* First pass: count the domains to size the table. */
for_each_domain(cpu, sd)
domain_num++;
entry = table = sd_alloc_ctl_entry(domain_num + 1);
if (table == NULL)
return NULL;
i = 0;
/* Second pass: fill one directory entry per domain. */
for_each_domain(cpu, sd) {
snprintf(buf, 32, "domain%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_domain_table(sd);
entry++;
i++;
}
return table;
}
/* Header returned by register_sysctl_table() for the sd_ctl_root tree. */
static struct ctl_table_header *sd_sysctl_header;
/*
 * Build one "cpuN" directory (mode 0555) per possible CPU, hang the array
 * off sd_ctl_dir[0].child and register the sd_ctl_root tree.
 *
 * If the top-level allocation fails the function returns without
 * registering anything. WARN_ON() fires if a child table or a header from a
 * previous registration is still installed.
 */
static void register_sched_domain_sysctl(void)
{
int i, cpu_num = num_possible_cpus();
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
char buf[32];
WARN_ON(sd_ctl_dir[0].child);
/* Assigned before the NULL check, so a failed allocation leaves child NULL. */
sd_ctl_dir[0].child = entry;
if (entry == NULL)
return;
for_each_possible_cpu(i) {
snprintf(buf, 32, "cpu%d", i);
/* kstrdup() and the per-CPU table allocation are not checked here. */
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_cpu_table(i);
entry++;
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
/*
 * Unregister the sysctl tree and free the tables hanging off
 * sd_ctl_dir[0].child, if any.
 *
 * May be called multiple times per register: the header is reset to NULL
 * and the child table is only freed when it is non-NULL.
 * NOTE(review): a repeated call passes a NULL header to
 * unregister_sysctl_table(); presumably that is tolerated - confirm.
 */
static void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#else
/*
 * No-op variants for builds where CONFIG_SCHED_DEBUG && CONFIG_SYSCTL does
 * not hold: there are no sched_domain sysctl tables to build or tear down.
 */
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq) static void set_rq_online(struct rq *rq)
{ {
if (!rq->online) { if (!rq->online) {
@ -6176,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* Setup the mask of cpus configured for isolated domains */ /* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str) static int __init isolated_cpu_setup(char *str)
{ {
int ret;
alloc_bootmem_cpumask_var(&cpu_isolated_map); alloc_bootmem_cpumask_var(&cpu_isolated_map);
cpulist_parse(str, cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
return 0;
}
return 1; return 1;
} }
__setup("isolcpus=", isolated_cpu_setup); __setup("isolcpus=", isolated_cpu_setup);
struct s_data { struct s_data {
@ -7863,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg)
void sched_offline_group(struct task_group *tg) void sched_offline_group(struct task_group *tg)
{ {
unsigned long flags; unsigned long flags;
int i;
/* end participation in shares distribution */ /* end participation in shares distribution */
for_each_possible_cpu(i) unregister_fair_sched_group(tg);
unregister_fair_sched_group(tg, i);
spin_lock_irqsave(&task_group_lock, flags); spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list); list_del_rcu(&tg->list);
@ -7893,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk); queued = task_on_rq_queued(tsk);
if (queued) if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE); dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running)) if (unlikely(running))
put_prev_task(rq, tsk); put_prev_task(rq, tsk);
@ -7917,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running)) if (unlikely(running))
tsk->sched_class->set_curr_task(rq); tsk->sched_class->set_curr_task(rq);
if (queued) if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE); enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags); task_rq_unlock(rq, tsk, &flags);
} }

View file

@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) { if (static_key_false(&paravirt_steal_enabled)) {
u64 steal; u64 steal;
cputime_t steal_ct; unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id()); steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time; steal -= this_rq()->prev_steal_time;
/* /*
* cputime_t may be less precise than nsecs (eg: if it's * steal is in nsecs but our caller is expecting steal
* based on jiffies). Lets cast the result to cputime * time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds. * granularity and account the rest on the next rounds.
*/ */
steal_ct = nsecs_to_cputime(steal); steal_jiffies = nsecs_to_jiffies(steal);
this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
account_steal_time(steal_ct); account_steal_time(jiffies_to_cputime(steal_jiffies));
return steal_ct; return steal_jiffies;
} }
#endif #endif
return false; return false;
@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static unsigned long long vtime_delta(struct task_struct *tsk) static cputime_t vtime_delta(struct task_struct *tsk)
{ {
unsigned long long clock; unsigned long now = READ_ONCE(jiffies);
clock = local_clock(); if (time_before(now, (unsigned long)tsk->vtime_snap))
if (clock < tsk->vtime_snap)
return 0; return 0;
return clock - tsk->vtime_snap; return jiffies_to_cputime(now - tsk->vtime_snap);
} }
static cputime_t get_vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk)
{ {
unsigned long long delta = vtime_delta(tsk); unsigned long now = READ_ONCE(jiffies);
unsigned long delta = now - tsk->vtime_snap;
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap += delta; tsk->vtime_snap = now;
/* CHECKME: always safe to convert nsecs to cputime? */ return jiffies_to_cputime(delta);
return nsecs_to_cputime(delta);
} }
static void __vtime_account_system(struct task_struct *tsk) static void __vtime_account_system(struct task_struct *tsk)
@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk)
{ {
if (!vtime_delta(tsk))
return;
write_seqcount_begin(&tsk->vtime_seqcount); write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk); __vtime_account_system(tsk);
write_seqcount_end(&tsk->vtime_seqcount); write_seqcount_end(&tsk->vtime_seqcount);
@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
void vtime_gen_account_irq_exit(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk)
{ {
write_seqcount_begin(&tsk->vtime_seqcount); write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk); if (vtime_delta(tsk))
__vtime_account_system(tsk);
if (context_tracking_in_user()) if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER; tsk->vtime_snap_whence = VTIME_USER;
write_seqcount_end(&tsk->vtime_seqcount); write_seqcount_end(&tsk->vtime_seqcount);
@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
cputime_t delta_cpu; cputime_t delta_cpu;
write_seqcount_begin(&tsk->vtime_seqcount); write_seqcount_begin(&tsk->vtime_seqcount);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS; tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); if (vtime_delta(tsk)) {
delta_cpu = get_vtime_delta(tsk);
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
}
write_seqcount_end(&tsk->vtime_seqcount); write_seqcount_end(&tsk->vtime_seqcount);
} }
void vtime_user_enter(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk)
{ {
write_seqcount_begin(&tsk->vtime_seqcount); write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk); if (vtime_delta(tsk))
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER; tsk->vtime_snap_whence = VTIME_USER;
write_seqcount_end(&tsk->vtime_seqcount); write_seqcount_end(&tsk->vtime_seqcount);
} }
@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta. * that can thus safely catch up with a tickless delta.
*/ */
write_seqcount_begin(&tsk->vtime_seqcount); write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk); if (vtime_delta(tsk))
__vtime_account_system(tsk);
current->flags |= PF_VCPU; current->flags |= PF_VCPU;
write_seqcount_end(&tsk->vtime_seqcount); write_seqcount_end(&tsk->vtime_seqcount);
} }
@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
write_seqcount_begin(&current->vtime_seqcount); write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS; current->vtime_snap_whence = VTIME_SYS;
current->vtime_snap = sched_clock_cpu(smp_processor_id()); current->vtime_snap = jiffies;
write_seqcount_end(&current->vtime_seqcount); write_seqcount_end(&current->vtime_seqcount);
} }
@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags); local_irq_save(flags);
write_seqcount_begin(&t->vtime_seqcount); write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS; t->vtime_snap_whence = VTIME_SYS;
t->vtime_snap = sched_clock_cpu(cpu); t->vtime_snap = jiffies;
write_seqcount_end(&t->vtime_seqcount); write_seqcount_end(&t->vtime_seqcount);
local_irq_restore(flags); local_irq_restore(flags);
} }

View file

@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq); struct rq *rq = rq_of_dl_rq(dl_rq);
WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
* We are racing with the deadline timer. So, do nothing because
* the deadline timer handler will take care of properly recharging
* the runtime and postponing the deadline
*/
if (dl_se->dl_throttled)
return;
/* /*
* We use the regular wall clock time to set deadlines in the * We use the regular wall clock time to set deadlines in the
@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
*/ */
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime; dl_se->runtime = pi_se->dl_runtime;
dl_se->dl_new = 0;
} }
/* /*
@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->runtime = pi_se->dl_runtime; dl_se->runtime = pi_se->dl_runtime;
} }
if (dl_se->dl_yielded && dl_se->runtime > 0)
dl_se->runtime = 0;
/* /*
* We keep moving the deadline away until we get some * We keep moving the deadline away until we get some
* available runtime for the entity. This ensures correct * available runtime for the entity. This ensures correct
@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq); struct rq *rq = rq_of_dl_rq(dl_rq);
/*
* The arrival of a new instance needs special treatment, i.e.,
* the actual scheduling parameters have to be "renewed".
*/
if (dl_se->dl_new) {
setup_new_dl_entity(dl_se, pi_se);
return;
}
if (dl_time_before(dl_se->deadline, rq_clock(rq)) || if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@ -604,16 +605,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock; goto unlock;
} }
/*
* This is possible if switched_from_dl() raced against a running
* callback that took the above !dl_task() path and we've since then
* switched back into SCHED_DEADLINE.
*
* There's nothing to do except drop our task reference.
*/
if (dl_se->dl_new)
goto unlock;
/* /*
* The task might have been boosted by someone else and might be in the * The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled. * boosting/deboosting path, its not throttled.
@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
* approach need further study. * approach need further study.
*/ */
delta_exec = rq_clock_task(rq) - curr->se.exec_start; delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0)) if (unlikely((s64)delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return; return;
}
schedstat_set(curr->se.statistics.exec_max, schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec)); max(curr->se.statistics.exec_max, delta_exec));
@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec); sched_rt_avg_update(rq, delta_exec);
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; dl_se->runtime -= delta_exec;
if (dl_runtime_exceeded(dl_se)) {
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1; dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0); __dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise, * parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime. * we want a replenishment of its runtime.
*/ */
if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) if (flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se); update_dl_entity(dl_se, pi_se);
else if (flags & ENQUEUE_REPLENISH) else if (flags & ENQUEUE_REPLENISH)
replenish_dl_entity(dl_se, pi_se); replenish_dl_entity(dl_se, pi_se);
@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/ */
static void yield_task_dl(struct rq *rq) static void yield_task_dl(struct rq *rq)
{ {
struct task_struct *p = rq->curr;
/* /*
* We make the task go to sleep until its current deadline by * We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops * forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it * it and the bandwidth timer will wake it up and will give it
* new scheduling parameters (thanks to dl_yielded=1). * new scheduling parameters (thanks to dl_yielded=1).
*/ */
if (p->dl.runtime > 0) { rq->curr->dl.dl_yielded = 1;
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_rq_clock(rq); update_rq_clock(rq);
update_curr_dl(rq); update_curr_dl(rq);
/* /*
@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/ */
static void switched_to_dl(struct rq *rq, struct task_struct *p) static void switched_to_dl(struct rq *rq, struct task_struct *p)
{ {
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl, &p->dl);
if (task_on_rq_queued(p) && rq->curr != p) { if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/ */
resched_curr(rq); resched_curr(rq);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
} else }
switched_to_dl(rq, p);
} }
const struct sched_class dl_sched_class = { const struct sched_class dl_sched_class = {

View file

@ -16,6 +16,7 @@
#include <linux/kallsyms.h> #include <linux/kallsyms.h>
#include <linux/utsname.h> #include <linux/utsname.h>
#include <linux/mempolicy.h> #include <linux/mempolicy.h>
#include <linux/debugfs.h>
#include "sched.h" #include "sched.h"
@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x) #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
/*
 * Table of scheduler feature names. While this SCHED_FEAT() definition is
 * active, every SCHED_FEAT(name, enabled) entry pulled in from features.h
 * expands to the stringified name followed by a comma; the "enabled"
 * argument is not used by this expansion. The macro is undefined again
 * right after the table so it can be redefined for other expansions.
 */
#define SCHED_FEAT(name, enabled) \
#name ,
static const char * const sched_feat_names[] = {
#include "features.h"
};
#undef SCHED_FEAT
/*
 * seq_file show callback: print every scheduler feature name on a single
 * line, each followed by a space. A feature whose bit is clear in
 * sysctl_sched_features is printed with a "NO_" prefix. Always returns 0.
 */
static int sched_feat_show(struct seq_file *m, void *v)
{
	int feat = 0;

	while (feat < __SCHED_FEAT_NR) {
		unsigned long mask = 1UL << feat;

		if ((sysctl_sched_features & mask) == 0)
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[feat]);
		feat++;
	}
	seq_puts(m, "\n");

	return 0;
}
#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

/*
 * One static key per scheduler feature. Each SCHED_FEAT(name, enabled)
 * entry from features.h expands to the initializer selected by its
 * "enabled" argument (true/false).
 */
#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

/* Disable the static key that backs feature index @i. */
static void sched_feat_disable(int i)
{
	static_key_disable(&sched_feat_keys[i]);
}

/* Enable the static key that backs feature index @i. */
static void sched_feat_enable(int i)
{
	static_key_enable(&sched_feat_keys[i]);
}
#else
/*
 * Without jump labels there is no static key to flip, so these are
 * no-ops. (No trailing ';' after the bodies: an empty declaration at
 * file scope is not valid ISO C.)
 */
static void sched_feat_disable(int i) { }
static void sched_feat_enable(int i) { }
#endif /* HAVE_JUMP_LABEL */
/*
 * Enable or disable the scheduler feature named by @cmp.
 *
 * A leading "NO_" selects "disable" and is skipped before the name is
 * compared against sched_feat_names[]. On a match the corresponding bit
 * in sysctl_sched_features is updated and sched_feat_enable() or
 * sched_feat_disable() is called for that index.
 *
 * Returns the index of the matched feature, or __SCHED_FEAT_NR when no
 * name matched.
 */
static int sched_feat_set(char *cmp)
{
	int negate = (strncmp(cmp, "NO_", 3) == 0);
	int feat;

	if (negate)
		cmp += 3;

	for (feat = 0; feat < __SCHED_FEAT_NR; feat++) {
		if (strcmp(cmp, sched_feat_names[feat]) != 0)
			continue;

		if (negate) {
			sysctl_sched_features &= ~(1UL << feat);
			sched_feat_disable(feat);
		} else {
			sysctl_sched_features |= (1UL << feat);
			sched_feat_enable(feat);
		}
		return feat;
	}

	return __SCHED_FEAT_NR;
}
/*
 * write handler for the sched_features file.
 *
 * Copies at most 63 bytes from @ubuf into a local buffer, NUL-terminates
 * and strips it, then hands the resulting name to sched_feat_set() while
 * holding the inode lock of @filp.
 *
 * Returns the (possibly clamped) byte count on success and advances
 * *@ppos by it, -EFAULT if the user copy fails, or -EINVAL if the name
 * did not match any feature.
 */
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	struct inode *inode;
	char *name;
	int feat;

	/* Leave room for the terminating NUL. */
	if (cnt >= sizeof(buf))
		cnt = sizeof(buf) - 1;

	if (copy_from_user(buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = '\0';
	name = strstrip(buf);

	/* Ensure the static_key remains in a consistent state */
	inode = file_inode(filp);
	inode_lock(inode);
	feat = sched_feat_set(name);
	inode_unlock(inode);

	if (feat == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}
/*
 * open handler for the sched_features file: hands @filp to single_open()
 * with sched_feat_show as the show callback and NULL as the data pointer,
 * and returns whatever single_open() returns.
 */
static int sched_feat_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_feat_show, NULL);
}
/*
 * File operations for the sched_features file: open and write use the
 * handlers defined in this file; read, llseek and release use the
 * seq_file helpers that pair with single_open().
 */
static const struct file_operations sched_feat_fops = {
.open = sched_feat_open,
.write = sched_feat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*
 * Create the "sched_features" debugfs file (mode 0644) backed by
 * sched_feat_fops, passing NULL for both the parent and the data
 * argument. The return value of debugfs_create_file() is not checked;
 * this function always returns 0. Registered as a late_initcall.
 */
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
return 0;
}
late_initcall(sched_init_debug);
#ifdef CONFIG_SMP
#ifdef CONFIG_SYSCTL
/*
 * "sched_domain" directory entry (mode 0555). Its .child pointer is left
 * unset here; register_sched_domain_sysctl() fills it in and
 * unregister_sched_domain_sysctl() frees and clears it again.
 * The empty {} entry terminates the table.
 */
static struct ctl_table sd_ctl_dir[] = {
{
.procname = "sched_domain",
.mode = 0555,
},
{}
};
/*
 * Root table handed to register_sysctl_table(): a "kernel" directory
 * (mode 0555) whose child is sd_ctl_dir above.
 */
static struct ctl_table sd_ctl_root[] = {
{
.procname = "kernel",
.mode = 0555,
.child = sd_ctl_dir,
},
{}
};
/*
 * Allocate a zero-filled array of @n ctl_table entries with GFP_KERNEL.
 * Returns the array, or whatever kcalloc() returns on failure.
 */
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
	return kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
}
/*
 * Recursively free the ctl_table array that *@tablep points to and set
 * *@tablep to NULL afterwards.
 */
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
	struct ctl_table *cur = *tablep;

	/*
	 * Intermediate directories have a dynamically allocated child table
	 * and procname, either of which may have failed to allocate, but
	 * their mode is always set - so a zero mode marks the end of the
	 * array. Entries in the lowest directory use static name strings
	 * and always have a proc handler, so only names of entries without
	 * a handler are freed.
	 */
	while (cur->mode) {
		if (cur->child != NULL)
			sd_free_ctl_entry(&cur->child);
		if (!cur->proc_handler)
			kfree(cur->procname);
		cur++;
	}

	kfree(*tablep);
	*tablep = NULL;
}
/* Bounds installed as extra1/extra2 for the *_idx entries. */
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;

/*
 * Fill in one ctl_table @entry with the given name, data pointer, length,
 * mode and handler. When @load_idx is true the entry additionally gets
 * min_load_idx/max_load_idx as its extra1/extra2 limits; otherwise those
 * fields are left untouched.
 */
static void
set_table_entry(struct ctl_table *entry,
		const char *procname, void *data, int maxlen,
		umode_t mode, proc_handler *proc_handler,
		bool load_idx)
{
	entry->procname = procname;
	entry->data = data;
	entry->maxlen = maxlen;
	entry->mode = mode;
	entry->proc_handler = proc_handler;

	if (!load_idx)
		return;

	entry->extra1 = &min_load_idx;
	entry->extra2 = &max_load_idx;
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost",
&sd->max_newidle_lb_cost,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
return table;
}
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
int domain_num = 0, i;
char buf[32];
for_each_domain(cpu, sd)
domain_num++;
entry = table = sd_alloc_ctl_entry(domain_num + 1);
if (table == NULL)
return NULL;
i = 0;
for_each_domain(cpu, sd) {
snprintf(buf, 32, "domain%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_domain_table(sd);
entry++;
i++;
}
return table;
}
/* Header returned by register_sysctl_table(); NULL while unregistered. */
static struct ctl_table_header *sd_sysctl_header;

/*
 * Build and register the sched_domain sysctl tree: one "cpuN" directory
 * (mode 0555) per possible CPU, hung below sd_ctl_dir[0], then register
 * sd_ctl_root. Warns if a child table or a header is already present.
 * Returns silently without registering if the top-level array cannot be
 * allocated.
 */
void register_sched_domain_sysctl(void)
{
	int cpu;
	int nr_cpus = num_possible_cpus();
	struct ctl_table *slot = sd_alloc_ctl_entry(nr_cpus + 1);
	char name[32];

	WARN_ON(sd_ctl_dir[0].child);
	sd_ctl_dir[0].child = slot;

	if (!slot)
		return;

	for_each_possible_cpu(cpu) {
		snprintf(name, sizeof(name), "cpu%d", cpu);
		slot->procname = kstrdup(name, GFP_KERNEL);
		slot->mode = 0555;
		slot->child = sd_alloc_ctl_cpu_table(cpu);
		slot++;
	}

	WARN_ON(sd_sysctl_header);
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
/*
 * Unregister the sched_domain sysctl tree and free the tables hanging
 * off sd_ctl_dir[0]. May be called multiple times per register: the
 * header is reset to NULL and the child table is only freed when one
 * is present.
 */
void unregister_sched_domain_sysctl(void)
{
	struct ctl_table **childp = &sd_ctl_dir[0].child;

	unregister_sysctl_table(sd_sysctl_header);
	sd_sysctl_header = NULL;

	if (*childp != NULL)
		sd_free_ctl_entry(childp);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{ {
@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->vruntime); PN(se->vruntime);
PN(se->sum_exec_runtime); PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
PN(se->statistics.wait_start); if (schedstat_enabled()) {
PN(se->statistics.sleep_start); PN(se->statistics.wait_start);
PN(se->statistics.block_start); PN(se->statistics.sleep_start);
PN(se->statistics.sleep_max); PN(se->statistics.block_start);
PN(se->statistics.block_max); PN(se->statistics.sleep_max);
PN(se->statistics.exec_max); PN(se->statistics.block_max);
PN(se->statistics.slice_max); PN(se->statistics.exec_max);
PN(se->statistics.wait_max); PN(se->statistics.slice_max);
PN(se->statistics.wait_sum); PN(se->statistics.wait_max);
P(se->statistics.wait_count); PN(se->statistics.wait_sum);
P(se->statistics.wait_count);
}
#endif #endif
P(se->load.weight); P(se->load.weight);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw), (long long)(p->nvcsw + p->nivcsw),
p->prio); p->prio);
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", if (schedstat_enabled()) {
SPLIT_NS(p->se.statistics.wait_sum), SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.statistics.sum_sleep_runtime)); SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
}
#else #else
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L, 0LL, 0L,
@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{ {
struct dl_bw *dl_bw;
SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
#ifdef CONFIG_SMP
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
#endif
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
} }
extern __read_mostly int sched_clock_running; extern __read_mostly int sched_clock_running;
@ -313,17 +630,18 @@ do { \
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P(yld_count);
P(sched_count);
P(sched_goidle);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
P64(avg_idle); P64(avg_idle);
P64(max_idle_balance_cost); P64(max_idle_balance_cost);
#endif #endif
P(ttwu_count); if (schedstat_enabled()) {
P(ttwu_local); P(yld_count);
P(sched_count);
P(sched_goidle);
P(ttwu_count);
P(ttwu_local);
}
#undef P #undef P
#undef P64 #undef P64
@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw; nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
PN(se.statistics.sleep_max);
PN(se.statistics.block_max);
PN(se.statistics.exec_max);
PN(se.statistics.slice_max);
PN(se.statistics.wait_max);
PN(se.statistics.wait_sum);
P(se.statistics.wait_count);
PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count);
P(se.nr_migrations); P(se.nr_migrations);
P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine);
P(se.statistics.nr_failed_migrations_running);
P(se.statistics.nr_failed_migrations_hot);
P(se.statistics.nr_forced_migrations);
P(se.statistics.nr_wakeups);
P(se.statistics.nr_wakeups_sync);
P(se.statistics.nr_wakeups_migrate);
P(se.statistics.nr_wakeups_local);
P(se.statistics.nr_wakeups_remote);
P(se.statistics.nr_wakeups_affine);
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
{ if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu; u64 avg_atom, avg_per_cpu;
PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
PN(se.statistics.sleep_max);
PN(se.statistics.block_max);
PN(se.statistics.exec_max);
PN(se.statistics.slice_max);
PN(se.statistics.wait_max);
PN(se.statistics.wait_sum);
P(se.statistics.wait_count);
PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count);
P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine);
P(se.statistics.nr_failed_migrations_running);
P(se.statistics.nr_failed_migrations_hot);
P(se.statistics.nr_forced_migrations);
P(se.statistics.nr_wakeups);
P(se.statistics.nr_wakeups_sync);
P(se.statistics.nr_wakeups_migrate);
P(se.statistics.nr_wakeups_local);
P(se.statistics.nr_wakeups_remote);
P(se.statistics.nr_wakeups_affine);
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime; avg_atom = p->se.sum_exec_runtime;
if (nr_switches) if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches); avg_atom = div64_ul(avg_atom, nr_switches);

View file

@ -20,8 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/ */
#include <linux/latencytop.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/latencytop.h>
#include <linux/cpumask.h> #include <linux/cpumask.h>
#include <linux/cpuidle.h> #include <linux/cpuidle.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -755,7 +755,9 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
struct task_struct *p; struct task_struct *p;
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; u64 delta;
delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
if (entity_is_task(se)) { if (entity_is_task(se)) {
p = task_of(se); p = task_of(se);
@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->statistics.wait_sum += delta; se->statistics.wait_sum += delta;
se->statistics.wait_start = 0; se->statistics.wait_start = 0;
} }
#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
#endif
/* /*
* Task is being enqueued - update stats: * Task is being enqueued - update stats:
*/ */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
/* /*
* Are we enqueueing a waiting task? (for current tasks * Are we enqueueing a waiting task? (for current tasks
@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} }
static inline void static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{ {
/* /*
* Mark the end of the wait period if dequeueing a * Mark the end of the wait period if dequeueing a
@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/ */
if (se != cfs_rq->curr) if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se); update_stats_wait_end(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
}
} }
#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
}
#endif
/* /*
* We are picking a new current task - update its stats: * We are picking a new current task - update its stats:
@ -907,10 +932,11 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */ spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks; int nr_tasks;
pid_t gid; pid_t gid;
int active_nodes;
struct rcu_head rcu; struct rcu_head rcu;
nodemask_t active_nodes;
unsigned long total_faults; unsigned long total_faults;
unsigned long max_faults_cpu;
/* /*
* Faults_cpu is used to decide whether memory should move * Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted * towards the CPU. As a consequence, these stats are weighted
@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
} }
/*
 * A node counts as part of a numa group's pseudo-interleaving set when it
 * triggers more than 1/ACTIVE_NODE_FRACTION (i.e. 1/3) as many NUMA
 * faults as the group's maximum. Migrations between such nodes are
 * slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3

/*
 * Return true when the CPU faults recorded for @nid in @ng, scaled by
 * ACTIVE_NODE_FRACTION, exceed the group's max_faults_cpu.
 */
static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	unsigned long scaled_faults;

	scaled_faults = group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION;

	return scaled_faults > ng->max_faults_cpu;
}
/* Handle placement on systems where not all nodes are directly connected. */ /* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid, static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task) int maxdist, bool task)
@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
return true; return true;
/* /*
* Do not migrate if the destination is not a node that * Destination node is much more heavily used than the source
* is actively used by this numa group. * node? Allow migration.
*/ */
if (!node_isset(dst_nid, ng->active_nodes)) if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
return false; ACTIVE_NODE_FRACTION)
/*
* Source is a node that is not actively used by this
* numa group, while the destination is. Migrate.
*/
if (!node_isset(src_nid, ng->active_nodes))
return true; return true;
/* /*
* Both source and destination are nodes in active * Distribute memory according to CPU & memory use on each node,
* use by this numa group. Maximize memory bandwidth * with 3/4 hysteresis to avoid unnecessary memory migrations:
* by migrating from more heavily used groups, to less *
* heavily used ones, spreading the load around. * faults_cpu(dst) 3 faults_cpu(src)
* Use a 1/4 hysteresis to avoid spurious page movement. * --------------- * - > ---------------
* faults_mem(dst) 4 faults_mem(src)
*/ */
return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
} }
static unsigned long weighted_cpuload(const int cpu); static unsigned long weighted_cpuload(const int cpu);
@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_task = NULL, .best_task = NULL,
.best_imp = 0, .best_imp = 0,
.best_cpu = -1 .best_cpu = -1,
}; };
struct sched_domain *sd; struct sched_domain *sd;
unsigned long taskweight, groupweight; unsigned long taskweight, groupweight;
@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group, * multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations. * we need to check other locations.
*/ */
if (env.best_cpu == -1 || (p->numa_group && if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) { for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid) if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue; continue;
@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here. * trying for a better one later. Do not set the preferred node here.
*/ */
if (p->numa_group) { if (p->numa_group) {
struct numa_group *ng = p->numa_group;
if (env.best_cpu == -1) if (env.best_cpu == -1)
nid = env.src_nid; nid = env.src_nid;
else else
nid = env.dst_nid; nid = env.dst_nid;
if (node_isset(nid, p->numa_group->active_nodes)) if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid); sched_setnuma(p, env.dst_nid);
} }
@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
} }
/* /*
* Find the nodes on which the workload is actively running. We do this by * Find out how many nodes on the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can * tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently * be different from the set of nodes where the workload's memory is currently
* located. * located.
*
* The bitmask is used to make smarter decisions on when to do NUMA page
* migrations, To prevent flip-flopping, and excessive page migrations, nodes
* are added when they cause over 6/16 of the maximum number of faults, but
* only removed when they drop below 3/16.
*/ */
static void update_numa_active_node_mask(struct numa_group *numa_group) static void numa_group_count_active_nodes(struct numa_group *numa_group)
{ {
unsigned long faults, max_faults = 0; unsigned long faults, max_faults = 0;
int nid; int nid, active_nodes = 0;
for_each_online_node(nid) { for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid); faults = group_faults_cpu(numa_group, nid);
@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
for_each_online_node(nid) { for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid); faults = group_faults_cpu(numa_group, nid);
if (!node_isset(nid, numa_group->active_nodes)) { if (faults * ACTIVE_NODE_FRACTION > max_faults)
if (faults > max_faults * 6 / 16) active_nodes++;
node_set(nid, numa_group->active_nodes);
} else if (faults < max_faults * 3 / 16)
node_clear(nid, numa_group->active_nodes);
} }
numa_group->max_faults_cpu = max_faults;
numa_group->active_nodes = active_nodes;
} }
/* /*
@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]); update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) { if (p->numa_group) {
update_numa_active_node_mask(p->numa_group); numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock); spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid); max_nid = preferred_group_nid(p, max_group_nid);
} }
@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
return; return;
atomic_set(&grp->refcount, 1); atomic_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock); spin_lock_init(&grp->lock);
grp->gid = p->pid; grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */ /* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids; nr_node_ids;
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i]; grp->faults[i] = p->numa_faults[i];
@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
bool migrated = flags & TNF_MIGRATED; bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current); int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL); int local = !!(flags & TNF_FAULT_LOCAL);
struct numa_group *ng;
int priv; int priv;
if (!static_branch_likely(&sched_numa_balancing)) if (!static_branch_likely(&sched_numa_balancing))
@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the * actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down. * scan rate to slow down when a workload has settled down.
*/ */
if (!priv && !local && p->numa_group && ng = p->numa_group;
node_isset(cpu_node, p->numa_group->active_nodes) && if (!priv && !local && ng && ng->active_nodes > 1 &&
node_isset(mem_node, p->numa_group->active_nodes)) numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
local = 1; local = 1;
task_numa_placement(p); task_numa_placement(p);
@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
/*
 * Warn (once) when a tracepoint that depends on schedstats is active
 * while schedstats are disabled.
 *
 * Compiles to an empty function without CONFIG_SCHEDSTATS. With it, the
 * function returns immediately if schedstat_enabled() is true; otherwise
 * it checks the sched_stat_{wait,sleep,iowait,blocked,runtime}
 * tracepoints and prints a one-time warning if any of them is enabled.
 * It does not change the schedstats setting itself.
 */
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
	if (schedstat_enabled())
		return;

	/*
	 * Only warn here; schedstats are not switched on. The boot option
	 * value is "enable" (not "enabled"), so spell it that way to give
	 * the user something they can paste verbatim.
	 */
	if (trace_sched_stat_wait_enabled()    ||
	    trace_sched_stat_sleep_enabled()   ||
	    trace_sched_stat_iowait_enabled()  ||
	    trace_sched_stat_blocked_enabled() ||
	    trace_sched_stat_runtime_enabled()) {
		pr_warn_once("Scheduler tracepoints stat_wait, stat_sleep, "
			     "stat_iowait, stat_blocked and stat_runtime "
			     "require the kernel parameter schedstats=enable "
			     "or kernel.sched_schedstats=1\n");
	}
#endif
}
static void static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{ {
@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP) { if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0); place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se); if (schedstat_enabled())
enqueue_sleeper(cfs_rq, se);
} }
update_stats_enqueue(cfs_rq, se); check_schedstat_required();
check_spread(cfs_rq, se); if (schedstat_enabled()) {
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
if (se != cfs_rq->curr) if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se); __enqueue_entity(cfs_rq, se);
se->on_rq = 1; se->on_rq = 1;
@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq); update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se); dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se); if (schedstat_enabled())
if (flags & DEQUEUE_SLEEP) { update_stats_dequeue(cfs_rq, se, flags);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
clear_buddies(cfs_rq, se); clear_buddies(cfs_rq, se);
@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the * a CPU. So account for the time it spent waiting on the
* runqueue. * runqueue.
*/ */
update_stats_wait_end(cfs_rq, se); if (schedstat_enabled())
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se); __dequeue_entity(cfs_rq, se);
update_load_avg(se, 1); update_load_avg(se, 1);
} }
@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it * least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around): * when there are only lesser-weight tasks around):
*/ */
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max, se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime); se->sum_exec_runtime - se->prev_sum_exec_runtime);
} }
@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */ /* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq); check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev); if (schedstat_enabled()) {
check_spread(cfs_rq, prev);
if (prev->on_rq)
update_stats_wait_start(cfs_rq, prev);
}
if (prev->on_rq) { if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */ /* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev); __enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */ /* in !on_rq case, update occurred at dequeue */
@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */ /* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i] - tickless_load; old_load = this_rq->cpu_load[i];
old_load = decay_load_missed(old_load, pending_updates - 1, i); old_load = decay_load_missed(old_load, pending_updates - 1, i);
old_load += tickless_load; if (tickless_load) {
old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
/*
* old_load can never be a negative value because a
* decayed tickless_load cannot be greater than the
* original tickless_load.
*/
old_load += tickless_load;
}
new_load = this_load; new_load = this_load;
/* /*
* Round up the averaging division if load is increasing. This * Round up the averaging division if load is increasing. This
@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
} }
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
/*
 * Common NOHZ catch-up helper: fold the jiffies that elapsed since this
 * runqueue's last load update into its cpu_load[] history.
 *
 * @this_rq:      runqueue whose load history is refreshed
 * @curr_jiffies: jiffies snapshot taken by the caller
 * @load:         load value handed to __update_cpu_load()
 * @active:       passed through unchanged to __update_cpu_load()
 *
 * Nothing is done when no jiffy has elapsed since the last update.
 */
static void __update_cpu_load_nohz(struct rq *this_rq,
				   unsigned long curr_jiffies,
				   unsigned long load,
				   int active)
{
	unsigned long missed = curr_jiffies - this_rq->last_load_update_tick;

	if (!missed)
		return;

	this_rq->last_load_update_tick = curr_jiffies;
	/*
	 * In the regular NOHZ case, we were idle, this means load 0.
	 * In the NOHZ_FULL case, we were non-idle, we should consider
	 * its weighted load.
	 */
	__update_cpu_load(this_rq, load, missed, active);
}
/* /*
* There is no sane way to deal with nohz on smp when using jiffies because the * There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
* Called from nohz_idle_balance() to update the load ratings before doing the * Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance. * idle balance.
*/ */
static void update_idle_cpu_load(struct rq *this_rq) static void update_cpu_load_idle(struct rq *this_rq)
{ {
unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = weighted_cpuload(cpu_of(this_rq));
unsigned long pending_updates;
/* /*
* bail if there's load or we're actually up-to-date. * bail if there's load or we're actually up-to-date.
*/ */
if (load || curr_jiffies == this_rq->last_load_update_tick) if (weighted_cpuload(cpu_of(this_rq)))
return; return;
pending_updates = curr_jiffies - this_rq->last_load_update_tick; __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
this_rq->last_load_update_tick = curr_jiffies;
__update_cpu_load(this_rq, load, pending_updates, 0);
} }
/* /*
@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active)
struct rq *this_rq = this_rq(); struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
unsigned long pending_updates;
if (curr_jiffies == this_rq->last_load_update_tick) if (curr_jiffies == this_rq->last_load_update_tick)
return; return;
raw_spin_lock(&this_rq->lock); raw_spin_lock(&this_rq->lock);
pending_updates = curr_jiffies - this_rq->last_load_update_tick; __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
if (pending_updates) {
this_rq->last_load_update_tick = curr_jiffies;
/*
* In the regular NOHZ case, we were idle, this means load 0.
* In the NOHZ_FULL case, we were non-idle, we should consider
* its weighted load.
*/
__update_cpu_load(this_rq, load, pending_updates, active);
}
raw_spin_unlock(&this_rq->lock); raw_spin_unlock(&this_rq->lock);
} }
#endif /* CONFIG_NO_HZ */ #endif /* CONFIG_NO_HZ */
@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
{ {
unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long load = weighted_cpuload(cpu_of(this_rq));
/* /*
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
*/ */
this_rq->last_load_update_tick = jiffies; this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, load, 1, 1); __update_cpu_load(this_rq, load, 1, 1);
@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) { if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock); raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq); update_rq_clock(rq);
update_idle_cpu_load(rq); update_cpu_load_idle(rq);
raw_spin_unlock_irq(&rq->lock); raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE); rebalance_domains(rq, CPU_IDLE);
} }
@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
if (tg->cfs_rq) if (tg->cfs_rq)
kfree(tg->cfs_rq[i]); kfree(tg->cfs_rq[i]);
if (tg->se) { if (tg->se)
if (tg->se[i])
remove_entity_load_avg(tg->se[i]);
kfree(tg->se[i]); kfree(tg->se[i]);
}
} }
kfree(tg->cfs_rq); kfree(tg->cfs_rq);
@ -8286,21 +8343,29 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 0; return 0;
} }
void unregister_fair_sched_group(struct task_group *tg, int cpu) void unregister_fair_sched_group(struct task_group *tg)
{ {
struct rq *rq = cpu_rq(cpu);
unsigned long flags; unsigned long flags;
struct rq *rq;
int cpu;
/* for_each_possible_cpu(cpu) {
* Only empty task groups can be destroyed; so we can speculatively if (tg->se[cpu])
* check on_list without danger of it being re-added. remove_entity_load_avg(tg->se[cpu]);
*/
if (!tg->cfs_rq[cpu]->on_list)
return;
raw_spin_lock_irqsave(&rq->lock, flags); /*
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); * Only empty task groups can be destroyed; so we can speculatively
raw_spin_unlock_irqrestore(&rq->lock, flags); * check on_list without danger of it being re-added.
*/
if (!tg->cfs_rq[cpu]->on_list)
continue;
rq = cpu_rq(cpu);
raw_spin_lock_irqsave(&rq->lock, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
} }
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1; return 1;
} }
void unregister_fair_sched_group(struct task_group *tg, int cpu) { } void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */

View file

@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_lock(&rt_b->rt_runtime_lock); raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) { if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1; rt_b->rt_period_active = 1;
hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); /*
* SCHED_DEADLINE updates the bandwidth, as a run away
* RT task with a DL task could hog a CPU. But DL does
* not reset the period. If a deadline task was running
* without an RT task running, it can cause RT tasks to
* throttle when they start up. Kick the timer right away
* to update the period.
*/
hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
} }
raw_spin_unlock(&rt_b->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock);
@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se) static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{ {
return !list_empty(&rt_se->run_list); return rt_se->on_rq;
} }
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q; return rt_se->my_q;
} }
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{ {
@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se) if (!rt_se)
enqueue_top_rt_rq(rt_rq); enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se)) else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false); enqueue_rt_entity(rt_se, 0);
if (rt_rq->highest_prio.curr < curr->prio) if (rt_rq->highest_prio.curr < curr->prio)
resched_curr(rq); resched_curr(rq);
@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
if (!rt_se) if (!rt_se)
dequeue_top_rt_rq(rt_rq); dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se)) else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se); dequeue_rt_entity(rt_se, 0);
} }
static inline int rt_rq_throttled(struct rt_rq *rt_rq) static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
dec_rt_group(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq);
} }
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) /*
* Change rt_se->run_list location unless SAVE && !MOVE
*
* assumes ENQUEUE/DEQUEUE flags match
*/
static inline bool move_entity(unsigned int flags)
{
	const unsigned int save_move = DEQUEUE_SAVE | DEQUEUE_MOVE;

	/* Only "SAVE set, MOVE clear" keeps the entity where it is. */
	return (flags & save_move) != DEQUEUE_SAVE;
}
/*
 * Unlink @rt_se from its run list in @array. If that leaves the queue for
 * the entity's priority empty, the matching bit in the priority bitmap is
 * cleared. The entity is then marked as no longer being on a list.
 */
static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
	struct list_head *queue;

	list_del_init(&rt_se->run_list);

	queue = array->queue + rt_se_prio(rt_se);
	if (list_empty(queue))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	rt_se->on_list = 0;
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{ {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active; struct rt_prio_array *array = &rt_rq->active;
@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other * get throttled and the current group doesn't have any other
* active members. * active members.
*/ */
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
if (rt_se->on_list)
__delist_rt_entity(rt_se, array);
return; return;
}
if (head) if (move_entity(flags)) {
list_add(&rt_se->run_list, queue); WARN_ON_ONCE(rt_se->on_list);
else if (flags & ENQUEUE_HEAD)
list_add_tail(&rt_se->run_list, queue); list_add(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap); else
list_add_tail(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap);
rt_se->on_list = 1;
}
rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq); inc_rt_tasks(rt_se, rt_rq);
} }
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{ {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active; struct rt_prio_array *array = &rt_rq->active;
list_del_init(&rt_se->run_list); if (move_entity(flags)) {
if (list_empty(array->queue + rt_se_prio(rt_se))) WARN_ON_ONCE(!rt_se->on_list);
__clear_bit(rt_se_prio(rt_se), array->bitmap); __delist_rt_entity(rt_se, array);
}
rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq); dec_rt_tasks(rt_se, rt_rq);
} }
@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower * Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down. * entries, we must remove entries top - down.
*/ */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se) static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{ {
struct sched_rt_entity *back = NULL; struct sched_rt_entity *back = NULL;
@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
for (rt_se = back; rt_se; rt_se = rt_se->back) { for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se)) if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se); __dequeue_rt_entity(rt_se, flags);
} }
} }
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{ {
struct rq *rq = rq_of_rt_se(rt_se); struct rq *rq = rq_of_rt_se(rt_se);
dequeue_rt_stack(rt_se); dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head); __enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt); enqueue_top_rt_rq(&rq->rt);
} }
static void dequeue_rt_entity(struct sched_rt_entity *rt_se) static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{ {
struct rq *rq = rq_of_rt_se(rt_se); struct rq *rq = rq_of_rt_se(rt_se);
dequeue_rt_stack(rt_se); dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) { for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se); struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running) if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false); __enqueue_rt_entity(rt_se, flags);
} }
enqueue_top_rt_rq(&rq->rt); enqueue_top_rt_rq(&rq->rt);
} }
@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP) if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0; rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1) if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p); enqueue_pushable_task(rq, p);
@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
struct sched_rt_entity *rt_se = &p->rt; struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq); update_curr_rt(rq);
dequeue_rt_entity(rt_se); dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p); dequeue_pushable_task(rq, p);
} }

View file

@ -3,6 +3,7 @@
#include <linux/sched/sysctl.h> #include <linux/sched/sysctl.h>
#include <linux/sched/rt.h> #include <linux/sched/rt.h>
#include <linux/sched/deadline.h> #include <linux/sched/deadline.h>
#include <linux/binfmts.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/stop_machine.h> #include <linux/stop_machine.h>
@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg); extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void unregister_fair_sched_group(struct task_group *tg, int cpu); extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu, struct sched_entity *se, int cpu,
struct sched_entity *parent); struct sched_entity *parent);
extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@ -909,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg); extern int group_balance_cpu(struct sched_group *sg);
/*
 * Sched-domain sysctl registration hooks. The real functions are only
 * declared when both CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are enabled;
 * in every other configuration callers get empty inline stubs.
 */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
static inline void unregister_sched_domain_sysctl(void)
{
}
#endif
#else #else
static inline void sched_ttwu_pending(void) { } static inline void sched_ttwu_pending(void) { }
@ -1022,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void) static inline u64 global_rt_period(void)
{ {
@ -1130,18 +1143,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
extern const int sched_prio_to_weight[40]; extern const int sched_prio_to_weight[40];
extern const u32 sched_prio_to_wmult[40]; extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
* DEQUEUE_SLEEP - task is no longer runnable
* ENQUEUE_WAKEUP - task just became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
*
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_WAKING - sched_class::task_waking was called
*
*/
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
#define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_HEAD 0x02 #define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
#define ENQUEUE_HEAD 0x08
#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ #define ENQUEUE_WAKING 0x20
#else #else
#define ENQUEUE_WAKING 0x00 #define ENQUEUE_WAKING 0x00
#endif #endif
#define ENQUEUE_REPLENISH 0x08
#define ENQUEUE_RESTORE 0x10
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02
#define RETRY_TASK ((void *)-1UL) #define RETRY_TASK ((void *)-1UL)

View file

@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq) if (rq)
rq->rq_sched_info.run_delay += delta; rq->rq_sched_info.run_delay += delta;
} }
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
# define schedstat_set(var, val) do { var = (val); } while (0) # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */ #else /* !CONFIG_SCHEDSTATS */
static inline void static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta) rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta) rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{} {}
# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0) # define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0) # define schedstat_set(var, val) do { } while (0)

123
kernel/sched/swait.c Normal file
View file

@ -0,0 +1,123 @@
#include <linux/sched.h>
#include <linux/swait.h>
/*
 * Initialise a simple wait queue head.
 *
 * @q:    queue head to set up
 * @name: lockdep name attached to the queue's lock
 * @key:  lockdep class key attached to the queue's lock
 *
 * After this call the waiter list is empty and the raw spinlock is
 * initialised and carries the given lockdep class and name.
 */
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
			     struct lock_class_key *key)
{
	INIT_LIST_HEAD(&q->task_list);

	/* The lockdep class is applied once the lock itself is initialised. */
	raw_spin_lock_init(&q->lock);
	lockdep_set_class_and_name(&q->lock, key, name);
}
EXPORT_SYMBOL(__init_swait_queue_head);
/*
* The thing about the wake_up_state() return value; I think we can ignore it.
*
* If for some reason it would return 0, that means the previously waiting
* task is already running, so it will observe condition true (or has already).
*/
void swake_up_locked(struct swait_queue_head *q)
{
struct swait_queue *curr;
if (list_empty(&q->task_list))
return;
curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
wake_up_process(curr->task);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
/*
 * Wake at most one waiter on @q.
 *
 * The queue is first probed with swait_active() without taking the lock;
 * only when that reports waiters is q->lock taken (IRQ state saved and
 * restored) around swake_up_locked().
 */
void swake_up(struct swait_queue_head *q)
{
	unsigned long irqflags;

	if (swait_active(q)) {
		raw_spin_lock_irqsave(&q->lock, irqflags);
		swake_up_locked(q);
		raw_spin_unlock_irqrestore(&q->lock, irqflags);
	}
}
EXPORT_SYMBOL(swake_up);
/*
* Does not allow usage from IRQ disabled, since we must be able to
* release IRQs to guarantee bounded hold time.
*/
void swake_up_all(struct swait_queue_head *q)
{
struct swait_queue *curr;
LIST_HEAD(tmp);
if (!swait_active(q))
return;
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
curr = list_first_entry(&tmp, typeof(*curr), task_list);
wake_up_state(curr->task, TASK_NORMAL);
list_del_init(&curr->task_list);
if (list_empty(&tmp))
break;
raw_spin_unlock_irq(&q->lock);
raw_spin_lock_irq(&q->lock);
}
raw_spin_unlock_irq(&q->lock);
}
EXPORT_SYMBOL(swake_up_all);
/*
 * Record the current task in @wait and, unless @wait is already linked on
 * some list, add it at the head of @q's waiter list.
 *
 * No locking is done here; prepare_to_swait() in this file calls it with
 * q->lock held.
 */
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
	wait->task = current;

	/* Already queued: leave the entry where it is. */
	if (!list_empty(&wait->task_list))
		return;

	list_add(&wait->task_list, &q->task_list);
}
/*
 * Queue @wait on @q for the current task and switch the task to @state.
 *
 * Both steps happen under q->lock with the IRQ state saved and restored;
 * the entry is queued before the task state is changed.
 */
void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
	unsigned long irqflags;

	raw_spin_lock_irqsave(&q->lock, irqflags);

	__prepare_to_swait(q, wait);
	set_current_state(state);

	raw_spin_unlock_irqrestore(&q->lock, irqflags);
}
EXPORT_SYMBOL(prepare_to_swait);
/*
 * Signal-aware variant of prepare_to_swait().
 *
 * Returns -ERESTARTSYS without queueing anything when
 * signal_pending_state() reports a pending signal for @state on the
 * current task; otherwise queues @wait via prepare_to_swait() and
 * returns 0.
 */
long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
	if (!signal_pending_state(state, current)) {
		prepare_to_swait(q, wait, state);
		return 0;
	}

	return -ERESTARTSYS;
}
EXPORT_SYMBOL(prepare_to_swait_event);
/*
 * Put the current task back into TASK_RUNNING and unlink @wait if it is
 * still on a list.
 *
 * Takes no lock itself and does not use @q.
 * NOTE(review): presumably the caller is expected to hold q->lock, as with
 * __prepare_to_swait() - confirm against callers.
 */
void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
	__set_current_state(TASK_RUNNING);

	if (list_empty(&wait->task_list))
		return;

	list_del_init(&wait->task_list);
}
/*
 * End a wait started with prepare_to_swait(): mark the current task
 * TASK_RUNNING and make sure @wait is no longer queued on @q.
 *
 * The entry is first checked with list_empty_careful() without taking
 * q->lock; the lock (IRQ state saved and restored) is only taken when the
 * entry still appears to be linked and has to be removed.
 */
void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
	unsigned long irqflags;

	__set_current_state(TASK_RUNNING);

	/* Already unlinked (e.g. by a waker): nothing left to do. */
	if (list_empty_careful(&wait->task_list))
		return;

	raw_spin_lock_irqsave(&q->lock, irqflags);
	list_del_init(&wait->task_list);
	raw_spin_unlock_irqrestore(&q->lock, irqflags);
}
EXPORT_SYMBOL(finish_swait);

View file

@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
if (preempt_count() == cnt) { if (preempt_count() == cnt) {
#ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); current->preempt_disable_ip = get_lock_parent_ip();
#endif #endif
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
} }
} }
EXPORT_SYMBOL(__local_bh_disable_ip); EXPORT_SYMBOL(__local_bh_disable_ip);

View file

@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
}, },
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
.extra1 = &zero,
.extra2 = &one,
},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
{ {
@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
.data = &latencytop_enabled, .data = &latencytop_enabled,
.maxlen = sizeof(int), .maxlen = sizeof(int),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = sysctl_latencytop,
}, },
#endif #endif
#ifdef CONFIG_BLK_DEV_INITRD #ifdef CONFIG_BLK_DEV_INITRD

View file

@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{ {
struct mm_struct *mm; struct mm_struct *mm;
/* convert pages-usec to Mbyte-usec */ /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; do_div(stats->coremem, 1000 * KB);
stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
do_div(stats->virtmem, 1000 * KB);
mm = get_task_mm(p); mm = get_task_mm(p);
if (mm) { if (mm) {
/* adjust to KB unit */ /* adjust to KB unit */
@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
static void __acct_update_integrals(struct task_struct *tsk, static void __acct_update_integrals(struct task_struct *tsk,
cputime_t utime, cputime_t stime) cputime_t utime, cputime_t stime)
{ {
if (likely(tsk->mm)) { cputime_t time, dtime;
cputime_t time, dtime; u64 delta;
struct timeval value;
unsigned long flags;
u64 delta;
local_irq_save(flags); if (!likely(tsk->mm))
time = stime + utime; return;
dtime = time - tsk->acct_timexpd;
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
delta = delta * USEC_PER_SEC + value.tv_usec;
if (delta == 0) time = stime + utime;
goto out; dtime = time - tsk->acct_timexpd;
tsk->acct_timexpd = time; /* Avoid division: cputime_t is often in nanoseconds already. */
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); delta = cputime_to_nsecs(dtime);
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
out: if (delta < TICK_NSEC)
local_irq_restore(flags); return;
}
tsk->acct_timexpd = time;
/*
* Divide by 1024 to avoid overflow, and to avoid division.
* The final unit reported to userspace is Mbyte-usecs,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
} }
/** /**
@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
void acct_update_integrals(struct task_struct *tsk) void acct_update_integrals(struct task_struct *tsk)
{ {
cputime_t utime, stime; cputime_t utime, stime;
unsigned long flags;
local_irq_save(flags);
task_cputime(tsk, &utime, &stime); task_cputime(tsk, &utime, &stime);
__acct_update_integrals(tsk, utime, stime); __acct_update_integrals(tsk, utime, stime);
local_irq_restore(flags);
} }
/** /**

View file

@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
* This memory barrier pairs with prepare_to_wait's set_current_state() * This memory barrier pairs with prepare_to_wait's set_current_state()
*/ */
smp_mb(); smp_mb();
if (waitqueue_active(&vcpu->wq)) if (swait_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq); swake_up(&vcpu->wq);
mmput(mm); mmput(mm);
kvm_put_kvm(vcpu->kvm); kvm_put_kvm(vcpu->kvm);

View file

@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm; vcpu->kvm = kvm;
vcpu->vcpu_id = id; vcpu->vcpu_id = id;
vcpu->pid = NULL; vcpu->pid = NULL;
vcpu->halt_poll_ns = 0; init_swait_queue_head(&vcpu->wq);
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu); kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1; vcpu->pre_pcpu = -1;
@ -1993,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
void kvm_vcpu_block(struct kvm_vcpu *vcpu) void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{ {
ktime_t start, cur; ktime_t start, cur;
DEFINE_WAIT(wait); DECLARE_SWAITQUEUE(wait);
bool waited = false; bool waited = false;
u64 block_ns; u64 block_ns;
@ -2018,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_blocking(vcpu); kvm_arch_vcpu_blocking(vcpu);
for (;;) { for (;;) {
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
if (kvm_vcpu_check_block(vcpu) < 0) if (kvm_vcpu_check_block(vcpu) < 0)
break; break;
@ -2027,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
schedule(); schedule();
} }
finish_wait(&vcpu->wq, &wait); finish_swait(&vcpu->wq, &wait);
cur = ktime_get(); cur = ktime_get();
kvm_arch_vcpu_unblocking(vcpu); kvm_arch_vcpu_unblocking(vcpu);
@ -2059,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{ {
int me; int me;
int cpu = vcpu->cpu; int cpu = vcpu->cpu;
wait_queue_head_t *wqp; struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu); wqp = kvm_arch_vcpu_wq(vcpu);
if (waitqueue_active(wqp)) { if (swait_active(wqp)) {
wake_up_interruptible(wqp); swake_up(wqp);
++vcpu->stat.halt_wakeup; ++vcpu->stat.halt_wakeup;
} }
@ -2164,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue; continue;
if (vcpu == me) if (vcpu == me)
continue; continue;
if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
continue; continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue; continue;