Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (27 commits)
  sched: Use correct macro to display sched_child_runs_first in /proc/sched_debug
  sched: No need for bootmem special cases
  sched: Revert nohz_ratelimit() for now
  sched: Reduce update_group_power() calls
  sched: Update rq->clock for nohz balanced cpus
  sched: Fix spelling of sibling
  sched, cpuset: Drop __cpuexit from cpu hotplug callbacks
  sched: Fix the racy usage of thread_group_cputimer() in fastpath_timer_check()
  sched: run_posix_cpu_timers: Don't check ->exit_state, use lock_task_sighand()
  sched: thread_group_cputime: Simplify, document the "alive" check
  sched: Remove the obsolete exit_state/signal hacks
  sched: task_tick_rt: Remove the obsolete ->signal != NULL check
  sched: __sched_setscheduler: Read the RLIMIT_RTPRIO value lockless
  sched: Fix comments to make them DocBook happy
  sched: Fix fix_small_capacity
  powerpc: Exclude arch_sd_sibiling_asym_packing() on UP
  powerpc: Enable asymmetric SMT scheduling on POWER7
  sched: Add asymmetric group packing option for sibling domain
  sched: Fix capacity calculations for SMT4
  sched: Change nohz idle load balancing logic to push model
  ...
commit c4efd6b569
28 changed files with 878 additions and 411 deletions
@@ -82,7 +82,7 @@ unsigned long ftrace_return_to_handler(unsigned long retval0,
 	unsigned long ret;
 
 	pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
+	trace.rettime = local_clock();
 	ftrace_graph_return(&trace);
 
 	if (unlikely(!ret)) {
@@ -126,7 +126,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	calltime = cpu_clock(raw_smp_processor_id());
+	calltime = local_clock();
 
 	if (push_return_trace(old, calltime,
 			      self_addr, &trace.depth) == -EBUSY) {
@@ -197,6 +197,7 @@ extern const char *powerpc_base_platform;
 #define CPU_FTR_SAO			LONG_ASM_CONST(0x0020000000000000)
 #define CPU_FTR_CP_USE_DCBTZ		LONG_ASM_CONST(0x0040000000000000)
 #define CPU_FTR_UNALIGNED_LD_STD	LONG_ASM_CONST(0x0080000000000000)
+#define CPU_FTR_ASYM_SMT		LONG_ASM_CONST(0x0100000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -412,7 +413,7 @@ extern const char *powerpc_base_platform;
 	CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
 	CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
-	CPU_FTR_DSCR | CPU_FTR_SAO)
+	CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -1299,3 +1299,14 @@ unsigned long randomize_et_dyn(unsigned long base)
 
 	return ret;
 }
+
+#ifdef CONFIG_SMP
+int arch_sd_sibling_asym_packing(void)
+{
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		return SD_ASYM_PACKING;
+	}
+	return 0;
+}
+#endif
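The hunk above is the architecture side of a weak-symbol hook: the scheduler core (later in this diff, in the sched_fair.c changes) supplies a `__weak` default that returns 0, and an architecture may link in a strong definition such as this powerpc one to turn SD_ASYM_PACKING on. A minimal userspace sketch of the same linker mechanism, with an invented function name and assuming GCC-style `__attribute__((weak))`:

```c
/* weak_hook_demo.c - build with: cc -o weak_hook_demo weak_hook_demo.c */
#include <stdio.h>

#define SD_ASYM_PACKING 0x0800	/* flag value introduced by this merge */

/*
 * Generic weak default, analogous to the scheduler's weak
 * arch hook: it returns 0, so asymmetric packing stays off unless an
 * architecture provides its own definition.
 */
__attribute__((weak)) int arch_asym_packing_flag(void)
{
	return 0 * SD_ASYM_PACKING;
}

/*
 * If another object file linked into the program defined a non-weak
 * arch_asym_packing_flag() returning SD_ASYM_PACKING (as the powerpc
 * hunk above does when CPU_FTR_ASYM_SMT is set), the linker would pick
 * that strong definition instead of this weak one.
 */
int main(void)
{
	printf("asym packing flag: %#x\n", arch_asym_packing_flag());
	return 0;
}
```

Linking a second object with a non-weak definition silently replaces the default, which is exactly how the powerpc override takes effect without any registration code.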
@@ -48,6 +48,31 @@ extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
+/*
+ * CPU notifier priorities.
+ */
+enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier. CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE. During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
+	/* migration should happen before other stuff but after perf */
+	CPU_PRI_PERF		= 20,
+	CPU_PRI_MIGRATION	= 10,
+};
+
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
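These priorities only matter because the notifier chain invokes callbacks in descending priority order, so on CPU_ONLINE the scheduler activates the CPU before cpusets rebuild domains, and perf (20) runs ahead of migration (10). A small userspace model of that ordering (the callback names are only illustrative; `perf_cpu_notify` in particular is a stand-in):

```c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Priorities copied from the cpu.h hunk above. */
struct fake_notifier {
	const char *name;
	int priority;
};

static struct fake_notifier chain[] = {
	{ "cpuset_cpu_active",	INT_MAX - 1 },	/* CPU_PRI_CPUSET_ACTIVE */
	{ "perf_cpu_notify",	20 },		/* CPU_PRI_PERF */
	{ "migration_call",	10 },		/* CPU_PRI_MIGRATION */
	{ "sched_cpu_active",	INT_MAX },	/* CPU_PRI_SCHED_ACTIVE */
};

/* Higher priority first, mirroring how registered notifiers are ordered. */
static int by_descending_priority(const void *a, const void *b)
{
	const struct fake_notifier *na = a, *nb = b;

	return (na->priority < nb->priority) - (na->priority > nb->priority);
}

int main(void)
{
	size_t i, n = sizeof(chain) / sizeof(chain[0]);

	qsort(chain, n, sizeof(chain[0]), by_descending_priority);
	for (i = 0; i < n; i++)
		printf("%d. %s (priority %d)\n", (int)i + 1,
		       chain[i].name, chain[i].priority);
	return 0;
}
```

The printed order, sched before cpuset before perf before migration, is the guarantee the comment in the hunk is describing.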
@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
|
|||
|
||||
extern int cpuset_init(void);
|
||||
extern void cpuset_init_smp(void);
|
||||
extern void cpuset_update_active_cpus(void);
|
||||
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
|
||||
extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
|
||||
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
|
||||
|
@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
|
|||
static inline int cpuset_init(void) { return 0; }
|
||||
static inline void cpuset_init_smp(void) {}
|
||||
|
||||
static inline void cpuset_update_active_cpus(void)
|
||||
{
|
||||
partition_sched_domains(1, NULL, NULL);
|
||||
}
|
||||
|
||||
static inline void cpuset_cpus_allowed(struct task_struct *p,
|
||||
struct cpumask *mask)
|
||||
{
|
||||
|
|
|
@ -1067,7 +1067,7 @@ static inline void perf_event_disable(struct perf_event *event) { }
|
|||
#define perf_cpu_notifier(fn) \
|
||||
do { \
|
||||
static struct notifier_block fn##_nb __cpuinitdata = \
|
||||
{ .notifier_call = fn, .priority = 20 }; \
|
||||
{ .notifier_call = fn, .priority = CPU_PRI_PERF }; \
|
||||
fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \
|
||||
(void *)(unsigned long)smp_processor_id()); \
|
||||
fn(&fn##_nb, (unsigned long)CPU_STARTING, \
|
||||
|
|
|
@ -272,19 +272,10 @@ extern int runqueue_is_locked(int cpu);
|
|||
|
||||
extern cpumask_var_t nohz_cpu_mask;
|
||||
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
|
||||
extern int select_nohz_load_balancer(int cpu);
|
||||
extern int get_nohz_load_balancer(void);
|
||||
extern int nohz_ratelimit(int cpu);
|
||||
extern void select_nohz_load_balancer(int stop_tick);
|
||||
extern int get_nohz_timer_target(void);
|
||||
#else
|
||||
static inline int select_nohz_load_balancer(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int nohz_ratelimit(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void select_nohz_load_balancer(int stop_tick) { }
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -801,7 +792,7 @@ enum cpu_idle_type {
|
|||
#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
|
||||
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
|
||||
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
|
||||
|
||||
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
|
||||
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
|
||||
|
||||
enum powersavings_balance_level {
|
||||
|
@ -836,6 +827,8 @@ static inline int sd_balance_for_package_power(void)
|
|||
return SD_PREFER_SIBLING;
|
||||
}
|
||||
|
||||
extern int __weak arch_sd_sibiling_asym_packing(void);
|
||||
|
||||
/*
|
||||
* Optimise SD flags for power savings:
|
||||
* SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
|
||||
|
@ -857,7 +850,7 @@ struct sched_group {
|
|||
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
|
||||
* single CPU.
|
||||
*/
|
||||
unsigned int cpu_power;
|
||||
unsigned int cpu_power, cpu_power_orig;
|
||||
|
||||
/*
|
||||
* The CPUs this group covers.
|
||||
|
@ -1693,6 +1686,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
|
|||
#define PF_EXITING 0x00000004 /* getting shut down */
|
||||
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
|
||||
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
|
||||
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
|
||||
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
|
||||
#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
|
||||
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
|
||||
|
@ -1787,20 +1781,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
|
|||
#endif
|
||||
|
||||
/*
|
||||
* Architectures can set this to 1 if they have specified
|
||||
* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
|
||||
* but then during bootup it turns out that sched_clock()
|
||||
* is reliable after all:
|
||||
* Do not use outside of architecture code which knows its limitations.
|
||||
*
|
||||
* sched_clock() has no promise of monotonicity or bounded drift between
|
||||
* CPUs, use (which you should not) requires disabling IRQs.
|
||||
*
|
||||
* Please use one of the three interfaces below.
|
||||
*/
|
||||
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
extern int sched_clock_stable;
|
||||
#endif
|
||||
|
||||
/* ftrace calls sched_clock() directly */
|
||||
extern unsigned long long notrace sched_clock(void);
|
||||
/*
|
||||
* See the comment in kernel/sched_clock.c
|
||||
*/
|
||||
extern u64 cpu_clock(int cpu);
|
||||
extern u64 local_clock(void);
|
||||
extern u64 sched_clock_cpu(int cpu);
|
||||
|
||||
|
||||
extern void sched_clock_init(void);
|
||||
extern u64 sched_clock_cpu(int cpu);
|
||||
|
||||
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
static inline void sched_clock_tick(void)
|
||||
|
@ -1815,17 +1812,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
|
|||
{
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* Architectures can set this to 1 if they have specified
|
||||
* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
|
||||
* but then during bootup it turns out that sched_clock()
|
||||
* is reliable after all:
|
||||
*/
|
||||
extern int sched_clock_stable;
|
||||
|
||||
extern void sched_clock_tick(void);
|
||||
extern void sched_clock_idle_sleep_event(void);
|
||||
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
|
||||
* clock constructed from sched_clock():
|
||||
*/
|
||||
extern unsigned long long cpu_clock(int cpu);
|
||||
|
||||
extern unsigned long long
|
||||
task_sched_runtime(struct task_struct *task);
|
||||
extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
|
||||
|
|
|
@ -103,6 +103,7 @@ int arch_update_cpu_topology(void);
|
|||
| 1*SD_SHARE_PKG_RESOURCES \
|
||||
| 0*SD_SERIALIZE \
|
||||
| 0*SD_PREFER_SIBLING \
|
||||
| arch_sd_sibling_asym_packing() \
|
||||
, \
|
||||
.last_balance = jiffies, \
|
||||
.balance_interval = 1, \
|
||||
|
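The topology initializer above keeps every flag visible by writing it as `1*FLAG` or `0*FLAG` and then ORs in whatever the architecture hook returns at run time. A hedged, self-contained illustration of how such a flags word composes, using the flag values from the sched.h hunk earlier and an invented stand-in for the arch hook:

```c
#include <stdio.h>

/* Flag values from the include/linux/sched.h hunk in this merge. */
#define SD_SHARE_PKG_RESOURCES	0x0200
#define SD_SERIALIZE		0x0400
#define SD_ASYM_PACKING		0x0800
#define SD_PREFER_SIBLING	0x1000

/* Stand-in for the arch sibling-packing hook; returns the flag or 0. */
static int fake_arch_sibling_asym_packing(int has_asym_smt)
{
	return has_asym_smt ? SD_ASYM_PACKING : 0;
}

int main(void)
{
	/*
	 * Mirrors the initializer style: multiplying by 1 keeps a flag,
	 * multiplying by 0 drops it but leaves it listed for readability.
	 */
	unsigned int flags = 1 * SD_SHARE_PKG_RESOURCES
			   | 0 * SD_SERIALIZE
			   | 0 * SD_PREFER_SIBLING
			   | fake_arch_sibling_asym_packing(1);

	printf("sibling domain flags: %#x\n", flags);	/* prints 0xa00 */
	return 0;
}
```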
|
|
@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
|
|||
return -EINVAL;
|
||||
|
||||
cpu_hotplug_begin();
|
||||
set_cpu_active(cpu, false);
|
||||
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
|
||||
if (err) {
|
||||
set_cpu_active(cpu, true);
|
||||
|
||||
nr_calls--;
|
||||
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
|
||||
printk("%s: attempt to take down CPU %u failed\n",
|
||||
|
@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
|
|||
|
||||
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
|
||||
if (err) {
|
||||
set_cpu_active(cpu, true);
|
||||
/* CPU didn't die: tell everyone. Can't complain. */
|
||||
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
|
||||
|
||||
|
@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
|
|||
goto out_notify;
|
||||
BUG_ON(!cpu_online(cpu));
|
||||
|
||||
set_cpu_active(cpu, true);
|
||||
|
||||
/* Now call notifier in preparation. */
|
||||
cpu_notify(CPU_ONLINE | mod, hcpu);
|
||||
|
||||
|
|
|
@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
|
|||
* but making no active use of cpusets.
|
||||
*
|
||||
* This routine ensures that top_cpuset.cpus_allowed tracks
|
||||
* cpu_online_map on each CPU hotplug (cpuhp) event.
|
||||
* cpu_active_mask on each CPU hotplug (cpuhp) event.
|
||||
*
|
||||
* Called within get_online_cpus(). Needs to call cgroup_lock()
|
||||
* before calling generate_sched_domains().
|
||||
*/
|
||||
static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
|
||||
unsigned long phase, void *unused_cpu)
|
||||
void cpuset_update_active_cpus(void)
|
||||
{
|
||||
struct sched_domain_attr *attr;
|
||||
cpumask_var_t *doms;
|
||||
int ndoms;
|
||||
|
||||
switch (phase) {
|
||||
case CPU_ONLINE:
|
||||
case CPU_ONLINE_FROZEN:
|
||||
case CPU_DOWN_PREPARE:
|
||||
case CPU_DOWN_PREPARE_FROZEN:
|
||||
case CPU_DOWN_FAILED:
|
||||
case CPU_DOWN_FAILED_FROZEN:
|
||||
break;
|
||||
|
||||
default:
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
cgroup_lock();
|
||||
mutex_lock(&callback_mutex);
|
||||
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
|
||||
|
@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
|
|||
|
||||
/* Have scheduler rebuild the domains */
|
||||
partition_sched_domains(ndoms, doms, attr);
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
|
@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
|
|||
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
|
||||
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
|
||||
|
||||
hotcpu_notifier(cpuset_track_online_cpus, 0);
|
||||
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
|
||||
|
||||
cpuset_wq = create_singlethread_workqueue("cpuset");
|
||||
|
|
|
@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
|
|||
{
|
||||
unsigned long new_flags = p->flags;
|
||||
|
||||
new_flags &= ~PF_SUPERPRIV;
|
||||
new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
|
||||
new_flags |= PF_FORKNOEXEC;
|
||||
new_flags |= PF_STARTING;
|
||||
p->flags = new_flags;
|
||||
|
|
|
@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
|
|||
static int hrtimer_get_target(int this_cpu, int pinned)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ
|
||||
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
|
||||
int preferred_cpu = get_nohz_load_balancer();
|
||||
|
||||
if (preferred_cpu >= 0)
|
||||
return preferred_cpu;
|
||||
}
|
||||
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
|
||||
return get_nohz_timer_target();
|
||||
#endif
|
||||
return this_cpu;
|
||||
}
|
||||
|
|
|
@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
|
|||
|
||||
static inline u64 lockstat_clock(void)
|
||||
{
|
||||
return cpu_clock(smp_processor_id());
|
||||
return local_clock();
|
||||
}
|
||||
|
||||
static int lock_point(unsigned long points[], unsigned long ip)
|
||||
|
|
|
@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
|
|||
|
||||
static inline u64 perf_clock(void)
|
||||
{
|
||||
return cpu_clock(raw_smp_processor_id());
|
||||
return local_clock();
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
|
|||
|
||||
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
|
||||
{
|
||||
struct sighand_struct *sighand;
|
||||
struct signal_struct *sig;
|
||||
struct signal_struct *sig = tsk->signal;
|
||||
struct task_struct *t;
|
||||
|
||||
*times = INIT_CPUTIME;
|
||||
times->utime = sig->utime;
|
||||
times->stime = sig->stime;
|
||||
times->sum_exec_runtime = sig->sum_sched_runtime;
|
||||
|
||||
rcu_read_lock();
|
||||
sighand = rcu_dereference(tsk->sighand);
|
||||
if (!sighand)
|
||||
/* make sure we can trust tsk->thread_group list */
|
||||
if (!likely(pid_alive(tsk)))
|
||||
goto out;
|
||||
|
||||
sig = tsk->signal;
|
||||
|
||||
t = tsk;
|
||||
do {
|
||||
times->utime = cputime_add(times->utime, t->utime);
|
||||
times->stime = cputime_add(times->stime, t->stime);
|
||||
times->sum_exec_runtime += t->se.sum_exec_runtime;
|
||||
|
||||
t = next_thread(t);
|
||||
} while (t != tsk);
|
||||
|
||||
times->utime = cputime_add(times->utime, sig->utime);
|
||||
times->stime = cputime_add(times->stime, sig->stime);
|
||||
times->sum_exec_runtime += sig->sum_sched_runtime;
|
||||
} while_each_thread(tsk, t);
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
|
|||
{
|
||||
struct signal_struct *sig;
|
||||
|
||||
/* tsk == current, ensure it is safe to use ->signal/sighand */
|
||||
if (unlikely(tsk->exit_state))
|
||||
return 0;
|
||||
|
||||
if (!task_cputime_zero(&tsk->cputime_expires)) {
|
||||
struct task_cputime task_sample = {
|
||||
.utime = tsk->utime,
|
||||
|
@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
|
|||
if (sig->cputimer.running) {
|
||||
struct task_cputime group_sample;
|
||||
|
||||
thread_group_cputimer(tsk, &group_sample);
|
||||
spin_lock(&sig->cputimer.lock);
|
||||
group_sample = sig->cputimer.cputime;
|
||||
spin_unlock(&sig->cputimer.lock);
|
||||
|
||||
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
|
||||
return 1;
|
||||
}
|
||||
|
@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
|
|||
{
|
||||
LIST_HEAD(firing);
|
||||
struct k_itimer *timer, *next;
|
||||
unsigned long flags;
|
||||
|
||||
BUG_ON(!irqs_disabled());
|
||||
|
||||
|
@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
|
|||
if (!fastpath_timer_check(tsk))
|
||||
return;
|
||||
|
||||
spin_lock(&tsk->sighand->siglock);
|
||||
if (!lock_task_sighand(tsk, &flags))
|
||||
return;
|
||||
/*
|
||||
* Here we take off tsk->signal->cpu_timers[N] and
|
||||
* tsk->cpu_timers[N] all the timers that are firing, and
|
||||
|
@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
|
|||
* that gets the timer lock before we do will give it up and
|
||||
* spin until we've taken care of that timer below.
|
||||
*/
|
||||
spin_unlock(&tsk->sighand->siglock);
|
||||
unlock_task_sighand(tsk, &flags);
|
||||
|
||||
/*
|
||||
* Now that all the timers on our list have the firing flag,
|
||||
|
|
|
@ -239,8 +239,7 @@ static unsigned long
|
|||
rcu_random(struct rcu_random_state *rrsp)
|
||||
{
|
||||
if (--rrsp->rrs_count < 0) {
|
||||
rrsp->rrs_state +=
|
||||
(unsigned long)cpu_clock(raw_smp_processor_id());
|
||||
rrsp->rrs_state += (unsigned long)local_clock();
|
||||
rrsp->rrs_count = RCU_RANDOM_REFRESH;
|
||||
}
|
||||
rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
|
||||
|
|
 kernel/sched.c | 393
|
@ -77,6 +77,7 @@
|
|||
#include <asm/irq_regs.h>
|
||||
|
||||
#include "sched_cpupri.h"
|
||||
#include "workqueue_sched.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/sched.h>
|
||||
|
@ -456,9 +457,10 @@ struct rq {
|
|||
unsigned long nr_running;
|
||||
#define CPU_LOAD_IDX_MAX 5
|
||||
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
|
||||
unsigned long last_load_update_tick;
|
||||
#ifdef CONFIG_NO_HZ
|
||||
u64 nohz_stamp;
|
||||
unsigned char in_nohz_recently;
|
||||
unsigned char nohz_balance_kick;
|
||||
#endif
|
||||
unsigned int skip_clock_update;
|
||||
|
||||
|
@ -1192,6 +1194,27 @@ static void resched_cpu(int cpu)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_NO_HZ
|
||||
/*
|
||||
* In the semi idle case, use the nearest busy cpu for migrating timers
|
||||
* from an idle cpu. This is good for power-savings.
|
||||
*
|
||||
* We don't do similar optimization for completely idle system, as
|
||||
* selecting an idle cpu will add more delays to the timers than intended
|
||||
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
|
||||
*/
|
||||
int get_nohz_timer_target(void)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
int i;
|
||||
struct sched_domain *sd;
|
||||
|
||||
for_each_domain(cpu, sd) {
|
||||
for_each_cpu(i, sched_domain_span(sd))
|
||||
if (!idle_cpu(i))
|
||||
return i;
|
||||
}
|
||||
return cpu;
|
||||
}
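get_nohz_timer_target() above walks this CPU's sched domains from the smallest span outward and returns the first non-idle CPU it finds, falling back to the local CPU when everything is idle. A userspace model of that search, with domains reduced to widening CPU lists and idleness to a flag array (all values invented for illustration):

```c
#include <stdio.h>

#define NR_CPUS 8

/* 1 = idle, 0 = busy; CPU 0 is the semi-idle CPU looking for a target. */
static const int cpu_idle_flag[NR_CPUS] = {1, 1, 0, 1, 1, 1, 1, 1};

/*
 * Widening "domain" spans around CPU 0: SMT siblings, then the package,
 * then the whole machine - a stand-in for for_each_domain().
 */
static const int domain_span[][NR_CPUS + 1] = {
	{0, 1, -1},			/* sibling level */
	{0, 1, 2, 3, -1},		/* core/package level */
	{0, 1, 2, 3, 4, 5, 6, 7, -1},	/* node level */
};

static int nohz_timer_target(int this_cpu)
{
	unsigned int d;
	const int *p;

	for (d = 0; d < sizeof(domain_span) / sizeof(domain_span[0]); d++)
		for (p = domain_span[d]; *p >= 0; p++)
			if (!cpu_idle_flag[*p])
				return *p;	/* nearest busy CPU wins */

	return this_cpu;			/* everyone idle: keep it local */
}

int main(void)
{
	printf("migrate timer from CPU 0 to CPU %d\n", nohz_timer_target(0));
	return 0;
}
```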
|
||||
/*
|
||||
* When add_timer_on() enqueues a timer into the timer wheel of an
|
||||
* idle CPU then this timer might expire before the next timer event
|
||||
|
@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
|
|||
smp_send_reschedule(cpu);
|
||||
}
|
||||
|
||||
int nohz_ratelimit(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
u64 diff = rq->clock - rq->nohz_stamp;
|
||||
|
||||
rq->nohz_stamp = rq->clock;
|
||||
|
||||
return diff < (NSEC_PER_SEC / HZ) >> 1;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_NO_HZ */
|
||||
|
||||
static u64 sched_avg_period(void)
|
||||
|
@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
|
|||
if (root_task_group_empty())
|
||||
return;
|
||||
|
||||
now = cpu_clock(raw_smp_processor_id());
|
||||
now = local_clock();
|
||||
elapsed = now - sd->last_update;
|
||||
|
||||
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
|
||||
|
@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
|
|||
static void calc_load_account_idle(struct rq *this_rq);
|
||||
static void update_sysctl(void);
|
||||
static int get_update_sysctl_factor(void);
|
||||
static void update_cpu_load(struct rq *this_rq);
|
||||
|
||||
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
|
||||
{
|
||||
|
@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
|
|||
}
|
||||
#endif
|
||||
|
||||
/***
|
||||
static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
|
||||
bool is_sync, bool is_migrate, bool is_local,
|
||||
unsigned long en_flags)
|
||||
{
|
||||
schedstat_inc(p, se.statistics.nr_wakeups);
|
||||
if (is_sync)
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_sync);
|
||||
if (is_migrate)
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
|
||||
if (is_local)
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_local);
|
||||
else
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_remote);
|
||||
|
||||
activate_task(rq, p, en_flags);
|
||||
}
|
||||
|
||||
static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
|
||||
int wake_flags, bool success)
|
||||
{
|
||||
trace_sched_wakeup(p, success);
|
||||
check_preempt_curr(rq, p, wake_flags);
|
||||
|
||||
p->state = TASK_RUNNING;
|
||||
#ifdef CONFIG_SMP
|
||||
if (p->sched_class->task_woken)
|
||||
p->sched_class->task_woken(rq, p);
|
||||
|
||||
if (unlikely(rq->idle_stamp)) {
|
||||
u64 delta = rq->clock - rq->idle_stamp;
|
||||
u64 max = 2*sysctl_sched_migration_cost;
|
||||
|
||||
if (delta > max)
|
||||
rq->avg_idle = max;
|
||||
else
|
||||
update_avg(&rq->avg_idle, delta);
|
||||
rq->idle_stamp = 0;
|
||||
}
|
||||
#endif
|
||||
/* if a worker is waking up, notify workqueue */
|
||||
if ((p->flags & PF_WQ_WORKER) && success)
|
||||
wq_worker_waking_up(p, cpu_of(rq));
|
||||
}
|
||||
|
||||
/**
|
||||
* try_to_wake_up - wake up a thread
|
||||
* @p: the to-be-woken-up thread
|
||||
* @p: the thread to be awakened
|
||||
* @state: the mask of task states that can be woken
|
||||
* @sync: do a synchronous wakeup?
|
||||
* @wake_flags: wake modifier flags (WF_*)
|
||||
*
|
||||
* Put it on the run-queue if it's not already there. The "current"
|
||||
* thread is always on the run-queue (except when the actual
|
||||
|
@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
|
|||
* the simpler "current->state = TASK_RUNNING" to mark yourself
|
||||
* runnable without the overhead of this.
|
||||
*
|
||||
* returns failure only if the task is already active.
|
||||
* Returns %true if @p was woken up, %false if it was already running
|
||||
* or @state didn't match @p's state.
|
||||
*/
|
||||
static int try_to_wake_up(struct task_struct *p, unsigned int state,
|
||||
int wake_flags)
|
||||
|
@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
|
|||
|
||||
out_activate:
|
||||
#endif /* CONFIG_SMP */
|
||||
schedstat_inc(p, se.statistics.nr_wakeups);
|
||||
if (wake_flags & WF_SYNC)
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_sync);
|
||||
if (orig_cpu != cpu)
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
|
||||
if (cpu == this_cpu)
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_local);
|
||||
else
|
||||
schedstat_inc(p, se.statistics.nr_wakeups_remote);
|
||||
activate_task(rq, p, en_flags);
|
||||
ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
|
||||
cpu == this_cpu, en_flags);
|
||||
success = 1;
|
||||
|
||||
out_running:
|
||||
trace_sched_wakeup(p, success);
|
||||
check_preempt_curr(rq, p, wake_flags);
|
||||
|
||||
p->state = TASK_RUNNING;
|
||||
#ifdef CONFIG_SMP
|
||||
if (p->sched_class->task_woken)
|
||||
p->sched_class->task_woken(rq, p);
|
||||
|
||||
if (unlikely(rq->idle_stamp)) {
|
||||
u64 delta = rq->clock - rq->idle_stamp;
|
||||
u64 max = 2*sysctl_sched_migration_cost;
|
||||
|
||||
if (delta > max)
|
||||
rq->avg_idle = max;
|
||||
else
|
||||
update_avg(&rq->avg_idle, delta);
|
||||
rq->idle_stamp = 0;
|
||||
}
|
||||
#endif
|
||||
ttwu_post_activation(p, rq, wake_flags, success);
|
||||
out:
|
||||
task_rq_unlock(rq, &flags);
|
||||
put_cpu();
|
||||
|
@ -2398,6 +2430,37 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
|
|||
return success;
|
||||
}
|
||||
|
||||
/**
|
||||
* try_to_wake_up_local - try to wake up a local task with rq lock held
|
||||
* @p: the thread to be awakened
|
||||
*
|
||||
* Put @p on the run-queue if it's not alredy there. The caller must
|
||||
* ensure that this_rq() is locked, @p is bound to this_rq() and not
|
||||
* the current task. this_rq() stays locked over invocation.
|
||||
*/
|
||||
static void try_to_wake_up_local(struct task_struct *p)
|
||||
{
|
||||
struct rq *rq = task_rq(p);
|
||||
bool success = false;
|
||||
|
||||
BUG_ON(rq != this_rq());
|
||||
BUG_ON(p == current);
|
||||
lockdep_assert_held(&rq->lock);
|
||||
|
||||
if (!(p->state & TASK_NORMAL))
|
||||
return;
|
||||
|
||||
if (!p->se.on_rq) {
|
||||
if (likely(!task_running(rq, p))) {
|
||||
schedstat_inc(rq, ttwu_count);
|
||||
schedstat_inc(rq, ttwu_local);
|
||||
}
|
||||
ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
|
||||
success = true;
|
||||
}
|
||||
ttwu_post_activation(p, rq, 0, success);
|
||||
}
|
||||
|
||||
/**
|
||||
* wake_up_process - Wake up a specific process
|
||||
* @p: The process to be woken up.
|
||||
|
@ -3011,24 +3074,103 @@ static void calc_load_account_active(struct rq *this_rq)
|
|||
this_rq->calc_load_update += LOAD_FREQ;
|
||||
}
|
||||
|
||||
/*
|
||||
* The exact cpuload at various idx values, calculated at every tick would be
|
||||
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
|
||||
* on nth tick when cpu may be busy, then we have:
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* decay_load_missed() below does efficient calculation of
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
|
||||
*
|
||||
* The calculation is approximated on a 128 point scale.
|
||||
* degrade_zero_ticks is the number of ticks after which load at any
|
||||
* particular idx is approximated to be zero.
|
||||
* degrade_factor is a precomputed table, a row for each load idx.
|
||||
* Each column corresponds to degradation factor for a power of two ticks,
|
||||
* based on 128 point scale.
|
||||
* Example:
|
||||
* row 2, col 3 (=12) says that the degradation at load idx 2 after
|
||||
* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
|
||||
*
|
||||
* With this power of 2 load factors, we can degrade the load n times
|
||||
* by looking at 1 bits in n and doing as many mult/shift instead of
|
||||
* n mult/shifts needed by the exact degradation.
|
||||
*/
|
||||
#define DEGRADE_SHIFT 7
|
||||
static const unsigned char
|
||||
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
|
||||
static const unsigned char
|
||||
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{64, 32, 8, 0, 0, 0, 0, 0},
|
||||
{96, 72, 40, 12, 1, 0, 0},
|
||||
{112, 98, 75, 43, 15, 1, 0},
|
||||
{120, 112, 98, 76, 45, 16, 2} };
|
||||
|
||||
/*
|
||||
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
|
||||
* would be when CPU is idle and so we just decay the old load without
|
||||
* adding any new load.
|
||||
*/
|
||||
static unsigned long
|
||||
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
|
||||
{
|
||||
int j = 0;
|
||||
|
||||
if (!missed_updates)
|
||||
return load;
|
||||
|
||||
if (missed_updates >= degrade_zero_ticks[idx])
|
||||
return 0;
|
||||
|
||||
if (idx == 1)
|
||||
return load >> missed_updates;
|
||||
|
||||
while (missed_updates) {
|
||||
if (missed_updates % 2)
|
||||
load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
|
||||
|
||||
missed_updates >>= 1;
|
||||
j++;
|
||||
}
|
||||
return load;
|
||||
}
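Because decay_load_missed() and its tables are pure integer arithmetic, they can be replayed in userspace to check the worked example in the comment above. The sketch below copies the tables and the function verbatim and confirms that a load of 128 at idx 2 decays to 12 after 8 missed ticks, i.e. the 12/128 factor the comment quotes as an approximation of (3/4)^8:

```c
/* decay_demo.c - userspace replay of the decay_load_missed() math above. */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5
#define DEGRADE_SHIFT    7

/* Tables copied from the kernel/sched.c hunk (missing entries default to 0). */
static const unsigned char
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2} };

/* Same logic as the kernel function: decay 'load' over 'missed_updates' ticks. */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
	int j = 0;

	if (!missed_updates)
		return load;
	if (missed_updates >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed_updates;

	while (missed_updates) {
		if (missed_updates % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed_updates >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	/* The comment's example: idx 2, 8 missed ticks, load 128 -> 12. */
	printf("decay(128, 8 ticks, idx 2) = %lu\n", decay_load_missed(128, 8, 2));
	/* idx 1 is a pure shift: 128 >> 3 = 16. */
	printf("decay(128, 3 ticks, idx 1) = %lu\n", decay_load_missed(128, 3, 1));
	return 0;
}
```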
|
||||
|
||||
/*
|
||||
* Update rq->cpu_load[] statistics. This function is usually called every
|
||||
* scheduler tick (TICK_NSEC).
|
||||
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
|
||||
* every tick. We fix it up based on jiffies.
|
||||
*/
|
||||
static void update_cpu_load(struct rq *this_rq)
|
||||
{
|
||||
unsigned long this_load = this_rq->load.weight;
|
||||
unsigned long curr_jiffies = jiffies;
|
||||
unsigned long pending_updates;
|
||||
int i, scale;
|
||||
|
||||
this_rq->nr_load_updates++;
|
||||
|
||||
/* Avoid repeated calls on same jiffy, when moving in and out of idle */
|
||||
if (curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
|
||||
/* Update our load: */
|
||||
for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
||||
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
|
||||
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
||||
unsigned long old_load, new_load;
|
||||
|
||||
/* scale is effectively 1 << i now, and >> i divides by scale */
|
||||
|
||||
old_load = this_rq->cpu_load[i];
|
||||
old_load = decay_load_missed(old_load, pending_updates - 1, i);
|
||||
new_load = this_load;
|
||||
/*
|
||||
* Round up the averaging division if load is increasing. This
|
||||
|
@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
|
|||
* example.
|
||||
*/
|
||||
if (new_load > old_load)
|
||||
new_load += scale-1;
|
||||
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
|
||||
new_load += scale - 1;
|
||||
|
||||
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
||||
}
|
||||
}
|
||||
|
||||
static void update_cpu_load_active(struct rq *this_rq)
|
||||
{
|
||||
update_cpu_load(this_rq);
|
||||
|
||||
calc_load_account_active(this_rq);
|
||||
}
|
||||
|
@ -3426,7 +3574,7 @@ void scheduler_tick(void)
|
|||
|
||||
raw_spin_lock(&rq->lock);
|
||||
update_rq_clock(rq);
|
||||
update_cpu_load(rq);
|
||||
update_cpu_load_active(rq);
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
|
||||
|
@ -3598,7 +3746,6 @@ asmlinkage void __sched schedule(void)
|
|||
rq = cpu_rq(cpu);
|
||||
rcu_note_context_switch(cpu);
|
||||
prev = rq->curr;
|
||||
switch_count = &prev->nivcsw;
|
||||
|
||||
release_kernel_lock(prev);
|
||||
need_resched_nonpreemptible:
|
||||
|
@ -3611,11 +3758,26 @@ asmlinkage void __sched schedule(void)
|
|||
raw_spin_lock_irq(&rq->lock);
|
||||
clear_tsk_need_resched(prev);
|
||||
|
||||
switch_count = &prev->nivcsw;
|
||||
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
|
||||
if (unlikely(signal_pending_state(prev->state, prev)))
|
||||
if (unlikely(signal_pending_state(prev->state, prev))) {
|
||||
prev->state = TASK_RUNNING;
|
||||
else
|
||||
} else {
|
||||
/*
|
||||
* If a worker is going to sleep, notify and
|
||||
* ask workqueue whether it wants to wake up a
|
||||
* task to maintain concurrency. If so, wake
|
||||
* up the task.
|
||||
*/
|
||||
if (prev->flags & PF_WQ_WORKER) {
|
||||
struct task_struct *to_wakeup;
|
||||
|
||||
to_wakeup = wq_worker_sleeping(prev, cpu);
|
||||
if (to_wakeup)
|
||||
try_to_wake_up_local(to_wakeup);
|
||||
}
|
||||
deactivate_task(rq, prev, DEQUEUE_SLEEP);
|
||||
}
|
||||
switch_count = &prev->nvcsw;
|
||||
}
|
||||
|
||||
|
@ -3637,8 +3799,10 @@ asmlinkage void __sched schedule(void)
|
|||
|
||||
context_switch(rq, prev, next); /* unlocks the rq */
|
||||
/*
|
||||
* the context switch might have flipped the stack from under
|
||||
* us, hence refresh the local variables.
|
||||
* The context switch have flipped the stack from under us
|
||||
* and restored the local variables which were saved when
|
||||
* this task called schedule() in the past. prev == current
|
||||
* is still correct, but it can be moved to another cpu/rq.
|
||||
*/
|
||||
cpu = smp_processor_id();
|
||||
rq = cpu_rq(cpu);
|
||||
|
@ -3647,11 +3811,8 @@ asmlinkage void __sched schedule(void)
|
|||
|
||||
post_schedule(rq);
|
||||
|
||||
if (unlikely(reacquire_kernel_lock(current) < 0)) {
|
||||
prev = rq->curr;
|
||||
switch_count = &prev->nivcsw;
|
||||
if (unlikely(reacquire_kernel_lock(prev)))
|
||||
goto need_resched_nonpreemptible;
|
||||
}
|
||||
|
||||
preempt_enable_no_resched();
|
||||
if (need_resched())
|
||||
|
@ -4441,12 +4602,8 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
|
|||
*/
|
||||
if (user && !capable(CAP_SYS_NICE)) {
|
||||
if (rt_policy(policy)) {
|
||||
unsigned long rlim_rtprio;
|
||||
|
||||
if (!lock_task_sighand(p, &flags))
|
||||
return -ESRCH;
|
||||
rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
|
||||
unlock_task_sighand(p, &flags);
|
||||
unsigned long rlim_rtprio =
|
||||
task_rlimit(p, RLIMIT_RTPRIO);
|
||||
|
||||
/* can't set/change the rt policy */
|
||||
if (policy != p->policy && !rlim_rtprio)
|
||||
|
@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|||
*/
|
||||
static struct notifier_block __cpuinitdata migration_notifier = {
|
||||
.notifier_call = migration_call,
|
||||
.priority = 10
|
||||
.priority = CPU_PRI_MIGRATION,
|
||||
};
|
||||
|
||||
static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
|
||||
unsigned long action, void *hcpu)
|
||||
{
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
case CPU_ONLINE:
|
||||
case CPU_DOWN_FAILED:
|
||||
set_cpu_active((long)hcpu, true);
|
||||
return NOTIFY_OK;
|
||||
default:
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
}
|
||||
|
||||
static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
|
||||
unsigned long action, void *hcpu)
|
||||
{
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
case CPU_DOWN_PREPARE:
|
||||
set_cpu_active((long)hcpu, false);
|
||||
return NOTIFY_OK;
|
||||
default:
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
}
|
||||
|
||||
static int __init migration_init(void)
|
||||
{
|
||||
void *cpu = (void *)(long)smp_processor_id();
|
||||
int err;
|
||||
|
||||
/* Start one for the boot CPU: */
|
||||
/* Initialize migration for the boot CPU */
|
||||
err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
|
||||
BUG_ON(err == NOTIFY_BAD);
|
||||
migration_call(&migration_notifier, CPU_ONLINE, cpu);
|
||||
register_cpu_notifier(&migration_notifier);
|
||||
|
||||
/* Register cpu active notifiers */
|
||||
cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
|
||||
cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
early_initcall(migration_init);
|
||||
|
@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
|
|||
free_rootdomain(old_rd);
|
||||
}
|
||||
|
||||
static int init_rootdomain(struct root_domain *rd, bool bootmem)
|
||||
static int init_rootdomain(struct root_domain *rd)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL;
|
||||
|
||||
memset(rd, 0, sizeof(*rd));
|
||||
|
||||
if (bootmem)
|
||||
gfp = GFP_NOWAIT;
|
||||
|
||||
if (!alloc_cpumask_var(&rd->span, gfp))
|
||||
if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
|
||||
goto out;
|
||||
if (!alloc_cpumask_var(&rd->online, gfp))
|
||||
if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
|
||||
goto free_span;
|
||||
if (!alloc_cpumask_var(&rd->rto_mask, gfp))
|
||||
if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
|
||||
goto free_online;
|
||||
|
||||
if (cpupri_init(&rd->cpupri, bootmem) != 0)
|
||||
if (cpupri_init(&rd->cpupri) != 0)
|
||||
goto free_rto_mask;
|
||||
return 0;
|
||||
|
||||
|
@ -6096,7 +6277,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
|
|||
|
||||
static void init_defrootdomain(void)
|
||||
{
|
||||
init_rootdomain(&def_root_domain, true);
|
||||
init_rootdomain(&def_root_domain);
|
||||
|
||||
atomic_set(&def_root_domain.refcount, 1);
|
||||
}
|
||||
|
@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
|
|||
if (!rd)
|
||||
return NULL;
|
||||
|
||||
if (init_rootdomain(rd, false) != 0) {
|
||||
if (init_rootdomain(rd) != 0) {
|
||||
kfree(rd);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
|
|||
}
|
||||
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
|
||||
|
||||
#ifndef CONFIG_CPUSETS
|
||||
/*
|
||||
* Add online and remove offline CPUs from the scheduler domains.
|
||||
* When cpusets are enabled they take over this function.
|
||||
* Update cpusets according to cpu_active mask. If cpusets are
|
||||
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
|
||||
* around partition_sched_domains().
|
||||
*/
|
||||
static int update_sched_domains(struct notifier_block *nfb,
|
||||
unsigned long action, void *hcpu)
|
||||
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
|
||||
void *hcpu)
|
||||
{
|
||||
switch (action) {
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
case CPU_ONLINE:
|
||||
case CPU_ONLINE_FROZEN:
|
||||
case CPU_DOWN_PREPARE:
|
||||
case CPU_DOWN_PREPARE_FROZEN:
|
||||
case CPU_DOWN_FAILED:
|
||||
case CPU_DOWN_FAILED_FROZEN:
|
||||
partition_sched_domains(1, NULL, NULL);
|
||||
cpuset_update_active_cpus();
|
||||
return NOTIFY_OK;
|
||||
default:
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
}
|
||||
|
||||
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
|
||||
void *hcpu)
|
||||
{
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
case CPU_DOWN_PREPARE:
|
||||
cpuset_update_active_cpus();
|
||||
return NOTIFY_OK;
|
||||
|
||||
default:
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static int update_runtime(struct notifier_block *nfb,
|
||||
unsigned long action, void *hcpu)
|
||||
|
@ -7356,10 +7543,8 @@ void __init sched_init_smp(void)
|
|||
mutex_unlock(&sched_domains_mutex);
|
||||
put_online_cpus();
|
||||
|
||||
#ifndef CONFIG_CPUSETS
|
||||
/* XXX: Theoretical race here - CPU may be hotplugged now */
|
||||
hotcpu_notifier(update_sched_domains, 0);
|
||||
#endif
|
||||
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
|
||||
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
|
||||
|
||||
/* RT runtime code needs to handle some hotplug events */
|
||||
hotcpu_notifier(update_runtime, 0);
|
||||
|
@ -7604,6 +7789,9 @@ void __init sched_init(void)
|
|||
|
||||
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
|
||||
rq->cpu_load[j] = 0;
|
||||
|
||||
rq->last_load_update_tick = jiffies;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
rq->sd = NULL;
|
||||
rq->rd = NULL;
|
||||
|
@ -7617,6 +7805,10 @@ void __init sched_init(void)
|
|||
rq->idle_stamp = 0;
|
||||
rq->avg_idle = 2*sysctl_sched_migration_cost;
|
||||
rq_attach_root(rq, &def_root_domain);
|
||||
#ifdef CONFIG_NO_HZ
|
||||
rq->nohz_balance_kick = 0;
|
||||
init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
|
||||
#endif
|
||||
#endif
|
||||
init_rq_hrtick(rq);
|
||||
atomic_set(&rq->nr_iowait, 0);
|
||||
|
@ -7661,8 +7853,11 @@ void __init sched_init(void)
|
|||
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
|
||||
#ifdef CONFIG_SMP
|
||||
#ifdef CONFIG_NO_HZ
|
||||
zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
|
||||
alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
|
||||
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
|
||||
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
|
||||
atomic_set(&nohz.load_balancer, nr_cpu_ids);
|
||||
atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
|
||||
atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
|
||||
#endif
|
||||
/* May be allocated at isolcpus cmdline parse time */
|
||||
if (cpu_isolated_map == NULL)
|
||||
|
|
|
@ -10,19 +10,55 @@
|
|||
* Ingo Molnar <mingo@redhat.com>
|
||||
* Guillaume Chazarain <guichaz@gmail.com>
|
||||
*
|
||||
* Create a semi stable clock from a mixture of other events, including:
|
||||
* - gtod
|
||||
*
|
||||
* What:
|
||||
*
|
||||
* cpu_clock(i) provides a fast (execution time) high resolution
|
||||
* clock with bounded drift between CPUs. The value of cpu_clock(i)
|
||||
* is monotonic for constant i. The timestamp returned is in nanoseconds.
|
||||
*
|
||||
* ######################### BIG FAT WARNING ##########################
|
||||
* # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
|
||||
* # go backwards !! #
|
||||
* ####################################################################
|
||||
*
|
||||
* There is no strict promise about the base, although it tends to start
|
||||
* at 0 on boot (but people really shouldn't rely on that).
|
||||
*
|
||||
* cpu_clock(i) -- can be used from any context, including NMI.
|
||||
* sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
|
||||
* local_clock() -- is cpu_clock() on the current cpu.
|
||||
*
|
||||
* How:
|
||||
*
|
||||
* The implementation either uses sched_clock() when
|
||||
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
|
||||
* sched_clock() is assumed to provide these properties (mostly it means
|
||||
* the architecture provides a globally synchronized highres time source).
|
||||
*
|
||||
* Otherwise it tries to create a semi stable clock from a mixture of other
|
||||
* clocks, including:
|
||||
*
|
||||
* - GTOD (clock monotomic)
|
||||
* - sched_clock()
|
||||
* - explicit idle events
|
||||
*
|
||||
* We use gtod as base and the unstable clock deltas. The deltas are filtered,
|
||||
* making it monotonic and keeping it within an expected window.
|
||||
* We use GTOD as base and use sched_clock() deltas to improve resolution. The
|
||||
* deltas are filtered to provide monotonicity and keeping it within an
|
||||
* expected window.
|
||||
*
|
||||
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
|
||||
* that is otherwise invisible (TSC gets stopped).
|
||||
*
|
||||
* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
|
||||
* consistent between cpus (never more than 2 jiffies difference).
|
||||
*
|
||||
* Notes:
|
||||
*
|
||||
* The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
|
||||
* like cpufreq interrupts that can change the base clock (TSC) multiplier
|
||||
* and cause funny jumps in time -- although the filtering provided by
|
||||
* sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
|
||||
* in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
|
||||
* sched_clock().
|
||||
*/
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/hardirq.h>
|
||||
|
@ -170,6 +206,11 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
|
|||
return val;
|
||||
}
|
||||
|
||||
/*
|
||||
* Similar to cpu_clock(), but requires local IRQs to be disabled.
|
||||
*
|
||||
* See cpu_clock().
|
||||
*/
|
||||
u64 sched_clock_cpu(int cpu)
|
||||
{
|
||||
struct sched_clock_data *scd;
|
||||
|
@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
|
||||
|
||||
unsigned long long cpu_clock(int cpu)
|
||||
/*
|
||||
* As outlined at the top, provides a fast, high resolution, nanosecond
|
||||
* time source that is monotonic per cpu argument and has bounded drift
|
||||
* between cpus.
|
||||
*
|
||||
* ######################### BIG FAT WARNING ##########################
|
||||
* # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
|
||||
* # go backwards !! #
|
||||
* ####################################################################
|
||||
*/
|
||||
u64 cpu_clock(int cpu)
|
||||
{
|
||||
unsigned long long clock;
|
||||
u64 clock;
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
|
@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
|
|||
return clock;
|
||||
}
|
||||
|
||||
/*
|
||||
* Similar to cpu_clock() for the current cpu. Time will only be observed
|
||||
* to be monotonic if care is taken to only compare timestampt taken on the
|
||||
* same CPU.
|
||||
*
|
||||
* See cpu_clock().
|
||||
*/
|
||||
u64 local_clock(void)
|
||||
{
|
||||
u64 clock;
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
clock = sched_clock_cpu(smp_processor_id());
|
||||
local_irq_restore(flags);
|
||||
|
||||
return clock;
|
||||
}
|
||||
|
||||
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
|
||||
|
||||
void sched_clock_init(void)
|
||||
|
@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
|
|||
return sched_clock();
|
||||
}
|
||||
|
||||
|
||||
unsigned long long cpu_clock(int cpu)
|
||||
u64 cpu_clock(int cpu)
|
||||
{
|
||||
return sched_clock_cpu(cpu);
|
||||
}
|
||||
|
||||
u64 local_clock(void)
|
||||
{
|
||||
return sched_clock_cpu(0);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
|
||||
|
||||
EXPORT_SYMBOL_GPL(cpu_clock);
|
||||
EXPORT_SYMBOL_GPL(local_clock);
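local_clock() is the variant most call sites in this merge switch to (the ftrace, lockdep, perf and rcutorture hunks all replace cpu_clock(raw_smp_processor_id()) with it). A minimal kernel-style sketch of the intended usage follows; it is not taken from this diff and is only meaningful inside kernel code, with do_some_work() as a placeholder workload:

```c
#include <linux/kernel.h>
#include <linux/sched.h>	/* u64 cpu_clock(int cpu), u64 local_clock(void) */

static void do_some_work(void)
{
	volatile int i;

	for (i = 0; i < 1000; i++)
		;	/* stand-in for the code being timed */
}

static void time_some_work(void)
{
	u64 t0, delta_ns;

	/*
	 * local_clock() is cpu_clock() for the current CPU: fast, nanosecond
	 * resolution, but only monotonic when both timestamps come from the
	 * same CPU, so take both readings without migrating in between
	 * (for example with preemption disabled).
	 */
	t0 = local_clock();
	do_some_work();
	delta_ns = local_clock() - t0;

	pr_info("work took %llu ns\n", (unsigned long long)delta_ns);
}
```

For cross-CPU comparisons the big-fat-warning above applies: cpu_clock(i) versus cpu_clock(j) can appear to go backwards, so per-CPU deltas are the safe pattern.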
|
||||
|
|
|
@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
|
|||
*
|
||||
* Returns: -ENOMEM if memory fails.
|
||||
*/
|
||||
int cpupri_init(struct cpupri *cp, bool bootmem)
|
||||
int cpupri_init(struct cpupri *cp)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL;
|
||||
int i;
|
||||
|
||||
if (bootmem)
|
||||
gfp = GFP_NOWAIT;
|
||||
|
||||
memset(cp, 0, sizeof(*cp));
|
||||
|
||||
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
|
||||
|
@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
|
|||
|
||||
raw_spin_lock_init(&vec->lock);
|
||||
vec->count = 0;
|
||||
if (!zalloc_cpumask_var(&vec->mask, gfp))
|
||||
if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ struct cpupri {
|
|||
int cpupri_find(struct cpupri *cp,
|
||||
struct task_struct *p, struct cpumask *lowest_mask);
|
||||
void cpupri_set(struct cpupri *cp, int cpu, int pri);
|
||||
int cpupri_init(struct cpupri *cp, bool bootmem);
|
||||
int cpupri_init(struct cpupri *cp);
|
||||
void cpupri_cleanup(struct cpupri *cp);
|
||||
#else
|
||||
#define cpupri_set(cp, cpu, pri) do { } while (0)
|
||||
|
|
|
@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
|
|||
PN(sysctl_sched_latency);
|
||||
PN(sysctl_sched_min_granularity);
|
||||
PN(sysctl_sched_wakeup_granularity);
|
||||
PN(sysctl_sched_child_runs_first);
|
||||
P(sysctl_sched_child_runs_first);
|
||||
P(sysctl_sched_features);
|
||||
#undef PN
|
||||
#undef P
|
||||
|
|
|
@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
|
|||
unsigned long power = SCHED_LOAD_SCALE;
|
||||
struct sched_group *sdg = sd->groups;
|
||||
|
||||
if (sched_feat(ARCH_POWER))
|
||||
power *= arch_scale_freq_power(sd, cpu);
|
||||
else
|
||||
power *= default_scale_freq_power(sd, cpu);
|
||||
|
||||
power >>= SCHED_LOAD_SHIFT;
|
||||
|
||||
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
|
||||
if (sched_feat(ARCH_POWER))
|
||||
power *= arch_scale_smt_power(sd, cpu);
|
||||
|
@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
|
|||
power >>= SCHED_LOAD_SHIFT;
|
||||
}
|
||||
|
||||
sdg->cpu_power_orig = power;
|
||||
|
||||
if (sched_feat(ARCH_POWER))
|
||||
power *= arch_scale_freq_power(sd, cpu);
|
||||
else
|
||||
power *= default_scale_freq_power(sd, cpu);
|
||||
|
||||
power >>= SCHED_LOAD_SHIFT;
|
||||
|
||||
power *= scale_rt_power(cpu);
|
||||
power >>= SCHED_LOAD_SHIFT;
|
||||
|
||||
|
@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
|
|||
sdg->cpu_power = power;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try and fix up capacity for tiny siblings, this is needed when
|
||||
* things like SD_ASYM_PACKING need f_b_g to select another sibling
|
||||
* which on its own isn't powerful enough.
|
||||
*
|
||||
* See update_sd_pick_busiest() and check_asym_packing().
|
||||
*/
|
||||
static inline int
|
||||
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
|
||||
{
|
||||
/*
|
||||
* Only siblings can have significantly less than SCHED_LOAD_SCALE
|
||||
*/
|
||||
if (sd->level != SD_LV_SIBLING)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* If ~90% of the cpu_power is still there, we're good.
|
||||
*/
|
||||
if (group->cpu_power * 32 > group->cpu_power_orig * 29)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
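The 32/29 test above is an integer way of asking whether at least about 90% (29/32 ≈ 90.6%) of the group's original cpu_power survived RT and frequency scaling, which matters because group_capacity is DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE) and rounds an SMT sibling down to zero. A standalone check of that arithmetic, assuming SCHED_LOAD_SCALE is 1024 as in this kernel and using a simplified DIV_ROUND_CLOSEST:

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))	/* simplified */

/* Mirrors the fix_small_capacity() test: is roughly 90% of the power left? */
static int small_capacity_ok(unsigned long cpu_power,
			     unsigned long cpu_power_orig)
{
	return cpu_power * 32 > cpu_power_orig * 29;
}

int main(void)
{
	/* An SMT4 sibling: roughly a quarter of SCHED_LOAD_SCALE. */
	unsigned long orig = SCHED_LOAD_SCALE / 4;	/* 256 */
	unsigned long scaled = 240;			/* after rt/freq scaling */

	/* Rounds to 0, so f_b_g would otherwise never pick this group... */
	printf("group_capacity = %lu\n",
	       DIV_ROUND_CLOSEST(scaled, SCHED_LOAD_SCALE));
	/* ...but ~94% of the power is left, so capacity is fixed up to 1. */
	printf("fix_small_capacity -> %d\n", small_capacity_ok(scaled, orig));
	return 0;
}
```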
|
||||
|
||||
/**
|
||||
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
|
||||
* @sd: The sched_domain whose statistics are to be updated.
|
||||
|
@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
|
|||
* domains. In the newly idle case, we will allow all the cpu's
|
||||
* to do the newly idle load balance.
|
||||
*/
|
||||
if (idle != CPU_NEWLY_IDLE && local_group &&
|
||||
balance_cpu != this_cpu) {
|
||||
*balance = 0;
|
||||
return;
|
||||
if (idle != CPU_NEWLY_IDLE && local_group) {
|
||||
if (balance_cpu != this_cpu) {
|
||||
*balance = 0;
|
||||
return;
|
||||
}
|
||||
update_group_power(sd, this_cpu);
|
||||
}
|
||||
|
||||
update_group_power(sd, this_cpu);
|
||||
|
||||
/* Adjust by relative CPU power of the group */
|
||||
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
|
||||
|
||||
|
@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
|
|||
|
||||
sgs->group_capacity =
|
||||
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
|
||||
if (!sgs->group_capacity)
|
||||
sgs->group_capacity = fix_small_capacity(sd, group);
|
||||
}
|
||||
|
||||
/**
|
||||
* update_sd_pick_busiest - return 1 on busiest group
|
||||
* @sd: sched_domain whose statistics are to be checked
|
||||
* @sds: sched_domain statistics
|
||||
* @sg: sched_group candidate to be checked for being the busiest
|
||||
* @sgs: sched_group statistics
|
||||
* @this_cpu: the current cpu
|
||||
*
|
||||
* Determine if @sg is a busier group than the previously selected
|
||||
* busiest group.
|
||||
*/
|
||||
static bool update_sd_pick_busiest(struct sched_domain *sd,
|
||||
struct sd_lb_stats *sds,
|
||||
struct sched_group *sg,
|
||||
struct sg_lb_stats *sgs,
|
||||
int this_cpu)
|
||||
{
|
||||
if (sgs->avg_load <= sds->max_load)
|
||||
return false;
|
||||
|
||||
if (sgs->sum_nr_running > sgs->group_capacity)
|
||||
return true;
|
||||
|
||||
if (sgs->group_imb)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* ASYM_PACKING needs to move all the work to the lowest
|
||||
* numbered CPUs in the group, therefore mark all groups
|
||||
* higher than ourself as busy.
|
||||
*/
|
||||
if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
|
||||
this_cpu < group_first_cpu(sg)) {
|
||||
if (!sds->busiest)
|
||||
return true;
|
||||
|
||||
if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
|
|||
* @sd: sched_domain whose statistics are to be updated.
|
||||
* @this_cpu: Cpu for which load balance is currently performed.
|
||||
* @idle: Idle status of this_cpu
|
||||
* @sd_idle: Idle status of the sched_domain containing group.
|
||||
* @sd_idle: Idle status of the sched_domain containing sg.
|
||||
* @cpus: Set of cpus considered for load balancing.
|
||||
* @balance: Should we balance.
|
||||
* @sds: variable to hold the statistics for this sched_domain.
|
||||
|
@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
|
|||
struct sd_lb_stats *sds)
|
||||
{
|
||||
struct sched_domain *child = sd->child;
|
||||
struct sched_group *group = sd->groups;
|
||||
struct sched_group *sg = sd->groups;
|
||||
struct sg_lb_stats sgs;
|
||||
int load_idx, prefer_sibling = 0;
|
||||
|
||||
|
@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
|
|||
do {
|
||||
int local_group;
|
||||
|
||||
local_group = cpumask_test_cpu(this_cpu,
|
||||
sched_group_cpus(group));
|
||||
local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
|
||||
memset(&sgs, 0, sizeof(sgs));
|
||||
update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
|
||||
update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
|
||||
local_group, cpus, balance, &sgs);
|
||||
|
||||
if (local_group && !(*balance))
|
||||
return;
|
||||
|
||||
sds->total_load += sgs.group_load;
|
||||
sds->total_pwr += group->cpu_power;
|
||||
sds->total_pwr += sg->cpu_power;
|
||||
|
||||
/*
|
||||
* In case the child domain prefers tasks go to siblings
|
||||
* first, lower the group capacity to one so that we'll try
|
||||
* first, lower the sg capacity to one so that we'll try
|
||||
* and move all the excess tasks away.
|
||||
*/
|
||||
if (prefer_sibling)
|
||||
|
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
-			sds->this = group;
+			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds->max_load &&
-			   (sgs.sum_nr_running > sgs.group_capacity ||
-				sgs.group_imb)) {
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
-			sds->busiest = group;
+			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(group, sds, local_group, &sgs);
-		group = group->next;
-	} while (group != sd->groups);
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
 }
 
+int __weak arch_sd_sibling_asym_packing(void)
+{
+	return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *			sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming a lower CPU number will be equivalent to a lower SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+			      struct sd_lb_stats *sds,
+			      int this_cpu, unsigned long *imbalance)
+{
+	int busiest_cpu;
+
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return 0;
+
+	if (!sds->busiest)
+		return 0;
+
+	busiest_cpu = group_first_cpu(sds->busiest);
+	if (this_cpu > busiest_cpu)
+		return 0;
+
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+				       SCHED_LOAD_SCALE);
+	return 1;
+}
+
 /**
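Note: the imbalance check_asym_packing() reports is just the busiest group's load scaled back by that group's cpu_power. A standalone sketch of the arithmetic, assuming SCHED_LOAD_SCALE is 1024; the load and power numbers are invented for illustration, not taken from this merge:

	/* Standalone sketch (not kernel code) of the imbalance computation. */
	#include <stdio.h>

	#define SCHED_LOAD_SCALE 1024UL
	#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

	int main(void)
	{
		unsigned long max_load  = 1536;	/* avg_load of the busiest group (example) */
		unsigned long cpu_power = 1024;	/* busiest group's cpu_power (example)    */

		/* Same arithmetic as the new code: undo the load scaling so the
		 * result can be used directly as an amount of load to move. */
		unsigned long imbalance =
			DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);

		printf("imbalance = %lu\n", imbalance);	/* prints 1536 */
		return 0;
	}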
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
+	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+		return sds.busiest;
+
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
@@ -2726,8 +2850,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-		   unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+		   enum cpu_idle_type idle, unsigned long imbalance,
+		   const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 		unsigned long wl;
 
+		if (!capacity)
+			capacity = fix_small_capacity(sd, group);
+
 		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
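Note: the fallback above matters because with several SMT siblings sharing a core, a single CPU's cpu_power can be well below SCHED_LOAD_SCALE, and rounding to the closest whole CPU then yields a capacity of 0. A standalone sketch of that rounding, with an illustrative power value rather than one measured on real hardware:

	/* Standalone sketch (not kernel code): why capacity can round to 0. */
	#include <stdio.h>

	#define SCHED_LOAD_SCALE 1024UL
	#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

	int main(void)
	{
		unsigned long power = 400;	/* one SMT4 thread, roughly a quarter core (example) */
		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);

		printf("capacity = %lu\n", capacity);	/* prints 0 */
		return 0;
	}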
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+
+		/*
+		 * ASYM_PACKING needs to force migrate tasks from busy but
+		 * higher numbered CPUs in order to pack all tasks in the
+		 * lowest numbered CPUs.
+		 */
+		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+			return 1;
+
 		/*
 		 * The only task running in a non-idle cpu can be moved to this
 		 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2992,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle, imbalance, cpus);
+	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2898,7 +3036,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle)) {
+		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+					this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ static int active_load_balance_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+	raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+	csd->func = trigger_sched_softirq;
+	csd->info = NULL;
+	csd->flags = 0;
+	csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
 	atomic_t load_balancer;
-	cpumask_var_t cpu_mask;
-	cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-	.load_balancer = ATOMIC_INIT(-1),
-};
+	atomic_t first_pick_cpu;
+	atomic_t second_pick_cpu;
+	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t grp_idle_mask;
+	unsigned long next_balance;	/* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
 		    sched_group_cpus(ilb_group));
 
 	/*
 	 * A sched_group is semi-idle when it has atleast one busy cpu
 	 * and atleast one idle cpu.
 	 */
-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+	if (cpumask_empty(nohz.grp_idle_mask))
 		return 0;
 
-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
 		return 0;
 
 	return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
 	 * Optimize for the case when we have no idle CPUs or only one
 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
 	 */
-	if (cpumask_weight(nohz.cpu_mask) < 2)
+	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
 
 	do {
 		if (is_semi_idle_group(ilb_group))
-			return cpumask_first(nohz.ilb_grp_nohz_mask);
+			return cpumask_first(nohz.grp_idle_mask);
 
 		ilb_group = ilb_group->next;
 
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
 	}
 
 out_done:
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #endif
 
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+	int ilb_cpu;
+
+	nohz.next_balance++;
+
+	ilb_cpu = get_nohz_load_balancer();
+
+	if (ilb_cpu >= nr_cpu_ids) {
+		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+		if (ilb_cpu >= nr_cpu_ids)
+			return;
+	}
+
+	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+		struct call_single_data *cp;
+
+		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+		cp = &per_cpu(remote_sched_softirq_cb, cpu);
+		__smp_call_function_single(ilb_cpu, cp, 0);
+	}
+	return;
+}
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
+ * load balancing on behalf of all those cpus.
  *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
- *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpu_rq(cpu)->in_nohz_recently = 1;
-
 		if (!cpu_active(cpu)) {
 			if (atomic_read(&nohz.load_balancer) != cpu)
-				return 0;
+				return;
 
 			/*
 			 * If we are going offline and still the leader,
 			 * give up!
 			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 
-			return 0;
+			return;
 		}
 
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-			if (atomic_read(&nohz.load_balancer) == cpu)
-				atomic_set(&nohz.load_balancer, -1);
-			return 0;
-		}
+		if (atomic_read(&nohz.first_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+		if (atomic_read(&nohz.second_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
 			int new_ilb;
 
-			if (!(sched_smt_power_savings ||
-						sched_mc_power_savings))
-				return 1;
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+					   cpu) != nr_cpu_ids)
+				return;
+
 			/*
 			 * Check to see if there is a more power-efficient
 			 * ilb.
 			 */
 			new_ilb = find_new_ilb(cpu);
 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, -1);
+				atomic_set(&nohz.load_balancer, nr_cpu_ids);
 				resched_cpu(new_ilb);
-				return 0;
+				return;
 			}
-			return 1;
+			return;
 		}
 	} else {
-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-			return 0;
+		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+			return;
 
-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
 		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 	}
-	return 0;
+	return;
 }
 #endif
 
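Note: with this change "no ilb owner" is represented by nr_cpu_ids instead of -1, and ownership is claimed with a compare-and-swap so only one idle CPU can win the race. A minimal userspace model of that hand-off, not kernel code: the CPU count, sentinel name and call sequence are invented for illustration, and C11 atomics stand in for the kernel's atomic_cmpxchg().

	/* Userspace model of the ilb-owner claim/release protocol. */
	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_CPUS 4	/* stands in for nr_cpu_ids; also the "no owner" sentinel */

	static atomic_int load_balancer = NR_CPUS;	/* no owner initially */

	/* Called by an idle CPU when it stops its tick. */
	static void try_claim_ilb(int cpu)
	{
		int none = NR_CPUS;

		/* Claim ownership only if nobody owns it; CAS keeps the race safe. */
		if (atomic_compare_exchange_strong(&load_balancer, &none, cpu))
			printf("cpu %d is now the idle load balancer\n", cpu);
		else
			printf("cpu %d lost the race to cpu %d\n", cpu, none);
	}

	/* Called when the owner leaves the idle set again. */
	static void release_ilb(int cpu)
	{
		int me = cpu;

		atomic_compare_exchange_strong(&load_balancer, &me, NR_CPUS);
	}

	int main(void)
	{
		try_claim_ilb(1);
		try_claim_ilb(3);	/* loses: CPU 1 already owns it */
		release_ilb(1);
		try_claim_ilb(3);	/* now wins */
		return 0;
	}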
@@ -3385,10 +3569,101 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
+/*
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq;
+	int balance_cpu;
+
+	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+		return;
+
+	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		if (balance_cpu == this_cpu)
+			continue;
+
+		/*
+		 * If this cpu gets work to do, stop the load balancing
+		 * work being done for other cpus. Next load
+		 * balancing owner will pick it up.
+		 */
+		if (need_resched()) {
+			this_rq->nohz_balance_kick = 0;
+			break;
+		}
+
+		raw_spin_lock_irq(&this_rq->lock);
+		update_rq_clock(this_rq);
+		update_cpu_load(this_rq);
+		raw_spin_unlock_irq(&this_rq->lock);
+
+		rebalance_domains(balance_cpu, CPU_IDLE);
+
+		rq = cpu_rq(balance_cpu);
+		if (time_after(this_rq->next_balance, rq->next_balance))
+			this_rq->next_balance = rq->next_balance;
+	}
+	nohz.next_balance = this_rq->next_balance;
+	this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+	unsigned long now = jiffies;
+	int ret;
+	int first_pick_cpu, second_pick_cpu;
+
+	if (time_before(now, nohz.next_balance))
+		return 0;
+
+	if (!rq->nr_running)
+		return 0;
+
+	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+		return 0;
+
+	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+	if (ret == nr_cpu_ids || ret == cpu) {
+		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+		if (rq->nr_running > 1)
+			return 1;
+	} else {
+		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+		if (ret == nr_cpu_ids || ret == cpu) {
+			if (rq->nr_running)
+				return 1;
+		}
+	}
+	return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
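Note: the kick heuristic above can be modeled in plain C. The sketch below is a userspace approximation, not the kernel code: NR_CPUS and the runqueue lengths are invented, C11 atomics stand in for atomic_cmpxchg(), and the next_balance throttle and second_pick reset are deliberately omitted.

	/* Userspace model of nohz_kick_needed(): the first busy CPU kicks only
	 * when it has more than one runnable task; a second, different busy CPU
	 * kicks as soon as it has any load. */
	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_CPUS 4	/* stands in for nr_cpu_ids */

	static atomic_int first_pick  = NR_CPUS;
	static atomic_int second_pick = NR_CPUS;

	static int kick_needed(int cpu, int nr_running)
	{
		int expected;

		if (!nr_running)
			return 0;

		/* Try to become (or confirm being) the first busy CPU. */
		expected = NR_CPUS;
		if (atomic_compare_exchange_strong(&first_pick, &expected, cpu) ||
		    expected == cpu) {
			/* Sole busy CPU: only kick when it is overloaded. */
			return nr_running > 1;
		}

		/* Otherwise try to become the second busy CPU. */
		expected = NR_CPUS;
		if (atomic_compare_exchange_strong(&second_pick, &expected, cpu) ||
		    expected == cpu)
			return 1;	/* two busy CPUs: any load warrants a kick */

		return 0;
	}

	int main(void)
	{
		printf("cpu0, 1 task : kick=%d\n", kick_needed(0, 1));	/* 0 */
		printf("cpu0, 2 tasks: kick=%d\n", kick_needed(0, 2));	/* 1 */
		printf("cpu1, 1 task : kick=%d\n", kick_needed(1, 1));	/* 1 */
		return 0;
	}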
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
 	rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
 	/*
-	 * If this cpu is the owner for idle load balancing, then do the
+	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	if (this_rq->idle_at_tick &&
-	    atomic_read(&nohz.load_balancer) == this_cpu) {
-		struct rq *rq;
-		int balance_cpu;
-
-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
-			if (balance_cpu == this_cpu)
-				continue;
-
-			/*
-			 * If this cpu gets work to do, stop the load balancing
-			 * work being done for other cpus. Next load
-			 * balancing owner will pick it up.
-			 */
-			if (need_resched())
-				break;
-
-			rebalance_domains(balance_cpu, CPU_IDLE);
-
-			rq = cpu_rq(balance_cpu);
-			if (time_after(this_rq->next_balance, rq->next_balance))
-				this_rq->next_balance = rq->next_balance;
-		}
-	}
-#endif
+	nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-	/*
-	 * If we were in the nohz mode recently and busy at the current
-	 * scheduler tick, then check if we need to nominate new idle
-	 * load balancer.
-	 */
-	if (rq->in_nohz_recently && !rq->idle_at_tick) {
-		rq->in_nohz_recently = 0;
-
-		if (atomic_read(&nohz.load_balancer) == cpu) {
-			cpumask_clear_cpu(cpu, nohz.cpu_mask);
-			atomic_set(&nohz.load_balancer, -1);
-		}
-
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			int ilb = find_new_ilb(cpu);
-
-			if (ilb < nr_cpu_ids)
-				resched_cpu(ilb);
-		}
-	}
-
-	/*
-	 * If this cpu is idle and doing idle load balancing for all the
-	 * cpus with ticks stopped, is it time for that to stop?
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-		resched_cpu(cpu);
-		return;
-	}
-
-	/*
-	 * If this cpu is idle and the idle load balancing is done by
-	 * someone else, then no need raise the SCHED_SOFTIRQ
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-	    cpumask_test_cpu(cpu, nohz.cpu_mask))
-		return;
-#endif
 	/* Don't need to rebalance while attached to NULL domain */
 	if (time_after_eq(jiffies, rq->next_balance) &&
 	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+		nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 {
 	unsigned long soft, hard;
 
-	if (!p->signal)
-		return;
-
 	/* max may change after cur was read, this will be fixed next tick */
 	soft = task_rlimit(p, RLIMIT_RTTIME);
 	hard = task_rlimit_max(p, RLIMIT_RTTIME);
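Note: the watchdog above enforces RLIMIT_RTTIME for realtime tasks. For context, a userspace sketch of how a task could set that limit before requesting a realtime policy; the one- and two-second budgets are arbitrary example values.

	/* Userspace sketch: give this task an RLIMIT_RTTIME budget, which is
	 * what the scheduler watchdog checks against (values in microseconds). */
	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl = {
			.rlim_cur = 1000000,	/* soft limit: 1 s of RT CPU time (example) */
			.rlim_max = 2000000,	/* hard limit: 2 s (example) */
		};

		if (setrlimit(RLIMIT_RTTIME, &rl) != 0) {
			perror("setrlimit(RLIMIT_RTTIME)");
			return 1;
		}

		printf("RLIMIT_RTTIME set\n");
		return 0;
	}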
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
-	struct thread_group_cputimer *cputimer;
-
-	/* tsk == current, ensure it is safe to use ->signal */
-	if (unlikely(tsk->exit_state))
-		return;
-
-	cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
 	if (!cputimer->running)
 		return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
-	struct thread_group_cputimer *cputimer;
-
-	/* tsk == current, ensure it is safe to use ->signal */
-	if (unlikely(tsk->exit_state))
-		return;
-
-	cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
 	if (!cputimer->running)
 		return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
-	struct thread_group_cputimer *cputimer;
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	/* see __exit_signal()->task_rq_unlock_wait() */
-	barrier();
-	if (unlikely(!sig))
-		return;
-
-	cputimer = &sig->cputimer;
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
 	if (!cputimer->running)
 		return;
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	} while (read_seqretry(&xtime_lock, seq));
 
 	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-	    arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
+	    arch_needs_cpu(cpu)) {
 		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
 	} else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	 * the scheduler tick in nohz_restart_sched_tick.
 	 */
 	if (!ts->tick_stopped) {
-		if (select_nohz_load_balancer(1)) {
-			/*
-			 * sched tick not stopped!
-			 */
-			cpumask_clear_cpu(cpu, nohz_cpu_mask);
-			goto out;
-		}
+		select_nohz_load_balancer(1);
 
 		ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
@@ -692,12 +692,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	cpu = smp_processor_id();
 
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			cpu = preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+		cpu = get_nohz_timer_target();
 #endif
 	new_base = per_cpu(tvec_bases, cpu);
 
@@ -55,7 +55,7 @@ u64 notrace trace_clock_local(void)
  */
 u64 notrace trace_clock(void)
 {
-	return cpu_clock(raw_smp_processor_id());
+	return local_clock();
 }
 
16	kernel/workqueue_sched.h	Normal file
@@ -0,0 +1,16 @@
+/*
+ * kernel/workqueue_sched.h
+ *
+ * Scheduler hooks for concurrency managed workqueue.  Only to be
+ * included from sched.c and workqueue.c.
+ */
+static inline void wq_worker_waking_up(struct task_struct *task,
+				       unsigned int cpu)
+{
+}
+
+static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
+						      unsigned int cpu)
+{
+	return NULL;
+}
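Note: the two hooks in this new header are deliberately empty stubs at this point; real implementations arrive with the concurrency-managed workqueue work. A standalone mock of the hook pattern is sketched below; "struct task_struct" here is a stand-in type and the caller is hypothetical, not taken from this merge.

	/* Userspace mock, illustration only: the wakeup/sleep hook pattern. */
	#include <stddef.h>
	#include <stdio.h>

	struct task_struct { const char *comm; };	/* stand-in, not the kernel's */

	static inline void wq_worker_waking_up(struct task_struct *task,
					       unsigned int cpu)
	{
		/* no-op stub, as in the header above */
	}

	static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
							     unsigned int cpu)
	{
		return NULL;	/* no replacement worker to wake, as in the header above */
	}

	int main(void)
	{
		struct task_struct worker = { .comm = "worker/0" };

		/* A scheduler would call these on wakeup and on going to sleep. */
		wq_worker_waking_up(&worker, 0);
		if (!wq_worker_sleeping(&worker, 0))
			printf("no replacement worker to wake\n");
		return 0;
	}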